From 370d6686683f44b6e869210eea2cf6a905b38a55 Mon Sep 17 00:00:00 2001
From: Michael Opdenacker <michael.opdenacker@free-electrons.com>
Date: Sat, 12 Oct 2013 06:33:47 +0200
Subject: [PATCH 01/94] mg_disk: remove deprecated IRQF_DISABLED

This patch proposes to remove the use of the IRQF_DISABLED flag

It's a NOOP since 2.6.35 and it will be removed one day.

Signed-off-by: Michael Opdenacker <michael.opdenacker@free-electrons.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/mg_disk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 77a60bedd7a3..7bc363f1ee82 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -936,7 +936,7 @@ static int mg_probe(struct platform_device *plat_dev)
 			goto probe_err_3b;
 		}
 		err = request_irq(host->irq, mg_irq,
-				IRQF_DISABLED | IRQF_TRIGGER_RISING,
+				IRQF_TRIGGER_RISING,
 				MG_DEV_NAME, host);
 		if (err) {
 			printk(KERN_ERR "%s:%d fail (request_irq err=%d)\n",

From fbe363c476afe8ec992d3baf682670a4bd1b6ce6 Mon Sep 17 00:00:00 2001
From: Roger Pau Monne <roger.pau@citrix.com>
Date: Mon, 12 Aug 2013 12:53:44 +0200
Subject: [PATCH 02/94] xen-blkfront: revoke foreign access for grants not
 mapped by the backend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There's no need to keep the foreign access in a grant if it is not
persistently mapped by the backend. This allows us to free grants that
are not mapped by the backend, thus preventing blkfront from hoarding
all grants.

The main effect of this is that blkfront will only persistently map
the same grants as the backend, and it will always try to use grants
that are already mapped by the backend. Also the number of persistent
grants in blkfront is the same as in blkback (and is controlled by the
value in blkback).

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: David Vrabel <david.vrabel@citrix.com>
Acked-by: Matt Wilson <msw@amazon.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/xen-blkfront.c | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 8d53ed293606..429d52640940 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1013,13 +1013,38 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
 	}
 	/* Add the persistent grant into the list of free grants */
 	for (i = 0; i < nseg; i++) {
-		list_add(&s->grants_used[i]->node, &info->persistent_gnts);
-		info->persistent_gnts_c++;
+		if (gnttab_query_foreign_access(s->grants_used[i]->gref)) {
+			/*
+			 * If the grant is still mapped by the backend (the
+			 * backend has chosen to make this grant persistent)
+			 * we add it at the head of the list, so it will be
+			 * reused first.
+			 */
+			list_add(&s->grants_used[i]->node, &info->persistent_gnts);
+			info->persistent_gnts_c++;
+		} else {
+			/*
+			 * If the grant is not mapped by the backend we end the
+			 * foreign access and add it to the tail of the list,
+			 * so it will not be picked again unless we run out of
+			 * persistent grants.
+			 */
+			gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL);
+			s->grants_used[i]->gref = GRANT_INVALID_REF;
+			list_add_tail(&s->grants_used[i]->node, &info->persistent_gnts);
+		}
 	}
 	if (s->req.operation == BLKIF_OP_INDIRECT) {
 		for (i = 0; i < INDIRECT_GREFS(nseg); i++) {
-			list_add(&s->indirect_grants[i]->node, &info->persistent_gnts);
-			info->persistent_gnts_c++;
+			if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) {
+				list_add(&s->indirect_grants[i]->node, &info->persistent_gnts);
+				info->persistent_gnts_c++;
+			} else {
+				gnttab_end_foreign_access(s->indirect_grants[i]->gref, 0, 0UL);
+				s->indirect_grants[i]->gref = GRANT_INVALID_REF;
+				list_add_tail(&s->indirect_grants[i]->node,
+				              &info->persistent_gnts);
+			}
 		}
 	}
 }

From c47206e25f28232ff979994c32758c82841d81cd Mon Sep 17 00:00:00 2001
From: Roger Pau Monne <roger.pau@citrix.com>
Date: Mon, 12 Aug 2013 12:53:43 +0200
Subject: [PATCH 03/94] xen-blkfront: improve aproximation of required grants
 per request
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Improve the calculation of required grants to process a request by
using nr_phys_segments instead of always assuming a request is going
to use all posible segments.

nr_phys_segments contains the number of scatter-gather DMA addr+len
pairs, which is basically what we put at every granted page.
for_each_sg iterates over the DMA addr+len pairs and uses a grant
page for each of them.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: David Vrabel <david.vrabel@citrix.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/xen-blkfront.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 429d52640940..5b8a15483a4c 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -400,10 +400,13 @@ static int blkif_queue_request(struct request *req)
 	if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
 		return 1;
 
-	max_grefs = info->max_indirect_segments ?
-		    info->max_indirect_segments +
-		    INDIRECT_GREFS(info->max_indirect_segments) :
-		    BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	max_grefs = req->nr_phys_segments;
+	if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST)
+		/*
+		 * If we are using indirect segments we need to account
+		 * for the indirect grefs used in the request.
+		 */
+		max_grefs += INDIRECT_GREFS(req->nr_phys_segments);
 
 	/* Check if we have enough grants to allocate a requests */
 	if (info->persistent_gnts_c < max_grefs) {

From ea5ec76d76da9279d12027c1828544c5ccbe7932 Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@oracle.com>
Date: Thu, 5 Sep 2013 13:00:14 +0200
Subject: [PATCH 04/94] xen/blkback: fix reference counting

If the permission check fails, we drop a reference to the blkif without
having taken it in the first place. The bug was introduced in commit
604c499cbbcc3d5fe5fb8d53306aa0fae1990109 (xen/blkback: Check device
permissions before allowing OP_DISCARD).

Cc: stable@vger.kernel.org
Cc: Jan Beulich <JBeulich@suse.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Vegard Nossum <vegard.nossum@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/xen-blkback/blkback.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index bf4b9d282c04..6620b73d0490 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -887,6 +887,8 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
 	unsigned long secure;
 	struct phys_req preq;
 
+	xen_blkif_get(blkif);
+
 	preq.sector_number = req->u.discard.sector_number;
 	preq.nr_sects      = req->u.discard.nr_sectors;
 
@@ -899,7 +901,6 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
 	}
 	blkif->st_ds_req++;
 
-	xen_blkif_get(blkif);
 	secure = (blkif->vbd.discard_secure &&
 		 (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
 		 BLKDEV_DISCARD_SECURE : 0;

From ef7e7c82e02b602f29c2b87f42dcd6143a6777da Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 15 Oct 2013 14:14:38 -0600
Subject: [PATCH 05/94] loop: fix crash when using unassigned loop device

When the loop module is loaded, it creates 8 loop devices /dev/loop[0-7].
The devices have no request routine and thus, when they are used without
being assigned, a crash happens.

For example, these commands cause crash (assuming there are no used loop
devices):

Kernel Fault: Code=26 regs=000000007f420980 (Addr=0000000000000010)
CPU: 1 PID: 50 Comm: kworker/1:1 Not tainted 3.11.0 #1
Workqueue: ksnaphd do_metadata [dm_snapshot]
task: 000000007fcf4078 ti: 000000007f420000 task.ti: 000000007f420000
[  116.319988]
     YZrvWESTHLNXBCVMcbcbcbcbOGFRQPDI
PSW: 00001000000001001111111100001111 Not tainted
r00-03  000000ff0804ff0f 00000000408bf5d0 00000000402d8204 000000007b7ff6c0
r04-07  00000000408a95d0 000000007f420950 000000007b7ff6c0 000000007d06c930
r08-11  000000007f4205c0 0000000000000001 000000007f4205c0 000000007f4204b8
r12-15  0000000000000010 0000000000000000 0000000000000000 0000000000000000
r16-19  000000001108dd48 000000004061cd7c 000000007d859800 000000000800000f
r20-23  0000000000000000 0000000000000008 0000000000000000 0000000000000000
r24-27  00000000ffffffff 000000007b7ff6c0 000000007d859800 00000000408a95d0
r28-31  0000000000000000 000000007f420950 000000007f420980 000000007f4208e8
sr00-03  0000000000000000 0000000000000000 0000000000000000 0000000000303000
sr04-07  0000000000000000 0000000000000000 0000000000000000 0000000000000000
[  117.549988]
IASQ: 0000000000000000 0000000000000000 IAOQ: 00000000402d82fc 00000000402d8300
 IIR: 53820020    ISR: 0000000000000000  IOR: 0000000000000010
 CPU:        1   CR30: 000000007f420000 CR31: ffffffffffffffff
 ORIG_R28: 0000000000000001
 IAOQ[0]: generic_make_request+0x11c/0x1a0
 IAOQ[1]: generic_make_request+0x120/0x1a0
 RP(r2): generic_make_request+0x24/0x1a0
Backtrace:
 [<00000000402d83f0>] submit_bio+0x70/0x140
 [<0000000011087c4c>] dispatch_io+0x234/0x478 [dm_mod]
 [<0000000011087f44>] sync_io+0xb4/0x190 [dm_mod]
 [<00000000110883bc>] dm_io+0x2c4/0x310 [dm_mod]
 [<00000000110bfcd0>] do_metadata+0x28/0xb0 [dm_snapshot]
 [<00000000401591d8>] process_one_work+0x160/0x460
 [<0000000040159bc0>] worker_thread+0x300/0x478
 [<0000000040161a70>] kthread+0x118/0x128
 [<0000000040104020>] end_fault_vector+0x20/0x28
 [<0000000040177220>] task_tick_fair+0x420/0x4d0
 [<00000000401aa048>] invoke_rcu_core+0x50/0x60
 [<00000000401ad5b8>] rcu_check_callbacks+0x210/0x8d8
 [<000000004014aaa0>] update_process_times+0xa8/0xc0
 [<00000000401ab86c>] rcu_process_callbacks+0x4b4/0x598
 [<0000000040142408>] __do_softirq+0x250/0x2c0
 [<00000000401789d0>] find_busiest_group+0x3c0/0xc70
[  119.379988]
Kernel panic - not syncing: Kernel Fault
Rebooting in 1 seconds..

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index dbdb88a4976c..c8dac7305244 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -894,13 +894,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 
 	bio_list_init(&lo->lo_bio_list);
 
-	/*
-	 * set queue make_request_fn, and add limits based on lower level
-	 * device
-	 */
-	blk_queue_make_request(lo->lo_queue, loop_make_request);
-	lo->lo_queue->queuedata = lo;
-
 	if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
 		blk_queue_flush(lo->lo_queue, REQ_FLUSH);
 
@@ -1618,6 +1611,8 @@ static int loop_add(struct loop_device **l, int i)
 	if (!lo)
 		goto out;
 
+	lo->lo_state = Lo_unbound;
+
 	/* allocate id, if @id >= 0, we're requesting that specific id */
 	if (i >= 0) {
 		err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL);
@@ -1635,6 +1630,12 @@ static int loop_add(struct loop_device **l, int i)
 	if (!lo->lo_queue)
 		goto out_free_idr;
 
+	/*
+	 * set queue make_request_fn
+	 */
+	blk_queue_make_request(lo->lo_queue, loop_make_request);
+	lo->lo_queue->queuedata = lo;
+
 	disk = lo->lo_disk = alloc_disk(1 << part_shift);
 	if (!disk)
 		goto out_free_queue;

From e5feab229f199dadee91073fbef5b507046086fd Mon Sep 17 00:00:00 2001
From: Philip J Kelleher <pjk1939@linux.vnet.ibm.com>
Date: Wed, 4 Sep 2013 13:59:02 -0500
Subject: [PATCH 06/94] rsxx: Handling failed pci_map_page on PowerPC and
 double free.

The rsxx driver was not checking the correct value during a
pci_map_page failure. Fixing this also uncovered a
double free if the bio was returned before it was
broken up into indiviadual 4k dmas, that is also
fixed here.

Signed-off-by: Philip J Kelleher <pjk1939@linux.vnet.ibm.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/rsxx/core.c      |  3 ++-
 drivers/block/rsxx/dma.c       | 47 +++++++++++++++++++---------------
 drivers/block/rsxx/rsxx_priv.h |  9 ++++++-
 3 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c
index 6e85e21445eb..e740a650d546 100644
--- a/drivers/block/rsxx/core.c
+++ b/drivers/block/rsxx/core.c
@@ -654,7 +654,8 @@ static void rsxx_eeh_failure(struct pci_dev *dev)
 	for (i = 0; i < card->n_targets; i++) {
 		spin_lock_bh(&card->ctrl[i].queue_lock);
 		cnt = rsxx_cleanup_dma_queue(&card->ctrl[i],
-					     &card->ctrl[i].queue);
+					     &card->ctrl[i].queue,
+					     COMPLETE_DMA);
 		spin_unlock_bh(&card->ctrl[i].queue_lock);
 
 		cnt += rsxx_dma_cancel(&card->ctrl[i]);
diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c
index bed32f16b084..71d1ca2a1444 100644
--- a/drivers/block/rsxx/dma.c
+++ b/drivers/block/rsxx/dma.c
@@ -221,6 +221,19 @@ static void dma_intr_coal_auto_tune(struct rsxx_cardinfo *card)
 }
 
 /*----------------- RSXX DMA Handling -------------------*/
+static void rsxx_free_dma(struct rsxx_dma_ctrl *ctrl, struct rsxx_dma *dma)
+{
+	if (!pci_dma_mapping_error(ctrl->card->dev, dma->dma_addr)) {
+		pci_unmap_page(ctrl->card->dev, dma->dma_addr,
+			       get_dma_size(dma),
+			       dma->cmd == HW_CMD_BLK_WRITE ?
+					   PCI_DMA_TODEVICE :
+					   PCI_DMA_FROMDEVICE);
+	}
+
+	kmem_cache_free(rsxx_dma_pool, dma);
+}
+
 static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl,
 				  struct rsxx_dma *dma,
 				  unsigned int status)
@@ -232,21 +245,14 @@ static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl,
 	if (status & DMA_CANCELLED)
 		ctrl->stats.dma_cancelled++;
 
-	if (dma->dma_addr)
-		pci_unmap_page(ctrl->card->dev, dma->dma_addr,
-			       get_dma_size(dma),
-			       dma->cmd == HW_CMD_BLK_WRITE ?
-					   PCI_DMA_TODEVICE :
-					   PCI_DMA_FROMDEVICE);
-
 	if (dma->cb)
 		dma->cb(ctrl->card, dma->cb_data, status ? 1 : 0);
 
-	kmem_cache_free(rsxx_dma_pool, dma);
+	rsxx_free_dma(ctrl, dma);
 }
 
 int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl,
-			   struct list_head *q)
+			   struct list_head *q, unsigned int done)
 {
 	struct rsxx_dma *dma;
 	struct rsxx_dma *tmp;
@@ -254,7 +260,10 @@ int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl,
 
 	list_for_each_entry_safe(dma, tmp, q, list) {
 		list_del(&dma->list);
-		rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
+		if (done & COMPLETE_DMA)
+			rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
+		else
+			rsxx_free_dma(ctrl, dma);
 		cnt++;
 	}
 
@@ -370,7 +379,7 @@ static void dma_engine_stalled(unsigned long data)
 
 		/* Clean up the DMA queue */
 		spin_lock(&ctrl->queue_lock);
-		cnt = rsxx_cleanup_dma_queue(ctrl, &ctrl->queue);
+		cnt = rsxx_cleanup_dma_queue(ctrl, &ctrl->queue, COMPLETE_DMA);
 		spin_unlock(&ctrl->queue_lock);
 
 		cnt += rsxx_dma_cancel(ctrl);
@@ -623,7 +632,7 @@ static int rsxx_queue_dma(struct rsxx_cardinfo *card,
 	dma->dma_addr = pci_map_page(card->dev, page, pg_off, dma_len,
 				     dir ? PCI_DMA_TODEVICE :
 				     PCI_DMA_FROMDEVICE);
-	if (!dma->dma_addr) {
+	if (pci_dma_mapping_error(card->dev, dma->dma_addr)) {
 		kmem_cache_free(rsxx_dma_pool, dma);
 		return -ENOMEM;
 	}
@@ -736,11 +745,9 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
 	return 0;
 
 bvec_err:
-	for (i = 0; i < card->n_targets; i++) {
-		spin_lock_bh(&card->ctrl[i].queue_lock);
-		rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i]);
-		spin_unlock_bh(&card->ctrl[i].queue_lock);
-	}
+	for (i = 0; i < card->n_targets; i++)
+		rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i],
+					FREE_DMA);
 
 	return st;
 }
@@ -990,7 +997,7 @@ void rsxx_dma_destroy(struct rsxx_cardinfo *card)
 
 		/* Clean up the DMA queue */
 		spin_lock_bh(&ctrl->queue_lock);
-		rsxx_cleanup_dma_queue(ctrl, &ctrl->queue);
+		rsxx_cleanup_dma_queue(ctrl, &ctrl->queue, COMPLETE_DMA);
 		spin_unlock_bh(&ctrl->queue_lock);
 
 		rsxx_dma_cancel(ctrl);
@@ -1045,7 +1052,7 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
 		card->ctrl[i].e_cnt = 0;
 
 		list_for_each_entry(dma, &card->ctrl[i].queue, list) {
-			if (dma->dma_addr)
+			if (!pci_dma_mapping_error(card->dev, dma->dma_addr))
 				pci_unmap_page(card->dev, dma->dma_addr,
 					       get_dma_size(dma),
 					       dma->cmd == HW_CMD_BLK_WRITE ?
@@ -1073,7 +1080,7 @@ int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card)
 					dma->cmd == HW_CMD_BLK_WRITE ?
 					PCI_DMA_TODEVICE :
 					PCI_DMA_FROMDEVICE);
-			if (!dma->dma_addr) {
+			if (pci_dma_mapping_error(card->dev, dma->dma_addr)) {
 				spin_unlock_bh(&card->ctrl[i].queue_lock);
 				kmem_cache_free(rsxx_dma_pool, dma);
 				return -ENOMEM;
diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h
index 5ad5055a4104..82779058e8ec 100644
--- a/drivers/block/rsxx/rsxx_priv.h
+++ b/drivers/block/rsxx/rsxx_priv.h
@@ -345,6 +345,11 @@ enum rsxx_creg_stat {
 	CREG_STAT_TAG_MASK	= 0x0000ff00,
 };
 
+enum rsxx_dma_finish {
+	FREE_DMA	= 0x0,
+	COMPLETE_DMA	= 0x1,
+};
+
 static inline unsigned int CREG_DATA(int N)
 {
 	return CREG_DATA0 + (N << 2);
@@ -379,7 +384,9 @@ typedef void (*rsxx_dma_cb)(struct rsxx_cardinfo *card,
 int rsxx_dma_setup(struct rsxx_cardinfo *card);
 void rsxx_dma_destroy(struct rsxx_cardinfo *card);
 int rsxx_dma_init(void);
-int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, struct list_head *q);
+int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl,
+				struct list_head *q,
+				unsigned int done);
 int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl);
 void rsxx_dma_cleanup(void);
 void rsxx_dma_queue_reset(struct rsxx_cardinfo *card);

From 1b21f5b2ad6047995b19b15024353a9fa64810f1 Mon Sep 17 00:00:00 2001
From: Philip J Kelleher <pjk1939@linux.vnet.ibm.com>
Date: Wed, 4 Sep 2013 13:59:35 -0500
Subject: [PATCH 07/94] rsxx: Moving pci_map_page to prevent overflow.

The pci_map_page function has been moved into our
issued workqueue to prevent an us running out of
mappable addresses on non-HWWD PCIe x8 slots. The
maximum amount that can possible be mapped at one
time now is: 255 dmas X 4 dma channels X 4096 Bytes.

Signed-off-by: Philip J Kelleher <pjk1939@linux.vnet.ibm.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/rsxx/core.c |  5 ---
 drivers/block/rsxx/dma.c  | 70 ++++++++++++++++-----------------------
 2 files changed, 28 insertions(+), 47 deletions(-)

diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c
index e740a650d546..a8de2eec6ff3 100644
--- a/drivers/block/rsxx/core.c
+++ b/drivers/block/rsxx/core.c
@@ -749,10 +749,6 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev)
 
 	card->eeh_state = 0;
 
-	st = rsxx_eeh_remap_dmas(card);
-	if (st)
-		goto failed_remap_dmas;
-
 	spin_lock_irqsave(&card->irq_lock, flags);
 	if (card->n_targets & RSXX_MAX_TARGETS)
 		rsxx_enable_ier_and_isr(card, CR_INTR_ALL_G);
@@ -779,7 +775,6 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev)
 	return PCI_ERS_RESULT_RECOVERED;
 
 failed_hw_buffers_init:
-failed_remap_dmas:
 	for (i = 0; i < card->n_targets; i++) {
 		if (card->ctrl[i].status.buf)
 			pci_free_consistent(card->dev,
diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c
index 71d1ca2a1444..34fd1018c8e5 100644
--- a/drivers/block/rsxx/dma.c
+++ b/drivers/block/rsxx/dma.c
@@ -397,6 +397,7 @@ static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl)
 	int tag;
 	int cmds_pending = 0;
 	struct hw_cmd *hw_cmd_buf;
+	int dir;
 
 	hw_cmd_buf = ctrl->cmd.buf;
 
@@ -433,6 +434,28 @@ static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl)
 			continue;
 		}
 
+		if (dma->cmd == HW_CMD_BLK_WRITE)
+			dir = PCI_DMA_TODEVICE;
+		else
+			dir = PCI_DMA_FROMDEVICE;
+
+		/*
+		 * The function pci_map_page is placed here because we can
+		 * only, by design, issue up to 255 commands to the hardware
+		 * at one time per DMA channel. So the maximum amount of mapped
+		 * memory would be 255 * 4 channels * 4096 Bytes which is less
+		 * than 2GB, the limit of a x8 Non-HWWD PCIe slot. This way the
+		 * pci_map_page function should never fail because of a
+		 * lack of mappable memory.
+		 */
+		dma->dma_addr = pci_map_page(ctrl->card->dev, dma->page,
+				     dma->pg_off, dma->sub_page.cnt << 9, dir);
+		if (pci_dma_mapping_error(ctrl->card->dev, dma->dma_addr)) {
+			push_tracker(ctrl->trackers, tag);
+			rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
+			continue;
+		}
+
 		set_tracker_dma(ctrl->trackers, tag, dma);
 		hw_cmd_buf[ctrl->cmd.idx].command  = dma->cmd;
 		hw_cmd_buf[ctrl->cmd.idx].tag      = tag;
@@ -629,14 +652,6 @@ static int rsxx_queue_dma(struct rsxx_cardinfo *card,
 	if (!dma)
 		return -ENOMEM;
 
-	dma->dma_addr = pci_map_page(card->dev, page, pg_off, dma_len,
-				     dir ? PCI_DMA_TODEVICE :
-				     PCI_DMA_FROMDEVICE);
-	if (pci_dma_mapping_error(card->dev, dma->dma_addr)) {
-		kmem_cache_free(rsxx_dma_pool, dma);
-		return -ENOMEM;
-	}
-
 	dma->cmd          = dir ? HW_CMD_BLK_WRITE : HW_CMD_BLK_READ;
 	dma->laddr        = laddr;
 	dma->sub_page.off = (dma_off >> 9);
@@ -1039,6 +1054,11 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
 			else
 				card->ctrl[i].stats.reads_issued--;
 
+			pci_unmap_page(card->dev, dma->dma_addr,
+				       get_dma_size(dma),
+				       dma->cmd == HW_CMD_BLK_WRITE ?
+				       PCI_DMA_TODEVICE :
+				       PCI_DMA_FROMDEVICE);
 			list_add_tail(&dma->list, &issued_dmas[i]);
 			push_tracker(card->ctrl[i].trackers, j);
 			cnt++;
@@ -1050,15 +1070,6 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
 		atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth);
 		card->ctrl[i].stats.sw_q_depth += cnt;
 		card->ctrl[i].e_cnt = 0;
-
-		list_for_each_entry(dma, &card->ctrl[i].queue, list) {
-			if (!pci_dma_mapping_error(card->dev, dma->dma_addr))
-				pci_unmap_page(card->dev, dma->dma_addr,
-					       get_dma_size(dma),
-					       dma->cmd == HW_CMD_BLK_WRITE ?
-					       PCI_DMA_TODEVICE :
-					       PCI_DMA_FROMDEVICE);
-		}
 		spin_unlock_bh(&card->ctrl[i].queue_lock);
 	}
 
@@ -1067,31 +1078,6 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
 	return 0;
 }
 
-int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card)
-{
-	struct rsxx_dma *dma;
-	int i;
-
-	for (i = 0; i < card->n_targets; i++) {
-		spin_lock_bh(&card->ctrl[i].queue_lock);
-		list_for_each_entry(dma, &card->ctrl[i].queue, list) {
-			dma->dma_addr = pci_map_page(card->dev, dma->page,
-					dma->pg_off, get_dma_size(dma),
-					dma->cmd == HW_CMD_BLK_WRITE ?
-					PCI_DMA_TODEVICE :
-					PCI_DMA_FROMDEVICE);
-			if (pci_dma_mapping_error(card->dev, dma->dma_addr)) {
-				spin_unlock_bh(&card->ctrl[i].queue_lock);
-				kmem_cache_free(rsxx_dma_pool, dma);
-				return -ENOMEM;
-			}
-		}
-		spin_unlock_bh(&card->ctrl[i].queue_lock);
-	}
-
-	return 0;
-}
-
 int rsxx_dma_init(void)
 {
 	rsxx_dma_pool = KMEM_CACHE(rsxx_dma, SLAB_HWCACHE_ALIGN);

From 8f8b899563f28ef26e381a6eb90d12dead77389f Mon Sep 17 00:00:00 2001
From: Asai Thambi S P <asamymuthupa@micron.com>
Date: Wed, 11 Sep 2013 13:14:42 -0600
Subject: [PATCH 08/94] mtip32xx: Add SRSI support

This patch add support for SRSI(Surprise Removal Surprise Insertion).

Approach:
---------
Surprise Removal:
-----------------
On surprise removal of the device, gendisk, request queue, device index, sysfs
entries, etc are retained as long as device is in use - mounted filesystem,
device opened by an application, etc. The service thread breaks out of the main
while loop, waits for pci remove to exit, and then waits for device to become
free. When there no holders of the device, service thread cleans up the block
and device related stuff and returns.

Surprise Insertion:
-------------------
No change, this scenario follows the normal pci probe() function flow.

Signed-off-by: Asai Thambi S P <asamymuthupa@micron.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/mtip32xx/mtip32xx.c | 457 ++++++++++++++++++------------
 drivers/block/mtip32xx/mtip32xx.h |  18 +-
 2 files changed, 291 insertions(+), 184 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 952dbfe22126..76f3bc4f0c21 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -126,64 +126,30 @@ struct mtip_compat_ide_task_request_s {
 static bool mtip_check_surprise_removal(struct pci_dev *pdev)
 {
 	u16 vendor_id = 0;
+	struct driver_data *dd = pci_get_drvdata(pdev);
+
+	if (dd->sr)
+		return true;
 
        /* Read the vendorID from the configuration space */
 	pci_read_config_word(pdev, 0x00, &vendor_id);
-	if (vendor_id == 0xFFFF)
+	if (vendor_id == 0xFFFF) {
+		dd->sr = true;
+		if (dd->queue)
+			set_bit(QUEUE_FLAG_DEAD, &dd->queue->queue_flags);
+		else
+			dev_warn(&dd->pdev->dev,
+				"%s: dd->queue is NULL\n", __func__);
+		if (dd->port) {
+			set_bit(MTIP_PF_SR_CLEANUP_BIT, &dd->port->flags);
+			wake_up_interruptible(&dd->port->svc_wait);
+		} else
+			dev_warn(&dd->pdev->dev,
+				"%s: dd->port is NULL\n", __func__);
 		return true; /* device removed */
-
-	return false; /* device present */
-}
-
-/*
- * This function is called for clean the pending command in the
- * command slot during the surprise removal of device and return
- * error to the upper layer.
- *
- * @dd Pointer to the DRIVER_DATA structure.
- *
- * return value
- *	None
- */
-static void mtip_command_cleanup(struct driver_data *dd)
-{
-	int group = 0, commandslot = 0, commandindex = 0;
-	struct mtip_cmd *command;
-	struct mtip_port *port = dd->port;
-	static int in_progress;
-
-	if (in_progress)
-		return;
-
-	in_progress = 1;
-
-	for (group = 0; group < 4; group++) {
-		for (commandslot = 0; commandslot < 32; commandslot++) {
-			if (!(port->allocated[group] & (1 << commandslot)))
-				continue;
-
-			commandindex = group << 5 | commandslot;
-			command = &port->commands[commandindex];
-
-			if (atomic_read(&command->active)
-			    && (command->async_callback)) {
-				command->async_callback(command->async_data,
-					-ENODEV);
-				command->async_callback = NULL;
-				command->async_data = NULL;
-			}
-
-			dma_unmap_sg(&port->dd->pdev->dev,
-				command->sg,
-				command->scatter_ents,
-				command->direction);
-		}
 	}
 
-	up(&port->cmd_slot);
-
-	set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag);
-	in_progress = 0;
+	return false; /* device present */
 }
 
 /*
@@ -222,10 +188,7 @@ static int get_slot(struct mtip_port *port)
 	}
 	dev_warn(&port->dd->pdev->dev, "Failed to get a tag.\n");
 
-	if (mtip_check_surprise_removal(port->dd->pdev)) {
-		/* Device not present, clean outstanding commands */
-		mtip_command_cleanup(port->dd);
-	}
+	mtip_check_surprise_removal(port->dd->pdev);
 	return -1;
 }
 
@@ -245,6 +208,107 @@ static inline void release_slot(struct mtip_port *port, int tag)
 	smp_mb__after_clear_bit();
 }
 
+/*
+ * IO completion function.
+ *
+ * This completion function is called by the driver ISR when a
+ * command that was issued by the kernel completes. It first calls the
+ * asynchronous completion function which normally calls back into the block
+ * layer passing the asynchronous callback data, then unmaps the
+ * scatter list associated with the completed command, and finally
+ * clears the allocated bit associated with the completed command.
+ *
+ * @port   Pointer to the port data structure.
+ * @tag    Tag of the command.
+ * @data   Pointer to driver_data.
+ * @status Completion status.
+ *
+ * return value
+ *	None
+ */
+static void mtip_async_complete(struct mtip_port *port,
+				int tag,
+				void *data,
+				int status)
+{
+	struct mtip_cmd *command;
+	struct driver_data *dd = data;
+	int cb_status = status ? -EIO : 0;
+
+	if (unlikely(!dd) || unlikely(!port))
+		return;
+
+	command = &port->commands[tag];
+
+	if (unlikely(status == PORT_IRQ_TF_ERR)) {
+		dev_warn(&port->dd->pdev->dev,
+			"Command tag %d failed due to TFE\n", tag);
+	}
+
+	/* Upper layer callback */
+	if (likely(command->async_callback))
+		command->async_callback(command->async_data, cb_status);
+
+	command->async_callback = NULL;
+	command->comp_func = NULL;
+
+	/* Unmap the DMA scatter list entries */
+	dma_unmap_sg(&dd->pdev->dev,
+		command->sg,
+		command->scatter_ents,
+		command->direction);
+
+	/* Clear the allocated and active bits for the command */
+	atomic_set(&port->commands[tag].active, 0);
+	release_slot(port, tag);
+
+	up(&port->cmd_slot);
+}
+
+/*
+ * This function is called for clean the pending command in the
+ * command slot during the surprise removal of device and return
+ * error to the upper layer.
+ *
+ * @dd Pointer to the DRIVER_DATA structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_command_cleanup(struct driver_data *dd)
+{
+	int tag = 0;
+	struct mtip_cmd *cmd;
+	struct mtip_port *port = dd->port;
+	unsigned int num_cmd_slots = dd->slot_groups * 32;
+
+	if (!test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag))
+		return;
+
+	if (!port)
+		return;
+
+	cmd = &port->commands[MTIP_TAG_INTERNAL];
+	if (atomic_read(&cmd->active))
+		if (readl(port->cmd_issue[MTIP_TAG_INTERNAL]) &
+					(1 << MTIP_TAG_INTERNAL))
+			if (cmd->comp_func)
+				cmd->comp_func(port, MTIP_TAG_INTERNAL,
+					 cmd->comp_data, -ENODEV);
+
+	while (1) {
+		tag = find_next_bit(port->allocated, num_cmd_slots, tag);
+		if (tag >= num_cmd_slots)
+			break;
+
+		cmd = &port->commands[tag];
+		if (atomic_read(&cmd->active))
+			mtip_async_complete(port, tag, dd, -ENODEV);
+	}
+
+	set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag);
+}
+
 /*
  * Reset the HBA (without sleeping)
  *
@@ -584,6 +648,9 @@ static void mtip_timeout_function(unsigned long int data)
 	if (unlikely(!port))
 		return;
 
+	if (unlikely(port->dd->sr))
+		return;
+
 	if (test_bit(MTIP_DDF_RESUME_BIT, &port->dd->dd_flag)) {
 		mod_timer(&port->cmd_timer,
 			jiffies + msecs_to_jiffies(30000));
@@ -674,66 +741,6 @@ static void mtip_timeout_function(unsigned long int data)
 		jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD));
 }
 
-/*
- * IO completion function.
- *
- * This completion function is called by the driver ISR when a
- * command that was issued by the kernel completes. It first calls the
- * asynchronous completion function which normally calls back into the block
- * layer passing the asynchronous callback data, then unmaps the
- * scatter list associated with the completed command, and finally
- * clears the allocated bit associated with the completed command.
- *
- * @port   Pointer to the port data structure.
- * @tag    Tag of the command.
- * @data   Pointer to driver_data.
- * @status Completion status.
- *
- * return value
- *	None
- */
-static void mtip_async_complete(struct mtip_port *port,
-				int tag,
-				void *data,
-				int status)
-{
-	struct mtip_cmd *command;
-	struct driver_data *dd = data;
-	int cb_status = status ? -EIO : 0;
-
-	if (unlikely(!dd) || unlikely(!port))
-		return;
-
-	command = &port->commands[tag];
-
-	if (unlikely(status == PORT_IRQ_TF_ERR)) {
-		dev_warn(&port->dd->pdev->dev,
-			"Command tag %d failed due to TFE\n", tag);
-	}
-
-	/* Upper layer callback */
-	if (likely(command->async_callback))
-		command->async_callback(command->async_data, cb_status);
-
-	command->async_callback = NULL;
-	command->comp_func = NULL;
-
-	/* Unmap the DMA scatter list entries */
-	dma_unmap_sg(&dd->pdev->dev,
-		command->sg,
-		command->scatter_ents,
-		command->direction);
-
-	/* Clear the allocated and active bits for the command */
-	atomic_set(&port->commands[tag].active, 0);
-	release_slot(port, tag);
-
-	if (unlikely(command->unaligned))
-		up(&port->cmd_slot_unal);
-	else
-		up(&port->cmd_slot);
-}
-
 /*
  * Internal command completion callback function.
  *
@@ -854,7 +861,6 @@ static void mtip_handle_tfe(struct driver_data *dd)
 					"Missing completion func for tag %d",
 					tag);
 				if (mtip_check_surprise_removal(dd->pdev)) {
-					mtip_command_cleanup(dd);
 					/* don't proceed further */
 					return;
 				}
@@ -1018,14 +1024,12 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group,
 					command->comp_data,
 					0);
 			} else {
-				dev_warn(&dd->pdev->dev,
-					"Null completion "
-					"for tag %d",
+				dev_dbg(&dd->pdev->dev,
+					"Null completion for tag %d",
 					tag);
 
 				if (mtip_check_surprise_removal(
 					dd->pdev)) {
-					mtip_command_cleanup(dd);
 					return;
 				}
 			}
@@ -1145,7 +1149,6 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data)
 
 		if (unlikely(port_stat & PORT_IRQ_ERR)) {
 			if (unlikely(mtip_check_surprise_removal(dd->pdev))) {
-				mtip_command_cleanup(dd);
 				/* don't proceed further */
 				return IRQ_HANDLED;
 			}
@@ -3006,6 +3009,46 @@ static void mtip_hw_debugfs_exit(struct driver_data *dd)
 		debugfs_remove_recursive(dd->dfs_node);
 }
 
+static int mtip_free_orphan(struct driver_data *dd)
+{
+	struct kobject *kobj;
+
+	if (dd->bdev) {
+		if (dd->bdev->bd_holders >= 1)
+			return -2;
+
+		bdput(dd->bdev);
+		dd->bdev = NULL;
+	}
+
+	mtip_hw_debugfs_exit(dd);
+
+	spin_lock(&rssd_index_lock);
+	ida_remove(&rssd_index_ida, dd->index);
+	spin_unlock(&rssd_index_lock);
+
+	if (!test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag) &&
+			test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) {
+		put_disk(dd->disk);
+	} else {
+		if (dd->disk) {
+			kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
+			if (kobj) {
+				mtip_hw_sysfs_exit(dd, kobj);
+				kobject_put(kobj);
+			}
+			del_gendisk(dd->disk);
+			dd->disk = NULL;
+		}
+		if (dd->queue) {
+			dd->queue->queuedata = NULL;
+			blk_cleanup_queue(dd->queue);
+			dd->queue = NULL;
+		}
+	}
+	kfree(dd);
+	return 0;
+}
 
 /*
  * Perform any init/resume time hardware setup
@@ -3154,6 +3197,7 @@ static int mtip_service_thread(void *data)
 	unsigned long slot, slot_start, slot_wrap;
 	unsigned int num_cmd_slots = dd->slot_groups * 32;
 	struct mtip_port *port = dd->port;
+	int ret;
 
 	while (1) {
 		/*
@@ -3164,13 +3208,18 @@ static int mtip_service_thread(void *data)
 			!(port->flags & MTIP_PF_PAUSE_IO));
 
 		if (kthread_should_stop())
+			goto st_out;
+
+		set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
+
+		/* If I am an orphan, start self cleanup */
+		if (test_bit(MTIP_PF_SR_CLEANUP_BIT, &port->flags))
 			break;
 
 		if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
 				&dd->dd_flag)))
-			break;
+			goto st_out;
 
-		set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
 		if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) {
 			slot = 1;
 			/* used to restrict the loop to one iteration */
@@ -3201,7 +3250,7 @@ static int mtip_service_thread(void *data)
 
 			clear_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags);
 		} else if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) {
-			if (!mtip_ftl_rebuild_poll(dd))
+			if (mtip_ftl_rebuild_poll(dd) < 0)
 				set_bit(MTIP_DDF_REBUILD_FAILED_BIT,
 							&dd->dd_flag);
 			clear_bit(MTIP_PF_REBUILD_BIT, &port->flags);
@@ -3209,8 +3258,30 @@ static int mtip_service_thread(void *data)
 		clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
 
 		if (test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags))
-			break;
+			goto st_out;
 	}
+
+	/* wait for pci remove to exit */
+	while (1) {
+		if (test_bit(MTIP_DDF_REMOVE_DONE_BIT, &dd->dd_flag))
+			break;
+		msleep_interruptible(1000);
+		if (kthread_should_stop())
+			goto st_out;
+	}
+
+	while (1) {
+		ret = mtip_free_orphan(dd);
+		if (!ret) {
+			/* NOTE: All data structures are invalid, do not
+			 * access any here */
+			return 0;
+		}
+		msleep_interruptible(1000);
+		if (kthread_should_stop())
+			goto st_out;
+	}
+st_out:
 	return 0;
 }
 
@@ -3437,13 +3508,13 @@ static int mtip_hw_init(struct driver_data *dd)
 		rv = -EFAULT;
 		goto out3;
 	}
+	mtip_dump_identify(dd->port);
 
 	if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) ==
 		MTIP_FTL_REBUILD_MAGIC) {
 		set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags);
 		return MTIP_FTL_REBUILD_MAGIC;
 	}
-	mtip_dump_identify(dd->port);
 
 	/* check write protect, over temp and rebuild statuses */
 	rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ,
@@ -3467,8 +3538,8 @@ static int mtip_hw_init(struct driver_data *dd)
 		}
 		if (buf[288] == 0xBF) {
 			dev_info(&dd->pdev->dev,
-				"Drive indicates rebuild has failed.\n");
-			/* TODO */
+				"Drive is in security locked state.\n");
+			set_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag);
 		}
 	}
 
@@ -3523,9 +3594,8 @@ static int mtip_hw_exit(struct driver_data *dd)
 	 * Send standby immediate (E0h) to the drive so that it
 	 * saves its state.
 	 */
-	if (!test_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag)) {
-
-		if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags))
+	if (!dd->sr) {
+		if (!test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag))
 			if (mtip_standby_immediate(dd->port))
 				dev_warn(&dd->pdev->dev,
 					"STANDBY IMMEDIATE failed\n");
@@ -3551,6 +3621,7 @@ static int mtip_hw_exit(struct driver_data *dd)
 			dd->port->command_list_dma);
 	/* Free the memory allocated for the for structure. */
 	kfree(dd->port);
+	dd->port = NULL;
 
 	return 0;
 }
@@ -3572,7 +3643,8 @@ static int mtip_hw_shutdown(struct driver_data *dd)
 	 * Send standby immediate (E0h) to the drive so that it
 	 * saves its state.
 	 */
-	mtip_standby_immediate(dd->port);
+	if (!dd->sr && dd->port)
+		mtip_standby_immediate(dd->port);
 
 	return 0;
 }
@@ -3887,6 +3959,10 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio)
 			bio_endio(bio, -ENODATA);
 			return;
 		}
+		if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) {
+			bio_endio(bio, -ENXIO);
+			return;
+		}
 	}
 
 	if (unlikely(bio->bi_rw & REQ_DISCARD)) {
@@ -4010,6 +4086,8 @@ static int mtip_block_initialize(struct driver_data *dd)
 	dd->disk->private_data	= dd;
 	dd->index		= index;
 
+	mtip_hw_debugfs_init(dd);
+
 	/*
 	 * if rebuild pending, start the service thread, and delay the block
 	 * queue creation and add_disk()
@@ -4068,6 +4146,7 @@ skip_create_disk:
 	/* Enable the block device and add it to /dev */
 	add_disk(dd->disk);
 
+	dd->bdev = bdget_disk(dd->disk, 0);
 	/*
 	 * Now that the disk is active, initialize any sysfs attributes
 	 * managed by the protocol layer.
@@ -4077,7 +4156,6 @@ skip_create_disk:
 		mtip_hw_sysfs_init(dd, kobj);
 		kobject_put(kobj);
 	}
-	mtip_hw_debugfs_init(dd);
 
 	if (dd->mtip_svc_handler) {
 		set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag);
@@ -4103,7 +4181,8 @@ start_service_thread:
 	return rv;
 
 kthread_run_error:
-	mtip_hw_debugfs_exit(dd);
+	bdput(dd->bdev);
+	dd->bdev = NULL;
 
 	/* Delete our gendisk. This also removes the device from /dev */
 	del_gendisk(dd->disk);
@@ -4112,6 +4191,7 @@ read_capacity_error:
 	blk_cleanup_queue(dd->queue);
 
 block_queue_alloc_init_error:
+	mtip_hw_debugfs_exit(dd);
 disk_index_error:
 	spin_lock(&rssd_index_lock);
 	ida_remove(&rssd_index_ida, index);
@@ -4141,40 +4221,48 @@ static int mtip_block_remove(struct driver_data *dd)
 {
 	struct kobject *kobj;
 
-	if (dd->mtip_svc_handler) {
-		set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags);
-		wake_up_interruptible(&dd->port->svc_wait);
-		kthread_stop(dd->mtip_svc_handler);
-	}
+	if (!dd->sr) {
+		mtip_hw_debugfs_exit(dd);
 
-	/* Clean up the sysfs attributes, if created */
-	if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) {
-		kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
-		if (kobj) {
-			mtip_hw_sysfs_exit(dd, kobj);
-			kobject_put(kobj);
+		if (dd->mtip_svc_handler) {
+			set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags);
+			wake_up_interruptible(&dd->port->svc_wait);
+			kthread_stop(dd->mtip_svc_handler);
 		}
+
+		/* Clean up the sysfs attributes, if created */
+		if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) {
+			kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
+			if (kobj) {
+				mtip_hw_sysfs_exit(dd, kobj);
+				kobject_put(kobj);
+			}
+		}
+		/*
+		 * Delete our gendisk structure. This also removes the device
+		 * from /dev
+		 */
+		if (dd->bdev) {
+			bdput(dd->bdev);
+			dd->bdev = NULL;
+		}
+		if (dd->disk) {
+			if (dd->disk->queue) {
+				del_gendisk(dd->disk);
+				blk_cleanup_queue(dd->queue);
+				dd->queue = NULL;
+			} else
+				put_disk(dd->disk);
+		}
+		dd->disk  = NULL;
+
+		spin_lock(&rssd_index_lock);
+		ida_remove(&rssd_index_ida, dd->index);
+		spin_unlock(&rssd_index_lock);
+	} else {
+		dev_info(&dd->pdev->dev, "device %s surprise removal\n",
+						dd->disk->disk_name);
 	}
-	mtip_hw_debugfs_exit(dd);
-
-	/*
-	 * Delete our gendisk structure. This also removes the device
-	 * from /dev
-	 */
-	if (dd->disk) {
-		if (dd->disk->queue)
-			del_gendisk(dd->disk);
-		else
-			put_disk(dd->disk);
-	}
-
-	spin_lock(&rssd_index_lock);
-	ida_remove(&rssd_index_ida, dd->index);
-	spin_unlock(&rssd_index_lock);
-
-	blk_cleanup_queue(dd->queue);
-	dd->disk  = NULL;
-	dd->queue = NULL;
 
 	/* De-initialize the protocol layer. */
 	mtip_hw_exit(dd);
@@ -4490,8 +4578,7 @@ done:
 static void mtip_pci_remove(struct pci_dev *pdev)
 {
 	struct driver_data *dd = pci_get_drvdata(pdev);
-	int counter = 0;
-	unsigned long flags;
+	unsigned long flags, to;
 
 	set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag);
 
@@ -4500,17 +4587,22 @@ static void mtip_pci_remove(struct pci_dev *pdev)
 	list_add(&dd->remove_list, &removing_list);
 	spin_unlock_irqrestore(&dev_lock, flags);
 
-	if (mtip_check_surprise_removal(pdev)) {
-		while (!test_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag)) {
-			counter++;
-			msleep(20);
-			if (counter == 10) {
-				/* Cleanup the outstanding commands */
-				mtip_command_cleanup(dd);
-				break;
-			}
-		}
+	mtip_check_surprise_removal(pdev);
+	synchronize_irq(dd->pdev->irq);
+
+	/* Spin until workers are done */
+	to = jiffies + msecs_to_jiffies(4000);
+	do {
+		msleep(20);
+	} while (atomic_read(&dd->irq_workers_active) != 0 &&
+		time_before(jiffies, to));
+
+	if (atomic_read(&dd->irq_workers_active) != 0) {
+		dev_warn(&dd->pdev->dev,
+			"Completion workers still active!\n");
 	}
+	/* Cleanup the outstanding commands */
+	mtip_command_cleanup(dd);
 
 	/* Clean up the block layer. */
 	mtip_block_remove(dd);
@@ -4529,8 +4621,15 @@ static void mtip_pci_remove(struct pci_dev *pdev)
 	list_del_init(&dd->remove_list);
 	spin_unlock_irqrestore(&dev_lock, flags);
 
-	kfree(dd);
+	if (!dd->sr)
+		kfree(dd);
+	else
+		set_bit(MTIP_DDF_REMOVE_DONE_BIT, &dd->dd_flag);
+
 	pcim_iounmap_regions(pdev, 1 << MTIP_ABAR);
+	pci_set_drvdata(pdev, NULL);
+	pci_dev_put(pdev);
+
 }
 
 /*
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index 3bb8a295fbe4..9be7a1582ad3 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -140,6 +140,7 @@ enum {
 	MTIP_PF_SVC_THD_ACTIVE_BIT  = 4,
 	MTIP_PF_ISSUE_CMDS_BIT      = 5,
 	MTIP_PF_REBUILD_BIT         = 6,
+	MTIP_PF_SR_CLEANUP_BIT      = 7,
 	MTIP_PF_SVC_THD_STOP_BIT    = 8,
 
 	/* below are bit numbers in 'dd_flag' defined in driver_data */
@@ -147,15 +148,18 @@ enum {
 	MTIP_DDF_REMOVE_PENDING_BIT = 1,
 	MTIP_DDF_OVER_TEMP_BIT      = 2,
 	MTIP_DDF_WRITE_PROTECT_BIT  = 3,
-	MTIP_DDF_STOP_IO      = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) |
-				(1 << MTIP_DDF_SEC_LOCK_BIT) |
-				(1 << MTIP_DDF_OVER_TEMP_BIT) |
-				(1 << MTIP_DDF_WRITE_PROTECT_BIT)),
-
+	MTIP_DDF_REMOVE_DONE_BIT    = 4,
 	MTIP_DDF_CLEANUP_BIT        = 5,
 	MTIP_DDF_RESUME_BIT         = 6,
 	MTIP_DDF_INIT_DONE_BIT      = 7,
 	MTIP_DDF_REBUILD_FAILED_BIT = 8,
+
+	MTIP_DDF_STOP_IO      = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) |
+				(1 << MTIP_DDF_SEC_LOCK_BIT) |
+				(1 << MTIP_DDF_OVER_TEMP_BIT) |
+				(1 << MTIP_DDF_WRITE_PROTECT_BIT) |
+				(1 << MTIP_DDF_REBUILD_FAILED_BIT)),
+
 };
 
 struct smart_attr {
@@ -499,6 +503,8 @@ struct driver_data {
 
 	bool trim_supp; /* flag indicating trim support */
 
+	bool sr;
+
 	int numa_node; /* NUMA support */
 
 	char workq_name[32];
@@ -511,6 +517,8 @@ struct driver_data {
 
 	int isr_binding;
 
+	struct block_device *bdev;
+
 	int unal_qdepth; /* qdepth of unaligned IO queue */
 
 	struct list_head online_list; /* linkage for online list */

From c8afd0dcbd14e2352258f2e2d359b36d0edd459f Mon Sep 17 00:00:00 2001
From: David Milburn <dmilburn@redhat.com>
Date: Thu, 23 May 2013 16:23:45 -0500
Subject: [PATCH 09/94] mtip32xx: dynamically allocate buffer in debugfs
 functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Dynamically allocate buf to prevent warnings:

drivers/block/mtip32xx/mtip32xx.c: In function ‘mtip_hw_read_device_status’:
drivers/block/mtip32xx/mtip32xx.c:2823: warning: the frame size of 1056 bytes is larger than 1024 bytes
drivers/block/mtip32xx/mtip32xx.c: In function ‘mtip_hw_read_registers’:
drivers/block/mtip32xx/mtip32xx.c:2894: warning: the frame size of 1056 bytes is larger than 1024 bytes
drivers/block/mtip32xx/mtip32xx.c: In function ‘mtip_hw_read_flags’:
drivers/block/mtip32xx/mtip32xx.c:2917: warning: the frame size of 1056 bytes is larger than 1024 bytes

Signed-off-by: David Milburn <dmilburn@redhat.com>
Acked-by: Asai Thambi S P <asamymuthupa@micron.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/mtip32xx/mtip32xx.c | 47 ++++++++++++++++++++++++-------
 1 file changed, 37 insertions(+), 10 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 76f3bc4f0c21..050c71267f14 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -2809,34 +2809,51 @@ static ssize_t show_device_status(struct device_driver *drv, char *buf)
 static ssize_t mtip_hw_read_device_status(struct file *f, char __user *ubuf,
 						size_t len, loff_t *offset)
 {
+	struct driver_data *dd =  (struct driver_data *)f->private_data;
 	int size = *offset;
-	char buf[MTIP_DFS_MAX_BUF_SIZE];
+	char *buf;
+	int rv = 0;
 
 	if (!len || *offset)
 		return 0;
 
+	buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL);
+	if (!buf) {
+		dev_err(&dd->pdev->dev,
+			"Memory allocation: status buffer\n");
+		return -ENOMEM;
+	}
+
 	size += show_device_status(NULL, buf);
 
 	*offset = size <= len ? size : len;
 	size = copy_to_user(ubuf, buf, *offset);
 	if (size)
-		return -EFAULT;
+		rv = -EFAULT;
 
-	return *offset;
+	kfree(buf);
+	return rv ? rv : *offset;
 }
 
 static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf,
 				  size_t len, loff_t *offset)
 {
 	struct driver_data *dd =  (struct driver_data *)f->private_data;
-	char buf[MTIP_DFS_MAX_BUF_SIZE];
+	char *buf;
 	u32 group_allocated;
 	int size = *offset;
-	int n;
+	int n, rv = 0;
 
 	if (!len || size)
 		return 0;
 
+	buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL);
+	if (!buf) {
+		dev_err(&dd->pdev->dev,
+			"Memory allocation: register buffer\n");
+		return -ENOMEM;
+	}
+
 	size += sprintf(&buf[size], "H/ S ACTive      : [ 0x");
 
 	for (n = dd->slot_groups-1; n >= 0; n--)
@@ -2891,21 +2908,30 @@ static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf,
 	*offset = size <= len ? size : len;
 	size = copy_to_user(ubuf, buf, *offset);
 	if (size)
-		return -EFAULT;
+		rv = -EFAULT;
 
-	return *offset;
+	kfree(buf);
+	return rv ? rv : *offset;
 }
 
 static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf,
 				  size_t len, loff_t *offset)
 {
 	struct driver_data *dd =  (struct driver_data *)f->private_data;
-	char buf[MTIP_DFS_MAX_BUF_SIZE];
+	char *buf;
 	int size = *offset;
+	int rv = 0;
 
 	if (!len || size)
 		return 0;
 
+	buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL);
+	if (!buf) {
+		dev_err(&dd->pdev->dev,
+			"Memory allocation: flag buffer\n");
+		return -ENOMEM;
+	}
+
 	size += sprintf(&buf[size], "Flag-port : [ %08lX ]\n",
 							dd->port->flags);
 	size += sprintf(&buf[size], "Flag-dd   : [ %08lX ]\n",
@@ -2914,9 +2940,10 @@ static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf,
 	*offset = size <= len ? size : len;
 	size = copy_to_user(ubuf, buf, *offset);
 	if (size)
-		return -EFAULT;
+		rv = -EFAULT;
 
-	return *offset;
+	kfree(buf);
+	return rv ? rv : *offset;
 }
 
 static const struct file_operations mtip_device_status_fops = {

From 0317cd6de852a70e0374e7eb40a013072274386f Mon Sep 17 00:00:00 2001
From: Philip J Kelleher <pjk1939@linux.vnet.ibm.com>
Date: Fri, 27 Sep 2013 20:42:50 -0600
Subject: [PATCH 10/94] rsxx: Kernel Panic caused by mapping Discards

This fixes a kernel panic injected by commit id
8d26750143341831bc312f61c5ed141eeb75b8d0 where discards
are getting mapped through the pci_map_page function call.

The driver will now start verifying that a dma is not a
discard before issuing a the pci_map_page function call.

Also, we are updating the driver version.

Signed-off-by: Philip J Kelleher <pjk1939@linux.vnet.ibm.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/rsxx/dma.c       | 41 ++++++++++++++++++----------------
 drivers/block/rsxx/rsxx_priv.h |  2 +-
 2 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c
index 34fd1018c8e5..4103601ae675 100644
--- a/drivers/block/rsxx/dma.c
+++ b/drivers/block/rsxx/dma.c
@@ -434,26 +434,29 @@ static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl)
 			continue;
 		}
 
-		if (dma->cmd == HW_CMD_BLK_WRITE)
-			dir = PCI_DMA_TODEVICE;
-		else
-			dir = PCI_DMA_FROMDEVICE;
+		if (dma->cmd != HW_CMD_BLK_DISCARD) {
+			if (dma->cmd == HW_CMD_BLK_WRITE)
+				dir = PCI_DMA_TODEVICE;
+			else
+				dir = PCI_DMA_FROMDEVICE;
 
-		/*
-		 * The function pci_map_page is placed here because we can
-		 * only, by design, issue up to 255 commands to the hardware
-		 * at one time per DMA channel. So the maximum amount of mapped
-		 * memory would be 255 * 4 channels * 4096 Bytes which is less
-		 * than 2GB, the limit of a x8 Non-HWWD PCIe slot. This way the
-		 * pci_map_page function should never fail because of a
-		 * lack of mappable memory.
-		 */
-		dma->dma_addr = pci_map_page(ctrl->card->dev, dma->page,
-				     dma->pg_off, dma->sub_page.cnt << 9, dir);
-		if (pci_dma_mapping_error(ctrl->card->dev, dma->dma_addr)) {
-			push_tracker(ctrl->trackers, tag);
-			rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
-			continue;
+			/*
+			 * The function pci_map_page is placed here because we
+			 * can only, by design, issue up to 255 commands to the
+			 * hardware at one time per DMA channel. So the maximum
+			 * amount of mapped memory would be 255 * 4 channels *
+			 * 4096 Bytes which is less than 2GB, the limit of a x8
+			 * Non-HWWD PCIe slot. This way the pci_map_page
+			 * function should never fail because of a lack of
+			 * mappable memory.
+			 */
+			dma->dma_addr = pci_map_page(ctrl->card->dev, dma->page,
+					dma->pg_off, dma->sub_page.cnt << 9, dir);
+			if (pci_dma_mapping_error(ctrl->card->dev, dma->dma_addr)) {
+				push_tracker(ctrl->trackers, tag);
+				rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
+				continue;
+			}
 		}
 
 		set_tracker_dma(ctrl->trackers, tag, dma);
diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h
index 82779058e8ec..913740e53d31 100644
--- a/drivers/block/rsxx/rsxx_priv.h
+++ b/drivers/block/rsxx/rsxx_priv.h
@@ -52,7 +52,7 @@ struct proc_cmd;
 #define RS70_PCI_REV_SUPPORTED	4
 
 #define DRIVER_NAME "rsxx"
-#define DRIVER_VERSION "4.0"
+#define DRIVER_VERSION "4.0.1.2498"
 
 /* Block size is 4096 */
 #define RSXX_HW_BLK_SHIFT		12

From e67f86b31ae5be8a88bec27b5ecb18dc2ffc9c56 Mon Sep 17 00:00:00 2001
From: Akhil Bhansali <abhansali@stec-inc.com>
Date: Tue, 15 Oct 2013 14:19:07 -0600
Subject: [PATCH 11/94] Add support for sTec's pci-e flash card Kronos

Signed-off-by: Akhil Bhansali <abhansali@stec-inc.com>
Signed-off-by: Ramprasad Chinthekindi <rchinthekindi@stec-inc.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>

Folded patch, contributions to clean up this driver from:

Jens Axboe
Dan Carpenter
Andrew Morton

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/Kconfig     |   10 +
 drivers/block/Makefile    |    2 +
 drivers/block/skd_main.c  | 5817 +++++++++++++++++++++++++++++++++++++
 drivers/block/skd_s1120.h |  354 +++
 4 files changed, 6183 insertions(+)
 create mode 100644 drivers/block/skd_main.c
 create mode 100644 drivers/block/skd_s1120.h

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index e07a5fd58ad7..555aed0b50dd 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -316,6 +316,16 @@ config BLK_DEV_NVME
 	  To compile this driver as a module, choose M here: the
 	  module will be called nvme.
 
+config BLK_DEV_SKD
+	tristate "STEC S1120 Block Driver"
+	depends on PCI
+	depends on 64BIT
+	---help---
+	Saying Y or M here will enable support for the
+	STEC, Inc. S1120 PCIe SSD.
+
+	Use device /dev/skd$N amd /dev/skd$Np$M.
+
 config BLK_DEV_OSD
 	tristate "OSD object-as-blkdev support"
 	depends on SCSI_OSD_ULD
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index ca07399a8d99..f33b36694289 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_CDROM_PKTCDVD)	+= pktcdvd.o
 obj-$(CONFIG_MG_DISK)		+= mg_disk.o
 obj-$(CONFIG_SUNVDC)		+= sunvdc.o
 obj-$(CONFIG_BLK_DEV_NVME)	+= nvme.o
+obj-$(CONFIG_BLK_DEV_SKD)	+= skd.o
 obj-$(CONFIG_BLK_DEV_OSD)	+= osdblk.o
 
 obj-$(CONFIG_BLK_DEV_UMEM)	+= umem.o
@@ -43,4 +44,5 @@ obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX)	+= mtip32xx/
 obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
 
 nvme-y		:= nvme-core.o nvme-scsi.o
+skd-y		:= skd_main.o
 swim_mod-y	:= swim.o swim_asm.o
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
new file mode 100644
index 000000000000..3110f68ecedd
--- /dev/null
+++ b/drivers/block/skd_main.c
@@ -0,0 +1,5817 @@
+/* Copyright 2012 STEC, Inc.
+ *
+ * This file is licensed under the terms of the 3-clause
+ * BSD License (http://opensource.org/licenses/BSD-3-Clause)
+ * or the GNU GPL-2.0 (http://www.gnu.org/licenses/gpl-2.0.html),
+ * at your option. Both licenses are also available in the LICENSE file
+ * distributed with this project. This file may not be copied, modified,
+ * or distributed except in accordance with those terms.
+ * Gordoni Waidhofer <gwaidhofer@stec-inc.com>
+ * Initial Driver Design!
+ * Thomas Swann <tswann@stec-inc.com>
+ * Interrupt handling.
+ * Ramprasad Chinthekindi <rchinthekindi@stec-inc.com>
+ * biomode implementation.
+ * Akhil Bhansali <abhansali@stec-inc.com>
+ * Added support for DISCARD / FLUSH and FUA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/compiler.h>
+#include <linux/workqueue.h>
+#include <linux/bitops.h>
+#include <linux/delay.h>
+#include <linux/time.h>
+#include <linux/hdreg.h>
+#include <linux/dma-mapping.h>
+#include <linux/completion.h>
+#include <linux/scatterlist.h>
+#include <linux/version.h>
+#include <linux/err.h>
+#include <linux/scatterlist.h>
+#include <linux/aer.h>
+#include <linux/ctype.h>
+#include <linux/wait.h>
+#include <linux/uio.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_tcq.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/sg.h>
+#include <linux/io.h>
+#include <linux/uaccess.h>
+#include <asm-generic/unaligned.h>
+
+#include "skd_s1120.h"
+
+static int skd_dbg_level;
+static int skd_isr_comp_limit = 4;
+
+enum {
+	STEC_LINK_2_5GTS = 0,
+	STEC_LINK_5GTS = 1,
+	STEC_LINK_8GTS = 2,
+	STEC_LINK_UNKNOWN = 0xFF
+};
+
+enum {
+	SKD_FLUSH_INITIALIZER,
+	SKD_FLUSH_ZERO_SIZE_FIRST,
+	SKD_FLUSH_DATA_SECOND,
+};
+
+#define DPRINTK(skdev, fmt, args ...) \
+	do { \
+		if (unlikely((skdev)->dbg_level > 0)) {	\
+			pr_err("%s:%s:%d " fmt, (skdev)->name,	\
+			       __func__, __LINE__, ## args); \
+		} \
+	} while (0)
+
+#define SKD_ASSERT(expr) \
+	do { \
+		if (unlikely(!(expr))) { \
+			pr_err("Assertion failed! %s,%s,%s,line=%d\n",	\
+			       # expr, __FILE__, __func__, __LINE__); \
+		} \
+	} while (0)
+
+#define VPRINTK(skdev, fmt, args ...) \
+	do { \
+		if (unlikely((skdev)->dbg_level > 1)) {	\
+			pr_err("%s:%s:%d " fmt, (skdev)->name,	\
+			       __func__, __LINE__, ## args); \
+		} \
+	} while (0)
+
+
+#define DRV_NAME "skd"
+#define DRV_VERSION "2.2.1"
+#define DRV_BUILD_ID "0260"
+#define PFX DRV_NAME ": "
+#define DRV_BIN_VERSION 0x100
+#define DRV_VER_COMPL   "2.2.1." DRV_BUILD_ID
+
+MODULE_AUTHOR("bug-reports: support@stec-inc.com");
+MODULE_LICENSE("Dual BSD/GPL");
+
+MODULE_DESCRIPTION("STEC s1120 PCIe SSD block/BIO driver (b" DRV_BUILD_ID ")");
+MODULE_VERSION(DRV_VERSION "-" DRV_BUILD_ID);
+
+#define PCI_VENDOR_ID_STEC      0x1B39
+#define PCI_DEVICE_ID_S1120     0x0001
+
+#define SKD_FUA_NV		(1 << 1)
+#define SKD_MINORS_PER_DEVICE   16
+
+#define SKD_MAX_QUEUE_DEPTH     200u
+
+#define SKD_PAUSE_TIMEOUT       (5 * 1000)
+
+#define SKD_N_FITMSG_BYTES      (512u)
+
+#define SKD_N_SPECIAL_CONTEXT   32u
+#define SKD_N_SPECIAL_FITMSG_BYTES      (128u)
+
+/* SG elements are 32 bytes, so we can make this 4096 and still be under the
+ * 128KB limit.  That allows 4096*4K = 16M xfer size
+ */
+#define SKD_N_SG_PER_REQ_DEFAULT 256u
+#define SKD_N_SG_PER_SPECIAL    256u
+
+#define SKD_N_COMPLETION_ENTRY  256u
+#define SKD_N_READ_CAP_BYTES    (8u)
+
+#define SKD_N_INTERNAL_BYTES    (512u)
+
+/* 5 bits of uniqifier, 0xF800 */
+#define SKD_ID_INCR             (0x400)
+#define SKD_ID_TABLE_MASK       (3u << 8u)
+#define  SKD_ID_RW_REQUEST      (0u << 8u)
+#define  SKD_ID_INTERNAL        (1u << 8u)
+#define  SKD_ID_SPECIAL_REQUEST (2u << 8u)
+#define  SKD_ID_FIT_MSG         (3u << 8u)
+#define SKD_ID_SLOT_MASK        0x00FFu
+#define SKD_ID_SLOT_AND_TABLE_MASK 0x03FFu
+
+#define SKD_N_TIMEOUT_SLOT      4u
+#define SKD_TIMEOUT_SLOT_MASK   3u
+
+#define SKD_N_MAX_SECTORS 2048u
+
+#define SKD_MAX_RETRIES 2u
+
+#define SKD_TIMER_SECONDS(seconds) (seconds)
+#define SKD_TIMER_MINUTES(minutes) ((minutes) * (60))
+
+#define INQ_STD_NBYTES 36
+#define SKD_DISCARD_CDB_LENGTH	24
+
+enum skd_drvr_state {
+	SKD_DRVR_STATE_LOAD,
+	SKD_DRVR_STATE_IDLE,
+	SKD_DRVR_STATE_BUSY,
+	SKD_DRVR_STATE_STARTING,
+	SKD_DRVR_STATE_ONLINE,
+	SKD_DRVR_STATE_PAUSING,
+	SKD_DRVR_STATE_PAUSED,
+	SKD_DRVR_STATE_DRAINING_TIMEOUT,
+	SKD_DRVR_STATE_RESTARTING,
+	SKD_DRVR_STATE_RESUMING,
+	SKD_DRVR_STATE_STOPPING,
+	SKD_DRVR_STATE_FAULT,
+	SKD_DRVR_STATE_DISAPPEARED,
+	SKD_DRVR_STATE_PROTOCOL_MISMATCH,
+	SKD_DRVR_STATE_BUSY_ERASE,
+	SKD_DRVR_STATE_BUSY_SANITIZE,
+	SKD_DRVR_STATE_BUSY_IMMINENT,
+	SKD_DRVR_STATE_WAIT_BOOT,
+	SKD_DRVR_STATE_SYNCING,
+};
+
+#define SKD_WAIT_BOOT_TIMO      SKD_TIMER_SECONDS(90u)
+#define SKD_STARTING_TIMO       SKD_TIMER_SECONDS(8u)
+#define SKD_RESTARTING_TIMO     SKD_TIMER_MINUTES(4u)
+#define SKD_DRAINING_TIMO       SKD_TIMER_SECONDS(6u)
+#define SKD_BUSY_TIMO           SKD_TIMER_MINUTES(20u)
+#define SKD_STARTED_BUSY_TIMO   SKD_TIMER_SECONDS(60u)
+#define SKD_START_WAIT_SECONDS  90u
+
+enum skd_req_state {
+	SKD_REQ_STATE_IDLE,
+	SKD_REQ_STATE_SETUP,
+	SKD_REQ_STATE_BUSY,
+	SKD_REQ_STATE_COMPLETED,
+	SKD_REQ_STATE_TIMEOUT,
+	SKD_REQ_STATE_ABORTED,
+};
+
+enum skd_fit_msg_state {
+	SKD_MSG_STATE_IDLE,
+	SKD_MSG_STATE_BUSY,
+};
+
+enum skd_check_status_action {
+	SKD_CHECK_STATUS_REPORT_GOOD,
+	SKD_CHECK_STATUS_REPORT_SMART_ALERT,
+	SKD_CHECK_STATUS_REQUEUE_REQUEST,
+	SKD_CHECK_STATUS_REPORT_ERROR,
+	SKD_CHECK_STATUS_BUSY_IMMINENT,
+};
+
+struct skd_fitmsg_context {
+	enum skd_fit_msg_state state;
+
+	struct skd_fitmsg_context *next;
+
+	u32 id;
+	u16 outstanding;
+
+	u32 length;
+	u32 offset;
+
+	u8 *msg_buf;
+	dma_addr_t mb_dma_address;
+};
+
+struct skd_request_context {
+	enum skd_req_state state;
+
+	struct skd_request_context *next;
+
+	u16 id;
+	u32 fitmsg_id;
+
+	struct request *req;
+	struct bio *bio;
+	unsigned long start_time;
+	u8 flush_cmd;
+	u8 discard_page;
+
+	u32 timeout_stamp;
+	u8 sg_data_dir;
+	struct scatterlist *sg;
+	u32 n_sg;
+	u32 sg_byte_count;
+
+	struct fit_sg_descriptor *sksg_list;
+	dma_addr_t sksg_dma_address;
+
+	struct fit_completion_entry_v1 completion;
+
+	struct fit_comp_error_info err_info;
+
+};
+#define SKD_DATA_DIR_HOST_TO_CARD       1
+#define SKD_DATA_DIR_CARD_TO_HOST       2
+#define SKD_DATA_DIR_NONE		3	/* especially for DISCARD requests. */
+
+struct skd_special_context {
+	struct skd_request_context req;
+
+	u8 orphaned;
+
+	void *data_buf;
+	dma_addr_t db_dma_address;
+
+	u8 *msg_buf;
+	dma_addr_t mb_dma_address;
+};
+
+struct skd_sg_io {
+	fmode_t mode;
+	void __user *argp;
+
+	struct sg_io_hdr sg;
+
+	u8 cdb[16];
+
+	u32 dxfer_len;
+	u32 iovcnt;
+	struct sg_iovec *iov;
+	struct sg_iovec no_iov_iov;
+
+	struct skd_special_context *skspcl;
+};
+
+typedef enum skd_irq_type {
+	SKD_IRQ_LEGACY,
+	SKD_IRQ_MSI,
+	SKD_IRQ_MSIX
+} skd_irq_type_t;
+
+#define SKD_MAX_BARS                    2
+
+struct skd_device {
+	volatile void __iomem *mem_map[SKD_MAX_BARS];
+	resource_size_t mem_phys[SKD_MAX_BARS];
+	u32 mem_size[SKD_MAX_BARS];
+
+	skd_irq_type_t irq_type;
+	u32 msix_count;
+	struct skd_msix_entry *msix_entries;
+
+	struct pci_dev *pdev;
+	int pcie_error_reporting_is_enabled;
+
+	spinlock_t lock;
+	struct gendisk *disk;
+	struct request_queue *queue;
+	struct device *class_dev;
+	int gendisk_on;
+	int sync_done;
+
+	atomic_t device_count;
+	u32 devno;
+	u32 major;
+	char name[32];
+	char isr_name[30];
+
+	enum skd_drvr_state state;
+	u32 drive_state;
+
+	u32 in_flight;
+	u32 cur_max_queue_depth;
+	u32 queue_low_water_mark;
+	u32 dev_max_queue_depth;
+
+	u32 num_fitmsg_context;
+	u32 num_req_context;
+
+	u32 timeout_slot[SKD_N_TIMEOUT_SLOT];
+	u32 timeout_stamp;
+	struct skd_fitmsg_context *skmsg_free_list;
+	struct skd_fitmsg_context *skmsg_table;
+
+	struct skd_request_context *skreq_free_list;
+	struct skd_request_context *skreq_table;
+
+	struct skd_special_context *skspcl_free_list;
+	struct skd_special_context *skspcl_table;
+
+	struct skd_special_context internal_skspcl;
+	u32 read_cap_blocksize;
+	u32 read_cap_last_lba;
+	int read_cap_is_valid;
+	int inquiry_is_valid;
+	u8 inq_serial_num[13];  /*12 chars plus null term */
+	u8 id_str[80];          /* holds a composite name (pci + sernum) */
+
+	u8 skcomp_cycle;
+	u32 skcomp_ix;
+	struct fit_completion_entry_v1 *skcomp_table;
+	struct fit_comp_error_info *skerr_table;
+	dma_addr_t cq_dma_address;
+
+	wait_queue_head_t waitq;
+
+	struct timer_list timer;
+	u32 timer_countdown;
+	u32 timer_substate;
+
+	int n_special;
+	int sgs_per_request;
+	u32 last_mtd;
+
+	u32 proto_ver;
+
+	int dbg_level;
+	u32 connect_time_stamp;
+	int connect_retries;
+#define SKD_MAX_CONNECT_RETRIES 16
+	u32 drive_jiffies;
+
+	u32 timo_slot;
+
+
+	struct work_struct completion_worker;
+
+	struct bio_list bio_queue;
+	int queue_stopped;
+
+	struct list_head flush_list;
+};
+
+#define SKD_FLUSH_JOB   "skd-flush-jobs"
+struct kmem_cache *skd_flush_slab;
+
+/*
+ * These commands hold "nonzero size FLUSH bios",
+ * which are enqueud in skdev->flush_list during
+ * completion of "zero size FLUSH commands".
+ * It will be active in biomode.
+ */
+struct skd_flush_cmd {
+	void *cmd;
+	struct list_head flist;
+};
+
+#define SKD_WRITEL(DEV, VAL, OFF) skd_reg_write32(DEV, VAL, OFF)
+#define SKD_READL(DEV, OFF)      skd_reg_read32(DEV, OFF)
+#define SKD_WRITEQ(DEV, VAL, OFF) skd_reg_write64(DEV, VAL, OFF)
+
+static inline u32 skd_reg_read32(struct skd_device *skdev, u32 offset)
+{
+	u32 val;
+
+	if (likely(skdev->dbg_level < 2))
+		return readl(skdev->mem_map[1] + offset);
+	else {
+		barrier();
+		val = readl(skdev->mem_map[1] + offset);
+		barrier();
+		VPRINTK(skdev, "offset %x = %x\n", offset, val);
+		return val;
+	}
+
+}
+
+static inline void skd_reg_write32(struct skd_device *skdev, u32 val,
+				   u32 offset)
+{
+	if (likely(skdev->dbg_level < 2)) {
+		writel(val, skdev->mem_map[1] + offset);
+		barrier();
+		readl(skdev->mem_map[1] + offset);
+		barrier();
+	} else {
+		barrier();
+		writel(val, skdev->mem_map[1] + offset);
+		barrier();
+		readl(skdev->mem_map[1] + offset);
+		barrier();
+		VPRINTK(skdev, "offset %x = %x\n", offset, val);
+	}
+}
+
+static inline void skd_reg_write64(struct skd_device *skdev, u64 val,
+				   u32 offset)
+{
+	if (likely(skdev->dbg_level < 2)) {
+		writeq(val, skdev->mem_map[1] + offset);
+		barrier();
+		readq(skdev->mem_map[1] + offset);
+		barrier();
+	} else {
+		barrier();
+		writeq(val, skdev->mem_map[1] + offset);
+		barrier();
+		readq(skdev->mem_map[1] + offset);
+		barrier();
+		VPRINTK(skdev, "offset %x = %016llx\n", offset, val);
+	}
+}
+
+
+#define SKD_IRQ_DEFAULT SKD_IRQ_MSI
+static int skd_isr_type = SKD_IRQ_DEFAULT;
+
+module_param(skd_isr_type, int, 0444);
+MODULE_PARM_DESC(skd_isr_type, "Interrupt type capability."
+		 " (0==legacy, 1==MSI, 2==MSI-X, default==1)");
+
+#define SKD_MAX_REQ_PER_MSG_DEFAULT 1
+static int skd_max_req_per_msg = SKD_MAX_REQ_PER_MSG_DEFAULT;
+
+module_param(skd_max_req_per_msg, int, 0444);
+MODULE_PARM_DESC(skd_max_req_per_msg,
+		 "Maximum SCSI requests packed in a single message."
+		 " (1-14, default==1)");
+
+#define SKD_MAX_QUEUE_DEPTH_DEFAULT 64
+#define SKD_MAX_QUEUE_DEPTH_DEFAULT_STR "64"
+static int skd_max_queue_depth = SKD_MAX_QUEUE_DEPTH_DEFAULT;
+
+module_param(skd_max_queue_depth, int, 0444);
+MODULE_PARM_DESC(skd_max_queue_depth,
+		 "Maximum SCSI requests issued to s1120."
+		 " (1-200, default==" SKD_MAX_QUEUE_DEPTH_DEFAULT_STR ")");
+
+static int skd_sgs_per_request = SKD_N_SG_PER_REQ_DEFAULT;
+module_param(skd_sgs_per_request, int, 0444);
+MODULE_PARM_DESC(skd_sgs_per_request,
+		 "Maximum SG elements per block request."
+		 " (1-4096, default==256)");
+
+static int skd_max_pass_thru = SKD_N_SPECIAL_CONTEXT;
+module_param(skd_max_pass_thru, int, 0444);
+MODULE_PARM_DESC(skd_max_pass_thru,
+		 "Maximum SCSI pass-thru at a time." " (1-50, default==32)");
+
+module_param(skd_dbg_level, int, 0444);
+MODULE_PARM_DESC(skd_dbg_level, "s1120 debug level (0,1,2)");
+
+module_param(skd_isr_comp_limit, int, 0444);
+MODULE_PARM_DESC(skd_isr_comp_limit, "s1120 isr comp limit (0=none) default=4");
+
+static int skd_bio;
+module_param(skd_bio, int, 0444);
+MODULE_PARM_DESC(skd_bio,
+		 "Register as a bio device instead of block (0, 1) default=0");
+
+/* Major device number dynamically assigned. */
+static u32 skd_major;
+
+static struct skd_device *skd_construct(struct pci_dev *pdev);
+static void skd_destruct(struct skd_device *skdev);
+static const struct block_device_operations skd_blockdev_ops;
+static void skd_send_fitmsg(struct skd_device *skdev,
+			    struct skd_fitmsg_context *skmsg);
+static void skd_send_special_fitmsg(struct skd_device *skdev,
+				    struct skd_special_context *skspcl);
+static void skd_request_fn(struct request_queue *rq);
+static void skd_end_request(struct skd_device *skdev,
+			    struct skd_request_context *skreq, int error);
+static int skd_preop_sg_list(struct skd_device *skdev,
+			     struct skd_request_context *skreq);
+static void skd_postop_sg_list(struct skd_device *skdev,
+			       struct skd_request_context *skreq);
+
+static void skd_restart_device(struct skd_device *skdev);
+static int skd_quiesce_dev(struct skd_device *skdev);
+static int skd_unquiesce_dev(struct skd_device *skdev);
+static void skd_release_special(struct skd_device *skdev,
+				struct skd_special_context *skspcl);
+static void skd_disable_interrupts(struct skd_device *skdev);
+static void skd_isr_fwstate(struct skd_device *skdev);
+static void skd_recover_requests(struct skd_device *skdev, int requeue);
+static void skd_soft_reset(struct skd_device *skdev);
+
+static const char *skd_name(struct skd_device *skdev);
+const char *skd_drive_state_to_str(int state);
+const char *skd_skdev_state_to_str(enum skd_drvr_state state);
+static void skd_log_skdev(struct skd_device *skdev, const char *event);
+static void skd_log_skmsg(struct skd_device *skdev,
+			  struct skd_fitmsg_context *skmsg, const char *event);
+static void skd_log_skreq(struct skd_device *skdev,
+			  struct skd_request_context *skreq, const char *event);
+
+/* FLUSH FUA flag handling. */
+static int skd_flush_cmd_enqueue(struct skd_device *, void *);
+static void *skd_flush_cmd_dequeue(struct skd_device *);
+
+
+/*
+ *****************************************************************************
+ * READ/WRITE REQUESTS
+ *****************************************************************************
+ */
+static void skd_stop_queue(struct skd_device *skdev)
+{
+	if (!skd_bio)
+		blk_stop_queue(skdev->queue);
+	else
+		skdev->queue_stopped = 1;
+}
+
+static void skd_unstop_queue(struct skd_device *skdev)
+{
+	if (!skd_bio)
+		queue_flag_clear(QUEUE_FLAG_STOPPED, skdev->queue);
+	else
+		skdev->queue_stopped = 0;
+}
+
+static void skd_start_queue(struct skd_device *skdev)
+{
+	if (!skd_bio) {
+		blk_start_queue(skdev->queue);
+	} else {
+		pr_err("(%s): Starting queue\n", skd_name(skdev));
+		skdev->queue_stopped = 0;
+		skd_request_fn(skdev->queue);
+	}
+}
+
+static int skd_queue_stopped(struct skd_device *skdev)
+{
+	if (!skd_bio)
+		return blk_queue_stopped(skdev->queue);
+	else
+		return skdev->queue_stopped;
+}
+
+static void skd_fail_all_pending_blk(struct skd_device *skdev)
+{
+	struct request_queue *q = skdev->queue;
+	struct request *req;
+
+	for (;; ) {
+		req = blk_peek_request(q);
+		if (req == NULL)
+			break;
+		blk_start_request(req);
+		__blk_end_request_all(req, -EIO);
+	}
+}
+
+static void skd_fail_all_pending_bio(struct skd_device *skdev)
+{
+	struct bio *bio;
+	int error = -EIO;
+
+	for (;; ) {
+		bio = bio_list_pop(&skdev->bio_queue);
+
+		if (bio == NULL)
+			break;
+
+		bio_endio(bio, error);
+	}
+}
+
+static void skd_fail_all_pending(struct skd_device *skdev)
+{
+	if (!skd_bio)
+		skd_fail_all_pending_blk(skdev);
+	else
+		skd_fail_all_pending_bio(skdev);
+}
+
+static void skd_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct skd_device *skdev = q->queuedata;
+	unsigned long flags;
+
+	spin_lock_irqsave(&skdev->lock, flags);
+
+	bio_list_add(&skdev->bio_queue, bio);
+	skd_request_fn(skdev->queue);
+
+	spin_unlock_irqrestore(&skdev->lock, flags);
+}
+
+static void
+skd_prep_rw_cdb(struct skd_scsi_request *scsi_req,
+		int data_dir, unsigned lba,
+		unsigned count)
+{
+	if (data_dir == READ)
+		scsi_req->cdb[0] = 0x28;
+	else
+		scsi_req->cdb[0] = 0x2a;
+
+	scsi_req->cdb[1] = 0;
+	scsi_req->cdb[2] = (lba & 0xff000000) >> 24;
+	scsi_req->cdb[3] = (lba & 0xff0000) >> 16;
+	scsi_req->cdb[4] = (lba & 0xff00) >> 8;
+	scsi_req->cdb[5] = (lba & 0xff);
+	scsi_req->cdb[6] = 0;
+	scsi_req->cdb[7] = (count & 0xff00) >> 8;
+	scsi_req->cdb[8] = count & 0xff;
+	scsi_req->cdb[9] = 0;
+}
+
+static void
+skd_prep_zerosize_flush_cdb(struct skd_scsi_request *scsi_req,
+			struct skd_request_context *skreq)
+{
+	skreq->flush_cmd = 1;
+
+	scsi_req->cdb[0] = 0x35;
+	scsi_req->cdb[1] = 0;
+	scsi_req->cdb[2] = 0;
+	scsi_req->cdb[3] = 0;
+	scsi_req->cdb[4] = 0;
+	scsi_req->cdb[5] = 0;
+	scsi_req->cdb[6] = 0;
+	scsi_req->cdb[7] = 0;
+	scsi_req->cdb[8] = 0;
+	scsi_req->cdb[9] = 0;
+}
+
+static void
+skd_prep_discard_cdb(struct skd_scsi_request *scsi_req,
+			struct skd_request_context *skreq,
+			struct page *page,
+			u32 lba, u32 count)
+{
+	char *buf;
+	unsigned long len;
+	struct request *req;
+
+	buf = page_address(page);
+	len = SKD_DISCARD_CDB_LENGTH;
+
+	scsi_req->cdb[0] = UNMAP;
+	scsi_req->cdb[8] = len;
+
+	put_unaligned_be16(6 + 16, &buf[0]);
+	put_unaligned_be16(16, &buf[2]);
+	put_unaligned_be64(lba, &buf[8]);
+	put_unaligned_be32(count, &buf[16]);
+
+	if (!skd_bio) {
+		req = skreq->req;
+		blk_add_request_payload(req, page, len);
+		req->buffer = buf;
+	} else {
+		skreq->bio->bi_io_vec->bv_page = page;
+		skreq->bio->bi_io_vec->bv_offset = 0;
+		skreq->bio->bi_io_vec->bv_len = len;
+
+		skreq->bio->bi_vcnt = 1;
+		skreq->bio->bi_phys_segments = 1;
+	}
+}
+
+static void skd_request_fn_not_online(struct request_queue *q);
+
+static void skd_request_fn(struct request_queue *q)
+{
+	struct skd_device *skdev = q->queuedata;
+	struct skd_fitmsg_context *skmsg = NULL;
+	struct fit_msg_hdr *fmh = NULL;
+	struct skd_request_context *skreq;
+	struct request *req = NULL;
+	struct bio *bio = NULL;
+	struct skd_scsi_request *scsi_req;
+	struct page *page;
+	unsigned long io_flags;
+	int error;
+	u32 lba;
+	u32 count;
+	int data_dir;
+	u32 be_lba;
+	u32 be_count;
+	u64 be_dmaa;
+	u64 cmdctxt;
+	u32 timo_slot;
+	void *cmd_ptr;
+	int flush, fua;
+
+	if (skdev->state != SKD_DRVR_STATE_ONLINE) {
+		skd_request_fn_not_online(q);
+		return;
+	}
+
+	if (skd_queue_stopped(skdev)) {
+		if (skdev->skmsg_free_list == NULL ||
+		    skdev->skreq_free_list == NULL ||
+		    skdev->in_flight >= skdev->queue_low_water_mark)
+			/* There is still some kind of shortage */
+			return;
+
+		skd_unstop_queue(skdev);
+	}
+
+	/*
+	 * Stop conditions:
+	 *  - There are no more native requests
+	 *  - There are already the maximum number of requests in progress
+	 *  - There are no more skd_request_context entries
+	 *  - There are no more FIT msg buffers
+	 */
+	for (;; ) {
+
+		flush = fua = 0;
+
+		if (!skd_bio) {
+			req = blk_peek_request(q);
+
+			/* Are there any native requests to start? */
+			if (req == NULL)
+				break;
+
+			lba = (u32)blk_rq_pos(req);
+			count = blk_rq_sectors(req);
+			data_dir = rq_data_dir(req);
+			io_flags = req->cmd_flags;
+
+			if (io_flags & REQ_FLUSH)
+				flush++;
+
+			if (io_flags & REQ_FUA)
+				fua++;
+
+			VPRINTK(skdev,
+				"new req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n",
+				req, lba, lba, count, count, data_dir);
+		} else {
+			if (!list_empty(&skdev->flush_list)) {
+				/* Process data part of FLUSH request. */
+				bio = (struct bio *)skd_flush_cmd_dequeue(skdev);
+				flush++;
+				VPRINTK(skdev, "processing FLUSH request with data.\n");
+			} else {
+				/* peek at our bio queue */
+				bio = bio_list_peek(&skdev->bio_queue);
+			}
+
+			/* Are there any native requests to start? */
+			if (bio == NULL)
+				break;
+
+			lba = (u32)bio->bi_sector;
+			count = bio_sectors(bio);
+			data_dir = bio_data_dir(bio);
+			io_flags = bio->bi_rw;
+
+			VPRINTK(skdev,
+				"new bio=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n",
+				bio, lba, lba, count, count, data_dir);
+
+			if (io_flags & REQ_FLUSH)
+				flush++;
+
+			if (io_flags & REQ_FUA)
+				fua++;
+		}
+
+		/* At this point we know there is a request
+		 * (from our bio q or req q depending on the way
+		 * the driver is built do checks for resources.
+		 */
+
+		/* Are too many requets already in progress? */
+		if (skdev->in_flight >= skdev->cur_max_queue_depth) {
+			VPRINTK(skdev, "qdepth %d, limit %d\n",
+				skdev->in_flight, skdev->cur_max_queue_depth);
+			break;
+		}
+
+		/* Is a skd_request_context available? */
+		skreq = skdev->skreq_free_list;
+		if (skreq == NULL) {
+			VPRINTK(skdev, "Out of req=%p\n", q);
+			break;
+		}
+		SKD_ASSERT(skreq->state == SKD_REQ_STATE_IDLE);
+		SKD_ASSERT((skreq->id & SKD_ID_INCR) == 0);
+
+		/* Now we check to see if we can get a fit msg */
+		if (skmsg == NULL) {
+			if (skdev->skmsg_free_list == NULL) {
+				VPRINTK(skdev, "Out of msg\n");
+				break;
+			}
+		}
+
+		skreq->flush_cmd = 0;
+		skreq->n_sg = 0;
+		skreq->sg_byte_count = 0;
+		skreq->discard_page = 0;
+
+		/*
+		 * OK to now dequeue request from either bio or q.
+		 *
+		 * At this point we are comitted to either start or reject
+		 * the native request. Note that skd_request_context is
+		 * available but is still at the head of the free list.
+		 */
+		if (!skd_bio) {
+			blk_start_request(req);
+			skreq->req = req;
+			skreq->fitmsg_id = 0;
+		} else {
+			if (unlikely(flush == SKD_FLUSH_DATA_SECOND)) {
+				skreq->bio = bio;
+			} else {
+				skreq->bio = bio_list_pop(&skdev->bio_queue);
+				SKD_ASSERT(skreq->bio == bio);
+				skreq->start_time = jiffies;
+				part_inc_in_flight(&skdev->disk->part0,
+						   bio_data_dir(bio));
+			}
+
+			skreq->fitmsg_id = 0;
+		}
+
+		/* Either a FIT msg is in progress or we have to start one. */
+		if (skmsg == NULL) {
+			/* Are there any FIT msg buffers available? */
+			skmsg = skdev->skmsg_free_list;
+			if (skmsg == NULL) {
+				VPRINTK(skdev, "Out of msg skdev=%p\n", skdev);
+				break;
+			}
+			SKD_ASSERT(skmsg->state == SKD_MSG_STATE_IDLE);
+			SKD_ASSERT((skmsg->id & SKD_ID_INCR) == 0);
+
+			skdev->skmsg_free_list = skmsg->next;
+
+			skmsg->state = SKD_MSG_STATE_BUSY;
+			skmsg->id += SKD_ID_INCR;
+
+			/* Initialize the FIT msg header */
+			fmh = (struct fit_msg_hdr *)skmsg->msg_buf;
+			memset(fmh, 0, sizeof(*fmh));
+			fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT;
+			skmsg->length = sizeof(*fmh);
+		}
+
+		skreq->fitmsg_id = skmsg->id;
+
+		/*
+		 * Note that a FIT msg may have just been started
+		 * but contains no SoFIT requests yet.
+		 */
+
+		/*
+		 * Transcode the request, checking as we go. The outcome of
+		 * the transcoding is represented by the error variable.
+		 */
+		cmd_ptr = &skmsg->msg_buf[skmsg->length];
+		memset(cmd_ptr, 0, 32);
+
+		be_lba = cpu_to_be32(lba);
+		be_count = cpu_to_be32(count);
+		be_dmaa = cpu_to_be64((u64)skreq->sksg_dma_address);
+		cmdctxt = skreq->id + SKD_ID_INCR;
+
+		scsi_req = cmd_ptr;
+		scsi_req->hdr.tag = cmdctxt;
+		scsi_req->hdr.sg_list_dma_address = be_dmaa;
+
+		if (data_dir == READ)
+			skreq->sg_data_dir = SKD_DATA_DIR_CARD_TO_HOST;
+		else
+			skreq->sg_data_dir = SKD_DATA_DIR_HOST_TO_CARD;
+
+		if (io_flags & REQ_DISCARD) {
+			page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
+			if (!page) {
+				pr_err("request_fn:Page allocation failed.\n");
+				skd_end_request(skdev, skreq, -ENOMEM);
+				break;
+			}
+			skreq->discard_page = 1;
+			skd_prep_discard_cdb(scsi_req, skreq, page, lba, count);
+
+		} else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) {
+			skd_prep_zerosize_flush_cdb(scsi_req, skreq);
+			SKD_ASSERT(skreq->flush_cmd == 1);
+
+		} else {
+			skd_prep_rw_cdb(scsi_req, data_dir, lba, count);
+		}
+
+		if (fua)
+			scsi_req->cdb[1] |= SKD_FUA_NV;
+
+		if ((!skd_bio && !req->bio) ||
+			(skd_bio && flush == SKD_FLUSH_ZERO_SIZE_FIRST))
+			goto skip_sg;
+
+		error = skd_preop_sg_list(skdev, skreq);
+
+		if (error != 0) {
+			/*
+			 * Complete the native request with error.
+			 * Note that the request context is still at the
+			 * head of the free list, and that the SoFIT request
+			 * was encoded into the FIT msg buffer but the FIT
+			 * msg length has not been updated. In short, the
+			 * only resource that has been allocated but might
+			 * not be used is that the FIT msg could be empty.
+			 */
+			DPRINTK(skdev, "error Out\n");
+			skd_end_request(skdev, skreq, error);
+			continue;
+		}
+
+skip_sg:
+		scsi_req->hdr.sg_list_len_bytes =
+			cpu_to_be32(skreq->sg_byte_count);
+
+		/* Complete resource allocations. */
+		skdev->skreq_free_list = skreq->next;
+		skreq->state = SKD_REQ_STATE_BUSY;
+		skreq->id += SKD_ID_INCR;
+
+		skmsg->length += sizeof(struct skd_scsi_request);
+		fmh->num_protocol_cmds_coalesced++;
+
+		/*
+		 * Update the active request counts.
+		 * Capture the timeout timestamp.
+		 */
+		skreq->timeout_stamp = skdev->timeout_stamp;
+		timo_slot = skreq->timeout_stamp & SKD_TIMEOUT_SLOT_MASK;
+		skdev->timeout_slot[timo_slot]++;
+		skdev->in_flight++;
+		VPRINTK(skdev, "req=0x%x busy=%d\n",
+			skreq->id, skdev->in_flight);
+
+		/*
+		 * If the FIT msg buffer is full send it.
+		 */
+		if (skmsg->length >= SKD_N_FITMSG_BYTES ||
+		    fmh->num_protocol_cmds_coalesced >= skd_max_req_per_msg) {
+			skd_send_fitmsg(skdev, skmsg);
+			skmsg = NULL;
+			fmh = NULL;
+		}
+	}
+
+	/*
+	 * Is a FIT msg in progress? If it is empty put the buffer back
+	 * on the free list. If it is non-empty send what we got.
+	 * This minimizes latency when there are fewer requests than
+	 * what fits in a FIT msg.
+	 */
+	if (skmsg != NULL) {
+		/* Bigger than just a FIT msg header? */
+		if (skmsg->length > sizeof(struct fit_msg_hdr)) {
+			VPRINTK(skdev, "sending msg=%p, len %d\n",
+				skmsg, skmsg->length);
+			skd_send_fitmsg(skdev, skmsg);
+		} else {
+			/*
+			 * The FIT msg is empty. It means we got started
+			 * on the msg, but the requests were rejected.
+			 */
+			skmsg->state = SKD_MSG_STATE_IDLE;
+			skmsg->id += SKD_ID_INCR;
+			skmsg->next = skdev->skmsg_free_list;
+			skdev->skmsg_free_list = skmsg;
+		}
+		skmsg = NULL;
+		fmh = NULL;
+	}
+
+	/*
+	 * If req is non-NULL it means there is something to do but
+	 * we are out of a resource.
+	 */
+	if (((!skd_bio) && req) ||
+	    ((skd_bio) && bio_list_peek(&skdev->bio_queue)))
+		skd_stop_queue(skdev);
+}
+
+static void skd_end_request_blk(struct skd_device *skdev,
+				struct skd_request_context *skreq, int error)
+{
+	struct request *req = skreq->req;
+	unsigned int io_flags = req->cmd_flags;
+
+	if ((io_flags & REQ_DISCARD) &&
+		(skreq->discard_page == 1)) {
+		VPRINTK(skdev, "skd_end_request_blk, free the page!");
+		free_page((unsigned long)req->buffer);
+		req->buffer = NULL;
+	}
+
+	if (unlikely(error)) {
+		struct request *req = skreq->req;
+		char *cmd = (rq_data_dir(req) == READ) ? "read" : "write";
+		u32 lba = (u32)blk_rq_pos(req);
+		u32 count = blk_rq_sectors(req);
+
+		pr_err("(%s): Error cmd=%s sect=%u count=%u id=0x%x\n",
+		       skd_name(skdev), cmd, lba, count, skreq->id);
+	} else
+		VPRINTK(skdev, "id=0x%x error=%d\n", skreq->id, error);
+
+	__blk_end_request_all(skreq->req, error);
+}
+
+static int skd_preop_sg_list_blk(struct skd_device *skdev,
+				 struct skd_request_context *skreq)
+{
+	struct request *req = skreq->req;
+	int writing = skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD;
+	int pci_dir = writing ? PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE;
+	struct scatterlist *sg = &skreq->sg[0];
+	int n_sg;
+	int i;
+
+	skreq->sg_byte_count = 0;
+
+	/* SKD_ASSERT(skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD ||
+		   skreq->sg_data_dir == SKD_DATA_DIR_CARD_TO_HOST); */
+
+	n_sg = blk_rq_map_sg(skdev->queue, req, sg);
+	if (n_sg <= 0)
+		return -EINVAL;
+
+	/*
+	 * Map scatterlist to PCI bus addresses.
+	 * Note PCI might change the number of entries.
+	 */
+	n_sg = pci_map_sg(skdev->pdev, sg, n_sg, pci_dir);
+	if (n_sg <= 0)
+		return -EINVAL;
+
+	SKD_ASSERT(n_sg <= skdev->sgs_per_request);
+
+	skreq->n_sg = n_sg;
+
+	for (i = 0; i < n_sg; i++) {
+		struct fit_sg_descriptor *sgd = &skreq->sksg_list[i];
+		u32 cnt = sg_dma_len(&sg[i]);
+		uint64_t dma_addr = sg_dma_address(&sg[i]);
+
+		sgd->control = FIT_SGD_CONTROL_NOT_LAST;
+		sgd->byte_count = cnt;
+		skreq->sg_byte_count += cnt;
+		sgd->host_side_addr = dma_addr;
+		sgd->dev_side_addr = 0;
+	}
+
+	skreq->sksg_list[n_sg - 1].next_desc_ptr = 0LL;
+	skreq->sksg_list[n_sg - 1].control = FIT_SGD_CONTROL_LAST;
+
+	if (unlikely(skdev->dbg_level > 1)) {
+		VPRINTK(skdev, "skreq=%x sksg_list=%p sksg_dma=%llx\n",
+			skreq->id, skreq->sksg_list, skreq->sksg_dma_address);
+		for (i = 0; i < n_sg; i++) {
+			struct fit_sg_descriptor *sgd = &skreq->sksg_list[i];
+			VPRINTK(skdev, "  sg[%d] count=%u ctrl=0x%x "
+				"addr=0x%llx next=0x%llx\n",
+				i, sgd->byte_count, sgd->control,
+				sgd->host_side_addr, sgd->next_desc_ptr);
+		}
+	}
+
+	return 0;
+}
+
+static void skd_postop_sg_list_blk(struct skd_device *skdev,
+				   struct skd_request_context *skreq)
+{
+	int writing = skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD;
+	int pci_dir = writing ? PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE;
+
+	/*
+	 * restore the next ptr for next IO request so we
+	 * don't have to set it every time.
+	 */
+	skreq->sksg_list[skreq->n_sg - 1].next_desc_ptr =
+		skreq->sksg_dma_address +
+		((skreq->n_sg) * sizeof(struct fit_sg_descriptor));
+	pci_unmap_sg(skdev->pdev, &skreq->sg[0], skreq->n_sg, pci_dir);
+}
+
+static void skd_end_request_bio(struct skd_device *skdev,
+				struct skd_request_context *skreq, int error)
+{
+	struct bio *bio = skreq->bio;
+	int rw = bio_data_dir(bio);
+	unsigned long io_flags = bio->bi_rw;
+
+	if ((io_flags & REQ_DISCARD) &&
+		(skreq->discard_page == 1)) {
+		VPRINTK(skdev, "biomode: skd_end_request: freeing DISCARD page.\n");
+		free_page((unsigned long)page_address(bio->bi_io_vec->bv_page));
+	}
+
+	if (unlikely(error)) {
+		u32 lba = (u32)skreq->bio->bi_sector;
+		u32 count = bio_sectors(skreq->bio);
+		char *cmd = (rw == WRITE) ? "write" : "read";
+		pr_err("(%s): Error cmd=%s sect=%u count=%u id=0x%x\n",
+		       skd_name(skdev), cmd, lba, count, skreq->id);
+	}
+	{
+		int cpu = part_stat_lock();
+
+		if (likely(!error)) {
+			part_stat_inc(cpu, &skdev->disk->part0, ios[rw]);
+			part_stat_add(cpu, &skdev->disk->part0, sectors[rw],
+				      bio_sectors(bio));
+		}
+		part_stat_add(cpu, &skdev->disk->part0, ticks[rw],
+			      jiffies - skreq->start_time);
+		part_dec_in_flight(&skdev->disk->part0, rw);
+		part_stat_unlock();
+	}
+
+	VPRINTK(skdev, "id=0x%x error=%d\n", skreq->id, error);
+
+	bio_endio(skreq->bio, error);
+}
+
+static int skd_preop_sg_list_bio(struct skd_device *skdev,
+				 struct skd_request_context *skreq)
+{
+	struct bio *bio = skreq->bio;
+	int writing = skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD;
+	int pci_dir = writing ? PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE;
+	int n_sg;
+	int i;
+	struct bio_vec *vec;
+	struct fit_sg_descriptor *sgd;
+	u64 dma_addr;
+	u32 count;
+	int errs = 0;
+	unsigned int io_flags = 0;
+	io_flags |= bio->bi_rw;
+
+	skreq->sg_byte_count = 0;
+	n_sg = skreq->n_sg = skreq->bio->bi_vcnt;
+
+	if (n_sg <= 0)
+		return -EINVAL;
+
+	if (n_sg > skdev->sgs_per_request) {
+		pr_err("(%s): sg overflow n=%d\n",
+		       skd_name(skdev), n_sg);
+		skreq->n_sg = 0;
+		return -EIO;
+	}
+
+	for (i = 0; i < skreq->n_sg; i++) {
+		vec = bio_iovec_idx(bio, i);
+		dma_addr = pci_map_page(skdev->pdev,
+					vec->bv_page,
+					vec->bv_offset, vec->bv_len, pci_dir);
+		count = vec->bv_len;
+
+		if (count == 0 || count > 64u * 1024u || (count & 3) != 0
+		    || (dma_addr & 3) != 0) {
+			pr_err(
+			       "(%s): Bad sg ix=%d count=%d addr=0x%llx\n",
+			       skd_name(skdev), i, count, dma_addr);
+			errs++;
+		}
+
+		sgd = &skreq->sksg_list[i];
+
+		sgd->control = FIT_SGD_CONTROL_NOT_LAST;
+		sgd->byte_count = vec->bv_len;
+		skreq->sg_byte_count += vec->bv_len;
+		sgd->host_side_addr = dma_addr;
+		sgd->dev_side_addr = 0; /* not used */
+	}
+
+	skreq->sksg_list[n_sg - 1].next_desc_ptr = 0LL;
+	skreq->sksg_list[n_sg - 1].control = FIT_SGD_CONTROL_LAST;
+
+
+	 if (!(io_flags & REQ_DISCARD)) {
+		count = bio_sectors(bio) << 9u;
+		if (count != skreq->sg_byte_count) {
+			pr_err("(%s): mismatch count sg=%d req=%d\n",
+			       skd_name(skdev), skreq->sg_byte_count, count);
+			errs++;
+		}
+	}
+
+	if (unlikely(skdev->dbg_level > 1)) {
+		VPRINTK(skdev, "skreq=%x sksg_list=%p sksg_dma=%llx\n",
+			skreq->id, skreq->sksg_list, skreq->sksg_dma_address);
+		for (i = 0; i < n_sg; i++) {
+			struct fit_sg_descriptor *sgd = &skreq->sksg_list[i];
+			VPRINTK(skdev, "  sg[%d] count=%u ctrl=0x%x "
+				"addr=0x%llx next=0x%llx\n",
+				i, sgd->byte_count, sgd->control,
+				sgd->host_side_addr, sgd->next_desc_ptr);
+		}
+	}
+
+	if (errs != 0) {
+		skd_postop_sg_list(skdev, skreq);
+		skreq->n_sg = 0;
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int skd_preop_sg_list(struct skd_device *skdev,
+			     struct skd_request_context *skreq)
+{
+	if (!skd_bio)
+		return skd_preop_sg_list_blk(skdev, skreq);
+	else
+		return skd_preop_sg_list_bio(skdev, skreq);
+}
+
+static void skd_postop_sg_list_bio(struct skd_device *skdev,
+				   struct skd_request_context *skreq)
+{
+	int writing = skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD;
+	int pci_dir = writing ? PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE;
+	int i;
+	struct fit_sg_descriptor *sgd;
+
+	/*
+	 * restore the next ptr for next IO request so we
+	 * don't have to set it every time.
+	 */
+	skreq->sksg_list[skreq->n_sg - 1].next_desc_ptr =
+		skreq->sksg_dma_address +
+		((skreq->n_sg) * sizeof(struct fit_sg_descriptor));
+
+	for (i = 0; i < skreq->n_sg; i++) {
+		sgd = &skreq->sksg_list[i];
+		pci_unmap_page(skdev->pdev, sgd->host_side_addr,
+			       sgd->byte_count, pci_dir);
+	}
+}
+
+static void skd_postop_sg_list(struct skd_device *skdev,
+			       struct skd_request_context *skreq)
+{
+	if (!skd_bio)
+		skd_postop_sg_list_blk(skdev, skreq);
+	else
+		skd_postop_sg_list_bio(skdev, skreq);
+}
+
+static void skd_end_request(struct skd_device *skdev,
+			    struct skd_request_context *skreq, int error)
+{
+	if (likely(!skd_bio))
+		skd_end_request_blk(skdev, skreq, error);
+	else
+		skd_end_request_bio(skdev, skreq, error);
+}
+
+static void skd_request_fn_not_online(struct request_queue *q)
+{
+	struct skd_device *skdev = q->queuedata;
+	int error;
+
+	SKD_ASSERT(skdev->state != SKD_DRVR_STATE_ONLINE);
+
+	skd_log_skdev(skdev, "req_not_online");
+	switch (skdev->state) {
+	case SKD_DRVR_STATE_PAUSING:
+	case SKD_DRVR_STATE_PAUSED:
+	case SKD_DRVR_STATE_STARTING:
+	case SKD_DRVR_STATE_RESTARTING:
+	case SKD_DRVR_STATE_WAIT_BOOT:
+	/* In case of starting, we haven't started the queue,
+	 * so we can't get here... but requests are
+	 * possibly hanging out waiting for us because we
+	 * reported the dev/skd0 already.  They'll wait
+	 * forever if connect doesn't complete.
+	 * What to do??? delay dev/skd0 ??
+	 */
+	case SKD_DRVR_STATE_BUSY:
+	case SKD_DRVR_STATE_BUSY_IMMINENT:
+	case SKD_DRVR_STATE_BUSY_ERASE:
+	case SKD_DRVR_STATE_DRAINING_TIMEOUT:
+		return;
+
+	case SKD_DRVR_STATE_BUSY_SANITIZE:
+	case SKD_DRVR_STATE_STOPPING:
+	case SKD_DRVR_STATE_SYNCING:
+	case SKD_DRVR_STATE_FAULT:
+	case SKD_DRVR_STATE_DISAPPEARED:
+	default:
+		error = -EIO;
+		break;
+	}
+
+	/* If we get here, terminate all pending block requeusts
+	 * with EIO and any scsi pass thru with appropriate sense
+	 */
+
+	skd_fail_all_pending(skdev);
+}
+
+/*
+ *****************************************************************************
+ * TIMER
+ *****************************************************************************
+ */
+
+static void skd_timer_tick_not_online(struct skd_device *skdev);
+
+static void skd_timer_tick(ulong arg)
+{
+	struct skd_device *skdev = (struct skd_device *)arg;
+
+	u32 timo_slot;
+	u32 overdue_timestamp;
+	unsigned long reqflags;
+	u32 state;
+
+	if (skdev->state == SKD_DRVR_STATE_FAULT)
+		/* The driver has declared fault, and we want it to
+		 * stay that way until driver is reloaded.
+		 */
+		return;
+
+	spin_lock_irqsave(&skdev->lock, reqflags);
+
+	state = SKD_READL(skdev, FIT_STATUS);
+	state &= FIT_SR_DRIVE_STATE_MASK;
+	if (state != skdev->drive_state)
+		skd_isr_fwstate(skdev);
+
+	if (skdev->state != SKD_DRVR_STATE_ONLINE) {
+		skd_timer_tick_not_online(skdev);
+		goto timer_func_out;
+	}
+	skdev->timeout_stamp++;
+	timo_slot = skdev->timeout_stamp & SKD_TIMEOUT_SLOT_MASK;
+
+	/*
+	 * All requests that happened during the previous use of
+	 * this slot should be done by now. The previous use was
+	 * over 7 seconds ago.
+	 */
+	if (skdev->timeout_slot[timo_slot] == 0)
+		goto timer_func_out;
+
+	/* Something is overdue */
+	overdue_timestamp = skdev->timeout_stamp - SKD_N_TIMEOUT_SLOT;
+
+	DPRINTK(skdev, "found %d timeouts, draining busy=%d\n",
+		skdev->timeout_slot[timo_slot], skdev->in_flight);
+	pr_err("(%s): Overdue IOs (%d), busy %d\n",
+	       skd_name(skdev), skdev->timeout_slot[timo_slot],
+	       skdev->in_flight);
+
+	skdev->timer_countdown = SKD_DRAINING_TIMO;
+	skdev->state = SKD_DRVR_STATE_DRAINING_TIMEOUT;
+	skdev->timo_slot = timo_slot;
+	skd_stop_queue(skdev);
+
+timer_func_out:
+	mod_timer(&skdev->timer, (jiffies + HZ));
+
+	spin_unlock_irqrestore(&skdev->lock, reqflags);
+}
+
+static void skd_timer_tick_not_online(struct skd_device *skdev)
+{
+	switch (skdev->state) {
+	case SKD_DRVR_STATE_IDLE:
+	case SKD_DRVR_STATE_LOAD:
+		break;
+	case SKD_DRVR_STATE_BUSY_SANITIZE:
+		VPRINTK(skdev, "drive busy sanitize[%x], driver[%x]\n",
+			skdev->drive_state, skdev->state);
+		/* If we've been in sanitize for 3 seconds, we figure we're not
+		 * going to get anymore completions, so recover requests now
+		 */
+		if (skdev->timer_countdown > 0) {
+			skdev->timer_countdown--;
+			return;
+		}
+		skd_recover_requests(skdev, 0);
+		break;
+
+	case SKD_DRVR_STATE_BUSY:
+	case SKD_DRVR_STATE_BUSY_IMMINENT:
+	case SKD_DRVR_STATE_BUSY_ERASE:
+		VPRINTK(skdev, "busy[%x], countdown=%d\n",
+			skdev->state, skdev->timer_countdown);
+		if (skdev->timer_countdown > 0) {
+			skdev->timer_countdown--;
+			return;
+		}
+		DPRINTK(skdev, "busy[%x], timedout=%d, restarting device.",
+			skdev->state, skdev->timer_countdown);
+		skd_restart_device(skdev);
+		break;
+
+	case SKD_DRVR_STATE_WAIT_BOOT:
+	case SKD_DRVR_STATE_STARTING:
+		if (skdev->timer_countdown > 0) {
+			skdev->timer_countdown--;
+			return;
+		}
+		/* For now, we fault the drive.  Could attempt resets to
+		 * revcover at some point. */
+		skdev->state = SKD_DRVR_STATE_FAULT;
+
+		pr_err("(%s): DriveFault Connect Timeout (%x)\n",
+		       skd_name(skdev), skdev->drive_state);
+
+		/*start the queue so we can respond with error to requests */
+		/* wakeup anyone waiting for startup complete */
+		skd_start_queue(skdev);
+		skdev->gendisk_on = -1;
+		wake_up_interruptible(&skdev->waitq);
+		break;
+
+	case SKD_DRVR_STATE_ONLINE:
+		/* shouldn't get here. */
+		break;
+
+	case SKD_DRVR_STATE_PAUSING:
+	case SKD_DRVR_STATE_PAUSED:
+		break;
+
+	case SKD_DRVR_STATE_DRAINING_TIMEOUT:
+		DPRINTK(skdev,
+			"draining busy [%d] tick[%d] qdb[%d] tmls[%d]\n",
+			skdev->timo_slot,
+			skdev->timer_countdown,
+			skdev->in_flight,
+			skdev->timeout_slot[skdev->timo_slot]);
+		/* if the slot has cleared we can let the I/O continue */
+		if (skdev->timeout_slot[skdev->timo_slot] == 0) {
+			DPRINTK(skdev, "Slot drained, starting queue.\n");
+			skdev->state = SKD_DRVR_STATE_ONLINE;
+			skd_start_queue(skdev);
+			return;
+		}
+		if (skdev->timer_countdown > 0) {
+			skdev->timer_countdown--;
+			return;
+		}
+		skd_restart_device(skdev);
+		break;
+
+	case SKD_DRVR_STATE_RESTARTING:
+		if (skdev->timer_countdown > 0) {
+			skdev->timer_countdown--;
+			return;
+		}
+		/* For now, we fault the drive. Could attempt resets to
+		 * revcover at some point. */
+		skdev->state = SKD_DRVR_STATE_FAULT;
+		pr_err("(%s): DriveFault Reconnect Timeout (%x)\n",
+		       skd_name(skdev), skdev->drive_state);
+
+		/*
+		 * Recovering does two things:
+		 * 1. completes IO with error
+		 * 2. reclaims dma resources
+		 * When is it safe to recover requests?
+		 * - if the drive state is faulted
+		 * - if the state is still soft reset after out timeout
+		 * - if the drive registers are dead (state = FF)
+		 * If it is "unsafe", we still need to recover, so we will
+		 * disable pci bus mastering and disable our interrupts.
+		 */
+
+		if ((skdev->drive_state == FIT_SR_DRIVE_SOFT_RESET) ||
+		    (skdev->drive_state == FIT_SR_DRIVE_FAULT) ||
+		    (skdev->drive_state == FIT_SR_DRIVE_STATE_MASK))
+			/* It never came out of soft reset. Try to
+			 * recover the requests and then let them
+			 * fail. This is to mitigate hung processes. */
+			skd_recover_requests(skdev, 0);
+		else {
+			pr_err("(%s): Disable BusMaster (%x)\n",
+			       skd_name(skdev), skdev->drive_state);
+			pci_disable_device(skdev->pdev);
+			skd_disable_interrupts(skdev);
+			skd_recover_requests(skdev, 0);
+		}
+
+		/*start the queue so we can respond with error to requests */
+		/* wakeup anyone waiting for startup complete */
+		skd_start_queue(skdev);
+		skdev->gendisk_on = -1;
+		wake_up_interruptible(&skdev->waitq);
+		break;
+
+	case SKD_DRVR_STATE_RESUMING:
+	case SKD_DRVR_STATE_STOPPING:
+	case SKD_DRVR_STATE_SYNCING:
+	case SKD_DRVR_STATE_FAULT:
+	case SKD_DRVR_STATE_DISAPPEARED:
+	default:
+		break;
+	}
+}
+
+static int skd_start_timer(struct skd_device *skdev)
+{
+	int rc;
+
+	init_timer(&skdev->timer);
+	setup_timer(&skdev->timer, skd_timer_tick, (ulong)skdev);
+
+	rc = mod_timer(&skdev->timer, (jiffies + HZ));
+	if (rc)
+		pr_err("%s: failed to start timer %d\n",
+		       __func__, rc);
+	return rc;
+}
+
+static void skd_kill_timer(struct skd_device *skdev)
+{
+	del_timer_sync(&skdev->timer);
+}
+
+/*
+ *****************************************************************************
+ * IOCTL
+ *****************************************************************************
+ */
+static int skd_ioctl_sg_io(struct skd_device *skdev,
+			   fmode_t mode, void __user *argp);
+static int skd_sg_io_get_and_check_args(struct skd_device *skdev,
+					struct skd_sg_io *sksgio);
+static int skd_sg_io_obtain_skspcl(struct skd_device *skdev,
+				   struct skd_sg_io *sksgio);
+static int skd_sg_io_prep_buffering(struct skd_device *skdev,
+				    struct skd_sg_io *sksgio);
+static int skd_sg_io_copy_buffer(struct skd_device *skdev,
+				 struct skd_sg_io *sksgio, int dxfer_dir);
+static int skd_sg_io_send_fitmsg(struct skd_device *skdev,
+				 struct skd_sg_io *sksgio);
+static int skd_sg_io_await(struct skd_device *skdev, struct skd_sg_io *sksgio);
+static int skd_sg_io_release_skspcl(struct skd_device *skdev,
+				    struct skd_sg_io *sksgio);
+static int skd_sg_io_put_status(struct skd_device *skdev,
+				struct skd_sg_io *sksgio);
+
+static void skd_complete_special(struct skd_device *skdev,
+				 volatile struct fit_completion_entry_v1
+				 *skcomp,
+				 volatile struct fit_comp_error_info *skerr,
+				 struct skd_special_context *skspcl);
+
+static int skd_bdev_ioctl(struct block_device *bdev, fmode_t mode,
+			  uint cmd_in, ulong arg)
+{
+	int rc = 0;
+	struct gendisk *disk = bdev->bd_disk;
+	struct skd_device *skdev = disk->private_data;
+	void __user *p = (void *)arg;
+
+	DPRINTK(skdev, "%s: CMD[%s] ioctl  mode 0x%x, cmd 0x%x arg %0lx\n",
+		disk->disk_name, current->comm, mode, cmd_in, arg);
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	switch (cmd_in) {
+	case SG_SET_TIMEOUT:
+	case SG_GET_TIMEOUT:
+	case SG_GET_VERSION_NUM:
+		rc = scsi_cmd_ioctl(disk->queue, disk, mode, cmd_in, p);
+		break;
+	case SG_IO:
+		rc = skd_ioctl_sg_io(skdev, mode, p);
+		break;
+
+	default:
+		rc = -ENOTTY;
+		break;
+	}
+
+	DPRINTK(skdev, "%s:  completion rc %d\n", disk->disk_name, rc);
+	return rc;
+}
+
+static int skd_ioctl_sg_io(struct skd_device *skdev, fmode_t mode,
+			   void __user *argp)
+{
+	int rc;
+	struct skd_sg_io sksgio;
+
+	memset(&sksgio, 0, sizeof(sksgio));
+	sksgio.mode = mode;
+	sksgio.argp = argp;
+	sksgio.iov = &sksgio.no_iov_iov;
+
+	switch (skdev->state) {
+	case SKD_DRVR_STATE_ONLINE:
+	case SKD_DRVR_STATE_BUSY_IMMINENT:
+		break;
+
+	default:
+		DPRINTK(skdev, "drive not online\n");
+		rc = -ENXIO;
+		goto out;
+	}
+
+	if ((rc = skd_sg_io_get_and_check_args(skdev, &sksgio)) ||
+	    (rc = skd_sg_io_obtain_skspcl(skdev, &sksgio)) ||
+	    (rc = skd_sg_io_prep_buffering(skdev, &sksgio)) ||
+	    (rc = skd_sg_io_copy_buffer(skdev, &sksgio, SG_DXFER_TO_DEV)))
+		goto out;
+
+	if ((rc = skd_sg_io_send_fitmsg(skdev, &sksgio)) ||
+	    (rc = skd_sg_io_await(skdev, &sksgio)))
+		goto out;
+
+	if ((rc = skd_sg_io_copy_buffer(skdev, &sksgio, SG_DXFER_FROM_DEV)) ||
+	    (rc = skd_sg_io_put_status(skdev, &sksgio)))
+		goto out;
+
+	rc = 0;
+
+out:
+	skd_sg_io_release_skspcl(skdev, &sksgio);
+
+	if (sksgio.iov != NULL && sksgio.iov != &sksgio.no_iov_iov)
+		kfree(sksgio.iov);
+	return rc;
+}
+
+static int skd_sg_io_get_and_check_args(struct skd_device *skdev,
+					struct skd_sg_io *sksgio)
+{
+	struct sg_io_hdr *sgp = &sksgio->sg;
+	int i, acc;
+
+	if (!access_ok(VERIFY_WRITE, sksgio->argp, sizeof(sg_io_hdr_t))) {
+		DPRINTK(skdev, "access sg failed %p\n", sksgio->argp);
+		return -EFAULT;
+	}
+
+	if (__copy_from_user(sgp, sksgio->argp, sizeof(sg_io_hdr_t))) {
+		DPRINTK(skdev, "copy_from_user sg failed %p\n", sksgio->argp);
+		return -EFAULT;
+	}
+
+	if (sgp->interface_id != SG_INTERFACE_ID_ORIG) {
+		DPRINTK(skdev, "interface_id invalid 0x%x\n",
+			sgp->interface_id);
+		return -EINVAL;
+	}
+
+	if (sgp->cmd_len > sizeof(sksgio->cdb)) {
+		DPRINTK(skdev, "cmd_len invalid %d\n", sgp->cmd_len);
+		return -EINVAL;
+	}
+
+	if (sgp->iovec_count > 256) {
+		DPRINTK(skdev, "iovec_count invalid %d\n", sgp->iovec_count);
+		return -EINVAL;
+	}
+
+	if (sgp->dxfer_len > (PAGE_SIZE * SKD_N_SG_PER_SPECIAL)) {
+		DPRINTK(skdev, "dxfer_len invalid %d\n", sgp->dxfer_len);
+		return -EINVAL;
+	}
+
+	switch (sgp->dxfer_direction) {
+	case SG_DXFER_NONE:
+		acc = -1;
+		break;
+
+	case SG_DXFER_TO_DEV:
+		acc = VERIFY_READ;
+		break;
+
+	case SG_DXFER_FROM_DEV:
+	case SG_DXFER_TO_FROM_DEV:
+		acc = VERIFY_WRITE;
+		break;
+
+	default:
+		DPRINTK(skdev, "dxfer_dir invalid %d\n", sgp->dxfer_direction);
+		return -EINVAL;
+	}
+
+	if (copy_from_user(sksgio->cdb, sgp->cmdp, sgp->cmd_len)) {
+		DPRINTK(skdev, "copy_from_user cmdp failed %p\n", sgp->cmdp);
+		return -EFAULT;
+	}
+
+	if (sgp->mx_sb_len != 0) {
+		if (!access_ok(VERIFY_WRITE, sgp->sbp, sgp->mx_sb_len)) {
+			DPRINTK(skdev, "access sbp failed %p\n", sgp->sbp);
+			return -EFAULT;
+		}
+	}
+
+	if (sgp->iovec_count == 0) {
+		sksgio->iov[0].iov_base = sgp->dxferp;
+		sksgio->iov[0].iov_len = sgp->dxfer_len;
+		sksgio->iovcnt = 1;
+		sksgio->dxfer_len = sgp->dxfer_len;
+	} else {
+		struct sg_iovec *iov;
+		uint nbytes = sizeof(*iov) * sgp->iovec_count;
+		size_t iov_data_len;
+
+		iov = kmalloc(nbytes, GFP_KERNEL);
+		if (iov == NULL) {
+			DPRINTK(skdev, "alloc iovec failed %d\n",
+				sgp->iovec_count);
+			return -ENOMEM;
+		}
+		sksgio->iov = iov;
+		sksgio->iovcnt = sgp->iovec_count;
+
+		if (copy_from_user(iov, sgp->dxferp, nbytes)) {
+			DPRINTK(skdev, "copy_from_user iovec failed %p\n",
+				sgp->dxferp);
+			return -EFAULT;
+		}
+
+		/*
+		 * Sum up the vecs, making sure they don't overflow
+		 */
+		iov_data_len = 0;
+		for (i = 0; i < sgp->iovec_count; i++) {
+			if (iov_data_len + iov[i].iov_len < iov_data_len)
+				return -EINVAL;
+			iov_data_len += iov[i].iov_len;
+		}
+
+		/* SG_IO howto says that the shorter of the two wins */
+		if (sgp->dxfer_len < iov_data_len) {
+			sksgio->iovcnt = iov_shorten((struct iovec *)iov,
+						     sgp->iovec_count,
+						     sgp->dxfer_len);
+			sksgio->dxfer_len = sgp->dxfer_len;
+		} else
+			sksgio->dxfer_len = iov_data_len;
+	}
+
+	if (sgp->dxfer_direction != SG_DXFER_NONE) {
+		struct sg_iovec *iov = sksgio->iov;
+		for (i = 0; i < sksgio->iovcnt; i++, iov++) {
+			if (!access_ok(acc, iov->iov_base, iov->iov_len)) {
+				DPRINTK(skdev, "access data failed %p/%d\n",
+					iov->iov_base, (int)iov->iov_len);
+				return -EFAULT;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static int skd_sg_io_obtain_skspcl(struct skd_device *skdev,
+				   struct skd_sg_io *sksgio)
+{
+	struct skd_special_context *skspcl = NULL;
+	int rc;
+
+	for (;; ) {
+		ulong flags;
+
+		spin_lock_irqsave(&skdev->lock, flags);
+		skspcl = skdev->skspcl_free_list;
+		if (skspcl != NULL) {
+			skdev->skspcl_free_list =
+				(struct skd_special_context *)skspcl->req.next;
+			skspcl->req.id += SKD_ID_INCR;
+			skspcl->req.state = SKD_REQ_STATE_SETUP;
+			skspcl->orphaned = 0;
+			skspcl->req.n_sg = 0;
+		}
+		spin_unlock_irqrestore(&skdev->lock, flags);
+
+		if (skspcl != NULL) {
+			rc = 0;
+			break;
+		}
+
+		DPRINTK(skdev, "blocking\n");
+
+		rc = wait_event_interruptible_timeout(
+				skdev->waitq,
+				(skdev->skspcl_free_list != NULL),
+				msecs_to_jiffies(sksgio->sg.timeout));
+
+		DPRINTK(skdev, "unblocking, rc=%d\n", rc);
+
+		if (rc <= 0) {
+			if (rc == 0)
+				rc = -ETIMEDOUT;
+			else
+				rc = -EINTR;
+			break;
+		}
+		/*
+		 * If we get here rc > 0 meaning the timeout to
+		 * wait_event_interruptible_timeout() had time left, hence the
+		 * sought event -- non-empty free list -- happened.
+		 * Retry the allocation.
+		 */
+	}
+	sksgio->skspcl = skspcl;
+
+	return rc;
+}
+
+static int skd_skreq_prep_buffering(struct skd_device *skdev,
+				    struct skd_request_context *skreq,
+				    u32 dxfer_len)
+{
+	u32 resid = dxfer_len;
+
+	/*
+	 * The DMA engine must have aligned addresses and byte counts.
+	 */
+	resid += (-resid) & 3;
+	skreq->sg_byte_count = resid;
+
+	skreq->n_sg = 0;
+
+	while (resid > 0) {
+		u32 nbytes = PAGE_SIZE;
+		u32 ix = skreq->n_sg;
+		struct scatterlist *sg = &skreq->sg[ix];
+		struct fit_sg_descriptor *sksg = &skreq->sksg_list[ix];
+		struct page *page;
+
+		if (nbytes > resid)
+			nbytes = resid;
+
+		page = alloc_page(GFP_KERNEL);
+		if (page == NULL)
+			return -ENOMEM;
+
+		sg_set_page(sg, page, nbytes, 0);
+
+		/* TODO: This should be going through a pci_???()
+		 * routine to do proper mapping. */
+		sksg->control = FIT_SGD_CONTROL_NOT_LAST;
+		sksg->byte_count = nbytes;
+
+		sksg->host_side_addr = sg_phys(sg);
+
+		sksg->dev_side_addr = 0;
+		sksg->next_desc_ptr = skreq->sksg_dma_address +
+				      (ix + 1) * sizeof(*sksg);
+
+		skreq->n_sg++;
+		resid -= nbytes;
+	}
+
+	if (skreq->n_sg > 0) {
+		u32 ix = skreq->n_sg - 1;
+		struct fit_sg_descriptor *sksg = &skreq->sksg_list[ix];
+
+		sksg->control = FIT_SGD_CONTROL_LAST;
+		sksg->next_desc_ptr = 0;
+	}
+
+	if (unlikely(skdev->dbg_level > 1)) {
+		u32 i;
+
+		VPRINTK(skdev, "skreq=%x sksg_list=%p sksg_dma=%llx\n",
+			skreq->id, skreq->sksg_list, skreq->sksg_dma_address);
+		for (i = 0; i < skreq->n_sg; i++) {
+			struct fit_sg_descriptor *sgd = &skreq->sksg_list[i];
+
+			VPRINTK(skdev, "  sg[%d] count=%u ctrl=0x%x "
+				"addr=0x%llx next=0x%llx\n",
+				i, sgd->byte_count, sgd->control,
+				sgd->host_side_addr, sgd->next_desc_ptr);
+		}
+	}
+
+	return 0;
+}
+
+static int skd_sg_io_prep_buffering(struct skd_device *skdev,
+				    struct skd_sg_io *sksgio)
+{
+	struct skd_special_context *skspcl = sksgio->skspcl;
+	struct skd_request_context *skreq = &skspcl->req;
+	u32 dxfer_len = sksgio->dxfer_len;
+	int rc;
+
+	rc = skd_skreq_prep_buffering(skdev, skreq, dxfer_len);
+	/*
+	 * Eventually, errors or not, skd_release_special() is called
+	 * to recover allocations including partial allocations.
+	 */
+	return rc;
+}
+
+static int skd_sg_io_copy_buffer(struct skd_device *skdev,
+				 struct skd_sg_io *sksgio, int dxfer_dir)
+{
+	struct skd_special_context *skspcl = sksgio->skspcl;
+	u32 iov_ix = 0;
+	struct sg_iovec curiov;
+	u32 sksg_ix = 0;
+	u8 *bufp = NULL;
+	u32 buf_len = 0;
+	u32 resid = sksgio->dxfer_len;
+	int rc;
+
+	curiov.iov_len = 0;
+	curiov.iov_base = NULL;
+
+	if (dxfer_dir != sksgio->sg.dxfer_direction) {
+		if (dxfer_dir != SG_DXFER_TO_DEV ||
+		    sksgio->sg.dxfer_direction != SG_DXFER_TO_FROM_DEV)
+			return 0;
+	}
+
+	while (resid > 0) {
+		u32 nbytes = PAGE_SIZE;
+
+		if (curiov.iov_len == 0) {
+			curiov = sksgio->iov[iov_ix++];
+			continue;
+		}
+
+		if (buf_len == 0) {
+			struct page *page;
+			page = sg_page(&skspcl->req.sg[sksg_ix++]);
+			bufp = page_address(page);
+			buf_len = PAGE_SIZE;
+		}
+
+		nbytes = min_t(u32, nbytes, resid);
+		nbytes = min_t(u32, nbytes, curiov.iov_len);
+		nbytes = min_t(u32, nbytes, buf_len);
+
+		if (dxfer_dir == SG_DXFER_TO_DEV)
+			rc = __copy_from_user(bufp, curiov.iov_base, nbytes);
+		else
+			rc = __copy_to_user(curiov.iov_base, bufp, nbytes);
+
+		if (rc)
+			return -EFAULT;
+
+		resid -= nbytes;
+		curiov.iov_len -= nbytes;
+		curiov.iov_base += nbytes;
+		buf_len -= nbytes;
+	}
+
+	return 0;
+}
+
+static int skd_sg_io_send_fitmsg(struct skd_device *skdev,
+				 struct skd_sg_io *sksgio)
+{
+	struct skd_special_context *skspcl = sksgio->skspcl;
+	struct fit_msg_hdr *fmh = (struct fit_msg_hdr *)skspcl->msg_buf;
+	struct skd_scsi_request *scsi_req = (struct skd_scsi_request *)&fmh[1];
+
+	memset(skspcl->msg_buf, 0, SKD_N_SPECIAL_FITMSG_BYTES);
+
+	/* Initialize the FIT msg header */
+	fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT;
+	fmh->num_protocol_cmds_coalesced = 1;
+
+	/* Initialize the SCSI request */
+	if (sksgio->sg.dxfer_direction != SG_DXFER_NONE)
+		scsi_req->hdr.sg_list_dma_address =
+			cpu_to_be64(skspcl->req.sksg_dma_address);
+	scsi_req->hdr.tag = skspcl->req.id;
+	scsi_req->hdr.sg_list_len_bytes =
+		cpu_to_be32(skspcl->req.sg_byte_count);
+	memcpy(scsi_req->cdb, sksgio->cdb, sizeof(scsi_req->cdb));
+
+	skspcl->req.state = SKD_REQ_STATE_BUSY;
+	skd_send_special_fitmsg(skdev, skspcl);
+
+	return 0;
+}
+
+static int skd_sg_io_await(struct skd_device *skdev, struct skd_sg_io *sksgio)
+{
+	unsigned long flags;
+	int rc;
+
+	rc = wait_event_interruptible_timeout(skdev->waitq,
+					      (sksgio->skspcl->req.state !=
+					       SKD_REQ_STATE_BUSY),
+					      msecs_to_jiffies(sksgio->sg.
+							       timeout));
+
+	spin_lock_irqsave(&skdev->lock, flags);
+
+	if (sksgio->skspcl->req.state == SKD_REQ_STATE_ABORTED) {
+		DPRINTK(skdev, "skspcl %p aborted\n", sksgio->skspcl);
+
+		/* Build check cond, sense and let command finish. */
+		/* For a timeout, we must fabricate completion and sense
+		 * data to complete the command */
+		sksgio->skspcl->req.completion.status =
+			SAM_STAT_CHECK_CONDITION;
+
+		memset(&sksgio->skspcl->req.err_info, 0,
+		       sizeof(sksgio->skspcl->req.err_info));
+		sksgio->skspcl->req.err_info.type = 0x70;
+		sksgio->skspcl->req.err_info.key = ABORTED_COMMAND;
+		sksgio->skspcl->req.err_info.code = 0x44;
+		sksgio->skspcl->req.err_info.qual = 0;
+		rc = 0;
+	} else if (sksgio->skspcl->req.state != SKD_REQ_STATE_BUSY)
+		/* No longer on the adapter. We finish. */
+		rc = 0;
+	else {
+		/* Something's gone wrong. Still busy. Timeout or
+		 * user interrupted (control-C). Mark as an orphan
+		 * so it will be disposed when completed. */
+		sksgio->skspcl->orphaned = 1;
+		sksgio->skspcl = NULL;
+		if (rc == 0) {
+			DPRINTK(skdev, "timed out %p (%u ms)\n", sksgio,
+				sksgio->sg.timeout);
+			rc = -ETIMEDOUT;
+		} else {
+			DPRINTK(skdev, "cntlc %p\n", sksgio);
+			rc = -EINTR;
+		}
+	}
+
+	spin_unlock_irqrestore(&skdev->lock, flags);
+
+	return rc;
+}
+
+static int skd_sg_io_put_status(struct skd_device *skdev,
+				struct skd_sg_io *sksgio)
+{
+	struct sg_io_hdr *sgp = &sksgio->sg;
+	struct skd_special_context *skspcl = sksgio->skspcl;
+	int resid = 0;
+
+	u32 nb = be32_to_cpu(skspcl->req.completion.num_returned_bytes);
+
+	sgp->status = skspcl->req.completion.status;
+	resid = sksgio->dxfer_len - nb;
+
+	sgp->masked_status = sgp->status & STATUS_MASK;
+	sgp->msg_status = 0;
+	sgp->host_status = 0;
+	sgp->driver_status = 0;
+	sgp->resid = resid;
+	if (sgp->masked_status || sgp->host_status || sgp->driver_status)
+		sgp->info |= SG_INFO_CHECK;
+
+	DPRINTK(skdev, "status %x masked %x resid 0x%x\n", sgp->status,
+		sgp->masked_status, sgp->resid);
+
+	if (sgp->masked_status == SAM_STAT_CHECK_CONDITION) {
+		if (sgp->mx_sb_len > 0) {
+			struct fit_comp_error_info *ei = &skspcl->req.err_info;
+			u32 nbytes = sizeof(*ei);
+
+			nbytes = min_t(u32, nbytes, sgp->mx_sb_len);
+
+			sgp->sb_len_wr = nbytes;
+
+			if (__copy_to_user(sgp->sbp, ei, nbytes)) {
+				DPRINTK(skdev, "copy_to_user sense failed %p\n",
+					sgp->sbp);
+				return -EFAULT;
+			}
+		}
+	}
+
+	if (__copy_to_user(sksgio->argp, sgp, sizeof(sg_io_hdr_t))) {
+		DPRINTK(skdev, "copy_to_user sg failed %p\n", sksgio->argp);
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+static int skd_sg_io_release_skspcl(struct skd_device *skdev,
+				    struct skd_sg_io *sksgio)
+{
+	struct skd_special_context *skspcl = sksgio->skspcl;
+
+	if (skspcl != NULL) {
+		ulong flags;
+
+		sksgio->skspcl = NULL;
+
+		spin_lock_irqsave(&skdev->lock, flags);
+		skd_release_special(skdev, skspcl);
+		spin_unlock_irqrestore(&skdev->lock, flags);
+	}
+
+	return 0;
+}
+
+/*
+ *****************************************************************************
+ * INTERNAL REQUESTS -- generated by driver itself
+ *****************************************************************************
+ */
+
+static int skd_format_internal_skspcl(struct skd_device *skdev)
+{
+	struct skd_special_context *skspcl = &skdev->internal_skspcl;
+	struct fit_sg_descriptor *sgd = &skspcl->req.sksg_list[0];
+	struct fit_msg_hdr *fmh;
+	uint64_t dma_address;
+	struct skd_scsi_request *scsi;
+
+	fmh = (struct fit_msg_hdr *)&skspcl->msg_buf[0];
+	fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT;
+	fmh->num_protocol_cmds_coalesced = 1;
+
+	scsi = (struct skd_scsi_request *)&skspcl->msg_buf[64];
+	memset(scsi, 0, sizeof(*scsi));
+	dma_address = skspcl->req.sksg_dma_address;
+	scsi->hdr.sg_list_dma_address = cpu_to_be64(dma_address);
+	sgd->control = FIT_SGD_CONTROL_LAST;
+	sgd->byte_count = 0;
+	sgd->host_side_addr = skspcl->db_dma_address;
+	sgd->dev_side_addr = 0;
+	sgd->next_desc_ptr = 0LL;
+
+	return 1;
+}
+
+#define WR_BUF_SIZE SKD_N_INTERNAL_BYTES
+
+static void skd_send_internal_skspcl(struct skd_device *skdev,
+				     struct skd_special_context *skspcl,
+				     u8 opcode)
+{
+	struct fit_sg_descriptor *sgd = &skspcl->req.sksg_list[0];
+	struct skd_scsi_request *scsi;
+	unsigned char *buf = skspcl->data_buf;
+	int i;
+
+	if (skspcl->req.state != SKD_REQ_STATE_IDLE)
+		/*
+		 * A refresh is already in progress.
+		 * Just wait for it to finish.
+		 */
+		return;
+
+	SKD_ASSERT((skspcl->req.id & SKD_ID_INCR) == 0);
+	skspcl->req.state = SKD_REQ_STATE_BUSY;
+	skspcl->req.id += SKD_ID_INCR;
+
+	scsi = (struct skd_scsi_request *)&skspcl->msg_buf[64];
+	scsi->hdr.tag = skspcl->req.id;
+
+	memset(scsi->cdb, 0, sizeof(scsi->cdb));
+
+	switch (opcode) {
+	case TEST_UNIT_READY:
+		scsi->cdb[0] = TEST_UNIT_READY;
+		sgd->byte_count = 0;
+		scsi->hdr.sg_list_len_bytes = 0;
+		break;
+
+	case READ_CAPACITY:
+		scsi->cdb[0] = READ_CAPACITY;
+		sgd->byte_count = SKD_N_READ_CAP_BYTES;
+		scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count);
+		break;
+
+	case INQUIRY:
+		scsi->cdb[0] = INQUIRY;
+		scsi->cdb[1] = 0x01;    /* evpd */
+		scsi->cdb[2] = 0x80;    /* serial number page */
+		scsi->cdb[4] = 0x10;
+		sgd->byte_count = 16;
+		scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count);
+		break;
+
+	case SYNCHRONIZE_CACHE:
+		scsi->cdb[0] = SYNCHRONIZE_CACHE;
+		sgd->byte_count = 0;
+		scsi->hdr.sg_list_len_bytes = 0;
+		break;
+
+	case WRITE_BUFFER:
+		scsi->cdb[0] = WRITE_BUFFER;
+		scsi->cdb[1] = 0x02;
+		scsi->cdb[7] = (WR_BUF_SIZE & 0xFF00) >> 8;
+		scsi->cdb[8] = WR_BUF_SIZE & 0xFF;
+		sgd->byte_count = WR_BUF_SIZE;
+		scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count);
+		/* fill incrementing byte pattern */
+		for (i = 0; i < sgd->byte_count; i++)
+			buf[i] = i & 0xFF;
+		break;
+
+	case READ_BUFFER:
+		scsi->cdb[0] = READ_BUFFER;
+		scsi->cdb[1] = 0x02;
+		scsi->cdb[7] = (WR_BUF_SIZE & 0xFF00) >> 8;
+		scsi->cdb[8] = WR_BUF_SIZE & 0xFF;
+		sgd->byte_count = WR_BUF_SIZE;
+		scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count);
+		memset(skspcl->data_buf, 0, sgd->byte_count);
+		break;
+
+	default:
+		SKD_ASSERT("Don't know what to send");
+		return;
+
+	}
+	skd_send_special_fitmsg(skdev, skspcl);
+}
+
+static void skd_refresh_device_data(struct skd_device *skdev)
+{
+	struct skd_special_context *skspcl = &skdev->internal_skspcl;
+
+	skd_send_internal_skspcl(skdev, skspcl, TEST_UNIT_READY);
+}
+
+static int skd_chk_read_buf(struct skd_device *skdev,
+			    struct skd_special_context *skspcl)
+{
+	unsigned char *buf = skspcl->data_buf;
+	int i;
+
+	/* check for incrementing byte pattern */
+	for (i = 0; i < WR_BUF_SIZE; i++)
+		if (buf[i] != (i & 0xFF))
+			return 1;
+
+	return 0;
+}
+
+static void skd_log_check_status(struct skd_device *skdev, u8 status, u8 key,
+				 u8 code, u8 qual, u8 fruc)
+{
+	/* If the check condition is of special interest, log a message */
+	if ((status == SAM_STAT_CHECK_CONDITION) && (key == 0x02)
+	    && (code == 0x04) && (qual == 0x06)) {
+		pr_err("(%s): *** LOST_WRITE_DATA ERROR *** key/asc/"
+		       "ascq/fruc %02x/%02x/%02x/%02x\n",
+		       skd_name(skdev), key, code, qual, fruc);
+	}
+}
+
+static void skd_complete_internal(struct skd_device *skdev,
+				  volatile struct fit_completion_entry_v1
+				  *skcomp,
+				  volatile struct fit_comp_error_info *skerr,
+				  struct skd_special_context *skspcl)
+{
+	u8 *buf = skspcl->data_buf;
+	u8 status;
+	int i;
+	struct skd_scsi_request *scsi =
+		(struct skd_scsi_request *)&skspcl->msg_buf[64];
+
+	SKD_ASSERT(skspcl == &skdev->internal_skspcl);
+
+	DPRINTK(skdev, "complete internal %x\n", scsi->cdb[0]);
+
+	skspcl->req.completion = *skcomp;
+	skspcl->req.state = SKD_REQ_STATE_IDLE;
+	skspcl->req.id += SKD_ID_INCR;
+
+	status = skspcl->req.completion.status;
+
+	skd_log_check_status(skdev, status, skerr->key, skerr->code,
+			     skerr->qual, skerr->fruc);
+
+	switch (scsi->cdb[0]) {
+	case TEST_UNIT_READY:
+		if (status == SAM_STAT_GOOD)
+			skd_send_internal_skspcl(skdev, skspcl, WRITE_BUFFER);
+		else if ((status == SAM_STAT_CHECK_CONDITION) &&
+			 (skerr->key == MEDIUM_ERROR))
+			skd_send_internal_skspcl(skdev, skspcl, WRITE_BUFFER);
+		else {
+			if (skdev->state == SKD_DRVR_STATE_STOPPING) {
+				VPRINTK(skdev, "TUR failed, don't send anymore"
+					"state 0x%x\n", skdev->state);
+				return;
+			}
+			DPRINTK(skdev, "**** TUR failed, retry skerr\n");
+			skd_send_internal_skspcl(skdev, skspcl, 0x00);
+		}
+		break;
+
+	case WRITE_BUFFER:
+		if (status == SAM_STAT_GOOD)
+			skd_send_internal_skspcl(skdev, skspcl, READ_BUFFER);
+		else {
+			if (skdev->state == SKD_DRVR_STATE_STOPPING) {
+				VPRINTK(skdev, "write buffer failed, don't send"
+					" anymore state 0x%x\n", skdev->state);
+				return;
+			}
+			DPRINTK(skdev,
+				"**** write buffer failed, retry skerr\n");
+			skd_send_internal_skspcl(skdev, skspcl, 0x00);
+		}
+		break;
+
+	case READ_BUFFER:
+		if (status == SAM_STAT_GOOD) {
+			if (skd_chk_read_buf(skdev, skspcl) == 0)
+				skd_send_internal_skspcl(skdev, skspcl,
+							 READ_CAPACITY);
+			else {
+				pr_err(
+				       "(%s):*** W/R Buffer mismatch %d ***\n",
+				       skd_name(skdev), skdev->connect_retries);
+				if (skdev->connect_retries <
+				    SKD_MAX_CONNECT_RETRIES) {
+					skdev->connect_retries++;
+					skd_soft_reset(skdev);
+				} else {
+					pr_err(
+					       "(%s): W/R Buffer Connect Error\n",
+					       skd_name(skdev));
+					return;
+				}
+			}
+
+		} else {
+			if (skdev->state == SKD_DRVR_STATE_STOPPING) {
+				VPRINTK(skdev,
+					"read buffer failed, don't send anymore"
+					"state 0x%x\n", skdev->state);
+				return;
+			}
+			DPRINTK(skdev,
+				"**** read buffer failed, retry skerr\n");
+			skd_send_internal_skspcl(skdev, skspcl, 0x00);
+		}
+		break;
+
+	case READ_CAPACITY:
+		skdev->read_cap_is_valid = 0;
+		if (status == SAM_STAT_GOOD) {
+			skdev->read_cap_last_lba =
+				(buf[0] << 24) | (buf[1] << 16) |
+				(buf[2] << 8) | buf[3];
+			skdev->read_cap_blocksize =
+				(buf[4] << 24) | (buf[5] << 16) |
+				(buf[6] << 8) | buf[7];
+
+			DPRINTK(skdev, "last lba %d, bs %d\n",
+				skdev->read_cap_last_lba,
+				skdev->read_cap_blocksize);
+
+			set_capacity(skdev->disk, skdev->read_cap_last_lba + 1);
+
+			skdev->read_cap_is_valid = 1;
+
+			skd_send_internal_skspcl(skdev, skspcl, INQUIRY);
+		} else if ((status == SAM_STAT_CHECK_CONDITION) &&
+			   (skerr->key == MEDIUM_ERROR)) {
+			skdev->read_cap_last_lba = ~0;
+			set_capacity(skdev->disk, skdev->read_cap_last_lba + 1);
+			DPRINTK(skdev,
+				"**** MEDIUM ERROR caused READCAP to fail, ignore failure and continue to inquiry\n");
+			skd_send_internal_skspcl(skdev, skspcl, INQUIRY);
+		} else {
+			DPRINTK(skdev, "**** READCAP failed, retry TUR\n");
+			skd_send_internal_skspcl(skdev, skspcl,
+						 TEST_UNIT_READY);
+		}
+		break;
+
+	case INQUIRY:
+		skdev->inquiry_is_valid = 0;
+		if (status == SAM_STAT_GOOD) {
+			skdev->inquiry_is_valid = 1;
+
+			for (i = 0; i < 12; i++)
+				skdev->inq_serial_num[i] = buf[i + 4];
+			skdev->inq_serial_num[12] = 0;
+		}
+
+		if (skd_unquiesce_dev(skdev) < 0)
+			DPRINTK(skdev, "**** failed, to ONLINE device\n");
+		 /* connection is complete */
+		skdev->connect_retries = 0;
+		break;
+
+	case SYNCHRONIZE_CACHE:
+		if (status == SAM_STAT_GOOD)
+			skdev->sync_done = 1;
+		else
+			skdev->sync_done = -1;
+		wake_up_interruptible(&skdev->waitq);
+		break;
+
+	default:
+		SKD_ASSERT("we didn't send this");
+	}
+}
+
+/*
+ *****************************************************************************
+ * FIT MESSAGES
+ *****************************************************************************
+ */
+
+static void skd_send_fitmsg(struct skd_device *skdev,
+			    struct skd_fitmsg_context *skmsg)
+{
+	u64 qcmd;
+	struct fit_msg_hdr *fmh;
+
+	VPRINTK(skdev, "dma address 0x%llx, busy=%d\n",
+		skmsg->mb_dma_address, skdev->in_flight);
+	VPRINTK(skdev, "msg_buf 0x%p, offset %x\n",
+		skmsg->msg_buf, skmsg->offset);
+
+	qcmd = skmsg->mb_dma_address;
+	qcmd |= FIT_QCMD_QID_NORMAL;
+
+	fmh = (struct fit_msg_hdr *)skmsg->msg_buf;
+	skmsg->outstanding = fmh->num_protocol_cmds_coalesced;
+
+	if (unlikely(skdev->dbg_level > 1)) {
+		u8 *bp = (u8 *)skmsg->msg_buf;
+		int i;
+		for (i = 0; i < skmsg->length; i += 8) {
+			VPRINTK(skdev, "  msg[%2d] %02x %02x %02x %02x "
+				"%02x %02x %02x %02x\n",
+				i, bp[i + 0], bp[i + 1], bp[i + 2],
+				bp[i + 3], bp[i + 4], bp[i + 5],
+				bp[i + 6], bp[i + 7]);
+			if (i == 0)
+				i = 64 - 8;
+		}
+	}
+
+	if (skmsg->length > 256)
+		qcmd |= FIT_QCMD_MSGSIZE_512;
+	else if (skmsg->length > 128)
+		qcmd |= FIT_QCMD_MSGSIZE_256;
+	else if (skmsg->length > 64)
+		qcmd |= FIT_QCMD_MSGSIZE_128;
+	else
+		/*
+		 * This makes no sense because the FIT msg header is
+		 * 64 bytes. If the msg is only 64 bytes long it has
+		 * no payload.
+		 */
+		qcmd |= FIT_QCMD_MSGSIZE_64;
+
+	SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND);
+
+}
+
+static void skd_send_special_fitmsg(struct skd_device *skdev,
+				    struct skd_special_context *skspcl)
+{
+	u64 qcmd;
+
+	if (unlikely(skdev->dbg_level > 1)) {
+		u8 *bp = (u8 *)skspcl->msg_buf;
+		int i;
+
+		for (i = 0; i < SKD_N_SPECIAL_FITMSG_BYTES; i += 8) {
+			VPRINTK(skdev,
+				"  spcl[%2d] %02x %02x %02x %02x  "
+				"%02x %02x %02x %02x\n", i,
+				bp[i + 0], bp[i + 1], bp[i + 2], bp[i + 3],
+				bp[i + 4], bp[i + 5], bp[i + 6], bp[i + 7]);
+			if (i == 0)
+				i = 64 - 8;
+		}
+
+		VPRINTK(skdev, "skspcl=%p id=%04x sksg_list=%p sksg_dma=%llx\n",
+			skspcl, skspcl->req.id, skspcl->req.sksg_list,
+			skspcl->req.sksg_dma_address);
+		for (i = 0; i < skspcl->req.n_sg; i++) {
+			struct fit_sg_descriptor *sgd =
+				&skspcl->req.sksg_list[i];
+
+			VPRINTK(skdev, "  sg[%d] count=%u ctrl=0x%x "
+				"addr=0x%llx next=0x%llx\n",
+				i, sgd->byte_count, sgd->control,
+				sgd->host_side_addr, sgd->next_desc_ptr);
+		}
+	}
+
+	/*
+	 * Special FIT msgs are always 128 bytes: a 64-byte FIT hdr
+	 * and one 64-byte SSDI command.
+	 */
+	qcmd = skspcl->mb_dma_address;
+	qcmd |= FIT_QCMD_QID_NORMAL + FIT_QCMD_MSGSIZE_128;
+
+	SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND);
+}
+
+/*
+ *****************************************************************************
+ * COMPLETION QUEUE
+ *****************************************************************************
+ */
+
+static void skd_complete_other(struct skd_device *skdev,
+			       volatile struct fit_completion_entry_v1 *skcomp,
+			       volatile struct fit_comp_error_info *skerr);
+
+
+static void skd_requeue_request(struct skd_device *skdev,
+				struct skd_request_context *skreq);
+
+struct sns_info {
+	u8 type;
+	u8 stat;
+	u8 key;
+	u8 asc;
+	u8 ascq;
+	u8 mask;
+	enum skd_check_status_action action;
+};
+
+static struct sns_info skd_chkstat_table[] = {
+	/* Good */
+	{ 0x70, 0x02, RECOVERED_ERROR, 0,    0,	   0x1c,
+	  SKD_CHECK_STATUS_REPORT_GOOD },
+
+	/* Smart alerts */
+	{ 0x70, 0x02, NO_SENSE,	       0x0B, 0x00, 0x1E,	/* warnings */
+	  SKD_CHECK_STATUS_REPORT_SMART_ALERT },
+	{ 0x70, 0x02, NO_SENSE,	       0x5D, 0x00, 0x1E,	/* thresholds */
+	  SKD_CHECK_STATUS_REPORT_SMART_ALERT },
+	{ 0x70, 0x02, RECOVERED_ERROR, 0x0B, 0x01, 0x1F,        /* temperature over trigger */
+	  SKD_CHECK_STATUS_REPORT_SMART_ALERT },
+
+	/* Retry (with limits) */
+	{ 0x70, 0x02, 0x0B,	       0,    0,	   0x1C,        /* This one is for DMA ERROR */
+	  SKD_CHECK_STATUS_REQUEUE_REQUEST },
+	{ 0x70, 0x02, 0x06,	       0x0B, 0x00, 0x1E,        /* warnings */
+	  SKD_CHECK_STATUS_REQUEUE_REQUEST },
+	{ 0x70, 0x02, 0x06,	       0x5D, 0x00, 0x1E,        /* thresholds */
+	  SKD_CHECK_STATUS_REQUEUE_REQUEST },
+	{ 0x70, 0x02, 0x06,	       0x80, 0x30, 0x1F,        /* backup power */
+	  SKD_CHECK_STATUS_REQUEUE_REQUEST },
+
+	/* Busy (or about to be) */
+	{ 0x70, 0x02, 0x06,	       0x3f, 0x01, 0x1F, /* fw changed */
+	  SKD_CHECK_STATUS_BUSY_IMMINENT },
+};
+
+/*
+ * Look up status and sense data to decide how to handle the error
+ * from the device.
+ * mask says which fields must match e.g., mask=0x18 means check
+ * type and stat, ignore key, asc, ascq.
+ */
+
+static enum skd_check_status_action skd_check_status(struct skd_device *skdev,
+				u8 cmp_status,
+				volatile struct fit_comp_error_info *skerr)
+{
+	int i, n;
+
+	pr_err("(%s): key/asc/ascq/fruc %02x/%02x/%02x/%02x\n",
+	       skd_name(skdev), skerr->key, skerr->code, skerr->qual,
+	       skerr->fruc);
+
+	VPRINTK(skdev, "stat: t=%02x stat=%02x k=%02x c=%02x q=%02x "
+		"fruc=%02x\n", skerr->type, cmp_status, skerr->key,
+		skerr->code, skerr->qual, skerr->fruc);
+
+	/* Does the info match an entry in the good category? */
+	n = sizeof(skd_chkstat_table) / sizeof(skd_chkstat_table[0]);
+	for (i = 0; i < n; i++) {
+		struct sns_info *sns = &skd_chkstat_table[i];
+
+		if (sns->mask & 0x10)
+			if (skerr->type != sns->type)
+				continue;
+
+		if (sns->mask & 0x08)
+			if (cmp_status != sns->stat)
+				continue;
+
+		if (sns->mask & 0x04)
+			if (skerr->key != sns->key)
+				continue;
+
+		if (sns->mask & 0x02)
+			if (skerr->code != sns->asc)
+				continue;
+
+		if (sns->mask & 0x01)
+			if (skerr->qual != sns->ascq)
+				continue;
+
+		if (sns->action == SKD_CHECK_STATUS_REPORT_SMART_ALERT) {
+			pr_err("(%s): SMART Alert: sense key/asc/ascq "
+			       "%02x/%02x/%02x\n",
+			       skd_name(skdev), skerr->key,
+			       skerr->code, skerr->qual);
+		}
+		return sns->action;
+	}
+
+	/* No other match, so nonzero status means error,
+	 * zero status means good
+	 */
+	if (cmp_status) {
+		DPRINTK(skdev, "status check: error\n");
+		return SKD_CHECK_STATUS_REPORT_ERROR;
+	}
+
+	DPRINTK(skdev, "status check good default\n");
+	return SKD_CHECK_STATUS_REPORT_GOOD;
+}
+
+static void skd_resolve_req_exception(struct skd_device *skdev,
+				      struct skd_request_context *skreq)
+{
+	u8 cmp_status = skreq->completion.status;
+
+	switch (skd_check_status(skdev, cmp_status, &skreq->err_info)) {
+	case SKD_CHECK_STATUS_REPORT_GOOD:
+	case SKD_CHECK_STATUS_REPORT_SMART_ALERT:
+		skd_end_request(skdev, skreq, 0);
+		break;
+
+	case SKD_CHECK_STATUS_BUSY_IMMINENT:
+		skd_log_skreq(skdev, skreq, "retry(busy)");
+		skd_requeue_request(skdev, skreq);
+		pr_info("(%s) drive BUSY imminent\n", skd_name(skdev));
+		skdev->state = SKD_DRVR_STATE_BUSY_IMMINENT;
+		skdev->timer_countdown = SKD_TIMER_MINUTES(20);
+		skd_quiesce_dev(skdev);
+		break;
+
+	case SKD_CHECK_STATUS_REQUEUE_REQUEST:
+		if (!skd_bio) {
+			if ((unsigned long) ++skreq->req->special <
+			    SKD_MAX_RETRIES) {
+				skd_log_skreq(skdev, skreq, "retry");
+				skd_requeue_request(skdev, skreq);
+				break;
+			}
+		}
+	/* fall through to report error */
+
+	case SKD_CHECK_STATUS_REPORT_ERROR:
+	default:
+		skd_end_request(skdev, skreq, -EIO);
+		break;
+	}
+}
+
+static void skd_requeue_request(struct skd_device *skdev,
+				struct skd_request_context *skreq)
+{
+	if (!skd_bio) {
+		blk_requeue_request(skdev->queue, skreq->req);
+	} else {
+		bio_list_add_head(&skdev->bio_queue, skreq->bio);
+		skreq->bio = NULL;
+	}
+}
+
+
+
+/* assume spinlock is already held */
+static void skd_release_skreq(struct skd_device *skdev,
+			      struct skd_request_context *skreq)
+{
+	u32 msg_slot;
+	struct skd_fitmsg_context *skmsg;
+
+	u32 timo_slot;
+
+	/*
+	 * Reclaim the FIT msg buffer if this is
+	 * the first of the requests it carried to
+	 * be completed. The FIT msg buffer used to
+	 * send this request cannot be reused until
+	 * we are sure the s1120 card has copied
+	 * it to its memory. The FIT msg might have
+	 * contained several requests. As soon as
+	 * any of them are completed we know that
+	 * the entire FIT msg was transferred.
+	 * Only the first completed request will
+	 * match the FIT msg buffer id. The FIT
+	 * msg buffer id is immediately updated.
+	 * When subsequent requests complete the FIT
+	 * msg buffer id won't match, so we know
+	 * quite cheaply that it is already done.
+	 */
+	msg_slot = skreq->fitmsg_id & SKD_ID_SLOT_MASK;
+	SKD_ASSERT(msg_slot < skdev->num_fitmsg_context);
+
+	skmsg = &skdev->skmsg_table[msg_slot];
+	if (skmsg->id == skreq->fitmsg_id) {
+		SKD_ASSERT(skmsg->state == SKD_MSG_STATE_BUSY);
+		SKD_ASSERT(skmsg->outstanding > 0);
+		skmsg->outstanding--;
+		if (skmsg->outstanding == 0) {
+			skmsg->state = SKD_MSG_STATE_IDLE;
+			skmsg->id += SKD_ID_INCR;
+			skmsg->next = skdev->skmsg_free_list;
+			skdev->skmsg_free_list = skmsg;
+		}
+	}
+
+	/*
+	 * Decrease the number of active requests.
+	 * Also decrements the count in the timeout slot.
+	 */
+	SKD_ASSERT(skdev->in_flight > 0);
+	skdev->in_flight -= 1;
+
+	timo_slot = skreq->timeout_stamp & SKD_TIMEOUT_SLOT_MASK;
+	SKD_ASSERT(skdev->timeout_slot[timo_slot] > 0);
+	skdev->timeout_slot[timo_slot] -= 1;
+
+	/*
+	 * Reset backpointer
+	 */
+	if (likely(!skd_bio))
+		skreq->req = NULL;
+	else
+		skreq->bio = NULL;
+
+
+	/*
+	 * Reclaim the skd_request_context
+	 */
+	skreq->state = SKD_REQ_STATE_IDLE;
+	skreq->id += SKD_ID_INCR;
+	skreq->next = skdev->skreq_free_list;
+	skdev->skreq_free_list = skreq;
+}
+
+#define DRIVER_INQ_EVPD_PAGE_CODE   0xDA
+
+static void skd_do_inq_page_00(struct skd_device *skdev,
+			       volatile struct fit_completion_entry_v1 *skcomp,
+			       volatile struct fit_comp_error_info *skerr,
+			       uint8_t *cdb, uint8_t *buf)
+{
+	uint16_t insert_pt, max_bytes, drive_pages, drive_bytes, new_size;
+
+	/* Caller requested "supported pages".  The driver needs to insert
+	 * its page.
+	 */
+	VPRINTK(skdev, "skd_do_driver_inquiry: modify supported pages.\n");
+
+	/* If the device rejected the request because the CDB was
+	 * improperly formed, then just leave.
+	 */
+	if (skcomp->status == SAM_STAT_CHECK_CONDITION &&
+	    skerr->key == ILLEGAL_REQUEST && skerr->code == 0x24)
+		return;
+
+	/* Get the amount of space the caller allocated */
+	max_bytes = (cdb[3] << 8) | cdb[4];
+
+	/* Get the number of pages actually returned by the device */
+	drive_pages = (buf[2] << 8) | buf[3];
+	drive_bytes = drive_pages + 4;
+	new_size = drive_pages + 1;
+
+	/* Supported pages must be in numerical order, so find where
+	 * the driver page needs to be inserted into the list of
+	 * pages returned by the device.
+	 */
+	for (insert_pt = 4; insert_pt < drive_bytes; insert_pt++) {
+		if (buf[insert_pt] == DRIVER_INQ_EVPD_PAGE_CODE)
+			return; /* Device using this page code. abort */
+		else if (buf[insert_pt] > DRIVER_INQ_EVPD_PAGE_CODE)
+			break;
+	}
+
+	if (insert_pt < max_bytes) {
+		uint16_t u;
+
+		/* Shift everything up one byte to make room. */
+		for (u = new_size + 3; u > insert_pt; u--)
+			buf[u] = buf[u - 1];
+		buf[insert_pt] = DRIVER_INQ_EVPD_PAGE_CODE;
+
+		/* SCSI byte order increment of num_returned_bytes by 1 */
+		skcomp->num_returned_bytes =
+			be32_to_cpu(skcomp->num_returned_bytes) + 1;
+		skcomp->num_returned_bytes =
+			be32_to_cpu(skcomp->num_returned_bytes);
+	}
+
+	/* update page length field to reflect the driver's page too */
+	buf[2] = (uint8_t)((new_size >> 8) & 0xFF);
+	buf[3] = (uint8_t)((new_size >> 0) & 0xFF);
+}
+
+static void skd_get_link_info(struct pci_dev *pdev, u8 *speed, u8 *width)
+{
+	int pcie_reg;
+	u16 pci_bus_speed;
+	u8 pci_lanes;
+
+	pcie_reg = pci_find_capability(pdev, PCI_CAP_ID_EXP);
+	if (pcie_reg) {
+		u16 linksta;
+		pci_read_config_word(pdev, pcie_reg + PCI_EXP_LNKSTA, &linksta);
+
+		pci_bus_speed = linksta & 0xF;
+		pci_lanes = (linksta & 0x3F0) >> 4;
+	} else {
+		*speed = STEC_LINK_UNKNOWN;
+		*width = 0xFF;
+		return;
+	}
+
+	switch (pci_bus_speed) {
+	case 1:
+		*speed = STEC_LINK_2_5GTS;
+		break;
+	case 2:
+		*speed = STEC_LINK_5GTS;
+		break;
+	case 3:
+		*speed = STEC_LINK_8GTS;
+		break;
+	default:
+		*speed = STEC_LINK_UNKNOWN;
+		break;
+	}
+
+	if (pci_lanes <= 0x20)
+		*width = pci_lanes;
+	else
+		*width = 0xFF;
+}
+
+static void skd_do_inq_page_da(struct skd_device *skdev,
+			       volatile struct fit_completion_entry_v1 *skcomp,
+			       volatile struct fit_comp_error_info *skerr,
+			       uint8_t *cdb, uint8_t *buf)
+{
+	unsigned max_bytes;
+	struct driver_inquiry_data inq;
+	u16 val;
+
+	VPRINTK(skdev, "skd_do_driver_inquiry: return driver page\n");
+
+	memset(&inq, 0, sizeof(inq));
+
+	inq.page_code = DRIVER_INQ_EVPD_PAGE_CODE;
+
+	if (skdev->pdev && skdev->pdev->bus) {
+		skd_get_link_info(skdev->pdev,
+				  &inq.pcie_link_speed, &inq.pcie_link_lanes);
+		inq.pcie_bus_number = cpu_to_be16(skdev->pdev->bus->number);
+		inq.pcie_device_number = PCI_SLOT(skdev->pdev->devfn);
+		inq.pcie_function_number = PCI_FUNC(skdev->pdev->devfn);
+
+		pci_read_config_word(skdev->pdev, PCI_VENDOR_ID, &val);
+		inq.pcie_vendor_id = cpu_to_be16(val);
+
+		pci_read_config_word(skdev->pdev, PCI_DEVICE_ID, &val);
+		inq.pcie_device_id = cpu_to_be16(val);
+
+		pci_read_config_word(skdev->pdev, PCI_SUBSYSTEM_VENDOR_ID,
+				     &val);
+		inq.pcie_subsystem_vendor_id = cpu_to_be16(val);
+
+		pci_read_config_word(skdev->pdev, PCI_SUBSYSTEM_ID, &val);
+		inq.pcie_subsystem_device_id = cpu_to_be16(val);
+	} else {
+		inq.pcie_bus_number = 0xFFFF;
+		inq.pcie_device_number = 0xFF;
+		inq.pcie_function_number = 0xFF;
+		inq.pcie_link_speed = 0xFF;
+		inq.pcie_link_lanes = 0xFF;
+		inq.pcie_vendor_id = 0xFFFF;
+		inq.pcie_device_id = 0xFFFF;
+		inq.pcie_subsystem_vendor_id = 0xFFFF;
+		inq.pcie_subsystem_device_id = 0xFFFF;
+	}
+
+	/* Driver version, fixed lenth, padded with spaces on the right */
+	inq.driver_version_length = sizeof(inq.driver_version);
+	memset(&inq.driver_version, ' ', sizeof(inq.driver_version));
+	memcpy(inq.driver_version, DRV_VER_COMPL,
+	       min(sizeof(inq.driver_version), strlen(DRV_VER_COMPL)));
+
+	inq.page_length = cpu_to_be16((sizeof(inq) - 4));
+
+	/* Clear the error set by the device */
+	skcomp->status = SAM_STAT_GOOD;
+	memset((void *)skerr, 0, sizeof(*skerr));
+
+	/* copy response into output buffer */
+	max_bytes = (cdb[3] << 8) | cdb[4];
+	memcpy(buf, &inq, min_t(unsigned, max_bytes, sizeof(inq)));
+
+	skcomp->num_returned_bytes =
+		be32_to_cpu(min_t(uint16_t, max_bytes, sizeof(inq)));
+}
+
+static void skd_do_driver_inq(struct skd_device *skdev,
+			      volatile struct fit_completion_entry_v1 *skcomp,
+			      volatile struct fit_comp_error_info *skerr,
+			      uint8_t *cdb, uint8_t *buf)
+{
+	if (!buf)
+		return;
+	else if (cdb[0] != INQUIRY)
+		return;         /* Not an INQUIRY */
+	else if ((cdb[1] & 1) == 0)
+		return;         /* EVPD not set */
+	else if (cdb[2] == 0)
+		/* Need to add driver's page to supported pages list */
+		skd_do_inq_page_00(skdev, skcomp, skerr, cdb, buf);
+	else if (cdb[2] == DRIVER_INQ_EVPD_PAGE_CODE)
+		/* Caller requested driver's page */
+		skd_do_inq_page_da(skdev, skcomp, skerr, cdb, buf);
+}
+
+static unsigned char *skd_sg_1st_page_ptr(struct scatterlist *sg)
+{
+	if (!sg)
+		return NULL;
+	if (!sg_page(sg))
+		return NULL;
+	return sg_virt(sg);
+}
+
+static void skd_process_scsi_inq(struct skd_device *skdev,
+				 volatile struct fit_completion_entry_v1
+				 *skcomp,
+				 volatile struct fit_comp_error_info *skerr,
+				 struct skd_special_context *skspcl)
+{
+	uint8_t *buf;
+	struct fit_msg_hdr *fmh = (struct fit_msg_hdr *)skspcl->msg_buf;
+	struct skd_scsi_request *scsi_req = (struct skd_scsi_request *)&fmh[1];
+
+	dma_sync_sg_for_cpu(skdev->class_dev, skspcl->req.sg, skspcl->req.n_sg,
+			    skspcl->req.sg_data_dir);
+	buf = skd_sg_1st_page_ptr(skspcl->req.sg);
+
+	if (buf)
+		skd_do_driver_inq(skdev, skcomp, skerr, scsi_req->cdb, buf);
+}
+
+
+static int skd_isr_completion_posted(struct skd_device *skdev,
+					int limit, int *enqueued)
+{
+	volatile struct fit_completion_entry_v1 *skcmp = NULL;
+	volatile struct fit_comp_error_info *skerr;
+	u16 req_id;
+	u32 req_slot;
+	struct skd_request_context *skreq;
+	u16 cmp_cntxt = 0;
+	u8 cmp_status = 0;
+	u8 cmp_cycle = 0;
+	u32 cmp_bytes = 0;
+	int rc = 0;
+	int processed = 0;
+	int ret;
+
+
+	for (;; ) {
+		SKD_ASSERT(skdev->skcomp_ix < SKD_N_COMPLETION_ENTRY);
+
+		skcmp = &skdev->skcomp_table[skdev->skcomp_ix];
+		cmp_cycle = skcmp->cycle;
+		cmp_cntxt = skcmp->tag;
+		cmp_status = skcmp->status;
+		cmp_bytes = be32_to_cpu(skcmp->num_returned_bytes);
+
+		skerr = &skdev->skerr_table[skdev->skcomp_ix];
+
+		VPRINTK(skdev,
+			"cycle=%d ix=%d got cycle=%d cmdctxt=0x%x stat=%d "
+			"busy=%d rbytes=0x%x proto=%d\n", skdev->skcomp_cycle,
+			skdev->skcomp_ix, cmp_cycle, cmp_cntxt, cmp_status,
+			skdev->in_flight, cmp_bytes, skdev->proto_ver);
+
+		if (cmp_cycle != skdev->skcomp_cycle) {
+			VPRINTK(skdev, "end of completions\n");
+			break;
+		}
+		/*
+		 * Update the completion queue head index and possibly
+		 * the completion cycle count. 8-bit wrap-around.
+		 */
+		skdev->skcomp_ix++;
+		if (skdev->skcomp_ix >= SKD_N_COMPLETION_ENTRY) {
+			skdev->skcomp_ix = 0;
+			skdev->skcomp_cycle++;
+		}
+
+		/*
+		 * The command context is a unique 32-bit ID. The low order
+		 * bits help locate the request. The request is usually a
+		 * r/w request (see skd_start() above) or a special request.
+		 */
+		req_id = cmp_cntxt;
+		req_slot = req_id & SKD_ID_SLOT_AND_TABLE_MASK;
+
+		/* Is this other than a r/w request? */
+		if (req_slot >= skdev->num_req_context) {
+			/*
+			 * This is not a completion for a r/w request.
+			 */
+			skd_complete_other(skdev, skcmp, skerr);
+			continue;
+		}
+
+		skreq = &skdev->skreq_table[req_slot];
+
+		/*
+		 * Make sure the request ID for the slot matches.
+		 */
+		if (skreq->id != req_id) {
+			DPRINTK(skdev, "mismatch comp_id=0x%x req_id=0x%x\n",
+				req_id, skreq->id);
+			{
+				u16 new_id = cmp_cntxt;
+				pr_err("(%s): Completion mismatch "
+				       "comp_id=0x%04x skreq=0x%04x new=0x%04x\n",
+				       skd_name(skdev), req_id,
+				       skreq->id, new_id);
+
+				continue;
+			}
+		}
+
+		SKD_ASSERT(skreq->state == SKD_REQ_STATE_BUSY);
+
+		if (skreq->state == SKD_REQ_STATE_ABORTED) {
+			DPRINTK(skdev, "reclaim req %p id=%04x\n",
+				skreq, skreq->id);
+			/* a previously timed out command can
+			 * now be cleaned up */
+			skd_release_skreq(skdev, skreq);
+			continue;
+		}
+
+		skreq->completion = *skcmp;
+		if (unlikely(cmp_status == SAM_STAT_CHECK_CONDITION)) {
+			skreq->err_info = *skerr;
+			skd_log_check_status(skdev, cmp_status, skerr->key,
+					     skerr->code, skerr->qual,
+					     skerr->fruc);
+		}
+		/* Release DMA resources for the request. */
+		if (skreq->n_sg > 0)
+			skd_postop_sg_list(skdev, skreq);
+
+		if (((!skd_bio) && !skreq->req) ||
+		    ((skd_bio) && !skreq->bio)) {
+			DPRINTK(skdev, "NULL backptr skdreq %p, "
+				"req=0x%x req_id=0x%x\n",
+				skreq, skreq->id, req_id);
+		} else {
+			/*
+			 * Capture the outcome and post it back to the
+			 * native request.
+			 */
+			if (likely(cmp_status == SAM_STAT_GOOD)) {
+				if (unlikely(skreq->flush_cmd)) {
+					if (skd_bio) {
+						/* if empty size bio, we are all done */
+						if (bio_sectors(skreq->bio) == 0) {
+							skd_end_request(skdev, skreq, 0);
+						} else {
+							ret = skd_flush_cmd_enqueue(skdev, (void *)skreq->bio);
+							if (ret != 0) {
+								pr_err("Failed to enqueue flush bio with Data. Err=%d.\n", ret);
+								skd_end_request(skdev, skreq, ret);
+							} else {
+								((*enqueued)++);
+							}
+						}
+					} else {
+						skd_end_request(skdev, skreq, 0);
+					}
+				} else {
+					skd_end_request(skdev, skreq, 0);
+				}
+			} else {
+				skd_resolve_req_exception(skdev, skreq);
+			}
+		}
+
+		/*
+		 * Release the skreq, its FIT msg (if one), timeout slot,
+		 * and queue depth.
+		 */
+		skd_release_skreq(skdev, skreq);
+
+		/* skd_isr_comp_limit equal zero means no limit */
+		if (limit) {
+			if (++processed >= limit) {
+				rc = 1;
+				break;
+			}
+		}
+	}
+
+	if ((skdev->state == SKD_DRVR_STATE_PAUSING)
+		&& (skdev->in_flight) == 0) {
+		skdev->state = SKD_DRVR_STATE_PAUSED;
+		wake_up_interruptible(&skdev->waitq);
+	}
+
+	return rc;
+}
+
+static void skd_complete_other(struct skd_device *skdev,
+			       volatile struct fit_completion_entry_v1 *skcomp,
+			       volatile struct fit_comp_error_info *skerr)
+{
+	u32 req_id = 0;
+	u32 req_table;
+	u32 req_slot;
+	struct skd_special_context *skspcl;
+
+	req_id = skcomp->tag;
+	req_table = req_id & SKD_ID_TABLE_MASK;
+	req_slot = req_id & SKD_ID_SLOT_MASK;
+
+	DPRINTK(skdev, "table=0x%x id=0x%x slot=%d\n", req_table, req_id,
+		req_slot);
+
+	/*
+	 * Based on the request id, determine how to dispatch this completion.
+	 * This swich/case is finding the good cases and forwarding the
+	 * completion entry. Errors are reported below the switch.
+	 */
+	switch (req_table) {
+	case SKD_ID_RW_REQUEST:
+		/*
+		 * The caller, skd_completion_posted_isr() above,
+		 * handles r/w requests. The only way we get here
+		 * is if the req_slot is out of bounds.
+		 */
+		break;
+
+	case SKD_ID_SPECIAL_REQUEST:
+		/*
+		 * Make sure the req_slot is in bounds and that the id
+		 * matches.
+		 */
+		if (req_slot < skdev->n_special) {
+			skspcl = &skdev->skspcl_table[req_slot];
+			if (skspcl->req.id == req_id &&
+			    skspcl->req.state == SKD_REQ_STATE_BUSY) {
+				skd_complete_special(skdev,
+						     skcomp, skerr, skspcl);
+				return;
+			}
+		}
+		break;
+
+	case SKD_ID_INTERNAL:
+		if (req_slot == 0) {
+			skspcl = &skdev->internal_skspcl;
+			if (skspcl->req.id == req_id &&
+			    skspcl->req.state == SKD_REQ_STATE_BUSY) {
+				skd_complete_internal(skdev,
+						      skcomp, skerr, skspcl);
+				return;
+			}
+		}
+		break;
+
+	case SKD_ID_FIT_MSG:
+		/*
+		 * These id's should never appear in a completion record.
+		 */
+		break;
+
+	default:
+		/*
+		 * These id's should never appear anywhere;
+		 */
+		break;
+	}
+
+	/*
+	 * If we get here it is a bad or stale id.
+	 */
+}
+
+static void skd_complete_special(struct skd_device *skdev,
+				 volatile struct fit_completion_entry_v1
+				 *skcomp,
+				 volatile struct fit_comp_error_info *skerr,
+				 struct skd_special_context *skspcl)
+{
+	DPRINTK(skdev, " completing special request %p\n", skspcl);
+	if (skspcl->orphaned) {
+		/* Discard orphaned request */
+		/* ?: Can this release directly or does it need
+		 * to use a worker? */
+		DPRINTK(skdev, "release orphaned %p\n", skspcl);
+		skd_release_special(skdev, skspcl);
+		return;
+	}
+
+	skd_process_scsi_inq(skdev, skcomp, skerr, skspcl);
+
+	skspcl->req.state = SKD_REQ_STATE_COMPLETED;
+	skspcl->req.completion = *skcomp;
+	skspcl->req.err_info = *skerr;
+
+	skd_log_check_status(skdev, skspcl->req.completion.status, skerr->key,
+			     skerr->code, skerr->qual, skerr->fruc);
+
+	wake_up_interruptible(&skdev->waitq);
+}
+
+/* assume spinlock is already held */
+static void skd_release_special(struct skd_device *skdev,
+				struct skd_special_context *skspcl)
+{
+	int i, was_depleted;
+
+	for (i = 0; i < skspcl->req.n_sg; i++) {
+
+		struct page *page = sg_page(&skspcl->req.sg[i]);
+		__free_page(page);
+	}
+
+	was_depleted = (skdev->skspcl_free_list == NULL);
+
+	skspcl->req.state = SKD_REQ_STATE_IDLE;
+	skspcl->req.id += SKD_ID_INCR;
+	skspcl->req.next =
+		(struct skd_request_context *)skdev->skspcl_free_list;
+	skdev->skspcl_free_list = (struct skd_special_context *)skspcl;
+
+	if (was_depleted) {
+		DPRINTK(skdev, "skspcl was depleted\n");
+		/* Free list was depleted. Their might be waiters. */
+		wake_up_interruptible(&skdev->waitq);
+	}
+}
+
+static void skd_reset_skcomp(struct skd_device *skdev)
+{
+	u32 nbytes;
+	struct fit_completion_entry_v1 *skcomp;
+
+	nbytes = sizeof(*skcomp) * SKD_N_COMPLETION_ENTRY;
+	nbytes += sizeof(struct fit_comp_error_info) * SKD_N_COMPLETION_ENTRY;
+
+	memset(skdev->skcomp_table, 0, nbytes);
+
+	skdev->skcomp_ix = 0;
+	skdev->skcomp_cycle = 1;
+}
+
+/*
+ *****************************************************************************
+ * INTERRUPTS
+ *****************************************************************************
+ */
+static void skd_completion_worker(struct work_struct *work)
+{
+	struct skd_device *skdev =
+		container_of(work, struct skd_device, completion_worker);
+	unsigned long flags;
+	int flush_enqueued = 0;
+
+	spin_lock_irqsave(&skdev->lock, flags);
+
+	/*
+	 * pass in limit=0, which means no limit..
+	 * process everything in compq
+	 */
+	skd_isr_completion_posted(skdev, 0, &flush_enqueued);
+	skd_request_fn(skdev->queue);
+
+	spin_unlock_irqrestore(&skdev->lock, flags);
+}
+
+static void skd_isr_msg_from_dev(struct skd_device *skdev);
+
+irqreturn_t
+static skd_isr(int irq, void *ptr)
+{
+	struct skd_device *skdev;
+	u32 intstat;
+	u32 ack;
+	int rc = 0;
+	int deferred = 0;
+	int flush_enqueued = 0;
+
+	skdev = (struct skd_device *)ptr;
+	spin_lock(&skdev->lock);
+
+	for (;; ) {
+		intstat = SKD_READL(skdev, FIT_INT_STATUS_HOST);
+
+		ack = FIT_INT_DEF_MASK;
+		ack &= intstat;
+
+		VPRINTK(skdev, "intstat=0x%x ack=0x%x\n", intstat, ack);
+
+		/* As long as there is an int pending on device, keep
+		 * running loop.  When none, get out, but if we've never
+		 * done any processing, call completion handler?
+		 */
+		if (ack == 0) {
+			/* No interrupts on device, but run the completion
+			 * processor anyway?
+			 */
+			if (rc == 0)
+				if (likely (skdev->state
+					== SKD_DRVR_STATE_ONLINE))
+					deferred = 1;
+			break;
+		}
+
+		rc = IRQ_HANDLED;
+
+		SKD_WRITEL(skdev, ack, FIT_INT_STATUS_HOST);
+
+		if (likely((skdev->state != SKD_DRVR_STATE_LOAD) &&
+			   (skdev->state != SKD_DRVR_STATE_STOPPING))) {
+			if (intstat & FIT_ISH_COMPLETION_POSTED) {
+				/*
+				 * If we have already deferred completion
+				 * processing, don't bother running it again
+				 */
+				if (deferred == 0)
+					deferred =
+						skd_isr_completion_posted(skdev,
+						skd_isr_comp_limit, &flush_enqueued);
+			}
+
+			if (intstat & FIT_ISH_FW_STATE_CHANGE) {
+				skd_isr_fwstate(skdev);
+				if (skdev->state == SKD_DRVR_STATE_FAULT ||
+				    skdev->state ==
+				    SKD_DRVR_STATE_DISAPPEARED) {
+					spin_unlock(&skdev->lock);
+					return rc;
+				}
+			}
+
+			if (intstat & FIT_ISH_MSG_FROM_DEV)
+				skd_isr_msg_from_dev(skdev);
+		}
+	}
+
+	if (unlikely(flush_enqueued))
+		skd_request_fn(skdev->queue);
+
+	if (deferred)
+		schedule_work(&skdev->completion_worker);
+	else if (!flush_enqueued)
+		skd_request_fn(skdev->queue);
+
+	spin_unlock(&skdev->lock);
+
+	return rc;
+}
+
+
+static void skd_drive_fault(struct skd_device *skdev)
+{
+	skdev->state = SKD_DRVR_STATE_FAULT;
+	pr_err("(%s): Drive FAULT\n", skd_name(skdev));
+}
+
+static void skd_drive_disappeared(struct skd_device *skdev)
+{
+	skdev->state = SKD_DRVR_STATE_DISAPPEARED;
+	pr_err("(%s): Drive DISAPPEARED\n", skd_name(skdev));
+}
+
+static void skd_isr_fwstate(struct skd_device *skdev)
+{
+	u32 sense;
+	u32 state;
+	u32 mtd;
+	int prev_driver_state = skdev->state;
+
+	sense = SKD_READL(skdev, FIT_STATUS);
+	state = sense & FIT_SR_DRIVE_STATE_MASK;
+
+	pr_err("(%s): s1120 state %s(%d)=>%s(%d)\n",
+	       skd_name(skdev),
+	       skd_drive_state_to_str(skdev->drive_state), skdev->drive_state,
+	       skd_drive_state_to_str(state), state);
+
+	skdev->drive_state = state;
+
+	switch (skdev->drive_state) {
+	case FIT_SR_DRIVE_INIT:
+		if (skdev->state == SKD_DRVR_STATE_PROTOCOL_MISMATCH) {
+			skd_disable_interrupts(skdev);
+			break;
+		}
+		if (skdev->state == SKD_DRVR_STATE_RESTARTING)
+			skd_recover_requests(skdev, 0);
+		if (skdev->state == SKD_DRVR_STATE_WAIT_BOOT) {
+			skdev->timer_countdown = SKD_STARTING_TIMO;
+			skdev->state = SKD_DRVR_STATE_STARTING;
+			skd_soft_reset(skdev);
+			break;
+		}
+		mtd = FIT_MXD_CONS(FIT_MTD_FITFW_INIT, 0, 0);
+		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+		skdev->last_mtd = mtd;
+		break;
+
+	case FIT_SR_DRIVE_ONLINE:
+		skdev->cur_max_queue_depth = skd_max_queue_depth;
+		if (skdev->cur_max_queue_depth > skdev->dev_max_queue_depth)
+			skdev->cur_max_queue_depth = skdev->dev_max_queue_depth;
+
+		skdev->queue_low_water_mark =
+			skdev->cur_max_queue_depth * 2 / 3 + 1;
+		if (skdev->queue_low_water_mark < 1)
+			skdev->queue_low_water_mark = 1;
+		pr_info(
+		       "(%s): Queue depth limit=%d dev=%d lowat=%d\n",
+		       skd_name(skdev),
+		       skdev->cur_max_queue_depth,
+		       skdev->dev_max_queue_depth, skdev->queue_low_water_mark);
+
+		skd_refresh_device_data(skdev);
+		break;
+
+	case FIT_SR_DRIVE_BUSY:
+		skdev->state = SKD_DRVR_STATE_BUSY;
+		skdev->timer_countdown = SKD_BUSY_TIMO;
+		skd_quiesce_dev(skdev);
+		break;
+	case FIT_SR_DRIVE_BUSY_SANITIZE:
+		/* set timer for 3 seconds, we'll abort any unfinished
+		 * commands after that expires
+		 */
+		skdev->state = SKD_DRVR_STATE_BUSY_SANITIZE;
+		skdev->timer_countdown = SKD_TIMER_SECONDS(3);
+		skd_start_queue(skdev);
+		break;
+	case FIT_SR_DRIVE_BUSY_ERASE:
+		skdev->state = SKD_DRVR_STATE_BUSY_ERASE;
+		skdev->timer_countdown = SKD_BUSY_TIMO;
+		break;
+	case FIT_SR_DRIVE_OFFLINE:
+		skdev->state = SKD_DRVR_STATE_IDLE;
+		break;
+	case FIT_SR_DRIVE_SOFT_RESET:
+		switch (skdev->state) {
+		case SKD_DRVR_STATE_STARTING:
+		case SKD_DRVR_STATE_RESTARTING:
+			/* Expected by a caller of skd_soft_reset() */
+			break;
+		default:
+			skdev->state = SKD_DRVR_STATE_RESTARTING;
+			break;
+		}
+		break;
+	case FIT_SR_DRIVE_FW_BOOTING:
+		VPRINTK(skdev, "ISR FIT_SR_DRIVE_FW_BOOTING %s\n", skdev->name);
+		skdev->state = SKD_DRVR_STATE_WAIT_BOOT;
+		skdev->timer_countdown = SKD_WAIT_BOOT_TIMO;
+		break;
+
+	case FIT_SR_DRIVE_DEGRADED:
+	case FIT_SR_PCIE_LINK_DOWN:
+	case FIT_SR_DRIVE_NEED_FW_DOWNLOAD:
+		break;
+
+	case FIT_SR_DRIVE_FAULT:
+		skd_drive_fault(skdev);
+		skd_recover_requests(skdev, 0);
+		skd_start_queue(skdev);
+		break;
+
+	/* PCIe bus returned all Fs? */
+	case 0xFF:
+		pr_info("(%s): state=0x%x sense=0x%x\n",
+		       skd_name(skdev), state, sense);
+		skd_drive_disappeared(skdev);
+		skd_recover_requests(skdev, 0);
+		skd_start_queue(skdev);
+		break;
+	default:
+		/*
+		 * Uknown FW State. Wait for a state we recognize.
+		 */
+		break;
+	}
+	pr_err("(%s): Driver state %s(%d)=>%s(%d)\n",
+	       skd_name(skdev),
+	       skd_skdev_state_to_str(prev_driver_state), prev_driver_state,
+	       skd_skdev_state_to_str(skdev->state), skdev->state);
+}
+
+static void skd_recover_requests(struct skd_device *skdev, int requeue)
+{
+	int i;
+
+	for (i = 0; i < skdev->num_req_context; i++) {
+		struct skd_request_context *skreq = &skdev->skreq_table[i];
+
+		if (skreq->state == SKD_REQ_STATE_BUSY) {
+			skd_log_skreq(skdev, skreq, "recover");
+
+			SKD_ASSERT((skreq->id & SKD_ID_INCR) != 0);
+			if (!skd_bio)
+				SKD_ASSERT(skreq->req != NULL);
+			else
+				SKD_ASSERT(skreq->bio != NULL);
+
+			/* Release DMA resources for the request. */
+			if (skreq->n_sg > 0)
+				skd_postop_sg_list(skdev, skreq);
+
+			if (!skd_bio) {
+				if (requeue &&
+				    (unsigned long) ++skreq->req->special <
+				    SKD_MAX_RETRIES)
+					skd_requeue_request(skdev, skreq);
+				else
+				skd_end_request(skdev, skreq, -EIO);
+			} else
+				skd_end_request(skdev, skreq, -EIO);
+
+			if (!skd_bio)
+				skreq->req = NULL;
+			else
+				skreq->bio = NULL;
+
+			skreq->state = SKD_REQ_STATE_IDLE;
+			skreq->id += SKD_ID_INCR;
+
+
+		}
+		if (i > 0)
+			skreq[-1].next = skreq;
+		skreq->next = NULL;
+	}
+	skdev->skreq_free_list = skdev->skreq_table;
+
+	for (i = 0; i < skdev->num_fitmsg_context; i++) {
+		struct skd_fitmsg_context *skmsg = &skdev->skmsg_table[i];
+
+		if (skmsg->state == SKD_MSG_STATE_BUSY) {
+			skd_log_skmsg(skdev, skmsg, "salvaged");
+			SKD_ASSERT((skmsg->id & SKD_ID_INCR) != 0);
+			skmsg->state = SKD_MSG_STATE_IDLE;
+			skmsg->id += SKD_ID_INCR;
+		}
+		if (i > 0)
+			skmsg[-1].next = skmsg;
+		skmsg->next = NULL;
+	}
+	skdev->skmsg_free_list = skdev->skmsg_table;
+
+	for (i = 0; i < skdev->n_special; i++) {
+		struct skd_special_context *skspcl = &skdev->skspcl_table[i];
+
+		/* If orphaned, reclaim it because it has already been reported
+		 * to the process as an error (it was just waiting for
+		 * a completion that didn't come, and now it will never come)
+		 * If busy, change to a state that will cause it to error
+		 * out in the wait routine and let it do the normal
+		 * reporting and reclaiming
+		 */
+		if (skspcl->req.state == SKD_REQ_STATE_BUSY) {
+			if (skspcl->orphaned) {
+				DPRINTK(skdev, "orphaned %p\n", skspcl);
+				skd_release_special(skdev, skspcl);
+			} else {
+				DPRINTK(skdev, "not orphaned %p\n", skspcl);
+				skspcl->req.state = SKD_REQ_STATE_ABORTED;
+			}
+		}
+	}
+	skdev->skspcl_free_list = skdev->skspcl_table;
+
+	for (i = 0; i < SKD_N_TIMEOUT_SLOT; i++)
+		skdev->timeout_slot[i] = 0;
+
+	skdev->in_flight = 0;
+}
+
+static void skd_isr_msg_from_dev(struct skd_device *skdev)
+{
+	u32 mfd;
+	u32 mtd;
+	u32 data;
+
+	mfd = SKD_READL(skdev, FIT_MSG_FROM_DEVICE);
+
+	DPRINTK(skdev, "mfd=0x%x last_mtd=0x%x\n", mfd, skdev->last_mtd);
+
+	/* ignore any mtd that is an ack for something we didn't send */
+	if (FIT_MXD_TYPE(mfd) != FIT_MXD_TYPE(skdev->last_mtd))
+		return;
+
+	switch (FIT_MXD_TYPE(mfd)) {
+	case FIT_MTD_FITFW_INIT:
+		skdev->proto_ver = FIT_PROTOCOL_MAJOR_VER(mfd);
+
+		if (skdev->proto_ver != FIT_PROTOCOL_VERSION_1) {
+			pr_err("(%s): protocol mismatch\n",
+			       skdev->name);
+			pr_err("(%s):   got=%d support=%d\n",
+			       skdev->name, skdev->proto_ver,
+			       FIT_PROTOCOL_VERSION_1);
+			pr_err("(%s):   please upgrade driver\n",
+			       skdev->name);
+			skdev->state = SKD_DRVR_STATE_PROTOCOL_MISMATCH;
+			skd_soft_reset(skdev);
+			break;
+		}
+		mtd = FIT_MXD_CONS(FIT_MTD_GET_CMDQ_DEPTH, 0, 0);
+		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+		skdev->last_mtd = mtd;
+		break;
+
+	case FIT_MTD_GET_CMDQ_DEPTH:
+		skdev->dev_max_queue_depth = FIT_MXD_DATA(mfd);
+		mtd = FIT_MXD_CONS(FIT_MTD_SET_COMPQ_DEPTH, 0,
+				   SKD_N_COMPLETION_ENTRY);
+		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+		skdev->last_mtd = mtd;
+		break;
+
+	case FIT_MTD_SET_COMPQ_DEPTH:
+		SKD_WRITEQ(skdev, skdev->cq_dma_address, FIT_MSG_TO_DEVICE_ARG);
+		mtd = FIT_MXD_CONS(FIT_MTD_SET_COMPQ_ADDR, 0, 0);
+		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+		skdev->last_mtd = mtd;
+		break;
+
+	case FIT_MTD_SET_COMPQ_ADDR:
+		skd_reset_skcomp(skdev);
+		mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_HOST_ID, 0, skdev->devno);
+		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+		skdev->last_mtd = mtd;
+		break;
+
+	case FIT_MTD_CMD_LOG_HOST_ID:
+		skdev->connect_time_stamp = get_seconds();
+		data = skdev->connect_time_stamp & 0xFFFF;
+		mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_LO, 0, data);
+		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+		skdev->last_mtd = mtd;
+		break;
+
+	case FIT_MTD_CMD_LOG_TIME_STAMP_LO:
+		skdev->drive_jiffies = FIT_MXD_DATA(mfd);
+		data = (skdev->connect_time_stamp >> 16) & 0xFFFF;
+		mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_HI, 0, data);
+		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+		skdev->last_mtd = mtd;
+		break;
+
+	case FIT_MTD_CMD_LOG_TIME_STAMP_HI:
+		skdev->drive_jiffies |= (FIT_MXD_DATA(mfd) << 16);
+		mtd = FIT_MXD_CONS(FIT_MTD_ARM_QUEUE, 0, 0);
+		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+		skdev->last_mtd = mtd;
+
+		pr_err("(%s): Time sync driver=0x%x device=0x%x\n",
+		       skd_name(skdev),
+		       skdev->connect_time_stamp, skdev->drive_jiffies);
+		break;
+
+	case FIT_MTD_ARM_QUEUE:
+		skdev->last_mtd = 0;
+		/*
+		 * State should be, or soon will be, FIT_SR_DRIVE_ONLINE.
+		 */
+		break;
+
+	default:
+		break;
+	}
+}
+
+static void skd_disable_interrupts(struct skd_device *skdev)
+{
+	u32 sense;
+
+	sense = SKD_READL(skdev, FIT_CONTROL);
+	sense &= ~FIT_CR_ENABLE_INTERRUPTS;
+	SKD_WRITEL(skdev, sense, FIT_CONTROL);
+	DPRINTK(skdev, "sense 0x%x\n", sense);
+
+	/* Note that the 1s is written. A 1-bit means
+	 * disable, a 0 means enable.
+	 */
+	SKD_WRITEL(skdev, ~0, FIT_INT_MASK_HOST);
+}
+
+static void skd_enable_interrupts(struct skd_device *skdev)
+{
+	u32 val;
+
+	/* unmask interrupts first */
+	val = FIT_ISH_FW_STATE_CHANGE +
+	      FIT_ISH_COMPLETION_POSTED + FIT_ISH_MSG_FROM_DEV;
+
+	/* Note that the compliment of mask is written. A 1-bit means
+	 * disable, a 0 means enable. */
+	SKD_WRITEL(skdev, ~val, FIT_INT_MASK_HOST);
+	DPRINTK(skdev, "interrupt mask=0x%x\n", ~val);
+
+	val = SKD_READL(skdev, FIT_CONTROL);
+	val |= FIT_CR_ENABLE_INTERRUPTS;
+	DPRINTK(skdev, "control=0x%x\n", val);
+	SKD_WRITEL(skdev, val, FIT_CONTROL);
+}
+
+/*
+ *****************************************************************************
+ * START, STOP, RESTART, QUIESCE, UNQUIESCE
+ *****************************************************************************
+ */
+
+static void skd_soft_reset(struct skd_device *skdev)
+{
+	u32 val;
+
+	val = SKD_READL(skdev, FIT_CONTROL);
+	val |= (FIT_CR_SOFT_RESET);
+	DPRINTK(skdev, "control=0x%x\n", val);
+	SKD_WRITEL(skdev, val, FIT_CONTROL);
+}
+
+static void skd_start_device(struct skd_device *skdev)
+{
+	unsigned long flags;
+	u32 sense;
+	u32 state;
+
+	spin_lock_irqsave(&skdev->lock, flags);
+
+	/* ack all ghost interrupts */
+	SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST);
+
+	sense = SKD_READL(skdev, FIT_STATUS);
+
+	DPRINTK(skdev, "initial status=0x%x\n", sense);
+
+	state = sense & FIT_SR_DRIVE_STATE_MASK;
+	skdev->drive_state = state;
+	skdev->last_mtd = 0;
+
+	skdev->state = SKD_DRVR_STATE_STARTING;
+	skdev->timer_countdown = SKD_STARTING_TIMO;
+
+	skd_enable_interrupts(skdev);
+
+	switch (skdev->drive_state) {
+	case FIT_SR_DRIVE_OFFLINE:
+		pr_err("(%s): Drive offline...\n", skd_name(skdev));
+		break;
+
+	case FIT_SR_DRIVE_FW_BOOTING:
+		VPRINTK(skdev, "FIT_SR_DRIVE_FW_BOOTING %s\n", skdev->name);
+		skdev->state = SKD_DRVR_STATE_WAIT_BOOT;
+		skdev->timer_countdown = SKD_WAIT_BOOT_TIMO;
+		break;
+
+	case FIT_SR_DRIVE_BUSY_SANITIZE:
+		pr_info("(%s): Start: BUSY_SANITIZE\n",
+		       skd_name(skdev));
+		skdev->state = SKD_DRVR_STATE_BUSY_SANITIZE;
+		skdev->timer_countdown = SKD_STARTED_BUSY_TIMO;
+		break;
+
+	case FIT_SR_DRIVE_BUSY_ERASE:
+		pr_info("(%s): Start: BUSY_ERASE\n", skd_name(skdev));
+		skdev->state = SKD_DRVR_STATE_BUSY_ERASE;
+		skdev->timer_countdown = SKD_STARTED_BUSY_TIMO;
+		break;
+
+	case FIT_SR_DRIVE_INIT:
+	case FIT_SR_DRIVE_ONLINE:
+		skd_soft_reset(skdev);
+		break;
+
+	case FIT_SR_DRIVE_BUSY:
+		pr_err("(%s): Drive Busy...\n", skd_name(skdev));
+		skdev->state = SKD_DRVR_STATE_BUSY;
+		skdev->timer_countdown = SKD_STARTED_BUSY_TIMO;
+		break;
+
+	case FIT_SR_DRIVE_SOFT_RESET:
+		pr_err("(%s) drive soft reset in prog\n",
+		       skd_name(skdev));
+		break;
+
+	case FIT_SR_DRIVE_FAULT:
+		/* Fault state is bad...soft reset won't do it...
+		 * Hard reset, maybe, but does it work on device?
+		 * For now, just fault so the system doesn't hang.
+		 */
+		skd_drive_fault(skdev);
+		/*start the queue so we can respond with error to requests */
+		VPRINTK(skdev, "starting %s queue\n", skdev->name);
+		skd_start_queue(skdev);
+		skdev->gendisk_on = -1;
+		wake_up_interruptible(&skdev->waitq);
+		break;
+
+	case 0xFF:
+		/* Most likely the device isn't there or isn't responding
+		 * to the BAR1 addresses. */
+		skd_drive_disappeared(skdev);
+		/*start the queue so we can respond with error to requests */
+		VPRINTK(skdev, "starting %s queue to error-out reqs\n",
+			skdev->name);
+		skd_start_queue(skdev);
+		skdev->gendisk_on = -1;
+		wake_up_interruptible(&skdev->waitq);
+		break;
+
+	default:
+		pr_err("(%s) Start: unknown state %x\n",
+		       skd_name(skdev), skdev->drive_state);
+		break;
+	}
+
+	state = SKD_READL(skdev, FIT_CONTROL);
+	DPRINTK(skdev, "FIT Control Status=0x%x\n", state);
+
+	state = SKD_READL(skdev, FIT_INT_STATUS_HOST);
+	DPRINTK(skdev, "Intr Status=0x%x\n", state);
+
+	state = SKD_READL(skdev, FIT_INT_MASK_HOST);
+	DPRINTK(skdev, "Intr Mask=0x%x\n", state);
+
+	state = SKD_READL(skdev, FIT_MSG_FROM_DEVICE);
+	DPRINTK(skdev, "Msg from Dev=0x%x\n", state);
+
+	state = SKD_READL(skdev, FIT_HW_VERSION);
+	DPRINTK(skdev, "HW version=0x%x\n", state);
+
+	spin_unlock_irqrestore(&skdev->lock, flags);
+}
+
+static void skd_stop_device(struct skd_device *skdev)
+{
+	unsigned long flags;
+	struct skd_special_context *skspcl = &skdev->internal_skspcl;
+	u32 dev_state;
+	int i;
+
+	spin_lock_irqsave(&skdev->lock, flags);
+
+	if (skdev->state != SKD_DRVR_STATE_ONLINE) {
+		pr_err("(%s): skd_stop_device not online no sync\n",
+		       skd_name(skdev));
+		goto stop_out;
+	}
+
+	if (skspcl->req.state != SKD_REQ_STATE_IDLE) {
+		pr_err("(%s): skd_stop_device no special\n",
+		       skd_name(skdev));
+		goto stop_out;
+	}
+
+	skdev->state = SKD_DRVR_STATE_SYNCING;
+	skdev->sync_done = 0;
+
+	skd_send_internal_skspcl(skdev, skspcl, SYNCHRONIZE_CACHE);
+
+	spin_unlock_irqrestore(&skdev->lock, flags);
+
+	wait_event_interruptible_timeout(skdev->waitq,
+					 (skdev->sync_done), (10 * HZ));
+
+	spin_lock_irqsave(&skdev->lock, flags);
+
+	switch (skdev->sync_done) {
+	case 0:
+		pr_err("(%s): skd_stop_device no sync\n",
+		       skd_name(skdev));
+		break;
+	case 1:
+		pr_err("(%s): skd_stop_device sync done\n",
+		       skd_name(skdev));
+		break;
+	default:
+		pr_err("(%s): skd_stop_device sync error\n",
+		       skd_name(skdev));
+	}
+
+stop_out:
+	skdev->state = SKD_DRVR_STATE_STOPPING;
+	spin_unlock_irqrestore(&skdev->lock, flags);
+
+	skd_kill_timer(skdev);
+
+	spin_lock_irqsave(&skdev->lock, flags);
+	skd_disable_interrupts(skdev);
+
+	/* ensure all ints on device are cleared */
+	/* soft reset the device to unload with a clean slate */
+	SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST);
+	SKD_WRITEL(skdev, FIT_CR_SOFT_RESET, FIT_CONTROL);
+
+	spin_unlock_irqrestore(&skdev->lock, flags);
+
+	/* poll every 100ms, 1 second timeout */
+	for (i = 0; i < 10; i++) {
+		dev_state =
+			SKD_READL(skdev, FIT_STATUS) & FIT_SR_DRIVE_STATE_MASK;
+		if (dev_state == FIT_SR_DRIVE_INIT)
+			break;
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(msecs_to_jiffies(100));
+	}
+
+	if (dev_state != FIT_SR_DRIVE_INIT)
+		pr_err("(%s): skd_stop_device state error 0x%02x\n",
+		       skd_name(skdev), dev_state);
+}
+
+/* assume spinlock is held */
+static void skd_restart_device(struct skd_device *skdev)
+{
+	u32 state;
+
+	/* ack all ghost interrupts */
+	SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST);
+
+	state = SKD_READL(skdev, FIT_STATUS);
+
+	DPRINTK(skdev, "drive status=0x%x\n", state);
+
+	state &= FIT_SR_DRIVE_STATE_MASK;
+	skdev->drive_state = state;
+	skdev->last_mtd = 0;
+
+	skdev->state = SKD_DRVR_STATE_RESTARTING;
+	skdev->timer_countdown = SKD_RESTARTING_TIMO;
+
+	skd_soft_reset(skdev);
+}
+
+/* assume spinlock is held */
+static int skd_quiesce_dev(struct skd_device *skdev)
+{
+	int rc = 0;
+
+	switch (skdev->state) {
+	case SKD_DRVR_STATE_BUSY:
+	case SKD_DRVR_STATE_BUSY_IMMINENT:
+		VPRINTK(skdev, "stopping %s queue\n", skdev->name);
+		skd_stop_queue(skdev);
+		break;
+	case SKD_DRVR_STATE_ONLINE:
+	case SKD_DRVR_STATE_STOPPING:
+	case SKD_DRVR_STATE_SYNCING:
+	case SKD_DRVR_STATE_PAUSING:
+	case SKD_DRVR_STATE_PAUSED:
+	case SKD_DRVR_STATE_STARTING:
+	case SKD_DRVR_STATE_RESTARTING:
+	case SKD_DRVR_STATE_RESUMING:
+	default:
+		rc = -EINVAL;
+		VPRINTK(skdev, "state [%d] not implemented\n", skdev->state);
+	}
+	return rc;
+}
+
+/* assume spinlock is held */
+static int skd_unquiesce_dev(struct skd_device *skdev)
+{
+	int prev_driver_state = skdev->state;
+
+	skd_log_skdev(skdev, "unquiesce");
+	if (skdev->state == SKD_DRVR_STATE_ONLINE) {
+		DPRINTK(skdev, "**** device already ONLINE\n");
+		return 0;
+	}
+	if (skdev->drive_state != FIT_SR_DRIVE_ONLINE) {
+		/*
+		 * If there has been an state change to other than
+		 * ONLINE, we will rely on controller state change
+		 * to come back online and restart the queue.
+		 * The BUSY state means that driver is ready to
+		 * continue normal processing but waiting for controller
+		 * to become available.
+		 */
+		skdev->state = SKD_DRVR_STATE_BUSY;
+		DPRINTK(skdev, "drive BUSY state\n");
+		return 0;
+	}
+
+	/*
+	 * Drive has just come online, driver is either in startup,
+	 * paused performing a task, or bust waiting for hardware.
+	 */
+	switch (skdev->state) {
+	case SKD_DRVR_STATE_PAUSED:
+	case SKD_DRVR_STATE_BUSY:
+	case SKD_DRVR_STATE_BUSY_IMMINENT:
+	case SKD_DRVR_STATE_BUSY_ERASE:
+	case SKD_DRVR_STATE_STARTING:
+	case SKD_DRVR_STATE_RESTARTING:
+	case SKD_DRVR_STATE_FAULT:
+	case SKD_DRVR_STATE_IDLE:
+	case SKD_DRVR_STATE_LOAD:
+		skdev->state = SKD_DRVR_STATE_ONLINE;
+		pr_err("(%s): Driver state %s(%d)=>%s(%d)\n",
+		       skd_name(skdev),
+		       skd_skdev_state_to_str(prev_driver_state),
+		       prev_driver_state, skd_skdev_state_to_str(skdev->state),
+		       skdev->state);
+		DPRINTK(skdev, "**** device ONLINE...starting block queue\n");
+		VPRINTK(skdev, "starting %s queue\n", skdev->name);
+		pr_info("(%s): STEC s1120 ONLINE\n", skd_name(skdev));
+		skd_start_queue(skdev);
+		skdev->gendisk_on = 1;
+		wake_up_interruptible(&skdev->waitq);
+		break;
+
+	case SKD_DRVR_STATE_DISAPPEARED:
+	default:
+		DPRINTK(skdev, "**** driver state %d, not implemented \n",
+			skdev->state);
+		return -EBUSY;
+	}
+	return 0;
+}
+
+/*
+ *****************************************************************************
+ * PCIe MSI/MSI-X INTERRUPT HANDLERS
+ *****************************************************************************
+ */
+
+static irqreturn_t skd_reserved_isr(int irq, void *skd_host_data)
+{
+	struct skd_device *skdev = skd_host_data;
+	unsigned long flags;
+
+	spin_lock_irqsave(&skdev->lock, flags);
+	VPRINTK(skdev, "MSIX = 0x%x\n", SKD_READL(skdev, FIT_INT_STATUS_HOST));
+	pr_err("(%s): MSIX reserved irq %d = 0x%x\n", skd_name(skdev),
+	       irq, SKD_READL(skdev, FIT_INT_STATUS_HOST));
+	SKD_WRITEL(skdev, FIT_INT_RESERVED_MASK, FIT_INT_STATUS_HOST);
+	spin_unlock_irqrestore(&skdev->lock, flags);
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t skd_statec_isr(int irq, void *skd_host_data)
+{
+	struct skd_device *skdev = skd_host_data;
+	unsigned long flags;
+
+	spin_lock_irqsave(&skdev->lock, flags);
+	VPRINTK(skdev, "MSIX = 0x%x\n", SKD_READL(skdev, FIT_INT_STATUS_HOST));
+	SKD_WRITEL(skdev, FIT_ISH_FW_STATE_CHANGE, FIT_INT_STATUS_HOST);
+	skd_isr_fwstate(skdev);
+	spin_unlock_irqrestore(&skdev->lock, flags);
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t skd_comp_q(int irq, void *skd_host_data)
+{
+	struct skd_device *skdev = skd_host_data;
+	unsigned long flags;
+	int flush_enqueued = 0;
+	int deferred;
+
+	spin_lock_irqsave(&skdev->lock, flags);
+	VPRINTK(skdev, "MSIX = 0x%x\n", SKD_READL(skdev, FIT_INT_STATUS_HOST));
+	SKD_WRITEL(skdev, FIT_ISH_COMPLETION_POSTED, FIT_INT_STATUS_HOST);
+	deferred = skd_isr_completion_posted(skdev, skd_isr_comp_limit,
+						&flush_enqueued);
+
+	if (flush_enqueued)
+		skd_request_fn(skdev->queue);
+
+	if (deferred)
+		schedule_work(&skdev->completion_worker);
+	else if (!flush_enqueued)
+		skd_request_fn(skdev->queue);
+
+	spin_unlock_irqrestore(&skdev->lock, flags);
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t skd_msg_isr(int irq, void *skd_host_data)
+{
+	struct skd_device *skdev = skd_host_data;
+	unsigned long flags;
+
+	spin_lock_irqsave(&skdev->lock, flags);
+	VPRINTK(skdev, "MSIX = 0x%x\n", SKD_READL(skdev, FIT_INT_STATUS_HOST));
+	SKD_WRITEL(skdev, FIT_ISH_MSG_FROM_DEV, FIT_INT_STATUS_HOST);
+	skd_isr_msg_from_dev(skdev);
+	spin_unlock_irqrestore(&skdev->lock, flags);
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t skd_qfull_isr(int irq, void *skd_host_data)
+{
+	struct skd_device *skdev = skd_host_data;
+	unsigned long flags;
+
+	spin_lock_irqsave(&skdev->lock, flags);
+	VPRINTK(skdev, "MSIX = 0x%x\n", SKD_READL(skdev, FIT_INT_STATUS_HOST));
+	SKD_WRITEL(skdev, FIT_INT_QUEUE_FULL, FIT_INT_STATUS_HOST);
+	spin_unlock_irqrestore(&skdev->lock, flags);
+	return IRQ_HANDLED;
+}
+
+/*
+ *****************************************************************************
+ * PCIe MSI/MSI-X SETUP
+ *****************************************************************************
+ */
+
+struct skd_msix_entry {
+	int have_irq;
+	u32 vector;
+	u32 entry;
+	struct skd_device *rsp;
+	char isr_name[30];
+};
+
+struct skd_init_msix_entry {
+	const char *name;
+	irq_handler_t handler;
+};
+
+#define SKD_MAX_MSIX_COUNT              13
+#define SKD_MIN_MSIX_COUNT              7
+#define SKD_BASE_MSIX_IRQ               4
+
+static struct skd_init_msix_entry msix_entries[SKD_MAX_MSIX_COUNT] = {
+	{ "(DMA 0)",	    skd_reserved_isr },
+	{ "(DMA 1)",	    skd_reserved_isr },
+	{ "(DMA 2)",	    skd_reserved_isr },
+	{ "(DMA 3)",	    skd_reserved_isr },
+	{ "(State Change)", skd_statec_isr   },
+	{ "(COMPL_Q)",	    skd_comp_q	     },
+	{ "(MSG)",	    skd_msg_isr	     },
+	{ "(Reserved)",	    skd_reserved_isr },
+	{ "(Reserved)",	    skd_reserved_isr },
+	{ "(Queue Full 0)", skd_qfull_isr    },
+	{ "(Queue Full 1)", skd_qfull_isr    },
+	{ "(Queue Full 2)", skd_qfull_isr    },
+	{ "(Queue Full 3)", skd_qfull_isr    },
+};
+
+static void skd_release_msix(struct skd_device *skdev)
+{
+	struct skd_msix_entry *qentry;
+	int i;
+
+	if (skdev->msix_entries == NULL)
+		return;
+	for (i = 0; i < skdev->msix_count; i++) {
+		qentry = &skdev->msix_entries[i];
+		skdev = qentry->rsp;
+
+		if (qentry->have_irq)
+			devm_free_irq(&skdev->pdev->dev,
+				      qentry->vector, qentry->rsp);
+	}
+	pci_disable_msix(skdev->pdev);
+	kfree(skdev->msix_entries);
+	skdev->msix_count = 0;
+	skdev->msix_entries = NULL;
+}
+
+static int skd_acquire_msix(struct skd_device *skdev)
+{
+	int i, rc;
+	struct pci_dev *pdev;
+	struct msix_entry *entries = NULL;
+	struct skd_msix_entry *qentry;
+
+	pdev = skdev->pdev;
+	skdev->msix_count = SKD_MAX_MSIX_COUNT;
+	entries = kzalloc(sizeof(struct msix_entry) * SKD_MAX_MSIX_COUNT,
+			  GFP_KERNEL);
+	if (!entries)
+		return -ENOMEM;
+
+	for (i = 0; i < SKD_MAX_MSIX_COUNT; i++)
+		entries[i].entry = i;
+
+	rc = pci_enable_msix(pdev, entries, SKD_MAX_MSIX_COUNT);
+	if (rc < 0)
+		goto msix_out;
+	if (rc) {
+		if (rc < SKD_MIN_MSIX_COUNT) {
+			pr_err("(%s): failed to enable MSI-X %d\n",
+			       skd_name(skdev), rc);
+			goto msix_out;
+		}
+		DPRINTK(skdev, "%s: <%s> allocated %d MSI-X vectors\n",
+			pci_name(pdev), skdev->name, rc);
+
+		skdev->msix_count = rc;
+		rc = pci_enable_msix(pdev, entries, skdev->msix_count);
+		if (rc) {
+			pr_err("(%s): failed to enable MSI-X "
+			       "support (%d) %d\n",
+			       skd_name(skdev), skdev->msix_count, rc);
+			goto msix_out;
+		}
+	}
+	skdev->msix_entries = kzalloc(sizeof(struct skd_msix_entry) *
+				      skdev->msix_count, GFP_KERNEL);
+	if (!skdev->msix_entries) {
+		rc = -ENOMEM;
+		skdev->msix_count = 0;
+		pr_err("(%s): msix table allocation error\n",
+		       skd_name(skdev));
+		goto msix_out;
+	}
+
+	qentry = skdev->msix_entries;
+	for (i = 0; i < skdev->msix_count; i++) {
+		qentry->vector = entries[i].vector;
+		qentry->entry = entries[i].entry;
+		qentry->rsp = NULL;
+		qentry->have_irq = 0;
+		DPRINTK(skdev, "%s: <%s> msix (%d) vec %d, entry %x\n",
+			pci_name(pdev), skdev->name,
+			i, qentry->vector, qentry->entry);
+		qentry++;
+	}
+
+	/* Enable MSI-X vectors for the base queue */
+	for (i = 0; i < SKD_MAX_MSIX_COUNT; i++) {
+		qentry = &skdev->msix_entries[i];
+		snprintf(qentry->isr_name, sizeof(qentry->isr_name),
+			 "%s%d-msix %s", DRV_NAME, skdev->devno,
+			 msix_entries[i].name);
+		rc = devm_request_irq(&skdev->pdev->dev, qentry->vector,
+				      msix_entries[i].handler, 0,
+				      qentry->isr_name, skdev);
+		if (rc) {
+			pr_err("(%s): Unable to register(%d) MSI-X "
+			       "handler %d: %s\n",
+			       skd_name(skdev), rc, i, qentry->isr_name);
+			goto msix_out;
+		} else {
+			qentry->have_irq = 1;
+			qentry->rsp = skdev;
+		}
+	}
+	DPRINTK(skdev, "%s: <%s> msix %d irq(s) enabled\n",
+		pci_name(pdev), skdev->name, skdev->msix_count);
+	return 0;
+
+msix_out:
+	if (entries)
+		kfree(entries);
+	skd_release_msix(skdev);
+	return rc;
+}
+
+static int skd_acquire_irq(struct skd_device *skdev)
+{
+	int rc;
+	struct pci_dev *pdev;
+
+	pdev = skdev->pdev;
+	skdev->msix_count = 0;
+
+RETRY_IRQ_TYPE:
+	switch (skdev->irq_type) {
+	case SKD_IRQ_MSIX:
+		rc = skd_acquire_msix(skdev);
+		if (!rc)
+			pr_info("(%s): MSI-X %d irqs enabled\n",
+			       skd_name(skdev), skdev->msix_count);
+		else {
+			pr_err(
+			       "(%s): failed to enable MSI-X, re-trying with MSI %d\n",
+			       skd_name(skdev), rc);
+			skdev->irq_type = SKD_IRQ_MSI;
+			goto RETRY_IRQ_TYPE;
+		}
+		break;
+	case SKD_IRQ_MSI:
+		snprintf(skdev->isr_name, sizeof(skdev->isr_name), "%s%d-msi",
+			 DRV_NAME, skdev->devno);
+		rc = pci_enable_msi(pdev);
+		if (!rc) {
+			rc = devm_request_irq(&pdev->dev, pdev->irq, skd_isr, 0,
+					      skdev->isr_name, skdev);
+			if (rc) {
+				pci_disable_msi(pdev);
+				pr_err(
+				       "(%s): failed to allocate the MSI interrupt %d\n",
+				       skd_name(skdev), rc);
+				goto RETRY_IRQ_LEGACY;
+			}
+			pr_info("(%s): MSI irq %d enabled\n",
+			       skd_name(skdev), pdev->irq);
+		} else {
+RETRY_IRQ_LEGACY:
+			pr_err(
+			       "(%s): failed to enable MSI, re-trying with LEGACY %d\n",
+			       skd_name(skdev), rc);
+			skdev->irq_type = SKD_IRQ_LEGACY;
+			goto RETRY_IRQ_TYPE;
+		}
+		break;
+	case SKD_IRQ_LEGACY:
+		snprintf(skdev->isr_name, sizeof(skdev->isr_name),
+			 "%s%d-legacy", DRV_NAME, skdev->devno);
+		rc = devm_request_irq(&pdev->dev, pdev->irq, skd_isr,
+				      IRQF_SHARED, skdev->isr_name, skdev);
+		if (!rc)
+			pr_info("(%s): LEGACY irq %d enabled\n",
+			       skd_name(skdev), pdev->irq);
+		else
+			pr_err("(%s): request LEGACY irq error %d\n",
+			       skd_name(skdev), rc);
+		break;
+	default:
+		pr_info("(%s): irq_type %d invalid, re-set to %d\n",
+		       skd_name(skdev), skdev->irq_type, SKD_IRQ_DEFAULT);
+		skdev->irq_type = SKD_IRQ_LEGACY;
+		goto RETRY_IRQ_TYPE;
+	}
+	return rc;
+}
+
+static void skd_release_irq(struct skd_device *skdev)
+{
+	switch (skdev->irq_type) {
+	case SKD_IRQ_MSIX:
+		skd_release_msix(skdev);
+		break;
+	case SKD_IRQ_MSI:
+		devm_free_irq(&skdev->pdev->dev, skdev->pdev->irq, skdev);
+		pci_disable_msi(skdev->pdev);
+		break;
+	case SKD_IRQ_LEGACY:
+		devm_free_irq(&skdev->pdev->dev, skdev->pdev->irq, skdev);
+		break;
+	default:
+		pr_err("(%s): wrong irq type %d!",
+		       skd_name(skdev), skdev->irq_type);
+		break;
+	}
+}
+
+/*
+ *****************************************************************************
+ * CONSTRUCT
+ *****************************************************************************
+ */
+
+static int skd_cons_skcomp(struct skd_device *skdev);
+static int skd_cons_skmsg(struct skd_device *skdev);
+static int skd_cons_skreq(struct skd_device *skdev);
+static int skd_cons_skspcl(struct skd_device *skdev);
+static int skd_cons_sksb(struct skd_device *skdev);
+static struct fit_sg_descriptor *skd_cons_sg_list(struct skd_device *skdev,
+						  u32 n_sg,
+						  dma_addr_t *ret_dma_addr);
+static int skd_cons_disk(struct skd_device *skdev);
+
+#define SKD_N_DEV_TABLE         16u
+static u32 skd_next_devno;
+
+static struct skd_device *skd_construct(struct pci_dev *pdev)
+{
+	struct skd_device *skdev;
+	int blk_major = skd_major;
+	int rc;
+
+	skdev = kzalloc(sizeof(*skdev), GFP_KERNEL);
+
+	if (!skdev) {
+		pr_err(PFX "(%s): memory alloc failure\n",
+		       pci_name(pdev));
+		return NULL;
+	}
+
+	skdev->state = SKD_DRVR_STATE_LOAD;
+	skdev->pdev = pdev;
+	skdev->devno = skd_next_devno++;
+	skdev->major = blk_major;
+	skdev->irq_type = skd_isr_type;
+	sprintf(skdev->name, DRV_NAME "%d", skdev->devno);
+	skdev->dev_max_queue_depth = 0;
+
+	skdev->num_req_context = skd_max_queue_depth;
+	skdev->num_fitmsg_context = skd_max_queue_depth;
+	skdev->n_special = skd_max_pass_thru;
+	skdev->cur_max_queue_depth = 1;
+	skdev->queue_low_water_mark = 1;
+	skdev->proto_ver = 99;
+	skdev->sgs_per_request = skd_sgs_per_request;
+	skdev->dbg_level = skd_dbg_level;
+
+	if (skd_bio)
+		bio_list_init(&skdev->bio_queue);
+
+
+	atomic_set(&skdev->device_count, 0);
+
+	spin_lock_init(&skdev->lock);
+
+	INIT_WORK(&skdev->completion_worker, skd_completion_worker);
+	INIT_LIST_HEAD(&skdev->flush_list);
+
+	VPRINTK(skdev, "skcomp\n");
+	rc = skd_cons_skcomp(skdev);
+	if (rc < 0)
+		goto err_out;
+
+	VPRINTK(skdev, "skmsg\n");
+	rc = skd_cons_skmsg(skdev);
+	if (rc < 0)
+		goto err_out;
+
+	VPRINTK(skdev, "skreq\n");
+	rc = skd_cons_skreq(skdev);
+	if (rc < 0)
+		goto err_out;
+
+	VPRINTK(skdev, "skspcl\n");
+	rc = skd_cons_skspcl(skdev);
+	if (rc < 0)
+		goto err_out;
+
+	VPRINTK(skdev, "sksb\n");
+	rc = skd_cons_sksb(skdev);
+	if (rc < 0)
+		goto err_out;
+
+	VPRINTK(skdev, "disk\n");
+	rc = skd_cons_disk(skdev);
+	if (rc < 0)
+		goto err_out;
+
+
+
+	DPRINTK(skdev, "VICTORY\n");
+	return skdev;
+
+err_out:
+	DPRINTK(skdev, "construct failed\n");
+	skd_destruct(skdev);
+	return NULL;
+}
+
+static int skd_cons_skcomp(struct skd_device *skdev)
+{
+	int rc = 0;
+	struct fit_completion_entry_v1 *skcomp;
+	u32 nbytes;
+
+	nbytes = sizeof(*skcomp) * SKD_N_COMPLETION_ENTRY;
+	nbytes += sizeof(struct fit_comp_error_info) * SKD_N_COMPLETION_ENTRY;
+
+	VPRINTK(skdev, "comp pci_alloc, total bytes %d entries %d\n", nbytes,
+		SKD_N_COMPLETION_ENTRY);
+
+	skcomp = pci_alloc_consistent(skdev->pdev, nbytes,
+				      &skdev->cq_dma_address);
+
+	if (skcomp == NULL) {
+		rc = -ENOMEM;
+		goto err_out;
+	}
+
+	memset(skcomp, 0, nbytes);
+
+	skdev->skcomp_table = skcomp;
+	skdev->skerr_table = (struct fit_comp_error_info *)((char *)skcomp +
+							   sizeof(*skcomp) *
+							   SKD_N_COMPLETION_ENTRY);
+
+err_out:
+	return rc;
+}
+
+static int skd_cons_skmsg(struct skd_device *skdev)
+{
+	int rc = 0;
+	u32 i;
+
+	VPRINTK(skdev, "skmsg_table kzalloc, struct %u, count %u total %lu\n",
+		sizeof(struct skd_fitmsg_context),
+		skdev->num_fitmsg_context,
+		(unsigned long) sizeof(struct skd_fitmsg_context) *
+					skdev->num_fitmsg_context);
+
+	skdev->skmsg_table = kzalloc(sizeof(struct skd_fitmsg_context)
+				     *skdev->num_fitmsg_context, GFP_KERNEL);
+	if (skdev->skmsg_table == NULL) {
+		rc = -ENOMEM;
+		goto err_out;
+	}
+
+	for (i = 0; i < skdev->num_fitmsg_context; i++) {
+		struct skd_fitmsg_context *skmsg;
+
+		skmsg = &skdev->skmsg_table[i];
+
+		skmsg->id = i + SKD_ID_FIT_MSG;
+
+		skmsg->state = SKD_MSG_STATE_IDLE;
+		skmsg->msg_buf = pci_alloc_consistent(skdev->pdev,
+						      SKD_N_FITMSG_BYTES + 64,
+						      &skmsg->mb_dma_address);
+
+		if (skmsg->msg_buf == NULL) {
+			rc = -ENOMEM;
+			goto err_out;
+		}
+
+		skmsg->offset = (u32)((u64)skmsg->msg_buf &
+				      (~FIT_QCMD_BASE_ADDRESS_MASK));
+		skmsg->msg_buf += ~FIT_QCMD_BASE_ADDRESS_MASK;
+		skmsg->msg_buf = (u8 *)((u64)skmsg->msg_buf &
+				       FIT_QCMD_BASE_ADDRESS_MASK);
+		skmsg->mb_dma_address += ~FIT_QCMD_BASE_ADDRESS_MASK;
+		skmsg->mb_dma_address &= FIT_QCMD_BASE_ADDRESS_MASK;
+		memset(skmsg->msg_buf, 0, SKD_N_FITMSG_BYTES);
+
+		skmsg->next = &skmsg[1];
+	}
+
+	/* Free list is in order starting with the 0th entry. */
+	skdev->skmsg_table[i - 1].next = NULL;
+	skdev->skmsg_free_list = skdev->skmsg_table;
+
+err_out:
+	return rc;
+}
+
+static int skd_cons_skreq(struct skd_device *skdev)
+{
+	int rc = 0;
+	u32 i;
+
+	VPRINTK(skdev, "skreq_table kzalloc, struct %u, count %u total %u\n",
+		sizeof(struct skd_request_context),
+		skdev->num_req_context,
+		sizeof(struct skd_request_context) * skdev->num_req_context);
+
+	skdev->skreq_table = kzalloc(sizeof(struct skd_request_context)
+				     * skdev->num_req_context, GFP_KERNEL);
+	if (skdev->skreq_table == NULL) {
+		rc = -ENOMEM;
+		goto err_out;
+	}
+
+	VPRINTK(skdev, "alloc sg_table sg_per_req %u scatlist %u total %u\n",
+		skdev->sgs_per_request, sizeof(struct scatterlist),
+		skdev->sgs_per_request * sizeof(struct scatterlist));
+
+	for (i = 0; i < skdev->num_req_context; i++) {
+		struct skd_request_context *skreq;
+
+		skreq = &skdev->skreq_table[i];
+
+		skreq->id = i + SKD_ID_RW_REQUEST;
+		skreq->state = SKD_REQ_STATE_IDLE;
+
+		skreq->sg = kzalloc(sizeof(struct scatterlist) *
+				    skdev->sgs_per_request, GFP_KERNEL);
+		if (skreq->sg == NULL) {
+			rc = -ENOMEM;
+			goto err_out;
+		}
+		sg_init_table(skreq->sg, skdev->sgs_per_request);
+
+		skreq->sksg_list = skd_cons_sg_list(skdev,
+						    skdev->sgs_per_request,
+						    &skreq->sksg_dma_address);
+
+		if (skreq->sksg_list == NULL) {
+			rc = -ENOMEM;
+			goto err_out;
+		}
+
+		skreq->next = &skreq[1];
+	}
+
+	/* Free list is in order starting with the 0th entry. */
+	skdev->skreq_table[i - 1].next = NULL;
+	skdev->skreq_free_list = skdev->skreq_table;
+
+err_out:
+	return rc;
+}
+
+static int skd_cons_skspcl(struct skd_device *skdev)
+{
+	int rc = 0;
+	u32 i, nbytes;
+
+	VPRINTK(skdev, "skspcl_table kzalloc, struct %u, count %u total %u\n",
+		sizeof(struct skd_special_context),
+		skdev->n_special,
+		sizeof(struct skd_special_context) * skdev->n_special);
+
+	skdev->skspcl_table = kzalloc(sizeof(struct skd_special_context)
+				      * skdev->n_special, GFP_KERNEL);
+	if (skdev->skspcl_table == NULL) {
+		rc = -ENOMEM;
+		goto err_out;
+	}
+
+	for (i = 0; i < skdev->n_special; i++) {
+		struct skd_special_context *skspcl;
+
+		skspcl = &skdev->skspcl_table[i];
+
+		skspcl->req.id = i + SKD_ID_SPECIAL_REQUEST;
+		skspcl->req.state = SKD_REQ_STATE_IDLE;
+
+		skspcl->req.next = &skspcl[1].req;
+
+		nbytes = SKD_N_SPECIAL_FITMSG_BYTES;
+
+		skspcl->msg_buf = pci_alloc_consistent(skdev->pdev, nbytes,
+						       &skspcl->mb_dma_address);
+		if (skspcl->msg_buf == NULL) {
+			rc = -ENOMEM;
+			goto err_out;
+		}
+
+		memset(skspcl->msg_buf, 0, nbytes);
+
+		skspcl->req.sg = kzalloc(sizeof(struct scatterlist) *
+					 SKD_N_SG_PER_SPECIAL, GFP_KERNEL);
+		if (skspcl->req.sg == NULL) {
+			rc = -ENOMEM;
+			goto err_out;
+		}
+
+		skspcl->req.sksg_list = skd_cons_sg_list(skdev,
+							 SKD_N_SG_PER_SPECIAL,
+							 &skspcl->req.
+							 sksg_dma_address);
+		if (skspcl->req.sksg_list == NULL) {
+			rc = -ENOMEM;
+			goto err_out;
+		}
+	}
+
+	/* Free list is in order starting with the 0th entry. */
+	skdev->skspcl_table[i - 1].req.next = NULL;
+	skdev->skspcl_free_list = skdev->skspcl_table;
+
+	return rc;
+
+err_out:
+	return rc;
+}
+
+static int skd_cons_sksb(struct skd_device *skdev)
+{
+	int rc = 0;
+	struct skd_special_context *skspcl;
+	u32 nbytes;
+
+	skspcl = &skdev->internal_skspcl;
+
+	skspcl->req.id = 0 + SKD_ID_INTERNAL;
+	skspcl->req.state = SKD_REQ_STATE_IDLE;
+
+	nbytes = SKD_N_INTERNAL_BYTES;
+
+	skspcl->data_buf = pci_alloc_consistent(skdev->pdev, nbytes,
+						&skspcl->db_dma_address);
+	if (skspcl->data_buf == NULL) {
+		rc = -ENOMEM;
+		goto err_out;
+	}
+
+	memset(skspcl->data_buf, 0, nbytes);
+
+	nbytes = SKD_N_SPECIAL_FITMSG_BYTES;
+	skspcl->msg_buf = pci_alloc_consistent(skdev->pdev, nbytes,
+					       &skspcl->mb_dma_address);
+	if (skspcl->msg_buf == NULL) {
+		rc = -ENOMEM;
+		goto err_out;
+	}
+
+	memset(skspcl->msg_buf, 0, nbytes);
+
+	skspcl->req.sksg_list = skd_cons_sg_list(skdev, 1,
+						 &skspcl->req.sksg_dma_address);
+	if (skspcl->req.sksg_list == NULL) {
+		rc = -ENOMEM;
+		goto err_out;
+	}
+
+	if (!skd_format_internal_skspcl(skdev)) {
+		rc = -EINVAL;
+		goto err_out;
+	}
+
+err_out:
+	return rc;
+}
+
+static struct fit_sg_descriptor *skd_cons_sg_list(struct skd_device *skdev,
+						  u32 n_sg,
+						  dma_addr_t *ret_dma_addr)
+{
+	struct fit_sg_descriptor *sg_list;
+	u32 nbytes;
+
+	nbytes = sizeof(*sg_list) * n_sg;
+
+	sg_list = pci_alloc_consistent(skdev->pdev, nbytes, ret_dma_addr);
+
+	if (sg_list != NULL) {
+		uint64_t dma_address = *ret_dma_addr;
+		u32 i;
+
+		memset(sg_list, 0, nbytes);
+
+		for (i = 0; i < n_sg - 1; i++) {
+			uint64_t ndp_off;
+			ndp_off = (i + 1) * sizeof(struct fit_sg_descriptor);
+
+			sg_list[i].next_desc_ptr = dma_address + ndp_off;
+		}
+		sg_list[i].next_desc_ptr = 0LL;
+	}
+
+	return sg_list;
+}
+
+static int skd_cons_disk(struct skd_device *skdev)
+{
+	int rc = 0;
+	struct gendisk *disk;
+	struct request_queue *q;
+	unsigned long flags;
+
+	disk = alloc_disk(SKD_MINORS_PER_DEVICE);
+	if (!disk) {
+		rc = -ENOMEM;
+		goto err_out;
+	}
+
+	skdev->disk = disk;
+	sprintf(disk->disk_name, DRV_NAME "%u", skdev->devno);
+
+	disk->major = skdev->major;
+	disk->first_minor = skdev->devno * SKD_MINORS_PER_DEVICE;
+	disk->fops = &skd_blockdev_ops;
+	disk->private_data = skdev;
+
+	if (!skd_bio) {
+		q = blk_init_queue(skd_request_fn, &skdev->lock);
+	} else {
+		q = blk_alloc_queue(GFP_KERNEL);
+		q->queue_flags = QUEUE_FLAG_IO_STAT | QUEUE_FLAG_STACKABLE;
+	}
+
+	if (!q) {
+		rc = -ENOMEM;
+		goto err_out;
+	}
+
+	skdev->queue = q;
+	disk->queue = q;
+	q->queuedata = skdev;
+
+	if (skd_bio) {
+		q->queue_lock = &skdev->lock;
+		blk_queue_make_request(q, skd_make_request);
+	}
+
+	blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
+	blk_queue_max_segments(q, skdev->sgs_per_request);
+	blk_queue_max_hw_sectors(q, SKD_N_MAX_SECTORS);
+
+	/* set sysfs ptimal_io_size to 8K */
+	blk_queue_io_opt(q, 8192);
+
+	/* DISCARD Flag initialization. */
+	q->limits.discard_granularity = 8192;
+	q->limits.discard_alignment = 0;
+	q->limits.max_discard_sectors = UINT_MAX >> 9;
+	q->limits.discard_zeroes_data = 1;
+	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
+
+	spin_lock_irqsave(&skdev->lock, flags);
+	VPRINTK(skdev, "stopping %s queue\n", skdev->name);
+	skd_stop_queue(skdev);
+	spin_unlock_irqrestore(&skdev->lock, flags);
+
+err_out:
+	return rc;
+}
+
+/*
+ *****************************************************************************
+ * DESTRUCT (FREE)
+ *****************************************************************************
+ */
+
+static void skd_free_skcomp(struct skd_device *skdev);
+static void skd_free_skmsg(struct skd_device *skdev);
+static void skd_free_skreq(struct skd_device *skdev);
+static void skd_free_skspcl(struct skd_device *skdev);
+static void skd_free_sksb(struct skd_device *skdev);
+static void skd_free_sg_list(struct skd_device *skdev,
+			     struct fit_sg_descriptor *sg_list,
+			     u32 n_sg, dma_addr_t dma_addr);
+static void skd_free_disk(struct skd_device *skdev);
+
+static void skd_destruct(struct skd_device *skdev)
+{
+	if (skdev == NULL)
+		return;
+
+
+	VPRINTK(skdev, "disk\n");
+	skd_free_disk(skdev);
+
+	VPRINTK(skdev, "sksb\n");
+	skd_free_sksb(skdev);
+
+	VPRINTK(skdev, "skspcl\n");
+	skd_free_skspcl(skdev);
+
+	VPRINTK(skdev, "skreq\n");
+	skd_free_skreq(skdev);
+
+	VPRINTK(skdev, "skmsg\n");
+	skd_free_skmsg(skdev);
+
+	VPRINTK(skdev, "skcomp\n");
+	skd_free_skcomp(skdev);
+
+	VPRINTK(skdev, "skdev\n");
+	kfree(skdev);
+}
+
+static void skd_free_skcomp(struct skd_device *skdev)
+{
+	if (skdev->skcomp_table != NULL) {
+		u32 nbytes;
+
+		nbytes = sizeof(skdev->skcomp_table[0]) *
+			 SKD_N_COMPLETION_ENTRY;
+		pci_free_consistent(skdev->pdev, nbytes,
+				    skdev->skcomp_table, skdev->cq_dma_address);
+	}
+
+	skdev->skcomp_table = NULL;
+	skdev->cq_dma_address = 0;
+}
+
+static void skd_free_skmsg(struct skd_device *skdev)
+{
+	u32 i;
+
+	if (skdev->skmsg_table == NULL)
+		return;
+
+	for (i = 0; i < skdev->num_fitmsg_context; i++) {
+		struct skd_fitmsg_context *skmsg;
+
+		skmsg = &skdev->skmsg_table[i];
+
+		if (skmsg->msg_buf != NULL) {
+			skmsg->msg_buf += skmsg->offset;
+			skmsg->mb_dma_address += skmsg->offset;
+			pci_free_consistent(skdev->pdev, SKD_N_FITMSG_BYTES,
+					    skmsg->msg_buf,
+					    skmsg->mb_dma_address);
+		}
+		skmsg->msg_buf = NULL;
+		skmsg->mb_dma_address = 0;
+	}
+
+	kfree(skdev->skmsg_table);
+	skdev->skmsg_table = NULL;
+}
+
+static void skd_free_skreq(struct skd_device *skdev)
+{
+	u32 i;
+
+	if (skdev->skreq_table == NULL)
+		return;
+
+	for (i = 0; i < skdev->num_req_context; i++) {
+		struct skd_request_context *skreq;
+
+		skreq = &skdev->skreq_table[i];
+
+		skd_free_sg_list(skdev, skreq->sksg_list,
+				 skdev->sgs_per_request,
+				 skreq->sksg_dma_address);
+
+		skreq->sksg_list = NULL;
+		skreq->sksg_dma_address = 0;
+
+		kfree(skreq->sg);
+	}
+
+	kfree(skdev->skreq_table);
+	skdev->skreq_table = NULL;
+}
+
+static void skd_free_skspcl(struct skd_device *skdev)
+{
+	u32 i;
+	u32 nbytes;
+
+	if (skdev->skspcl_table == NULL)
+		return;
+
+	for (i = 0; i < skdev->n_special; i++) {
+		struct skd_special_context *skspcl;
+
+		skspcl = &skdev->skspcl_table[i];
+
+		if (skspcl->msg_buf != NULL) {
+			nbytes = SKD_N_SPECIAL_FITMSG_BYTES;
+			pci_free_consistent(skdev->pdev, nbytes,
+					    skspcl->msg_buf,
+					    skspcl->mb_dma_address);
+		}
+
+		skspcl->msg_buf = NULL;
+		skspcl->mb_dma_address = 0;
+
+		skd_free_sg_list(skdev, skspcl->req.sksg_list,
+				 SKD_N_SG_PER_SPECIAL,
+				 skspcl->req.sksg_dma_address);
+
+		skspcl->req.sksg_list = NULL;
+		skspcl->req.sksg_dma_address = 0;
+
+		kfree(skspcl->req.sg);
+	}
+
+	kfree(skdev->skspcl_table);
+	skdev->skspcl_table = NULL;
+}
+
+static void skd_free_sksb(struct skd_device *skdev)
+{
+	struct skd_special_context *skspcl;
+	u32 nbytes;
+
+	skspcl = &skdev->internal_skspcl;
+
+	if (skspcl->data_buf != NULL) {
+		nbytes = SKD_N_INTERNAL_BYTES;
+
+		pci_free_consistent(skdev->pdev, nbytes,
+				    skspcl->data_buf, skspcl->db_dma_address);
+	}
+
+	skspcl->data_buf = NULL;
+	skspcl->db_dma_address = 0;
+
+	if (skspcl->msg_buf != NULL) {
+		nbytes = SKD_N_SPECIAL_FITMSG_BYTES;
+		pci_free_consistent(skdev->pdev, nbytes,
+				    skspcl->msg_buf, skspcl->mb_dma_address);
+	}
+
+	skspcl->msg_buf = NULL;
+	skspcl->mb_dma_address = 0;
+
+	skd_free_sg_list(skdev, skspcl->req.sksg_list, 1,
+			 skspcl->req.sksg_dma_address);
+
+	skspcl->req.sksg_list = NULL;
+	skspcl->req.sksg_dma_address = 0;
+}
+
+static void skd_free_sg_list(struct skd_device *skdev,
+			     struct fit_sg_descriptor *sg_list,
+			     u32 n_sg, dma_addr_t dma_addr)
+{
+	if (sg_list != NULL) {
+		u32 nbytes;
+
+		nbytes = sizeof(*sg_list) * n_sg;
+
+		pci_free_consistent(skdev->pdev, nbytes, sg_list, dma_addr);
+	}
+}
+
+static void skd_free_disk(struct skd_device *skdev)
+{
+	struct gendisk *disk = skdev->disk;
+
+	if (disk != NULL) {
+		struct request_queue *q = disk->queue;
+
+		if (disk->flags & GENHD_FL_UP)
+			del_gendisk(disk);
+		if (q)
+			blk_cleanup_queue(q);
+		put_disk(disk);
+	}
+	skdev->disk = NULL;
+}
+
+
+
+/*
+ *****************************************************************************
+ * BLOCK DEVICE (BDEV) GLUE
+ *****************************************************************************
+ */
+
+static int skd_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct skd_device *skdev;
+	u64 capacity;
+
+	skdev = bdev->bd_disk->private_data;
+
+	DPRINTK(skdev, "%s: CMD[%s] getgeo device\n",
+		bdev->bd_disk->disk_name, current->comm);
+
+	if (skdev->read_cap_is_valid) {
+		capacity = get_capacity(skdev->disk);
+		geo->heads = 64;
+		geo->sectors = 255;
+		geo->cylinders = (capacity) / (255 * 64);
+
+		return 0;
+	}
+	return -EIO;
+}
+
+static int skd_bdev_attach(struct skd_device *skdev)
+{
+	DPRINTK(skdev, "add_disk\n");
+	add_disk(skdev->disk);
+	return 0;
+}
+
+static const struct block_device_operations skd_blockdev_ops = {
+	.owner		= THIS_MODULE,
+	.ioctl		= skd_bdev_ioctl,
+	.getgeo		= skd_bdev_getgeo,
+};
+
+
+/*
+ *****************************************************************************
+ * PCIe DRIVER GLUE
+ *****************************************************************************
+ */
+
+static DEFINE_PCI_DEVICE_TABLE(skd_pci_tbl) = {
+	{ PCI_VENDOR_ID_STEC, PCI_DEVICE_ID_S1120,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, },
+	{ 0 }                     /* terminate list */
+};
+
+MODULE_DEVICE_TABLE(pci, skd_pci_tbl);
+
+static char *skd_pci_info(struct skd_device *skdev, char *str)
+{
+	int pcie_reg;
+
+	strcpy(str, "PCIe (");
+	pcie_reg = pci_find_capability(skdev->pdev, PCI_CAP_ID_EXP);
+
+	if (pcie_reg) {
+
+		char lwstr[6];
+		uint16_t pcie_lstat, lspeed, lwidth;
+
+		pcie_reg += 0x12;
+		pci_read_config_word(skdev->pdev, pcie_reg, &pcie_lstat);
+		lspeed = pcie_lstat & (0xF);
+		lwidth = (pcie_lstat & 0x3F0) >> 4;
+
+		if (lspeed == 1)
+			strcat(str, "2.5GT/s ");
+		else if (lspeed == 2)
+			strcat(str, "5.0GT/s ");
+		else
+			strcat(str, "<unknown> ");
+		snprintf(lwstr, sizeof(lwstr), "%dX)", lwidth);
+		strcat(str, lwstr);
+	}
+	return str;
+}
+
+static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	int i;
+	int rc = 0;
+	char pci_str[32];
+	struct skd_device *skdev;
+
+	pr_info("STEC s1120 Driver(%s) version %s-b%s\n",
+	       DRV_NAME, DRV_VERSION, DRV_BUILD_ID);
+	pr_info("(skd?:??:[%s]): vendor=%04X device=%04x\n",
+	       pci_name(pdev), pdev->vendor, pdev->device);
+
+	rc = pci_enable_device(pdev);
+	if (rc)
+		return rc;
+	rc = pci_request_regions(pdev, DRV_NAME);
+	if (rc)
+		goto err_out;
+	rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (!rc) {
+		if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) {
+
+			pr_err("(%s): consistent DMA mask error %d\n",
+			       pci_name(pdev), rc);
+		}
+	} else {
+		(rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)));
+		if (rc) {
+
+			pr_err("(%s): DMA mask error %d\n",
+			       pci_name(pdev), rc);
+			goto err_out_regions;
+		}
+	}
+
+	skdev = skd_construct(pdev);
+	if (skdev == NULL)
+		goto err_out_regions;
+
+	skd_pci_info(skdev, pci_str);
+	pr_info("(%s): %s 64bit\n", skd_name(skdev), pci_str);
+
+	pci_set_master(pdev);
+	rc = pci_enable_pcie_error_reporting(pdev);
+	if (rc) {
+		pr_err(
+		       "(%s): bad enable of PCIe error reporting rc=%d\n",
+		       skd_name(skdev), rc);
+		skdev->pcie_error_reporting_is_enabled = 0;
+	} else
+		skdev->pcie_error_reporting_is_enabled = 1;
+
+
+	pci_set_drvdata(pdev, skdev);
+	skdev->pdev = pdev;
+	skdev->disk->driverfs_dev = &pdev->dev;
+
+	for (i = 0; i < SKD_MAX_BARS; i++) {
+		skdev->mem_phys[i] = pci_resource_start(pdev, i);
+		skdev->mem_size[i] = (u32)pci_resource_len(pdev, i);
+		skdev->mem_map[i] = ioremap(skdev->mem_phys[i],
+					    skdev->mem_size[i]);
+		if (!skdev->mem_map[i]) {
+			pr_err("(%s): Unable to map adapter memory!\n",
+			       skd_name(skdev));
+			rc = -ENODEV;
+			goto err_out_iounmap;
+		}
+		DPRINTK(skdev, "mem_map=%p, phyd=%016llx, size=%d\n",
+			skdev->mem_map[i],
+			(uint64_t)skdev->mem_phys[i], skdev->mem_size[i]);
+	}
+
+	rc = skd_acquire_irq(skdev);
+	if (rc) {
+		pr_err("(%s): interrupt resource error %d\n",
+		       skd_name(skdev), rc);
+		goto err_out_iounmap;
+	}
+
+	rc = skd_start_timer(skdev);
+	if (rc)
+		goto err_out_timer;
+
+	init_waitqueue_head(&skdev->waitq);
+
+	skd_start_device(skdev);
+
+	rc = wait_event_interruptible_timeout(skdev->waitq,
+					      (skdev->gendisk_on),
+					      (SKD_START_WAIT_SECONDS * HZ));
+	if (skdev->gendisk_on > 0) {
+		/* device came on-line after reset */
+		skd_bdev_attach(skdev);
+		rc = 0;
+	} else {
+		/* we timed out, something is wrong with the device,
+		   don't add the disk structure */
+		pr_err(
+		       "(%s): error: waiting for s1120 timed out %d!\n",
+		       skd_name(skdev), rc);
+		/* in case of no error; we timeout with ENXIO */
+		if (!rc)
+			rc = -ENXIO;
+		goto err_out_timer;
+	}
+
+
+#ifdef SKD_VMK_POLL_HANDLER
+	if (skdev->irq_type == SKD_IRQ_MSIX) {
+		/* MSIX completion handler is being used for coredump */
+		vmklnx_scsi_register_poll_handler(skdev->scsi_host,
+						  skdev->msix_entries[5].vector,
+						  skd_comp_q, skdev);
+	} else {
+		vmklnx_scsi_register_poll_handler(skdev->scsi_host,
+						  skdev->pdev->irq, skd_isr,
+						  skdev);
+	}
+#endif  /* SKD_VMK_POLL_HANDLER */
+
+	return rc;
+
+err_out_timer:
+	skd_stop_device(skdev);
+	skd_release_irq(skdev);
+
+err_out_iounmap:
+	for (i = 0; i < SKD_MAX_BARS; i++)
+		if (skdev->mem_map[i])
+			iounmap(skdev->mem_map[i]);
+
+	if (skdev->pcie_error_reporting_is_enabled)
+		pci_disable_pcie_error_reporting(pdev);
+
+	skd_destruct(skdev);
+
+err_out_regions:
+	pci_release_regions(pdev);
+
+err_out:
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+	return rc;
+}
+
+static void skd_pci_remove(struct pci_dev *pdev)
+{
+	int i;
+	struct skd_device *skdev;
+
+	skdev = pci_get_drvdata(pdev);
+	if (!skdev) {
+		pr_err("%s: no device data for PCI\n", pci_name(pdev));
+		return;
+	}
+	skd_stop_device(skdev);
+	skd_release_irq(skdev);
+
+	for (i = 0; i < SKD_MAX_BARS; i++)
+		if (skdev->mem_map[i])
+			iounmap((u32 *)skdev->mem_map[i]);
+
+	if (skdev->pcie_error_reporting_is_enabled)
+		pci_disable_pcie_error_reporting(pdev);
+
+	skd_destruct(skdev);
+
+	pci_release_regions(pdev);
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+
+	return;
+}
+
+static int skd_pci_suspend(struct pci_dev *pdev, pm_message_t state)
+{
+	int i;
+	struct skd_device *skdev;
+
+	skdev = pci_get_drvdata(pdev);
+	if (!skdev) {
+		pr_err("%s: no device data for PCI\n", pci_name(pdev));
+		return -EIO;
+	}
+
+	skd_stop_device(skdev);
+
+	skd_release_irq(skdev);
+
+	for (i = 0; i < SKD_MAX_BARS; i++)
+		if (skdev->mem_map[i])
+			iounmap((u32 *)skdev->mem_map[i]);
+
+	if (skdev->pcie_error_reporting_is_enabled)
+		pci_disable_pcie_error_reporting(pdev);
+
+	pci_release_regions(pdev);
+	pci_save_state(pdev);
+	pci_disable_device(pdev);
+	pci_set_power_state(pdev, pci_choose_state(pdev, state));
+	return 0;
+}
+
+static int skd_pci_resume(struct pci_dev *pdev)
+{
+	int i;
+	int rc = 0;
+	struct skd_device *skdev;
+
+	skdev = pci_get_drvdata(pdev);
+	if (!skdev) {
+		pr_err("%s: no device data for PCI\n", pci_name(pdev));
+		return -1;
+	}
+
+	pci_set_power_state(pdev, PCI_D0);
+	pci_enable_wake(pdev, PCI_D0, 0);
+	pci_restore_state(pdev);
+
+	rc = pci_enable_device(pdev);
+	if (rc)
+		return rc;
+	rc = pci_request_regions(pdev, DRV_NAME);
+	if (rc)
+		goto err_out;
+	rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (!rc) {
+		if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) {
+
+			pr_err("(%s): consistent DMA mask error %d\n",
+			       pci_name(pdev), rc);
+		}
+	} else {
+		rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (rc) {
+
+			pr_err("(%s): DMA mask error %d\n",
+			       pci_name(pdev), rc);
+			goto err_out_regions;
+		}
+	}
+
+	pci_set_master(pdev);
+	rc = pci_enable_pcie_error_reporting(pdev);
+	if (rc) {
+		pr_err("(%s): bad enable of PCIe error reporting rc=%d\n",
+		       skdev->name, rc);
+		skdev->pcie_error_reporting_is_enabled = 0;
+	} else
+		skdev->pcie_error_reporting_is_enabled = 1;
+
+	for (i = 0; i < SKD_MAX_BARS; i++) {
+
+		skdev->mem_phys[i] = pci_resource_start(pdev, i);
+		skdev->mem_size[i] = (u32)pci_resource_len(pdev, i);
+		skdev->mem_map[i] = ioremap(skdev->mem_phys[i],
+					    skdev->mem_size[i]);
+		if (!skdev->mem_map[i]) {
+			pr_err("(%s): Unable to map adapter memory!\n",
+			       skd_name(skdev));
+			rc = -ENODEV;
+			goto err_out_iounmap;
+		}
+		DPRINTK(skdev, "mem_map=%p, phyd=%016llx, size=%d\n",
+			skdev->mem_map[i],
+			(uint64_t)skdev->mem_phys[i], skdev->mem_size[i]);
+	}
+	rc = skd_acquire_irq(skdev);
+	if (rc) {
+
+		pr_err("(%s): interrupt resource error %d\n",
+		       pci_name(pdev), rc);
+		goto err_out_iounmap;
+	}
+
+	rc = skd_start_timer(skdev);
+	if (rc)
+		goto err_out_timer;
+
+	init_waitqueue_head(&skdev->waitq);
+
+	skd_start_device(skdev);
+
+	return rc;
+
+err_out_timer:
+	skd_stop_device(skdev);
+	skd_release_irq(skdev);
+
+err_out_iounmap:
+	for (i = 0; i < SKD_MAX_BARS; i++)
+		if (skdev->mem_map[i])
+			iounmap(skdev->mem_map[i]);
+
+	if (skdev->pcie_error_reporting_is_enabled)
+		pci_disable_pcie_error_reporting(pdev);
+
+err_out_regions:
+	pci_release_regions(pdev);
+
+err_out:
+	pci_disable_device(pdev);
+	return rc;
+}
+
+static void skd_pci_shutdown(struct pci_dev *pdev)
+{
+	struct skd_device *skdev;
+
+	pr_err("skd_pci_shutdown called\n");
+
+	skdev = pci_get_drvdata(pdev);
+	if (!skdev) {
+		pr_err("%s: no device data for PCI\n", pci_name(pdev));
+		return;
+	}
+
+	pr_err("%s: calling stop\n", skd_name(skdev));
+	skd_stop_device(skdev);
+}
+
+static struct pci_driver skd_driver = {
+	.name		= DRV_NAME,
+	.id_table	= skd_pci_tbl,
+	.probe		= skd_pci_probe,
+	.remove		= skd_pci_remove,
+	.suspend	= skd_pci_suspend,
+	.resume		= skd_pci_resume,
+	.shutdown	= skd_pci_shutdown,
+};
+
+/*
+ *****************************************************************************
+ * LOGGING SUPPORT
+ *****************************************************************************
+ */
+
+static const char *skd_name(struct skd_device *skdev)
+{
+	memset(skdev->id_str, 0, sizeof(skdev->id_str));
+
+	if (skdev->inquiry_is_valid)
+		snprintf(skdev->id_str, sizeof(skdev->id_str), "%s:%s:[%s]",
+			 skdev->name, skdev->inq_serial_num,
+			 pci_name(skdev->pdev));
+	else
+		snprintf(skdev->id_str, sizeof(skdev->id_str), "%s:??:[%s]",
+			 skdev->name, pci_name(skdev->pdev));
+
+	return skdev->id_str;
+}
+
+const char *skd_drive_state_to_str(int state)
+{
+	switch (state) {
+	case FIT_SR_DRIVE_OFFLINE:
+		return "OFFLINE";
+	case FIT_SR_DRIVE_INIT:
+		return "INIT";
+	case FIT_SR_DRIVE_ONLINE:
+		return "ONLINE";
+	case FIT_SR_DRIVE_BUSY:
+		return "BUSY";
+	case FIT_SR_DRIVE_FAULT:
+		return "FAULT";
+	case FIT_SR_DRIVE_DEGRADED:
+		return "DEGRADED";
+	case FIT_SR_PCIE_LINK_DOWN:
+		return "INK_DOWN";
+	case FIT_SR_DRIVE_SOFT_RESET:
+		return "SOFT_RESET";
+	case FIT_SR_DRIVE_NEED_FW_DOWNLOAD:
+		return "NEED_FW";
+	case FIT_SR_DRIVE_INIT_FAULT:
+		return "INIT_FAULT";
+	case FIT_SR_DRIVE_BUSY_SANITIZE:
+		return "BUSY_SANITIZE";
+	case FIT_SR_DRIVE_BUSY_ERASE:
+		return "BUSY_ERASE";
+	case FIT_SR_DRIVE_FW_BOOTING:
+		return "FW_BOOTING";
+	default:
+		return "???";
+	}
+}
+
+const char *skd_skdev_state_to_str(enum skd_drvr_state state)
+{
+	switch (state) {
+	case SKD_DRVR_STATE_LOAD:
+		return "LOAD";
+	case SKD_DRVR_STATE_IDLE:
+		return "IDLE";
+	case SKD_DRVR_STATE_BUSY:
+		return "BUSY";
+	case SKD_DRVR_STATE_STARTING:
+		return "STARTING";
+	case SKD_DRVR_STATE_ONLINE:
+		return "ONLINE";
+	case SKD_DRVR_STATE_PAUSING:
+		return "PAUSING";
+	case SKD_DRVR_STATE_PAUSED:
+		return "PAUSED";
+	case SKD_DRVR_STATE_DRAINING_TIMEOUT:
+		return "DRAINING_TIMEOUT";
+	case SKD_DRVR_STATE_RESTARTING:
+		return "RESTARTING";
+	case SKD_DRVR_STATE_RESUMING:
+		return "RESUMING";
+	case SKD_DRVR_STATE_STOPPING:
+		return "STOPPING";
+	case SKD_DRVR_STATE_SYNCING:
+		return "SYNCING";
+	case SKD_DRVR_STATE_FAULT:
+		return "FAULT";
+	case SKD_DRVR_STATE_DISAPPEARED:
+		return "DISAPPEARED";
+	case SKD_DRVR_STATE_BUSY_ERASE:
+		return "BUSY_ERASE";
+	case SKD_DRVR_STATE_BUSY_SANITIZE:
+		return "BUSY_SANITIZE";
+	case SKD_DRVR_STATE_BUSY_IMMINENT:
+		return "BUSY_IMMINENT";
+	case SKD_DRVR_STATE_WAIT_BOOT:
+		return "WAIT_BOOT";
+
+	default:
+		return "???";
+	}
+}
+
+const char *skd_skmsg_state_to_str(enum skd_fit_msg_state state)
+{
+	switch (state) {
+	case SKD_MSG_STATE_IDLE:
+		return "IDLE";
+	case SKD_MSG_STATE_BUSY:
+		return "BUSY";
+	default:
+		return "???";
+	}
+}
+
+const char *skd_skreq_state_to_str(enum skd_req_state state)
+{
+	switch (state) {
+	case SKD_REQ_STATE_IDLE:
+		return "IDLE";
+	case SKD_REQ_STATE_SETUP:
+		return "SETUP";
+	case SKD_REQ_STATE_BUSY:
+		return "BUSY";
+	case SKD_REQ_STATE_COMPLETED:
+		return "COMPLETED";
+	case SKD_REQ_STATE_TIMEOUT:
+		return "TIMEOUT";
+	case SKD_REQ_STATE_ABORTED:
+		return "ABORTED";
+	default:
+		return "???";
+	}
+}
+
+static void skd_log_skdev(struct skd_device *skdev, const char *event)
+{
+	DPRINTK(skdev, "(%s) skdev=%p event='%s'\n", skdev->name, skdev, event);
+	DPRINTK(skdev, "  drive_state=%s(%d) driver_state=%s(%d)\n",
+		skd_drive_state_to_str(skdev->drive_state), skdev->drive_state,
+		skd_skdev_state_to_str(skdev->state), skdev->state);
+	DPRINTK(skdev, "  busy=%d limit=%d dev=%d lowat=%d\n",
+		skdev->in_flight, skdev->cur_max_queue_depth,
+		skdev->dev_max_queue_depth, skdev->queue_low_water_mark);
+	DPRINTK(skdev, "  timestamp=0x%x cycle=%d cycle_ix=%d\n",
+		skdev->timeout_stamp, skdev->skcomp_cycle, skdev->skcomp_ix);
+}
+
+static void skd_log_skmsg(struct skd_device *skdev,
+			  struct skd_fitmsg_context *skmsg, const char *event)
+{
+	DPRINTK(skdev, "(%s) skmsg=%p event='%s'\n", skdev->name, skmsg, event);
+	DPRINTK(skdev, "  state=%s(%d) id=0x%04x length=%d\n",
+		skd_skmsg_state_to_str(skmsg->state), skmsg->state,
+		skmsg->id, skmsg->length);
+}
+
+static void skd_log_skreq(struct skd_device *skdev,
+			  struct skd_request_context *skreq, const char *event)
+{
+	DPRINTK(skdev, "(%s) skreq=%p event='%s'\n", skdev->name, skreq, event);
+	DPRINTK(skdev, "  state=%s(%d) id=0x%04x fitmsg=0x%04x\n",
+		skd_skreq_state_to_str(skreq->state), skreq->state,
+		skreq->id, skreq->fitmsg_id);
+	DPRINTK(skdev, "  timo=0x%x sg_dir=%d n_sg=%d\n",
+		skreq->timeout_stamp, skreq->sg_data_dir, skreq->n_sg);
+
+	if (!skd_bio) {
+		if (skreq->req != NULL) {
+			struct request *req = skreq->req;
+			u32 lba = (u32)blk_rq_pos(req);
+			u32 count = blk_rq_sectors(req);
+
+			DPRINTK(skdev,
+				"  req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n",
+				req, lba, lba, count, count,
+				(int)rq_data_dir(req));
+		} else
+			DPRINTK(skdev, "  req=NULL\n");
+	} else {
+		if (skreq->bio != NULL) {
+			struct bio *bio = skreq->bio;
+			u32 lba = (u32)bio->bi_sector;
+			u32 count = bio_sectors(bio);
+
+			DPRINTK(skdev,
+				"  bio=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n",
+				bio, lba, lba, count, count,
+				(int)bio_data_dir(bio));
+		} else
+			DPRINTK(skdev, "  req=NULL\n");
+	}
+}
+
+/*
+ *****************************************************************************
+ * MODULE GLUE
+ *****************************************************************************
+ */
+
+static int __init skd_init(void)
+{
+	int rc = 0;
+
+	pr_info(PFX " v%s-b%s loaded\n", DRV_VERSION, DRV_BUILD_ID);
+
+	switch (skd_isr_type) {
+	case SKD_IRQ_LEGACY:
+	case SKD_IRQ_MSI:
+	case SKD_IRQ_MSIX:
+		break;
+	default:
+		pr_info("skd_isr_type %d invalid, re-set to %d\n",
+		       skd_isr_type, SKD_IRQ_DEFAULT);
+		skd_isr_type = SKD_IRQ_DEFAULT;
+	}
+
+	skd_flush_slab = kmem_cache_create(SKD_FLUSH_JOB,
+						sizeof(struct skd_flush_cmd),
+						0, 0, NULL);
+
+	if (!skd_flush_slab) {
+		pr_err("failed to allocated flush slab.\n");
+		return -ENOMEM;
+	}
+
+	if (skd_max_queue_depth < 1
+	    || skd_max_queue_depth > SKD_MAX_QUEUE_DEPTH) {
+		pr_info(
+		       "skd_max_queue_depth %d invalid, re-set to %d\n",
+		       skd_max_queue_depth, SKD_MAX_QUEUE_DEPTH_DEFAULT);
+		skd_max_queue_depth = SKD_MAX_QUEUE_DEPTH_DEFAULT;
+	}
+
+	if (skd_max_req_per_msg < 1 || skd_max_req_per_msg > 14) {
+		pr_info(
+		       "skd_max_req_per_msg %d invalid, re-set to %d\n",
+		       skd_max_req_per_msg, SKD_MAX_REQ_PER_MSG_DEFAULT);
+		skd_max_req_per_msg = SKD_MAX_REQ_PER_MSG_DEFAULT;
+	}
+
+	if (skd_sgs_per_request < 1 || skd_sgs_per_request > 4096) {
+		pr_info(
+		       "skd_sg_per_request %d invalid, re-set to %d\n",
+		       skd_sgs_per_request, SKD_N_SG_PER_REQ_DEFAULT);
+		skd_sgs_per_request = SKD_N_SG_PER_REQ_DEFAULT;
+	}
+
+	if (skd_dbg_level < 0 || skd_dbg_level > 2) {
+		pr_info("skd_dbg_level %d invalid, re-set to %d\n",
+		       skd_dbg_level, 0);
+		skd_dbg_level = 0;
+	}
+
+	if (skd_isr_comp_limit < 0) {
+		pr_info("skd_isr_comp_limit %d invalid, set to %d\n",
+		       skd_isr_comp_limit, 0);
+		skd_isr_comp_limit = 0;
+	}
+
+	if (skd_max_pass_thru < 1 || skd_max_pass_thru > 50) {
+		pr_info("skd_max_pass_thru %d invalid, re-set to %d\n",
+		       skd_max_pass_thru, SKD_N_SPECIAL_CONTEXT);
+		skd_max_pass_thru = SKD_N_SPECIAL_CONTEXT;
+	}
+
+	/* Obtain major device number. */
+	rc = register_blkdev(0, DRV_NAME);
+	if (rc < 0)
+		return rc;
+
+	skd_major = rc;
+
+	return pci_register_driver(&skd_driver);
+
+}
+
+static void __exit skd_exit(void)
+{
+	pr_info(PFX " v%s-b%s unloading\n", DRV_VERSION, DRV_BUILD_ID);
+
+	unregister_blkdev(skd_major, DRV_NAME);
+	pci_unregister_driver(&skd_driver);
+
+	kmem_cache_destroy(skd_flush_slab);
+}
+
+static int
+skd_flush_cmd_enqueue(struct skd_device *skdev, void *cmd)
+{
+	struct skd_flush_cmd *item;
+
+	item = kmem_cache_zalloc(skd_flush_slab, GFP_ATOMIC);
+	if (!item) {
+		pr_err("skd_flush_cmd_enqueue: Failed to allocated item.\n");
+		return -ENOMEM;
+	}
+
+	item->cmd = cmd;
+	list_add_tail(&item->flist, &skdev->flush_list);
+	return 0;
+}
+
+static void *
+skd_flush_cmd_dequeue(struct skd_device *skdev)
+{
+	void *cmd;
+	struct skd_flush_cmd *item;
+
+	item = list_entry(skdev->flush_list.next, struct skd_flush_cmd, flist);
+	list_del_init(&item->flist);
+	cmd = item->cmd;
+	kmem_cache_free(skd_flush_slab, item);
+	return cmd;
+}
+
+module_init(skd_init);
+module_exit(skd_exit);
diff --git a/drivers/block/skd_s1120.h b/drivers/block/skd_s1120.h
new file mode 100644
index 000000000000..bf01941cdd62
--- /dev/null
+++ b/drivers/block/skd_s1120.h
@@ -0,0 +1,354 @@
+/* Copyright 2012 STEC, Inc.
+ *
+ * This file is licensed under the terms of the 3-clause
+ * BSD License (http://opensource.org/licenses/BSD-3-Clause)
+ * or the GNU GPL-2.0 (http://www.gnu.org/licenses/gpl-2.0.html),
+ * at your option. Both licenses are also available in the LICENSE file
+ * distributed with this project. This file may not be copied, modified,
+ * or distributed except in accordance with those terms.
+ */
+
+
+#ifndef SKD_S1120_H
+#define SKD_S1120_H
+
+#pragma pack(push, s1120_h, 1)
+
+/*
+ * Q-channel, 64-bit r/w
+ */
+#define FIT_Q_COMMAND               0x400u
+#define  FIT_QCMD_QID_MASK              (0x3 << 1)
+#define  FIT_QCMD_QID0                  (0x0 << 1)
+#define  FIT_QCMD_QID_NORMAL            FIT_QCMD_QID0
+#ifndef SKD_OMIT_FROM_SRC_DIST
+#define  FIT_QCMD_QID1                  (0x1 << 1)
+#define  FIT_QCMD_QID2                  (0x2 << 1)
+#define  FIT_QCMD_QID3                  (0x3 << 1)
+#endif /* SKD_OMIT_FROM_SRC_DIST */
+#define  FIT_QCMD_FLUSH_QUEUE           (0ull)      /* add QID */
+#define  FIT_QCMD_MSGSIZE_MASK          (0x3 << 4)
+#define  FIT_QCMD_MSGSIZE_64            (0x0 << 4)
+#define  FIT_QCMD_MSGSIZE_128           (0x1 << 4)
+#define  FIT_QCMD_MSGSIZE_256           (0x2 << 4)
+#define  FIT_QCMD_MSGSIZE_512           (0x3 << 4)
+#define  FIT_QCMD_BASE_ADDRESS_MASK     (0xFFFFFFFFFFFFFFC0ull)
+
+
+/*
+ * Control, 32-bit r/w
+ */
+#define FIT_CONTROL                 0x500u
+#ifndef SKD_OMIT_FROM_SRC_DIST
+#define  FIT_CR_HARD_RESET              (1u << 0u)
+#endif /* SKD_OMIT_FROM_SRC_DIST */
+#define  FIT_CR_SOFT_RESET              (1u << 1u)
+#ifndef SKD_OMIT_FROM_SRC_DIST
+#define	 FIT_CR_DIS_TIMESTAMPS		(1u << 6u)
+#endif /* SKD_OMIT_FROM_SRC_DIST */
+#define  FIT_CR_ENABLE_INTERRUPTS       (1u << 7u)
+
+/*
+ * Status, 32-bit, r/o
+ */
+#define FIT_STATUS			0x510u
+#define FIT_SR_DRIVE_STATE_MASK		0x000000FFu
+#ifndef SKD_OMIT_FROM_SRC_DIST
+#define	FIT_SR_SIGNATURE		(0xFF << 8)
+#define	FIT_SR_PIO_DMA			(1 << 16)
+#endif /* SKD_OMIT_FROM_SRC_DIST */
+#define FIT_SR_DRIVE_OFFLINE		0x00
+#define FIT_SR_DRIVE_INIT		0x01
+/* #define FIT_SR_DRIVE_READY		0x02 */
+#define FIT_SR_DRIVE_ONLINE		0x03
+#define FIT_SR_DRIVE_BUSY		0x04
+#define FIT_SR_DRIVE_FAULT		0x05
+#define FIT_SR_DRIVE_DEGRADED		0x06
+#define FIT_SR_PCIE_LINK_DOWN		0x07
+#define FIT_SR_DRIVE_SOFT_RESET		0x08
+#define FIT_SR_DRIVE_INIT_FAULT		0x09
+#define FIT_SR_DRIVE_BUSY_SANITIZE	0x0A
+#define FIT_SR_DRIVE_BUSY_ERASE		0x0B
+#define FIT_SR_DRIVE_FW_BOOTING		0x0C
+#define FIT_SR_DRIVE_NEED_FW_DOWNLOAD	0xFE
+#define FIT_SR_DEVICE_MISSING           0xFF
+#define FIT_SR__RESERVED		0xFFFFFF00u
+
+#ifndef SKD_OMIT_FROM_SRC_DIST
+/*
+ * FIT_STATUS - Status register data definition
+ */
+#define	FIT_SR_STATE_MASK		(0xFF << 0)
+#define	FIT_SR_SIGNATURE		(0xFF << 8)
+#define	FIT_SR_PIO_DMA			(1 << 16)
+#endif /* SKD_OMIT_FROM_SRC_DIST */
+
+
+/*
+ * Interrupt status, 32-bit r/w1c (w1c ==> write 1 to clear)
+ */
+#define FIT_INT_STATUS_HOST         0x520u
+#define  FIT_ISH_FW_STATE_CHANGE        (1u << 0u)
+#define  FIT_ISH_COMPLETION_POSTED      (1u << 1u)
+#define  FIT_ISH_MSG_FROM_DEV           (1u << 2u)
+#define  FIT_ISH_UNDEFINED_3            (1u << 3u)
+#define  FIT_ISH_UNDEFINED_4            (1u << 4u)
+#define  FIT_ISH_Q0_FULL                (1u << 5u)
+#define  FIT_ISH_Q1_FULL                (1u << 6u)
+#define  FIT_ISH_Q2_FULL                (1u << 7u)
+#define  FIT_ISH_Q3_FULL                (1u << 8u)
+#define  FIT_ISH_QCMD_FIFO_OVERRUN      (1u << 9u)
+#define  FIT_ISH_BAD_EXP_ROM_READ       (1u << 10u)
+
+
+#define FIT_INT_DEF_MASK	\
+		(FIT_ISH_FW_STATE_CHANGE | \
+		 FIT_ISH_COMPLETION_POSTED | \
+		 FIT_ISH_MSG_FROM_DEV | \
+		 FIT_ISH_Q0_FULL | \
+		 FIT_ISH_Q1_FULL | \
+		 FIT_ISH_Q2_FULL | \
+		 FIT_ISH_Q3_FULL | \
+		 FIT_ISH_QCMD_FIFO_OVERRUN | \
+		 FIT_ISH_BAD_EXP_ROM_READ)
+
+#define FIT_INT_QUEUE_FULL	\
+		(FIT_ISH_Q0_FULL | \
+		FIT_ISH_Q1_FULL | \
+		FIT_ISH_Q2_FULL | \
+		FIT_ISH_Q3_FULL)
+
+
+#define MSI_MSG_NWL_ERROR_0             0x00000000
+#define MSI_MSG_NWL_ERROR_1             0x00000001
+#define MSI_MSG_NWL_ERROR_2             0x00000002
+#define MSI_MSG_NWL_ERROR_3             0x00000003
+#define MSI_MSG_STATE_CHANGE            0x00000004
+#define MSI_MSG_COMPLETION_POSTED       0x00000005
+#define MSI_MSG_MSG_FROM_DEV            0x00000006
+#define MSI_MSG_RESERVED_0              0x00000007
+#define MSI_MSG_RESERVED_1              0x00000008
+#define MSI_MSG_QUEUE_0_FULL            0x00000009
+#define MSI_MSG_QUEUE_1_FULL            0x0000000A
+#define MSI_MSG_QUEUE_2_FULL            0x0000000B
+#define MSI_MSG_QUEUE_3_FULL            0x0000000C
+
+
+
+#define FIT_INT_RESERVED_MASK	\
+		(FIT_ISH_UNDEFINED_3 | \
+		FIT_ISH_UNDEFINED_4)
+/*
+ * Interrupt mask, 32-bit r/w
+ * Bit definitions are the same as FIT_INT_STATUS_HOST
+ */
+#define FIT_INT_MASK_HOST           0x528u
+
+
+/*
+ * Message to device, 32-bit r/w
+ */
+#define FIT_MSG_TO_DEVICE           0x540u
+
+/*
+ * Message from device, 32-bit, r/o
+ */
+#define FIT_MSG_FROM_DEVICE         0x548u
+
+
+/*
+ * 32-bit messages to/from device, composition/extraction macros
+ */
+#define FIT_MXD_CONS(TYPE, PARAM, DATA) \
+	((((TYPE)  & 0xFFu) << 24u) | \
+	(((PARAM) & 0xFFu) << 16u) | \
+	(((DATA)  & 0xFFFFu) << 0u))
+#define FIT_MXD_TYPE(MXD)               (((MXD) >> 24u) & 0xFFu)
+#define FIT_MXD_PARAM(MXD)              (((MXD) >> 16u) & 0xFFu)
+#define FIT_MXD_DATA(MXD)               (((MXD) >> 0u) & 0xFFFFu)
+
+
+/*
+ * Types of messages to/from device
+ */
+#define FIT_MTD_FITFW_INIT              0x01u
+#define FIT_MTD_GET_CMDQ_DEPTH          0x02u
+#define FIT_MTD_SET_COMPQ_DEPTH         0x03u
+#define FIT_MTD_SET_COMPQ_ADDR          0x04u
+#define FIT_MTD_ARM_QUEUE               0x05u
+#define FIT_MTD_CMD_LOG_HOST_ID         0x07u
+#define FIT_MTD_CMD_LOG_TIME_STAMP_LO   0x08u
+#define FIT_MTD_CMD_LOG_TIME_STAMP_HI   0x09u
+#define FIT_MFD_SMART_EXCEEDED          0x10u
+#define FIT_MFD_POWER_DOWN		        0x11u
+#define FIT_MFD_OFFLINE			        0x12u
+#define FIT_MFD_ONLINE			        0x13u
+#define FIT_MFD_FW_RESTARTING		    0x14u
+#define FIT_MFD_PM_ACTIVE		        0x15u
+#define FIT_MFD_PM_STANDBY		        0x16u
+#define FIT_MFD_PM_SLEEP		        0x17u
+#define FIT_MFD_CMD_PROGRESS		    0x18u
+
+#ifndef SKD_OMIT_FROM_SRC_DIST
+#define FIT_MTD_DEBUG                   0xFEu
+#define FIT_MFD_DEBUG                   0xFFu
+#endif /* SKD_OMIT_FROM_SRC_DIST */
+
+#define FIT_MFD_MASK			(0xFFu)
+#define FIT_MFD_DATA_MASK		(0xFFu)
+#define FIT_MFD_MSG(x)			(((x) >> 24) & FIT_MFD_MASK)
+#define FIT_MFD_DATA(x)			((x) & FIT_MFD_MASK)
+
+
+/*
+ * Extra arg to FIT_MSG_TO_DEVICE, 64-bit r/w
+ * Used to set completion queue address (FIT_MTD_SET_COMPQ_ADDR)
+ * (was Response buffer in docs)
+ */
+#define FIT_MSG_TO_DEVICE_ARG       0x580u
+
+/*
+ * Hardware (ASIC) version, 32-bit r/o
+ */
+#define FIT_HW_VERSION              0x588u
+
+/*
+ * Scatter/gather list descriptor.
+ * 32-bytes and must be aligned on a 32-byte boundary.
+ * All fields are in little endian order.
+ */
+struct fit_sg_descriptor {
+	uint32_t control;
+	uint32_t byte_count;
+	uint64_t host_side_addr;
+	uint64_t dev_side_addr;
+	uint64_t next_desc_ptr;
+};
+
+#define FIT_SGD_CONTROL_NOT_LAST    0x000u
+#define FIT_SGD_CONTROL_LAST        0x40Eu
+
+/*
+ * Header at the beginning of a FIT message. The header
+ * is followed by SSDI requests each 64 bytes.
+ * A FIT message can be up to 512 bytes long and must start
+ * on a 64-byte boundary.
+ */
+struct fit_msg_hdr {
+	uint8_t protocol_id;
+	uint8_t num_protocol_cmds_coalesced;
+	uint8_t _reserved[62];
+};
+
+#define FIT_PROTOCOL_ID_FIT     1
+#define FIT_PROTOCOL_ID_SSDI    2
+#define FIT_PROTOCOL_ID_SOFIT   3
+
+
+#define FIT_PROTOCOL_MINOR_VER(mtd_val) ((mtd_val >> 16) & 0xF)
+#define FIT_PROTOCOL_MAJOR_VER(mtd_val) ((mtd_val >> 20) & 0xF)
+
+#ifndef SKD_OMIT_FROM_SRC_DIST
+/*
+ * Format of a completion entry. The completion queue is circular
+ * and must have at least as many entries as the maximum number
+ * of commands that may be issued to the device.
+ *
+ * There are no head/tail pointers. The cycle value is used to
+ * infer the presence of new completion records.
+ * Initially the cycle in all entries is 0, the index is 0, and
+ * the cycle value to expect is 1. When completions are added
+ * their cycle values are set to 1. When the index wraps the
+ * cycle value to expect is incremented.
+ *
+ * Command_context is opaque and taken verbatim from the SSDI command.
+ * All other fields are big endian.
+ */
+#endif /* SKD_OMIT_FROM_SRC_DIST */
+#define FIT_PROTOCOL_VERSION_0          0
+
+/*
+ *  Protocol major version 1 completion entry.
+ *  The major protocol version is found in bits
+ *  20-23 of the FIT_MTD_FITFW_INIT response.
+ */
+struct fit_completion_entry_v1 {
+	uint32_t	num_returned_bytes;
+	uint16_t	tag;
+	uint8_t		status;  /* SCSI status */
+	uint8_t		cycle;
+};
+#define FIT_PROTOCOL_VERSION_1          1
+#define FIT_PROTOCOL_VERSION_CURRENT    FIT_PROTOCOL_VERSION_1
+
+struct fit_comp_error_info {
+	uint8_t		type:7; /* 00: Bits0-6 indicates the type of sense data. */
+	uint8_t		valid:1; /* 00: Bit 7 := 1 ==> info field is valid. */
+	uint8_t		reserved0; /* 01: Obsolete field */
+	uint8_t		key:4; /* 02: Bits0-3 indicate the sense key. */
+	uint8_t		reserved2:1; /* 02: Reserved bit. */
+	uint8_t		bad_length:1; /* 02: Incorrect Length Indicator */
+	uint8_t		end_medium:1; /* 02: End of Medium */
+	uint8_t		file_mark:1; /* 02: Filemark */
+	uint8_t		info[4]; /* 03: */
+	uint8_t		reserved1; /* 07: Additional Sense Length */
+	uint8_t		cmd_spec[4]; /* 08: Command Specific Information */
+	uint8_t		code; /* 0C: Additional Sense Code */
+	uint8_t		qual; /* 0D: Additional Sense Code Qualifier */
+	uint8_t		fruc; /* 0E: Field Replaceable Unit Code */
+	uint8_t		sks_high:7; /* 0F: Sense Key Specific (MSB) */
+	uint8_t		sks_valid:1; /* 0F: Sense Key Specific Valid */
+	uint16_t	sks_low; /* 10: Sense Key Specific (LSW) */
+	uint16_t	reserved3; /* 12: Part of additional sense bytes (unused) */
+	uint16_t	uec; /* 14: Additional Sense Bytes */
+	uint64_t	per; /* 16: Additional Sense Bytes */
+	uint8_t		reserved4[2]; /* 1E: Additional Sense Bytes (unused) */
+};
+
+
+/* Task management constants */
+#define SOFT_TASK_SIMPLE            0x00
+#define SOFT_TASK_HEAD_OF_QUEUE     0x01
+#define SOFT_TASK_ORDERED           0x02
+
+
+/* Version zero has the last 32 bits reserved,
+ * Version one has the last 32 bits sg_list_len_bytes;
+ */
+struct skd_command_header {
+	uint64_t	sg_list_dma_address;
+	uint16_t	tag;
+	uint8_t		attribute;
+	uint8_t		add_cdb_len;     /* In 32 bit words */
+	uint32_t	sg_list_len_bytes;
+};
+
+struct skd_scsi_request {
+	struct		skd_command_header hdr;
+	unsigned char	cdb[16];
+/*	unsigned char _reserved[16]; */
+};
+
+struct driver_inquiry_data {
+	uint8_t		peripheral_device_type:5;
+	uint8_t		qualifier:3;
+	uint8_t		page_code;
+	uint16_t	page_length;
+	uint16_t	pcie_bus_number;
+	uint8_t		pcie_device_number;
+	uint8_t		pcie_function_number;
+	uint8_t		pcie_link_speed;
+	uint8_t		pcie_link_lanes;
+	uint16_t	pcie_vendor_id;
+	uint16_t	pcie_device_id;
+	uint16_t	pcie_subsystem_vendor_id;
+	uint16_t	pcie_subsystem_device_id;
+	uint8_t		reserved1[2];
+	uint8_t		reserved2[3];
+	uint8_t		driver_version_length;
+	uint8_t		driver_version[0x14];
+};
+
+#pragma pack(pop, s1120_h)
+
+#endif /* SKD_S1120_H */

From 7badfb1c34f85e2b68252e4596d96e9b76f2b0ac Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 17 Oct 2013 16:38:30 -0600
Subject: [PATCH 12/94] block: disable cpqarray in Kconfig

Mike writes:

"cpqarray hasn't been used in over 12 years. It's doubtful that anyone
 still uses the board. It's time the driver was removed from the mainline
 kernel.  The only updates these days are minor and mostly done by people
 outside of HP."

If nobody yells, we'll remove it from the kernel tree completely
for 3.15.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 555aed0b50dd..f616109a57a6 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -107,7 +107,7 @@ source "drivers/block/mtip32xx/Kconfig"
 
 config BLK_CPQ_DA
 	tristate "Compaq SMART2 support"
-	depends on PCI && VIRT_TO_BUS
+	depends on PCI && VIRT_TO_BUS && 0
 	help
 	  This is the driver for Compaq Smart Array controllers.  Everyone
 	  using these boards should say Y here.  See the file

From 69babf05cbe909a9a520b39772655f88b407f257 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 23 Oct 2013 10:59:15 +0200
Subject: [PATCH 13/94] drbd: fix NULL pointer deref in module init error path

If we want to iterate over the (as of yet still empty) list in the
cleanup path, we need to initialize the list before the first goto fail.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/drbd/drbd_main.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 55635edf563b..9e3818b1bc83 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2750,13 +2750,6 @@ int __init drbd_init(void)
 		return err;
 	}
 
-	err = drbd_genl_register();
-	if (err) {
-		printk(KERN_ERR "drbd: unable to register generic netlink family\n");
-		goto fail;
-	}
-
-
 	register_reboot_notifier(&drbd_notifier);
 
 	/*
@@ -2767,6 +2760,15 @@ int __init drbd_init(void)
 	drbd_proc = NULL; /* play safe for drbd_cleanup */
 	idr_init(&minors);
 
+	rwlock_init(&global_state_lock);
+	INIT_LIST_HEAD(&drbd_tconns);
+
+	err = drbd_genl_register();
+	if (err) {
+		printk(KERN_ERR "drbd: unable to register generic netlink family\n");
+		goto fail;
+	}
+
 	err = drbd_create_mempools();
 	if (err)
 		goto fail;
@@ -2778,9 +2780,6 @@ int __init drbd_init(void)
 		goto fail;
 	}
 
-	rwlock_init(&global_state_lock);
-	INIT_LIST_HEAD(&drbd_tconns);
-
 	retry.wq = create_singlethread_workqueue("drbd-reissue");
 	if (!retry.wq) {
 		printk(KERN_ERR "drbd: unable to create retry workqueue\n");

From b874d231e115af9b2c4a7ed1a4c5ae2db8a21aaf Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Wed, 23 Oct 2013 10:59:16 +0200
Subject: [PATCH 14/94] drbd: Fix an connection drop issue after enabling
 allow-two-primaries

Since drbd-8.4.0 it is possible to change the allow-two-primaries
network option while the connection is established.

The sequence code used to partially order packets from the
data socket with packets from the meta-data socket, still assued
that the allow-two-primaries option is constant while the
connection is established.

I.e.
On a node that has the RESOLVE_CONFLICTS bits set, after enabling
allow-two-primaries, when receiving the next data packet it timed out
while waiting for the necessary packets on the data socket to arrive
(wait_for_and_update_peer_seq() function).

Fixed that by always tracking the sequence number, but only waiting
for it if allow-two-primaries is set.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/drbd/drbd_receiver.c | 39 ++++++++++++------------------
 1 file changed, 16 insertions(+), 23 deletions(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index cc29cd3bf78b..12c59eb3b127 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1890,29 +1890,11 @@ static u32 seq_max(u32 a, u32 b)
 	return seq_greater(a, b) ? a : b;
 }
 
-static bool need_peer_seq(struct drbd_conf *mdev)
-{
-	struct drbd_tconn *tconn = mdev->tconn;
-	int tp;
-
-	/*
-	 * We only need to keep track of the last packet_seq number of our peer
-	 * if we are in dual-primary mode and we have the resolve-conflicts flag set; see
-	 * handle_write_conflicts().
-	 */
-
-	rcu_read_lock();
-	tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries;
-	rcu_read_unlock();
-
-	return tp && test_bit(RESOLVE_CONFLICTS, &tconn->flags);
-}
-
 static void update_peer_seq(struct drbd_conf *mdev, unsigned int peer_seq)
 {
 	unsigned int newest_peer_seq;
 
-	if (need_peer_seq(mdev)) {
+	if (test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags)) {
 		spin_lock(&mdev->peer_seq_lock);
 		newest_peer_seq = seq_max(mdev->peer_seq, peer_seq);
 		mdev->peer_seq = newest_peer_seq;
@@ -1972,22 +1954,31 @@ static int wait_for_and_update_peer_seq(struct drbd_conf *mdev, const u32 peer_s
 {
 	DEFINE_WAIT(wait);
 	long timeout;
-	int ret;
+	int ret = 0, tp;
 
-	if (!need_peer_seq(mdev))
+	if (!test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags))
 		return 0;
 
 	spin_lock(&mdev->peer_seq_lock);
 	for (;;) {
 		if (!seq_greater(peer_seq - 1, mdev->peer_seq)) {
 			mdev->peer_seq = seq_max(mdev->peer_seq, peer_seq);
-			ret = 0;
 			break;
 		}
+
 		if (signal_pending(current)) {
 			ret = -ERESTARTSYS;
 			break;
 		}
+
+		rcu_read_lock();
+		tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries;
+		rcu_read_unlock();
+
+		if (!tp)
+			break;
+
+		/* Only need to wait if two_primaries is enabled */
 		prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
 		spin_unlock(&mdev->peer_seq_lock);
 		rcu_read_lock();
@@ -2228,8 +2219,10 @@ static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi)
 			}
 			goto out_interrupted;
 		}
-	} else
+	} else {
+		update_peer_seq(mdev, peer_seq);
 		spin_lock_irq(&mdev->tconn->req_lock);
+	}
 	list_add(&peer_req->w.list, &mdev->active_ee);
 	spin_unlock_irq(&mdev->tconn->req_lock);
 

From 57737adc965e45fcb03662fe6f93f6efb19e2c0a Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Wed, 23 Oct 2013 10:59:17 +0200
Subject: [PATCH 15/94] drbd: Fix adding of new minors with freshly created
 meta data

Online adding of new minors with freshly created meta data
to an resource with an established connection failed, with a
wrong state transition on one side on one side of the new minor.

Freshly created meta-data has a la_size (last agreed size) of 0.
When we online add such devices, the code wrongly got into
the code path for resyncing new storage that was added while
the disk was detached.

Fixed that by making the GREW from ZERO a special case.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/drbd/drbd_int.h | 3 ++-
 drivers/block/drbd/drbd_nl.c  | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 2d7f608d181c..0e06f0c5dd1e 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1474,7 +1474,8 @@ enum determine_dev_size {
 	DS_ERROR = -1,
 	DS_UNCHANGED = 0,
 	DS_SHRUNK = 1,
-	DS_GREW = 2
+	DS_GREW = 2,
+	DS_GREW_FROM_ZERO = 3,
 };
 extern enum determine_dev_size
 drbd_determine_dev_size(struct drbd_conf *, enum dds_flags, struct resize_parms *) __must_hold(local);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 8cc1e640f485..37dad18ba153 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -955,7 +955,7 @@ drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags, struct res
 	}
 
 	if (size > la_size_sect)
-		rv = DS_GREW;
+		rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO;
 	if (size < la_size_sect)
 		rv = DS_SHRUNK;
 

From d2da5b0cb522c48f8e2f311e6e9b212535371b56 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 23 Oct 2013 10:59:18 +0200
Subject: [PATCH 16/94] drbd: fix decoding of bitmap vli rle for device sizes >
 64 TB

Symptoms: disconnect after bitmap exchange due to
bitmap overflow (e:49731075554) while decoding bm RLE packet

In the decoding step of the variable length integer run length encoding
there was potentially an uncatched bitshift by wordsize (variable >> 64).

The result of which is "undefined" :(
(only "sometimes" the result is the desired 0)

Fix: don't do any bit shift magic for shift == 64, just assign.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/drbd/drbd_receiver.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 12c59eb3b127..6fa6673b36b3 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -4125,7 +4125,11 @@ recv_bm_rle_bits(struct drbd_conf *mdev,
 				(unsigned int)bs.buf_len);
 			return -EIO;
 		}
-		look_ahead >>= bits;
+		/* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */
+		if (likely(bits < 64))
+			look_ahead >>= bits;
+		else
+			look_ahead = 0;
 		have -= bits;
 
 		bits = bitstream_get_bits(&bs, &tmp, 64 - have);

From 35f47ef1a1f069cd2f346314fb8212bb49571eac Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 23 Oct 2013 10:59:19 +0200
Subject: [PATCH 17/94] drbd: avoid to shrink max_bio_size due to peer
 re-configuration

For a long time, the receiving side has spread "too large" incoming
requests over multiple bios.  No need to shrink our max_bio_size
(max_hw_sectors) if the peer is reconfigured to use a different storage.

The problem manifests itself if we are not the top of the device stack
(DRBD is used a LVM PV).

A hardware reconfiguration on the peer may cause the supported
max_bio_size to shrink, and the connection handshake would now
unnecessarily shrink the max_bio_size on the active node.

There is no way to notify upper layers that they have to "re-stack"
their limits. So they won't notice at all, and may keep submitting bios
that are suddenly considered "too large for device".

We already check for compatibility and ignore changes on the peer,
the code only was masked out unless we have a fully established connection.
We just need to allow it a bit earlier during the handshake.

Also consider max_hw_sectors in our merge bvec function, just in case.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/drbd/drbd_nl.c  | 4 ++--
 drivers/block/drbd/drbd_req.c | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 37dad18ba153..c706d50a8b06 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1132,9 +1132,9 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev)
 	/* We may ignore peer limits if the peer is modern enough.
 	   Because new from 8.3.8 onwards the peer can use multiple
 	   BIOs for a single peer_request */
-	if (mdev->state.conn >= C_CONNECTED) {
+	if (mdev->state.conn >= C_WF_REPORT_PARAMS) {
 		if (mdev->tconn->agreed_pro_version < 94)
-			peer = min( mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+			peer = min(mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
 			/* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
 		else if (mdev->tconn->agreed_pro_version == 94)
 			peer = DRBD_MAX_SIZE_H80_PACKET;
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index c24379ffd4e3..fec7bef44994 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1306,6 +1306,7 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
 	int backing_limit;
 
 	if (bio_size && get_ldev(mdev)) {
+		unsigned int max_hw_sectors = queue_max_hw_sectors(q);
 		struct request_queue * const b =
 			mdev->ldev->backing_bdev->bd_disk->queue;
 		if (b->merge_bvec_fn) {
@@ -1313,6 +1314,8 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
 			limit = min(limit, backing_limit);
 		}
 		put_ldev(mdev);
+		if ((limit >> 9) > max_hw_sectors)
+			limit = max_hw_sectors << 9;
 	}
 	return limit;
 }

From e35f38bf73b6c9ec9521d9deb94198a419692db5 Mon Sep 17 00:00:00 2001
From: Philip J Kelleher <pjk1939@linux.vnet.ibm.com>
Date: Fri, 18 Oct 2013 17:11:46 -0500
Subject: [PATCH 18/94] rsxx: Disallow discards from being unmapped.

This patch fixes a bug in which discards were always
calling pci_unmap_page. Discards should never call the
pci_unmap_page function call because they are never mapped.

This caused a race condition on PowerPC systems when issuing
discards, writes, and reads all at the same time. The
pci_map_page function would eventually map logical address
0 for a read or write. Discards are always assigned a DMA
address of 0 because they are never mapped. So if
pci_map_page mapped address 0 for a DMA and a discard was
"unmapped" then the address would be freed and would cause
an EEH event to occur when Hardware accesses the address.

This was injected/uncovered in commit:
b347f9cf0bc8d42ee95ba1d3837fd93045ab336b

The pci_dma_mapping_error function declares -1 a DMA_ERROR
not 0 like initially thought So before we would never unmap
discards because they were considered NULL.

This patch should fall on top of commit id:
fc1967bb08a6184ed44ef990e1dd4389901b809c

Also, the driver version is being up dated.

Signed-off-by: Philip J Kelleher <pjk1939@linux.vnet.ibm.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/rsxx/dma.c       | 27 ++++++++++++++++-----------
 drivers/block/rsxx/rsxx_priv.h |  2 +-
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c
index 4103601ae675..fc88ba3e1bd2 100644
--- a/drivers/block/rsxx/dma.c
+++ b/drivers/block/rsxx/dma.c
@@ -223,12 +223,14 @@ static void dma_intr_coal_auto_tune(struct rsxx_cardinfo *card)
 /*----------------- RSXX DMA Handling -------------------*/
 static void rsxx_free_dma(struct rsxx_dma_ctrl *ctrl, struct rsxx_dma *dma)
 {
-	if (!pci_dma_mapping_error(ctrl->card->dev, dma->dma_addr)) {
-		pci_unmap_page(ctrl->card->dev, dma->dma_addr,
-			       get_dma_size(dma),
-			       dma->cmd == HW_CMD_BLK_WRITE ?
-					   PCI_DMA_TODEVICE :
-					   PCI_DMA_FROMDEVICE);
+	if (dma->cmd != HW_CMD_BLK_DISCARD) {
+		if (!pci_dma_mapping_error(ctrl->card->dev, dma->dma_addr)) {
+			pci_unmap_page(ctrl->card->dev, dma->dma_addr,
+				       get_dma_size(dma),
+				       dma->cmd == HW_CMD_BLK_WRITE ?
+						   PCI_DMA_TODEVICE :
+						   PCI_DMA_FROMDEVICE);
+		}
 	}
 
 	kmem_cache_free(rsxx_dma_pool, dma);
@@ -1057,11 +1059,14 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
 			else
 				card->ctrl[i].stats.reads_issued--;
 
-			pci_unmap_page(card->dev, dma->dma_addr,
-				       get_dma_size(dma),
-				       dma->cmd == HW_CMD_BLK_WRITE ?
-				       PCI_DMA_TODEVICE :
-				       PCI_DMA_FROMDEVICE);
+			if (dma->cmd != HW_CMD_BLK_DISCARD) {
+				pci_unmap_page(card->dev, dma->dma_addr,
+					       get_dma_size(dma),
+					       dma->cmd == HW_CMD_BLK_WRITE ?
+					       PCI_DMA_TODEVICE :
+					       PCI_DMA_FROMDEVICE);
+			}
+
 			list_add_tail(&dma->list, &issued_dmas[i]);
 			push_tracker(card->ctrl[i].trackers, j);
 			cnt++;
diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h
index 913740e53d31..23fa05630fef 100644
--- a/drivers/block/rsxx/rsxx_priv.h
+++ b/drivers/block/rsxx/rsxx_priv.h
@@ -52,7 +52,7 @@ struct proc_cmd;
 #define RS70_PCI_REV_SUPPORTED	4
 
 #define DRIVER_NAME "rsxx"
-#define DRIVER_VERSION "4.0.1.2498"
+#define DRIVER_VERSION "4.0.2.2510"
 
 /* Block size is 4096 */
 #define RSXX_HW_BLK_SHIFT		12

From 8c49a77ca451541938f90008f419fca965b76b72 Mon Sep 17 00:00:00 2001
From: Philip J Kelleher <pjk1939@linux.vnet.ibm.com>
Date: Fri, 18 Oct 2013 17:12:35 -0500
Subject: [PATCH 19/94] rsxx: Fix possible kernel panic with invalid config.

This patch fixes a possible Kernel Panic on driver load if
the configuration on the card is messed up or not yet set.
The driver could possible give a 32 bit unsigned all Fs to
the kernel as the device's block size.

Now we only write the block size to the kernel if the
configuration from the card is valid.

Also, driver version is being updated.

Signed-off-by: Philip J Kelleher <pjk1939@linux.vnet.ibm.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/rsxx/dev.c       | 8 +++++---
 drivers/block/rsxx/rsxx_priv.h | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index d7af441880be..2284f5d3a54a 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -295,13 +295,15 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card)
 		return -ENOMEM;
 	}
 
-	blk_size = card->config.data.block_size;
+	if (card->config_valid) {
+		blk_size = card->config.data.block_size;
+		blk_queue_dma_alignment(card->queue, blk_size - 1);
+		blk_queue_logical_block_size(card->queue, blk_size);
+	}
 
 	blk_queue_make_request(card->queue, rsxx_make_request);
 	blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY);
-	blk_queue_dma_alignment(card->queue, blk_size - 1);
 	blk_queue_max_hw_sectors(card->queue, blkdev_max_hw_sectors);
-	blk_queue_logical_block_size(card->queue, blk_size);
 	blk_queue_physical_block_size(card->queue, RSXX_HW_BLK_SIZE);
 
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, card->queue);
diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h
index 23fa05630fef..6bbc64d0f690 100644
--- a/drivers/block/rsxx/rsxx_priv.h
+++ b/drivers/block/rsxx/rsxx_priv.h
@@ -52,7 +52,7 @@ struct proc_cmd;
 #define RS70_PCI_REV_SUPPORTED	4
 
 #define DRIVER_NAME "rsxx"
-#define DRIVER_VERSION "4.0.2.2510"
+#define DRIVER_VERSION "4.0.3.2516"
 
 /* Block size is 4096 */
 #define RSXX_HW_BLK_SHIFT		12

From f721bb0dbd3fd37f16c49c97155f40f22496a970 Mon Sep 17 00:00:00 2001
From: Akhil Bhansali <bhansaliakhil@gmail.com>
Date: Wed, 23 Oct 2013 13:00:08 +0100
Subject: [PATCH 20/94] skd: Fix checkpatch ERRORS and removed unused functions

This patch fixes checkpatch.pl errors for assignment in if condition.
It also removes unused readq / readl function calls.

As Andrew had disabled the compilation of drivers for 32 bit,
I have modified format specifiers in few VPRINTKs to avoid warnings
during 64 bit compilation.

Signed-off-by: Akhil Bhansali <abhansali@stec-inc.com>
Reviewed-by: Ramprasad Chinthekindi <rchinthekindi@stec-inc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/skd_main.c | 53 +++++++++++++++++++++++-----------------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 3110f68ecedd..308bf474dc39 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -420,14 +420,10 @@ static inline void skd_reg_write32(struct skd_device *skdev, u32 val,
 	if (likely(skdev->dbg_level < 2)) {
 		writel(val, skdev->mem_map[1] + offset);
 		barrier();
-		readl(skdev->mem_map[1] + offset);
-		barrier();
 	} else {
 		barrier();
 		writel(val, skdev->mem_map[1] + offset);
 		barrier();
-		readl(skdev->mem_map[1] + offset);
-		barrier();
 		VPRINTK(skdev, "offset %x = %x\n", offset, val);
 	}
 }
@@ -438,14 +434,10 @@ static inline void skd_reg_write64(struct skd_device *skdev, u64 val,
 	if (likely(skdev->dbg_level < 2)) {
 		writeq(val, skdev->mem_map[1] + offset);
 		barrier();
-		readq(skdev->mem_map[1] + offset);
-		barrier();
 	} else {
 		barrier();
 		writeq(val, skdev->mem_map[1] + offset);
 		barrier();
-		readq(skdev->mem_map[1] + offset);
-		barrier();
 		VPRINTK(skdev, "offset %x = %016llx\n", offset, val);
 	}
 }
@@ -1656,18 +1648,36 @@ static int skd_ioctl_sg_io(struct skd_device *skdev, fmode_t mode,
 		goto out;
 	}
 
-	if ((rc = skd_sg_io_get_and_check_args(skdev, &sksgio)) ||
-	    (rc = skd_sg_io_obtain_skspcl(skdev, &sksgio)) ||
-	    (rc = skd_sg_io_prep_buffering(skdev, &sksgio)) ||
-	    (rc = skd_sg_io_copy_buffer(skdev, &sksgio, SG_DXFER_TO_DEV)))
+	rc = skd_sg_io_get_and_check_args(skdev, &sksgio);
+	if (rc)
 		goto out;
 
-	if ((rc = skd_sg_io_send_fitmsg(skdev, &sksgio)) ||
-	    (rc = skd_sg_io_await(skdev, &sksgio)))
+	rc = skd_sg_io_obtain_skspcl(skdev, &sksgio);
+	if (rc)
 		goto out;
 
-	if ((rc = skd_sg_io_copy_buffer(skdev, &sksgio, SG_DXFER_FROM_DEV)) ||
-	    (rc = skd_sg_io_put_status(skdev, &sksgio)))
+	rc = skd_sg_io_prep_buffering(skdev, &sksgio);
+	if (rc)
+		goto out;
+
+	rc = skd_sg_io_copy_buffer(skdev, &sksgio, SG_DXFER_TO_DEV);
+	if (rc)
+		goto out;
+
+	rc = skd_sg_io_send_fitmsg(skdev, &sksgio);
+	if (rc)
+		goto out;
+
+	rc = skd_sg_io_await(skdev, &sksgio);
+	if (rc)
+		goto out;
+
+	rc = skd_sg_io_copy_buffer(skdev, &sksgio, SG_DXFER_FROM_DEV);
+	if (rc)
+		goto out;
+
+	rc = skd_sg_io_put_status(skdev, &sksgio);
+	if (rc)
 		goto out;
 
 	rc = 0;
@@ -4556,11 +4566,10 @@ static int skd_cons_skmsg(struct skd_device *skdev)
 	int rc = 0;
 	u32 i;
 
-	VPRINTK(skdev, "skmsg_table kzalloc, struct %u, count %u total %lu\n",
+	VPRINTK(skdev, "skmsg_table kzalloc, struct %lu, count %u total %lu\n",
 		sizeof(struct skd_fitmsg_context),
 		skdev->num_fitmsg_context,
-		(unsigned long) sizeof(struct skd_fitmsg_context) *
-					skdev->num_fitmsg_context);
+		sizeof(struct skd_fitmsg_context) * skdev->num_fitmsg_context);
 
 	skdev->skmsg_table = kzalloc(sizeof(struct skd_fitmsg_context)
 				     *skdev->num_fitmsg_context, GFP_KERNEL);
@@ -4611,7 +4620,7 @@ static int skd_cons_skreq(struct skd_device *skdev)
 	int rc = 0;
 	u32 i;
 
-	VPRINTK(skdev, "skreq_table kzalloc, struct %u, count %u total %u\n",
+	VPRINTK(skdev, "skreq_table kzalloc, struct %lu, count %u total %lu\n",
 		sizeof(struct skd_request_context),
 		skdev->num_req_context,
 		sizeof(struct skd_request_context) * skdev->num_req_context);
@@ -4623,7 +4632,7 @@ static int skd_cons_skreq(struct skd_device *skdev)
 		goto err_out;
 	}
 
-	VPRINTK(skdev, "alloc sg_table sg_per_req %u scatlist %u total %u\n",
+	VPRINTK(skdev, "alloc sg_table sg_per_req %u scatlist %lu total %lu\n",
 		skdev->sgs_per_request, sizeof(struct scatterlist),
 		skdev->sgs_per_request * sizeof(struct scatterlist));
 
@@ -4668,7 +4677,7 @@ static int skd_cons_skspcl(struct skd_device *skdev)
 	int rc = 0;
 	u32 i, nbytes;
 
-	VPRINTK(skdev, "skspcl_table kzalloc, struct %u, count %u total %u\n",
+	VPRINTK(skdev, "skspcl_table kzalloc, struct %lu, count %u total %lu\n",
 		sizeof(struct skd_special_context),
 		skdev->n_special,
 		sizeof(struct skd_special_context) * skdev->n_special);

From 2e44b42718ac49a397324a360df4ecab617b3fe2 Mon Sep 17 00:00:00 2001
From: rchinthekindi <rchinthekindi@stec-inc.com>
Date: Thu, 24 Oct 2013 12:51:23 +0100
Subject: [PATCH 21/94] skd: Replaced custom debug PRINTKs with pr_debug

Replaced DPRINTK() and VPRINTK() with pr_debug().

Signed-off-by: Ramprasad C <ramprasad.chinthekindi@hgst.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/skd_main.c | 691 +++++++++++++++++++++++----------------
 1 file changed, 407 insertions(+), 284 deletions(-)

diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 308bf474dc39..ab17bff6b4f9 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -68,14 +68,6 @@ enum {
 	SKD_FLUSH_DATA_SECOND,
 };
 
-#define DPRINTK(skdev, fmt, args ...) \
-	do { \
-		if (unlikely((skdev)->dbg_level > 0)) {	\
-			pr_err("%s:%s:%d " fmt, (skdev)->name,	\
-			       __func__, __LINE__, ## args); \
-		} \
-	} while (0)
-
 #define SKD_ASSERT(expr) \
 	do { \
 		if (unlikely(!(expr))) { \
@@ -84,15 +76,6 @@ enum {
 		} \
 	} while (0)
 
-#define VPRINTK(skdev, fmt, args ...) \
-	do { \
-		if (unlikely((skdev)->dbg_level > 1)) {	\
-			pr_err("%s:%s:%d " fmt, (skdev)->name,	\
-			       __func__, __LINE__, ## args); \
-		} \
-	} while (0)
-
-
 #define DRV_NAME "skd"
 #define DRV_VERSION "2.2.1"
 #define DRV_BUILD_ID "0260"
@@ -408,7 +391,8 @@ static inline u32 skd_reg_read32(struct skd_device *skdev, u32 offset)
 		barrier();
 		val = readl(skdev->mem_map[1] + offset);
 		barrier();
-		VPRINTK(skdev, "offset %x = %x\n", offset, val);
+		pr_debug("%s:%s:%d offset %x = %x\n",
+			 skdev->name, __func__, __LINE__, offset, val);
 		return val;
 	}
 
@@ -424,7 +408,8 @@ static inline void skd_reg_write32(struct skd_device *skdev, u32 val,
 		barrier();
 		writel(val, skdev->mem_map[1] + offset);
 		barrier();
-		VPRINTK(skdev, "offset %x = %x\n", offset, val);
+		pr_debug("%s:%s:%d offset %x = %x\n",
+			 skdev->name, __func__, __LINE__, offset, val);
 	}
 }
 
@@ -438,7 +423,8 @@ static inline void skd_reg_write64(struct skd_device *skdev, u64 val,
 		barrier();
 		writeq(val, skdev->mem_map[1] + offset);
 		barrier();
-		VPRINTK(skdev, "offset %x = %016llx\n", offset, val);
+		pr_debug("%s:%s:%d offset %x = %016llx\n",
+			 skdev->name, __func__, __LINE__, offset, val);
 	}
 }
 
@@ -764,15 +750,17 @@ static void skd_request_fn(struct request_queue *q)
 			if (io_flags & REQ_FUA)
 				fua++;
 
-			VPRINTK(skdev,
-				"new req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n",
-				req, lba, lba, count, count, data_dir);
+			pr_debug("%s:%s:%d new req=%p lba=%u(0x%x) "
+				 "count=%u(0x%x) dir=%d\n",
+				 skdev->name, __func__, __LINE__,
+				 req, lba, lba, count, count, data_dir);
 		} else {
 			if (!list_empty(&skdev->flush_list)) {
 				/* Process data part of FLUSH request. */
 				bio = (struct bio *)skd_flush_cmd_dequeue(skdev);
 				flush++;
-				VPRINTK(skdev, "processing FLUSH request with data.\n");
+				pr_debug("%s:%s:%d processing FLUSH request with data.\n",
+					 skdev->name, __func__, __LINE__);
 			} else {
 				/* peek at our bio queue */
 				bio = bio_list_peek(&skdev->bio_queue);
@@ -787,9 +775,10 @@ static void skd_request_fn(struct request_queue *q)
 			data_dir = bio_data_dir(bio);
 			io_flags = bio->bi_rw;
 
-			VPRINTK(skdev,
-				"new bio=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n",
-				bio, lba, lba, count, count, data_dir);
+			pr_debug("%s:%s:%d new bio=%p lba=%u(0x%x) "
+				 "count=%u(0x%x) dir=%d\n",
+				 skdev->name, __func__, __LINE__,
+				 bio, lba, lba, count, count, data_dir);
 
 			if (io_flags & REQ_FLUSH)
 				flush++;
@@ -805,15 +794,17 @@ static void skd_request_fn(struct request_queue *q)
 
 		/* Are too many requets already in progress? */
 		if (skdev->in_flight >= skdev->cur_max_queue_depth) {
-			VPRINTK(skdev, "qdepth %d, limit %d\n",
-				skdev->in_flight, skdev->cur_max_queue_depth);
+			pr_debug("%s:%s:%d qdepth %d, limit %d\n",
+				 skdev->name, __func__, __LINE__,
+				 skdev->in_flight, skdev->cur_max_queue_depth);
 			break;
 		}
 
 		/* Is a skd_request_context available? */
 		skreq = skdev->skreq_free_list;
 		if (skreq == NULL) {
-			VPRINTK(skdev, "Out of req=%p\n", q);
+			pr_debug("%s:%s:%d Out of req=%p\n",
+				 skdev->name, __func__, __LINE__, q);
 			break;
 		}
 		SKD_ASSERT(skreq->state == SKD_REQ_STATE_IDLE);
@@ -822,7 +813,8 @@ static void skd_request_fn(struct request_queue *q)
 		/* Now we check to see if we can get a fit msg */
 		if (skmsg == NULL) {
 			if (skdev->skmsg_free_list == NULL) {
-				VPRINTK(skdev, "Out of msg\n");
+				pr_debug("%s:%s:%d Out of msg\n",
+					 skdev->name, __func__, __LINE__);
 				break;
 			}
 		}
@@ -862,7 +854,9 @@ static void skd_request_fn(struct request_queue *q)
 			/* Are there any FIT msg buffers available? */
 			skmsg = skdev->skmsg_free_list;
 			if (skmsg == NULL) {
-				VPRINTK(skdev, "Out of msg skdev=%p\n", skdev);
+				pr_debug("%s:%s:%d Out of msg skdev=%p\n",
+					 skdev->name, __func__, __LINE__,
+					 skdev);
 				break;
 			}
 			SKD_ASSERT(skmsg->state == SKD_MSG_STATE_IDLE);
@@ -945,7 +939,8 @@ static void skd_request_fn(struct request_queue *q)
 			 * only resource that has been allocated but might
 			 * not be used is that the FIT msg could be empty.
 			 */
-			DPRINTK(skdev, "error Out\n");
+			pr_debug("%s:%s:%d error Out\n",
+				 skdev->name, __func__, __LINE__);
 			skd_end_request(skdev, skreq, error);
 			continue;
 		}
@@ -970,8 +965,9 @@ skip_sg:
 		timo_slot = skreq->timeout_stamp & SKD_TIMEOUT_SLOT_MASK;
 		skdev->timeout_slot[timo_slot]++;
 		skdev->in_flight++;
-		VPRINTK(skdev, "req=0x%x busy=%d\n",
-			skreq->id, skdev->in_flight);
+		pr_debug("%s:%s:%d req=0x%x busy=%d\n",
+			 skdev->name, __func__, __LINE__,
+			 skreq->id, skdev->in_flight);
 
 		/*
 		 * If the FIT msg buffer is full send it.
@@ -993,8 +989,9 @@ skip_sg:
 	if (skmsg != NULL) {
 		/* Bigger than just a FIT msg header? */
 		if (skmsg->length > sizeof(struct fit_msg_hdr)) {
-			VPRINTK(skdev, "sending msg=%p, len %d\n",
-				skmsg, skmsg->length);
+			pr_debug("%s:%s:%d sending msg=%p, len %d\n",
+				 skdev->name, __func__, __LINE__,
+				 skmsg, skmsg->length);
 			skd_send_fitmsg(skdev, skmsg);
 		} else {
 			/*
@@ -1027,7 +1024,8 @@ static void skd_end_request_blk(struct skd_device *skdev,
 
 	if ((io_flags & REQ_DISCARD) &&
 		(skreq->discard_page == 1)) {
-		VPRINTK(skdev, "skd_end_request_blk, free the page!");
+		pr_debug("%s:%s:%d skd_end_request_blk, free the page!",
+			 skdev->name, __func__, __LINE__);
 		free_page((unsigned long)req->buffer);
 		req->buffer = NULL;
 	}
@@ -1041,7 +1039,8 @@ static void skd_end_request_blk(struct skd_device *skdev,
 		pr_err("(%s): Error cmd=%s sect=%u count=%u id=0x%x\n",
 		       skd_name(skdev), cmd, lba, count, skreq->id);
 	} else
-		VPRINTK(skdev, "id=0x%x error=%d\n", skreq->id, error);
+		pr_debug("%s:%s:%d id=0x%x error=%d\n",
+			 skdev->name, __func__, __LINE__, skreq->id, error);
 
 	__blk_end_request_all(skreq->req, error);
 }
@@ -1093,14 +1092,16 @@ static int skd_preop_sg_list_blk(struct skd_device *skdev,
 	skreq->sksg_list[n_sg - 1].control = FIT_SGD_CONTROL_LAST;
 
 	if (unlikely(skdev->dbg_level > 1)) {
-		VPRINTK(skdev, "skreq=%x sksg_list=%p sksg_dma=%llx\n",
-			skreq->id, skreq->sksg_list, skreq->sksg_dma_address);
+		pr_debug("%s:%s:%d skreq=%x sksg_list=%p sksg_dma=%llx\n",
+			 skdev->name, __func__, __LINE__,
+			 skreq->id, skreq->sksg_list, skreq->sksg_dma_address);
 		for (i = 0; i < n_sg; i++) {
 			struct fit_sg_descriptor *sgd = &skreq->sksg_list[i];
-			VPRINTK(skdev, "  sg[%d] count=%u ctrl=0x%x "
-				"addr=0x%llx next=0x%llx\n",
-				i, sgd->byte_count, sgd->control,
-				sgd->host_side_addr, sgd->next_desc_ptr);
+			pr_debug("%s:%s:%d   sg[%d] count=%u ctrl=0x%x "
+				 "addr=0x%llx next=0x%llx\n",
+				 skdev->name, __func__, __LINE__,
+				 i, sgd->byte_count, sgd->control,
+				 sgd->host_side_addr, sgd->next_desc_ptr);
 		}
 	}
 
@@ -1132,7 +1133,8 @@ static void skd_end_request_bio(struct skd_device *skdev,
 
 	if ((io_flags & REQ_DISCARD) &&
 		(skreq->discard_page == 1)) {
-		VPRINTK(skdev, "biomode: skd_end_request: freeing DISCARD page.\n");
+		pr_debug("%s:%s:%d biomode: skd_end_request: freeing DISCARD page.\n",
+			 skdev->name, __func__, __LINE__);
 		free_page((unsigned long)page_address(bio->bi_io_vec->bv_page));
 	}
 
@@ -1157,7 +1159,8 @@ static void skd_end_request_bio(struct skd_device *skdev,
 		part_stat_unlock();
 	}
 
-	VPRINTK(skdev, "id=0x%x error=%d\n", skreq->id, error);
+	pr_debug("%s:%s:%d id=0x%x error=%d\n",
+		 skdev->name, __func__, __LINE__, skreq->id, error);
 
 	bio_endio(skreq->bio, error);
 }
@@ -1229,14 +1232,16 @@ static int skd_preop_sg_list_bio(struct skd_device *skdev,
 	}
 
 	if (unlikely(skdev->dbg_level > 1)) {
-		VPRINTK(skdev, "skreq=%x sksg_list=%p sksg_dma=%llx\n",
-			skreq->id, skreq->sksg_list, skreq->sksg_dma_address);
+		pr_debug("%s:%s:%d skreq=%x sksg_list=%p sksg_dma=%llx\n",
+			 skdev->name, __func__, __LINE__,
+			 skreq->id, skreq->sksg_list, skreq->sksg_dma_address);
 		for (i = 0; i < n_sg; i++) {
 			struct fit_sg_descriptor *sgd = &skreq->sksg_list[i];
-			VPRINTK(skdev, "  sg[%d] count=%u ctrl=0x%x "
-				"addr=0x%llx next=0x%llx\n",
-				i, sgd->byte_count, sgd->control,
-				sgd->host_side_addr, sgd->next_desc_ptr);
+			pr_debug("%s:%s:%d   sg[%d] count=%u ctrl=0x%x "
+				 "addr=0x%llx next=0x%llx\n",
+				 skdev->name, __func__, __LINE__,
+				 i, sgd->byte_count, sgd->control,
+				 sgd->host_side_addr, sgd->next_desc_ptr);
 		}
 	}
 
@@ -1391,8 +1396,9 @@ static void skd_timer_tick(ulong arg)
 	/* Something is overdue */
 	overdue_timestamp = skdev->timeout_stamp - SKD_N_TIMEOUT_SLOT;
 
-	DPRINTK(skdev, "found %d timeouts, draining busy=%d\n",
-		skdev->timeout_slot[timo_slot], skdev->in_flight);
+	pr_debug("%s:%s:%d found %d timeouts, draining busy=%d\n",
+		 skdev->name, __func__, __LINE__,
+		 skdev->timeout_slot[timo_slot], skdev->in_flight);
 	pr_err("(%s): Overdue IOs (%d), busy %d\n",
 	       skd_name(skdev), skdev->timeout_slot[timo_slot],
 	       skdev->in_flight);
@@ -1415,8 +1421,9 @@ static void skd_timer_tick_not_online(struct skd_device *skdev)
 	case SKD_DRVR_STATE_LOAD:
 		break;
 	case SKD_DRVR_STATE_BUSY_SANITIZE:
-		VPRINTK(skdev, "drive busy sanitize[%x], driver[%x]\n",
-			skdev->drive_state, skdev->state);
+		pr_debug("%s:%s:%d drive busy sanitize[%x], driver[%x]\n",
+			 skdev->name, __func__, __LINE__,
+			 skdev->drive_state, skdev->state);
 		/* If we've been in sanitize for 3 seconds, we figure we're not
 		 * going to get anymore completions, so recover requests now
 		 */
@@ -1430,14 +1437,16 @@ static void skd_timer_tick_not_online(struct skd_device *skdev)
 	case SKD_DRVR_STATE_BUSY:
 	case SKD_DRVR_STATE_BUSY_IMMINENT:
 	case SKD_DRVR_STATE_BUSY_ERASE:
-		VPRINTK(skdev, "busy[%x], countdown=%d\n",
-			skdev->state, skdev->timer_countdown);
+		pr_debug("%s:%s:%d busy[%x], countdown=%d\n",
+			 skdev->name, __func__, __LINE__,
+			 skdev->state, skdev->timer_countdown);
 		if (skdev->timer_countdown > 0) {
 			skdev->timer_countdown--;
 			return;
 		}
-		DPRINTK(skdev, "busy[%x], timedout=%d, restarting device.",
-			skdev->state, skdev->timer_countdown);
+		pr_debug("%s:%s:%d busy[%x], timedout=%d, restarting device.",
+			 skdev->name, __func__, __LINE__,
+			 skdev->state, skdev->timer_countdown);
 		skd_restart_device(skdev);
 		break;
 
@@ -1470,15 +1479,17 @@ static void skd_timer_tick_not_online(struct skd_device *skdev)
 		break;
 
 	case SKD_DRVR_STATE_DRAINING_TIMEOUT:
-		DPRINTK(skdev,
-			"draining busy [%d] tick[%d] qdb[%d] tmls[%d]\n",
-			skdev->timo_slot,
-			skdev->timer_countdown,
-			skdev->in_flight,
-			skdev->timeout_slot[skdev->timo_slot]);
+		pr_debug("%s:%s:%d "
+			 "draining busy [%d] tick[%d] qdb[%d] tmls[%d]\n",
+			 skdev->name, __func__, __LINE__,
+			 skdev->timo_slot,
+			 skdev->timer_countdown,
+			 skdev->in_flight,
+			 skdev->timeout_slot[skdev->timo_slot]);
 		/* if the slot has cleared we can let the I/O continue */
 		if (skdev->timeout_slot[skdev->timo_slot] == 0) {
-			DPRINTK(skdev, "Slot drained, starting queue.\n");
+			pr_debug("%s:%s:%d Slot drained, starting queue.\n",
+				 skdev->name, __func__, __LINE__);
 			skdev->state = SKD_DRVR_STATE_ONLINE;
 			skd_start_queue(skdev);
 			return;
@@ -1601,8 +1612,9 @@ static int skd_bdev_ioctl(struct block_device *bdev, fmode_t mode,
 	struct skd_device *skdev = disk->private_data;
 	void __user *p = (void *)arg;
 
-	DPRINTK(skdev, "%s: CMD[%s] ioctl  mode 0x%x, cmd 0x%x arg %0lx\n",
-		disk->disk_name, current->comm, mode, cmd_in, arg);
+	pr_debug("%s:%s:%d %s: CMD[%s] ioctl  mode 0x%x, cmd 0x%x arg %0lx\n",
+		 skdev->name, __func__, __LINE__,
+		 disk->disk_name, current->comm, mode, cmd_in, arg);
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -1622,7 +1634,8 @@ static int skd_bdev_ioctl(struct block_device *bdev, fmode_t mode,
 		break;
 	}
 
-	DPRINTK(skdev, "%s:  completion rc %d\n", disk->disk_name, rc);
+	pr_debug("%s:%s:%d %s:  completion rc %d\n",
+		 skdev->name, __func__, __LINE__, disk->disk_name, rc);
 	return rc;
 }
 
@@ -1643,7 +1656,8 @@ static int skd_ioctl_sg_io(struct skd_device *skdev, fmode_t mode,
 		break;
 
 	default:
-		DPRINTK(skdev, "drive not online\n");
+		pr_debug("%s:%s:%d drive not online\n",
+			 skdev->name, __func__, __LINE__);
 		rc = -ENXIO;
 		goto out;
 	}
@@ -1697,33 +1711,38 @@ static int skd_sg_io_get_and_check_args(struct skd_device *skdev,
 	int i, acc;
 
 	if (!access_ok(VERIFY_WRITE, sksgio->argp, sizeof(sg_io_hdr_t))) {
-		DPRINTK(skdev, "access sg failed %p\n", sksgio->argp);
+		pr_debug("%s:%s:%d access sg failed %p\n",
+			 skdev->name, __func__, __LINE__, sksgio->argp);
 		return -EFAULT;
 	}
 
 	if (__copy_from_user(sgp, sksgio->argp, sizeof(sg_io_hdr_t))) {
-		DPRINTK(skdev, "copy_from_user sg failed %p\n", sksgio->argp);
+		pr_debug("%s:%s:%d copy_from_user sg failed %p\n",
+			 skdev->name, __func__, __LINE__, sksgio->argp);
 		return -EFAULT;
 	}
 
 	if (sgp->interface_id != SG_INTERFACE_ID_ORIG) {
-		DPRINTK(skdev, "interface_id invalid 0x%x\n",
-			sgp->interface_id);
+		pr_debug("%s:%s:%d interface_id invalid 0x%x\n",
+			 skdev->name, __func__, __LINE__, sgp->interface_id);
 		return -EINVAL;
 	}
 
 	if (sgp->cmd_len > sizeof(sksgio->cdb)) {
-		DPRINTK(skdev, "cmd_len invalid %d\n", sgp->cmd_len);
+		pr_debug("%s:%s:%d cmd_len invalid %d\n",
+			 skdev->name, __func__, __LINE__, sgp->cmd_len);
 		return -EINVAL;
 	}
 
 	if (sgp->iovec_count > 256) {
-		DPRINTK(skdev, "iovec_count invalid %d\n", sgp->iovec_count);
+		pr_debug("%s:%s:%d iovec_count invalid %d\n",
+			 skdev->name, __func__, __LINE__, sgp->iovec_count);
 		return -EINVAL;
 	}
 
 	if (sgp->dxfer_len > (PAGE_SIZE * SKD_N_SG_PER_SPECIAL)) {
-		DPRINTK(skdev, "dxfer_len invalid %d\n", sgp->dxfer_len);
+		pr_debug("%s:%s:%d dxfer_len invalid %d\n",
+			 skdev->name, __func__, __LINE__, sgp->dxfer_len);
 		return -EINVAL;
 	}
 
@@ -1742,18 +1761,21 @@ static int skd_sg_io_get_and_check_args(struct skd_device *skdev,
 		break;
 
 	default:
-		DPRINTK(skdev, "dxfer_dir invalid %d\n", sgp->dxfer_direction);
+		pr_debug("%s:%s:%d dxfer_dir invalid %d\n",
+			 skdev->name, __func__, __LINE__, sgp->dxfer_direction);
 		return -EINVAL;
 	}
 
 	if (copy_from_user(sksgio->cdb, sgp->cmdp, sgp->cmd_len)) {
-		DPRINTK(skdev, "copy_from_user cmdp failed %p\n", sgp->cmdp);
+		pr_debug("%s:%s:%d copy_from_user cmdp failed %p\n",
+			 skdev->name, __func__, __LINE__, sgp->cmdp);
 		return -EFAULT;
 	}
 
 	if (sgp->mx_sb_len != 0) {
 		if (!access_ok(VERIFY_WRITE, sgp->sbp, sgp->mx_sb_len)) {
-			DPRINTK(skdev, "access sbp failed %p\n", sgp->sbp);
+			pr_debug("%s:%s:%d access sbp failed %p\n",
+				 skdev->name, __func__, __LINE__, sgp->sbp);
 			return -EFAULT;
 		}
 	}
@@ -1770,16 +1792,17 @@ static int skd_sg_io_get_and_check_args(struct skd_device *skdev,
 
 		iov = kmalloc(nbytes, GFP_KERNEL);
 		if (iov == NULL) {
-			DPRINTK(skdev, "alloc iovec failed %d\n",
-				sgp->iovec_count);
+			pr_debug("%s:%s:%d alloc iovec failed %d\n",
+				 skdev->name, __func__, __LINE__,
+				 sgp->iovec_count);
 			return -ENOMEM;
 		}
 		sksgio->iov = iov;
 		sksgio->iovcnt = sgp->iovec_count;
 
 		if (copy_from_user(iov, sgp->dxferp, nbytes)) {
-			DPRINTK(skdev, "copy_from_user iovec failed %p\n",
-				sgp->dxferp);
+			pr_debug("%s:%s:%d copy_from_user iovec failed %p\n",
+				 skdev->name, __func__, __LINE__, sgp->dxferp);
 			return -EFAULT;
 		}
 
@@ -1807,8 +1830,9 @@ static int skd_sg_io_get_and_check_args(struct skd_device *skdev,
 		struct sg_iovec *iov = sksgio->iov;
 		for (i = 0; i < sksgio->iovcnt; i++, iov++) {
 			if (!access_ok(acc, iov->iov_base, iov->iov_len)) {
-				DPRINTK(skdev, "access data failed %p/%d\n",
-					iov->iov_base, (int)iov->iov_len);
+				pr_debug("%s:%s:%d access data failed %p/%d\n",
+					 skdev->name, __func__, __LINE__,
+					 iov->iov_base, (int)iov->iov_len);
 				return -EFAULT;
 			}
 		}
@@ -1843,14 +1867,16 @@ static int skd_sg_io_obtain_skspcl(struct skd_device *skdev,
 			break;
 		}
 
-		DPRINTK(skdev, "blocking\n");
+		pr_debug("%s:%s:%d blocking\n",
+			 skdev->name, __func__, __LINE__);
 
 		rc = wait_event_interruptible_timeout(
 				skdev->waitq,
 				(skdev->skspcl_free_list != NULL),
 				msecs_to_jiffies(sksgio->sg.timeout));
 
-		DPRINTK(skdev, "unblocking, rc=%d\n", rc);
+		pr_debug("%s:%s:%d unblocking, rc=%d\n",
+			 skdev->name, __func__, __LINE__, rc);
 
 		if (rc <= 0) {
 			if (rc == 0)
@@ -1927,15 +1953,17 @@ static int skd_skreq_prep_buffering(struct skd_device *skdev,
 	if (unlikely(skdev->dbg_level > 1)) {
 		u32 i;
 
-		VPRINTK(skdev, "skreq=%x sksg_list=%p sksg_dma=%llx\n",
-			skreq->id, skreq->sksg_list, skreq->sksg_dma_address);
+		pr_debug("%s:%s:%d skreq=%x sksg_list=%p sksg_dma=%llx\n",
+			 skdev->name, __func__, __LINE__,
+			 skreq->id, skreq->sksg_list, skreq->sksg_dma_address);
 		for (i = 0; i < skreq->n_sg; i++) {
 			struct fit_sg_descriptor *sgd = &skreq->sksg_list[i];
 
-			VPRINTK(skdev, "  sg[%d] count=%u ctrl=0x%x "
-				"addr=0x%llx next=0x%llx\n",
-				i, sgd->byte_count, sgd->control,
-				sgd->host_side_addr, sgd->next_desc_ptr);
+			pr_debug("%s:%s:%d   sg[%d] count=%u ctrl=0x%x "
+				 "addr=0x%llx next=0x%llx\n",
+				 skdev->name, __func__, __LINE__,
+				 i, sgd->byte_count, sgd->control,
+				 sgd->host_side_addr, sgd->next_desc_ptr);
 		}
 	}
 
@@ -2057,7 +2085,8 @@ static int skd_sg_io_await(struct skd_device *skdev, struct skd_sg_io *sksgio)
 	spin_lock_irqsave(&skdev->lock, flags);
 
 	if (sksgio->skspcl->req.state == SKD_REQ_STATE_ABORTED) {
-		DPRINTK(skdev, "skspcl %p aborted\n", sksgio->skspcl);
+		pr_debug("%s:%s:%d skspcl %p aborted\n",
+			 skdev->name, __func__, __LINE__, sksgio->skspcl);
 
 		/* Build check cond, sense and let command finish. */
 		/* For a timeout, we must fabricate completion and sense
@@ -2082,11 +2111,13 @@ static int skd_sg_io_await(struct skd_device *skdev, struct skd_sg_io *sksgio)
 		sksgio->skspcl->orphaned = 1;
 		sksgio->skspcl = NULL;
 		if (rc == 0) {
-			DPRINTK(skdev, "timed out %p (%u ms)\n", sksgio,
-				sksgio->sg.timeout);
+			pr_debug("%s:%s:%d timed out %p (%u ms)\n",
+				 skdev->name, __func__, __LINE__,
+				 sksgio, sksgio->sg.timeout);
 			rc = -ETIMEDOUT;
 		} else {
-			DPRINTK(skdev, "cntlc %p\n", sksgio);
+			pr_debug("%s:%s:%d cntlc %p\n",
+				 skdev->name, __func__, __LINE__, sksgio);
 			rc = -EINTR;
 		}
 	}
@@ -2116,8 +2147,9 @@ static int skd_sg_io_put_status(struct skd_device *skdev,
 	if (sgp->masked_status || sgp->host_status || sgp->driver_status)
 		sgp->info |= SG_INFO_CHECK;
 
-	DPRINTK(skdev, "status %x masked %x resid 0x%x\n", sgp->status,
-		sgp->masked_status, sgp->resid);
+	pr_debug("%s:%s:%d status %x masked %x resid 0x%x\n",
+		 skdev->name, __func__, __LINE__,
+		 sgp->status, sgp->masked_status, sgp->resid);
 
 	if (sgp->masked_status == SAM_STAT_CHECK_CONDITION) {
 		if (sgp->mx_sb_len > 0) {
@@ -2129,15 +2161,17 @@ static int skd_sg_io_put_status(struct skd_device *skdev,
 			sgp->sb_len_wr = nbytes;
 
 			if (__copy_to_user(sgp->sbp, ei, nbytes)) {
-				DPRINTK(skdev, "copy_to_user sense failed %p\n",
-					sgp->sbp);
+				pr_debug("%s:%s:%d copy_to_user sense failed %p\n",
+					 skdev->name, __func__, __LINE__,
+					 sgp->sbp);
 				return -EFAULT;
 			}
 		}
 	}
 
 	if (__copy_to_user(sksgio->argp, sgp, sizeof(sg_io_hdr_t))) {
-		DPRINTK(skdev, "copy_to_user sg failed %p\n", sksgio->argp);
+		pr_debug("%s:%s:%d copy_to_user sg failed %p\n",
+			 skdev->name, __func__, __LINE__, sksgio->argp);
 		return -EFAULT;
 	}
 
@@ -2325,7 +2359,8 @@ static void skd_complete_internal(struct skd_device *skdev,
 
 	SKD_ASSERT(skspcl == &skdev->internal_skspcl);
 
-	DPRINTK(skdev, "complete internal %x\n", scsi->cdb[0]);
+	pr_debug("%s:%s:%d complete internal %x\n",
+		 skdev->name, __func__, __LINE__, scsi->cdb[0]);
 
 	skspcl->req.completion = *skcomp;
 	skspcl->req.state = SKD_REQ_STATE_IDLE;
@@ -2345,11 +2380,13 @@ static void skd_complete_internal(struct skd_device *skdev,
 			skd_send_internal_skspcl(skdev, skspcl, WRITE_BUFFER);
 		else {
 			if (skdev->state == SKD_DRVR_STATE_STOPPING) {
-				VPRINTK(skdev, "TUR failed, don't send anymore"
-					"state 0x%x\n", skdev->state);
+				pr_debug("%s:%s:%d TUR failed, don't send anymore state 0x%x\n",
+					 skdev->name, __func__, __LINE__,
+					 skdev->state);
 				return;
 			}
-			DPRINTK(skdev, "**** TUR failed, retry skerr\n");
+			pr_debug("%s:%s:%d **** TUR failed, retry skerr\n",
+				 skdev->name, __func__, __LINE__);
 			skd_send_internal_skspcl(skdev, skspcl, 0x00);
 		}
 		break;
@@ -2359,12 +2396,13 @@ static void skd_complete_internal(struct skd_device *skdev,
 			skd_send_internal_skspcl(skdev, skspcl, READ_BUFFER);
 		else {
 			if (skdev->state == SKD_DRVR_STATE_STOPPING) {
-				VPRINTK(skdev, "write buffer failed, don't send"
-					" anymore state 0x%x\n", skdev->state);
+				pr_debug("%s:%s:%d write buffer failed, don't send anymore state 0x%x\n",
+					 skdev->name, __func__, __LINE__,
+					 skdev->state);
 				return;
 			}
-			DPRINTK(skdev,
-				"**** write buffer failed, retry skerr\n");
+			pr_debug("%s:%s:%d **** write buffer failed, retry skerr\n",
+				 skdev->name, __func__, __LINE__);
 			skd_send_internal_skspcl(skdev, skspcl, 0x00);
 		}
 		break;
@@ -2392,13 +2430,15 @@ static void skd_complete_internal(struct skd_device *skdev,
 
 		} else {
 			if (skdev->state == SKD_DRVR_STATE_STOPPING) {
-				VPRINTK(skdev,
-					"read buffer failed, don't send anymore"
-					"state 0x%x\n", skdev->state);
+				pr_debug("%s:%s:%d "
+					 "read buffer failed, don't send anymore state 0x%x\n",
+					 skdev->name, __func__, __LINE__,
+					 skdev->state);
 				return;
 			}
-			DPRINTK(skdev,
-				"**** read buffer failed, retry skerr\n");
+			pr_debug("%s:%s:%d "
+				 "**** read buffer failed, retry skerr\n",
+				 skdev->name, __func__, __LINE__);
 			skd_send_internal_skspcl(skdev, skspcl, 0x00);
 		}
 		break;
@@ -2413,9 +2453,10 @@ static void skd_complete_internal(struct skd_device *skdev,
 				(buf[4] << 24) | (buf[5] << 16) |
 				(buf[6] << 8) | buf[7];
 
-			DPRINTK(skdev, "last lba %d, bs %d\n",
-				skdev->read_cap_last_lba,
-				skdev->read_cap_blocksize);
+			pr_debug("%s:%s:%d last lba %d, bs %d\n",
+				 skdev->name, __func__, __LINE__,
+				 skdev->read_cap_last_lba,
+				 skdev->read_cap_blocksize);
 
 			set_capacity(skdev->disk, skdev->read_cap_last_lba + 1);
 
@@ -2426,11 +2467,13 @@ static void skd_complete_internal(struct skd_device *skdev,
 			   (skerr->key == MEDIUM_ERROR)) {
 			skdev->read_cap_last_lba = ~0;
 			set_capacity(skdev->disk, skdev->read_cap_last_lba + 1);
-			DPRINTK(skdev,
-				"**** MEDIUM ERROR caused READCAP to fail, ignore failure and continue to inquiry\n");
+			pr_debug("%s:%s:%d "
+				 "**** MEDIUM ERROR caused READCAP to fail, ignore failure and continue to inquiry\n",
+				 skdev->name, __func__, __LINE__);
 			skd_send_internal_skspcl(skdev, skspcl, INQUIRY);
 		} else {
-			DPRINTK(skdev, "**** READCAP failed, retry TUR\n");
+			pr_debug("%s:%s:%d **** READCAP failed, retry TUR\n",
+				 skdev->name, __func__, __LINE__);
 			skd_send_internal_skspcl(skdev, skspcl,
 						 TEST_UNIT_READY);
 		}
@@ -2447,7 +2490,8 @@ static void skd_complete_internal(struct skd_device *skdev,
 		}
 
 		if (skd_unquiesce_dev(skdev) < 0)
-			DPRINTK(skdev, "**** failed, to ONLINE device\n");
+			pr_debug("%s:%s:%d **** failed, to ONLINE device\n",
+				 skdev->name, __func__, __LINE__);
 		 /* connection is complete */
 		skdev->connect_retries = 0;
 		break;
@@ -2477,10 +2521,12 @@ static void skd_send_fitmsg(struct skd_device *skdev,
 	u64 qcmd;
 	struct fit_msg_hdr *fmh;
 
-	VPRINTK(skdev, "dma address 0x%llx, busy=%d\n",
-		skmsg->mb_dma_address, skdev->in_flight);
-	VPRINTK(skdev, "msg_buf 0x%p, offset %x\n",
-		skmsg->msg_buf, skmsg->offset);
+	pr_debug("%s:%s:%d dma address 0x%llx, busy=%d\n",
+		 skdev->name, __func__, __LINE__,
+		 skmsg->mb_dma_address, skdev->in_flight);
+	pr_debug("%s:%s:%d msg_buf 0x%p, offset %x\n",
+		 skdev->name, __func__, __LINE__,
+		 skmsg->msg_buf, skmsg->offset);
 
 	qcmd = skmsg->mb_dma_address;
 	qcmd |= FIT_QCMD_QID_NORMAL;
@@ -2492,11 +2538,12 @@ static void skd_send_fitmsg(struct skd_device *skdev,
 		u8 *bp = (u8 *)skmsg->msg_buf;
 		int i;
 		for (i = 0; i < skmsg->length; i += 8) {
-			VPRINTK(skdev, "  msg[%2d] %02x %02x %02x %02x "
-				"%02x %02x %02x %02x\n",
-				i, bp[i + 0], bp[i + 1], bp[i + 2],
-				bp[i + 3], bp[i + 4], bp[i + 5],
-				bp[i + 6], bp[i + 7]);
+			pr_debug("%s:%s:%d msg[%2d] %02x %02x %02x %02x "
+				 "%02x %02x %02x %02x\n",
+				 skdev->name, __func__, __LINE__,
+				 i, bp[i + 0], bp[i + 1], bp[i + 2],
+				 bp[i + 3], bp[i + 4], bp[i + 5],
+				 bp[i + 6], bp[i + 7]);
 			if (i == 0)
 				i = 64 - 8;
 		}
@@ -2530,26 +2577,28 @@ static void skd_send_special_fitmsg(struct skd_device *skdev,
 		int i;
 
 		for (i = 0; i < SKD_N_SPECIAL_FITMSG_BYTES; i += 8) {
-			VPRINTK(skdev,
-				"  spcl[%2d] %02x %02x %02x %02x  "
-				"%02x %02x %02x %02x\n", i,
-				bp[i + 0], bp[i + 1], bp[i + 2], bp[i + 3],
-				bp[i + 4], bp[i + 5], bp[i + 6], bp[i + 7]);
+			pr_debug("%s:%s:%d  spcl[%2d] %02x %02x %02x %02x  "
+				 "%02x %02x %02x %02x\n",
+				 skdev->name, __func__, __LINE__, i,
+				 bp[i + 0], bp[i + 1], bp[i + 2], bp[i + 3],
+				 bp[i + 4], bp[i + 5], bp[i + 6], bp[i + 7]);
 			if (i == 0)
 				i = 64 - 8;
 		}
 
-		VPRINTK(skdev, "skspcl=%p id=%04x sksg_list=%p sksg_dma=%llx\n",
-			skspcl, skspcl->req.id, skspcl->req.sksg_list,
-			skspcl->req.sksg_dma_address);
+		pr_debug("%s:%s:%d skspcl=%p id=%04x sksg_list=%p sksg_dma=%llx\n",
+			 skdev->name, __func__, __LINE__,
+			 skspcl, skspcl->req.id, skspcl->req.sksg_list,
+			 skspcl->req.sksg_dma_address);
 		for (i = 0; i < skspcl->req.n_sg; i++) {
 			struct fit_sg_descriptor *sgd =
 				&skspcl->req.sksg_list[i];
 
-			VPRINTK(skdev, "  sg[%d] count=%u ctrl=0x%x "
-				"addr=0x%llx next=0x%llx\n",
-				i, sgd->byte_count, sgd->control,
-				sgd->host_side_addr, sgd->next_desc_ptr);
+			pr_debug("%s:%s:%d   sg[%d] count=%u ctrl=0x%x "
+				 "addr=0x%llx next=0x%llx\n",
+				 skdev->name, __func__, __LINE__,
+				 i, sgd->byte_count, sgd->control,
+				 sgd->host_side_addr, sgd->next_desc_ptr);
 		}
 	}
 
@@ -2632,9 +2681,9 @@ static enum skd_check_status_action skd_check_status(struct skd_device *skdev,
 	       skd_name(skdev), skerr->key, skerr->code, skerr->qual,
 	       skerr->fruc);
 
-	VPRINTK(skdev, "stat: t=%02x stat=%02x k=%02x c=%02x q=%02x "
-		"fruc=%02x\n", skerr->type, cmp_status, skerr->key,
-		skerr->code, skerr->qual, skerr->fruc);
+	pr_debug("%s:%s:%d stat: t=%02x stat=%02x k=%02x c=%02x q=%02x fruc=%02x\n",
+		 skdev->name, __func__, __LINE__, skerr->type, cmp_status,
+		 skerr->key, skerr->code, skerr->qual, skerr->fruc);
 
 	/* Does the info match an entry in the good category? */
 	n = sizeof(skd_chkstat_table) / sizeof(skd_chkstat_table[0]);
@@ -2674,11 +2723,13 @@ static enum skd_check_status_action skd_check_status(struct skd_device *skdev,
 	 * zero status means good
 	 */
 	if (cmp_status) {
-		DPRINTK(skdev, "status check: error\n");
+		pr_debug("%s:%s:%d status check: error\n",
+			 skdev->name, __func__, __LINE__);
 		return SKD_CHECK_STATUS_REPORT_ERROR;
 	}
 
-	DPRINTK(skdev, "status check good default\n");
+	pr_debug("%s:%s:%d status check good default\n",
+		 skdev->name, __func__, __LINE__);
 	return SKD_CHECK_STATUS_REPORT_GOOD;
 }
 
@@ -2816,7 +2867,8 @@ static void skd_do_inq_page_00(struct skd_device *skdev,
 	/* Caller requested "supported pages".  The driver needs to insert
 	 * its page.
 	 */
-	VPRINTK(skdev, "skd_do_driver_inquiry: modify supported pages.\n");
+	pr_debug("%s:%s:%d skd_do_driver_inquiry: modify supported pages.\n",
+		 skdev->name, __func__, __LINE__);
 
 	/* If the device rejected the request because the CDB was
 	 * improperly formed, then just leave.
@@ -2913,7 +2965,8 @@ static void skd_do_inq_page_da(struct skd_device *skdev,
 	struct driver_inquiry_data inq;
 	u16 val;
 
-	VPRINTK(skdev, "skd_do_driver_inquiry: return driver page\n");
+	pr_debug("%s:%s:%d skd_do_driver_inquiry: return driver page\n",
+		 skdev->name, __func__, __LINE__);
 
 	memset(&inq, 0, sizeof(inq));
 
@@ -3045,14 +3098,16 @@ static int skd_isr_completion_posted(struct skd_device *skdev,
 
 		skerr = &skdev->skerr_table[skdev->skcomp_ix];
 
-		VPRINTK(skdev,
-			"cycle=%d ix=%d got cycle=%d cmdctxt=0x%x stat=%d "
-			"busy=%d rbytes=0x%x proto=%d\n", skdev->skcomp_cycle,
-			skdev->skcomp_ix, cmp_cycle, cmp_cntxt, cmp_status,
-			skdev->in_flight, cmp_bytes, skdev->proto_ver);
+		pr_debug("%s:%s:%d "
+			 "cycle=%d ix=%d got cycle=%d cmdctxt=0x%x stat=%d "
+			 "busy=%d rbytes=0x%x proto=%d\n",
+			 skdev->name, __func__, __LINE__, skdev->skcomp_cycle,
+			 skdev->skcomp_ix, cmp_cycle, cmp_cntxt, cmp_status,
+			 skdev->in_flight, cmp_bytes, skdev->proto_ver);
 
 		if (cmp_cycle != skdev->skcomp_cycle) {
-			VPRINTK(skdev, "end of completions\n");
+			pr_debug("%s:%s:%d end of completions\n",
+				 skdev->name, __func__, __LINE__);
 			break;
 		}
 		/*
@@ -3088,8 +3143,9 @@ static int skd_isr_completion_posted(struct skd_device *skdev,
 		 * Make sure the request ID for the slot matches.
 		 */
 		if (skreq->id != req_id) {
-			DPRINTK(skdev, "mismatch comp_id=0x%x req_id=0x%x\n",
-				req_id, skreq->id);
+			pr_debug("%s:%s:%d mismatch comp_id=0x%x req_id=0x%x\n",
+				 skdev->name, __func__, __LINE__,
+				 req_id, skreq->id);
 			{
 				u16 new_id = cmp_cntxt;
 				pr_err("(%s): Completion mismatch "
@@ -3104,8 +3160,9 @@ static int skd_isr_completion_posted(struct skd_device *skdev,
 		SKD_ASSERT(skreq->state == SKD_REQ_STATE_BUSY);
 
 		if (skreq->state == SKD_REQ_STATE_ABORTED) {
-			DPRINTK(skdev, "reclaim req %p id=%04x\n",
-				skreq, skreq->id);
+			pr_debug("%s:%s:%d reclaim req %p id=%04x\n",
+				 skdev->name, __func__, __LINE__,
+				 skreq, skreq->id);
 			/* a previously timed out command can
 			 * now be cleaned up */
 			skd_release_skreq(skdev, skreq);
@@ -3125,9 +3182,10 @@ static int skd_isr_completion_posted(struct skd_device *skdev,
 
 		if (((!skd_bio) && !skreq->req) ||
 		    ((skd_bio) && !skreq->bio)) {
-			DPRINTK(skdev, "NULL backptr skdreq %p, "
-				"req=0x%x req_id=0x%x\n",
-				skreq, skreq->id, req_id);
+			pr_debug("%s:%s:%d NULL backptr skdreq %p, "
+				 "req=0x%x req_id=0x%x\n",
+				 skdev->name, __func__, __LINE__,
+				 skreq, skreq->id, req_id);
 		} else {
 			/*
 			 * Capture the outcome and post it back to the
@@ -3196,8 +3254,9 @@ static void skd_complete_other(struct skd_device *skdev,
 	req_table = req_id & SKD_ID_TABLE_MASK;
 	req_slot = req_id & SKD_ID_SLOT_MASK;
 
-	DPRINTK(skdev, "table=0x%x id=0x%x slot=%d\n", req_table, req_id,
-		req_slot);
+	pr_debug("%s:%s:%d table=0x%x id=0x%x slot=%d\n",
+		 skdev->name, __func__, __LINE__,
+		 req_table, req_id, req_slot);
 
 	/*
 	 * Based on the request id, determine how to dispatch this completion.
@@ -3265,12 +3324,14 @@ static void skd_complete_special(struct skd_device *skdev,
 				 volatile struct fit_comp_error_info *skerr,
 				 struct skd_special_context *skspcl)
 {
-	DPRINTK(skdev, " completing special request %p\n", skspcl);
+	pr_debug("%s:%s:%d  completing special request %p\n",
+		 skdev->name, __func__, __LINE__, skspcl);
 	if (skspcl->orphaned) {
 		/* Discard orphaned request */
 		/* ?: Can this release directly or does it need
 		 * to use a worker? */
-		DPRINTK(skdev, "release orphaned %p\n", skspcl);
+		pr_debug("%s:%s:%d release orphaned %p\n",
+			 skdev->name, __func__, __LINE__, skspcl);
 		skd_release_special(skdev, skspcl);
 		return;
 	}
@@ -3308,7 +3369,8 @@ static void skd_release_special(struct skd_device *skdev,
 	skdev->skspcl_free_list = (struct skd_special_context *)skspcl;
 
 	if (was_depleted) {
-		DPRINTK(skdev, "skspcl was depleted\n");
+		pr_debug("%s:%s:%d skspcl was depleted\n",
+			 skdev->name, __func__, __LINE__);
 		/* Free list was depleted. Their might be waiters. */
 		wake_up_interruptible(&skdev->waitq);
 	}
@@ -3373,7 +3435,8 @@ static skd_isr(int irq, void *ptr)
 		ack = FIT_INT_DEF_MASK;
 		ack &= intstat;
 
-		VPRINTK(skdev, "intstat=0x%x ack=0x%x\n", intstat, ack);
+		pr_debug("%s:%s:%d intstat=0x%x ack=0x%x\n",
+			 skdev->name, __func__, __LINE__, intstat, ack);
 
 		/* As long as there is an int pending on device, keep
 		 * running loop.  When none, get out, but if we've never
@@ -3534,7 +3597,8 @@ static void skd_isr_fwstate(struct skd_device *skdev)
 		}
 		break;
 	case FIT_SR_DRIVE_FW_BOOTING:
-		VPRINTK(skdev, "ISR FIT_SR_DRIVE_FW_BOOTING %s\n", skdev->name);
+		pr_debug("%s:%s:%d ISR FIT_SR_DRIVE_FW_BOOTING %s\n",
+			 skdev->name, __func__, __LINE__, skdev->name);
 		skdev->state = SKD_DRVR_STATE_WAIT_BOOT;
 		skdev->timer_countdown = SKD_WAIT_BOOT_TIMO;
 		break;
@@ -3643,10 +3707,14 @@ static void skd_recover_requests(struct skd_device *skdev, int requeue)
 		 */
 		if (skspcl->req.state == SKD_REQ_STATE_BUSY) {
 			if (skspcl->orphaned) {
-				DPRINTK(skdev, "orphaned %p\n", skspcl);
+				pr_debug("%s:%s:%d orphaned %p\n",
+					 skdev->name, __func__, __LINE__,
+					 skspcl);
 				skd_release_special(skdev, skspcl);
 			} else {
-				DPRINTK(skdev, "not orphaned %p\n", skspcl);
+				pr_debug("%s:%s:%d not orphaned %p\n",
+					 skdev->name, __func__, __LINE__,
+					 skspcl);
 				skspcl->req.state = SKD_REQ_STATE_ABORTED;
 			}
 		}
@@ -3667,7 +3735,8 @@ static void skd_isr_msg_from_dev(struct skd_device *skdev)
 
 	mfd = SKD_READL(skdev, FIT_MSG_FROM_DEVICE);
 
-	DPRINTK(skdev, "mfd=0x%x last_mtd=0x%x\n", mfd, skdev->last_mtd);
+	pr_debug("%s:%s:%d mfd=0x%x last_mtd=0x%x\n",
+		 skdev->name, __func__, __LINE__, mfd, skdev->last_mtd);
 
 	/* ignore any mtd that is an ack for something we didn't send */
 	if (FIT_MXD_TYPE(mfd) != FIT_MXD_TYPE(skdev->last_mtd))
@@ -3762,7 +3831,8 @@ static void skd_disable_interrupts(struct skd_device *skdev)
 	sense = SKD_READL(skdev, FIT_CONTROL);
 	sense &= ~FIT_CR_ENABLE_INTERRUPTS;
 	SKD_WRITEL(skdev, sense, FIT_CONTROL);
-	DPRINTK(skdev, "sense 0x%x\n", sense);
+	pr_debug("%s:%s:%d sense 0x%x\n",
+		 skdev->name, __func__, __LINE__, sense);
 
 	/* Note that the 1s is written. A 1-bit means
 	 * disable, a 0 means enable.
@@ -3781,11 +3851,13 @@ static void skd_enable_interrupts(struct skd_device *skdev)
 	/* Note that the compliment of mask is written. A 1-bit means
 	 * disable, a 0 means enable. */
 	SKD_WRITEL(skdev, ~val, FIT_INT_MASK_HOST);
-	DPRINTK(skdev, "interrupt mask=0x%x\n", ~val);
+	pr_debug("%s:%s:%d interrupt mask=0x%x\n",
+		 skdev->name, __func__, __LINE__, ~val);
 
 	val = SKD_READL(skdev, FIT_CONTROL);
 	val |= FIT_CR_ENABLE_INTERRUPTS;
-	DPRINTK(skdev, "control=0x%x\n", val);
+	pr_debug("%s:%s:%d control=0x%x\n",
+		 skdev->name, __func__, __LINE__, val);
 	SKD_WRITEL(skdev, val, FIT_CONTROL);
 }
 
@@ -3801,7 +3873,8 @@ static void skd_soft_reset(struct skd_device *skdev)
 
 	val = SKD_READL(skdev, FIT_CONTROL);
 	val |= (FIT_CR_SOFT_RESET);
-	DPRINTK(skdev, "control=0x%x\n", val);
+	pr_debug("%s:%s:%d control=0x%x\n",
+		 skdev->name, __func__, __LINE__, val);
 	SKD_WRITEL(skdev, val, FIT_CONTROL);
 }
 
@@ -3818,7 +3891,8 @@ static void skd_start_device(struct skd_device *skdev)
 
 	sense = SKD_READL(skdev, FIT_STATUS);
 
-	DPRINTK(skdev, "initial status=0x%x\n", sense);
+	pr_debug("%s:%s:%d initial status=0x%x\n",
+		 skdev->name, __func__, __LINE__, sense);
 
 	state = sense & FIT_SR_DRIVE_STATE_MASK;
 	skdev->drive_state = state;
@@ -3835,7 +3909,8 @@ static void skd_start_device(struct skd_device *skdev)
 		break;
 
 	case FIT_SR_DRIVE_FW_BOOTING:
-		VPRINTK(skdev, "FIT_SR_DRIVE_FW_BOOTING %s\n", skdev->name);
+		pr_debug("%s:%s:%d FIT_SR_DRIVE_FW_BOOTING %s\n",
+			 skdev->name, __func__, __LINE__, skdev->name);
 		skdev->state = SKD_DRVR_STATE_WAIT_BOOT;
 		skdev->timer_countdown = SKD_WAIT_BOOT_TIMO;
 		break;
@@ -3876,7 +3951,8 @@ static void skd_start_device(struct skd_device *skdev)
 		 */
 		skd_drive_fault(skdev);
 		/*start the queue so we can respond with error to requests */
-		VPRINTK(skdev, "starting %s queue\n", skdev->name);
+		pr_debug("%s:%s:%d starting %s queue\n",
+			 skdev->name, __func__, __LINE__, skdev->name);
 		skd_start_queue(skdev);
 		skdev->gendisk_on = -1;
 		wake_up_interruptible(&skdev->waitq);
@@ -3887,8 +3963,8 @@ static void skd_start_device(struct skd_device *skdev)
 		 * to the BAR1 addresses. */
 		skd_drive_disappeared(skdev);
 		/*start the queue so we can respond with error to requests */
-		VPRINTK(skdev, "starting %s queue to error-out reqs\n",
-			skdev->name);
+		pr_debug("%s:%s:%d starting %s queue to error-out reqs\n",
+			 skdev->name, __func__, __LINE__, skdev->name);
 		skd_start_queue(skdev);
 		skdev->gendisk_on = -1;
 		wake_up_interruptible(&skdev->waitq);
@@ -3901,19 +3977,24 @@ static void skd_start_device(struct skd_device *skdev)
 	}
 
 	state = SKD_READL(skdev, FIT_CONTROL);
-	DPRINTK(skdev, "FIT Control Status=0x%x\n", state);
+	pr_debug("%s:%s:%d FIT Control Status=0x%x\n",
+		 skdev->name, __func__, __LINE__, state);
 
 	state = SKD_READL(skdev, FIT_INT_STATUS_HOST);
-	DPRINTK(skdev, "Intr Status=0x%x\n", state);
+	pr_debug("%s:%s:%d Intr Status=0x%x\n",
+		 skdev->name, __func__, __LINE__, state);
 
 	state = SKD_READL(skdev, FIT_INT_MASK_HOST);
-	DPRINTK(skdev, "Intr Mask=0x%x\n", state);
+	pr_debug("%s:%s:%d Intr Mask=0x%x\n",
+		 skdev->name, __func__, __LINE__, state);
 
 	state = SKD_READL(skdev, FIT_MSG_FROM_DEVICE);
-	DPRINTK(skdev, "Msg from Dev=0x%x\n", state);
+	pr_debug("%s:%s:%d Msg from Dev=0x%x\n",
+		 skdev->name, __func__, __LINE__, state);
 
 	state = SKD_READL(skdev, FIT_HW_VERSION);
-	DPRINTK(skdev, "HW version=0x%x\n", state);
+	pr_debug("%s:%s:%d HW version=0x%x\n",
+		 skdev->name, __func__, __LINE__, state);
 
 	spin_unlock_irqrestore(&skdev->lock, flags);
 }
@@ -4006,7 +4087,8 @@ static void skd_restart_device(struct skd_device *skdev)
 
 	state = SKD_READL(skdev, FIT_STATUS);
 
-	DPRINTK(skdev, "drive status=0x%x\n", state);
+	pr_debug("%s:%s:%d drive status=0x%x\n",
+		 skdev->name, __func__, __LINE__, state);
 
 	state &= FIT_SR_DRIVE_STATE_MASK;
 	skdev->drive_state = state;
@@ -4026,7 +4108,8 @@ static int skd_quiesce_dev(struct skd_device *skdev)
 	switch (skdev->state) {
 	case SKD_DRVR_STATE_BUSY:
 	case SKD_DRVR_STATE_BUSY_IMMINENT:
-		VPRINTK(skdev, "stopping %s queue\n", skdev->name);
+		pr_debug("%s:%s:%d stopping %s queue\n",
+			 skdev->name, __func__, __LINE__, skdev->name);
 		skd_stop_queue(skdev);
 		break;
 	case SKD_DRVR_STATE_ONLINE:
@@ -4039,7 +4122,8 @@ static int skd_quiesce_dev(struct skd_device *skdev)
 	case SKD_DRVR_STATE_RESUMING:
 	default:
 		rc = -EINVAL;
-		VPRINTK(skdev, "state [%d] not implemented\n", skdev->state);
+		pr_debug("%s:%s:%d state [%d] not implemented\n",
+			 skdev->name, __func__, __LINE__, skdev->state);
 	}
 	return rc;
 }
@@ -4051,7 +4135,8 @@ static int skd_unquiesce_dev(struct skd_device *skdev)
 
 	skd_log_skdev(skdev, "unquiesce");
 	if (skdev->state == SKD_DRVR_STATE_ONLINE) {
-		DPRINTK(skdev, "**** device already ONLINE\n");
+		pr_debug("%s:%s:%d **** device already ONLINE\n",
+			 skdev->name, __func__, __LINE__);
 		return 0;
 	}
 	if (skdev->drive_state != FIT_SR_DRIVE_ONLINE) {
@@ -4064,7 +4149,8 @@ static int skd_unquiesce_dev(struct skd_device *skdev)
 		 * to become available.
 		 */
 		skdev->state = SKD_DRVR_STATE_BUSY;
-		DPRINTK(skdev, "drive BUSY state\n");
+		pr_debug("%s:%s:%d drive BUSY state\n",
+			 skdev->name, __func__, __LINE__);
 		return 0;
 	}
 
@@ -4088,8 +4174,10 @@ static int skd_unquiesce_dev(struct skd_device *skdev)
 		       skd_skdev_state_to_str(prev_driver_state),
 		       prev_driver_state, skd_skdev_state_to_str(skdev->state),
 		       skdev->state);
-		DPRINTK(skdev, "**** device ONLINE...starting block queue\n");
-		VPRINTK(skdev, "starting %s queue\n", skdev->name);
+		pr_debug("%s:%s:%d **** device ONLINE...starting block queue\n",
+			 skdev->name, __func__, __LINE__);
+		pr_debug("%s:%s:%d starting %s queue\n",
+			 skdev->name, __func__, __LINE__, skdev->name);
 		pr_info("(%s): STEC s1120 ONLINE\n", skd_name(skdev));
 		skd_start_queue(skdev);
 		skdev->gendisk_on = 1;
@@ -4098,8 +4186,9 @@ static int skd_unquiesce_dev(struct skd_device *skdev)
 
 	case SKD_DRVR_STATE_DISAPPEARED:
 	default:
-		DPRINTK(skdev, "**** driver state %d, not implemented \n",
-			skdev->state);
+		pr_debug("%s:%s:%d **** driver state %d, not implemented \n",
+			 skdev->name, __func__, __LINE__,
+			 skdev->state);
 		return -EBUSY;
 	}
 	return 0;
@@ -4117,7 +4206,9 @@ static irqreturn_t skd_reserved_isr(int irq, void *skd_host_data)
 	unsigned long flags;
 
 	spin_lock_irqsave(&skdev->lock, flags);
-	VPRINTK(skdev, "MSIX = 0x%x\n", SKD_READL(skdev, FIT_INT_STATUS_HOST));
+	pr_debug("%s:%s:%d MSIX = 0x%x\n",
+		 skdev->name, __func__, __LINE__,
+		 SKD_READL(skdev, FIT_INT_STATUS_HOST));
 	pr_err("(%s): MSIX reserved irq %d = 0x%x\n", skd_name(skdev),
 	       irq, SKD_READL(skdev, FIT_INT_STATUS_HOST));
 	SKD_WRITEL(skdev, FIT_INT_RESERVED_MASK, FIT_INT_STATUS_HOST);
@@ -4131,7 +4222,9 @@ static irqreturn_t skd_statec_isr(int irq, void *skd_host_data)
 	unsigned long flags;
 
 	spin_lock_irqsave(&skdev->lock, flags);
-	VPRINTK(skdev, "MSIX = 0x%x\n", SKD_READL(skdev, FIT_INT_STATUS_HOST));
+	pr_debug("%s:%s:%d MSIX = 0x%x\n",
+		 skdev->name, __func__, __LINE__,
+		 SKD_READL(skdev, FIT_INT_STATUS_HOST));
 	SKD_WRITEL(skdev, FIT_ISH_FW_STATE_CHANGE, FIT_INT_STATUS_HOST);
 	skd_isr_fwstate(skdev);
 	spin_unlock_irqrestore(&skdev->lock, flags);
@@ -4146,7 +4239,9 @@ static irqreturn_t skd_comp_q(int irq, void *skd_host_data)
 	int deferred;
 
 	spin_lock_irqsave(&skdev->lock, flags);
-	VPRINTK(skdev, "MSIX = 0x%x\n", SKD_READL(skdev, FIT_INT_STATUS_HOST));
+	pr_debug("%s:%s:%d MSIX = 0x%x\n",
+		 skdev->name, __func__, __LINE__,
+		 SKD_READL(skdev, FIT_INT_STATUS_HOST));
 	SKD_WRITEL(skdev, FIT_ISH_COMPLETION_POSTED, FIT_INT_STATUS_HOST);
 	deferred = skd_isr_completion_posted(skdev, skd_isr_comp_limit,
 						&flush_enqueued);
@@ -4170,7 +4265,9 @@ static irqreturn_t skd_msg_isr(int irq, void *skd_host_data)
 	unsigned long flags;
 
 	spin_lock_irqsave(&skdev->lock, flags);
-	VPRINTK(skdev, "MSIX = 0x%x\n", SKD_READL(skdev, FIT_INT_STATUS_HOST));
+	pr_debug("%s:%s:%d MSIX = 0x%x\n",
+		 skdev->name, __func__, __LINE__,
+		 SKD_READL(skdev, FIT_INT_STATUS_HOST));
 	SKD_WRITEL(skdev, FIT_ISH_MSG_FROM_DEV, FIT_INT_STATUS_HOST);
 	skd_isr_msg_from_dev(skdev);
 	spin_unlock_irqrestore(&skdev->lock, flags);
@@ -4183,7 +4280,9 @@ static irqreturn_t skd_qfull_isr(int irq, void *skd_host_data)
 	unsigned long flags;
 
 	spin_lock_irqsave(&skdev->lock, flags);
-	VPRINTK(skdev, "MSIX = 0x%x\n", SKD_READL(skdev, FIT_INT_STATUS_HOST));
+	pr_debug("%s:%s:%d MSIX = 0x%x\n",
+		 skdev->name, __func__, __LINE__,
+		 SKD_READL(skdev, FIT_INT_STATUS_HOST));
 	SKD_WRITEL(skdev, FIT_INT_QUEUE_FULL, FIT_INT_STATUS_HOST);
 	spin_unlock_irqrestore(&skdev->lock, flags);
 	return IRQ_HANDLED;
@@ -4275,8 +4374,9 @@ static int skd_acquire_msix(struct skd_device *skdev)
 			       skd_name(skdev), rc);
 			goto msix_out;
 		}
-		DPRINTK(skdev, "%s: <%s> allocated %d MSI-X vectors\n",
-			pci_name(pdev), skdev->name, rc);
+		pr_debug("%s:%s:%d %s: <%s> allocated %d MSI-X vectors\n",
+			 skdev->name, __func__, __LINE__,
+			 pci_name(pdev), skdev->name, rc);
 
 		skdev->msix_count = rc;
 		rc = pci_enable_msix(pdev, entries, skdev->msix_count);
@@ -4303,9 +4403,10 @@ static int skd_acquire_msix(struct skd_device *skdev)
 		qentry->entry = entries[i].entry;
 		qentry->rsp = NULL;
 		qentry->have_irq = 0;
-		DPRINTK(skdev, "%s: <%s> msix (%d) vec %d, entry %x\n",
-			pci_name(pdev), skdev->name,
-			i, qentry->vector, qentry->entry);
+		pr_debug("%s:%s:%d %s: <%s> msix (%d) vec %d, entry %x\n",
+			 skdev->name, __func__, __LINE__,
+			 pci_name(pdev), skdev->name,
+			 i, qentry->vector, qentry->entry);
 		qentry++;
 	}
 
@@ -4328,8 +4429,9 @@ static int skd_acquire_msix(struct skd_device *skdev)
 			qentry->rsp = skdev;
 		}
 	}
-	DPRINTK(skdev, "%s: <%s> msix %d irq(s) enabled\n",
-		pci_name(pdev), skdev->name, skdev->msix_count);
+	pr_debug("%s:%s:%d %s: <%s> msix %d irq(s) enabled\n",
+		 skdev->name, __func__, __LINE__,
+		 pci_name(pdev), skdev->name, skdev->msix_count);
 	return 0;
 
 msix_out:
@@ -4489,43 +4591,42 @@ static struct skd_device *skd_construct(struct pci_dev *pdev)
 	INIT_WORK(&skdev->completion_worker, skd_completion_worker);
 	INIT_LIST_HEAD(&skdev->flush_list);
 
-	VPRINTK(skdev, "skcomp\n");
+	pr_debug("%s:%s:%d skcomp\n", skdev->name, __func__, __LINE__);
 	rc = skd_cons_skcomp(skdev);
 	if (rc < 0)
 		goto err_out;
 
-	VPRINTK(skdev, "skmsg\n");
+	pr_debug("%s:%s:%d skmsg\n", skdev->name, __func__, __LINE__);
 	rc = skd_cons_skmsg(skdev);
 	if (rc < 0)
 		goto err_out;
 
-	VPRINTK(skdev, "skreq\n");
+	pr_debug("%s:%s:%d skreq\n", skdev->name, __func__, __LINE__);
 	rc = skd_cons_skreq(skdev);
 	if (rc < 0)
 		goto err_out;
 
-	VPRINTK(skdev, "skspcl\n");
+	pr_debug("%s:%s:%d skspcl\n", skdev->name, __func__, __LINE__);
 	rc = skd_cons_skspcl(skdev);
 	if (rc < 0)
 		goto err_out;
 
-	VPRINTK(skdev, "sksb\n");
+	pr_debug("%s:%s:%d sksb\n", skdev->name, __func__, __LINE__);
 	rc = skd_cons_sksb(skdev);
 	if (rc < 0)
 		goto err_out;
 
-	VPRINTK(skdev, "disk\n");
+	pr_debug("%s:%s:%d disk\n", skdev->name, __func__, __LINE__);
 	rc = skd_cons_disk(skdev);
 	if (rc < 0)
 		goto err_out;
 
-
-
-	DPRINTK(skdev, "VICTORY\n");
+	pr_debug("%s:%s:%d VICTORY\n", skdev->name, __func__, __LINE__);
 	return skdev;
 
 err_out:
-	DPRINTK(skdev, "construct failed\n");
+	pr_debug("%s:%s:%d construct failed\n",
+		 skdev->name, __func__, __LINE__);
 	skd_destruct(skdev);
 	return NULL;
 }
@@ -4539,8 +4640,9 @@ static int skd_cons_skcomp(struct skd_device *skdev)
 	nbytes = sizeof(*skcomp) * SKD_N_COMPLETION_ENTRY;
 	nbytes += sizeof(struct fit_comp_error_info) * SKD_N_COMPLETION_ENTRY;
 
-	VPRINTK(skdev, "comp pci_alloc, total bytes %d entries %d\n", nbytes,
-		SKD_N_COMPLETION_ENTRY);
+	pr_debug("%s:%s:%d comp pci_alloc, total bytes %d entries %d\n",
+		 skdev->name, __func__, __LINE__,
+		 nbytes, SKD_N_COMPLETION_ENTRY);
 
 	skcomp = pci_alloc_consistent(skdev->pdev, nbytes,
 				      &skdev->cq_dma_address);
@@ -4566,10 +4668,11 @@ static int skd_cons_skmsg(struct skd_device *skdev)
 	int rc = 0;
 	u32 i;
 
-	VPRINTK(skdev, "skmsg_table kzalloc, struct %lu, count %u total %lu\n",
-		sizeof(struct skd_fitmsg_context),
-		skdev->num_fitmsg_context,
-		sizeof(struct skd_fitmsg_context) * skdev->num_fitmsg_context);
+	pr_debug("%s:%s:%d skmsg_table kzalloc, struct %lu, count %u total %lu\n",
+		 skdev->name, __func__, __LINE__,
+		 sizeof(struct skd_fitmsg_context),
+		 skdev->num_fitmsg_context,
+		 sizeof(struct skd_fitmsg_context) * skdev->num_fitmsg_context);
 
 	skdev->skmsg_table = kzalloc(sizeof(struct skd_fitmsg_context)
 				     *skdev->num_fitmsg_context, GFP_KERNEL);
@@ -4620,10 +4723,11 @@ static int skd_cons_skreq(struct skd_device *skdev)
 	int rc = 0;
 	u32 i;
 
-	VPRINTK(skdev, "skreq_table kzalloc, struct %lu, count %u total %lu\n",
-		sizeof(struct skd_request_context),
-		skdev->num_req_context,
-		sizeof(struct skd_request_context) * skdev->num_req_context);
+	pr_debug("%s:%s:%d skreq_table kzalloc, struct %lu, count %u total %lu\n",
+		 skdev->name, __func__, __LINE__,
+		 sizeof(struct skd_request_context),
+		 skdev->num_req_context,
+		 sizeof(struct skd_request_context) * skdev->num_req_context);
 
 	skdev->skreq_table = kzalloc(sizeof(struct skd_request_context)
 				     * skdev->num_req_context, GFP_KERNEL);
@@ -4632,9 +4736,10 @@ static int skd_cons_skreq(struct skd_device *skdev)
 		goto err_out;
 	}
 
-	VPRINTK(skdev, "alloc sg_table sg_per_req %u scatlist %lu total %lu\n",
-		skdev->sgs_per_request, sizeof(struct scatterlist),
-		skdev->sgs_per_request * sizeof(struct scatterlist));
+	pr_debug("%s:%s:%d alloc sg_table sg_per_req %u scatlist %lu total %lu\n",
+		 skdev->name, __func__, __LINE__,
+		 skdev->sgs_per_request, sizeof(struct scatterlist),
+		 skdev->sgs_per_request * sizeof(struct scatterlist));
 
 	for (i = 0; i < skdev->num_req_context; i++) {
 		struct skd_request_context *skreq;
@@ -4677,10 +4782,11 @@ static int skd_cons_skspcl(struct skd_device *skdev)
 	int rc = 0;
 	u32 i, nbytes;
 
-	VPRINTK(skdev, "skspcl_table kzalloc, struct %lu, count %u total %lu\n",
-		sizeof(struct skd_special_context),
-		skdev->n_special,
-		sizeof(struct skd_special_context) * skdev->n_special);
+	pr_debug("%s:%s:%d skspcl_table kzalloc, struct %lu, count %u total %lu\n",
+		 skdev->name, __func__, __LINE__,
+		 sizeof(struct skd_special_context),
+		 skdev->n_special,
+		 sizeof(struct skd_special_context) * skdev->n_special);
 
 	skdev->skspcl_table = kzalloc(sizeof(struct skd_special_context)
 				      * skdev->n_special, GFP_KERNEL);
@@ -4872,7 +4978,8 @@ static int skd_cons_disk(struct skd_device *skdev)
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
 
 	spin_lock_irqsave(&skdev->lock, flags);
-	VPRINTK(skdev, "stopping %s queue\n", skdev->name);
+	pr_debug("%s:%s:%d stopping %s queue\n",
+		 skdev->name, __func__, __LINE__, skdev->name);
 	skd_stop_queue(skdev);
 	spin_unlock_irqrestore(&skdev->lock, flags);
 
@@ -4902,25 +5009,25 @@ static void skd_destruct(struct skd_device *skdev)
 		return;
 
 
-	VPRINTK(skdev, "disk\n");
+	pr_debug("%s:%s:%d disk\n", skdev->name, __func__, __LINE__);
 	skd_free_disk(skdev);
 
-	VPRINTK(skdev, "sksb\n");
+	pr_debug("%s:%s:%d sksb\n", skdev->name, __func__, __LINE__);
 	skd_free_sksb(skdev);
 
-	VPRINTK(skdev, "skspcl\n");
+	pr_debug("%s:%s:%d skspcl\n", skdev->name, __func__, __LINE__);
 	skd_free_skspcl(skdev);
 
-	VPRINTK(skdev, "skreq\n");
+	pr_debug("%s:%s:%d skreq\n", skdev->name, __func__, __LINE__);
 	skd_free_skreq(skdev);
 
-	VPRINTK(skdev, "skmsg\n");
+	pr_debug("%s:%s:%d skmsg\n", skdev->name, __func__, __LINE__);
 	skd_free_skmsg(skdev);
 
-	VPRINTK(skdev, "skcomp\n");
+	pr_debug("%s:%s:%d skcomp\n", skdev->name, __func__, __LINE__);
 	skd_free_skcomp(skdev);
 
-	VPRINTK(skdev, "skdev\n");
+	pr_debug("%s:%s:%d skdev\n", skdev->name, __func__, __LINE__);
 	kfree(skdev);
 }
 
@@ -5106,8 +5213,9 @@ static int skd_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 
 	skdev = bdev->bd_disk->private_data;
 
-	DPRINTK(skdev, "%s: CMD[%s] getgeo device\n",
-		bdev->bd_disk->disk_name, current->comm);
+	pr_debug("%s:%s:%d %s: CMD[%s] getgeo device\n",
+		 skdev->name, __func__, __LINE__,
+		 bdev->bd_disk->disk_name, current->comm);
 
 	if (skdev->read_cap_is_valid) {
 		capacity = get_capacity(skdev->disk);
@@ -5122,7 +5230,7 @@ static int skd_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 
 static int skd_bdev_attach(struct skd_device *skdev)
 {
-	DPRINTK(skdev, "add_disk\n");
+	pr_debug("%s:%s:%d add_disk\n", skdev->name, __func__, __LINE__);
 	add_disk(skdev->disk);
 	return 0;
 }
@@ -5245,9 +5353,10 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 			rc = -ENODEV;
 			goto err_out_iounmap;
 		}
-		DPRINTK(skdev, "mem_map=%p, phyd=%016llx, size=%d\n",
-			skdev->mem_map[i],
-			(uint64_t)skdev->mem_phys[i], skdev->mem_size[i]);
+		pr_debug("%s:%s:%d mem_map=%p, phyd=%016llx, size=%d\n",
+			 skdev->name, __func__, __LINE__,
+			 skdev->mem_map[i],
+			 (uint64_t)skdev->mem_phys[i], skdev->mem_size[i]);
 	}
 
 	rc = skd_acquire_irq(skdev);
@@ -5441,9 +5550,10 @@ static int skd_pci_resume(struct pci_dev *pdev)
 			rc = -ENODEV;
 			goto err_out_iounmap;
 		}
-		DPRINTK(skdev, "mem_map=%p, phyd=%016llx, size=%d\n",
-			skdev->mem_map[i],
-			(uint64_t)skdev->mem_phys[i], skdev->mem_size[i]);
+		pr_debug("%s:%s:%d mem_map=%p, phyd=%016llx, size=%d\n",
+			 skdev->name, __func__, __LINE__,
+			 skdev->mem_map[i],
+			 (uint64_t)skdev->mem_phys[i], skdev->mem_size[i]);
 	}
 	rc = skd_acquire_irq(skdev);
 	if (rc) {
@@ -5643,35 +5753,44 @@ const char *skd_skreq_state_to_str(enum skd_req_state state)
 
 static void skd_log_skdev(struct skd_device *skdev, const char *event)
 {
-	DPRINTK(skdev, "(%s) skdev=%p event='%s'\n", skdev->name, skdev, event);
-	DPRINTK(skdev, "  drive_state=%s(%d) driver_state=%s(%d)\n",
-		skd_drive_state_to_str(skdev->drive_state), skdev->drive_state,
-		skd_skdev_state_to_str(skdev->state), skdev->state);
-	DPRINTK(skdev, "  busy=%d limit=%d dev=%d lowat=%d\n",
-		skdev->in_flight, skdev->cur_max_queue_depth,
-		skdev->dev_max_queue_depth, skdev->queue_low_water_mark);
-	DPRINTK(skdev, "  timestamp=0x%x cycle=%d cycle_ix=%d\n",
-		skdev->timeout_stamp, skdev->skcomp_cycle, skdev->skcomp_ix);
+	pr_debug("%s:%s:%d (%s) skdev=%p event='%s'\n",
+		 skdev->name, __func__, __LINE__, skdev->name, skdev, event);
+	pr_debug("%s:%s:%d   drive_state=%s(%d) driver_state=%s(%d)\n",
+		 skdev->name, __func__, __LINE__,
+		 skd_drive_state_to_str(skdev->drive_state), skdev->drive_state,
+		 skd_skdev_state_to_str(skdev->state), skdev->state);
+	pr_debug("%s:%s:%d   busy=%d limit=%d dev=%d lowat=%d\n",
+		 skdev->name, __func__, __LINE__,
+		 skdev->in_flight, skdev->cur_max_queue_depth,
+		 skdev->dev_max_queue_depth, skdev->queue_low_water_mark);
+	pr_debug("%s:%s:%d   timestamp=0x%x cycle=%d cycle_ix=%d\n",
+		 skdev->name, __func__, __LINE__,
+		 skdev->timeout_stamp, skdev->skcomp_cycle, skdev->skcomp_ix);
 }
 
 static void skd_log_skmsg(struct skd_device *skdev,
 			  struct skd_fitmsg_context *skmsg, const char *event)
 {
-	DPRINTK(skdev, "(%s) skmsg=%p event='%s'\n", skdev->name, skmsg, event);
-	DPRINTK(skdev, "  state=%s(%d) id=0x%04x length=%d\n",
-		skd_skmsg_state_to_str(skmsg->state), skmsg->state,
-		skmsg->id, skmsg->length);
+	pr_debug("%s:%s:%d (%s) skmsg=%p event='%s'\n",
+		 skdev->name, __func__, __LINE__, skdev->name, skmsg, event);
+	pr_debug("%s:%s:%d   state=%s(%d) id=0x%04x length=%d\n",
+		 skdev->name, __func__, __LINE__,
+		 skd_skmsg_state_to_str(skmsg->state), skmsg->state,
+		 skmsg->id, skmsg->length);
 }
 
 static void skd_log_skreq(struct skd_device *skdev,
 			  struct skd_request_context *skreq, const char *event)
 {
-	DPRINTK(skdev, "(%s) skreq=%p event='%s'\n", skdev->name, skreq, event);
-	DPRINTK(skdev, "  state=%s(%d) id=0x%04x fitmsg=0x%04x\n",
-		skd_skreq_state_to_str(skreq->state), skreq->state,
-		skreq->id, skreq->fitmsg_id);
-	DPRINTK(skdev, "  timo=0x%x sg_dir=%d n_sg=%d\n",
-		skreq->timeout_stamp, skreq->sg_data_dir, skreq->n_sg);
+	pr_debug("%s:%s:%d (%s) skreq=%p event='%s'\n",
+		 skdev->name, __func__, __LINE__, skdev->name, skreq, event);
+	pr_debug("%s:%s:%d   state=%s(%d) id=0x%04x fitmsg=0x%04x\n",
+		 skdev->name, __func__, __LINE__,
+		 skd_skreq_state_to_str(skreq->state), skreq->state,
+		 skreq->id, skreq->fitmsg_id);
+	pr_debug("%s:%s:%d   timo=0x%x sg_dir=%d n_sg=%d\n",
+		 skdev->name, __func__, __LINE__,
+		 skreq->timeout_stamp, skreq->sg_data_dir, skreq->n_sg);
 
 	if (!skd_bio) {
 		if (skreq->req != NULL) {
@@ -5679,24 +5798,28 @@ static void skd_log_skreq(struct skd_device *skdev,
 			u32 lba = (u32)blk_rq_pos(req);
 			u32 count = blk_rq_sectors(req);
 
-			DPRINTK(skdev,
-				"  req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n",
-				req, lba, lba, count, count,
-				(int)rq_data_dir(req));
+			pr_debug("%s:%s:%d "
+				 "req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n",
+				 skdev->name, __func__, __LINE__,
+				 req, lba, lba, count, count,
+				 (int)rq_data_dir(req));
 		} else
-			DPRINTK(skdev, "  req=NULL\n");
+			pr_debug("%s:%s:%d req=NULL\n",
+				 skdev->name, __func__, __LINE__);
 	} else {
 		if (skreq->bio != NULL) {
 			struct bio *bio = skreq->bio;
 			u32 lba = (u32)bio->bi_sector;
 			u32 count = bio_sectors(bio);
 
-			DPRINTK(skdev,
-				"  bio=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n",
-				bio, lba, lba, count, count,
-				(int)bio_data_dir(bio));
+			pr_debug("%s:%s:%d "
+				 "bio=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n",
+				 skdev->name, __func__, __LINE__,
+				 bio, lba, lba, count, count,
+				 (int)bio_data_dir(bio));
 		} else
-			DPRINTK(skdev, "  req=NULL\n");
+			pr_debug("%s:%s:%d req=NULL\n",
+				 skdev->name, __func__, __LINE__);
 	}
 }
 

From b88fac630bf71bd0af44502d6688e99a5426d438 Mon Sep 17 00:00:00 2001
From: "Stephen M. Cameron" <scameron@beardog.cce.hp.com>
Date: Tue, 29 Oct 2013 13:46:06 -0600
Subject: [PATCH 22/94] cciss: return 0 from driver probe function on success,
 not 1

A return value of 1 is interpreted as an error

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/cciss.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index edfa2515bc86..0c004ac05811 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -5183,7 +5183,7 @@ reinit_after_soft_reset:
 	rebuild_lun_table(h, 1, 0);
 	cciss_engage_scsi(h);
 	h->busy_initializing = 0;
-	return 1;
+	return 0;
 
 clean4:
 	cciss_free_cmd_pool(h);

From ef0899410ff630b2e75306da49996dbbfa318165 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 31 Oct 2013 13:24:28 +0100
Subject: [PATCH 23/94] s390/dasd: hold request queue sysfs lock when calling
 elevator_init()

"elevator: Fix a race in elevator switching and md device initialization"
changed the semantics of elevator_init() in a way that now enforces to hold
the corresponding request queue's sysfs_lock when calling elevator_init()
to fix a race.
The patch did not convert the s390 dasd device driver which is the only
device driver which also calls elevator_init(). So add the missing locking.

Cc: Tomoki Sekiyama <tomoki.sekiyama@hds.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/s390/block/dasd.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 451bf99582ff..846d5c6609d8 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -2978,12 +2978,12 @@ static int dasd_alloc_queue(struct dasd_block *block)
 
 	elevator_exit(block->request_queue->elevator);
 	block->request_queue->elevator = NULL;
+	mutex_lock(&block->request_queue->sysfs_lock);
 	rc = elevator_init(block->request_queue, "deadline");
-	if (rc) {
+	if (rc)
 		blk_cleanup_queue(block->request_queue);
-		return rc;
-	}
-	return 0;
+	mutex_unlock(&block->request_queue->sysfs_lock);
+	return rc;
 }
 
 /*

From 1762b57fcbe365c2e3f79769a7fe77942ea3165f Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Wed, 30 Oct 2013 13:23:53 +0800
Subject: [PATCH 24/94] skd: fix error return code in skd_pci_probe()

Fix to return -ENOMEM in the skd construct error handling
case instead of 0, as done elsewhere in this function.

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/skd_main.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index ab17bff6b4f9..1a8717fce41d 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -5321,8 +5321,10 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	}
 
 	skdev = skd_construct(pdev);
-	if (skdev == NULL)
+	if (skdev == NULL) {
+		rc = -ENOMEM;
 		goto err_out_regions;
+	}
 
 	skd_pci_info(skdev, pci_str);
 	pr_info("(%s): %s 64bit\n", skd_name(skdev), pci_str);

From fcd37eb3c1347193935d07a82b84dfc7d418dd05 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 1 Nov 2013 10:14:56 -0600
Subject: [PATCH 25/94] skd: rip out bio path

The skd driver has a selectable rq or bio based queueing model.
For 3.14, we want to turn this into a single blk-mq interface
instead. With the immutable biovecs being merged in 3.13, the
bio model would need patches to even work. So rip it out, with
a conversion pending for blk-mq in the next release.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/skd_main.c | 529 +++++----------------------------------
 1 file changed, 62 insertions(+), 467 deletions(-)

diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 1a8717fce41d..49e1e8b48422 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -354,13 +354,7 @@ struct skd_device {
 
 	u32 timo_slot;
 
-
 	struct work_struct completion_worker;
-
-	struct bio_list bio_queue;
-	int queue_stopped;
-
-	struct list_head flush_list;
 };
 
 #define SKD_FLUSH_JOB   "skd-flush-jobs"
@@ -470,11 +464,6 @@ MODULE_PARM_DESC(skd_dbg_level, "s1120 debug level (0,1,2)");
 module_param(skd_isr_comp_limit, int, 0444);
 MODULE_PARM_DESC(skd_isr_comp_limit, "s1120 isr comp limit (0=none) default=4");
 
-static int skd_bio;
-module_param(skd_bio, int, 0444);
-MODULE_PARM_DESC(skd_bio,
-		 "Register as a bio device instead of block (0, 1) default=0");
-
 /* Major device number dynamically assigned. */
 static u32 skd_major;
 
@@ -512,11 +501,6 @@ static void skd_log_skmsg(struct skd_device *skdev,
 static void skd_log_skreq(struct skd_device *skdev,
 			  struct skd_request_context *skreq, const char *event);
 
-/* FLUSH FUA flag handling. */
-static int skd_flush_cmd_enqueue(struct skd_device *, void *);
-static void *skd_flush_cmd_dequeue(struct skd_device *);
-
-
 /*
  *****************************************************************************
  * READ/WRITE REQUESTS
@@ -524,40 +508,25 @@ static void *skd_flush_cmd_dequeue(struct skd_device *);
  */
 static void skd_stop_queue(struct skd_device *skdev)
 {
-	if (!skd_bio)
-		blk_stop_queue(skdev->queue);
-	else
-		skdev->queue_stopped = 1;
+	blk_stop_queue(skdev->queue);
 }
 
 static void skd_unstop_queue(struct skd_device *skdev)
 {
-	if (!skd_bio)
-		queue_flag_clear(QUEUE_FLAG_STOPPED, skdev->queue);
-	else
-		skdev->queue_stopped = 0;
+	queue_flag_clear(QUEUE_FLAG_STOPPED, skdev->queue);
 }
 
 static void skd_start_queue(struct skd_device *skdev)
 {
-	if (!skd_bio) {
-		blk_start_queue(skdev->queue);
-	} else {
-		pr_err("(%s): Starting queue\n", skd_name(skdev));
-		skdev->queue_stopped = 0;
-		skd_request_fn(skdev->queue);
-	}
+	blk_start_queue(skdev->queue);
 }
 
 static int skd_queue_stopped(struct skd_device *skdev)
 {
-	if (!skd_bio)
-		return blk_queue_stopped(skdev->queue);
-	else
-		return skdev->queue_stopped;
+	return blk_queue_stopped(skdev->queue);
 }
 
-static void skd_fail_all_pending_blk(struct skd_device *skdev)
+static void skd_fail_all_pending(struct skd_device *skdev)
 {
 	struct request_queue *q = skdev->queue;
 	struct request *req;
@@ -571,42 +540,6 @@ static void skd_fail_all_pending_blk(struct skd_device *skdev)
 	}
 }
 
-static void skd_fail_all_pending_bio(struct skd_device *skdev)
-{
-	struct bio *bio;
-	int error = -EIO;
-
-	for (;; ) {
-		bio = bio_list_pop(&skdev->bio_queue);
-
-		if (bio == NULL)
-			break;
-
-		bio_endio(bio, error);
-	}
-}
-
-static void skd_fail_all_pending(struct skd_device *skdev)
-{
-	if (!skd_bio)
-		skd_fail_all_pending_blk(skdev);
-	else
-		skd_fail_all_pending_bio(skdev);
-}
-
-static void skd_make_request(struct request_queue *q, struct bio *bio)
-{
-	struct skd_device *skdev = q->queuedata;
-	unsigned long flags;
-
-	spin_lock_irqsave(&skdev->lock, flags);
-
-	bio_list_add(&skdev->bio_queue, bio);
-	skd_request_fn(skdev->queue);
-
-	spin_unlock_irqrestore(&skdev->lock, flags);
-}
-
 static void
 skd_prep_rw_cdb(struct skd_scsi_request *scsi_req,
 		int data_dir, unsigned lba,
@@ -667,18 +600,9 @@ skd_prep_discard_cdb(struct skd_scsi_request *scsi_req,
 	put_unaligned_be64(lba, &buf[8]);
 	put_unaligned_be32(count, &buf[16]);
 
-	if (!skd_bio) {
-		req = skreq->req;
-		blk_add_request_payload(req, page, len);
-		req->buffer = buf;
-	} else {
-		skreq->bio->bi_io_vec->bv_page = page;
-		skreq->bio->bi_io_vec->bv_offset = 0;
-		skreq->bio->bi_io_vec->bv_len = len;
-
-		skreq->bio->bi_vcnt = 1;
-		skreq->bio->bi_phys_segments = 1;
-	}
+	req = skreq->req;
+	blk_add_request_payload(req, page, len);
+	req->buffer = buf;
 }
 
 static void skd_request_fn_not_online(struct request_queue *q);
@@ -690,7 +614,6 @@ static void skd_request_fn(struct request_queue *q)
 	struct fit_msg_hdr *fmh = NULL;
 	struct skd_request_context *skreq;
 	struct request *req = NULL;
-	struct bio *bio = NULL;
 	struct skd_scsi_request *scsi_req;
 	struct page *page;
 	unsigned long io_flags;
@@ -732,60 +655,27 @@ static void skd_request_fn(struct request_queue *q)
 
 		flush = fua = 0;
 
-		if (!skd_bio) {
-			req = blk_peek_request(q);
+		req = blk_peek_request(q);
 
-			/* Are there any native requests to start? */
-			if (req == NULL)
-				break;
+		/* Are there any native requests to start? */
+		if (req == NULL)
+			break;
 
-			lba = (u32)blk_rq_pos(req);
-			count = blk_rq_sectors(req);
-			data_dir = rq_data_dir(req);
-			io_flags = req->cmd_flags;
+		lba = (u32)blk_rq_pos(req);
+		count = blk_rq_sectors(req);
+		data_dir = rq_data_dir(req);
+		io_flags = req->cmd_flags;
 
-			if (io_flags & REQ_FLUSH)
-				flush++;
+		if (io_flags & REQ_FLUSH)
+			flush++;
 
-			if (io_flags & REQ_FUA)
-				fua++;
+		if (io_flags & REQ_FUA)
+			fua++;
 
-			pr_debug("%s:%s:%d new req=%p lba=%u(0x%x) "
-				 "count=%u(0x%x) dir=%d\n",
-				 skdev->name, __func__, __LINE__,
-				 req, lba, lba, count, count, data_dir);
-		} else {
-			if (!list_empty(&skdev->flush_list)) {
-				/* Process data part of FLUSH request. */
-				bio = (struct bio *)skd_flush_cmd_dequeue(skdev);
-				flush++;
-				pr_debug("%s:%s:%d processing FLUSH request with data.\n",
-					 skdev->name, __func__, __LINE__);
-			} else {
-				/* peek at our bio queue */
-				bio = bio_list_peek(&skdev->bio_queue);
-			}
-
-			/* Are there any native requests to start? */
-			if (bio == NULL)
-				break;
-
-			lba = (u32)bio->bi_sector;
-			count = bio_sectors(bio);
-			data_dir = bio_data_dir(bio);
-			io_flags = bio->bi_rw;
-
-			pr_debug("%s:%s:%d new bio=%p lba=%u(0x%x) "
-				 "count=%u(0x%x) dir=%d\n",
-				 skdev->name, __func__, __LINE__,
-				 bio, lba, lba, count, count, data_dir);
-
-			if (io_flags & REQ_FLUSH)
-				flush++;
-
-			if (io_flags & REQ_FUA)
-				fua++;
-		}
+		pr_debug("%s:%s:%d new req=%p lba=%u(0x%x) "
+			 "count=%u(0x%x) dir=%d\n",
+			 skdev->name, __func__, __LINE__,
+			 req, lba, lba, count, count, data_dir);
 
 		/* At this point we know there is a request
 		 * (from our bio q or req q depending on the way
@@ -831,23 +721,9 @@ static void skd_request_fn(struct request_queue *q)
 		 * the native request. Note that skd_request_context is
 		 * available but is still at the head of the free list.
 		 */
-		if (!skd_bio) {
-			blk_start_request(req);
-			skreq->req = req;
-			skreq->fitmsg_id = 0;
-		} else {
-			if (unlikely(flush == SKD_FLUSH_DATA_SECOND)) {
-				skreq->bio = bio;
-			} else {
-				skreq->bio = bio_list_pop(&skdev->bio_queue);
-				SKD_ASSERT(skreq->bio == bio);
-				skreq->start_time = jiffies;
-				part_inc_in_flight(&skdev->disk->part0,
-						   bio_data_dir(bio));
-			}
-
-			skreq->fitmsg_id = 0;
-		}
+		blk_start_request(req);
+		skreq->req = req;
+		skreq->fitmsg_id = 0;
 
 		/* Either a FIT msg is in progress or we have to start one. */
 		if (skmsg == NULL) {
@@ -923,8 +799,7 @@ static void skd_request_fn(struct request_queue *q)
 		if (fua)
 			scsi_req->cdb[1] |= SKD_FUA_NV;
 
-		if ((!skd_bio && !req->bio) ||
-			(skd_bio && flush == SKD_FLUSH_ZERO_SIZE_FIRST))
+		if (!req->bio)
 			goto skip_sg;
 
 		error = skd_preop_sg_list(skdev, skreq);
@@ -1011,8 +886,7 @@ skip_sg:
 	 * If req is non-NULL it means there is something to do but
 	 * we are out of a resource.
 	 */
-	if (((!skd_bio) && req) ||
-	    ((skd_bio) && bio_list_peek(&skdev->bio_queue)))
+	if (req)
 		skd_stop_queue(skdev);
 }
 
@@ -1045,7 +919,7 @@ static void skd_end_request_blk(struct skd_device *skdev,
 	__blk_end_request_all(skreq->req, error);
 }
 
-static int skd_preop_sg_list_blk(struct skd_device *skdev,
+static int skd_preop_sg_list(struct skd_device *skdev,
 				 struct skd_request_context *skreq)
 {
 	struct request *req = skreq->req;
@@ -1108,7 +982,7 @@ static int skd_preop_sg_list_blk(struct skd_device *skdev,
 	return 0;
 }
 
-static void skd_postop_sg_list_blk(struct skd_device *skdev,
+static void skd_postop_sg_list(struct skd_device *skdev,
 				   struct skd_request_context *skreq)
 {
 	int writing = skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD;
@@ -1124,184 +998,10 @@ static void skd_postop_sg_list_blk(struct skd_device *skdev,
 	pci_unmap_sg(skdev->pdev, &skreq->sg[0], skreq->n_sg, pci_dir);
 }
 
-static void skd_end_request_bio(struct skd_device *skdev,
-				struct skd_request_context *skreq, int error)
-{
-	struct bio *bio = skreq->bio;
-	int rw = bio_data_dir(bio);
-	unsigned long io_flags = bio->bi_rw;
-
-	if ((io_flags & REQ_DISCARD) &&
-		(skreq->discard_page == 1)) {
-		pr_debug("%s:%s:%d biomode: skd_end_request: freeing DISCARD page.\n",
-			 skdev->name, __func__, __LINE__);
-		free_page((unsigned long)page_address(bio->bi_io_vec->bv_page));
-	}
-
-	if (unlikely(error)) {
-		u32 lba = (u32)skreq->bio->bi_sector;
-		u32 count = bio_sectors(skreq->bio);
-		char *cmd = (rw == WRITE) ? "write" : "read";
-		pr_err("(%s): Error cmd=%s sect=%u count=%u id=0x%x\n",
-		       skd_name(skdev), cmd, lba, count, skreq->id);
-	}
-	{
-		int cpu = part_stat_lock();
-
-		if (likely(!error)) {
-			part_stat_inc(cpu, &skdev->disk->part0, ios[rw]);
-			part_stat_add(cpu, &skdev->disk->part0, sectors[rw],
-				      bio_sectors(bio));
-		}
-		part_stat_add(cpu, &skdev->disk->part0, ticks[rw],
-			      jiffies - skreq->start_time);
-		part_dec_in_flight(&skdev->disk->part0, rw);
-		part_stat_unlock();
-	}
-
-	pr_debug("%s:%s:%d id=0x%x error=%d\n",
-		 skdev->name, __func__, __LINE__, skreq->id, error);
-
-	bio_endio(skreq->bio, error);
-}
-
-static int skd_preop_sg_list_bio(struct skd_device *skdev,
-				 struct skd_request_context *skreq)
-{
-	struct bio *bio = skreq->bio;
-	int writing = skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD;
-	int pci_dir = writing ? PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE;
-	int n_sg;
-	int i;
-	struct bio_vec *vec;
-	struct fit_sg_descriptor *sgd;
-	u64 dma_addr;
-	u32 count;
-	int errs = 0;
-	unsigned int io_flags = 0;
-	io_flags |= bio->bi_rw;
-
-	skreq->sg_byte_count = 0;
-	n_sg = skreq->n_sg = skreq->bio->bi_vcnt;
-
-	if (n_sg <= 0)
-		return -EINVAL;
-
-	if (n_sg > skdev->sgs_per_request) {
-		pr_err("(%s): sg overflow n=%d\n",
-		       skd_name(skdev), n_sg);
-		skreq->n_sg = 0;
-		return -EIO;
-	}
-
-	for (i = 0; i < skreq->n_sg; i++) {
-		vec = bio_iovec_idx(bio, i);
-		dma_addr = pci_map_page(skdev->pdev,
-					vec->bv_page,
-					vec->bv_offset, vec->bv_len, pci_dir);
-		count = vec->bv_len;
-
-		if (count == 0 || count > 64u * 1024u || (count & 3) != 0
-		    || (dma_addr & 3) != 0) {
-			pr_err(
-			       "(%s): Bad sg ix=%d count=%d addr=0x%llx\n",
-			       skd_name(skdev), i, count, dma_addr);
-			errs++;
-		}
-
-		sgd = &skreq->sksg_list[i];
-
-		sgd->control = FIT_SGD_CONTROL_NOT_LAST;
-		sgd->byte_count = vec->bv_len;
-		skreq->sg_byte_count += vec->bv_len;
-		sgd->host_side_addr = dma_addr;
-		sgd->dev_side_addr = 0; /* not used */
-	}
-
-	skreq->sksg_list[n_sg - 1].next_desc_ptr = 0LL;
-	skreq->sksg_list[n_sg - 1].control = FIT_SGD_CONTROL_LAST;
-
-
-	 if (!(io_flags & REQ_DISCARD)) {
-		count = bio_sectors(bio) << 9u;
-		if (count != skreq->sg_byte_count) {
-			pr_err("(%s): mismatch count sg=%d req=%d\n",
-			       skd_name(skdev), skreq->sg_byte_count, count);
-			errs++;
-		}
-	}
-
-	if (unlikely(skdev->dbg_level > 1)) {
-		pr_debug("%s:%s:%d skreq=%x sksg_list=%p sksg_dma=%llx\n",
-			 skdev->name, __func__, __LINE__,
-			 skreq->id, skreq->sksg_list, skreq->sksg_dma_address);
-		for (i = 0; i < n_sg; i++) {
-			struct fit_sg_descriptor *sgd = &skreq->sksg_list[i];
-			pr_debug("%s:%s:%d   sg[%d] count=%u ctrl=0x%x "
-				 "addr=0x%llx next=0x%llx\n",
-				 skdev->name, __func__, __LINE__,
-				 i, sgd->byte_count, sgd->control,
-				 sgd->host_side_addr, sgd->next_desc_ptr);
-		}
-	}
-
-	if (errs != 0) {
-		skd_postop_sg_list(skdev, skreq);
-		skreq->n_sg = 0;
-		return -EIO;
-	}
-
-	return 0;
-}
-
-static int skd_preop_sg_list(struct skd_device *skdev,
-			     struct skd_request_context *skreq)
-{
-	if (!skd_bio)
-		return skd_preop_sg_list_blk(skdev, skreq);
-	else
-		return skd_preop_sg_list_bio(skdev, skreq);
-}
-
-static void skd_postop_sg_list_bio(struct skd_device *skdev,
-				   struct skd_request_context *skreq)
-{
-	int writing = skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD;
-	int pci_dir = writing ? PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE;
-	int i;
-	struct fit_sg_descriptor *sgd;
-
-	/*
-	 * restore the next ptr for next IO request so we
-	 * don't have to set it every time.
-	 */
-	skreq->sksg_list[skreq->n_sg - 1].next_desc_ptr =
-		skreq->sksg_dma_address +
-		((skreq->n_sg) * sizeof(struct fit_sg_descriptor));
-
-	for (i = 0; i < skreq->n_sg; i++) {
-		sgd = &skreq->sksg_list[i];
-		pci_unmap_page(skdev->pdev, sgd->host_side_addr,
-			       sgd->byte_count, pci_dir);
-	}
-}
-
-static void skd_postop_sg_list(struct skd_device *skdev,
-			       struct skd_request_context *skreq)
-{
-	if (!skd_bio)
-		skd_postop_sg_list_blk(skdev, skreq);
-	else
-		skd_postop_sg_list_bio(skdev, skreq);
-}
-
 static void skd_end_request(struct skd_device *skdev,
 			    struct skd_request_context *skreq, int error)
 {
-	if (likely(!skd_bio))
-		skd_end_request_blk(skdev, skreq, error);
-	else
-		skd_end_request_bio(skdev, skreq, error);
+	skd_end_request_blk(skdev, skreq, error);
 }
 
 static void skd_request_fn_not_online(struct request_queue *q)
@@ -2754,13 +2454,10 @@ static void skd_resolve_req_exception(struct skd_device *skdev,
 		break;
 
 	case SKD_CHECK_STATUS_REQUEUE_REQUEST:
-		if (!skd_bio) {
-			if ((unsigned long) ++skreq->req->special <
-			    SKD_MAX_RETRIES) {
-				skd_log_skreq(skdev, skreq, "retry");
-				skd_requeue_request(skdev, skreq);
-				break;
-			}
+		if ((unsigned long) ++skreq->req->special < SKD_MAX_RETRIES) {
+			skd_log_skreq(skdev, skreq, "retry");
+			skd_requeue_request(skdev, skreq);
+			break;
 		}
 	/* fall through to report error */
 
@@ -2774,12 +2471,7 @@ static void skd_resolve_req_exception(struct skd_device *skdev,
 static void skd_requeue_request(struct skd_device *skdev,
 				struct skd_request_context *skreq)
 {
-	if (!skd_bio) {
-		blk_requeue_request(skdev->queue, skreq->req);
-	} else {
-		bio_list_add_head(&skdev->bio_queue, skreq->bio);
-		skreq->bio = NULL;
-	}
+	blk_requeue_request(skdev->queue, skreq->req);
 }
 
 
@@ -2840,11 +2532,7 @@ static void skd_release_skreq(struct skd_device *skdev,
 	/*
 	 * Reset backpointer
 	 */
-	if (likely(!skd_bio))
-		skreq->req = NULL;
-	else
-		skreq->bio = NULL;
-
+	skreq->req = NULL;
 
 	/*
 	 * Reclaim the skd_request_context
@@ -3084,8 +2772,6 @@ static int skd_isr_completion_posted(struct skd_device *skdev,
 	u32 cmp_bytes = 0;
 	int rc = 0;
 	int processed = 0;
-	int ret;
-
 
 	for (;; ) {
 		SKD_ASSERT(skdev->skcomp_ix < SKD_N_COMPLETION_ENTRY);
@@ -3180,8 +2866,7 @@ static int skd_isr_completion_posted(struct skd_device *skdev,
 		if (skreq->n_sg > 0)
 			skd_postop_sg_list(skdev, skreq);
 
-		if (((!skd_bio) && !skreq->req) ||
-		    ((skd_bio) && !skreq->bio)) {
+		if (!skreq->req) {
 			pr_debug("%s:%s:%d NULL backptr skdreq %p, "
 				 "req=0x%x req_id=0x%x\n",
 				 skdev->name, __func__, __LINE__,
@@ -3191,30 +2876,10 @@ static int skd_isr_completion_posted(struct skd_device *skdev,
 			 * Capture the outcome and post it back to the
 			 * native request.
 			 */
-			if (likely(cmp_status == SAM_STAT_GOOD)) {
-				if (unlikely(skreq->flush_cmd)) {
-					if (skd_bio) {
-						/* if empty size bio, we are all done */
-						if (bio_sectors(skreq->bio) == 0) {
-							skd_end_request(skdev, skreq, 0);
-						} else {
-							ret = skd_flush_cmd_enqueue(skdev, (void *)skreq->bio);
-							if (ret != 0) {
-								pr_err("Failed to enqueue flush bio with Data. Err=%d.\n", ret);
-								skd_end_request(skdev, skreq, ret);
-							} else {
-								((*enqueued)++);
-							}
-						}
-					} else {
-						skd_end_request(skdev, skreq, 0);
-					}
-				} else {
-					skd_end_request(skdev, skreq, 0);
-				}
-			} else {
+			if (likely(cmp_status == SAM_STAT_GOOD))
+				skd_end_request(skdev, skreq, 0);
+			else
 				skd_resolve_req_exception(skdev, skreq);
-			}
 		}
 
 		/*
@@ -3645,29 +3310,20 @@ static void skd_recover_requests(struct skd_device *skdev, int requeue)
 			skd_log_skreq(skdev, skreq, "recover");
 
 			SKD_ASSERT((skreq->id & SKD_ID_INCR) != 0);
-			if (!skd_bio)
-				SKD_ASSERT(skreq->req != NULL);
-			else
-				SKD_ASSERT(skreq->bio != NULL);
+			SKD_ASSERT(skreq->req != NULL);
 
 			/* Release DMA resources for the request. */
 			if (skreq->n_sg > 0)
 				skd_postop_sg_list(skdev, skreq);
 
-			if (!skd_bio) {
-				if (requeue &&
-				    (unsigned long) ++skreq->req->special <
-				    SKD_MAX_RETRIES)
-					skd_requeue_request(skdev, skreq);
-				else
-				skd_end_request(skdev, skreq, -EIO);
-			} else
+			if (requeue &&
+			    (unsigned long) ++skreq->req->special <
+			    SKD_MAX_RETRIES)
+				skd_requeue_request(skdev, skreq);
+			else
 				skd_end_request(skdev, skreq, -EIO);
 
-			if (!skd_bio)
-				skreq->req = NULL;
-			else
-				skreq->bio = NULL;
+			skreq->req = NULL;
 
 			skreq->state = SKD_REQ_STATE_IDLE;
 			skreq->id += SKD_ID_INCR;
@@ -4580,16 +4236,11 @@ static struct skd_device *skd_construct(struct pci_dev *pdev)
 	skdev->sgs_per_request = skd_sgs_per_request;
 	skdev->dbg_level = skd_dbg_level;
 
-	if (skd_bio)
-		bio_list_init(&skdev->bio_queue);
-
-
 	atomic_set(&skdev->device_count, 0);
 
 	spin_lock_init(&skdev->lock);
 
 	INIT_WORK(&skdev->completion_worker, skd_completion_worker);
-	INIT_LIST_HEAD(&skdev->flush_list);
 
 	pr_debug("%s:%s:%d skcomp\n", skdev->name, __func__, __LINE__);
 	rc = skd_cons_skcomp(skdev);
@@ -4941,13 +4592,7 @@ static int skd_cons_disk(struct skd_device *skdev)
 	disk->fops = &skd_blockdev_ops;
 	disk->private_data = skdev;
 
-	if (!skd_bio) {
-		q = blk_init_queue(skd_request_fn, &skdev->lock);
-	} else {
-		q = blk_alloc_queue(GFP_KERNEL);
-		q->queue_flags = QUEUE_FLAG_IO_STAT | QUEUE_FLAG_STACKABLE;
-	}
-
+	q = blk_init_queue(skd_request_fn, &skdev->lock);
 	if (!q) {
 		rc = -ENOMEM;
 		goto err_out;
@@ -4957,11 +4602,6 @@ static int skd_cons_disk(struct skd_device *skdev)
 	disk->queue = q;
 	q->queuedata = skdev;
 
-	if (skd_bio) {
-		q->queue_lock = &skdev->lock;
-		blk_queue_make_request(q, skd_make_request);
-	}
-
 	blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
 	blk_queue_max_segments(q, skdev->sgs_per_request);
 	blk_queue_max_hw_sectors(q, SKD_N_MAX_SECTORS);
@@ -5794,35 +5434,19 @@ static void skd_log_skreq(struct skd_device *skdev,
 		 skdev->name, __func__, __LINE__,
 		 skreq->timeout_stamp, skreq->sg_data_dir, skreq->n_sg);
 
-	if (!skd_bio) {
-		if (skreq->req != NULL) {
-			struct request *req = skreq->req;
-			u32 lba = (u32)blk_rq_pos(req);
-			u32 count = blk_rq_sectors(req);
+	if (skreq->req != NULL) {
+		struct request *req = skreq->req;
+		u32 lba = (u32)blk_rq_pos(req);
+		u32 count = blk_rq_sectors(req);
 
-			pr_debug("%s:%s:%d "
-				 "req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n",
-				 skdev->name, __func__, __LINE__,
-				 req, lba, lba, count, count,
-				 (int)rq_data_dir(req));
-		} else
-			pr_debug("%s:%s:%d req=NULL\n",
-				 skdev->name, __func__, __LINE__);
-	} else {
-		if (skreq->bio != NULL) {
-			struct bio *bio = skreq->bio;
-			u32 lba = (u32)bio->bi_sector;
-			u32 count = bio_sectors(bio);
-
-			pr_debug("%s:%s:%d "
-				 "bio=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n",
-				 skdev->name, __func__, __LINE__,
-				 bio, lba, lba, count, count,
-				 (int)bio_data_dir(bio));
-		} else
-			pr_debug("%s:%s:%d req=NULL\n",
-				 skdev->name, __func__, __LINE__);
-	}
+		pr_debug("%s:%s:%d "
+			 "req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n",
+			 skdev->name, __func__, __LINE__,
+			 req, lba, lba, count, count,
+			 (int)rq_data_dir(req));
+	} else
+		pr_debug("%s:%s:%d req=NULL\n",
+			 skdev->name, __func__, __LINE__);
 }
 
 /*
@@ -5918,34 +5542,5 @@ static void __exit skd_exit(void)
 	kmem_cache_destroy(skd_flush_slab);
 }
 
-static int
-skd_flush_cmd_enqueue(struct skd_device *skdev, void *cmd)
-{
-	struct skd_flush_cmd *item;
-
-	item = kmem_cache_zalloc(skd_flush_slab, GFP_ATOMIC);
-	if (!item) {
-		pr_err("skd_flush_cmd_enqueue: Failed to allocated item.\n");
-		return -ENOMEM;
-	}
-
-	item->cmd = cmd;
-	list_add_tail(&item->flist, &skdev->flush_list);
-	return 0;
-}
-
-static void *
-skd_flush_cmd_dequeue(struct skd_device *skdev)
-{
-	void *cmd;
-	struct skd_flush_cmd *item;
-
-	item = list_entry(skdev->flush_list.next, struct skd_flush_cmd, flist);
-	list_del_init(&item->flist);
-	cmd = item->cmd;
-	kmem_cache_free(skd_flush_slab, item);
-	return cmd;
-}
-
 module_init(skd_init);
 module_exit(skd_exit);

From 6a5ec65b9acee39f9af4a15a81858d5fc07498d0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 1 Nov 2013 10:38:45 -0600
Subject: [PATCH 26/94] skd: cleanup the skd_*() function block wrapping

Just call the block functions directly, don't wrap them
in skd helpers. With only one queueing model enabled, there's
no point in doing that.

Also kill the ->start_time and ->bio from the skd_request_context,
we don't use those anymore.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/skd_main.c | 52 ++++++++++++----------------------------
 1 file changed, 15 insertions(+), 37 deletions(-)

diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 49e1e8b48422..d404d7646d9c 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -214,8 +214,6 @@ struct skd_request_context {
 	u32 fitmsg_id;
 
 	struct request *req;
-	struct bio *bio;
-	unsigned long start_time;
 	u8 flush_cmd;
 	u8 discard_page;
 
@@ -506,26 +504,6 @@ static void skd_log_skreq(struct skd_device *skdev,
  * READ/WRITE REQUESTS
  *****************************************************************************
  */
-static void skd_stop_queue(struct skd_device *skdev)
-{
-	blk_stop_queue(skdev->queue);
-}
-
-static void skd_unstop_queue(struct skd_device *skdev)
-{
-	queue_flag_clear(QUEUE_FLAG_STOPPED, skdev->queue);
-}
-
-static void skd_start_queue(struct skd_device *skdev)
-{
-	blk_start_queue(skdev->queue);
-}
-
-static int skd_queue_stopped(struct skd_device *skdev)
-{
-	return blk_queue_stopped(skdev->queue);
-}
-
 static void skd_fail_all_pending(struct skd_device *skdev)
 {
 	struct request_queue *q = skdev->queue;
@@ -634,14 +612,14 @@ static void skd_request_fn(struct request_queue *q)
 		return;
 	}
 
-	if (skd_queue_stopped(skdev)) {
+	if (blk_queue_stopped(skdev->queue)) {
 		if (skdev->skmsg_free_list == NULL ||
 		    skdev->skreq_free_list == NULL ||
 		    skdev->in_flight >= skdev->queue_low_water_mark)
 			/* There is still some kind of shortage */
 			return;
 
-		skd_unstop_queue(skdev);
+		queue_flag_clear(QUEUE_FLAG_STOPPED, skdev->queue);
 	}
 
 	/*
@@ -887,7 +865,7 @@ skip_sg:
 	 * we are out of a resource.
 	 */
 	if (req)
-		skd_stop_queue(skdev);
+		blk_stop_queue(skdev->queue);
 }
 
 static void skd_end_request_blk(struct skd_device *skdev,
@@ -1106,7 +1084,7 @@ static void skd_timer_tick(ulong arg)
 	skdev->timer_countdown = SKD_DRAINING_TIMO;
 	skdev->state = SKD_DRVR_STATE_DRAINING_TIMEOUT;
 	skdev->timo_slot = timo_slot;
-	skd_stop_queue(skdev);
+	blk_stop_queue(skdev->queue);
 
 timer_func_out:
 	mod_timer(&skdev->timer, (jiffies + HZ));
@@ -1165,7 +1143,7 @@ static void skd_timer_tick_not_online(struct skd_device *skdev)
 
 		/*start the queue so we can respond with error to requests */
 		/* wakeup anyone waiting for startup complete */
-		skd_start_queue(skdev);
+		blk_start_queue(skdev->queue);
 		skdev->gendisk_on = -1;
 		wake_up_interruptible(&skdev->waitq);
 		break;
@@ -1191,7 +1169,7 @@ static void skd_timer_tick_not_online(struct skd_device *skdev)
 			pr_debug("%s:%s:%d Slot drained, starting queue.\n",
 				 skdev->name, __func__, __LINE__);
 			skdev->state = SKD_DRVR_STATE_ONLINE;
-			skd_start_queue(skdev);
+			blk_start_queue(skdev->queue);
 			return;
 		}
 		if (skdev->timer_countdown > 0) {
@@ -1241,7 +1219,7 @@ static void skd_timer_tick_not_online(struct skd_device *skdev)
 
 		/*start the queue so we can respond with error to requests */
 		/* wakeup anyone waiting for startup complete */
-		skd_start_queue(skdev);
+		blk_start_queue(skdev->queue);
 		skdev->gendisk_on = -1;
 		wake_up_interruptible(&skdev->waitq);
 		break;
@@ -3241,7 +3219,7 @@ static void skd_isr_fwstate(struct skd_device *skdev)
 		 */
 		skdev->state = SKD_DRVR_STATE_BUSY_SANITIZE;
 		skdev->timer_countdown = SKD_TIMER_SECONDS(3);
-		skd_start_queue(skdev);
+		blk_start_queue(skdev->queue);
 		break;
 	case FIT_SR_DRIVE_BUSY_ERASE:
 		skdev->state = SKD_DRVR_STATE_BUSY_ERASE;
@@ -3276,7 +3254,7 @@ static void skd_isr_fwstate(struct skd_device *skdev)
 	case FIT_SR_DRIVE_FAULT:
 		skd_drive_fault(skdev);
 		skd_recover_requests(skdev, 0);
-		skd_start_queue(skdev);
+		blk_start_queue(skdev->queue);
 		break;
 
 	/* PCIe bus returned all Fs? */
@@ -3285,7 +3263,7 @@ static void skd_isr_fwstate(struct skd_device *skdev)
 		       skd_name(skdev), state, sense);
 		skd_drive_disappeared(skdev);
 		skd_recover_requests(skdev, 0);
-		skd_start_queue(skdev);
+		blk_start_queue(skdev->queue);
 		break;
 	default:
 		/*
@@ -3609,7 +3587,7 @@ static void skd_start_device(struct skd_device *skdev)
 		/*start the queue so we can respond with error to requests */
 		pr_debug("%s:%s:%d starting %s queue\n",
 			 skdev->name, __func__, __LINE__, skdev->name);
-		skd_start_queue(skdev);
+		blk_start_queue(skdev->queue);
 		skdev->gendisk_on = -1;
 		wake_up_interruptible(&skdev->waitq);
 		break;
@@ -3621,7 +3599,7 @@ static void skd_start_device(struct skd_device *skdev)
 		/*start the queue so we can respond with error to requests */
 		pr_debug("%s:%s:%d starting %s queue to error-out reqs\n",
 			 skdev->name, __func__, __LINE__, skdev->name);
-		skd_start_queue(skdev);
+		blk_start_queue(skdev->queue);
 		skdev->gendisk_on = -1;
 		wake_up_interruptible(&skdev->waitq);
 		break;
@@ -3766,7 +3744,7 @@ static int skd_quiesce_dev(struct skd_device *skdev)
 	case SKD_DRVR_STATE_BUSY_IMMINENT:
 		pr_debug("%s:%s:%d stopping %s queue\n",
 			 skdev->name, __func__, __LINE__, skdev->name);
-		skd_stop_queue(skdev);
+		blk_stop_queue(skdev->queue);
 		break;
 	case SKD_DRVR_STATE_ONLINE:
 	case SKD_DRVR_STATE_STOPPING:
@@ -3835,7 +3813,7 @@ static int skd_unquiesce_dev(struct skd_device *skdev)
 		pr_debug("%s:%s:%d starting %s queue\n",
 			 skdev->name, __func__, __LINE__, skdev->name);
 		pr_info("(%s): STEC s1120 ONLINE\n", skd_name(skdev));
-		skd_start_queue(skdev);
+		blk_start_queue(skdev->queue);
 		skdev->gendisk_on = 1;
 		wake_up_interruptible(&skdev->waitq);
 		break;
@@ -4620,7 +4598,7 @@ static int skd_cons_disk(struct skd_device *skdev)
 	spin_lock_irqsave(&skdev->lock, flags);
 	pr_debug("%s:%s:%d stopping %s queue\n",
 		 skdev->name, __func__, __LINE__, skdev->name);
-	skd_stop_queue(skdev);
+	blk_stop_queue(skdev->queue);
 	spin_unlock_irqrestore(&skdev->lock, flags);
 
 err_out:

From 38d4a1bb994e87057bc8b59e393931904b6b8bc0 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 1 Nov 2013 15:05:10 -0400
Subject: [PATCH 27/94] skd: more removal of bio-based code

Remove skd_flush_cmd structure and skd_flush_slab.
Remove skd_end_request wrapper around skd_end_request_blk.
Remove skd_requeue_request, use blk_requeue_request directly.
Cleanup some comments (remove "bio" info) and whitespace.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/skd_main.c | 91 +++++++++-------------------------------
 1 file changed, 20 insertions(+), 71 deletions(-)

diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index d404d7646d9c..5dc5b39e5b85 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -86,7 +86,7 @@ enum {
 MODULE_AUTHOR("bug-reports: support@stec-inc.com");
 MODULE_LICENSE("Dual BSD/GPL");
 
-MODULE_DESCRIPTION("STEC s1120 PCIe SSD block/BIO driver (b" DRV_BUILD_ID ")");
+MODULE_DESCRIPTION("STEC s1120 PCIe SSD block driver (b" DRV_BUILD_ID ")");
 MODULE_VERSION(DRV_VERSION "-" DRV_BUILD_ID);
 
 #define PCI_VENDOR_ID_STEC      0x1B39
@@ -352,23 +352,10 @@ struct skd_device {
 
 	u32 timo_slot;
 
+
 	struct work_struct completion_worker;
 };
 
-#define SKD_FLUSH_JOB   "skd-flush-jobs"
-struct kmem_cache *skd_flush_slab;
-
-/*
- * These commands hold "nonzero size FLUSH bios",
- * which are enqueud in skdev->flush_list during
- * completion of "zero size FLUSH commands".
- * It will be active in biomode.
- */
-struct skd_flush_cmd {
-	void *cmd;
-	struct list_head flist;
-};
-
 #define SKD_WRITEL(DEV, VAL, OFF) skd_reg_write32(DEV, VAL, OFF)
 #define SKD_READL(DEV, OFF)      skd_reg_read32(DEV, OFF)
 #define SKD_WRITEQ(DEV, VAL, OFF) skd_reg_write64(DEV, VAL, OFF)
@@ -541,7 +528,7 @@ skd_prep_rw_cdb(struct skd_scsi_request *scsi_req,
 
 static void
 skd_prep_zerosize_flush_cdb(struct skd_scsi_request *scsi_req,
-			struct skd_request_context *skreq)
+			    struct skd_request_context *skreq)
 {
 	skreq->flush_cmd = 1;
 
@@ -559,9 +546,9 @@ skd_prep_zerosize_flush_cdb(struct skd_scsi_request *scsi_req,
 
 static void
 skd_prep_discard_cdb(struct skd_scsi_request *scsi_req,
-			struct skd_request_context *skreq,
-			struct page *page,
-			u32 lba, u32 count)
+		     struct skd_request_context *skreq,
+		     struct page *page,
+		     u32 lba, u32 count)
 {
 	char *buf;
 	unsigned long len;
@@ -655,10 +642,7 @@ static void skd_request_fn(struct request_queue *q)
 			 skdev->name, __func__, __LINE__,
 			 req, lba, lba, count, count, data_dir);
 
-		/* At this point we know there is a request
-		 * (from our bio q or req q depending on the way
-		 * the driver is built do checks for resources.
-		 */
+		/* At this point we know there is a request */
 
 		/* Are too many requets already in progress? */
 		if (skdev->in_flight >= skdev->cur_max_queue_depth) {
@@ -693,7 +677,7 @@ static void skd_request_fn(struct request_queue *q)
 		skreq->discard_page = 0;
 
 		/*
-		 * OK to now dequeue request from either bio or q.
+		 * OK to now dequeue request from q.
 		 *
 		 * At this point we are comitted to either start or reject
 		 * the native request. Note that skd_request_context is
@@ -868,15 +852,15 @@ skip_sg:
 		blk_stop_queue(skdev->queue);
 }
 
-static void skd_end_request_blk(struct skd_device *skdev,
-				struct skd_request_context *skreq, int error)
+static void skd_end_request(struct skd_device *skdev,
+			    struct skd_request_context *skreq, int error)
 {
 	struct request *req = skreq->req;
 	unsigned int io_flags = req->cmd_flags;
 
 	if ((io_flags & REQ_DISCARD) &&
 		(skreq->discard_page == 1)) {
-		pr_debug("%s:%s:%d skd_end_request_blk, free the page!",
+		pr_debug("%s:%s:%d, free the page!",
 			 skdev->name, __func__, __LINE__);
 		free_page((unsigned long)req->buffer);
 		req->buffer = NULL;
@@ -898,7 +882,7 @@ static void skd_end_request_blk(struct skd_device *skdev,
 }
 
 static int skd_preop_sg_list(struct skd_device *skdev,
-				 struct skd_request_context *skreq)
+			     struct skd_request_context *skreq)
 {
 	struct request *req = skreq->req;
 	int writing = skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD;
@@ -961,7 +945,7 @@ static int skd_preop_sg_list(struct skd_device *skdev,
 }
 
 static void skd_postop_sg_list(struct skd_device *skdev,
-				   struct skd_request_context *skreq)
+			       struct skd_request_context *skreq)
 {
 	int writing = skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD;
 	int pci_dir = writing ? PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE;
@@ -976,12 +960,6 @@ static void skd_postop_sg_list(struct skd_device *skdev,
 	pci_unmap_sg(skdev->pdev, &skreq->sg[0], skreq->n_sg, pci_dir);
 }
 
-static void skd_end_request(struct skd_device *skdev,
-			    struct skd_request_context *skreq, int error)
-{
-	skd_end_request_blk(skdev, skreq, error);
-}
-
 static void skd_request_fn_not_online(struct request_queue *q)
 {
 	struct skd_device *skdev = q->queuedata;
@@ -1525,7 +1503,7 @@ static int skd_sg_io_obtain_skspcl(struct skd_device *skdev,
 	struct skd_special_context *skspcl = NULL;
 	int rc;
 
-	for (;; ) {
+	for (;;) {
 		ulong flags;
 
 		spin_lock_irqsave(&skdev->lock, flags);
@@ -2300,10 +2278,6 @@ static void skd_complete_other(struct skd_device *skdev,
 			       volatile struct fit_completion_entry_v1 *skcomp,
 			       volatile struct fit_comp_error_info *skerr);
 
-
-static void skd_requeue_request(struct skd_device *skdev,
-				struct skd_request_context *skreq);
-
 struct sns_info {
 	u8 type;
 	u8 stat;
@@ -2349,9 +2323,9 @@ static struct sns_info skd_chkstat_table[] = {
  * type and stat, ignore key, asc, ascq.
  */
 
-static enum skd_check_status_action skd_check_status(struct skd_device *skdev,
-				u8 cmp_status,
-				volatile struct fit_comp_error_info *skerr)
+static enum skd_check_status_action
+skd_check_status(struct skd_device *skdev,
+		 u8 cmp_status, volatile struct fit_comp_error_info *skerr)
 {
 	int i, n;
 
@@ -2424,7 +2398,7 @@ static void skd_resolve_req_exception(struct skd_device *skdev,
 
 	case SKD_CHECK_STATUS_BUSY_IMMINENT:
 		skd_log_skreq(skdev, skreq, "retry(busy)");
-		skd_requeue_request(skdev, skreq);
+		blk_requeue_request(skdev->queue, skreq->req);
 		pr_info("(%s) drive BUSY imminent\n", skd_name(skdev));
 		skdev->state = SKD_DRVR_STATE_BUSY_IMMINENT;
 		skdev->timer_countdown = SKD_TIMER_MINUTES(20);
@@ -2434,7 +2408,7 @@ static void skd_resolve_req_exception(struct skd_device *skdev,
 	case SKD_CHECK_STATUS_REQUEUE_REQUEST:
 		if ((unsigned long) ++skreq->req->special < SKD_MAX_RETRIES) {
 			skd_log_skreq(skdev, skreq, "retry");
-			skd_requeue_request(skdev, skreq);
+			blk_requeue_request(skdev->queue, skreq->req);
 			break;
 		}
 	/* fall through to report error */
@@ -2446,14 +2420,6 @@ static void skd_resolve_req_exception(struct skd_device *skdev,
 	}
 }
 
-static void skd_requeue_request(struct skd_device *skdev,
-				struct skd_request_context *skreq)
-{
-	blk_requeue_request(skdev->queue, skreq->req);
-}
-
-
-
 /* assume spinlock is already held */
 static void skd_release_skreq(struct skd_device *skdev,
 			      struct skd_request_context *skreq)
@@ -2998,7 +2964,6 @@ static void skd_release_special(struct skd_device *skdev,
 	int i, was_depleted;
 
 	for (i = 0; i < skspcl->req.n_sg; i++) {
-
 		struct page *page = sg_page(&skspcl->req.sg[i]);
 		__free_page(page);
 	}
@@ -3141,7 +3106,6 @@ static skd_isr(int irq, void *ptr)
 	return rc;
 }
 
-
 static void skd_drive_fault(struct skd_device *skdev)
 {
 	skdev->state = SKD_DRVR_STATE_FAULT;
@@ -3297,7 +3261,7 @@ static void skd_recover_requests(struct skd_device *skdev, int requeue)
 			if (requeue &&
 			    (unsigned long) ++skreq->req->special <
 			    SKD_MAX_RETRIES)
-				skd_requeue_request(skdev, skreq);
+				blk_requeue_request(skdev->queue, skreq->req);
 			else
 				skd_end_request(skdev, skreq, -EIO);
 
@@ -3305,8 +3269,6 @@ static void skd_recover_requests(struct skd_device *skdev, int requeue)
 
 			skreq->state = SKD_REQ_STATE_IDLE;
 			skreq->id += SKD_ID_INCR;
-
-
 		}
 		if (i > 0)
 			skreq[-1].next = skreq;
@@ -3879,7 +3841,6 @@ static irqreturn_t skd_comp_q(int irq, void *skd_host_data)
 	SKD_WRITEL(skdev, FIT_ISH_COMPLETION_POSTED, FIT_INT_STATUS_HOST);
 	deferred = skd_isr_completion_posted(skdev, skd_isr_comp_limit,
 						&flush_enqueued);
-
 	if (flush_enqueued)
 		skd_request_fn(skdev->queue);
 
@@ -5450,15 +5411,6 @@ static int __init skd_init(void)
 		skd_isr_type = SKD_IRQ_DEFAULT;
 	}
 
-	skd_flush_slab = kmem_cache_create(SKD_FLUSH_JOB,
-						sizeof(struct skd_flush_cmd),
-						0, 0, NULL);
-
-	if (!skd_flush_slab) {
-		pr_err("failed to allocated flush slab.\n");
-		return -ENOMEM;
-	}
-
 	if (skd_max_queue_depth < 1
 	    || skd_max_queue_depth > SKD_MAX_QUEUE_DEPTH) {
 		pr_info(
@@ -5507,7 +5459,6 @@ static int __init skd_init(void)
 	skd_major = rc;
 
 	return pci_register_driver(&skd_driver);
-
 }
 
 static void __exit skd_exit(void)
@@ -5516,8 +5467,6 @@ static void __exit skd_exit(void)
 
 	unregister_blkdev(skd_major, DRV_NAME);
 	pci_unregister_driver(&skd_driver);
-
-	kmem_cache_destroy(skd_flush_slab);
 }
 
 module_init(skd_init);

From a073ae953ee20b33c8463f76d563fd7e2851c0ac Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Tue, 5 Nov 2013 12:36:59 +0100
Subject: [PATCH 28/94] skd: fix unregister_blkdev() placement

register_blkdev() is called before pci_register_driver() in skd_init()
so unregister_blkdev() should be called after pci_unregister_driver()
in skd_exit(). Fix it.

Cc: Akhil Bhansali <abhansali@stec-inc.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/skd_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 5dc5b39e5b85..6f09bca3a24f 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -5465,8 +5465,8 @@ static void __exit skd_exit(void)
 {
 	pr_info(PFX " v%s-b%s unloading\n", DRV_VERSION, DRV_BUILD_ID);
 
-	unregister_blkdev(skd_major, DRV_NAME);
 	pci_unregister_driver(&skd_driver);
+	unregister_blkdev(skd_major, DRV_NAME);
 }
 
 module_init(skd_init);

From 2b91c55222a0b2381e12328ca8cc60e6a823a998 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Tue, 5 Nov 2013 12:37:00 +0100
Subject: [PATCH 29/94] skd: fix error paths in skd_init()

Cc: Akhil Bhansali <abhansali@stec-inc.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/skd_main.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 6f09bca3a24f..8c96d182601d 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -5396,7 +5396,7 @@ static void skd_log_skreq(struct skd_device *skdev,
 
 static int __init skd_init(void)
 {
-	int rc = 0;
+	int rc = -ENOMEM;
 
 	pr_info(PFX " v%s-b%s loaded\n", DRV_VERSION, DRV_BUILD_ID);
 
@@ -5454,11 +5454,21 @@ static int __init skd_init(void)
 	/* Obtain major device number. */
 	rc = register_blkdev(0, DRV_NAME);
 	if (rc < 0)
-		return rc;
+		goto err_register_blkdev;
 
 	skd_major = rc;
 
-	return pci_register_driver(&skd_driver);
+	rc = pci_register_driver(&skd_driver);
+	if (rc < 0)
+		goto err_pci_register_driver;
+
+	return rc;
+
+err_pci_register_driver:
+	unregister_blkdev(skd_major, DRV_NAME);
+
+err_register_blkdev:
+	return rc;
 }
 
 static void __exit skd_exit(void)

From fbed149ab328b72f501a3344ab4f20f712c63392 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Tue, 5 Nov 2013 12:37:01 +0100
Subject: [PATCH 30/94] skd: fix error messages in skd_init()

* change priority level from KERN_INFO to KERN_ERR
* add "skd: " prefix
* do minor CodingStyle fixes

Cc: Akhil Bhansali <abhansali@stec-inc.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/skd_main.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 8c96d182601d..99d5d1251c45 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -5406,47 +5406,44 @@ static int __init skd_init(void)
 	case SKD_IRQ_MSIX:
 		break;
 	default:
-		pr_info("skd_isr_type %d invalid, re-set to %d\n",
+		pr_err(PFX "skd_isr_type %d invalid, re-set to %d\n",
 		       skd_isr_type, SKD_IRQ_DEFAULT);
 		skd_isr_type = SKD_IRQ_DEFAULT;
 	}
 
-	if (skd_max_queue_depth < 1
-	    || skd_max_queue_depth > SKD_MAX_QUEUE_DEPTH) {
-		pr_info(
-		       "skd_max_queue_depth %d invalid, re-set to %d\n",
+	if (skd_max_queue_depth < 1 ||
+	    skd_max_queue_depth > SKD_MAX_QUEUE_DEPTH) {
+		pr_err(PFX "skd_max_queue_depth %d invalid, re-set to %d\n",
 		       skd_max_queue_depth, SKD_MAX_QUEUE_DEPTH_DEFAULT);
 		skd_max_queue_depth = SKD_MAX_QUEUE_DEPTH_DEFAULT;
 	}
 
 	if (skd_max_req_per_msg < 1 || skd_max_req_per_msg > 14) {
-		pr_info(
-		       "skd_max_req_per_msg %d invalid, re-set to %d\n",
+		pr_err(PFX "skd_max_req_per_msg %d invalid, re-set to %d\n",
 		       skd_max_req_per_msg, SKD_MAX_REQ_PER_MSG_DEFAULT);
 		skd_max_req_per_msg = SKD_MAX_REQ_PER_MSG_DEFAULT;
 	}
 
 	if (skd_sgs_per_request < 1 || skd_sgs_per_request > 4096) {
-		pr_info(
-		       "skd_sg_per_request %d invalid, re-set to %d\n",
+		pr_err(PFX "skd_sg_per_request %d invalid, re-set to %d\n",
 		       skd_sgs_per_request, SKD_N_SG_PER_REQ_DEFAULT);
 		skd_sgs_per_request = SKD_N_SG_PER_REQ_DEFAULT;
 	}
 
 	if (skd_dbg_level < 0 || skd_dbg_level > 2) {
-		pr_info("skd_dbg_level %d invalid, re-set to %d\n",
+		pr_err(PFX "skd_dbg_level %d invalid, re-set to %d\n",
 		       skd_dbg_level, 0);
 		skd_dbg_level = 0;
 	}
 
 	if (skd_isr_comp_limit < 0) {
-		pr_info("skd_isr_comp_limit %d invalid, set to %d\n",
+		pr_err(PFX "skd_isr_comp_limit %d invalid, set to %d\n",
 		       skd_isr_comp_limit, 0);
 		skd_isr_comp_limit = 0;
 	}
 
 	if (skd_max_pass_thru < 1 || skd_max_pass_thru > 50) {
-		pr_info("skd_max_pass_thru %d invalid, re-set to %d\n",
+		pr_err(PFX "skd_max_pass_thru %d invalid, re-set to %d\n",
 		       skd_max_pass_thru, SKD_N_SPECIAL_CONTEXT);
 		skd_max_pass_thru = SKD_N_SPECIAL_CONTEXT;
 	}

From b8df6647c248db457f1c2fb4f0f6350beaf16f9e Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Tue, 5 Nov 2013 12:37:02 +0100
Subject: [PATCH 31/94] skd: register block device only if some devices are
 present

Register block device in skd_pci_probe() instead of in skd_init() so it
is registered only if some devices are present (currently it is always
registered when the driver is loaded). Please note that this change
depends on the fact that register_blkdev(0, ...) never returns 0.

Cc: Akhil Bhansali <abhansali@stec-inc.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/skd_main.c | 33 ++++++++++++---------------------
 1 file changed, 12 insertions(+), 21 deletions(-)

diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 99d5d1251c45..9ff34be89f99 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -4899,6 +4899,14 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		}
 	}
 
+	if (!skd_major) {
+		rc = register_blkdev(0, DRV_NAME);
+		if (rc < 0)
+			goto err_out_regions;
+		BUG_ON(!rc);
+		skd_major = rc;
+	}
+
 	skdev = skd_construct(pdev);
 	if (skdev == NULL) {
 		rc = -ENOMEM;
@@ -5396,8 +5404,6 @@ static void skd_log_skreq(struct skd_device *skdev,
 
 static int __init skd_init(void)
 {
-	int rc = -ENOMEM;
-
 	pr_info(PFX " v%s-b%s loaded\n", DRV_VERSION, DRV_BUILD_ID);
 
 	switch (skd_isr_type) {
@@ -5448,24 +5454,7 @@ static int __init skd_init(void)
 		skd_max_pass_thru = SKD_N_SPECIAL_CONTEXT;
 	}
 
-	/* Obtain major device number. */
-	rc = register_blkdev(0, DRV_NAME);
-	if (rc < 0)
-		goto err_register_blkdev;
-
-	skd_major = rc;
-
-	rc = pci_register_driver(&skd_driver);
-	if (rc < 0)
-		goto err_pci_register_driver;
-
-	return rc;
-
-err_pci_register_driver:
-	unregister_blkdev(skd_major, DRV_NAME);
-
-err_register_blkdev:
-	return rc;
+	return pci_register_driver(&skd_driver);
 }
 
 static void __exit skd_exit(void)
@@ -5473,7 +5462,9 @@ static void __exit skd_exit(void)
 	pr_info(PFX " v%s-b%s unloading\n", DRV_VERSION, DRV_BUILD_ID);
 
 	pci_unregister_driver(&skd_driver);
-	unregister_blkdev(skd_major, DRV_NAME);
+
+	if (skd_major)
+		unregister_blkdev(skd_major, DRV_NAME);
 }
 
 module_init(skd_init);

From 43504631d04253d9866b9fbd2e06aed2e88a692b Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Tue, 5 Nov 2013 12:37:03 +0100
Subject: [PATCH 32/94] skd: remove SCSI subsystem specific includes

This is not a SCSI host driver so remove SCSI subsystem specific
includes.

Cc: Akhil Bhansali <abhansali@stec-inc.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/skd_main.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 9ff34be89f99..47d8a6a0a0cf 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -42,9 +42,6 @@
 #include <linux/wait.h>
 #include <linux/uio.h>
 #include <scsi/scsi.h>
-#include <scsi/scsi_host.h>
-#include <scsi/scsi_tcq.h>
-#include <scsi/scsi_cmnd.h>
 #include <scsi/sg.h>
 #include <linux/io.h>
 #include <linux/uaccess.h>

From 4ca90b533278f15849287b5b6c0c7dc45b134cde Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Tue, 5 Nov 2013 12:37:04 +0100
Subject: [PATCH 33/94] skd: use <asm/unaligned.h>

Use <asm/unaligned.h> instead of <asm-generic/unaligned.h>.

Cc: Akhil Bhansali <abhansali@stec-inc.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/skd_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 47d8a6a0a0cf..e3dfd0dae44a 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -45,7 +45,7 @@
 #include <scsi/sg.h>
 #include <linux/io.h>
 #include <linux/uaccess.h>
-#include <asm-generic/unaligned.h>
+#include <asm/unaligned.h>
 
 #include "skd_s1120.h"
 

From ebedd16dab5e7cf22fb1156b74fa53f0233389a7 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Tue, 5 Nov 2013 12:37:05 +0100
Subject: [PATCH 34/94] skd: remove redundant skdev->pdev assignment from
 skd_pci_probe()

skdev->pdev is set to pdev twice in skd_pci_probe(), first time
through skd_construct() call and the second time directly in
the function. Remove the second assignment as it is not needed.

Cc: Akhil Bhansali <abhansali@stec-inc.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/skd_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index e3dfd0dae44a..70118a12eb37 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -4925,7 +4925,7 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 
 	pci_set_drvdata(pdev, skdev);
-	skdev->pdev = pdev;
+
 	skdev->disk->driverfs_dev = &pdev->dev;
 
 	for (i = 0; i < SKD_MAX_BARS; i++) {

From 7f74e5e94477b7b859c239661399af1fe37927f8 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Tue, 5 Nov 2013 12:37:06 +0100
Subject: [PATCH 35/94] skd: remove SKD_OMIT_FROM_SRC_DIST ifdefs

SKD_OMIT_FROM_SRC_DIST is never defined.

Cc: Akhil Bhansali <abhansali@stec-inc.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/skd_s1120.h | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/drivers/block/skd_s1120.h b/drivers/block/skd_s1120.h
index bf01941cdd62..426581e6d170 100644
--- a/drivers/block/skd_s1120.h
+++ b/drivers/block/skd_s1120.h
@@ -21,11 +21,9 @@
 #define  FIT_QCMD_QID_MASK              (0x3 << 1)
 #define  FIT_QCMD_QID0                  (0x0 << 1)
 #define  FIT_QCMD_QID_NORMAL            FIT_QCMD_QID0
-#ifndef SKD_OMIT_FROM_SRC_DIST
 #define  FIT_QCMD_QID1                  (0x1 << 1)
 #define  FIT_QCMD_QID2                  (0x2 << 1)
 #define  FIT_QCMD_QID3                  (0x3 << 1)
-#endif /* SKD_OMIT_FROM_SRC_DIST */
 #define  FIT_QCMD_FLUSH_QUEUE           (0ull)      /* add QID */
 #define  FIT_QCMD_MSGSIZE_MASK          (0x3 << 4)
 #define  FIT_QCMD_MSGSIZE_64            (0x0 << 4)
@@ -39,13 +37,9 @@
  * Control, 32-bit r/w
  */
 #define FIT_CONTROL                 0x500u
-#ifndef SKD_OMIT_FROM_SRC_DIST
 #define  FIT_CR_HARD_RESET              (1u << 0u)
-#endif /* SKD_OMIT_FROM_SRC_DIST */
 #define  FIT_CR_SOFT_RESET              (1u << 1u)
-#ifndef SKD_OMIT_FROM_SRC_DIST
 #define	 FIT_CR_DIS_TIMESTAMPS		(1u << 6u)
-#endif /* SKD_OMIT_FROM_SRC_DIST */
 #define  FIT_CR_ENABLE_INTERRUPTS       (1u << 7u)
 
 /*
@@ -53,10 +47,8 @@
  */
 #define FIT_STATUS			0x510u
 #define FIT_SR_DRIVE_STATE_MASK		0x000000FFu
-#ifndef SKD_OMIT_FROM_SRC_DIST
 #define	FIT_SR_SIGNATURE		(0xFF << 8)
 #define	FIT_SR_PIO_DMA			(1 << 16)
-#endif /* SKD_OMIT_FROM_SRC_DIST */
 #define FIT_SR_DRIVE_OFFLINE		0x00
 #define FIT_SR_DRIVE_INIT		0x01
 /* #define FIT_SR_DRIVE_READY		0x02 */
@@ -74,14 +66,12 @@
 #define FIT_SR_DEVICE_MISSING           0xFF
 #define FIT_SR__RESERVED		0xFFFFFF00u
 
-#ifndef SKD_OMIT_FROM_SRC_DIST
 /*
  * FIT_STATUS - Status register data definition
  */
 #define	FIT_SR_STATE_MASK		(0xFF << 0)
 #define	FIT_SR_SIGNATURE		(0xFF << 8)
 #define	FIT_SR_PIO_DMA			(1 << 16)
-#endif /* SKD_OMIT_FROM_SRC_DIST */
 
 
 /*
@@ -189,10 +179,8 @@
 #define FIT_MFD_PM_SLEEP		        0x17u
 #define FIT_MFD_CMD_PROGRESS		    0x18u
 
-#ifndef SKD_OMIT_FROM_SRC_DIST
 #define FIT_MTD_DEBUG                   0xFEu
 #define FIT_MFD_DEBUG                   0xFFu
-#endif /* SKD_OMIT_FROM_SRC_DIST */
 
 #define FIT_MFD_MASK			(0xFFu)
 #define FIT_MFD_DATA_MASK		(0xFFu)
@@ -248,7 +236,6 @@ struct fit_msg_hdr {
 #define FIT_PROTOCOL_MINOR_VER(mtd_val) ((mtd_val >> 16) & 0xF)
 #define FIT_PROTOCOL_MAJOR_VER(mtd_val) ((mtd_val >> 20) & 0xF)
 
-#ifndef SKD_OMIT_FROM_SRC_DIST
 /*
  * Format of a completion entry. The completion queue is circular
  * and must have at least as many entries as the maximum number
@@ -264,7 +251,6 @@ struct fit_msg_hdr {
  * Command_context is opaque and taken verbatim from the SSDI command.
  * All other fields are big endian.
  */
-#endif /* SKD_OMIT_FROM_SRC_DIST */
 #define FIT_PROTOCOL_VERSION_0          0
 
 /*

From fec23f63115622cfc82ed92e5d9348cb81c69173 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Tue, 5 Nov 2013 12:37:07 +0100
Subject: [PATCH 36/94] skd: cleanup skd_do_inq_page_da()

skdev->pdev and skdev->pdev->bus are always different than NULL in
skd_do_inq_page_da() so simplify the code accordingly.

Also cache skdev->pdev value in pdev variable while at it.

Cc: Akhil Bhansali <abhansali@stec-inc.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/skd_main.c | 39 +++++++++++++--------------------------
 1 file changed, 13 insertions(+), 26 deletions(-)

diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 70118a12eb37..2886a8c3a64d 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -2590,6 +2590,7 @@ static void skd_do_inq_page_da(struct skd_device *skdev,
 			       volatile struct fit_comp_error_info *skerr,
 			       uint8_t *cdb, uint8_t *buf)
 {
+	struct pci_dev *pdev = skdev->pdev;
 	unsigned max_bytes;
 	struct driver_inquiry_data inq;
 	u16 val;
@@ -2601,36 +2602,22 @@ static void skd_do_inq_page_da(struct skd_device *skdev,
 
 	inq.page_code = DRIVER_INQ_EVPD_PAGE_CODE;
 
-	if (skdev->pdev && skdev->pdev->bus) {
-		skd_get_link_info(skdev->pdev,
-				  &inq.pcie_link_speed, &inq.pcie_link_lanes);
-		inq.pcie_bus_number = cpu_to_be16(skdev->pdev->bus->number);
-		inq.pcie_device_number = PCI_SLOT(skdev->pdev->devfn);
-		inq.pcie_function_number = PCI_FUNC(skdev->pdev->devfn);
+	skd_get_link_info(pdev, &inq.pcie_link_speed, &inq.pcie_link_lanes);
+	inq.pcie_bus_number = cpu_to_be16(pdev->bus->number);
+	inq.pcie_device_number = PCI_SLOT(pdev->devfn);
+	inq.pcie_function_number = PCI_FUNC(pdev->devfn);
 
-		pci_read_config_word(skdev->pdev, PCI_VENDOR_ID, &val);
-		inq.pcie_vendor_id = cpu_to_be16(val);
+	pci_read_config_word(pdev, PCI_VENDOR_ID, &val);
+	inq.pcie_vendor_id = cpu_to_be16(val);
 
-		pci_read_config_word(skdev->pdev, PCI_DEVICE_ID, &val);
-		inq.pcie_device_id = cpu_to_be16(val);
+	pci_read_config_word(pdev, PCI_DEVICE_ID, &val);
+	inq.pcie_device_id = cpu_to_be16(val);
 
-		pci_read_config_word(skdev->pdev, PCI_SUBSYSTEM_VENDOR_ID,
-				     &val);
-		inq.pcie_subsystem_vendor_id = cpu_to_be16(val);
+	pci_read_config_word(pdev, PCI_SUBSYSTEM_VENDOR_ID, &val);
+	inq.pcie_subsystem_vendor_id = cpu_to_be16(val);
 
-		pci_read_config_word(skdev->pdev, PCI_SUBSYSTEM_ID, &val);
-		inq.pcie_subsystem_device_id = cpu_to_be16(val);
-	} else {
-		inq.pcie_bus_number = 0xFFFF;
-		inq.pcie_device_number = 0xFF;
-		inq.pcie_function_number = 0xFF;
-		inq.pcie_link_speed = 0xFF;
-		inq.pcie_link_lanes = 0xFF;
-		inq.pcie_vendor_id = 0xFFFF;
-		inq.pcie_device_id = 0xFFFF;
-		inq.pcie_subsystem_vendor_id = 0xFFFF;
-		inq.pcie_subsystem_device_id = 0xFFFF;
-	}
+	pci_read_config_word(pdev, PCI_SUBSYSTEM_ID, &val);
+	inq.pcie_subsystem_device_id = cpu_to_be16(val);
 
 	/* Driver version, fixed lenth, padded with spaces on the right */
 	inq.driver_version_length = sizeof(inq.driver_version);

From 542d7b0015b8cb30d7c0772657408c6a6e9a8675 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Tue, 5 Nov 2013 12:37:08 +0100
Subject: [PATCH 37/94] skd: reorder construct/destruct code

Reorder placement of skd_construct(), skd_cons_sg_list(), skd_destruct()
and skd_free_sg_list() functions. Then remove no longer needed function
prototypes.

Cc: Akhil Bhansali <abhansali@stec-inc.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/skd_main.c | 319 ++++++++++++++++++---------------------
 1 file changed, 148 insertions(+), 171 deletions(-)

diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 2886a8c3a64d..9199c93be926 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -449,7 +449,6 @@ MODULE_PARM_DESC(skd_isr_comp_limit, "s1120 isr comp limit (0=none) default=4");
 /* Major device number dynamically assigned. */
 static u32 skd_major;
 
-static struct skd_device *skd_construct(struct pci_dev *pdev);
 static void skd_destruct(struct skd_device *skdev);
 static const struct block_device_operations skd_blockdev_ops;
 static void skd_send_fitmsg(struct skd_device *skdev,
@@ -4115,96 +4114,6 @@ static void skd_release_irq(struct skd_device *skdev)
  *****************************************************************************
  */
 
-static int skd_cons_skcomp(struct skd_device *skdev);
-static int skd_cons_skmsg(struct skd_device *skdev);
-static int skd_cons_skreq(struct skd_device *skdev);
-static int skd_cons_skspcl(struct skd_device *skdev);
-static int skd_cons_sksb(struct skd_device *skdev);
-static struct fit_sg_descriptor *skd_cons_sg_list(struct skd_device *skdev,
-						  u32 n_sg,
-						  dma_addr_t *ret_dma_addr);
-static int skd_cons_disk(struct skd_device *skdev);
-
-#define SKD_N_DEV_TABLE         16u
-static u32 skd_next_devno;
-
-static struct skd_device *skd_construct(struct pci_dev *pdev)
-{
-	struct skd_device *skdev;
-	int blk_major = skd_major;
-	int rc;
-
-	skdev = kzalloc(sizeof(*skdev), GFP_KERNEL);
-
-	if (!skdev) {
-		pr_err(PFX "(%s): memory alloc failure\n",
-		       pci_name(pdev));
-		return NULL;
-	}
-
-	skdev->state = SKD_DRVR_STATE_LOAD;
-	skdev->pdev = pdev;
-	skdev->devno = skd_next_devno++;
-	skdev->major = blk_major;
-	skdev->irq_type = skd_isr_type;
-	sprintf(skdev->name, DRV_NAME "%d", skdev->devno);
-	skdev->dev_max_queue_depth = 0;
-
-	skdev->num_req_context = skd_max_queue_depth;
-	skdev->num_fitmsg_context = skd_max_queue_depth;
-	skdev->n_special = skd_max_pass_thru;
-	skdev->cur_max_queue_depth = 1;
-	skdev->queue_low_water_mark = 1;
-	skdev->proto_ver = 99;
-	skdev->sgs_per_request = skd_sgs_per_request;
-	skdev->dbg_level = skd_dbg_level;
-
-	atomic_set(&skdev->device_count, 0);
-
-	spin_lock_init(&skdev->lock);
-
-	INIT_WORK(&skdev->completion_worker, skd_completion_worker);
-
-	pr_debug("%s:%s:%d skcomp\n", skdev->name, __func__, __LINE__);
-	rc = skd_cons_skcomp(skdev);
-	if (rc < 0)
-		goto err_out;
-
-	pr_debug("%s:%s:%d skmsg\n", skdev->name, __func__, __LINE__);
-	rc = skd_cons_skmsg(skdev);
-	if (rc < 0)
-		goto err_out;
-
-	pr_debug("%s:%s:%d skreq\n", skdev->name, __func__, __LINE__);
-	rc = skd_cons_skreq(skdev);
-	if (rc < 0)
-		goto err_out;
-
-	pr_debug("%s:%s:%d skspcl\n", skdev->name, __func__, __LINE__);
-	rc = skd_cons_skspcl(skdev);
-	if (rc < 0)
-		goto err_out;
-
-	pr_debug("%s:%s:%d sksb\n", skdev->name, __func__, __LINE__);
-	rc = skd_cons_sksb(skdev);
-	if (rc < 0)
-		goto err_out;
-
-	pr_debug("%s:%s:%d disk\n", skdev->name, __func__, __LINE__);
-	rc = skd_cons_disk(skdev);
-	if (rc < 0)
-		goto err_out;
-
-	pr_debug("%s:%s:%d VICTORY\n", skdev->name, __func__, __LINE__);
-	return skdev;
-
-err_out:
-	pr_debug("%s:%s:%d construct failed\n",
-		 skdev->name, __func__, __LINE__);
-	skd_destruct(skdev);
-	return NULL;
-}
-
 static int skd_cons_skcomp(struct skd_device *skdev)
 {
 	int rc = 0;
@@ -4292,6 +4201,35 @@ err_out:
 	return rc;
 }
 
+static struct fit_sg_descriptor *skd_cons_sg_list(struct skd_device *skdev,
+						  u32 n_sg,
+						  dma_addr_t *ret_dma_addr)
+{
+	struct fit_sg_descriptor *sg_list;
+	u32 nbytes;
+
+	nbytes = sizeof(*sg_list) * n_sg;
+
+	sg_list = pci_alloc_consistent(skdev->pdev, nbytes, ret_dma_addr);
+
+	if (sg_list != NULL) {
+		uint64_t dma_address = *ret_dma_addr;
+		u32 i;
+
+		memset(sg_list, 0, nbytes);
+
+		for (i = 0; i < n_sg - 1; i++) {
+			uint64_t ndp_off;
+			ndp_off = (i + 1) * sizeof(struct fit_sg_descriptor);
+
+			sg_list[i].next_desc_ptr = dma_address + ndp_off;
+		}
+		sg_list[i].next_desc_ptr = 0LL;
+	}
+
+	return sg_list;
+}
+
 static int skd_cons_skreq(struct skd_device *skdev)
 {
 	int rc = 0;
@@ -4465,35 +4403,6 @@ err_out:
 	return rc;
 }
 
-static struct fit_sg_descriptor *skd_cons_sg_list(struct skd_device *skdev,
-						  u32 n_sg,
-						  dma_addr_t *ret_dma_addr)
-{
-	struct fit_sg_descriptor *sg_list;
-	u32 nbytes;
-
-	nbytes = sizeof(*sg_list) * n_sg;
-
-	sg_list = pci_alloc_consistent(skdev->pdev, nbytes, ret_dma_addr);
-
-	if (sg_list != NULL) {
-		uint64_t dma_address = *ret_dma_addr;
-		u32 i;
-
-		memset(sg_list, 0, nbytes);
-
-		for (i = 0; i < n_sg - 1; i++) {
-			uint64_t ndp_off;
-			ndp_off = (i + 1) * sizeof(struct fit_sg_descriptor);
-
-			sg_list[i].next_desc_ptr = dma_address + ndp_off;
-		}
-		sg_list[i].next_desc_ptr = 0LL;
-	}
-
-	return sg_list;
-}
-
 static int skd_cons_disk(struct skd_device *skdev)
 {
 	int rc = 0;
@@ -4550,50 +4459,92 @@ err_out:
 	return rc;
 }
 
+#define SKD_N_DEV_TABLE         16u
+static u32 skd_next_devno;
+
+static struct skd_device *skd_construct(struct pci_dev *pdev)
+{
+	struct skd_device *skdev;
+	int blk_major = skd_major;
+	int rc;
+
+	skdev = kzalloc(sizeof(*skdev), GFP_KERNEL);
+
+	if (!skdev) {
+		pr_err(PFX "(%s): memory alloc failure\n",
+		       pci_name(pdev));
+		return NULL;
+	}
+
+	skdev->state = SKD_DRVR_STATE_LOAD;
+	skdev->pdev = pdev;
+	skdev->devno = skd_next_devno++;
+	skdev->major = blk_major;
+	skdev->irq_type = skd_isr_type;
+	sprintf(skdev->name, DRV_NAME "%d", skdev->devno);
+	skdev->dev_max_queue_depth = 0;
+
+	skdev->num_req_context = skd_max_queue_depth;
+	skdev->num_fitmsg_context = skd_max_queue_depth;
+	skdev->n_special = skd_max_pass_thru;
+	skdev->cur_max_queue_depth = 1;
+	skdev->queue_low_water_mark = 1;
+	skdev->proto_ver = 99;
+	skdev->sgs_per_request = skd_sgs_per_request;
+	skdev->dbg_level = skd_dbg_level;
+
+	atomic_set(&skdev->device_count, 0);
+
+	spin_lock_init(&skdev->lock);
+
+	INIT_WORK(&skdev->completion_worker, skd_completion_worker);
+
+	pr_debug("%s:%s:%d skcomp\n", skdev->name, __func__, __LINE__);
+	rc = skd_cons_skcomp(skdev);
+	if (rc < 0)
+		goto err_out;
+
+	pr_debug("%s:%s:%d skmsg\n", skdev->name, __func__, __LINE__);
+	rc = skd_cons_skmsg(skdev);
+	if (rc < 0)
+		goto err_out;
+
+	pr_debug("%s:%s:%d skreq\n", skdev->name, __func__, __LINE__);
+	rc = skd_cons_skreq(skdev);
+	if (rc < 0)
+		goto err_out;
+
+	pr_debug("%s:%s:%d skspcl\n", skdev->name, __func__, __LINE__);
+	rc = skd_cons_skspcl(skdev);
+	if (rc < 0)
+		goto err_out;
+
+	pr_debug("%s:%s:%d sksb\n", skdev->name, __func__, __LINE__);
+	rc = skd_cons_sksb(skdev);
+	if (rc < 0)
+		goto err_out;
+
+	pr_debug("%s:%s:%d disk\n", skdev->name, __func__, __LINE__);
+	rc = skd_cons_disk(skdev);
+	if (rc < 0)
+		goto err_out;
+
+	pr_debug("%s:%s:%d VICTORY\n", skdev->name, __func__, __LINE__);
+	return skdev;
+
+err_out:
+	pr_debug("%s:%s:%d construct failed\n",
+		 skdev->name, __func__, __LINE__);
+	skd_destruct(skdev);
+	return NULL;
+}
+
 /*
  *****************************************************************************
  * DESTRUCT (FREE)
  *****************************************************************************
  */
 
-static void skd_free_skcomp(struct skd_device *skdev);
-static void skd_free_skmsg(struct skd_device *skdev);
-static void skd_free_skreq(struct skd_device *skdev);
-static void skd_free_skspcl(struct skd_device *skdev);
-static void skd_free_sksb(struct skd_device *skdev);
-static void skd_free_sg_list(struct skd_device *skdev,
-			     struct fit_sg_descriptor *sg_list,
-			     u32 n_sg, dma_addr_t dma_addr);
-static void skd_free_disk(struct skd_device *skdev);
-
-static void skd_destruct(struct skd_device *skdev)
-{
-	if (skdev == NULL)
-		return;
-
-
-	pr_debug("%s:%s:%d disk\n", skdev->name, __func__, __LINE__);
-	skd_free_disk(skdev);
-
-	pr_debug("%s:%s:%d sksb\n", skdev->name, __func__, __LINE__);
-	skd_free_sksb(skdev);
-
-	pr_debug("%s:%s:%d skspcl\n", skdev->name, __func__, __LINE__);
-	skd_free_skspcl(skdev);
-
-	pr_debug("%s:%s:%d skreq\n", skdev->name, __func__, __LINE__);
-	skd_free_skreq(skdev);
-
-	pr_debug("%s:%s:%d skmsg\n", skdev->name, __func__, __LINE__);
-	skd_free_skmsg(skdev);
-
-	pr_debug("%s:%s:%d skcomp\n", skdev->name, __func__, __LINE__);
-	skd_free_skcomp(skdev);
-
-	pr_debug("%s:%s:%d skdev\n", skdev->name, __func__, __LINE__);
-	kfree(skdev);
-}
-
 static void skd_free_skcomp(struct skd_device *skdev)
 {
 	if (skdev->skcomp_table != NULL) {
@@ -4636,6 +4587,19 @@ static void skd_free_skmsg(struct skd_device *skdev)
 	skdev->skmsg_table = NULL;
 }
 
+static void skd_free_sg_list(struct skd_device *skdev,
+			     struct fit_sg_descriptor *sg_list,
+			     u32 n_sg, dma_addr_t dma_addr)
+{
+	if (sg_list != NULL) {
+		u32 nbytes;
+
+		nbytes = sizeof(*sg_list) * n_sg;
+
+		pci_free_consistent(skdev->pdev, nbytes, sg_list, dma_addr);
+	}
+}
+
 static void skd_free_skreq(struct skd_device *skdev)
 {
 	u32 i;
@@ -4732,19 +4696,6 @@ static void skd_free_sksb(struct skd_device *skdev)
 	skspcl->req.sksg_dma_address = 0;
 }
 
-static void skd_free_sg_list(struct skd_device *skdev,
-			     struct fit_sg_descriptor *sg_list,
-			     u32 n_sg, dma_addr_t dma_addr)
-{
-	if (sg_list != NULL) {
-		u32 nbytes;
-
-		nbytes = sizeof(*sg_list) * n_sg;
-
-		pci_free_consistent(skdev->pdev, nbytes, sg_list, dma_addr);
-	}
-}
-
 static void skd_free_disk(struct skd_device *skdev)
 {
 	struct gendisk *disk = skdev->disk;
@@ -4761,8 +4712,34 @@ static void skd_free_disk(struct skd_device *skdev)
 	skdev->disk = NULL;
 }
 
+static void skd_destruct(struct skd_device *skdev)
+{
+	if (skdev == NULL)
+		return;
 
 
+	pr_debug("%s:%s:%d disk\n", skdev->name, __func__, __LINE__);
+	skd_free_disk(skdev);
+
+	pr_debug("%s:%s:%d sksb\n", skdev->name, __func__, __LINE__);
+	skd_free_sksb(skdev);
+
+	pr_debug("%s:%s:%d skspcl\n", skdev->name, __func__, __LINE__);
+	skd_free_skspcl(skdev);
+
+	pr_debug("%s:%s:%d skreq\n", skdev->name, __func__, __LINE__);
+	skd_free_skreq(skdev);
+
+	pr_debug("%s:%s:%d skmsg\n", skdev->name, __func__, __LINE__);
+	skd_free_skmsg(skdev);
+
+	pr_debug("%s:%s:%d skcomp\n", skdev->name, __func__, __LINE__);
+	skd_free_skcomp(skdev);
+
+	pr_debug("%s:%s:%d skdev\n", skdev->name, __func__, __LINE__);
+	kfree(skdev);
+}
+
 /*
  *****************************************************************************
  * BLOCK DEVICE (BDEV) GLUE

From f1a3c6191369632ada5b709997b91a7f15045ff4 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Tue, 5 Nov 2013 12:37:09 +0100
Subject: [PATCH 38/94] skd: fix formatting in skd_s1120.h

Cc: Akhil Bhansali <abhansali@stec-inc.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/skd_s1120.h | 218 ++++++++++++++++++--------------------
 1 file changed, 104 insertions(+), 114 deletions(-)

diff --git a/drivers/block/skd_s1120.h b/drivers/block/skd_s1120.h
index 426581e6d170..61c757ff0161 100644
--- a/drivers/block/skd_s1120.h
+++ b/drivers/block/skd_s1120.h
@@ -17,30 +17,29 @@
 /*
  * Q-channel, 64-bit r/w
  */
-#define FIT_Q_COMMAND               0x400u
-#define  FIT_QCMD_QID_MASK              (0x3 << 1)
-#define  FIT_QCMD_QID0                  (0x0 << 1)
-#define  FIT_QCMD_QID_NORMAL            FIT_QCMD_QID0
-#define  FIT_QCMD_QID1                  (0x1 << 1)
-#define  FIT_QCMD_QID2                  (0x2 << 1)
-#define  FIT_QCMD_QID3                  (0x3 << 1)
-#define  FIT_QCMD_FLUSH_QUEUE           (0ull)      /* add QID */
-#define  FIT_QCMD_MSGSIZE_MASK          (0x3 << 4)
-#define  FIT_QCMD_MSGSIZE_64            (0x0 << 4)
-#define  FIT_QCMD_MSGSIZE_128           (0x1 << 4)
-#define  FIT_QCMD_MSGSIZE_256           (0x2 << 4)
-#define  FIT_QCMD_MSGSIZE_512           (0x3 << 4)
-#define  FIT_QCMD_BASE_ADDRESS_MASK     (0xFFFFFFFFFFFFFFC0ull)
-
+#define FIT_Q_COMMAND			0x400u
+#define FIT_QCMD_QID_MASK		(0x3 << 1)
+#define  FIT_QCMD_QID0			(0x0 << 1)
+#define  FIT_QCMD_QID_NORMAL		FIT_QCMD_QID0
+#define  FIT_QCMD_QID1			(0x1 << 1)
+#define  FIT_QCMD_QID2			(0x2 << 1)
+#define  FIT_QCMD_QID3			(0x3 << 1)
+#define  FIT_QCMD_FLUSH_QUEUE		(0ull)	/* add QID */
+#define  FIT_QCMD_MSGSIZE_MASK		(0x3 << 4)
+#define  FIT_QCMD_MSGSIZE_64		(0x0 << 4)
+#define  FIT_QCMD_MSGSIZE_128		(0x1 << 4)
+#define  FIT_QCMD_MSGSIZE_256		(0x2 << 4)
+#define  FIT_QCMD_MSGSIZE_512		(0x3 << 4)
+#define  FIT_QCMD_BASE_ADDRESS_MASK	(0xFFFFFFFFFFFFFFC0ull)
 
 /*
  * Control, 32-bit r/w
  */
-#define FIT_CONTROL                 0x500u
-#define  FIT_CR_HARD_RESET              (1u << 0u)
-#define  FIT_CR_SOFT_RESET              (1u << 1u)
-#define	 FIT_CR_DIS_TIMESTAMPS		(1u << 6u)
-#define  FIT_CR_ENABLE_INTERRUPTS       (1u << 7u)
+#define FIT_CONTROL			0x500u
+#define  FIT_CR_HARD_RESET		(1u << 0u)
+#define  FIT_CR_SOFT_RESET		(1u << 1u)
+#define  FIT_CR_DIS_TIMESTAMPS		(1u << 6u)
+#define  FIT_CR_ENABLE_INTERRUPTS	(1u << 7u)
 
 /*
  * Status, 32-bit, r/o
@@ -63,88 +62,82 @@
 #define FIT_SR_DRIVE_BUSY_ERASE		0x0B
 #define FIT_SR_DRIVE_FW_BOOTING		0x0C
 #define FIT_SR_DRIVE_NEED_FW_DOWNLOAD	0xFE
-#define FIT_SR_DEVICE_MISSING           0xFF
+#define FIT_SR_DEVICE_MISSING		0xFF
 #define FIT_SR__RESERVED		0xFFFFFF00u
 
 /*
  * FIT_STATUS - Status register data definition
  */
-#define	FIT_SR_STATE_MASK		(0xFF << 0)
-#define	FIT_SR_SIGNATURE		(0xFF << 8)
-#define	FIT_SR_PIO_DMA			(1 << 16)
-
+#define FIT_SR_STATE_MASK		(0xFF << 0)
+#define FIT_SR_SIGNATURE		(0xFF << 8)
+#define FIT_SR_PIO_DMA			(1 << 16)
 
 /*
  * Interrupt status, 32-bit r/w1c (w1c ==> write 1 to clear)
  */
-#define FIT_INT_STATUS_HOST         0x520u
-#define  FIT_ISH_FW_STATE_CHANGE        (1u << 0u)
-#define  FIT_ISH_COMPLETION_POSTED      (1u << 1u)
-#define  FIT_ISH_MSG_FROM_DEV           (1u << 2u)
-#define  FIT_ISH_UNDEFINED_3            (1u << 3u)
-#define  FIT_ISH_UNDEFINED_4            (1u << 4u)
-#define  FIT_ISH_Q0_FULL                (1u << 5u)
-#define  FIT_ISH_Q1_FULL                (1u << 6u)
-#define  FIT_ISH_Q2_FULL                (1u << 7u)
-#define  FIT_ISH_Q3_FULL                (1u << 8u)
-#define  FIT_ISH_QCMD_FIFO_OVERRUN      (1u << 9u)
-#define  FIT_ISH_BAD_EXP_ROM_READ       (1u << 10u)
+#define FIT_INT_STATUS_HOST		0x520u
+#define  FIT_ISH_FW_STATE_CHANGE	(1u << 0u)
+#define  FIT_ISH_COMPLETION_POSTED	(1u << 1u)
+#define  FIT_ISH_MSG_FROM_DEV		(1u << 2u)
+#define  FIT_ISH_UNDEFINED_3		(1u << 3u)
+#define  FIT_ISH_UNDEFINED_4		(1u << 4u)
+#define  FIT_ISH_Q0_FULL		(1u << 5u)
+#define  FIT_ISH_Q1_FULL		(1u << 6u)
+#define  FIT_ISH_Q2_FULL		(1u << 7u)
+#define  FIT_ISH_Q3_FULL		(1u << 8u)
+#define  FIT_ISH_QCMD_FIFO_OVERRUN	(1u << 9u)
+#define  FIT_ISH_BAD_EXP_ROM_READ	(1u << 10u)
 
+#define FIT_INT_DEF_MASK \
+	(FIT_ISH_FW_STATE_CHANGE | \
+	 FIT_ISH_COMPLETION_POSTED | \
+	 FIT_ISH_MSG_FROM_DEV | \
+	 FIT_ISH_Q0_FULL | \
+	 FIT_ISH_Q1_FULL | \
+	 FIT_ISH_Q2_FULL | \
+	 FIT_ISH_Q3_FULL | \
+	 FIT_ISH_QCMD_FIFO_OVERRUN | \
+	 FIT_ISH_BAD_EXP_ROM_READ)
 
-#define FIT_INT_DEF_MASK	\
-		(FIT_ISH_FW_STATE_CHANGE | \
-		 FIT_ISH_COMPLETION_POSTED | \
-		 FIT_ISH_MSG_FROM_DEV | \
-		 FIT_ISH_Q0_FULL | \
-		 FIT_ISH_Q1_FULL | \
-		 FIT_ISH_Q2_FULL | \
-		 FIT_ISH_Q3_FULL | \
-		 FIT_ISH_QCMD_FIFO_OVERRUN | \
-		 FIT_ISH_BAD_EXP_ROM_READ)
+#define FIT_INT_QUEUE_FULL \
+	(FIT_ISH_Q0_FULL | \
+	 FIT_ISH_Q1_FULL | \
+	 FIT_ISH_Q2_FULL | \
+	 FIT_ISH_Q3_FULL)
 
-#define FIT_INT_QUEUE_FULL	\
-		(FIT_ISH_Q0_FULL | \
-		FIT_ISH_Q1_FULL | \
-		FIT_ISH_Q2_FULL | \
-		FIT_ISH_Q3_FULL)
+#define MSI_MSG_NWL_ERROR_0		0x00000000
+#define MSI_MSG_NWL_ERROR_1		0x00000001
+#define MSI_MSG_NWL_ERROR_2		0x00000002
+#define MSI_MSG_NWL_ERROR_3		0x00000003
+#define MSI_MSG_STATE_CHANGE		0x00000004
+#define MSI_MSG_COMPLETION_POSTED	0x00000005
+#define MSI_MSG_MSG_FROM_DEV		0x00000006
+#define MSI_MSG_RESERVED_0		0x00000007
+#define MSI_MSG_RESERVED_1		0x00000008
+#define MSI_MSG_QUEUE_0_FULL		0x00000009
+#define MSI_MSG_QUEUE_1_FULL		0x0000000A
+#define MSI_MSG_QUEUE_2_FULL		0x0000000B
+#define MSI_MSG_QUEUE_3_FULL		0x0000000C
 
+#define FIT_INT_RESERVED_MASK \
+	(FIT_ISH_UNDEFINED_3 | \
+	 FIT_ISH_UNDEFINED_4)
 
-#define MSI_MSG_NWL_ERROR_0             0x00000000
-#define MSI_MSG_NWL_ERROR_1             0x00000001
-#define MSI_MSG_NWL_ERROR_2             0x00000002
-#define MSI_MSG_NWL_ERROR_3             0x00000003
-#define MSI_MSG_STATE_CHANGE            0x00000004
-#define MSI_MSG_COMPLETION_POSTED       0x00000005
-#define MSI_MSG_MSG_FROM_DEV            0x00000006
-#define MSI_MSG_RESERVED_0              0x00000007
-#define MSI_MSG_RESERVED_1              0x00000008
-#define MSI_MSG_QUEUE_0_FULL            0x00000009
-#define MSI_MSG_QUEUE_1_FULL            0x0000000A
-#define MSI_MSG_QUEUE_2_FULL            0x0000000B
-#define MSI_MSG_QUEUE_3_FULL            0x0000000C
-
-
-
-#define FIT_INT_RESERVED_MASK	\
-		(FIT_ISH_UNDEFINED_3 | \
-		FIT_ISH_UNDEFINED_4)
 /*
  * Interrupt mask, 32-bit r/w
  * Bit definitions are the same as FIT_INT_STATUS_HOST
  */
-#define FIT_INT_MASK_HOST           0x528u
-
+#define FIT_INT_MASK_HOST		0x528u
 
 /*
  * Message to device, 32-bit r/w
  */
-#define FIT_MSG_TO_DEVICE           0x540u
+#define FIT_MSG_TO_DEVICE		0x540u
 
 /*
  * Message from device, 32-bit, r/o
  */
-#define FIT_MSG_FROM_DEVICE         0x548u
-
+#define FIT_MSG_FROM_DEVICE		0x548u
 
 /*
  * 32-bit messages to/from device, composition/extraction macros
@@ -153,52 +146,50 @@
 	((((TYPE)  & 0xFFu) << 24u) | \
 	(((PARAM) & 0xFFu) << 16u) | \
 	(((DATA)  & 0xFFFFu) << 0u))
-#define FIT_MXD_TYPE(MXD)               (((MXD) >> 24u) & 0xFFu)
-#define FIT_MXD_PARAM(MXD)              (((MXD) >> 16u) & 0xFFu)
-#define FIT_MXD_DATA(MXD)               (((MXD) >> 0u) & 0xFFFFu)
-
+#define FIT_MXD_TYPE(MXD)		(((MXD) >> 24u) & 0xFFu)
+#define FIT_MXD_PARAM(MXD)		(((MXD) >> 16u) & 0xFFu)
+#define FIT_MXD_DATA(MXD)		(((MXD) >> 0u) & 0xFFFFu)
 
 /*
  * Types of messages to/from device
  */
-#define FIT_MTD_FITFW_INIT              0x01u
-#define FIT_MTD_GET_CMDQ_DEPTH          0x02u
-#define FIT_MTD_SET_COMPQ_DEPTH         0x03u
-#define FIT_MTD_SET_COMPQ_ADDR          0x04u
-#define FIT_MTD_ARM_QUEUE               0x05u
-#define FIT_MTD_CMD_LOG_HOST_ID         0x07u
-#define FIT_MTD_CMD_LOG_TIME_STAMP_LO   0x08u
-#define FIT_MTD_CMD_LOG_TIME_STAMP_HI   0x09u
-#define FIT_MFD_SMART_EXCEEDED          0x10u
-#define FIT_MFD_POWER_DOWN		        0x11u
-#define FIT_MFD_OFFLINE			        0x12u
-#define FIT_MFD_ONLINE			        0x13u
-#define FIT_MFD_FW_RESTARTING		    0x14u
-#define FIT_MFD_PM_ACTIVE		        0x15u
-#define FIT_MFD_PM_STANDBY		        0x16u
-#define FIT_MFD_PM_SLEEP		        0x17u
-#define FIT_MFD_CMD_PROGRESS		    0x18u
+#define FIT_MTD_FITFW_INIT		0x01u
+#define FIT_MTD_GET_CMDQ_DEPTH		0x02u
+#define FIT_MTD_SET_COMPQ_DEPTH		0x03u
+#define FIT_MTD_SET_COMPQ_ADDR		0x04u
+#define FIT_MTD_ARM_QUEUE		0x05u
+#define FIT_MTD_CMD_LOG_HOST_ID		0x07u
+#define FIT_MTD_CMD_LOG_TIME_STAMP_LO	0x08u
+#define FIT_MTD_CMD_LOG_TIME_STAMP_HI	0x09u
+#define FIT_MFD_SMART_EXCEEDED		0x10u
+#define FIT_MFD_POWER_DOWN		0x11u
+#define FIT_MFD_OFFLINE			0x12u
+#define FIT_MFD_ONLINE			0x13u
+#define FIT_MFD_FW_RESTARTING		0x14u
+#define FIT_MFD_PM_ACTIVE		0x15u
+#define FIT_MFD_PM_STANDBY		0x16u
+#define FIT_MFD_PM_SLEEP		0x17u
+#define FIT_MFD_CMD_PROGRESS		0x18u
 
-#define FIT_MTD_DEBUG                   0xFEu
-#define FIT_MFD_DEBUG                   0xFFu
+#define FIT_MTD_DEBUG			0xFEu
+#define FIT_MFD_DEBUG			0xFFu
 
 #define FIT_MFD_MASK			(0xFFu)
 #define FIT_MFD_DATA_MASK		(0xFFu)
 #define FIT_MFD_MSG(x)			(((x) >> 24) & FIT_MFD_MASK)
 #define FIT_MFD_DATA(x)			((x) & FIT_MFD_MASK)
 
-
 /*
  * Extra arg to FIT_MSG_TO_DEVICE, 64-bit r/w
  * Used to set completion queue address (FIT_MTD_SET_COMPQ_ADDR)
  * (was Response buffer in docs)
  */
-#define FIT_MSG_TO_DEVICE_ARG       0x580u
+#define FIT_MSG_TO_DEVICE_ARG		0x580u
 
 /*
  * Hardware (ASIC) version, 32-bit r/o
  */
-#define FIT_HW_VERSION              0x588u
+#define FIT_HW_VERSION			0x588u
 
 /*
  * Scatter/gather list descriptor.
@@ -213,8 +204,8 @@ struct fit_sg_descriptor {
 	uint64_t next_desc_ptr;
 };
 
-#define FIT_SGD_CONTROL_NOT_LAST    0x000u
-#define FIT_SGD_CONTROL_LAST        0x40Eu
+#define FIT_SGD_CONTROL_NOT_LAST	0x000u
+#define FIT_SGD_CONTROL_LAST		0x40Eu
 
 /*
  * Header at the beginning of a FIT message. The header
@@ -228,9 +219,9 @@ struct fit_msg_hdr {
 	uint8_t _reserved[62];
 };
 
-#define FIT_PROTOCOL_ID_FIT     1
-#define FIT_PROTOCOL_ID_SSDI    2
-#define FIT_PROTOCOL_ID_SOFIT   3
+#define FIT_PROTOCOL_ID_FIT	1
+#define FIT_PROTOCOL_ID_SSDI	2
+#define FIT_PROTOCOL_ID_SOFIT	3
 
 
 #define FIT_PROTOCOL_MINOR_VER(mtd_val) ((mtd_val >> 16) & 0xF)
@@ -251,7 +242,7 @@ struct fit_msg_hdr {
  * Command_context is opaque and taken verbatim from the SSDI command.
  * All other fields are big endian.
  */
-#define FIT_PROTOCOL_VERSION_0          0
+#define FIT_PROTOCOL_VERSION_0		0
 
 /*
  *  Protocol major version 1 completion entry.
@@ -264,8 +255,8 @@ struct fit_completion_entry_v1 {
 	uint8_t		status;  /* SCSI status */
 	uint8_t		cycle;
 };
-#define FIT_PROTOCOL_VERSION_1          1
-#define FIT_PROTOCOL_VERSION_CURRENT    FIT_PROTOCOL_VERSION_1
+#define FIT_PROTOCOL_VERSION_1		1
+#define FIT_PROTOCOL_VERSION_CURRENT	FIT_PROTOCOL_VERSION_1
 
 struct fit_comp_error_info {
 	uint8_t		type:7; /* 00: Bits0-6 indicates the type of sense data. */
@@ -293,10 +284,9 @@ struct fit_comp_error_info {
 
 
 /* Task management constants */
-#define SOFT_TASK_SIMPLE            0x00
-#define SOFT_TASK_HEAD_OF_QUEUE     0x01
-#define SOFT_TASK_ORDERED           0x02
-
+#define SOFT_TASK_SIMPLE		0x00
+#define SOFT_TASK_HEAD_OF_QUEUE		0x01
+#define SOFT_TASK_ORDERED		0x02
 
 /* Version zero has the last 32 bits reserved,
  * Version one has the last 32 bits sg_list_len_bytes;

From bfe11d6de1c416cea4f3f0f35f864162063ce3fa Mon Sep 17 00:00:00 2001
From: Roger Pau Monne <roger.pau@citrix.com>
Date: Tue, 29 Oct 2013 18:31:14 +0100
Subject: [PATCH 39/94] xen-blkfront: restore the non-persistent data path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When persistent grants were added they were always used, even if the
backend doesn't have this feature (there's no harm in always using the
same set of pages). This restores the old data path when the backend
doesn't have persistent grants, removing the burden of doing a memcpy
when it is not actually needed.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reported-by: Felipe Franciosi <felipe.franciosi@citrix.com>
Cc: Felipe Franciosi <felipe.franciosi@citrix.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
[v2: Fix up whitespace issues]
---
 drivers/block/xen-blkfront.c | 125 ++++++++++++++++++++++++++++-------
 1 file changed, 100 insertions(+), 25 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 5b8a15483a4c..432db1b59b00 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -121,7 +121,8 @@ struct blkfront_info
 	struct work_struct work;
 	struct gnttab_free_callback callback;
 	struct blk_shadow shadow[BLK_RING_SIZE];
-	struct list_head persistent_gnts;
+	struct list_head grants;
+	struct list_head indirect_pages;
 	unsigned int persistent_gnts_c;
 	unsigned long shadow_free;
 	unsigned int feature_flush;
@@ -200,15 +201,17 @@ static int fill_grant_buffer(struct blkfront_info *info, int num)
 		if (!gnt_list_entry)
 			goto out_of_memory;
 
-		granted_page = alloc_page(GFP_NOIO);
-		if (!granted_page) {
-			kfree(gnt_list_entry);
-			goto out_of_memory;
+		if (info->feature_persistent) {
+			granted_page = alloc_page(GFP_NOIO);
+			if (!granted_page) {
+				kfree(gnt_list_entry);
+				goto out_of_memory;
+			}
+			gnt_list_entry->pfn = page_to_pfn(granted_page);
 		}
 
-		gnt_list_entry->pfn = page_to_pfn(granted_page);
 		gnt_list_entry->gref = GRANT_INVALID_REF;
-		list_add(&gnt_list_entry->node, &info->persistent_gnts);
+		list_add(&gnt_list_entry->node, &info->grants);
 		i++;
 	}
 
@@ -216,9 +219,10 @@ static int fill_grant_buffer(struct blkfront_info *info, int num)
 
 out_of_memory:
 	list_for_each_entry_safe(gnt_list_entry, n,
-	                         &info->persistent_gnts, node) {
+	                         &info->grants, node) {
 		list_del(&gnt_list_entry->node);
-		__free_page(pfn_to_page(gnt_list_entry->pfn));
+		if (info->feature_persistent)
+			__free_page(pfn_to_page(gnt_list_entry->pfn));
 		kfree(gnt_list_entry);
 		i--;
 	}
@@ -227,13 +231,14 @@ out_of_memory:
 }
 
 static struct grant *get_grant(grant_ref_t *gref_head,
+                               unsigned long pfn,
                                struct blkfront_info *info)
 {
 	struct grant *gnt_list_entry;
 	unsigned long buffer_mfn;
 
-	BUG_ON(list_empty(&info->persistent_gnts));
-	gnt_list_entry = list_first_entry(&info->persistent_gnts, struct grant,
+	BUG_ON(list_empty(&info->grants));
+	gnt_list_entry = list_first_entry(&info->grants, struct grant,
 	                                  node);
 	list_del(&gnt_list_entry->node);
 
@@ -245,6 +250,10 @@ static struct grant *get_grant(grant_ref_t *gref_head,
 	/* Assign a gref to this page */
 	gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head);
 	BUG_ON(gnt_list_entry->gref == -ENOSPC);
+	if (!info->feature_persistent) {
+		BUG_ON(!pfn);
+		gnt_list_entry->pfn = pfn;
+	}
 	buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn);
 	gnttab_grant_foreign_access_ref(gnt_list_entry->gref,
 	                                info->xbdev->otherend_id,
@@ -480,22 +489,34 @@ static int blkif_queue_request(struct request *req)
 
 			if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
 			    (i % SEGS_PER_INDIRECT_FRAME == 0)) {
+				unsigned long pfn;
+
 				if (segments)
 					kunmap_atomic(segments);
 
 				n = i / SEGS_PER_INDIRECT_FRAME;
-				gnt_list_entry = get_grant(&gref_head, info);
+				if (!info->feature_persistent) {
+					struct page *indirect_page;
+
+					/* Fetch a pre-allocated page to use for indirect grefs */
+					BUG_ON(list_empty(&info->indirect_pages));
+					indirect_page = list_first_entry(&info->indirect_pages,
+					                                 struct page, lru);
+					list_del(&indirect_page->lru);
+					pfn = page_to_pfn(indirect_page);
+				}
+				gnt_list_entry = get_grant(&gref_head, pfn, info);
 				info->shadow[id].indirect_grants[n] = gnt_list_entry;
 				segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
 				ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
 			}
 
-			gnt_list_entry = get_grant(&gref_head, info);
+			gnt_list_entry = get_grant(&gref_head, page_to_pfn(sg_page(sg)), info);
 			ref = gnt_list_entry->gref;
 
 			info->shadow[id].grants_used[i] = gnt_list_entry;
 
-			if (rq_data_dir(req)) {
+			if (rq_data_dir(req) && info->feature_persistent) {
 				char *bvec_data;
 				void *shared_data;
 
@@ -907,21 +928,36 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 		blk_stop_queue(info->rq);
 
 	/* Remove all persistent grants */
-	if (!list_empty(&info->persistent_gnts)) {
+	if (!list_empty(&info->grants)) {
 		list_for_each_entry_safe(persistent_gnt, n,
-		                         &info->persistent_gnts, node) {
+		                         &info->grants, node) {
 			list_del(&persistent_gnt->node);
 			if (persistent_gnt->gref != GRANT_INVALID_REF) {
 				gnttab_end_foreign_access(persistent_gnt->gref,
 				                          0, 0UL);
 				info->persistent_gnts_c--;
 			}
-			__free_page(pfn_to_page(persistent_gnt->pfn));
+			if (info->feature_persistent)
+				__free_page(pfn_to_page(persistent_gnt->pfn));
 			kfree(persistent_gnt);
 		}
 	}
 	BUG_ON(info->persistent_gnts_c != 0);
 
+	/*
+	 * Remove indirect pages, this only happens when using indirect
+	 * descriptors but not persistent grants
+	 */
+	if (!list_empty(&info->indirect_pages)) {
+		struct page *indirect_page, *n;
+
+		BUG_ON(info->feature_persistent);
+		list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) {
+			list_del(&indirect_page->lru);
+			__free_page(indirect_page);
+		}
+	}
+
 	for (i = 0; i < BLK_RING_SIZE; i++) {
 		/*
 		 * Clear persistent grants present in requests already
@@ -936,7 +972,8 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 		for (j = 0; j < segs; j++) {
 			persistent_gnt = info->shadow[i].grants_used[j];
 			gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
-			__free_page(pfn_to_page(persistent_gnt->pfn));
+			if (info->feature_persistent)
+				__free_page(pfn_to_page(persistent_gnt->pfn));
 			kfree(persistent_gnt);
 		}
 
@@ -995,7 +1032,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
 	nseg = s->req.operation == BLKIF_OP_INDIRECT ?
 		s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
 
-	if (bret->operation == BLKIF_OP_READ) {
+	if (bret->operation == BLKIF_OP_READ && info->feature_persistent) {
 		/*
 		 * Copy the data received from the backend into the bvec.
 		 * Since bv_offset can be different than 0, and bv_len different
@@ -1023,7 +1060,10 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
 			 * we add it at the head of the list, so it will be
 			 * reused first.
 			 */
-			list_add(&s->grants_used[i]->node, &info->persistent_gnts);
+			if (!info->feature_persistent)
+				pr_alert_ratelimited("backed has not unmapped grant: %u\n",
+						     s->grants_used[i]->gref);
+			list_add(&s->grants_used[i]->node, &info->grants);
 			info->persistent_gnts_c++;
 		} else {
 			/*
@@ -1034,19 +1074,29 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
 			 */
 			gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL);
 			s->grants_used[i]->gref = GRANT_INVALID_REF;
-			list_add_tail(&s->grants_used[i]->node, &info->persistent_gnts);
+			list_add_tail(&s->grants_used[i]->node, &info->grants);
 		}
 	}
 	if (s->req.operation == BLKIF_OP_INDIRECT) {
 		for (i = 0; i < INDIRECT_GREFS(nseg); i++) {
 			if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) {
-				list_add(&s->indirect_grants[i]->node, &info->persistent_gnts);
+				if (!info->feature_persistent)
+					pr_alert_ratelimited("backed has not unmapped grant: %u\n",
+							     s->indirect_grants[i]->gref);
+				list_add(&s->indirect_grants[i]->node, &info->grants);
 				info->persistent_gnts_c++;
 			} else {
+				struct page *indirect_page;
+
 				gnttab_end_foreign_access(s->indirect_grants[i]->gref, 0, 0UL);
+				/*
+				 * Add the used indirect page back to the list of
+				 * available pages for indirect grefs.
+				 */
+				indirect_page = pfn_to_page(s->indirect_grants[i]->pfn);
+				list_add(&indirect_page->lru, &info->indirect_pages);
 				s->indirect_grants[i]->gref = GRANT_INVALID_REF;
-				list_add_tail(&s->indirect_grants[i]->node,
-				              &info->persistent_gnts);
+				list_add_tail(&s->indirect_grants[i]->node, &info->grants);
 			}
 		}
 	}
@@ -1341,7 +1391,8 @@ static int blkfront_probe(struct xenbus_device *dev,
 	spin_lock_init(&info->io_lock);
 	info->xbdev = dev;
 	info->vdevice = vdevice;
-	INIT_LIST_HEAD(&info->persistent_gnts);
+	INIT_LIST_HEAD(&info->grants);
+	INIT_LIST_HEAD(&info->indirect_pages);
 	info->persistent_gnts_c = 0;
 	info->connected = BLKIF_STATE_DISCONNECTED;
 	INIT_WORK(&info->work, blkif_restart_queue);
@@ -1637,6 +1688,23 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
 	if (err)
 		goto out_of_memory;
 
+	if (!info->feature_persistent && info->max_indirect_segments) {
+		/*
+		 * We are using indirect descriptors but not persistent
+		 * grants, we need to allocate a set of pages that can be
+		 * used for mapping indirect grefs
+		 */
+		int num = INDIRECT_GREFS(segs) * BLK_RING_SIZE;
+
+		BUG_ON(!list_empty(&info->indirect_pages));
+		for (i = 0; i < num; i++) {
+			struct page *indirect_page = alloc_page(GFP_NOIO);
+			if (!indirect_page)
+				goto out_of_memory;
+			list_add(&indirect_page->lru, &info->indirect_pages);
+		}
+	}
+
 	for (i = 0; i < BLK_RING_SIZE; i++) {
 		info->shadow[i].grants_used = kzalloc(
 			sizeof(info->shadow[i].grants_used[0]) * segs,
@@ -1667,6 +1735,13 @@ out_of_memory:
 		kfree(info->shadow[i].indirect_grants);
 		info->shadow[i].indirect_grants = NULL;
 	}
+	if (!list_empty(&info->indirect_pages)) {
+		struct page *indirect_page, *n;
+		list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) {
+			list_del(&indirect_page->lru);
+			__free_page(indirect_page);
+		}
+	}
 	return -ENOMEM;
 }
 

From 49c2856af779bb1a36fa598804e4c0b2251fae57 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 6 Nov 2013 09:24:02 +0100
Subject: [PATCH 40/94] pktcdvd: debugfs functions return NULL on error

My static checker complains correctly that this is potential NULL
dereference because debugfs functions return NULL on error.  They return
an ERR_PTR if they are configured out.

We don't need to check for ERR_PTR because if debugfs is stubbed out the
dummy functions won't complain about that.  We don't need to check the
values before calling debugfs_remove() because that accepts ERR_PTRs and
NULL pointers.

We don't need to set pkt->dfs_f_info to NULL in pkt_debugfs_dev_new()
because it was initialized with kzalloc() so I have removed that.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/block/pktcdvd.c | 22 ++++------------------
 1 file changed, 4 insertions(+), 18 deletions(-)

diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 56188475cfd3..ff8668c5efb1 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -473,45 +473,31 @@ static void pkt_debugfs_dev_new(struct pktcdvd_device *pd)
 {
 	if (!pkt_debugfs_root)
 		return;
-	pd->dfs_f_info = NULL;
 	pd->dfs_d_root = debugfs_create_dir(pd->name, pkt_debugfs_root);
-	if (IS_ERR(pd->dfs_d_root)) {
-		pd->dfs_d_root = NULL;
+	if (!pd->dfs_d_root)
 		return;
-	}
+
 	pd->dfs_f_info = debugfs_create_file("info", S_IRUGO,
 				pd->dfs_d_root, pd, &debug_fops);
-	if (IS_ERR(pd->dfs_f_info)) {
-		pd->dfs_f_info = NULL;
-		return;
-	}
 }
 
 static void pkt_debugfs_dev_remove(struct pktcdvd_device *pd)
 {
 	if (!pkt_debugfs_root)
 		return;
-	if (pd->dfs_f_info)
-		debugfs_remove(pd->dfs_f_info);
+	debugfs_remove(pd->dfs_f_info);
+	debugfs_remove(pd->dfs_d_root);
 	pd->dfs_f_info = NULL;
-	if (pd->dfs_d_root)
-		debugfs_remove(pd->dfs_d_root);
 	pd->dfs_d_root = NULL;
 }
 
 static void pkt_debugfs_init(void)
 {
 	pkt_debugfs_root = debugfs_create_dir(DRIVER_NAME, NULL);
-	if (IS_ERR(pkt_debugfs_root)) {
-		pkt_debugfs_root = NULL;
-		return;
-	}
 }
 
 static void pkt_debugfs_cleanup(void)
 {
-	if (!pkt_debugfs_root)
-		return;
 	debugfs_remove(pkt_debugfs_root);
 	pkt_debugfs_root = NULL;
 }

From f7cb20f03dc6dff1b19942cf3dda6d154c86f29b Mon Sep 17 00:00:00 2001
From: Ben Harris <bjh21@cam.ac.uk>
Date: Fri, 18 Oct 2013 21:23:35 +0100
Subject: [PATCH 41/94] floppy: Correct documentation of driver options when
 used as a module.

The options have to be passed space-separated and prefixed by "floppy=",
rather than separately and unprefixed.

This fixes <http://bugs.debian.org/726655>.

Signed-off-by: Ben Harris <bjh21@cam.ac.uk>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 Documentation/blockdev/floppy.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Documentation/blockdev/floppy.txt b/Documentation/blockdev/floppy.txt
index 470fe4b5e379..e2240f5ab64d 100644
--- a/Documentation/blockdev/floppy.txt
+++ b/Documentation/blockdev/floppy.txt
@@ -39,15 +39,15 @@ Module configuration options
 ============================
 
  If you use the floppy driver as a module, use the following syntax:
-modprobe floppy <options>
+modprobe floppy floppy="<options>"
 
 Example:
- modprobe floppy omnibook messages
+ modprobe floppy floppy="omnibook messages"
 
  If you need certain options enabled every time you load the floppy driver,
 you can put:
 
- options floppy omnibook messages
+ options floppy floppy="omnibook messages"
 
 in a configuration file in /etc/modprobe.d/.
 

From 1fa8455deb92e9ec7756df23030e73b2d28eeca7 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Sun, 10 Nov 2013 21:55:27 -0800
Subject: [PATCH 42/94] bcache: Fix dirty_data accounting

Dirty data accounting wasn't quite right - firstly, we were adding the key we're
inserting after it could have merged with another dirty key already in the
btree, and secondly we could sometimes pass the wrong offset to
bcache_dev_sectors_dirty_add() for dirty data we were overwriting - which is
important when tracking dirty data by stripe.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
Cc: linux-stable <stable@vger.kernel.org> # >= v3.10
---
 drivers/md/bcache/btree.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index f42fc7ed9cd6..d1734d9d9c79 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1742,6 +1742,9 @@ static bool fix_overlapping_extents(struct btree *b,
 		if (bkey_cmp(insert, k) < 0) {
 			bch_cut_front(insert, k);
 		} else {
+			if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
+				old_offset = KEY_START(insert);
+
 			if (bkey_written(b, k) &&
 			    bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
 				/*
@@ -1803,6 +1806,10 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
 		if (fix_overlapping_extents(b, k, &iter, op))
 			return false;
 
+		if (KEY_DIRTY(k))
+			bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
+						     KEY_START(k), KEY_SIZE(k));
+
 		while (m != end(i) &&
 		       bkey_cmp(k, &START_KEY(m)) > 0)
 			prev = m, m = bkey_next(m);
@@ -1831,10 +1838,6 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
 insert:	shift_keys(b, m, k);
 copy:	bkey_copy(m, k);
 merged:
-	if (KEY_DIRTY(k))
-		bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
-					     KEY_START(k), KEY_SIZE(k));
-
 	bch_check_keys(b, "%u for %s", status, op_type(op));
 
 	if (b->level && !KEY_OFFSET(k))

From 7857d5d470ec53bae187d144c69065ad3c0ebc21 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Tue, 8 Oct 2013 15:50:46 -0700
Subject: [PATCH 43/94] bcache: Fix a journalling performance bug

---
 drivers/md/bcache/journal.c | 47 ++++++++++++++++++++-----------------
 drivers/md/bcache/journal.h |  3 ++-
 2 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 8435f81e5d85..7c9e6bf6aaba 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -554,32 +554,26 @@ static void journal_write_endio(struct bio *bio, int error)
 	struct journal_write *w = bio->bi_private;
 
 	cache_set_err_on(error, w->c, "journal io error");
-	closure_put(&w->c->journal.io.cl);
+	closure_put(&w->c->journal.io);
 }
 
 static void journal_write(struct closure *);
 
 static void journal_write_done(struct closure *cl)
 {
-	struct journal *j = container_of(cl, struct journal, io.cl);
-	struct cache_set *c = container_of(j, struct cache_set, journal);
-
+	struct journal *j = container_of(cl, struct journal, io);
 	struct journal_write *w = (j->cur == j->w)
 		? &j->w[1]
 		: &j->w[0];
 
 	__closure_wake_up(&w->wait);
-
-	if (c->journal_delay_ms)
-		closure_delay(&j->io, msecs_to_jiffies(c->journal_delay_ms));
-
-	continue_at(cl, journal_write, system_wq);
+	continue_at_nobarrier(cl, journal_write, system_wq);
 }
 
 static void journal_write_unlocked(struct closure *cl)
 	__releases(c->journal.lock)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);
+	struct cache_set *c = container_of(cl, struct cache_set, journal.io);
 	struct cache *ca;
 	struct journal_write *w = c->journal.cur;
 	struct bkey *k = &c->journal.key;
@@ -660,7 +654,7 @@ static void journal_write_unlocked(struct closure *cl)
 
 static void journal_write(struct closure *cl)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);
+	struct cache_set *c = container_of(cl, struct cache_set, journal.io);
 
 	spin_lock(&c->journal.lock);
 	journal_write_unlocked(cl);
@@ -669,7 +663,10 @@ static void journal_write(struct closure *cl)
 static void __journal_try_write(struct cache_set *c, bool noflush)
 	__releases(c->journal.lock)
 {
-	struct closure *cl = &c->journal.io.cl;
+	struct closure *cl = &c->journal.io;
+	struct journal_write *w = c->journal.cur;
+
+	w->need_write = true;
 
 	if (!closure_trylock(cl, &c->cl))
 		spin_unlock(&c->journal.lock);
@@ -688,18 +685,24 @@ void bch_journal_meta(struct cache_set *c, struct closure *cl)
 
 	if (CACHE_SYNC(&c->sb)) {
 		spin_lock(&c->journal.lock);
-
 		w = c->journal.cur;
-		w->need_write = true;
 
 		if (cl)
 			BUG_ON(!closure_wait(&w->wait, cl));
 
-		closure_flush(&c->journal.io);
 		__journal_try_write(c, true);
 	}
 }
 
+static void journal_write_work(struct work_struct *work)
+{
+	struct cache_set *c = container_of(to_delayed_work(work),
+					   struct cache_set,
+					   journal.work);
+	spin_lock(&c->journal.lock);
+	journal_try_write(c);
+}
+
 /*
  * Entry point to the journalling code - bio_insert() and btree_invalidate()
  * pass bch_journal() a list of keys to be journalled, and then
@@ -739,7 +742,6 @@ void bch_journal(struct closure *cl)
 	}
 
 	w = c->journal.cur;
-	w->need_write = true;
 	b = __set_blocks(w->data, w->data->keys + n, c);
 
 	if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS ||
@@ -755,8 +757,6 @@ void bch_journal(struct closure *cl)
 
 		BUG_ON(!closure_wait(&w->wait, cl));
 
-		closure_flush(&c->journal.io);
-
 		journal_try_write(c);
 		continue_at(cl, bch_journal, bcache_wq);
 	}
@@ -768,11 +768,15 @@ void bch_journal(struct closure *cl)
 	atomic_inc(op->journal);
 
 	if (op->flush_journal) {
-		closure_flush(&c->journal.io);
 		closure_wait(&w->wait, cl->parent);
+		journal_try_write(c);
+	} else if (!w->need_write) {
+		schedule_delayed_work(&c->journal.work,
+				      msecs_to_jiffies(c->journal_delay_ms));
+		spin_unlock(&c->journal.lock);
+	} else {
+		spin_unlock(&c->journal.lock);
 	}
-
-	journal_try_write(c);
 out:
 	bch_btree_insert_async(cl);
 }
@@ -790,6 +794,7 @@ int bch_journal_alloc(struct cache_set *c)
 
 	closure_init_unlocked(&j->io);
 	spin_lock_init(&j->lock);
+	INIT_DELAYED_WORK(&j->work, journal_write_work);
 
 	c->journal_delay_ms = 100;
 
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index 3d7851274b04..3ca93d3d566c 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -140,7 +140,8 @@ struct journal {
 	spinlock_t		lock;
 	/* used when waiting because the journal was full */
 	struct closure_waitlist	wait;
-	struct closure_with_timer io;
+	struct closure		io;
+	struct delayed_work	work;
 
 	/* Number of blocks free in the bucket(s) we're currently writing to */
 	unsigned		blocks_free;

From dd9ec84da530d3a92e99ad1f52edae44533cc27f Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Thu, 24 Oct 2013 17:12:52 -0700
Subject: [PATCH 44/94] bcache: Fix a lockdep splat

bch_keybuf_del() takes a spinlock that can't be taken in interrupt context -
whoops. Fortunately, this code isn't enabled by default (you have to toggle a
sysfs thing).

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/movinggc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 1a3b4f4786c3..dd8a035c5ae1 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -113,7 +113,7 @@ static void write_moving(struct closure *cl)
 		bch_insert_data(&s->op.cl);
 	}
 
-	continue_at(cl, write_moving_finish, NULL);
+	continue_at(cl, write_moving_finish, bch_gc_wq);
 }
 
 static void read_moving_submit(struct closure *cl)

From 49b1212dfacfe51f951442563d1617bb06aac575 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 24 Jul 2013 17:16:09 -0700
Subject: [PATCH 45/94] bcache: Use blkdev_issue_discard()

The old asynchronous discard code was really a relic from when all the
allocation code was asynchronous - now that allocation runs out of a
dedicated thread there's no point in keeping around all that complicated
machinery.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/alloc.c  | 114 ++++---------------------------------
 drivers/md/bcache/bcache.h |  10 ----
 drivers/md/bcache/super.c  |   4 --
 3 files changed, 11 insertions(+), 117 deletions(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index e45f5575fd4d..e033b0203b68 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -63,13 +63,12 @@
 #include "bcache.h"
 #include "btree.h"
 
+#include <linux/blkdev.h>
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/random.h>
 #include <trace/events/bcache.h>
 
-#define MAX_IN_FLIGHT_DISCARDS		8U
-
 /* Bucket heap / gen */
 
 uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
@@ -121,75 +120,6 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
 	mutex_unlock(&c->bucket_lock);
 }
 
-/* Discard/TRIM */
-
-struct discard {
-	struct list_head	list;
-	struct work_struct	work;
-	struct cache		*ca;
-	long			bucket;
-
-	struct bio		bio;
-	struct bio_vec		bv;
-};
-
-static void discard_finish(struct work_struct *w)
-{
-	struct discard *d = container_of(w, struct discard, work);
-	struct cache *ca = d->ca;
-	char buf[BDEVNAME_SIZE];
-
-	if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) {
-		pr_notice("discard error on %s, disabling",
-			 bdevname(ca->bdev, buf));
-		d->ca->discard = 0;
-	}
-
-	mutex_lock(&ca->set->bucket_lock);
-
-	fifo_push(&ca->free, d->bucket);
-	list_add(&d->list, &ca->discards);
-	atomic_dec(&ca->discards_in_flight);
-
-	mutex_unlock(&ca->set->bucket_lock);
-
-	closure_wake_up(&ca->set->bucket_wait);
-	wake_up_process(ca->alloc_thread);
-
-	closure_put(&ca->set->cl);
-}
-
-static void discard_endio(struct bio *bio, int error)
-{
-	struct discard *d = container_of(bio, struct discard, bio);
-	schedule_work(&d->work);
-}
-
-static void do_discard(struct cache *ca, long bucket)
-{
-	struct discard *d = list_first_entry(&ca->discards,
-					     struct discard, list);
-
-	list_del(&d->list);
-	d->bucket = bucket;
-
-	atomic_inc(&ca->discards_in_flight);
-	closure_get(&ca->set->cl);
-
-	bio_init(&d->bio);
-
-	d->bio.bi_sector	= bucket_to_sector(ca->set, d->bucket);
-	d->bio.bi_bdev		= ca->bdev;
-	d->bio.bi_rw		= REQ_WRITE|REQ_DISCARD;
-	d->bio.bi_max_vecs	= 1;
-	d->bio.bi_io_vec	= d->bio.bi_inline_vecs;
-	d->bio.bi_size		= bucket_bytes(ca);
-	d->bio.bi_end_io	= discard_endio;
-	bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
-
-	submit_bio(0, &d->bio);
-}
-
 /* Allocation */
 
 static inline bool can_inc_bucket_gen(struct bucket *b)
@@ -398,16 +328,18 @@ static int bch_allocator_thread(void *arg)
 			else
 				break;
 
-			allocator_wait(ca, (int) fifo_free(&ca->free) >
-				       atomic_read(&ca->discards_in_flight));
-
 			if (ca->discard) {
-				allocator_wait(ca, !list_empty(&ca->discards));
-				do_discard(ca, bucket);
-			} else {
-				fifo_push(&ca->free, bucket);
-				closure_wake_up(&ca->set->bucket_wait);
+				mutex_unlock(&ca->set->bucket_lock);
+				blkdev_issue_discard(ca->bdev,
+					bucket_to_sector(ca->set, bucket),
+					ca->sb.block_size, GFP_KERNEL, 0);
+				mutex_lock(&ca->set->bucket_lock);
 			}
+
+			allocator_wait(ca, !fifo_full(&ca->free));
+
+			fifo_push(&ca->free, bucket);
+			closure_wake_up(&ca->set->bucket_wait);
 		}
 
 		/*
@@ -556,22 +488,8 @@ int bch_cache_allocator_start(struct cache *ca)
 	return 0;
 }
 
-void bch_cache_allocator_exit(struct cache *ca)
-{
-	struct discard *d;
-
-	while (!list_empty(&ca->discards)) {
-		d = list_first_entry(&ca->discards, struct discard, list);
-		cancel_work_sync(&d->work);
-		list_del(&d->list);
-		kfree(d);
-	}
-}
-
 int bch_cache_allocator_init(struct cache *ca)
 {
-	unsigned i;
-
 	/*
 	 * Reserve:
 	 * Prio/gen writes first
@@ -589,15 +507,5 @@ int bch_cache_allocator_init(struct cache *ca)
 	ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
 		ca->watermark[WATERMARK_MOVINGGC];
 
-	for (i = 0; i < MAX_IN_FLIGHT_DISCARDS; i++) {
-		struct discard *d = kzalloc(sizeof(*d), GFP_KERNEL);
-		if (!d)
-			return -ENOMEM;
-
-		d->ca = ca;
-		INIT_WORK(&d->work, discard_finish);
-		list_add(&d->list, &ca->discards);
-	}
-
 	return 0;
 }
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 0f12382aa35d..bf37474a6888 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -620,15 +620,6 @@ struct cache {
 
 	bool			discard; /* Get rid of? */
 
-	/*
-	 * We preallocate structs for issuing discards to buckets, and keep them
-	 * on this list when they're not in use; do_discard() issues discards
-	 * whenever there's work to do and is called by free_some_buckets() and
-	 * when a discard finishes.
-	 */
-	atomic_t		discards_in_flight;
-	struct list_head	discards;
-
 	struct journal_device	journal;
 
 	/* The rest of this all shows up in sysfs */
@@ -1222,7 +1213,6 @@ int bch_btree_cache_alloc(struct cache_set *);
 void bch_moving_init_cache_set(struct cache_set *);
 
 int bch_cache_allocator_start(struct cache *ca);
-void bch_cache_allocator_exit(struct cache *ca);
 int bch_cache_allocator_init(struct cache *ca);
 
 void bch_debug_exit(void);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 547c4c57b052..fd37342a7eb5 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1725,8 +1725,6 @@ void bch_cache_release(struct kobject *kobj)
 	if (ca->set)
 		ca->set->cache[ca->sb.nr_this_dev] = NULL;
 
-	bch_cache_allocator_exit(ca);
-
 	bio_split_pool_free(&ca->bio_split_hook);
 
 	free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
@@ -1758,8 +1756,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
 	__module_get(THIS_MODULE);
 	kobject_init(&ca->kobj, &bch_cache_ktype);
 
-	INIT_LIST_HEAD(&ca->discards);
-
 	bio_init(&ca->journal.bio);
 	ca->journal.bio.bi_max_vecs = 8;
 	ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;

From 77c320eb46e216c17aee5c943949229ccfed6904 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Thu, 11 Jul 2013 19:42:51 -0700
Subject: [PATCH 46/94] bcache: Add on error panic/unregister setting

Works kind of like the ext4 setting, to panic or remount read only on
errors.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bcache.h  |  6 ++++++
 drivers/md/bcache/journal.c |  7 +++----
 drivers/md/bcache/super.c   |  6 +++++-
 drivers/md/bcache/sysfs.c   | 21 +++++++++++++++++++++
 4 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index bf37474a6888..578615604be5 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -843,8 +843,14 @@ struct cache_set {
 	atomic_long_t		cache_read_races;
 	atomic_long_t		writeback_keys_done;
 	atomic_long_t		writeback_keys_failed;
+
+	enum			{
+		ON_ERROR_UNREGISTER,
+		ON_ERROR_PANIC,
+	}			on_error;
 	unsigned		error_limit;
 	unsigned		error_decay;
+
 	unsigned short		journal_delay_ms;
 	unsigned		verify:1;
 	unsigned		key_merging_disabled:1;
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 7c9e6bf6aaba..9e8775872dba 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -305,10 +305,9 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list,
 	list_for_each_entry(i, list, list) {
 		BUG_ON(i->pin && atomic_read(i->pin) != 1);
 
-		if (n != i->j.seq)
-			pr_err(
-		"journal entries %llu-%llu missing! (replaying %llu-%llu)\n",
-		n, i->j.seq - 1, start, end);
+		cache_set_err_on(n != i->j.seq, s,
+"bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)",
+				 n, i->j.seq - 1, start, end);
 
 		for (k = i->j.start;
 		     k < end(&i->j);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index fd37342a7eb5..74f2e902d876 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1260,7 +1260,8 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
 {
 	va_list args;
 
-	if (test_bit(CACHE_SET_STOPPING, &c->flags))
+	if (c->on_error != ON_ERROR_PANIC &&
+	    test_bit(CACHE_SET_STOPPING, &c->flags))
 		return false;
 
 	/* XXX: we can be called from atomic context
@@ -1275,6 +1276,9 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
 
 	printk(", disabling caching\n");
 
+	if (c->on_error == ON_ERROR_PANIC)
+		panic("panic forced after error\n");
+
 	bch_cache_set_unregister(c);
 	return true;
 }
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 924dcfdae111..4211d82179fc 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -21,6 +21,12 @@ static const char * const cache_replacement_policies[] = {
 	NULL
 };
 
+static const char * const error_actions[] = {
+	"unregister",
+	"panic",
+	NULL
+};
+
 write_attribute(attach);
 write_attribute(detach);
 write_attribute(unregister);
@@ -90,6 +96,7 @@ rw_attribute(discard);
 rw_attribute(running);
 rw_attribute(label);
 rw_attribute(readahead);
+rw_attribute(errors);
 rw_attribute(io_error_limit);
 rw_attribute(io_error_halflife);
 rw_attribute(verify);
@@ -492,6 +499,10 @@ lock_root:
 	sysfs_print(writeback_keys_failed,
 		    atomic_long_read(&c->writeback_keys_failed));
 
+	if (attr == &sysfs_errors)
+		return bch_snprint_string_list(buf, PAGE_SIZE, error_actions,
+					       c->on_error);
+
 	/* See count_io_errors for why 88 */
 	sysfs_print(io_error_halflife,	c->error_decay * 88);
 	sysfs_print(io_error_limit,	c->error_limit >> IO_ERROR_SHIFT);
@@ -569,6 +580,15 @@ STORE(__bch_cache_set)
 	sysfs_strtoul(congested_write_threshold_us,
 		      c->congested_write_threshold_us);
 
+	if (attr == &sysfs_errors) {
+		ssize_t v = bch_read_string_list(buf, error_actions);
+
+		if (v < 0)
+			return v;
+
+		c->on_error = v;
+	}
+
 	if (attr == &sysfs_io_error_limit)
 		c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT;
 
@@ -620,6 +640,7 @@ static struct attribute *bch_cache_set_files[] = {
 	&sysfs_average_key_size,
 	&sysfs_dirty_data,
 
+	&sysfs_errors,
 	&sysfs_io_error_limit,
 	&sysfs_io_error_halflife,
 	&sysfs_congested,

From 2d679fc75678551485df62274edaed452becd16d Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Sat, 17 Aug 2013 02:13:15 -0700
Subject: [PATCH 47/94] bcache: Stripe size isn't necessarily a power of two

Originally I got this right... except that the divides didn't use
do_div(), which broke 32 bit kernels. When I went to fix that, I forgot
that the raid stripe size usually isn't a power of two... doh

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bcache.h    |  2 +-
 drivers/md/bcache/super.c     |  7 +++----
 drivers/md/bcache/sysfs.c     |  2 +-
 drivers/md/bcache/writeback.c | 33 +++++++++++++++++----------------
 drivers/md/bcache/writeback.h |  8 +++++---
 5 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 578615604be5..6e836f22f276 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -437,7 +437,7 @@ struct bcache_device {
 	int			flush_done;
 
 	uint64_t		nr_stripes;
-	unsigned		stripe_size_bits;
+	unsigned		stripe_size;
 	atomic_t		*stripe_sectors_dirty;
 
 	unsigned long		sectors_dirty_last;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 74f2e902d876..d3169c0652f8 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -761,11 +761,10 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 	struct request_queue *q;
 	size_t n;
 
-	if (!d->stripe_size_bits)
-		d->stripe_size_bits = 31;
+	if (!d->stripe_size)
+		d->stripe_size = 1 << 31;
 
-	d->nr_stripes = round_up(sectors, 1 << d->stripe_size_bits) >>
-		d->stripe_size_bits;
+	d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
 
 	if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t))
 		return -ENOMEM;
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 4211d82179fc..b3a66f17231d 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -157,7 +157,7 @@ SHOW(__bch_cached_dev)
 	sysfs_hprint(dirty_data,
 		     bcache_dev_sectors_dirty(&dc->disk) << 9);
 
-	sysfs_hprint(stripe_size,	(1 << dc->disk.stripe_size_bits) << 9);
+	sysfs_hprint(stripe_size,	dc->disk.stripe_size << 9);
 	var_printf(partial_stripes_expensive,	"%u");
 
 	var_printf(sequential_merge,	"%i");
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index ba3ee48320f2..b842fbfbf1db 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -114,25 +114,25 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k)
 
 static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k)
 {
-	uint64_t stripe;
+	uint64_t stripe = KEY_START(k);
 	unsigned nr_sectors = KEY_SIZE(k);
 	struct cached_dev *dc = container_of(buf, struct cached_dev,
 					     writeback_keys);
-	unsigned stripe_size = 1 << dc->disk.stripe_size_bits;
 
 	if (!KEY_DIRTY(k))
 		return false;
 
-	stripe = KEY_START(k) >> dc->disk.stripe_size_bits;
-	while (1) {
-		if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) !=
-		    stripe_size)
-			return false;
+	do_div(stripe, dc->disk.stripe_size);
 
-		if (nr_sectors <= stripe_size)
+	while (1) {
+		if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) ==
+		    dc->disk.stripe_size)
 			return true;
 
-		nr_sectors -= stripe_size;
+		if (nr_sectors <= dc->disk.stripe_size)
+			return false;
+
+		nr_sectors -= dc->disk.stripe_size;
 		stripe++;
 	}
 }
@@ -186,11 +186,12 @@ static void refill_dirty(struct closure *cl)
 
 		for (i = 0; i < dc->disk.nr_stripes; i++)
 			if (atomic_read(dc->disk.stripe_sectors_dirty + i) ==
-			    1 << dc->disk.stripe_size_bits)
+			    dc->disk.stripe_size)
 				goto full_stripes;
 
 		goto normal_refill;
 full_stripes:
+		searched_from_start = false;	/* not searching entire btree */
 		bch_refill_keybuf(dc->disk.c, buf, &end,
 				  dirty_full_stripe_pred);
 	} else {
@@ -252,19 +253,19 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
 				  uint64_t offset, int nr_sectors)
 {
 	struct bcache_device *d = c->devices[inode];
-	unsigned stripe_size, stripe_offset;
-	uint64_t stripe;
+	unsigned stripe_offset;
+	uint64_t stripe = offset;
 
 	if (!d)
 		return;
 
-	stripe_size = 1 << d->stripe_size_bits;
-	stripe = offset >> d->stripe_size_bits;
-	stripe_offset = offset & (stripe_size - 1);
+	do_div(stripe, d->stripe_size);
+
+	stripe_offset = offset & (d->stripe_size - 1);
 
 	while (nr_sectors) {
 		int s = min_t(unsigned, abs(nr_sectors),
-			      stripe_size - stripe_offset);
+			      d->stripe_size - stripe_offset);
 
 		if (nr_sectors < 0)
 			s = -s;
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index c91f61bb95b6..34961888b5a9 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -18,16 +18,18 @@ static inline bool bcache_dev_stripe_dirty(struct bcache_device *d,
 					   uint64_t offset,
 					   unsigned nr_sectors)
 {
-	uint64_t stripe = offset >> d->stripe_size_bits;
+	uint64_t stripe = offset;
+
+	do_div(stripe, d->stripe_size);
 
 	while (1) {
 		if (atomic_read(d->stripe_sectors_dirty + stripe))
 			return true;
 
-		if (nr_sectors <= 1 << d->stripe_size_bits)
+		if (nr_sectors <= d->stripe_size)
 			return false;
 
-		nr_sectors -= 1 << d->stripe_size_bits;
+		nr_sectors -= d->stripe_size;
 		stripe++;
 	}
 }

From 8304ad4dc818ffd701c2f3e90683b5b8013f44e2 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 24 Jul 2013 17:20:00 -0700
Subject: [PATCH 48/94] bcache: Remove unnecessary check in should_split()

Checking i->seq was redundant, because since ages ago we always
initialize the new bset when advancing b->written

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/btree.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 3333d3723633..8a1c7e6bbbe3 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -353,7 +353,7 @@ static inline void rw_unlock(bool w, struct btree *b)
 		    _w == insert_lock(op, _b))				\
 			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
 		rw_unlock(_w, _b);					\
-		bch_cannibalize_unlock(c, &(op)->cl);		\
+		bch_cannibalize_unlock(c, &(op)->cl);			\
 	} while (_r == -EINTR);						\
 									\
 	_r;								\
@@ -363,8 +363,7 @@ static inline bool should_split(struct btree *b)
 {
 	struct bset *i = write_block(b);
 	return b->written >= btree_blocks(b) ||
-		(i->seq == b->sets[0].data->seq &&
-		 b->written + __set_blocks(i, i->keys + 15, b->c)
+		(b->written + __set_blocks(i, i->keys + 15, b->c)
 		 > btree_blocks(b));
 }
 

From d6fd3b11cea82346837957feab25b0be48aa424c Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 24 Jul 2013 17:20:19 -0700
Subject: [PATCH 49/94] bcache: Explicitly track btree node's parent

This is prep work for the reworked btree insertion code.

The way we set b->parent is ugly and hacky... the problem is, when
btree_split() or garbage collection splits or rewrites a btree node, the
parent changes for all its (potentially already cached) children.

I may change this later and add some code to look through the btree node
cache and find all our cached child nodes and change the parent pointer
then...

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/btree.c | 14 ++++++++++----
 drivers/md/bcache/btree.h | 16 ++++++++++------
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index d1734d9d9c79..87299baa8a1b 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -884,6 +884,7 @@ out:
 
 	lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
 	b->level	= level;
+	b->parent	= (void *) ~0UL;
 
 	mca_reinit(b);
 
@@ -1898,7 +1899,7 @@ out:
 
 static int btree_split(struct btree *b, struct btree_op *op)
 {
-	bool split, root = b == b->c->root;
+	bool split;
 	struct btree *n1, *n2 = NULL, *n3 = NULL;
 	uint64_t start_time = local_clock();
 
@@ -1920,7 +1921,7 @@ static int btree_split(struct btree *b, struct btree_op *op)
 		if (IS_ERR(n2))
 			goto err_free1;
 
-		if (root) {
+		if (!b->parent) {
 			n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl);
 			if (IS_ERR(n3))
 				goto err_free2;
@@ -1928,7 +1929,8 @@ static int btree_split(struct btree *b, struct btree_op *op)
 
 		bch_btree_insert_keys(n1, op);
 
-		/* Has to be a linear search because we don't have an auxiliary
+		/*
+		 * Has to be a linear search because we don't have an auxiliary
 		 * search tree yet
 		 */
 
@@ -1960,6 +1962,8 @@ static int btree_split(struct btree *b, struct btree_op *op)
 	bch_btree_node_write(n1, &op->cl);
 
 	if (n3) {
+		/* Depth increases, make a new root */
+
 		bkey_copy_key(&n3->key, &MAX_KEY);
 		bch_btree_insert_keys(n3, op);
 		bch_btree_node_write(n3, &op->cl);
@@ -1967,7 +1971,9 @@ static int btree_split(struct btree *b, struct btree_op *op)
 		closure_sync(&op->cl);
 		bch_btree_set_root(n3);
 		rw_unlock(true, n3);
-	} else if (root) {
+	} else if (!b->parent) {
+		/* Root filled up but didn't need to be split */
+
 		op->keys.top = op->keys.bottom;
 		closure_sync(&op->cl);
 		bch_btree_set_root(n1);
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 8a1c7e6bbbe3..6d2fb7550706 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -125,6 +125,7 @@ struct btree {
 	unsigned long		seq;
 	struct rw_semaphore	lock;
 	struct cache_set	*c;
+	struct btree		*parent;
 
 	unsigned long		flags;
 	uint16_t		written;	/* would be nice to kill */
@@ -327,12 +328,13 @@ static inline void rw_unlock(bool w, struct btree *b)
 ({									\
 	int _r, l = (b)->level - 1;					\
 	bool _w = l <= (op)->lock;					\
-	struct btree *_b = bch_btree_node_get((b)->c, key, l, op);	\
-	if (!IS_ERR(_b)) {						\
-		_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);		\
-		rw_unlock(_w, _b);					\
+	struct btree *_child = bch_btree_node_get((b)->c, key, l, op);	\
+	if (!IS_ERR(_child)) {						\
+		_child->parent = (b);					\
+		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
+		rw_unlock(_w, _child);					\
 	} else								\
-		_r = PTR_ERR(_b);					\
+		_r = PTR_ERR(_child);					\
 	_r;								\
 })
 
@@ -350,8 +352,10 @@ static inline void rw_unlock(bool w, struct btree *b)
 		bool _w = insert_lock(op, _b);				\
 		rw_lock(_w, _b, _b->level);				\
 		if (_b == (c)->root &&					\
-		    _w == insert_lock(op, _b))				\
+		    _w == insert_lock(op, _b)) {			\
+			_b->parent = NULL;				\
 			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
+		}							\
 		rw_unlock(_w, _b);					\
 		bch_cannibalize_unlock(c, &(op)->cl);			\
 	} while (_r == -EINTR);						\

From 26c949f8062cb9221a28b2228104f1cc5b265097 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Tue, 10 Sep 2013 18:41:15 -0700
Subject: [PATCH 50/94] bcache: Add btree_insert_node()

The flow of control in the old btree insertion code was rather -
backwards; we'd recurse down the btree (in btree_insert_recurse()), and
then if we needed to split the keys to be inserted into the parent node
would be effectively returned up to btree_insert_recurse(), which would
notice there was more work to do and finish the insertion.

The main problem with this was that the full logic for btree insertion
could only be used by calling btree_insert_recurse; if you'd gotten to a
btree leaf some other way and had a key to insert, if it turned out that
node needed to be split you were SOL.

This inverts the flow of control so btree_insert_node() does _full_
btree insertion, including splitting - and takes a (leaf) btree node to
insert into as a parameter.

This means we can now _correctly_ handle cache misses - for cache
misses, we need to insert a fake "check" key into the btree when we
discover we have a cache miss - while we still have the btree locked.
Previously, if the btree node was full inserting a cache miss would just
fail.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bset.c  |  12 +++
 drivers/md/bcache/bset.h  |   1 +
 drivers/md/bcache/btree.c | 158 ++++++++++++++++++++++----------------
 3 files changed, 105 insertions(+), 66 deletions(-)

diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 22d1ae72c282..830eede86d56 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -73,6 +73,18 @@ struct bkey *bch_keylist_pop(struct keylist *l)
 	return l->top = k;
 }
 
+void bch_keylist_pop_front(struct keylist *l)
+{
+	struct bkey *next = bkey_next(l->bottom);
+	size_t bytes = ((void *) l->top) - ((void *) next);
+
+	memmove(l->bottom,
+		next,
+		bytes);
+
+	l->top = ((void *) l->bottom) + bytes;
+}
+
 /* Pointer validation */
 
 bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index ae115a253d73..a3627d0cd88b 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -267,6 +267,7 @@ static inline void bch_keylist_free(struct keylist *l)
 
 void bch_keylist_copy(struct keylist *, struct keylist *);
 struct bkey *bch_keylist_pop(struct keylist *);
+void bch_keylist_pop_front(struct keylist *);
 int bch_keylist_realloc(struct keylist *, int, struct cache_set *);
 
 void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 87299baa8a1b..c2722e075f18 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1849,15 +1849,43 @@ merged:
 	return true;
 }
 
-static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op)
+static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
+				  struct keylist *insert_keys)
 {
 	bool ret = false;
-	struct bkey *k;
 	unsigned oldsize = bch_count_data(b);
 
-	while ((k = bch_keylist_pop(&op->keys))) {
-		bkey_put(b->c, k, b->level);
-		ret |= btree_insert_key(b, op, k);
+	BUG_ON(!insert_lock(op, b));
+
+	while (!bch_keylist_empty(insert_keys)) {
+		struct bkey *k = insert_keys->bottom;
+
+		if (b->level ||
+		    bkey_cmp(k, &b->key) <= 0) {
+			bkey_put(b->c, k, b->level);
+
+			ret |= btree_insert_key(b, op, k);
+			bch_keylist_pop_front(insert_keys);
+		} else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
+#if 0
+			if (op->type == BTREE_REPLACE) {
+				bkey_put(b->c, k, b->level);
+				bch_keylist_pop_front(insert_keys);
+				op->insert_collision = true;
+				break;
+			}
+#endif
+			BKEY_PADDED(key) temp;
+			bkey_copy(&temp.key, insert_keys->bottom);
+
+			bch_cut_back(&b->key, &temp.key);
+			bch_cut_front(&b->key, insert_keys->bottom);
+
+			ret |= btree_insert_key(b, op, &temp.key);
+			break;
+		} else {
+			break;
+		}
 	}
 
 	BUG_ON(bch_count_data(b) < oldsize);
@@ -1897,7 +1925,9 @@ out:
 	return ret;
 }
 
-static int btree_split(struct btree *b, struct btree_op *op)
+static int btree_split(struct btree *b, struct btree_op *op,
+		       struct keylist *insert_keys,
+		       struct keylist *parent_keys)
 {
 	bool split;
 	struct btree *n1, *n2 = NULL, *n3 = NULL;
@@ -1927,7 +1957,7 @@ static int btree_split(struct btree *b, struct btree_op *op)
 				goto err_free2;
 		}
 
-		bch_btree_insert_keys(n1, op);
+		bch_btree_insert_keys(n1, op, insert_keys);
 
 		/*
 		 * Has to be a linear search because we don't have an auxiliary
@@ -1949,23 +1979,23 @@ static int btree_split(struct btree *b, struct btree_op *op)
 
 		bkey_copy_key(&n2->key, &b->key);
 
-		bch_keylist_add(&op->keys, &n2->key);
+		bch_keylist_add(parent_keys, &n2->key);
 		bch_btree_node_write(n2, &op->cl);
 		rw_unlock(true, n2);
 	} else {
 		trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
 
-		bch_btree_insert_keys(n1, op);
+		bch_btree_insert_keys(n1, op, insert_keys);
 	}
 
-	bch_keylist_add(&op->keys, &n1->key);
+	bch_keylist_add(parent_keys, &n1->key);
 	bch_btree_node_write(n1, &op->cl);
 
 	if (n3) {
 		/* Depth increases, make a new root */
 
 		bkey_copy_key(&n3->key, &MAX_KEY);
-		bch_btree_insert_keys(n3, op);
+		bch_btree_insert_keys(n3, op, parent_keys);
 		bch_btree_node_write(n3, &op->cl);
 
 		closure_sync(&op->cl);
@@ -1974,22 +2004,22 @@ static int btree_split(struct btree *b, struct btree_op *op)
 	} else if (!b->parent) {
 		/* Root filled up but didn't need to be split */
 
-		op->keys.top = op->keys.bottom;
+		parent_keys->top = parent_keys->bottom;
 		closure_sync(&op->cl);
 		bch_btree_set_root(n1);
 	} else {
 		unsigned i;
 
-		bkey_copy(op->keys.top, &b->key);
-		bkey_copy_key(op->keys.top, &ZERO_KEY);
+		bkey_copy(parent_keys->top, &b->key);
+		bkey_copy_key(parent_keys->top, &ZERO_KEY);
 
 		for (i = 0; i < KEY_PTRS(&b->key); i++) {
 			uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1;
 
-			SET_PTR_GEN(op->keys.top, i, g);
+			SET_PTR_GEN(parent_keys->top, i, g);
 		}
 
-		bch_keylist_push(&op->keys);
+		bch_keylist_push(parent_keys);
 		closure_sync(&op->cl);
 		atomic_inc(&b->c->prio_blocked);
 	}
@@ -2018,11 +2048,50 @@ err:
 	return -ENOMEM;
 }
 
-static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
-				    struct keylist *stack_keys)
+static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
+				 struct keylist *insert_keys)
+{
+	int ret = 0;
+	struct keylist split_keys;
+
+	bch_keylist_init(&split_keys);
+
+	BUG_ON(b->level);
+
+	do {
+		if (should_split(b)) {
+			if (current->bio_list) {
+				op->lock = b->c->root->level + 1;
+				ret = -EAGAIN;
+			} else if (op->lock <= b->c->root->level) {
+				op->lock = b->c->root->level + 1;
+				ret = -EINTR;
+			} else {
+				struct btree *parent = b->parent;
+
+				ret = btree_split(b, op, insert_keys,
+						  &split_keys);
+				insert_keys = &split_keys;
+				b = parent;
+			}
+		} else {
+			BUG_ON(write_block(b) != b->sets[b->nsets].data);
+
+			if (bch_btree_insert_keys(b, op, insert_keys)) {
+				if (!b->level)
+					bch_btree_leaf_dirty(b, op);
+				else
+					bch_btree_node_write(b, &op->cl);
+			}
+		}
+	} while (!bch_keylist_empty(&split_keys));
+
+	return ret;
+}
+
+static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op)
 {
 	if (b->level) {
-		int ret;
 		struct bkey *insert = op->keys.bottom;
 		struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert));
 
@@ -2034,53 +2103,10 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
 			return -EIO;
 		}
 
-		if (bkey_cmp(insert, k) > 0) {
-			unsigned i;
-
-			if (op->type == BTREE_REPLACE) {
-				__bkey_put(b->c, insert);
-				op->keys.top = op->keys.bottom;
-				op->insert_collision = true;
-				return 0;
-			}
-
-			for (i = 0; i < KEY_PTRS(insert); i++)
-				atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin);
-
-			bkey_copy(stack_keys->top, insert);
-
-			bch_cut_back(k, insert);
-			bch_cut_front(k, stack_keys->top);
-
-			bch_keylist_push(stack_keys);
-		}
-
-		ret = btree(insert_recurse, k, b, op, stack_keys);
-		if (ret)
-			return ret;
+		return btree(insert_recurse, k, b, op);
+	} else {
+		return bch_btree_insert_node(b, op, &op->keys);
 	}
-
-	if (!bch_keylist_empty(&op->keys)) {
-		if (should_split(b)) {
-			if (op->lock <= b->c->root->level) {
-				BUG_ON(b->level);
-				op->lock = b->c->root->level + 1;
-				return -EINTR;
-			}
-			return btree_split(b, op);
-		}
-
-		BUG_ON(write_block(b) != b->sets[b->nsets].data);
-
-		if (bch_btree_insert_keys(b, op)) {
-			if (!b->level)
-				bch_btree_leaf_dirty(b, op);
-			else
-				bch_btree_node_write(b, &op->cl);
-		}
-	}
-
-	return 0;
 }
 
 int bch_btree_insert(struct btree_op *op, struct cache_set *c)
@@ -2106,7 +2132,7 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c)
 			op->lock = 0;
 		}
 
-		ret = btree_root(insert_recurse, c, op, &stack_keys);
+		ret = btree_root(insert_recurse, c, op);
 
 		if (ret == -EAGAIN) {
 			ret = 0;

From 403b6cdeb1a38d896ffcb1f99ddcfd4e343b5d69 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 24 Jul 2013 17:22:44 -0700
Subject: [PATCH 51/94] bcache: Insert multiple keys at a time

We'll often end up with a list of adjacent keys to insert -
because bch_data_insert() may have to fragment the data it writes.

Originally, to simplify things and avoid having to deal with corner
cases bch_btree_insert() would pass keys from this list one at a time to
btree_insert_recurse() - mainly because the list of keys might span leaf
nodes, so it was easier this way.

With the btree_insert_node() refactoring, it's now a lot easier to just
pass down the whole list and have btree_insert_recurse() iterate over
leaf nodes until it's done.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/btree.c | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index c2722e075f18..60d06465d772 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1858,10 +1858,14 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
 	BUG_ON(!insert_lock(op, b));
 
 	while (!bch_keylist_empty(insert_keys)) {
+		struct bset *i = write_block(b);
 		struct bkey *k = insert_keys->bottom;
 
-		if (b->level ||
-		    bkey_cmp(k, &b->key) <= 0) {
+		if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c)
+		    > btree_blocks(b))
+			break;
+
+		if (bkey_cmp(k, &b->key) <= 0) {
 			bkey_put(b->c, k, b->level);
 
 			ret |= btree_insert_key(b, op, k);
@@ -1888,6 +1892,8 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
 		}
 	}
 
+	BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
+
 	BUG_ON(bch_count_data(b) < oldsize);
 	return ret;
 }
@@ -2073,6 +2079,8 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
 						  &split_keys);
 				insert_keys = &split_keys;
 				b = parent;
+				if (!ret)
+					ret = -EINTR;
 			}
 		} else {
 			BUG_ON(write_block(b) != b->sets[b->nsets].data);
@@ -2091,6 +2099,9 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
 
 static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op)
 {
+	if (bch_keylist_empty(&op->keys))
+		return 0;
+
 	if (b->level) {
 		struct bkey *insert = op->keys.bottom;
 		struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert));
@@ -2112,7 +2123,6 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op)
 int bch_btree_insert(struct btree_op *op, struct cache_set *c)
 {
 	int ret = 0;
-	struct keylist stack_keys;
 
 	/*
 	 * Don't want to block with the btree locked unless we have to,
@@ -2121,17 +2131,9 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c)
 	clear_closure_blocking(&op->cl);
 
 	BUG_ON(bch_keylist_empty(&op->keys));
-	bch_keylist_copy(&stack_keys, &op->keys);
-	bch_keylist_init(&op->keys);
-
-	while (!bch_keylist_empty(&stack_keys) ||
-	       !bch_keylist_empty(&op->keys)) {
-		if (bch_keylist_empty(&op->keys)) {
-			bch_keylist_add(&op->keys,
-					bch_keylist_pop(&stack_keys));
-			op->lock = 0;
-		}
 
+	while (!bch_keylist_empty(&op->keys)) {
+		op->lock = 0;
 		ret = btree_root(insert_recurse, c, op);
 
 		if (ret == -EAGAIN) {
@@ -2143,14 +2145,11 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c)
 			pr_err("error %i trying to insert key for %s",
 			       ret, op_type(op));
 
-			while ((k = bch_keylist_pop(&stack_keys) ?:
-				    bch_keylist_pop(&op->keys)))
+			while ((k = bch_keylist_pop(&op->keys)))
 				bkey_put(c, k, 0);
 		}
 	}
 
-	bch_keylist_free(&stack_keys);
-
 	if (op->journal)
 		atomic_dec_bug(op->journal);
 	op->journal = NULL;

From e7c590eb63509c5d5f48a390d23aa25f4417ac96 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Tue, 10 Sep 2013 18:39:16 -0700
Subject: [PATCH 52/94] bcache: Convert btree_insert_check_key() to
 btree_insert_node()

This was the main point of all this refactoring - now,
btree_insert_check_key() won't fail just because the leaf node happened
to be full.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bcache.h  |  8 ----
 drivers/md/bcache/btree.c   | 82 +++++++++++++++++++++----------------
 drivers/md/bcache/btree.h   |  6 ++-
 drivers/md/bcache/request.c | 55 +++++++++++++------------
 4 files changed, 79 insertions(+), 72 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 6e836f22f276..10ce0c825fce 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -1091,14 +1091,6 @@ do {									\
 	for (b = (ca)->buckets + (ca)->sb.first_bucket;			\
 	     b < (ca)->buckets + (ca)->sb.nbuckets; b++)
 
-static inline void __bkey_put(struct cache_set *c, struct bkey *k)
-{
-	unsigned i;
-
-	for (i = 0; i < KEY_PTRS(k); i++)
-		atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
-}
-
 static inline void cached_dev_put(struct cached_dev *dc)
 {
 	if (atomic_dec_and_test(&dc->count))
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 60d06465d772..08a85327431c 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -118,6 +118,15 @@ void bch_btree_op_init_stack(struct btree_op *op)
 
 /* Btree key manipulation */
 
+void __bkey_put(struct cache_set *c, struct bkey *k)
+{
+	unsigned i;
+
+	for (i = 0; i < KEY_PTRS(k); i++)
+		if (ptr_available(c, k, i))
+			atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
+}
+
 static void bkey_put(struct cache_set *c, struct bkey *k, int level)
 {
 	if ((level && KEY_OFFSET(k)) || !level)
@@ -1855,8 +1864,6 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
 	bool ret = false;
 	unsigned oldsize = bch_count_data(b);
 
-	BUG_ON(!insert_lock(op, b));
-
 	while (!bch_keylist_empty(insert_keys)) {
 		struct bset *i = write_block(b);
 		struct bkey *k = insert_keys->bottom;
@@ -1898,39 +1905,6 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
 	return ret;
 }
 
-bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
-				   struct bio *bio)
-{
-	bool ret = false;
-	uint64_t btree_ptr = b->key.ptr[0];
-	unsigned long seq = b->seq;
-	BKEY_PADDED(k) tmp;
-
-	rw_unlock(false, b);
-	rw_lock(true, b, b->level);
-
-	if (b->key.ptr[0] != btree_ptr ||
-	    b->seq != seq + 1 ||
-	    should_split(b))
-		goto out;
-
-	op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio));
-
-	SET_KEY_PTRS(&op->replace, 1);
-	get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t));
-
-	SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV);
-
-	bkey_copy(&tmp.k, &op->replace);
-
-	BUG_ON(op->type != BTREE_INSERT);
-	BUG_ON(!btree_insert_key(b, op, &tmp.k));
-	ret = true;
-out:
-	downgrade_write(&b->lock);
-	return ret;
-}
-
 static int btree_split(struct btree *b, struct btree_op *op,
 		       struct keylist *insert_keys,
 		       struct keylist *parent_keys)
@@ -2097,6 +2071,44 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
 	return ret;
 }
 
+int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
+			       struct bkey *check_key)
+{
+	int ret = -EINTR;
+	uint64_t btree_ptr = b->key.ptr[0];
+	unsigned long seq = b->seq;
+	struct keylist insert;
+	bool upgrade = op->lock == -1;
+
+	bch_keylist_init(&insert);
+
+	if (upgrade) {
+		rw_unlock(false, b);
+		rw_lock(true, b, b->level);
+
+		if (b->key.ptr[0] != btree_ptr ||
+		    b->seq != seq + 1)
+			goto out;
+	}
+
+	SET_KEY_PTRS(check_key, 1);
+	get_random_bytes(&check_key->ptr[0], sizeof(uint64_t));
+
+	SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV);
+
+	bch_keylist_add(&insert, check_key);
+
+	BUG_ON(op->type != BTREE_INSERT);
+
+	ret = bch_btree_insert_node(b, op, &insert);
+
+	BUG_ON(!ret && !bch_keylist_empty(&insert));
+out:
+	if (upgrade)
+		downgrade_write(&b->lock);
+	return ret;
+}
+
 static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op)
 {
 	if (bch_keylist_empty(&op->keys))
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 6d2fb7550706..73bd62105e39 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -216,6 +216,8 @@ static inline struct bkey *bch_btree_iter_init(struct btree *b,
 	return __bch_btree_iter_init(b, iter, search, b->sets);
 }
 
+void __bkey_put(struct cache_set *c, struct bkey *k);
+
 /* Looping macros */
 
 #define for_each_cached_btree(b, c, iter)				\
@@ -380,8 +382,8 @@ struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *);
 struct btree *bch_btree_node_get(struct cache_set *, struct bkey *,
 				int, struct btree_op *);
 
-bool bch_btree_insert_check_key(struct btree *, struct btree_op *,
-				   struct bio *);
+int bch_btree_insert_check_key(struct btree *, struct btree_op *,
+			       struct bkey *);
 int bch_btree_insert(struct btree_op *, struct cache_set *);
 
 int bch_btree_search_recurse(struct btree *, struct btree_op *);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 2a7f0dd6abab..9ed334ca484d 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -869,35 +869,39 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 				 struct bio *bio, unsigned sectors)
 {
 	int ret = 0;
-	unsigned reada;
+	unsigned reada = 0;
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 	struct bio *miss;
 
+	if (s->cache_miss || s->op.skip) {
+		miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+		if (miss == bio)
+			s->op.lookup_done = true;
+		goto out_submit;
+	}
+
+	if (!(bio->bi_rw & REQ_RAHEAD) &&
+	    !(bio->bi_rw & REQ_META) &&
+	    s->op.c->gc_stats.in_use < CUTOFF_CACHE_READA)
+		reada = min_t(sector_t, dc->readahead >> 9,
+			      bdev_sectors(bio->bi_bdev) - bio_end_sector(bio));
+
+	s->cache_bio_sectors = min(sectors, bio_sectors(bio) + reada);
+
+	s->op.replace = KEY(s->op.inode, bio->bi_sector +
+			    s->cache_bio_sectors, s->cache_bio_sectors);
+
+	ret = bch_btree_insert_check_key(b, &s->op, &s->op.replace);
+	if (ret)
+		return ret;
+
 	miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
 	if (miss == bio)
 		s->op.lookup_done = true;
+	else
+		/* btree_search_recurse()'s btree iterator is no good anymore */
+		ret = -EINTR;
 
-	miss->bi_end_io		= request_endio;
-	miss->bi_private	= &s->cl;
-
-	if (s->cache_miss || s->op.skip)
-		goto out_submit;
-
-	if (miss != bio ||
-	    (bio->bi_rw & REQ_RAHEAD) ||
-	    (bio->bi_rw & REQ_META) ||
-	    s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA)
-		reada = 0;
-	else {
-		reada = min(dc->readahead >> 9,
-			    sectors - bio_sectors(miss));
-
-		if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev))
-			reada = bdev_sectors(miss->bi_bdev) -
-				bio_end_sector(miss);
-	}
-
-	s->cache_bio_sectors = bio_sectors(miss) + reada;
 	s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT,
 			DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS),
 			dc->disk.bio_split);
@@ -912,11 +916,6 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 	s->op.cache_bio->bi_end_io	= request_endio;
 	s->op.cache_bio->bi_private	= &s->cl;
 
-	/* btree_search_recurse()'s btree iterator is no good anymore */
-	ret = -EINTR;
-	if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio))
-		goto out_put;
-
 	bch_bio_map(s->op.cache_bio, NULL);
 	if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO))
 		goto out_put;
@@ -931,6 +930,8 @@ out_put:
 	bio_put(s->op.cache_bio);
 	s->op.cache_bio = NULL;
 out_submit:
+	miss->bi_end_io		= request_endio;
+	miss->bi_private	= &s->cl;
 	closure_bio_submit(miss, &s->cl, s->d);
 	return ret;
 }

From 4f3d40147b8d0ce7055e241e1d263e0aa2b2b46d Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Tue, 10 Sep 2013 18:46:36 -0700
Subject: [PATCH 53/94] bcache: Add explicit keylist arg to btree_insert()

Some refactoring - better to explicitly pass stuff around instead of
having it all in the "big bag of state", struct btree_op. Going to prune
struct btree_op quite a bit over time.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/btree.c     | 26 ++++++++++++++------------
 drivers/md/bcache/btree.h     |  2 +-
 drivers/md/bcache/journal.c   |  2 +-
 drivers/md/bcache/request.c   |  2 +-
 drivers/md/bcache/writeback.c |  2 +-
 5 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 08a85327431c..fc3cae5c94b2 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -2109,30 +2109,32 @@ out:
 	return ret;
 }
 
-static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op)
+static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
+				    struct keylist *keys)
 {
-	if (bch_keylist_empty(&op->keys))
+	if (bch_keylist_empty(keys))
 		return 0;
 
 	if (b->level) {
-		struct bkey *insert = op->keys.bottom;
-		struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert));
+		struct bkey *k;
 
+		k = bch_next_recurse_key(b, &START_KEY(keys->bottom));
 		if (!k) {
 			btree_bug(b, "no key to recurse on at level %i/%i",
 				  b->level, b->c->root->level);
 
-			op->keys.top = op->keys.bottom;
+			keys->top = keys->bottom;
 			return -EIO;
 		}
 
-		return btree(insert_recurse, k, b, op);
+		return btree(insert_recurse, k, b, op, keys);
 	} else {
-		return bch_btree_insert_node(b, op, &op->keys);
+		return bch_btree_insert_node(b, op, keys);
 	}
 }
 
-int bch_btree_insert(struct btree_op *op, struct cache_set *c)
+int bch_btree_insert(struct btree_op *op, struct cache_set *c,
+		     struct keylist *keys)
 {
 	int ret = 0;
 
@@ -2142,11 +2144,11 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c)
 	 */
 	clear_closure_blocking(&op->cl);
 
-	BUG_ON(bch_keylist_empty(&op->keys));
+	BUG_ON(bch_keylist_empty(keys));
 
-	while (!bch_keylist_empty(&op->keys)) {
+	while (!bch_keylist_empty(keys)) {
 		op->lock = 0;
-		ret = btree_root(insert_recurse, c, op);
+		ret = btree_root(insert_recurse, c, op, keys);
 
 		if (ret == -EAGAIN) {
 			ret = 0;
@@ -2157,7 +2159,7 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c)
 			pr_err("error %i trying to insert key for %s",
 			       ret, op_type(op));
 
-			while ((k = bch_keylist_pop(&op->keys)))
+			while ((k = bch_keylist_pop(keys)))
 				bkey_put(c, k, 0);
 		}
 	}
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 73bd62105e39..967aacd20625 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -384,7 +384,7 @@ struct btree *bch_btree_node_get(struct cache_set *, struct bkey *,
 
 int bch_btree_insert_check_key(struct btree *, struct btree_op *,
 			       struct bkey *);
-int bch_btree_insert(struct btree_op *, struct cache_set *);
+int bch_btree_insert(struct btree_op *, struct cache_set *, struct keylist *);
 
 int bch_btree_search_recurse(struct btree *, struct btree_op *);
 
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 9e8775872dba..5abe5d5fc183 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -320,7 +320,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list,
 			op->journal = i->pin;
 			atomic_inc(op->journal);
 
-			ret = bch_btree_insert(op, s);
+			ret = bch_btree_insert(op, s, &op->keys);
 			if (ret)
 				goto err;
 
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 9ed334ca484d..7fd84ce9e835 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -607,7 +607,7 @@ void bch_btree_insert_async(struct closure *cl)
 	struct btree_op *op = container_of(cl, struct btree_op, cl);
 	struct search *s = container_of(op, struct search, op);
 
-	if (bch_btree_insert(op, op->c)) {
+	if (bch_btree_insert(op, op->c, &op->keys)) {
 		s->error		= -ENOMEM;
 		op->insert_data_done	= true;
 	}
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index b842fbfbf1db..8ffc8ec7231d 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -311,7 +311,7 @@ static void write_dirty_finish(struct closure *cl)
 		for (i = 0; i < KEY_PTRS(&w->key); i++)
 			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
 
-		bch_btree_insert(&op, dc->disk.c);
+		bch_btree_insert(&op, dc->disk.c, &op.keys);
 		closure_sync(&op.cl);
 
 		if (op.insert_collision)

From c2f95ae2ebbe1ab61b1d4437f5923fdf720d4d4d Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 24 Jul 2013 17:24:25 -0700
Subject: [PATCH 54/94] bcache: Clean up keylist code

More random refactoring.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bset.c    | 44 +++++++++++++------------------------
 drivers/md/bcache/bset.h    | 35 +++++++++++++++++++++--------
 drivers/md/bcache/btree.c   | 12 +++++-----
 drivers/md/bcache/journal.c | 14 +++++++-----
 drivers/md/bcache/request.c |  4 ++--
 5 files changed, 57 insertions(+), 52 deletions(-)

diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 830eede86d56..d0512e451dda 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -14,22 +14,12 @@
 
 /* Keylists */
 
-void bch_keylist_copy(struct keylist *dest, struct keylist *src)
-{
-	*dest = *src;
-
-	if (src->list == src->d) {
-		size_t n = (uint64_t *) src->top - src->d;
-		dest->top = (struct bkey *) &dest->d[n];
-		dest->list = dest->d;
-	}
-}
-
 int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c)
 {
-	unsigned oldsize = (uint64_t *) l->top - l->list;
-	unsigned newsize = oldsize + 2 + nptrs;
-	uint64_t *new;
+	size_t oldsize = bch_keylist_nkeys(l);
+	size_t newsize = oldsize + 2 + nptrs;
+	uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p;
+	uint64_t *new_keys;
 
 	/* The journalling code doesn't handle the case where the keys to insert
 	 * is bigger than an empty write: If we just return -ENOMEM here,
@@ -45,24 +35,23 @@ int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c)
 	    roundup_pow_of_two(oldsize) == newsize)
 		return 0;
 
-	new = krealloc(l->list == l->d ? NULL : l->list,
-		       sizeof(uint64_t) * newsize, GFP_NOIO);
+	new_keys = krealloc(old_keys, sizeof(uint64_t) * newsize, GFP_NOIO);
 
-	if (!new)
+	if (!new_keys)
 		return -ENOMEM;
 
-	if (l->list == l->d)
-		memcpy(new, l->list, sizeof(uint64_t) * KEYLIST_INLINE);
+	if (!old_keys)
+		memcpy(new_keys, l->inline_keys, sizeof(uint64_t) * oldsize);
 
-	l->list = new;
-	l->top = (struct bkey *) (&l->list[oldsize]);
+	l->keys_p = new_keys;
+	l->top_p = new_keys + oldsize;
 
 	return 0;
 }
 
 struct bkey *bch_keylist_pop(struct keylist *l)
 {
-	struct bkey *k = l->bottom;
+	struct bkey *k = l->keys;
 
 	if (k == l->top)
 		return NULL;
@@ -75,14 +64,11 @@ struct bkey *bch_keylist_pop(struct keylist *l)
 
 void bch_keylist_pop_front(struct keylist *l)
 {
-	struct bkey *next = bkey_next(l->bottom);
-	size_t bytes = ((void *) l->top) - ((void *) next);
+	l->top_p -= bkey_u64s(l->keys);
 
-	memmove(l->bottom,
-		next,
-		bytes);
-
-	l->top = ((void *) l->bottom) + bytes;
+	memmove(l->keys,
+		bkey_next(l->keys),
+		bch_keylist_bytes(l));
 }
 
 /* Pointer validation */
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index a3627d0cd88b..8a9305685b7e 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -227,20 +227,23 @@ static inline struct bkey *bkey_next(const struct bkey *k)
 /* Keylists */
 
 struct keylist {
-	struct bkey		*top;
 	union {
-		uint64_t		*list;
-		struct bkey		*bottom;
+		struct bkey		*keys;
+		uint64_t		*keys_p;
+	};
+	union {
+		struct bkey		*top;
+		uint64_t		*top_p;
 	};
 
 	/* Enough room for btree_split's keys without realloc */
 #define KEYLIST_INLINE		16
-	uint64_t		d[KEYLIST_INLINE];
+	uint64_t		inline_keys[KEYLIST_INLINE];
 };
 
 static inline void bch_keylist_init(struct keylist *l)
 {
-	l->top = (void *) (l->list = l->d);
+	l->top_p = l->keys_p = l->inline_keys;
 }
 
 static inline void bch_keylist_push(struct keylist *l)
@@ -256,16 +259,30 @@ static inline void bch_keylist_add(struct keylist *l, struct bkey *k)
 
 static inline bool bch_keylist_empty(struct keylist *l)
 {
-	return l->top == (void *) l->list;
+	return l->top == l->keys;
+}
+
+static inline void bch_keylist_reset(struct keylist *l)
+{
+	l->top = l->keys;
 }
 
 static inline void bch_keylist_free(struct keylist *l)
 {
-	if (l->list != l->d)
-		kfree(l->list);
+	if (l->keys_p != l->inline_keys)
+		kfree(l->keys_p);
+}
+
+static inline size_t bch_keylist_nkeys(struct keylist *l)
+{
+	return l->top_p - l->keys_p;
+}
+
+static inline size_t bch_keylist_bytes(struct keylist *l)
+{
+	return bch_keylist_nkeys(l) * sizeof(uint64_t);
 }
 
-void bch_keylist_copy(struct keylist *, struct keylist *);
 struct bkey *bch_keylist_pop(struct keylist *);
 void bch_keylist_pop_front(struct keylist *);
 int bch_keylist_realloc(struct keylist *, int, struct cache_set *);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index fc3cae5c94b2..f960607f1f25 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1866,7 +1866,7 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
 
 	while (!bch_keylist_empty(insert_keys)) {
 		struct bset *i = write_block(b);
-		struct bkey *k = insert_keys->bottom;
+		struct bkey *k = insert_keys->keys;
 
 		if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c)
 		    > btree_blocks(b))
@@ -1887,10 +1887,10 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
 			}
 #endif
 			BKEY_PADDED(key) temp;
-			bkey_copy(&temp.key, insert_keys->bottom);
+			bkey_copy(&temp.key, insert_keys->keys);
 
 			bch_cut_back(&b->key, &temp.key);
-			bch_cut_front(&b->key, insert_keys->bottom);
+			bch_cut_front(&b->key, insert_keys->keys);
 
 			ret |= btree_insert_key(b, op, &temp.key);
 			break;
@@ -1984,7 +1984,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
 	} else if (!b->parent) {
 		/* Root filled up but didn't need to be split */
 
-		parent_keys->top = parent_keys->bottom;
+		bch_keylist_reset(parent_keys);
 		closure_sync(&op->cl);
 		bch_btree_set_root(n1);
 	} else {
@@ -2118,12 +2118,12 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
 	if (b->level) {
 		struct bkey *k;
 
-		k = bch_next_recurse_key(b, &START_KEY(keys->bottom));
+		k = bch_next_recurse_key(b, &START_KEY(keys->keys));
 		if (!k) {
 			btree_bug(b, "no key to recurse on at level %i/%i",
 				  b->level, b->c->root->level);
 
-			keys->top = keys->bottom;
+			bch_keylist_reset(keys);
 			return -EIO;
 		}
 
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 5abe5d5fc183..1bdefdb1fa71 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -713,7 +713,7 @@ void bch_journal(struct closure *cl)
 	struct btree_op *op = container_of(cl, struct btree_op, cl);
 	struct cache_set *c = op->c;
 	struct journal_write *w;
-	size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list;
+	size_t sectors, nkeys;
 
 	if (op->type != BTREE_INSERT ||
 	    !CACHE_SYNC(&c->sb))
@@ -741,10 +741,12 @@ void bch_journal(struct closure *cl)
 	}
 
 	w = c->journal.cur;
-	b = __set_blocks(w->data, w->data->keys + n, c);
+	nkeys = w->data->keys + bch_keylist_nkeys(&op->keys);
+	sectors = __set_blocks(w->data, nkeys, c) * c->sb.block_size;
 
-	if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS ||
-	    b > c->journal.blocks_free) {
+	if (sectors > min_t(size_t,
+			    c->journal.blocks_free * c->sb.block_size,
+			    PAGE_SECTORS << JSET_BITS)) {
 		trace_bcache_journal_entry_full(c);
 
 		/*
@@ -760,8 +762,8 @@ void bch_journal(struct closure *cl)
 		continue_at(cl, bch_journal, bcache_wq);
 	}
 
-	memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t));
-	w->data->keys += n;
+	memcpy(end(w->data), op->keys.keys, bch_keylist_bytes(&op->keys));
+	w->data->keys += bch_keylist_nkeys(&op->keys);
 
 	op->journal = &fifo_back(&c->journal.pin);
 	atomic_inc(op->journal);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 7fd84ce9e835..a000e918b795 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -438,13 +438,13 @@ static void bch_insert_data_error(struct closure *cl)
 	 * from the keys we'll accomplish just that.
 	 */
 
-	struct bkey *src = op->keys.bottom, *dst = op->keys.bottom;
+	struct bkey *src = op->keys.keys, *dst = op->keys.keys;
 
 	while (src != op->keys.top) {
 		struct bkey *n = bkey_next(src);
 
 		SET_KEY_PTRS(src, 0);
-		bkey_copy(dst, src);
+		memmove(dst, src, bkey_bytes(src));
 
 		dst = bkey_next(dst);
 		src = n;

From 84f0db03ea1e024f2a9e6cfcf7ac0323e4f84d3a Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 24 Jul 2013 17:24:52 -0700
Subject: [PATCH 55/94] bcache: Refactor request_write()

Try to improve some of the naming a bit to be more consistent, and also
improve the flow of control in request_write() a bit.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/btree.h   |   2 +-
 drivers/md/bcache/request.c | 368 ++++++++++++++++++------------------
 2 files changed, 183 insertions(+), 187 deletions(-)

diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 967aacd20625..ea0814b51574 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -260,7 +260,7 @@ struct btree_op {
 	} type:8;
 
 	unsigned		csum:1;
-	unsigned		skip:1;
+	unsigned		bypass:1;
 	unsigned		flush_journal:1;
 
 	unsigned		insert_data_done:1;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index a000e918b795..dbc2ef6e7a35 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -25,8 +25,6 @@
 
 struct kmem_cache *bch_search_cache;
 
-static void check_should_skip(struct cached_dev *, struct search *);
-
 /* Cgroup interface */
 
 #ifdef CONFIG_CGROUP_BCACHE
@@ -480,7 +478,7 @@ static void bch_insert_data_loop(struct closure *cl)
 	struct search *s = container_of(op, struct search, op);
 	struct bio *bio = op->cache_bio, *n;
 
-	if (op->skip)
+	if (op->bypass)
 		return bio_invalidate(cl);
 
 	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
@@ -557,7 +555,7 @@ err:
 		 * we wait for buckets to be freed up, so just invalidate the
 		 * rest of the write.
 		 */
-		op->skip = true;
+		op->bypass = true;
 		return bio_invalidate(cl);
 	} else {
 		/*
@@ -590,8 +588,8 @@ err:
  * It inserts the data in op->cache_bio; bi_sector is used for the key offset,
  * and op->inode is used for the key inode.
  *
- * If op->skip is true, instead of inserting the data it invalidates the region
- * of the cache represented by op->cache_bio and op->inode.
+ * If op->bypass is true, instead of inserting the data it invalidates the
+ * region of the cache represented by op->cache_bio and op->inode.
  */
 void bch_insert_data(struct closure *cl)
 {
@@ -717,7 +715,6 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
 	s->orig_bio		= bio;
 	s->write		= (bio->bi_rw & REQ_WRITE) != 0;
 	s->op.flush_journal	= (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
-	s->op.skip		= (bio->bi_rw & REQ_DISCARD) != 0;
 	s->recoverable		= 1;
 	s->start_time		= jiffies;
 	do_bio_hook(s);
@@ -757,6 +754,134 @@ static void cached_dev_bio_complete(struct closure *cl)
 	cached_dev_put(dc);
 }
 
+unsigned bch_get_congested(struct cache_set *c)
+{
+	int i;
+	long rand;
+
+	if (!c->congested_read_threshold_us &&
+	    !c->congested_write_threshold_us)
+		return 0;
+
+	i = (local_clock_us() - c->congested_last_us) / 1024;
+	if (i < 0)
+		return 0;
+
+	i += atomic_read(&c->congested);
+	if (i >= 0)
+		return 0;
+
+	i += CONGESTED_MAX;
+
+	if (i > 0)
+		i = fract_exp_two(i, 6);
+
+	rand = get_random_int();
+	i -= bitmap_weight(&rand, BITS_PER_LONG);
+
+	return i > 0 ? i : 1;
+}
+
+static void add_sequential(struct task_struct *t)
+{
+	ewma_add(t->sequential_io_avg,
+		 t->sequential_io, 8, 0);
+
+	t->sequential_io = 0;
+}
+
+static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
+{
+	return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
+}
+
+static bool check_should_bypass(struct cached_dev *dc, struct search *s)
+{
+	struct cache_set *c = s->op.c;
+	struct bio *bio = &s->bio.bio;
+	unsigned mode = cache_mode(dc, bio);
+	unsigned sectors, congested = bch_get_congested(c);
+
+	if (atomic_read(&dc->disk.detaching) ||
+	    c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
+	    (bio->bi_rw & REQ_DISCARD))
+		goto skip;
+
+	if (mode == CACHE_MODE_NONE ||
+	    (mode == CACHE_MODE_WRITEAROUND &&
+	     (bio->bi_rw & REQ_WRITE)))
+		goto skip;
+
+	if (bio->bi_sector & (c->sb.block_size - 1) ||
+	    bio_sectors(bio) & (c->sb.block_size - 1)) {
+		pr_debug("skipping unaligned io");
+		goto skip;
+	}
+
+	if (!congested && !dc->sequential_cutoff)
+		goto rescale;
+
+	if (!congested &&
+	    mode == CACHE_MODE_WRITEBACK &&
+	    (bio->bi_rw & REQ_WRITE) &&
+	    (bio->bi_rw & REQ_SYNC))
+		goto rescale;
+
+	if (dc->sequential_merge) {
+		struct io *i;
+
+		spin_lock(&dc->io_lock);
+
+		hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash)
+			if (i->last == bio->bi_sector &&
+			    time_before(jiffies, i->jiffies))
+				goto found;
+
+		i = list_first_entry(&dc->io_lru, struct io, lru);
+
+		add_sequential(s->task);
+		i->sequential = 0;
+found:
+		if (i->sequential + bio->bi_size > i->sequential)
+			i->sequential	+= bio->bi_size;
+
+		i->last			 = bio_end_sector(bio);
+		i->jiffies		 = jiffies + msecs_to_jiffies(5000);
+		s->task->sequential_io	 = i->sequential;
+
+		hlist_del(&i->hash);
+		hlist_add_head(&i->hash, iohash(dc, i->last));
+		list_move_tail(&i->lru, &dc->io_lru);
+
+		spin_unlock(&dc->io_lock);
+	} else {
+		s->task->sequential_io = bio->bi_size;
+
+		add_sequential(s->task);
+	}
+
+	sectors = max(s->task->sequential_io,
+		      s->task->sequential_io_avg) >> 9;
+
+	if (dc->sequential_cutoff &&
+	    sectors >= dc->sequential_cutoff >> 9) {
+		trace_bcache_bypass_sequential(s->orig_bio);
+		goto skip;
+	}
+
+	if (congested && sectors >= congested) {
+		trace_bcache_bypass_congested(s->orig_bio);
+		goto skip;
+	}
+
+rescale:
+	bch_rescale_priorities(c, bio_sectors(bio));
+	return false;
+skip:
+	bch_mark_sectors_bypassed(s, bio_sectors(bio));
+	return true;
+}
+
 /* Process reads */
 
 static void cached_dev_read_complete(struct closure *cl)
@@ -854,8 +979,8 @@ static void request_read_done_bh(struct closure *cl)
 	struct search *s = container_of(cl, struct search, cl);
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 
-	bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip);
-	trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip);
+	bch_mark_cache_accounting(s, !s->cache_miss, s->op.bypass);
+	trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.bypass);
 
 	if (s->error)
 		continue_at_nobarrier(cl, request_read_error, bcache_wq);
@@ -873,7 +998,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 	struct bio *miss;
 
-	if (s->cache_miss || s->op.skip) {
+	if (s->cache_miss || s->op.bypass) {
 		miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
 		if (miss == bio)
 			s->op.lookup_done = true;
@@ -940,9 +1065,7 @@ static void request_read(struct cached_dev *dc, struct search *s)
 {
 	struct closure *cl = &s->cl;
 
-	check_should_skip(dc, s);
 	closure_call(&s->op.cl, btree_read_async, NULL, cl);
-
 	continue_at(cl, request_read_done_bh, NULL);
 }
 
@@ -961,41 +1084,48 @@ static void request_write(struct cached_dev *dc, struct search *s)
 {
 	struct closure *cl = &s->cl;
 	struct bio *bio = &s->bio.bio;
-	struct bkey start, end;
-	start = KEY(dc->disk.id, bio->bi_sector, 0);
-	end = KEY(dc->disk.id, bio_end_sector(bio), 0);
+	struct bkey start = KEY(dc->disk.id, bio->bi_sector, 0);
+	struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0);
 
 	bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end);
 
-	check_should_skip(dc, s);
 	down_read_non_owner(&dc->writeback_lock);
-
 	if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) {
-		s->op.skip	= false;
+		/*
+		 * We overlap with some dirty data undergoing background
+		 * writeback, force this write to writeback
+		 */
+		s->op.bypass	= false;
 		s->writeback	= true;
 	}
 
+	/*
+	 * Discards aren't _required_ to do anything, so skipping if
+	 * check_overlapping returned true is ok
+	 *
+	 * But check_overlapping drops dirty keys for which io hasn't started,
+	 * so we still want to call it.
+	 */
 	if (bio->bi_rw & REQ_DISCARD)
-		goto skip;
+		s->op.bypass = true;
 
 	if (should_writeback(dc, s->orig_bio,
 			     cache_mode(dc, bio),
-			     s->op.skip)) {
-		s->op.skip = false;
+			     s->op.bypass)) {
+		s->op.bypass = false;
 		s->writeback = true;
 	}
 
-	if (s->op.skip)
-		goto skip;
+	trace_bcache_write(s->orig_bio, s->writeback, s->op.bypass);
 
-	trace_bcache_write(s->orig_bio, s->writeback, s->op.skip);
+	if (s->op.bypass) {
+		s->op.cache_bio = s->orig_bio;
+		bio_get(s->op.cache_bio);
 
-	if (!s->writeback) {
-		s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
-						   dc->disk.bio_split);
-
-		closure_bio_submit(bio, cl, s->d);
-	} else {
+		if (!(bio->bi_rw & REQ_DISCARD) ||
+		    blk_queue_discard(bdev_get_queue(dc->bdev)))
+			closure_bio_submit(bio, cl, s->d);
+	} else if (s->writeback) {
 		bch_writeback_add(dc);
 		s->op.cache_bio = bio;
 
@@ -1011,21 +1141,15 @@ static void request_write(struct cached_dev *dc, struct search *s)
 
 			closure_bio_submit(flush, cl, s->d);
 		}
+	} else {
+		s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
+						   dc->disk.bio_split);
+
+		closure_bio_submit(bio, cl, s->d);
 	}
-out:
+
 	closure_call(&s->op.cl, bch_insert_data, NULL, cl);
 	continue_at(cl, cached_dev_write_complete, NULL);
-skip:
-	s->op.skip = true;
-	s->op.cache_bio = s->orig_bio;
-	bio_get(s->op.cache_bio);
-
-	if ((bio->bi_rw & REQ_DISCARD) &&
-	    !blk_queue_discard(bdev_get_queue(dc->bdev)))
-		goto out;
-
-	closure_bio_submit(bio, cl, s->d);
-	goto out;
 }
 
 static void request_nodata(struct cached_dev *dc, struct search *s)
@@ -1033,14 +1157,10 @@ static void request_nodata(struct cached_dev *dc, struct search *s)
 	struct closure *cl = &s->cl;
 	struct bio *bio = &s->bio.bio;
 
-	if (bio->bi_rw & REQ_DISCARD) {
-		request_write(dc, s);
-		return;
-	}
-
 	if (s->op.flush_journal)
 		bch_journal_meta(s->op.c, cl);
 
+	/* If it's a flush, we send the flush to the backing device too */
 	closure_bio_submit(bio, cl, s->d);
 
 	continue_at(cl, cached_dev_bio_complete, NULL);
@@ -1048,134 +1168,6 @@ static void request_nodata(struct cached_dev *dc, struct search *s)
 
 /* Cached devices - read & write stuff */
 
-unsigned bch_get_congested(struct cache_set *c)
-{
-	int i;
-	long rand;
-
-	if (!c->congested_read_threshold_us &&
-	    !c->congested_write_threshold_us)
-		return 0;
-
-	i = (local_clock_us() - c->congested_last_us) / 1024;
-	if (i < 0)
-		return 0;
-
-	i += atomic_read(&c->congested);
-	if (i >= 0)
-		return 0;
-
-	i += CONGESTED_MAX;
-
-	if (i > 0)
-		i = fract_exp_two(i, 6);
-
-	rand = get_random_int();
-	i -= bitmap_weight(&rand, BITS_PER_LONG);
-
-	return i > 0 ? i : 1;
-}
-
-static void add_sequential(struct task_struct *t)
-{
-	ewma_add(t->sequential_io_avg,
-		 t->sequential_io, 8, 0);
-
-	t->sequential_io = 0;
-}
-
-static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
-{
-	return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
-}
-
-static void check_should_skip(struct cached_dev *dc, struct search *s)
-{
-	struct cache_set *c = s->op.c;
-	struct bio *bio = &s->bio.bio;
-	unsigned mode = cache_mode(dc, bio);
-	unsigned sectors, congested = bch_get_congested(c);
-
-	if (atomic_read(&dc->disk.detaching) ||
-	    c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
-	    (bio->bi_rw & REQ_DISCARD))
-		goto skip;
-
-	if (mode == CACHE_MODE_NONE ||
-	    (mode == CACHE_MODE_WRITEAROUND &&
-	     (bio->bi_rw & REQ_WRITE)))
-		goto skip;
-
-	if (bio->bi_sector   & (c->sb.block_size - 1) ||
-	    bio_sectors(bio) & (c->sb.block_size - 1)) {
-		pr_debug("skipping unaligned io");
-		goto skip;
-	}
-
-	if (!congested && !dc->sequential_cutoff)
-		goto rescale;
-
-	if (!congested &&
-	    mode == CACHE_MODE_WRITEBACK &&
-	    (bio->bi_rw & REQ_WRITE) &&
-	    (bio->bi_rw & REQ_SYNC))
-		goto rescale;
-
-	if (dc->sequential_merge) {
-		struct io *i;
-
-		spin_lock(&dc->io_lock);
-
-		hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash)
-			if (i->last == bio->bi_sector &&
-			    time_before(jiffies, i->jiffies))
-				goto found;
-
-		i = list_first_entry(&dc->io_lru, struct io, lru);
-
-		add_sequential(s->task);
-		i->sequential = 0;
-found:
-		if (i->sequential + bio->bi_size > i->sequential)
-			i->sequential	+= bio->bi_size;
-
-		i->last			 = bio_end_sector(bio);
-		i->jiffies		 = jiffies + msecs_to_jiffies(5000);
-		s->task->sequential_io	 = i->sequential;
-
-		hlist_del(&i->hash);
-		hlist_add_head(&i->hash, iohash(dc, i->last));
-		list_move_tail(&i->lru, &dc->io_lru);
-
-		spin_unlock(&dc->io_lock);
-	} else {
-		s->task->sequential_io = bio->bi_size;
-
-		add_sequential(s->task);
-	}
-
-	sectors = max(s->task->sequential_io,
-		      s->task->sequential_io_avg) >> 9;
-
-	if (dc->sequential_cutoff &&
-	    sectors >= dc->sequential_cutoff >> 9) {
-		trace_bcache_bypass_sequential(s->orig_bio);
-		goto skip;
-	}
-
-	if (congested && sectors >= congested) {
-		trace_bcache_bypass_congested(s->orig_bio);
-		goto skip;
-	}
-
-rescale:
-	bch_rescale_priorities(c, bio_sectors(bio));
-	return;
-skip:
-	bch_mark_sectors_bypassed(s, bio_sectors(bio));
-	s->op.skip = true;
-}
-
 static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct search *s;
@@ -1195,12 +1187,16 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
 		s = search_alloc(bio, d);
 		trace_bcache_request_start(s, bio);
 
-		if (!bio_has_data(bio))
+		if (!bio->bi_size)
 			request_nodata(dc, s);
-		else if (rw)
-			request_write(dc, s);
-		else
-			request_read(dc, s);
+		else {
+			s->op.bypass = check_should_bypass(dc, s);
+
+			if (rw)
+				request_write(dc, s);
+			else
+				request_read(dc, s);
+		}
 	} else {
 		if ((bio->bi_rw & REQ_DISCARD) &&
 		    !blk_queue_discard(bdev_get_queue(dc->bdev)))
@@ -1298,21 +1294,21 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
 
 	trace_bcache_request_start(s, bio);
 
-	if (bio_has_data(bio) && !rw) {
-		closure_call(&s->op.cl, btree_read_async, NULL, cl);
-	} else if (bio_has_data(bio) || s->op.skip) {
+	if (!bio->bi_size) {
+		if (s->op.flush_journal)
+			bch_journal_meta(s->op.c, cl);
+	} else if (rw) {
 		bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys,
 					&KEY(d->id, bio->bi_sector, 0),
 					&KEY(d->id, bio_end_sector(bio), 0));
 
+		s->op.bypass	= (bio->bi_rw & REQ_DISCARD) != 0;
 		s->writeback	= true;
 		s->op.cache_bio	= bio;
 
 		closure_call(&s->op.cl, bch_insert_data, NULL, cl);
 	} else {
-		/* No data - probably a cache flush */
-		if (s->op.flush_journal)
-			bch_journal_meta(s->op.c, cl);
+		closure_call(&s->op.cl, btree_read_async, NULL, cl);
 	}
 
 	continue_at(cl, search_free, NULL);

From cdd972b164be8fc69f6ee8533c5a07b621da74c7 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Tue, 10 Sep 2013 17:06:17 -0700
Subject: [PATCH 56/94] bcache: Refactor read request code a bit

More refactoring, and renaming.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/request.c | 71 ++++++++++++++++++-------------------
 1 file changed, 35 insertions(+), 36 deletions(-)

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index dbc2ef6e7a35..3b85f33ae4c7 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -884,7 +884,7 @@ skip:
 
 /* Process reads */
 
-static void cached_dev_read_complete(struct closure *cl)
+static void cached_dev_cache_miss_done(struct closure *cl)
 {
 	struct search *s = container_of(cl, struct search, cl);
 
@@ -902,9 +902,10 @@ static void cached_dev_read_complete(struct closure *cl)
 	cached_dev_bio_complete(cl);
 }
 
-static void request_read_error(struct closure *cl)
+static void cached_dev_read_error(struct closure *cl)
 {
 	struct search *s = container_of(cl, struct search, cl);
+	struct bio *bio = &s->bio.bio;
 	struct bio_vec *bv;
 	int i;
 
@@ -928,20 +929,20 @@ static void request_read_error(struct closure *cl)
 
 		/* XXX: invalidate cache */
 
-		closure_bio_submit(&s->bio.bio, &s->cl, s->d);
+		closure_bio_submit(bio, cl, s->d);
 	}
 
-	continue_at(cl, cached_dev_read_complete, NULL);
+	continue_at(cl, cached_dev_cache_miss_done, NULL);
 }
 
-static void request_read_done(struct closure *cl)
+static void cached_dev_read_done(struct closure *cl)
 {
 	struct search *s = container_of(cl, struct search, cl);
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 
 	/*
-	 * s->cache_bio != NULL implies that we had a cache miss; cache_bio now
-	 * contains data ready to be inserted into the cache.
+	 * We had a cache miss; cache_bio now contains data ready to be inserted
+	 * into the cache.
 	 *
 	 * First, we copy the data we just read from cache_bio's bounce buffers
 	 * to the buffers the original bio pointed to:
@@ -971,10 +972,10 @@ static void request_read_done(struct closure *cl)
 		closure_call(&s->op.cl, bch_insert_data, NULL, cl);
 	}
 
-	continue_at(cl, cached_dev_read_complete, NULL);
+	continue_at(cl, cached_dev_cache_miss_done, NULL);
 }
 
-static void request_read_done_bh(struct closure *cl)
+static void cached_dev_read_done_bh(struct closure *cl)
 {
 	struct search *s = container_of(cl, struct search, cl);
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
@@ -983,11 +984,11 @@ static void request_read_done_bh(struct closure *cl)
 	trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.bypass);
 
 	if (s->error)
-		continue_at_nobarrier(cl, request_read_error, bcache_wq);
+		continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq);
 	else if (s->op.cache_bio || verify(dc, &s->bio.bio))
-		continue_at_nobarrier(cl, request_read_done, bcache_wq);
+		continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq);
 	else
-		continue_at_nobarrier(cl, cached_dev_read_complete, NULL);
+		continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
 }
 
 static int cached_dev_cache_miss(struct btree *b, struct search *s,
@@ -996,7 +997,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 	int ret = 0;
 	unsigned reada = 0;
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
-	struct bio *miss;
+	struct bio *miss, *cache_bio;
 
 	if (s->cache_miss || s->op.bypass) {
 		miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
@@ -1027,33 +1028,31 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 		/* btree_search_recurse()'s btree iterator is no good anymore */
 		ret = -EINTR;
 
-	s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT,
+	cache_bio = bio_alloc_bioset(GFP_NOWAIT,
 			DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS),
 			dc->disk.bio_split);
-
-	if (!s->op.cache_bio)
+	if (!cache_bio)
 		goto out_submit;
 
-	s->op.cache_bio->bi_sector	= miss->bi_sector;
-	s->op.cache_bio->bi_bdev	= miss->bi_bdev;
-	s->op.cache_bio->bi_size	= s->cache_bio_sectors << 9;
+	cache_bio->bi_sector	= miss->bi_sector;
+	cache_bio->bi_bdev	= miss->bi_bdev;
+	cache_bio->bi_size	= s->cache_bio_sectors << 9;
 
-	s->op.cache_bio->bi_end_io	= request_endio;
-	s->op.cache_bio->bi_private	= &s->cl;
+	cache_bio->bi_end_io	= request_endio;
+	cache_bio->bi_private	= &s->cl;
 
-	bch_bio_map(s->op.cache_bio, NULL);
-	if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO))
+	bch_bio_map(cache_bio, NULL);
+	if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
 		goto out_put;
 
-	s->cache_miss = miss;
-	bio_get(s->op.cache_bio);
-
-	closure_bio_submit(s->op.cache_bio, &s->cl, s->d);
+	s->cache_miss	= miss;
+	s->op.cache_bio = cache_bio;
+	bio_get(cache_bio);
+	closure_bio_submit(cache_bio, &s->cl, s->d);
 
 	return ret;
 out_put:
-	bio_put(s->op.cache_bio);
-	s->op.cache_bio = NULL;
+	bio_put(cache_bio);
 out_submit:
 	miss->bi_end_io		= request_endio;
 	miss->bi_private	= &s->cl;
@@ -1061,12 +1060,12 @@ out_submit:
 	return ret;
 }
 
-static void request_read(struct cached_dev *dc, struct search *s)
+static void cached_dev_read(struct cached_dev *dc, struct search *s)
 {
 	struct closure *cl = &s->cl;
 
 	closure_call(&s->op.cl, btree_read_async, NULL, cl);
-	continue_at(cl, request_read_done_bh, NULL);
+	continue_at(cl, cached_dev_read_done_bh, NULL);
 }
 
 /* Process writes */
@@ -1080,7 +1079,7 @@ static void cached_dev_write_complete(struct closure *cl)
 	cached_dev_bio_complete(cl);
 }
 
-static void request_write(struct cached_dev *dc, struct search *s)
+static void cached_dev_write(struct cached_dev *dc, struct search *s)
 {
 	struct closure *cl = &s->cl;
 	struct bio *bio = &s->bio.bio;
@@ -1152,7 +1151,7 @@ static void request_write(struct cached_dev *dc, struct search *s)
 	continue_at(cl, cached_dev_write_complete, NULL);
 }
 
-static void request_nodata(struct cached_dev *dc, struct search *s)
+static void cached_dev_nodata(struct cached_dev *dc, struct search *s)
 {
 	struct closure *cl = &s->cl;
 	struct bio *bio = &s->bio.bio;
@@ -1188,14 +1187,14 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
 		trace_bcache_request_start(s, bio);
 
 		if (!bio->bi_size)
-			request_nodata(dc, s);
+			cached_dev_nodata(dc, s);
 		else {
 			s->op.bypass = check_should_bypass(dc, s);
 
 			if (rw)
-				request_write(dc, s);
+				cached_dev_write(dc, s);
 			else
-				request_read(dc, s);
+				cached_dev_read(dc, s);
 		}
 	} else {
 		if ((bio->bi_rw & REQ_DISCARD) &&

From a34a8bfd4e6358c646928320d37b0425c0762f8a Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Thu, 24 Oct 2013 17:07:04 -0700
Subject: [PATCH 57/94] bcache: Refactor journalling flow control

Making things less asynchronous that don't need to be - bch_journal()
only has to block when the journal or journal entry is full, which is
emphatically not a fast path. So make it a normal function that just
returns when it finishes, to make the code and control flow easier to
follow.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/btree.c    |   3 -
 drivers/md/bcache/closure.h  |   2 +-
 drivers/md/bcache/journal.c  | 217 ++++++++++++++++-------------------
 drivers/md/bcache/journal.h  |   3 +-
 drivers/md/bcache/movinggc.c |   2 +-
 drivers/md/bcache/request.c  | 162 ++++++++++++++++----------
 drivers/md/bcache/request.h  |   3 +-
 7 files changed, 210 insertions(+), 182 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index f960607f1f25..777c01d67ef0 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -2164,9 +2164,6 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c,
 		}
 	}
 
-	if (op->journal)
-		atomic_dec_bug(op->journal);
-	op->journal = NULL;
 	return ret;
 }
 
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 00039924ea9d..ab011f03801f 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -642,7 +642,7 @@ do {									\
 #define continue_at_nobarrier(_cl, _fn, _wq)				\
 do {									\
 	set_closure_fn(_cl, _fn, _wq);					\
-	closure_queue(cl);						\
+	closure_queue(_cl);						\
 	return;								\
 } while (0)
 
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 1bdefdb1fa71..940e89e0d706 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -318,7 +318,6 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list,
 			bch_keylist_push(&op->keys);
 
 			op->journal = i->pin;
-			atomic_inc(op->journal);
 
 			ret = bch_btree_insert(op, s, &op->keys);
 			if (ret)
@@ -357,48 +356,35 @@ static void btree_flush_write(struct cache_set *c)
 	 * Try to find the btree node with that references the oldest journal
 	 * entry, best is our current candidate and is locked if non NULL:
 	 */
-	struct btree *b, *best = NULL;
-	unsigned iter;
+	struct btree *b, *best;
+	unsigned i;
+retry:
+	best = NULL;
 
-	for_each_cached_btree(b, c, iter) {
-		if (!down_write_trylock(&b->lock))
-			continue;
-
-		if (!btree_node_dirty(b) ||
-		    !btree_current_write(b)->journal) {
-			rw_unlock(true, b);
-			continue;
+	for_each_cached_btree(b, c, i)
+		if (btree_current_write(b)->journal) {
+			if (!best)
+				best = b;
+			else if (journal_pin_cmp(c,
+						 btree_current_write(best),
+						 btree_current_write(b))) {
+				best = b;
+			}
 		}
 
-		if (!best)
-			best = b;
-		else if (journal_pin_cmp(c,
-					 btree_current_write(best),
-					 btree_current_write(b))) {
-			rw_unlock(true, best);
-			best = b;
-		} else
+	b = best;
+	if (b) {
+		rw_lock(true, b, b->level);
+
+		if (!btree_current_write(b)->journal) {
 			rw_unlock(true, b);
+			/* We raced */
+			goto retry;
+		}
+
+		bch_btree_node_write(b, NULL);
+		rw_unlock(true, b);
 	}
-
-	if (best)
-		goto out;
-
-	/* We can't find the best btree node, just pick the first */
-	list_for_each_entry(b, &c->btree_cache, list)
-		if (!b->level && btree_node_dirty(b)) {
-			best = b;
-			rw_lock(true, best, best->level);
-			goto found;
-		}
-
-out:
-	if (!best)
-		return;
-found:
-	if (btree_node_dirty(best))
-		bch_btree_node_write(best, NULL);
-	rw_unlock(true, best);
 }
 
 #define last_seq(j)	((j)->seq - fifo_used(&(j)->pin) + 1)
@@ -494,7 +480,7 @@ static void journal_reclaim(struct cache_set *c)
 		do_journal_discard(ca);
 
 	if (c->journal.blocks_free)
-		return;
+		goto out;
 
 	/*
 	 * Allocate:
@@ -520,7 +506,7 @@ static void journal_reclaim(struct cache_set *c)
 
 	if (n)
 		c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
-
+out:
 	if (!journal_full(&c->journal))
 		__closure_wake_up(&c->journal.wait);
 }
@@ -659,7 +645,7 @@ static void journal_write(struct closure *cl)
 	journal_write_unlocked(cl);
 }
 
-static void __journal_try_write(struct cache_set *c, bool noflush)
+static void journal_try_write(struct cache_set *c)
 	__releases(c->journal.lock)
 {
 	struct closure *cl = &c->journal.io;
@@ -667,29 +653,59 @@ static void __journal_try_write(struct cache_set *c, bool noflush)
 
 	w->need_write = true;
 
-	if (!closure_trylock(cl, &c->cl))
-		spin_unlock(&c->journal.lock);
-	else if (noflush && journal_full(&c->journal)) {
-		spin_unlock(&c->journal.lock);
-		continue_at(cl, journal_write, system_wq);
-	} else
+	if (closure_trylock(cl, &c->cl))
 		journal_write_unlocked(cl);
+	else
+		spin_unlock(&c->journal.lock);
 }
 
-#define journal_try_write(c)	__journal_try_write(c, false)
-
-void bch_journal_meta(struct cache_set *c, struct closure *cl)
+static struct journal_write *journal_wait_for_write(struct cache_set *c,
+						    unsigned nkeys)
 {
-	struct journal_write *w;
+	size_t sectors;
+	struct closure cl;
 
-	if (CACHE_SYNC(&c->sb)) {
+	closure_init_stack(&cl);
+
+	spin_lock(&c->journal.lock);
+
+	while (1) {
+		struct journal_write *w = c->journal.cur;
+
+		sectors = __set_blocks(w->data, w->data->keys + nkeys,
+				       c) * c->sb.block_size;
+
+		if (sectors <= min_t(size_t,
+				     c->journal.blocks_free * c->sb.block_size,
+				     PAGE_SECTORS << JSET_BITS))
+			return w;
+
+		/* XXX: tracepoint */
+		if (!journal_full(&c->journal)) {
+			trace_bcache_journal_entry_full(c);
+
+			/*
+			 * XXX: If we were inserting so many keys that they
+			 * won't fit in an _empty_ journal write, we'll
+			 * deadlock. For now, handle this in
+			 * bch_keylist_realloc() - but something to think about.
+			 */
+			BUG_ON(!w->data->keys);
+
+			closure_wait(&w->wait, &cl);
+			journal_try_write(c); /* unlocks */
+		} else {
+			trace_bcache_journal_full(c);
+
+			closure_wait(&c->journal.wait, &cl);
+			journal_reclaim(c);
+			spin_unlock(&c->journal.lock);
+
+			btree_flush_write(c);
+		}
+
+		closure_sync(&cl);
 		spin_lock(&c->journal.lock);
-		w = c->journal.cur;
-
-		if (cl)
-			BUG_ON(!closure_wait(&w->wait, cl));
-
-		__journal_try_write(c, true);
 	}
 }
 
@@ -708,68 +724,26 @@ static void journal_write_work(struct work_struct *work)
  * bch_journal() hands those same keys off to btree_insert_async()
  */
 
-void bch_journal(struct closure *cl)
+atomic_t *bch_journal(struct cache_set *c,
+		      struct keylist *keys,
+		      struct closure *parent)
 {
-	struct btree_op *op = container_of(cl, struct btree_op, cl);
-	struct cache_set *c = op->c;
 	struct journal_write *w;
-	size_t sectors, nkeys;
+	atomic_t *ret;
 
-	if (op->type != BTREE_INSERT ||
-	    !CACHE_SYNC(&c->sb))
-		goto out;
+	if (!CACHE_SYNC(&c->sb))
+		return NULL;
 
-	/*
-	 * If we're looping because we errored, might already be waiting on
-	 * another journal write:
-	 */
-	while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING)
-		closure_sync(cl->parent);
+	w = journal_wait_for_write(c, bch_keylist_nkeys(keys));
 
-	spin_lock(&c->journal.lock);
+	memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys));
+	w->data->keys += bch_keylist_nkeys(keys);
 
-	if (journal_full(&c->journal)) {
-		trace_bcache_journal_full(c);
+	ret = &fifo_back(&c->journal.pin);
+	atomic_inc(ret);
 
-		closure_wait(&c->journal.wait, cl);
-
-		journal_reclaim(c);
-		spin_unlock(&c->journal.lock);
-
-		btree_flush_write(c);
-		continue_at(cl, bch_journal, bcache_wq);
-	}
-
-	w = c->journal.cur;
-	nkeys = w->data->keys + bch_keylist_nkeys(&op->keys);
-	sectors = __set_blocks(w->data, nkeys, c) * c->sb.block_size;
-
-	if (sectors > min_t(size_t,
-			    c->journal.blocks_free * c->sb.block_size,
-			    PAGE_SECTORS << JSET_BITS)) {
-		trace_bcache_journal_entry_full(c);
-
-		/*
-		 * XXX: If we were inserting so many keys that they won't fit in
-		 * an _empty_ journal write, we'll deadlock. For now, handle
-		 * this in bch_keylist_realloc() - but something to think about.
-		 */
-		BUG_ON(!w->data->keys);
-
-		BUG_ON(!closure_wait(&w->wait, cl));
-
-		journal_try_write(c);
-		continue_at(cl, bch_journal, bcache_wq);
-	}
-
-	memcpy(end(w->data), op->keys.keys, bch_keylist_bytes(&op->keys));
-	w->data->keys += bch_keylist_nkeys(&op->keys);
-
-	op->journal = &fifo_back(&c->journal.pin);
-	atomic_inc(op->journal);
-
-	if (op->flush_journal) {
-		closure_wait(&w->wait, cl->parent);
+	if (parent) {
+		closure_wait(&w->wait, parent);
 		journal_try_write(c);
 	} else if (!w->need_write) {
 		schedule_delayed_work(&c->journal.work,
@@ -778,8 +752,21 @@ void bch_journal(struct closure *cl)
 	} else {
 		spin_unlock(&c->journal.lock);
 	}
-out:
-	bch_btree_insert_async(cl);
+
+
+	return ret;
+}
+
+void bch_journal_meta(struct cache_set *c, struct closure *cl)
+{
+	struct keylist keys;
+	atomic_t *ref;
+
+	bch_keylist_init(&keys);
+
+	ref = bch_journal(c, &keys, cl);
+	if (ref)
+		atomic_dec_bug(ref);
 }
 
 void bch_journal_free(struct cache_set *c)
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index 3ca93d3d566c..7045e6fd2d5a 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -200,8 +200,9 @@ struct journal_device {
 struct closure;
 struct cache_set;
 struct btree_op;
+struct keylist;
 
-void bch_journal(struct closure *);
+atomic_t *bch_journal(struct cache_set *, struct keylist *, struct closure *);
 void bch_journal_next(struct journal *);
 void bch_journal_mark(struct cache_set *, struct list_head *);
 void bch_journal_meta(struct cache_set *, struct closure *);
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index dd8a035c5ae1..2c42377a65aa 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -110,7 +110,7 @@ static void write_moving(struct closure *cl)
 		bkey_copy(&s->op.replace, &io->w->key);
 
 		closure_init(&s->op.cl, cl);
-		bch_insert_data(&s->op.cl);
+		bch_data_insert(&s->op.cl);
 	}
 
 	continue_at(cl, write_moving_finish, bch_gc_wq);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 3b85f33ae4c7..1c3af44b097b 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -25,6 +25,8 @@
 
 struct kmem_cache *bch_search_cache;
 
+static void bch_data_insert_start(struct closure *);
+
 /* Cgroup interface */
 
 #ifdef CONFIG_CGROUP_BCACHE
@@ -211,31 +213,42 @@ static void bio_csum(struct bio *bio, struct bkey *k)
 
 /* Insert data into cache */
 
-static void bio_invalidate(struct closure *cl)
+static void bch_data_insert_keys(struct closure *cl)
 {
 	struct btree_op *op = container_of(cl, struct btree_op, cl);
-	struct bio *bio = op->cache_bio;
+	struct search *s = container_of(op, struct search, op);
 
-	pr_debug("invalidating %i sectors from %llu",
-		 bio_sectors(bio), (uint64_t) bio->bi_sector);
+	/*
+	 * If we're looping, might already be waiting on
+	 * another journal write - can't wait on more than one journal write at
+	 * a time
+	 *
+	 * XXX: this looks wrong
+	 */
+#if 0
+	while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING)
+		closure_sync(&s->cl);
+#endif
 
-	while (bio_sectors(bio)) {
-		unsigned len = min(bio_sectors(bio), 1U << 14);
+	if (s->write)
+		op->journal = bch_journal(op->c, &op->keys,
+					  op->flush_journal
+					  ? &s->cl : NULL);
 
-		if (bch_keylist_realloc(&op->keys, 0, op->c))
-			goto out;
-
-		bio->bi_sector	+= len;
-		bio->bi_size	-= len << 9;
-
-		bch_keylist_add(&op->keys,
-				&KEY(op->inode, bio->bi_sector, len));
+	if (bch_btree_insert(op, op->c, &op->keys)) {
+		s->error		= -ENOMEM;
+		op->insert_data_done	= true;
 	}
 
-	op->insert_data_done = true;
-	bio_put(bio);
-out:
-	continue_at(cl, bch_journal, bcache_wq);
+	if (op->journal)
+		atomic_dec_bug(op->journal);
+	op->journal = NULL;
+
+	if (!op->insert_data_done)
+		continue_at(cl, bch_data_insert_start, bcache_wq);
+
+	bch_keylist_free(&op->keys);
+	closure_return(cl);
 }
 
 struct open_bucket {
@@ -423,7 +436,34 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
 	return true;
 }
 
-static void bch_insert_data_error(struct closure *cl)
+static void bch_data_invalidate(struct closure *cl)
+{
+	struct btree_op *op = container_of(cl, struct btree_op, cl);
+	struct bio *bio = op->cache_bio;
+
+	pr_debug("invalidating %i sectors from %llu",
+		 bio_sectors(bio), (uint64_t) bio->bi_sector);
+
+	while (bio_sectors(bio)) {
+		unsigned len = min(bio_sectors(bio), 1U << 14);
+
+		if (bch_keylist_realloc(&op->keys, 0, op->c))
+			goto out;
+
+		bio->bi_sector	+= len;
+		bio->bi_size	-= len << 9;
+
+		bch_keylist_add(&op->keys, &KEY(op->inode,
+						bio->bi_sector, len));
+	}
+
+	op->insert_data_done = true;
+	bio_put(bio);
+out:
+	continue_at(cl, bch_data_insert_keys, bcache_wq);
+}
+
+static void bch_data_insert_error(struct closure *cl)
 {
 	struct btree_op *op = container_of(cl, struct btree_op, cl);
 
@@ -450,10 +490,10 @@ static void bch_insert_data_error(struct closure *cl)
 
 	op->keys.top = dst;
 
-	bch_journal(cl);
+	bch_data_insert_keys(cl);
 }
 
-static void bch_insert_data_endio(struct bio *bio, int error)
+static void bch_data_insert_endio(struct bio *bio, int error)
 {
 	struct closure *cl = bio->bi_private;
 	struct btree_op *op = container_of(cl, struct btree_op, cl);
@@ -464,7 +504,7 @@ static void bch_insert_data_endio(struct bio *bio, int error)
 		if (s->writeback)
 			s->error = error;
 		else if (s->write)
-			set_closure_fn(cl, bch_insert_data_error, bcache_wq);
+			set_closure_fn(cl, bch_data_insert_error, bcache_wq);
 		else
 			set_closure_fn(cl, NULL, NULL);
 	}
@@ -472,14 +512,14 @@ static void bch_insert_data_endio(struct bio *bio, int error)
 	bch_bbio_endio(op->c, bio, error, "writing data to cache");
 }
 
-static void bch_insert_data_loop(struct closure *cl)
+static void bch_data_insert_start(struct closure *cl)
 {
 	struct btree_op *op = container_of(cl, struct btree_op, cl);
 	struct search *s = container_of(op, struct search, op);
 	struct bio *bio = op->cache_bio, *n;
 
 	if (op->bypass)
-		return bio_invalidate(cl);
+		return bch_data_invalidate(cl);
 
 	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
 		set_gc_sectors(op->c);
@@ -502,7 +542,7 @@ static void bch_insert_data_loop(struct closure *cl)
 		if (bch_keylist_realloc(&op->keys,
 					1 + (op->csum ? 1 : 0),
 					op->c))
-			continue_at(cl, bch_journal, bcache_wq);
+			continue_at(cl, bch_data_insert_keys, bcache_wq);
 
 		k = op->keys.top;
 		bkey_init(k);
@@ -514,7 +554,7 @@ static void bch_insert_data_loop(struct closure *cl)
 
 		n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
 
-		n->bi_end_io	= bch_insert_data_endio;
+		n->bi_end_io	= bch_data_insert_endio;
 		n->bi_private	= cl;
 
 		if (s->writeback) {
@@ -537,7 +577,7 @@ static void bch_insert_data_loop(struct closure *cl)
 	} while (n != bio);
 
 	op->insert_data_done = true;
-	continue_at(cl, bch_journal, bcache_wq);
+	continue_at(cl, bch_data_insert_keys, bcache_wq);
 err:
 	/* bch_alloc_sectors() blocks if s->writeback = true */
 	BUG_ON(s->writeback);
@@ -556,7 +596,7 @@ err:
 		 * rest of the write.
 		 */
 		op->bypass = true;
-		return bio_invalidate(cl);
+		return bch_data_invalidate(cl);
 	} else {
 		/*
 		 * From a cache miss, we can just insert the keys for the data
@@ -566,14 +606,14 @@ err:
 		bio_put(bio);
 
 		if (!bch_keylist_empty(&op->keys))
-			continue_at(cl, bch_journal, bcache_wq);
+			continue_at(cl, bch_data_insert_keys, bcache_wq);
 		else
 			closure_return(cl);
 	}
 }
 
 /**
- * bch_insert_data - stick some data in the cache
+ * bch_data_insert - stick some data in the cache
  *
  * This is the starting point for any data to end up in a cache device; it could
  * be from a normal write, or a writeback write, or a write to a flash only
@@ -591,30 +631,13 @@ err:
  * If op->bypass is true, instead of inserting the data it invalidates the
  * region of the cache represented by op->cache_bio and op->inode.
  */
-void bch_insert_data(struct closure *cl)
+void bch_data_insert(struct closure *cl)
 {
 	struct btree_op *op = container_of(cl, struct btree_op, cl);
 
 	bch_keylist_init(&op->keys);
 	bio_get(op->cache_bio);
-	bch_insert_data_loop(cl);
-}
-
-void bch_btree_insert_async(struct closure *cl)
-{
-	struct btree_op *op = container_of(cl, struct btree_op, cl);
-	struct search *s = container_of(op, struct search, op);
-
-	if (bch_btree_insert(op, op->c, &op->keys)) {
-		s->error		= -ENOMEM;
-		op->insert_data_done	= true;
-	}
-
-	if (op->insert_data_done) {
-		bch_keylist_free(&op->keys);
-		closure_return(cl);
-	} else
-		continue_at(cl, bch_insert_data_loop, bcache_wq);
+	bch_data_insert_start(cl);
 }
 
 /* Common code for the make_request functions */
@@ -969,7 +992,7 @@ static void cached_dev_read_done(struct closure *cl)
 	if (s->op.cache_bio &&
 	    !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) {
 		s->op.type = BTREE_REPLACE;
-		closure_call(&s->op.cl, bch_insert_data, NULL, cl);
+		closure_call(&s->op.cl, bch_data_insert, NULL, cl);
 	}
 
 	continue_at(cl, cached_dev_cache_miss_done, NULL);
@@ -1147,13 +1170,13 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
 		closure_bio_submit(bio, cl, s->d);
 	}
 
-	closure_call(&s->op.cl, bch_insert_data, NULL, cl);
+	closure_call(&s->op.cl, bch_data_insert, NULL, cl);
 	continue_at(cl, cached_dev_write_complete, NULL);
 }
 
-static void cached_dev_nodata(struct cached_dev *dc, struct search *s)
+static void cached_dev_nodata(struct closure *cl)
 {
-	struct closure *cl = &s->cl;
+	struct search *s = container_of(cl, struct search, cl);
 	struct bio *bio = &s->bio.bio;
 
 	if (s->op.flush_journal)
@@ -1186,9 +1209,15 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
 		s = search_alloc(bio, d);
 		trace_bcache_request_start(s, bio);
 
-		if (!bio->bi_size)
-			cached_dev_nodata(dc, s);
-		else {
+		if (!bio->bi_size) {
+			/*
+			 * can't call bch_journal_meta from under
+			 * generic_make_request
+			 */
+			continue_at_nobarrier(&s->cl,
+					      cached_dev_nodata,
+					      bcache_wq);
+		} else {
 			s->op.bypass = check_should_bypass(dc, s);
 
 			if (rw)
@@ -1275,6 +1304,16 @@ static int flash_dev_cache_miss(struct btree *b, struct search *s,
 	return 0;
 }
 
+static void flash_dev_nodata(struct closure *cl)
+{
+	struct search *s = container_of(cl, struct search, cl);
+
+	if (s->op.flush_journal)
+		bch_journal_meta(s->op.c, cl);
+
+	continue_at(cl, search_free, NULL);
+}
+
 static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct search *s;
@@ -1294,8 +1333,13 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
 	trace_bcache_request_start(s, bio);
 
 	if (!bio->bi_size) {
-		if (s->op.flush_journal)
-			bch_journal_meta(s->op.c, cl);
+		/*
+		 * can't call bch_journal_meta from under
+		 * generic_make_request
+		 */
+		continue_at_nobarrier(&s->cl,
+				      flash_dev_nodata,
+				      bcache_wq);
 	} else if (rw) {
 		bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys,
 					&KEY(d->id, bio->bi_sector, 0),
@@ -1305,7 +1349,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
 		s->writeback	= true;
 		s->op.cache_bio	= bio;
 
-		closure_call(&s->op.cl, bch_insert_data, NULL, cl);
+		closure_call(&s->op.cl, bch_data_insert, NULL, cl);
 	} else {
 		closure_call(&s->op.cl, btree_read_async, NULL, cl);
 	}
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 57dc4784f4f4..1f1b59d38db5 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -31,8 +31,7 @@ struct search {
 
 void bch_cache_read_endio(struct bio *, int);
 unsigned bch_get_congested(struct cache_set *);
-void bch_insert_data(struct closure *cl);
-void bch_btree_insert_async(struct closure *);
+void bch_data_insert(struct closure *cl);
 void bch_cache_read_endio(struct bio *, int);
 
 void bch_open_buckets_free(struct cache_set *);

From 0b93207abb40d3c42bb83eba1e1e7edc1da77810 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 24 Jul 2013 17:26:51 -0700
Subject: [PATCH 58/94] bcache: Move keylist out of btree_op

Slowly working on pruning struct btree_op - the aim is for it to only
contain things that are actually necessary for traversing the btree.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/btree.c     |  1 -
 drivers/md/bcache/btree.h     |  4 ----
 drivers/md/bcache/journal.c   | 11 +++++++----
 drivers/md/bcache/request.c   | 37 ++++++++++++++++++++---------------
 drivers/md/bcache/request.h   |  4 +++-
 drivers/md/bcache/writeback.c |  7 +++++--
 6 files changed, 36 insertions(+), 28 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 777c01d67ef0..731cd8e3fe90 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -113,7 +113,6 @@ void bch_btree_op_init_stack(struct btree_op *op)
 	memset(op, 0, sizeof(struct btree_op));
 	closure_init_stack(&op->cl);
 	op->lock = -1;
-	bch_keylist_init(&op->keys);
 }
 
 /* Btree key manipulation */
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index ea0814b51574..17b7a4e39c7e 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -267,10 +267,6 @@ struct btree_op {
 	unsigned		lookup_done:1;
 	unsigned		insert_collision:1;
 
-	/* Anything after this point won't get zeroed in do_bio_hook() */
-
-	/* Keys to be inserted */
-	struct keylist		keys;
 	BKEY_PADDED(replace);
 };
 
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 940e89e0d706..8866f8ee3a07 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -301,6 +301,9 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list,
 		list_entry(list->prev, struct journal_replay, list);
 
 	uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
+	struct keylist keylist;
+
+	bch_keylist_init(&keylist);
 
 	list_for_each_entry(i, list, list) {
 		BUG_ON(i->pin && atomic_read(i->pin) != 1);
@@ -314,16 +317,16 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list,
 		     k = bkey_next(k)) {
 			trace_bcache_journal_replay_key(k);
 
-			bkey_copy(op->keys.top, k);
-			bch_keylist_push(&op->keys);
+			bkey_copy(keylist.top, k);
+			bch_keylist_push(&keylist);
 
 			op->journal = i->pin;
 
-			ret = bch_btree_insert(op, s, &op->keys);
+			ret = bch_btree_insert(op, s, &keylist);
 			if (ret)
 				goto err;
 
-			BUG_ON(!bch_keylist_empty(&op->keys));
+			BUG_ON(!bch_keylist_empty(&keylist));
 			keys++;
 
 			cond_resched();
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 1c3af44b097b..d85c7001df61 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -231,11 +231,11 @@ static void bch_data_insert_keys(struct closure *cl)
 #endif
 
 	if (s->write)
-		op->journal = bch_journal(op->c, &op->keys,
+		op->journal = bch_journal(op->c, &s->insert_keys,
 					  op->flush_journal
 					  ? &s->cl : NULL);
 
-	if (bch_btree_insert(op, op->c, &op->keys)) {
+	if (bch_btree_insert(op, op->c, &s->insert_keys)) {
 		s->error		= -ENOMEM;
 		op->insert_data_done	= true;
 	}
@@ -247,7 +247,7 @@ static void bch_data_insert_keys(struct closure *cl)
 	if (!op->insert_data_done)
 		continue_at(cl, bch_data_insert_start, bcache_wq);
 
-	bch_keylist_free(&op->keys);
+	bch_keylist_free(&s->insert_keys);
 	closure_return(cl);
 }
 
@@ -439,6 +439,7 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
 static void bch_data_invalidate(struct closure *cl)
 {
 	struct btree_op *op = container_of(cl, struct btree_op, cl);
+	struct search *s = container_of(op, struct search, op);
 	struct bio *bio = op->cache_bio;
 
 	pr_debug("invalidating %i sectors from %llu",
@@ -447,14 +448,14 @@ static void bch_data_invalidate(struct closure *cl)
 	while (bio_sectors(bio)) {
 		unsigned len = min(bio_sectors(bio), 1U << 14);
 
-		if (bch_keylist_realloc(&op->keys, 0, op->c))
+		if (bch_keylist_realloc(&s->insert_keys, 0, op->c))
 			goto out;
 
 		bio->bi_sector	+= len;
 		bio->bi_size	-= len << 9;
 
-		bch_keylist_add(&op->keys, &KEY(op->inode,
-						bio->bi_sector, len));
+		bch_keylist_add(&s->insert_keys,
+				&KEY(op->inode, bio->bi_sector, len));
 	}
 
 	op->insert_data_done = true;
@@ -466,6 +467,7 @@ out:
 static void bch_data_insert_error(struct closure *cl)
 {
 	struct btree_op *op = container_of(cl, struct btree_op, cl);
+	struct search *s = container_of(op, struct search, op);
 
 	/*
 	 * Our data write just errored, which means we've got a bunch of keys to
@@ -476,9 +478,9 @@ static void bch_data_insert_error(struct closure *cl)
 	 * from the keys we'll accomplish just that.
 	 */
 
-	struct bkey *src = op->keys.keys, *dst = op->keys.keys;
+	struct bkey *src = s->insert_keys.keys, *dst = s->insert_keys.keys;
 
-	while (src != op->keys.top) {
+	while (src != s->insert_keys.top) {
 		struct bkey *n = bkey_next(src);
 
 		SET_KEY_PTRS(src, 0);
@@ -488,7 +490,7 @@ static void bch_data_insert_error(struct closure *cl)
 		src = n;
 	}
 
-	op->keys.top = dst;
+	s->insert_keys.top = dst;
 
 	bch_data_insert_keys(cl);
 }
@@ -539,12 +541,12 @@ static void bch_data_insert_start(struct closure *cl)
 			? s->d->bio_split : op->c->bio_split;
 
 		/* 1 for the device pointer and 1 for the chksum */
-		if (bch_keylist_realloc(&op->keys,
+		if (bch_keylist_realloc(&s->insert_keys,
 					1 + (op->csum ? 1 : 0),
 					op->c))
 			continue_at(cl, bch_data_insert_keys, bcache_wq);
 
-		k = op->keys.top;
+		k = s->insert_keys.top;
 		bkey_init(k);
 		SET_KEY_INODE(k, op->inode);
 		SET_KEY_OFFSET(k, bio->bi_sector);
@@ -570,7 +572,7 @@ static void bch_data_insert_start(struct closure *cl)
 			bio_csum(n, k);
 
 		trace_bcache_cache_insert(k);
-		bch_keylist_push(&op->keys);
+		bch_keylist_push(&s->insert_keys);
 
 		n->bi_rw |= REQ_WRITE;
 		bch_submit_bbio(n, op->c, k, 0);
@@ -605,7 +607,7 @@ err:
 		op->insert_data_done = true;
 		bio_put(bio);
 
-		if (!bch_keylist_empty(&op->keys))
+		if (!bch_keylist_empty(&s->insert_keys))
 			continue_at(cl, bch_data_insert_keys, bcache_wq);
 		else
 			closure_return(cl);
@@ -634,8 +636,9 @@ err:
 void bch_data_insert(struct closure *cl)
 {
 	struct btree_op *op = container_of(cl, struct btree_op, cl);
+	struct search *s = container_of(op, struct search, op);
 
-	bch_keylist_init(&op->keys);
+	bch_keylist_init(&s->insert_keys);
 	bio_get(op->cache_bio);
 	bch_data_insert_start(cl);
 }
@@ -724,9 +727,11 @@ static void search_free(struct closure *cl)
 
 static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
 {
+	struct search *s;
 	struct bio_vec *bv;
-	struct search *s = mempool_alloc(d->c->search, GFP_NOIO);
-	memset(s, 0, offsetof(struct search, op.keys));
+
+	s = mempool_alloc(d->c->search, GFP_NOIO);
+	memset(s, 0, offsetof(struct search, insert_keys));
 
 	__closure_init(&s->cl, NULL);
 
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 1f1b59d38db5..7d02ac5f936e 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -25,8 +25,10 @@ struct search {
 	short			error;
 	unsigned long		start_time;
 
-	/* Anything past op->keys won't get zeroed in do_bio_hook */
 	struct btree_op		op;
+
+	/* Anything past this point won't get zeroed in search_alloc() */
+	struct keylist		insert_keys;
 };
 
 void bch_cache_read_endio(struct bio *, int);
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 8ffc8ec7231d..51dc709c9bf7 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -300,18 +300,21 @@ static void write_dirty_finish(struct closure *cl)
 	if (KEY_DIRTY(&w->key)) {
 		unsigned i;
 		struct btree_op op;
+		struct keylist keys;
+
 		bch_btree_op_init_stack(&op);
+		bch_keylist_init(&keys);
 
 		op.type = BTREE_REPLACE;
 		bkey_copy(&op.replace, &w->key);
 
 		SET_KEY_DIRTY(&w->key, false);
-		bch_keylist_add(&op.keys, &w->key);
+		bch_keylist_add(&keys, &w->key);
 
 		for (i = 0; i < KEY_PTRS(&w->key); i++)
 			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
 
-		bch_btree_insert(&op, dc->disk.c, &op.keys);
+		bch_btree_insert(&op, dc->disk.c, &keys);
 		closure_sync(&op.cl);
 
 		if (op.insert_collision)

From e8e1d4682c8cb06dbcb5ef7bb851bf9bcb889c84 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 24 Jul 2013 17:27:07 -0700
Subject: [PATCH 59/94] bcache: Convert try_wait to wait_queue_head_t

We never waited on c->try_wait asynchronously, so just use the standard
primitives.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bcache.h |   4 +-
 drivers/md/bcache/btree.c  | 150 +++++++++++++++----------------------
 drivers/md/bcache/btree.h  |  10 ++-
 drivers/md/bcache/super.c  |  10 ++-
 4 files changed, 75 insertions(+), 99 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 10ce0c825fce..c1c44191afb1 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -735,8 +735,8 @@ struct cache_set {
 	 * basically a lock for this that we can wait on asynchronously. The
 	 * btree_root() macro releases the lock when it returns.
 	 */
-	struct closure		*try_harder;
-	struct closure_waitlist	try_wait;
+	struct task_struct	*try_harder;
+	wait_queue_head_t	try_wait;
 	uint64_t		try_harder_start;
 
 	/*
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 731cd8e3fe90..4d50f1e7006e 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -437,7 +437,7 @@ static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op)
 
 	set_btree_node_dirty(b);
 
-	if (op && op->journal) {
+	if (op->journal) {
 		if (w->journal &&
 		    journal_pin_cmp(b->c, w, op)) {
 			atomic_dec_bug(w->journal);
@@ -574,34 +574,35 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
 	return b;
 }
 
-static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order)
+static int mca_reap(struct btree *b, unsigned min_order, bool flush)
 {
+	struct closure cl;
+
+	closure_init_stack(&cl);
 	lockdep_assert_held(&b->c->bucket_lock);
 
 	if (!down_write_trylock(&b->lock))
 		return -ENOMEM;
 
-	if (b->page_order < min_order) {
+	BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
+
+	if (b->page_order < min_order ||
+	    (!flush &&
+	     (btree_node_dirty(b) ||
+	      atomic_read(&b->io.cl.remaining) != -1))) {
 		rw_unlock(true, b);
 		return -ENOMEM;
 	}
 
-	BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
-
-	if (cl && btree_node_dirty(b))
-		bch_btree_node_write(b, NULL);
-
-	if (cl)
-		closure_wait_event_async(&b->io.wait, cl,
-			 atomic_read(&b->io.cl.remaining) == -1);
-
-	if (btree_node_dirty(b) ||
-	    !closure_is_unlocked(&b->io.cl) ||
-	    work_pending(&b->work.work)) {
-		rw_unlock(true, b);
-		return -EAGAIN;
+	if (btree_node_dirty(b)) {
+		bch_btree_node_write(b, &cl);
+		closure_sync(&cl);
 	}
 
+	/* wait for any in flight btree write */
+	closure_wait_event_sync(&b->io.wait, &cl,
+		atomic_read(&b->io.cl.remaining) == -1);
+
 	return 0;
 }
 
@@ -641,7 +642,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
 			break;
 
 		if (++i > 3 &&
-		    !mca_reap(b, NULL, 0)) {
+		    !mca_reap(b, 0, false)) {
 			mca_data_free(b);
 			rw_unlock(true, b);
 			freed++;
@@ -660,7 +661,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
 		list_rotate_left(&c->btree_cache);
 
 		if (!b->accessed &&
-		    !mca_reap(b, NULL, 0)) {
+		    !mca_reap(b, 0, false)) {
 			mca_bucket_free(b);
 			mca_data_free(b);
 			rw_unlock(true, b);
@@ -783,52 +784,27 @@ out:
 	return b;
 }
 
-static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
-				     int level, struct closure *cl)
+static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
 {
-	int ret = -ENOMEM;
-	struct btree *i;
+	struct btree *b;
 
 	trace_bcache_btree_cache_cannibalize(c);
 
-	if (!cl)
-		return ERR_PTR(-ENOMEM);
+	if (!c->try_harder) {
+		c->try_harder = current;
+		c->try_harder_start = local_clock();
+	} else if (c->try_harder != current)
+		return ERR_PTR(-ENOSPC);
 
-	/*
-	 * Trying to free up some memory - i.e. reuse some btree nodes - may
-	 * require initiating IO to flush the dirty part of the node. If we're
-	 * running under generic_make_request(), that IO will never finish and
-	 * we would deadlock. Returning -EAGAIN causes the cache lookup code to
-	 * punt to workqueue and retry.
-	 */
-	if (current->bio_list)
-		return ERR_PTR(-EAGAIN);
+	list_for_each_entry_reverse(b, &c->btree_cache, list)
+		if (!mca_reap(b, btree_order(k), false))
+			return b;
 
-	if (c->try_harder && c->try_harder != cl) {
-		closure_wait_event_async(&c->try_wait, cl, !c->try_harder);
-		return ERR_PTR(-EAGAIN);
-	}
+	list_for_each_entry_reverse(b, &c->btree_cache, list)
+		if (!mca_reap(b, btree_order(k), true))
+			return b;
 
-	c->try_harder = cl;
-	c->try_harder_start = local_clock();
-retry:
-	list_for_each_entry_reverse(i, &c->btree_cache, list) {
-		int r = mca_reap(i, cl, btree_order(k));
-		if (!r)
-			return i;
-		if (r != -ENOMEM)
-			ret = r;
-	}
-
-	if (ret == -EAGAIN &&
-	    closure_blocking(cl)) {
-		mutex_unlock(&c->bucket_lock);
-		closure_sync(cl);
-		mutex_lock(&c->bucket_lock);
-		goto retry;
-	}
-
-	return ERR_PTR(ret);
+	return ERR_PTR(-ENOMEM);
 }
 
 /*
@@ -839,18 +815,19 @@ retry:
  */
 void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl)
 {
-	if (c->try_harder == cl) {
+	if (c->try_harder == current) {
 		bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
 		c->try_harder = NULL;
-		__closure_wake_up(&c->try_wait);
+		wake_up(&c->try_wait);
 	}
 }
 
-static struct btree *mca_alloc(struct cache_set *c, struct bkey *k,
-			       int level, struct closure *cl)
+static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
 {
 	struct btree *b;
 
+	BUG_ON(current->bio_list);
+
 	lockdep_assert_held(&c->bucket_lock);
 
 	if (mca_find(c, k))
@@ -860,14 +837,14 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k,
 	 * the list. Check if there's any freed nodes there:
 	 */
 	list_for_each_entry(b, &c->btree_cache_freeable, list)
-		if (!mca_reap(b, NULL, btree_order(k)))
+		if (!mca_reap(b, btree_order(k), false))
 			goto out;
 
 	/* We never free struct btree itself, just the memory that holds the on
 	 * disk node. Check the freed list before allocating a new one:
 	 */
 	list_for_each_entry(b, &c->btree_cache_freed, list)
-		if (!mca_reap(b, NULL, 0)) {
+		if (!mca_reap(b, 0, false)) {
 			mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
 			if (!b->sets[0].data)
 				goto err;
@@ -901,7 +878,7 @@ err:
 	if (b)
 		rw_unlock(true, b);
 
-	b = mca_cannibalize(c, k, level, cl);
+	b = mca_cannibalize(c, k);
 	if (!IS_ERR(b))
 		goto out;
 
@@ -919,10 +896,9 @@ err:
  * level and op->lock.
  */
 struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
-				 int level, struct btree_op *op)
+				 int level, bool write)
 {
 	int i = 0;
-	bool write = level <= op->lock;
 	struct btree *b;
 
 	BUG_ON(level < 0);
@@ -934,7 +910,7 @@ retry:
 			return ERR_PTR(-EAGAIN);
 
 		mutex_lock(&c->bucket_lock);
-		b = mca_alloc(c, k, level, &op->cl);
+		b = mca_alloc(c, k, level);
 		mutex_unlock(&c->bucket_lock);
 
 		if (!b)
@@ -980,7 +956,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
 	struct btree *b;
 
 	mutex_lock(&c->bucket_lock);
-	b = mca_alloc(c, k, level, NULL);
+	b = mca_alloc(c, k, level);
 	mutex_unlock(&c->bucket_lock);
 
 	if (!IS_ERR_OR_NULL(b)) {
@@ -991,17 +967,12 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
 
 /* Btree alloc */
 
-static void btree_node_free(struct btree *b, struct btree_op *op)
+static void btree_node_free(struct btree *b)
 {
 	unsigned i;
 
 	trace_bcache_btree_node_free(b);
 
-	/*
-	 * The BUG_ON() in btree_node_get() implies that we must have a write
-	 * lock on parent to free or even invalidate a node
-	 */
-	BUG_ON(op->lock <= b->level);
 	BUG_ON(b == b->c->root);
 
 	if (btree_node_dirty(b))
@@ -1037,7 +1008,7 @@ retry:
 
 	SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
 
-	b = mca_alloc(c, &k.key, level, cl);
+	b = mca_alloc(c, &k.key, level);
 	if (IS_ERR(b))
 		goto err_free;
 
@@ -1173,8 +1144,7 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys,
 	return stale;
 }
 
-static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
-				    struct btree_op *op)
+static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k)
 {
 	/*
 	 * We block priorities from being written for the duration of garbage
@@ -1191,7 +1161,7 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
 		memcpy(k->ptr, b->key.ptr,
 		       sizeof(uint64_t) * KEY_PTRS(&b->key));
 
-		btree_node_free(n, op);
+		btree_node_free(n);
 		up_write(&n->lock);
 	}
 
@@ -1211,8 +1181,8 @@ struct gc_merge_info {
 	unsigned	keys;
 };
 
-static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
-			      struct gc_stat *gc, struct gc_merge_info *r)
+static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc,
+			      struct gc_merge_info *r)
 {
 	unsigned nodes = 0, keys = 0, blocks;
 	int i;
@@ -1228,7 +1198,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
 
 	for (i = nodes - 1; i >= 0; --i) {
 		if (r[i].b->written)
-			r[i].b = btree_gc_alloc(r[i].b, r[i].k, op);
+			r[i].b = btree_gc_alloc(r[i].b, r[i].k);
 
 		if (r[i].b->written)
 			return;
@@ -1292,7 +1262,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
 		r[i - 1].keys	= n2->keys;
 	}
 
-	btree_node_free(r->b, op);
+	btree_node_free(r->b);
 	up_write(&r->b->lock);
 
 	trace_bcache_btree_gc_coalesce(nodes);
@@ -1324,7 +1294,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 	memset(r, 0, sizeof(r));
 
 	while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) {
-		r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op);
+		r->b = bch_btree_node_get(b->c, r->k, b->level - 1, true);
 
 		if (IS_ERR(r->b)) {
 			ret = PTR_ERR(r->b);
@@ -1337,7 +1307,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 		if (!b->written &&
 		    (r->b->level || stale > 10 ||
 		     b->c->gc_always_rewrite))
-			r->b = btree_gc_alloc(r->b, r->k, op);
+			r->b = btree_gc_alloc(r->b, r->k);
 
 		if (r->b->level)
 			ret = btree_gc_recurse(r->b, op, writes, gc);
@@ -1350,7 +1320,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 		bkey_copy_key(&b->c->gc_done, r->k);
 
 		if (!b->written)
-			btree_gc_coalesce(b, op, gc, r);
+			btree_gc_coalesce(b, gc, r);
 
 		if (r[GC_MERGE_NODES - 1].b)
 			write(r[GC_MERGE_NODES - 1].b);
@@ -1404,7 +1374,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
 	if (!IS_ERR_OR_NULL(n)) {
 		closure_sync(&op->cl);
 		bch_btree_set_root(b);
-		btree_node_free(n, op);
+		btree_node_free(n);
 		rw_unlock(true, b);
 	}
 
@@ -2004,18 +1974,18 @@ static int btree_split(struct btree *b, struct btree_op *op,
 	}
 
 	rw_unlock(true, n1);
-	btree_node_free(b, op);
+	btree_node_free(b);
 
 	bch_time_stats_update(&b->c->btree_split_time, start_time);
 
 	return 0;
 err_free2:
 	__bkey_put(n2->c, &n2->key);
-	btree_node_free(n2, op);
+	btree_node_free(n2);
 	rw_unlock(true, n2);
 err_free1:
 	__bkey_put(n1->c, &n1->key);
-	btree_node_free(n1, op);
+	btree_node_free(n1);
 	rw_unlock(true, n1);
 err:
 	if (n3 == ERR_PTR(-EAGAIN) ||
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 17b7a4e39c7e..72794ab8e8e5 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -326,7 +326,7 @@ static inline void rw_unlock(bool w, struct btree *b)
 ({									\
 	int _r, l = (b)->level - 1;					\
 	bool _w = l <= (op)->lock;					\
-	struct btree *_child = bch_btree_node_get((b)->c, key, l, op);	\
+	struct btree *_child = bch_btree_node_get((b)->c, key, l, _w);	\
 	if (!IS_ERR(_child)) {						\
 		_child->parent = (b);					\
 		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
@@ -356,6 +356,11 @@ static inline void rw_unlock(bool w, struct btree *b)
 		}							\
 		rw_unlock(_w, _b);					\
 		bch_cannibalize_unlock(c, &(op)->cl);			\
+		if (_r == -ENOSPC) {					\
+			wait_event((c)->try_wait,			\
+				   !(c)->try_harder);			\
+			_r = -EINTR;					\
+		}							\
 	} while (_r == -EINTR);						\
 									\
 	_r;								\
@@ -375,8 +380,7 @@ void bch_btree_node_write(struct btree *, struct closure *);
 void bch_cannibalize_unlock(struct cache_set *, struct closure *);
 void bch_btree_set_root(struct btree *);
 struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *);
-struct btree *bch_btree_node_get(struct cache_set *, struct bkey *,
-				int, struct btree_op *);
+struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool);
 
 int bch_btree_insert_check_key(struct btree *, struct btree_op *,
 			       struct bkey *);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index d3169c0652f8..9a164cd4058c 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1436,12 +1436,14 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 
 	c->sort_crit_factor = int_sqrt(c->btree_pages);
 
-	mutex_init(&c->bucket_lock);
-	mutex_init(&c->sort_lock);
-	spin_lock_init(&c->sort_time_lock);
 	closure_init_unlocked(&c->sb_write);
+	mutex_init(&c->bucket_lock);
+	init_waitqueue_head(&c->try_wait);
 	closure_init_unlocked(&c->uuid_write);
+	spin_lock_init(&c->sort_time_lock);
+	mutex_init(&c->sort_lock);
 	spin_lock_init(&c->btree_read_time_lock);
+
 	bch_moving_init_cache_set(c);
 
 	INIT_LIST_HEAD(&c->list);
@@ -1529,7 +1531,7 @@ static void run_cache_set(struct cache_set *c)
 			goto err;
 
 		err = "error reading btree root";
-		c->root = bch_btree_node_get(c, k, j->btree_level, &op);
+		c->root = bch_btree_node_get(c, k, j->btree_level, true);
 		if (IS_ERR_OR_NULL(c->root))
 			goto err;
 

From 35fcd848d72683141052aa9880542461577f2dbe Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 24 Jul 2013 17:29:09 -0700
Subject: [PATCH 60/94] bcache: Convert bucket_wait to wait_queue_head_t

At one point we did do fancy asynchronous waiting stuff with
bucket_wait, but that's all gone (and bucket_wait is used a lot less
than it used to be). So use the standard primitives.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/alloc.c   | 82 +++++++++++++++++++++----------------
 drivers/md/bcache/bcache.h  |  8 ++--
 drivers/md/bcache/btree.c   | 25 +++++------
 drivers/md/bcache/btree.h   |  6 +--
 drivers/md/bcache/request.c |  9 +---
 drivers/md/bcache/super.c   |  7 ++--
 6 files changed, 70 insertions(+), 67 deletions(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index e033b0203b68..1b64e662e81b 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -339,7 +339,7 @@ static int bch_allocator_thread(void *arg)
 			allocator_wait(ca, !fifo_full(&ca->free));
 
 			fifo_push(&ca->free, bucket);
-			closure_wake_up(&ca->set->bucket_wait);
+			wake_up(&ca->set->bucket_wait);
 		}
 
 		/*
@@ -365,16 +365,41 @@ static int bch_allocator_thread(void *arg)
 	}
 }
 
-long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl)
+long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait)
 {
-	long r = -1;
-again:
+	DEFINE_WAIT(w);
+	struct bucket *b;
+	long r;
+
+	/* fastpath */
+	if (fifo_used(&ca->free) > ca->watermark[watermark]) {
+		fifo_pop(&ca->free, r);
+		goto out;
+	}
+
+	if (!wait)
+		return -1;
+
+	while (1) {
+		if (fifo_used(&ca->free) > ca->watermark[watermark]) {
+			fifo_pop(&ca->free, r);
+			break;
+		}
+
+		prepare_to_wait(&ca->set->bucket_wait, &w,
+				TASK_UNINTERRUPTIBLE);
+
+		mutex_unlock(&ca->set->bucket_lock);
+		schedule();
+		mutex_lock(&ca->set->bucket_lock);
+	}
+
+	finish_wait(&ca->set->bucket_wait, &w);
+out:
 	wake_up_process(ca->alloc_thread);
 
-	if (fifo_used(&ca->free) > ca->watermark[watermark] &&
-	    fifo_pop(&ca->free, r)) {
-		struct bucket *b = ca->buckets + r;
 #ifdef CONFIG_BCACHE_EDEBUG
+	{
 		size_t iter;
 		long i;
 
@@ -387,36 +412,23 @@ again:
 			BUG_ON(i == r);
 		fifo_for_each(i, &ca->unused, iter)
 			BUG_ON(i == r);
+	}
 #endif
-		BUG_ON(atomic_read(&b->pin) != 1);
+	b = ca->buckets + r;
 
-		SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
+	BUG_ON(atomic_read(&b->pin) != 1);
 
-		if (watermark <= WATERMARK_METADATA) {
-			SET_GC_MARK(b, GC_MARK_METADATA);
-			b->prio = BTREE_PRIO;
-		} else {
-			SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
-			b->prio = INITIAL_PRIO;
-		}
+	SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
 
-		return r;
+	if (watermark <= WATERMARK_METADATA) {
+		SET_GC_MARK(b, GC_MARK_METADATA);
+		b->prio = BTREE_PRIO;
+	} else {
+		SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
+		b->prio = INITIAL_PRIO;
 	}
 
-	trace_bcache_alloc_fail(ca);
-
-	if (cl) {
-		closure_wait(&ca->set->bucket_wait, cl);
-
-		if (closure_blocking(cl)) {
-			mutex_unlock(&ca->set->bucket_lock);
-			closure_sync(cl);
-			mutex_lock(&ca->set->bucket_lock);
-			goto again;
-		}
-	}
-
-	return -1;
+	return r;
 }
 
 void bch_bucket_free(struct cache_set *c, struct bkey *k)
@@ -433,7 +445,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k)
 }
 
 int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
-			   struct bkey *k, int n, struct closure *cl)
+			   struct bkey *k, int n, bool wait)
 {
 	int i;
 
@@ -446,7 +458,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
 
 	for (i = 0; i < n; i++) {
 		struct cache *ca = c->cache_by_alloc[i];
-		long b = bch_bucket_alloc(ca, watermark, cl);
+		long b = bch_bucket_alloc(ca, watermark, wait);
 
 		if (b == -1)
 			goto err;
@@ -466,11 +478,11 @@ err:
 }
 
 int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
-			 struct bkey *k, int n, struct closure *cl)
+			 struct bkey *k, int n, bool wait)
 {
 	int ret;
 	mutex_lock(&c->bucket_lock);
-	ret = __bch_bucket_alloc_set(c, watermark, k, n, cl);
+	ret = __bch_bucket_alloc_set(c, watermark, k, n, wait);
 	mutex_unlock(&c->bucket_lock);
 	return ret;
 }
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index c1c44191afb1..d3520748bc27 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -750,7 +750,7 @@ struct cache_set {
 	 * written.
 	 */
 	atomic_t		prio_blocked;
-	struct closure_waitlist	bucket_wait;
+	wait_queue_head_t	bucket_wait;
 
 	/*
 	 * For any bio we don't skip we subtract the number of sectors from
@@ -1162,13 +1162,13 @@ uint8_t bch_inc_gen(struct cache *, struct bucket *);
 void bch_rescale_priorities(struct cache_set *, int);
 bool bch_bucket_add_unused(struct cache *, struct bucket *);
 
-long bch_bucket_alloc(struct cache *, unsigned, struct closure *);
+long bch_bucket_alloc(struct cache *, unsigned, bool);
 void bch_bucket_free(struct cache_set *, struct bkey *);
 
 int __bch_bucket_alloc_set(struct cache_set *, unsigned,
-			   struct bkey *, int, struct closure *);
+			   struct bkey *, int, bool);
 int bch_bucket_alloc_set(struct cache_set *, unsigned,
-			 struct bkey *, int, struct closure *);
+			 struct bkey *, int, bool);
 
 __printf(2, 3)
 bool bch_cache_set_error(struct cache_set *, const char *, ...);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 4d50f1e7006e..935d90df397b 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -813,7 +813,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
  * cannibalize_bucket() will take. This means every time we unlock the root of
  * the btree, we need to release this lock if we have it held.
  */
-void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl)
+void bch_cannibalize_unlock(struct cache_set *c)
 {
 	if (c->try_harder == current) {
 		bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
@@ -995,15 +995,14 @@ static void btree_node_free(struct btree *b)
 	mutex_unlock(&b->c->bucket_lock);
 }
 
-struct btree *bch_btree_node_alloc(struct cache_set *c, int level,
-				   struct closure *cl)
+struct btree *bch_btree_node_alloc(struct cache_set *c, int level)
 {
 	BKEY_PADDED(key) k;
 	struct btree *b = ERR_PTR(-EAGAIN);
 
 	mutex_lock(&c->bucket_lock);
 retry:
-	if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl))
+	if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true))
 		goto err;
 
 	SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
@@ -1036,10 +1035,9 @@ err:
 	return b;
 }
 
-static struct btree *btree_node_alloc_replacement(struct btree *b,
-						  struct closure *cl)
+static struct btree *btree_node_alloc_replacement(struct btree *b)
 {
-	struct btree *n = bch_btree_node_alloc(b->c, b->level, cl);
+	struct btree *n = bch_btree_node_alloc(b->c, b->level);
 	if (!IS_ERR_OR_NULL(n))
 		bch_btree_sort_into(b, n);
 
@@ -1152,7 +1150,7 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k)
 	 * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it
 	 * our closure.
 	 */
-	struct btree *n = btree_node_alloc_replacement(b, NULL);
+	struct btree *n = btree_node_alloc_replacement(b);
 
 	if (!IS_ERR_OR_NULL(n)) {
 		swap(b, n);
@@ -1359,7 +1357,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
 	int ret = 0, stale = btree_gc_mark_node(b, &keys, gc);
 
 	if (b->level || stale > 10)
-		n = btree_node_alloc_replacement(b, NULL);
+		n = btree_node_alloc_replacement(b);
 
 	if (!IS_ERR_OR_NULL(n))
 		swap(b, n);
@@ -1882,10 +1880,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
 	struct btree *n1, *n2 = NULL, *n3 = NULL;
 	uint64_t start_time = local_clock();
 
-	if (b->level)
-		set_closure_blocking(&op->cl);
-
-	n1 = btree_node_alloc_replacement(b, &op->cl);
+	n1 = btree_node_alloc_replacement(b);
 	if (IS_ERR(n1))
 		goto err;
 
@@ -1896,12 +1891,12 @@ static int btree_split(struct btree *b, struct btree_op *op,
 
 		trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
 
-		n2 = bch_btree_node_alloc(b->c, b->level, &op->cl);
+		n2 = bch_btree_node_alloc(b->c, b->level);
 		if (IS_ERR(n2))
 			goto err_free1;
 
 		if (!b->parent) {
-			n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl);
+			n3 = bch_btree_node_alloc(b->c, b->level + 1);
 			if (IS_ERR(n3))
 				goto err_free2;
 		}
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 72794ab8e8e5..d691d954730e 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -355,7 +355,7 @@ static inline void rw_unlock(bool w, struct btree *b)
 			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
 		}							\
 		rw_unlock(_w, _b);					\
-		bch_cannibalize_unlock(c, &(op)->cl);			\
+		bch_cannibalize_unlock(c);				\
 		if (_r == -ENOSPC) {					\
 			wait_event((c)->try_wait,			\
 				   !(c)->try_harder);			\
@@ -377,9 +377,9 @@ static inline bool should_split(struct btree *b)
 void bch_btree_node_read(struct btree *);
 void bch_btree_node_write(struct btree *, struct closure *);
 
-void bch_cannibalize_unlock(struct cache_set *, struct closure *);
+void bch_cannibalize_unlock(struct cache_set *);
 void bch_btree_set_root(struct btree *);
-struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *);
+struct btree *bch_btree_node_alloc(struct cache_set *, int);
 struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool);
 
 int bch_btree_insert_check_key(struct btree *, struct btree_op *,
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index d85c7001df61..26d18f4bf4a0 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -350,14 +350,8 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
 	struct cache_set *c = s->op.c;
 	struct open_bucket *b;
 	BKEY_PADDED(key) alloc;
-	struct closure cl, *w = NULL;
 	unsigned i;
 
-	if (s->writeback) {
-		closure_init_stack(&cl);
-		w = &cl;
-	}
-
 	/*
 	 * We might have to allocate a new bucket, which we can't do with a
 	 * spinlock held. So if we have to allocate, we drop the lock, allocate
@@ -375,7 +369,8 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
 
 		spin_unlock(&c->data_bucket_lock);
 
-		if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w))
+		if (bch_bucket_alloc_set(c, watermark, &alloc.key,
+					 1, s->writeback))
 			return false;
 
 		spin_lock(&c->data_bucket_lock);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 9a164cd4058c..84398a82fbe3 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -427,7 +427,7 @@ static int __uuid_write(struct cache_set *c)
 
 	lockdep_assert_held(&bch_register_lock);
 
-	if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, &cl))
+	if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true))
 		return 1;
 
 	SET_KEY_SIZE(&k.key, c->sb.bucket_size);
@@ -565,7 +565,7 @@ void bch_prio_write(struct cache *ca)
 		p->magic	= pset_magic(ca);
 		p->csum		= bch_crc64(&p->magic, bucket_bytes(ca) - 8);
 
-		bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, &cl);
+		bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true);
 		BUG_ON(bucket == -1);
 
 		mutex_unlock(&ca->set->bucket_lock);
@@ -1439,6 +1439,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 	closure_init_unlocked(&c->sb_write);
 	mutex_init(&c->bucket_lock);
 	init_waitqueue_head(&c->try_wait);
+	init_waitqueue_head(&c->bucket_wait);
 	closure_init_unlocked(&c->uuid_write);
 	spin_lock_init(&c->sort_time_lock);
 	mutex_init(&c->sort_lock);
@@ -1608,7 +1609,7 @@ static void run_cache_set(struct cache_set *c)
 			goto err_unlock_gc;
 
 		err = "cannot allocate new btree root";
-		c->root = bch_btree_node_alloc(c, 0, &op.cl);
+		c->root = bch_btree_node_alloc(c, 0);
 		if (IS_ERR_OR_NULL(c->root))
 			goto err_unlock_gc;
 

From 72a44517f3ca3725dc86081d105457df46448679 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Thu, 24 Oct 2013 17:19:26 -0700
Subject: [PATCH 61/94] bcache: Convert gc to a kthread

We needed a dedicated rescuer workqueue for gc anyways... and gc was
conceptually a dedicated thread, just one that wasn't running all the
time. Switch it to a dedicated thread to make the code a bit more
straightforward.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/alloc.c    |  6 ++---
 drivers/md/bcache/bcache.h   |  9 +++----
 drivers/md/bcache/btree.c    | 50 ++++++++++++++++++++++++------------
 drivers/md/bcache/btree.h    | 10 ++++++--
 drivers/md/bcache/movinggc.c | 35 ++++++++++---------------
 drivers/md/bcache/request.c  |  2 +-
 drivers/md/bcache/super.c    | 20 +++++++--------
 drivers/md/bcache/sysfs.c    |  2 +-
 8 files changed, 74 insertions(+), 60 deletions(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 1b64e662e81b..b9bd5866055d 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -210,7 +210,7 @@ static void invalidate_buckets_lru(struct cache *ca)
 			 * multiple times when it can't do anything
 			 */
 			ca->invalidate_needs_gc = 1;
-			bch_queue_gc(ca->set);
+			wake_up_gc(ca->set);
 			return;
 		}
 
@@ -235,7 +235,7 @@ static void invalidate_buckets_fifo(struct cache *ca)
 
 		if (++checked >= ca->sb.nbuckets) {
 			ca->invalidate_needs_gc = 1;
-			bch_queue_gc(ca->set);
+			wake_up_gc(ca->set);
 			return;
 		}
 	}
@@ -260,7 +260,7 @@ static void invalidate_buckets_random(struct cache *ca)
 
 		if (++checked >= ca->sb.nbuckets / 2) {
 			ca->invalidate_needs_gc = 1;
-			bch_queue_gc(ca->set);
+			wake_up_gc(ca->set);
 			return;
 		}
 	}
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d3520748bc27..09410eb07d82 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -773,7 +773,7 @@ struct cache_set {
 	struct gc_stat		gc_stats;
 	size_t			nbuckets;
 
-	struct closure_with_waitlist gc;
+	struct task_struct	*gc_thread;
 	/* Where in the btree gc currently is */
 	struct bkey		gc_done;
 
@@ -786,11 +786,10 @@ struct cache_set {
 	/* Counts how many sectors bio_insert has added to the cache */
 	atomic_t		sectors_to_gc;
 
-	struct closure		moving_gc;
-	struct closure_waitlist	moving_gc_wait;
+	wait_queue_head_t	moving_gc_wait;
 	struct keybuf		moving_gc_keys;
 	/* Number of moving GC bios in flight */
-	atomic_t		in_flight;
+	struct semaphore	moving_in_flight;
 
 	struct btree		*root;
 
@@ -1176,7 +1175,7 @@ bool bch_cache_set_error(struct cache_set *, const char *, ...);
 void bch_prio_write(struct cache *);
 void bch_write_bdev_super(struct cached_dev *, struct closure *);
 
-extern struct workqueue_struct *bcache_wq, *bch_gc_wq;
+extern struct workqueue_struct *bcache_wq;
 extern const char * const bch_cache_modes[];
 extern struct mutex bch_register_lock;
 extern struct list_head bch_cache_sets;
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 935d90df397b..17bfd87fc8f4 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -28,7 +28,9 @@
 
 #include <linux/slab.h>
 #include <linux/bitops.h>
+#include <linux/freezer.h>
 #include <linux/hash.h>
+#include <linux/kthread.h>
 #include <linux/prefetch.h>
 #include <linux/random.h>
 #include <linux/rcupdate.h>
@@ -105,7 +107,6 @@ static const char *op_type(struct btree_op *op)
 #define PTR_HASH(c, k)							\
 	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
 
-struct workqueue_struct *bch_gc_wq;
 static struct workqueue_struct *btree_io_wq;
 
 void bch_btree_op_init_stack(struct btree_op *op)
@@ -732,12 +733,9 @@ int bch_btree_cache_alloc(struct cache_set *c)
 {
 	unsigned i;
 
-	/* XXX: doesn't check for errors */
-
-	closure_init_unlocked(&c->gc);
-
 	for (i = 0; i < mca_reserve(c); i++)
-		mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
+		if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL))
+			return -ENOMEM;
 
 	list_splice_init(&c->btree_cache,
 			 &c->btree_cache_freeable);
@@ -1456,9 +1454,8 @@ size_t bch_btree_gc_finish(struct cache_set *c)
 	return available;
 }
 
-static void bch_btree_gc(struct closure *cl)
+static void bch_btree_gc(struct cache_set *c)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
 	int ret;
 	unsigned long available;
 	struct gc_stat stats;
@@ -1483,7 +1480,7 @@ static void bch_btree_gc(struct closure *cl)
 
 	if (ret) {
 		pr_warn("gc failed!");
-		continue_at(cl, bch_btree_gc, bch_gc_wq);
+		return;
 	}
 
 	/* Possibly wait for new UUIDs or whatever to hit disk */
@@ -1505,12 +1502,35 @@ static void bch_btree_gc(struct closure *cl)
 
 	trace_bcache_gc_end(c);
 
-	continue_at(cl, bch_moving_gc, bch_gc_wq);
+	bch_moving_gc(c);
 }
 
-void bch_queue_gc(struct cache_set *c)
+static int bch_gc_thread(void *arg)
 {
-	closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl);
+	struct cache_set *c = arg;
+
+	while (1) {
+		bch_btree_gc(c);
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop())
+			break;
+
+		try_to_freeze();
+		schedule();
+	}
+
+	return 0;
+}
+
+int bch_gc_thread_start(struct cache_set *c)
+{
+	c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc");
+	if (IS_ERR(c->gc_thread))
+		return PTR_ERR(c->gc_thread);
+
+	set_task_state(c->gc_thread, TASK_INTERRUPTIBLE);
+	return 0;
 }
 
 /* Initial partial gc */
@@ -2480,14 +2500,12 @@ void bch_btree_exit(void)
 {
 	if (btree_io_wq)
 		destroy_workqueue(btree_io_wq);
-	if (bch_gc_wq)
-		destroy_workqueue(bch_gc_wq);
 }
 
 int __init bch_btree_init(void)
 {
-	if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) ||
-	    !(btree_io_wq = create_singlethread_workqueue("bch_btree_io")))
+	btree_io_wq = create_singlethread_workqueue("bch_btree_io");
+	if (!btree_io_wq)
 		return -ENOMEM;
 
 	return 0;
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index d691d954730e..fa9641aaed39 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -388,12 +388,18 @@ int bch_btree_insert(struct btree_op *, struct cache_set *, struct keylist *);
 
 int bch_btree_search_recurse(struct btree *, struct btree_op *);
 
-void bch_queue_gc(struct cache_set *);
+int bch_gc_thread_start(struct cache_set *);
 size_t bch_btree_gc_finish(struct cache_set *);
-void bch_moving_gc(struct closure *);
+void bch_moving_gc(struct cache_set *);
 int bch_btree_check(struct cache_set *, struct btree_op *);
 uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *);
 
+static inline void wake_up_gc(struct cache_set *c)
+{
+	if (c->gc_thread)
+		wake_up_process(c->gc_thread);
+}
+
 void bch_keybuf_init(struct keybuf *);
 void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *,
 		       keybuf_pred_fn *);
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 2c42377a65aa..6ba050456ec8 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -57,8 +57,7 @@ static void write_moving_finish(struct closure *cl)
 
 	bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w);
 
-	atomic_dec_bug(&io->s.op.c->in_flight);
-	closure_wake_up(&io->s.op.c->moving_gc_wait);
+	up(&io->s.op.c->moving_in_flight);
 
 	closure_return_with_destructor(cl, moving_io_destructor);
 }
@@ -113,7 +112,7 @@ static void write_moving(struct closure *cl)
 		bch_data_insert(&s->op.cl);
 	}
 
-	continue_at(cl, write_moving_finish, bch_gc_wq);
+	continue_at(cl, write_moving_finish, system_wq);
 }
 
 static void read_moving_submit(struct closure *cl)
@@ -124,15 +123,17 @@ static void read_moving_submit(struct closure *cl)
 
 	bch_submit_bbio(bio, s->op.c, &io->w->key, 0);
 
-	continue_at(cl, write_moving, bch_gc_wq);
+	continue_at(cl, write_moving, system_wq);
 }
 
-static void read_moving(struct closure *cl)
+static void read_moving(struct cache_set *c)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, moving_gc);
 	struct keybuf_key *w;
 	struct moving_io *io;
 	struct bio *bio;
+	struct closure cl;
+
+	closure_init_stack(&cl);
 
 	/* XXX: if we error, background writeback could stall indefinitely */
 
@@ -164,13 +165,8 @@ static void read_moving(struct closure *cl)
 
 		trace_bcache_gc_copy(&w->key);
 
-		closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl);
-
-		if (atomic_inc_return(&c->in_flight) >= 64) {
-			closure_wait_event(&c->moving_gc_wait, cl,
-					   atomic_read(&c->in_flight) < 64);
-			continue_at(cl, read_moving, bch_gc_wq);
-		}
+		down(&c->moving_in_flight);
+		closure_call(&io->s.cl, read_moving_submit, NULL, &cl);
 	}
 
 	if (0) {
@@ -180,7 +176,7 @@ err:		if (!IS_ERR_OR_NULL(w->private))
 		bch_keybuf_del(&c->moving_gc_keys, w);
 	}
 
-	closure_return(cl);
+	closure_sync(&cl);
 }
 
 static bool bucket_cmp(struct bucket *l, struct bucket *r)
@@ -193,15 +189,14 @@ static unsigned bucket_heap_top(struct cache *ca)
 	return GC_SECTORS_USED(heap_peek(&ca->heap));
 }
 
-void bch_moving_gc(struct closure *cl)
+void bch_moving_gc(struct cache_set *c)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
 	struct cache *ca;
 	struct bucket *b;
 	unsigned i;
 
 	if (!c->copy_gc_enabled)
-		closure_return(cl);
+		return;
 
 	mutex_lock(&c->bucket_lock);
 
@@ -242,13 +237,11 @@ void bch_moving_gc(struct closure *cl)
 
 	c->moving_gc_keys.last_scanned = ZERO_KEY;
 
-	closure_init(&c->moving_gc, cl);
-	read_moving(&c->moving_gc);
-
-	closure_return(cl);
+	read_moving(c);
 }
 
 void bch_moving_init_cache_set(struct cache_set *c)
 {
 	bch_keybuf_init(&c->moving_gc_keys);
+	sema_init(&c->moving_in_flight, 64);
 }
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 26d18f4bf4a0..f779eb420d69 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -520,7 +520,7 @@ static void bch_data_insert_start(struct closure *cl)
 
 	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
 		set_gc_sectors(op->c);
-		bch_queue_gc(op->c);
+		wake_up_gc(op->c);
 	}
 
 	/*
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 84398a82fbe3..f89e2296bde1 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1342,6 +1342,9 @@ static void cache_set_flush(struct closure *cl)
 	kobject_put(&c->internal);
 	kobject_del(&c->kobj);
 
+	if (c->gc_thread)
+		kthread_stop(c->gc_thread);
+
 	if (!IS_ERR_OR_NULL(c->root))
 		list_add(&c->root->list, &c->btree_cache);
 
@@ -1579,8 +1582,6 @@ static void run_cache_set(struct cache_set *c)
 		bch_journal_replay(c, &journal, &op);
 	} else {
 		pr_notice("invalidating existing data");
-		/* Don't want invalidate_buckets() to queue a gc yet */
-		closure_lock(&c->gc, NULL);
 
 		for_each_cache(ca, c, i) {
 			unsigned j;
@@ -1606,12 +1607,12 @@ static void run_cache_set(struct cache_set *c)
 
 		err = "cannot allocate new UUID bucket";
 		if (__uuid_write(c))
-			goto err_unlock_gc;
+			goto err;
 
 		err = "cannot allocate new btree root";
 		c->root = bch_btree_node_alloc(c, 0);
 		if (IS_ERR_OR_NULL(c->root))
-			goto err_unlock_gc;
+			goto err;
 
 		bkey_copy_key(&c->root->key, &MAX_KEY);
 		bch_btree_node_write(c->root, &op.cl);
@@ -1628,12 +1629,12 @@ static void run_cache_set(struct cache_set *c)
 
 		bch_journal_next(&c->journal);
 		bch_journal_meta(c, &op.cl);
-
-		/* Unlock */
-		closure_set_stopped(&c->gc.cl);
-		closure_put(&c->gc.cl);
 	}
 
+	err = "error starting gc thread";
+	if (bch_gc_thread_start(c))
+		goto err;
+
 	closure_sync(&op.cl);
 	c->sb.last_mount = get_seconds();
 	bcache_write_super(c);
@@ -1644,9 +1645,6 @@ static void run_cache_set(struct cache_set *c)
 	flash_devs_run(c);
 
 	return;
-err_unlock_gc:
-	closure_set_stopped(&c->gc.cl);
-	closure_put(&c->gc.cl);
 err:
 	closure_sync(&op.cl);
 	/* XXX: test this, it's broken */
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index b3a66f17231d..ab286b9b5e40 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -566,7 +566,7 @@ STORE(__bch_cache_set)
 	}
 
 	if (attr == &sysfs_trigger_gc)
-		bch_queue_gc(c);
+		wake_up_gc(c);
 
 	if (attr == &sysfs_prune_cache) {
 		struct shrink_control sc;

From 5e6926daac267dd99552ae613f041a9e88bbf258 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 24 Jul 2013 17:50:06 -0700
Subject: [PATCH 62/94] bcache: Convert writeback to a kthread

This simplifies the writeback flow control quite a bit - previously, it
was conceptually two coroutines, refill_dirty() and read_dirty(). This
makes the code quite a bit more straightforward.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bcache.h    |  10 +-
 drivers/md/bcache/super.c     |   3 +-
 drivers/md/bcache/writeback.c | 371 ++++++++++++++++------------------
 drivers/md/bcache/writeback.h |  25 ++-
 4 files changed, 203 insertions(+), 206 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 09410eb07d82..674e2f42e778 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -509,7 +509,7 @@ struct cached_dev {
 
 	/* Limit number of writeback bios in flight */
 	struct semaphore	in_flight;
-	struct closure_with_timer writeback;
+	struct task_struct	*writeback_thread;
 
 	struct keybuf		writeback_keys;
 
@@ -1038,7 +1038,11 @@ static inline void bkey_init(struct bkey *k)
 
 #define KEY_START(k)		(KEY_OFFSET(k) - KEY_SIZE(k))
 #define START_KEY(k)		KEY(KEY_INODE(k), KEY_START(k), 0)
-#define MAX_KEY			KEY(~(~0 << 20), ((uint64_t) ~0) >> 1, 0)
+
+#define MAX_KEY_INODE		(~(~0 << 20))
+#define MAX_KEY_OFFSET		(((uint64_t) ~0) >> 1)
+#define MAX_KEY			KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0)
+
 #define ZERO_KEY		KEY(0, 0, 0)
 
 /*
@@ -1214,8 +1218,6 @@ int bch_cache_allocator_init(struct cache *ca);
 
 void bch_debug_exit(void);
 int bch_debug_init(struct kobject *);
-void bch_writeback_exit(void);
-int bch_writeback_init(void);
 void bch_request_exit(void);
 int bch_request_init(void);
 void bch_btree_exit(void);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index f89e2296bde1..b79dd5a6679e 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1029,6 +1029,7 @@ static void cached_dev_free(struct closure *cl)
 	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
 
 	cancel_delayed_work_sync(&dc->writeback_rate_update);
+	kthread_stop(dc->writeback_thread);
 
 	mutex_lock(&bch_register_lock);
 
@@ -2006,7 +2007,6 @@ static struct notifier_block reboot = {
 static void bcache_exit(void)
 {
 	bch_debug_exit();
-	bch_writeback_exit();
 	bch_request_exit();
 	bch_btree_exit();
 	if (bcache_kobj)
@@ -2039,7 +2039,6 @@ static int __init bcache_init(void)
 	    sysfs_create_files(bcache_kobj, files) ||
 	    bch_btree_init() ||
 	    bch_request_init() ||
-	    bch_writeback_init() ||
 	    bch_debug_init(bcache_kobj))
 		goto err;
 
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 51dc709c9bf7..4392f3f38d62 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -11,18 +11,11 @@
 #include "debug.h"
 #include "writeback.h"
 
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
 #include <trace/events/bcache.h>
 
-static struct workqueue_struct *dirty_wq;
-
-static void read_dirty(struct closure *);
-
-struct dirty_io {
-	struct closure		cl;
-	struct cached_dev	*dc;
-	struct bio		bio;
-};
-
 /* Rate limiting */
 
 static void __update_writeback_rate(struct cached_dev *dc)
@@ -72,9 +65,6 @@ out:
 	dc->writeback_rate_derivative = derivative;
 	dc->writeback_rate_change = change;
 	dc->writeback_rate_target = target;
-
-	schedule_delayed_work(&dc->writeback_rate_update,
-			      dc->writeback_rate_update_seconds * HZ);
 }
 
 static void update_writeback_rate(struct work_struct *work)
@@ -90,6 +80,9 @@ static void update_writeback_rate(struct work_struct *work)
 		__update_writeback_rate(dc);
 
 	up_read(&dc->writeback_lock);
+
+	schedule_delayed_work(&dc->writeback_rate_update,
+			      dc->writeback_rate_update_seconds * HZ);
 }
 
 static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
@@ -105,37 +98,11 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
 	return min_t(uint64_t, ret, HZ);
 }
 
-/* Background writeback */
-
-static bool dirty_pred(struct keybuf *buf, struct bkey *k)
-{
-	return KEY_DIRTY(k);
-}
-
-static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k)
-{
-	uint64_t stripe = KEY_START(k);
-	unsigned nr_sectors = KEY_SIZE(k);
-	struct cached_dev *dc = container_of(buf, struct cached_dev,
-					     writeback_keys);
-
-	if (!KEY_DIRTY(k))
-		return false;
-
-	do_div(stripe, dc->disk.stripe_size);
-
-	while (1) {
-		if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) ==
-		    dc->disk.stripe_size)
-			return true;
-
-		if (nr_sectors <= dc->disk.stripe_size)
-			return false;
-
-		nr_sectors -= dc->disk.stripe_size;
-		stripe++;
-	}
-}
+struct dirty_io {
+	struct closure		cl;
+	struct cached_dev	*dc;
+	struct bio		bio;
+};
 
 static void dirty_init(struct keybuf_key *w)
 {
@@ -153,132 +120,6 @@ static void dirty_init(struct keybuf_key *w)
 	bch_bio_map(bio, NULL);
 }
 
-static void refill_dirty(struct closure *cl)
-{
-	struct cached_dev *dc = container_of(cl, struct cached_dev,
-					     writeback.cl);
-	struct keybuf *buf = &dc->writeback_keys;
-	bool searched_from_start = false;
-	struct bkey end = MAX_KEY;
-	SET_KEY_INODE(&end, dc->disk.id);
-
-	if (!atomic_read(&dc->disk.detaching) &&
-	    !dc->writeback_running)
-		closure_return(cl);
-
-	down_write(&dc->writeback_lock);
-
-	if (!atomic_read(&dc->has_dirty)) {
-		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
-		bch_write_bdev_super(dc, NULL);
-
-		up_write(&dc->writeback_lock);
-		closure_return(cl);
-	}
-
-	if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
-		buf->last_scanned = KEY(dc->disk.id, 0, 0);
-		searched_from_start = true;
-	}
-
-	if (dc->partial_stripes_expensive) {
-		uint64_t i;
-
-		for (i = 0; i < dc->disk.nr_stripes; i++)
-			if (atomic_read(dc->disk.stripe_sectors_dirty + i) ==
-			    dc->disk.stripe_size)
-				goto full_stripes;
-
-		goto normal_refill;
-full_stripes:
-		searched_from_start = false;	/* not searching entire btree */
-		bch_refill_keybuf(dc->disk.c, buf, &end,
-				  dirty_full_stripe_pred);
-	} else {
-normal_refill:
-		bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
-	}
-
-	if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
-		/* Searched the entire btree  - delay awhile */
-
-		if (RB_EMPTY_ROOT(&buf->keys)) {
-			atomic_set(&dc->has_dirty, 0);
-			cached_dev_put(dc);
-		}
-
-		if (!atomic_read(&dc->disk.detaching))
-			closure_delay(&dc->writeback, dc->writeback_delay * HZ);
-	}
-
-	up_write(&dc->writeback_lock);
-
-	bch_ratelimit_reset(&dc->writeback_rate);
-
-	/* Punt to workqueue only so we don't recurse and blow the stack */
-	continue_at(cl, read_dirty, dirty_wq);
-}
-
-void bch_writeback_queue(struct cached_dev *dc)
-{
-	if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) {
-		if (!atomic_read(&dc->disk.detaching))
-			closure_delay(&dc->writeback, dc->writeback_delay * HZ);
-
-		continue_at(&dc->writeback.cl, refill_dirty, dirty_wq);
-	}
-}
-
-void bch_writeback_add(struct cached_dev *dc)
-{
-	if (!atomic_read(&dc->has_dirty) &&
-	    !atomic_xchg(&dc->has_dirty, 1)) {
-		atomic_inc(&dc->count);
-
-		if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
-			SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
-			/* XXX: should do this synchronously */
-			bch_write_bdev_super(dc, NULL);
-		}
-
-		bch_writeback_queue(dc);
-
-		if (dc->writeback_percent)
-			schedule_delayed_work(&dc->writeback_rate_update,
-				      dc->writeback_rate_update_seconds * HZ);
-	}
-}
-
-void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
-				  uint64_t offset, int nr_sectors)
-{
-	struct bcache_device *d = c->devices[inode];
-	unsigned stripe_offset;
-	uint64_t stripe = offset;
-
-	if (!d)
-		return;
-
-	do_div(stripe, d->stripe_size);
-
-	stripe_offset = offset & (d->stripe_size - 1);
-
-	while (nr_sectors) {
-		int s = min_t(unsigned, abs(nr_sectors),
-			      d->stripe_size - stripe_offset);
-
-		if (nr_sectors < 0)
-			s = -s;
-
-		atomic_add(s, d->stripe_sectors_dirty + stripe);
-		nr_sectors -= s;
-		stripe_offset = 0;
-		stripe++;
-	}
-}
-
-/* Background writeback - IO loop */
-
 static void dirty_io_destructor(struct closure *cl)
 {
 	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
@@ -378,30 +219,33 @@ static void read_dirty_submit(struct closure *cl)
 	continue_at(cl, write_dirty, system_wq);
 }
 
-static void read_dirty(struct closure *cl)
+static void read_dirty(struct cached_dev *dc)
 {
-	struct cached_dev *dc = container_of(cl, struct cached_dev,
-					     writeback.cl);
-	unsigned delay = writeback_delay(dc, 0);
+	unsigned delay = 0;
 	struct keybuf_key *w;
 	struct dirty_io *io;
+	struct closure cl;
+
+	closure_init_stack(&cl);
 
 	/*
 	 * XXX: if we error, background writeback just spins. Should use some
 	 * mempools.
 	 */
 
-	while (1) {
+	while (!kthread_should_stop()) {
+		try_to_freeze();
+
 		w = bch_keybuf_next(&dc->writeback_keys);
 		if (!w)
 			break;
 
 		BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
 
-		if (delay > 0 &&
-		    (KEY_START(&w->key) != dc->last_read ||
-		     jiffies_to_msecs(delay) > 50))
-			delay = schedule_timeout_uninterruptible(delay);
+		if (KEY_START(&w->key) != dc->last_read ||
+		    jiffies_to_msecs(delay) > 50)
+			while (!kthread_should_stop() && delay)
+				delay = schedule_timeout_interruptible(delay);
 
 		dc->last_read	= KEY_OFFSET(&w->key);
 
@@ -427,7 +271,7 @@ static void read_dirty(struct closure *cl)
 		trace_bcache_writeback(&w->key);
 
 		down(&dc->in_flight);
-		closure_call(&io->cl, read_dirty_submit, NULL, cl);
+		closure_call(&io->cl, read_dirty_submit, NULL, &cl);
 
 		delay = writeback_delay(dc, KEY_SIZE(&w->key));
 	}
@@ -443,7 +287,148 @@ err:
 	 * Wait for outstanding writeback IOs to finish (and keybuf slots to be
 	 * freed) before refilling again
 	 */
-	continue_at(cl, refill_dirty, dirty_wq);
+	closure_sync(&cl);
+}
+
+/* Scan for dirty data */
+
+void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
+				  uint64_t offset, int nr_sectors)
+{
+	struct bcache_device *d = c->devices[inode];
+	unsigned stripe_offset;
+	uint64_t stripe = offset;
+
+	if (!d)
+		return;
+
+	do_div(stripe, d->stripe_size);
+
+	stripe_offset = offset & (d->stripe_size - 1);
+
+	while (nr_sectors) {
+		int s = min_t(unsigned, abs(nr_sectors),
+			      d->stripe_size - stripe_offset);
+
+		if (nr_sectors < 0)
+			s = -s;
+
+		atomic_add(s, d->stripe_sectors_dirty + stripe);
+		nr_sectors -= s;
+		stripe_offset = 0;
+		stripe++;
+	}
+}
+
+static bool dirty_pred(struct keybuf *buf, struct bkey *k)
+{
+	return KEY_DIRTY(k);
+}
+
+static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k)
+{
+	uint64_t stripe = KEY_START(k);
+	unsigned nr_sectors = KEY_SIZE(k);
+	struct cached_dev *dc = container_of(buf, struct cached_dev,
+					     writeback_keys);
+
+	if (!KEY_DIRTY(k))
+		return false;
+
+	do_div(stripe, dc->disk.stripe_size);
+
+	while (1) {
+		if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) ==
+		    dc->disk.stripe_size)
+			return true;
+
+		if (nr_sectors <= dc->disk.stripe_size)
+			return false;
+
+		nr_sectors -= dc->disk.stripe_size;
+		stripe++;
+	}
+}
+
+static bool refill_dirty(struct cached_dev *dc)
+{
+	struct keybuf *buf = &dc->writeback_keys;
+	bool searched_from_start = false;
+	struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
+
+	if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
+		buf->last_scanned = KEY(dc->disk.id, 0, 0);
+		searched_from_start = true;
+	}
+
+	if (dc->partial_stripes_expensive) {
+		uint64_t i;
+
+		for (i = 0; i < dc->disk.nr_stripes; i++)
+			if (atomic_read(dc->disk.stripe_sectors_dirty + i) ==
+			    dc->disk.stripe_size)
+				goto full_stripes;
+
+		goto normal_refill;
+full_stripes:
+		searched_from_start = false;	/* not searching entire btree */
+		bch_refill_keybuf(dc->disk.c, buf, &end,
+				  dirty_full_stripe_pred);
+	} else {
+normal_refill:
+		bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
+	}
+
+	return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start;
+}
+
+static int bch_writeback_thread(void *arg)
+{
+	struct cached_dev *dc = arg;
+	bool searched_full_index;
+
+	while (!kthread_should_stop()) {
+		down_write(&dc->writeback_lock);
+		if (!atomic_read(&dc->has_dirty) ||
+		    (!atomic_read(&dc->disk.detaching) &&
+		     !dc->writeback_running)) {
+			up_write(&dc->writeback_lock);
+			set_current_state(TASK_INTERRUPTIBLE);
+
+			if (kthread_should_stop())
+				return 0;
+
+			try_to_freeze();
+			schedule();
+			continue;
+		}
+
+		searched_full_index = refill_dirty(dc);
+
+		if (searched_full_index &&
+		    RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
+			atomic_set(&dc->has_dirty, 0);
+			cached_dev_put(dc);
+			SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
+			bch_write_bdev_super(dc, NULL);
+		}
+
+		up_write(&dc->writeback_lock);
+
+		bch_ratelimit_reset(&dc->writeback_rate);
+		read_dirty(dc);
+
+		if (searched_full_index) {
+			unsigned delay = dc->writeback_delay * HZ;
+
+			while (delay &&
+			       !kthread_should_stop() &&
+			       !atomic_read(&dc->disk.detaching))
+				delay = schedule_timeout_interruptible(delay);
+		}
+	}
+
+	return 0;
 }
 
 /* Init */
@@ -483,12 +468,10 @@ void bch_sectors_dirty_init(struct cached_dev *dc)
 	btree_root(sectors_dirty_init, dc->disk.c, &op, dc);
 }
 
-void bch_cached_dev_writeback_init(struct cached_dev *dc)
+int bch_cached_dev_writeback_init(struct cached_dev *dc)
 {
 	sema_init(&dc->in_flight, 64);
-	closure_init_unlocked(&dc->writeback);
 	init_rwsem(&dc->writeback_lock);
-
 	bch_keybuf_init(&dc->writeback_keys);
 
 	dc->writeback_metadata		= true;
@@ -502,22 +485,16 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 	dc->writeback_rate_p_term_inverse = 64;
 	dc->writeback_rate_d_smooth	= 8;
 
+	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
+					      "bcache_writeback");
+	if (IS_ERR(dc->writeback_thread))
+		return PTR_ERR(dc->writeback_thread);
+
+	set_task_state(dc->writeback_thread, TASK_INTERRUPTIBLE);
+
 	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
 	schedule_delayed_work(&dc->writeback_rate_update,
 			      dc->writeback_rate_update_seconds * HZ);
-}
-
-void bch_writeback_exit(void)
-{
-	if (dirty_wq)
-		destroy_workqueue(dirty_wq);
-}
-
-int __init bch_writeback_init(void)
-{
-	dirty_wq = create_workqueue("bcache_writeback");
-	if (!dirty_wq)
-		return -ENOMEM;
 
 	return 0;
 }
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 34961888b5a9..60516bfa6052 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -56,11 +56,30 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 		in_use <= CUTOFF_WRITEBACK;
 }
 
+static inline void bch_writeback_queue(struct cached_dev *dc)
+{
+	wake_up_process(dc->writeback_thread);
+}
+
+static inline void bch_writeback_add(struct cached_dev *dc)
+{
+	if (!atomic_read(&dc->has_dirty) &&
+	    !atomic_xchg(&dc->has_dirty, 1)) {
+		atomic_inc(&dc->count);
+
+		if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
+			SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
+			/* XXX: should do this synchronously */
+			bch_write_bdev_super(dc, NULL);
+		}
+
+		bch_writeback_queue(dc);
+	}
+}
+
 void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
-void bch_writeback_queue(struct cached_dev *);
-void bch_writeback_add(struct cached_dev *);
 
 void bch_sectors_dirty_init(struct cached_dev *dc);
-void bch_cached_dev_writeback_init(struct cached_dev *);
+int bch_cached_dev_writeback_init(struct cached_dev *);
 
 #endif

From 48dad8baf92fe8967d9e1358af1cfdda1d2d3298 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Tue, 10 Sep 2013 18:48:51 -0700
Subject: [PATCH 63/94] bcache: Add btree_map() functions

Lots of stuff has been open coding its own btree traversal - which is
generally pretty simple code, but there are a few subtleties.

This adds new new functions, bch_btree_map_nodes() and
bch_btree_map_keys(), which do the traversal for you. Everything that's
open coding btree traversal now (with the exception of garbage
collection) is slowly going to be converted to these two functions;
being able to write other code at a higher level of abstraction  is a
big improvement w.r.t. overall code quality.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bcache.h    |   2 -
 drivers/md/bcache/bset.c      |  33 +++---
 drivers/md/bcache/btree.c     | 188 +++++++++++++++++++++++-----------
 drivers/md/bcache/btree.h     |  37 ++++++-
 drivers/md/bcache/writeback.c |  35 +++----
 5 files changed, 192 insertions(+), 103 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 674e2f42e778..20fe96c121d9 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -384,8 +384,6 @@ struct keybuf_key {
 	void			*private;
 };
 
-typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);
-
 struct keybuf {
 	struct bkey		last_scanned;
 	spinlock_t		lock;
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index d0512e451dda..14c2a23d3884 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -842,6 +842,13 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
 
 /* Btree iterator */
 
+/*
+ * Returns true if l > r - unless l == r, in which case returns true if l is
+ * older than r.
+ *
+ * Necessary for btree_sort_fixup() - if there are multiple keys that compare
+ * equal in different sets, we have to process them newest to oldest.
+ */
 static inline bool btree_iter_cmp(struct btree_iter_set l,
 				  struct btree_iter_set r)
 {
@@ -1146,16 +1153,16 @@ out:
 /* Sysfs stuff */
 
 struct bset_stats {
+	struct btree_op op;
 	size_t nodes;
 	size_t sets_written, sets_unwritten;
 	size_t bytes_written, bytes_unwritten;
 	size_t floats, failed;
 };
 
-static int bch_btree_bset_stats(struct btree *b, struct btree_op *op,
-			    struct bset_stats *stats)
+static int btree_bset_stats(struct btree_op *op, struct btree *b)
 {
-	struct bkey *k;
+	struct bset_stats *stats = container_of(op, struct bset_stats, op);
 	unsigned i;
 
 	stats->nodes++;
@@ -1180,30 +1187,20 @@ static int bch_btree_bset_stats(struct btree *b, struct btree_op *op,
 		}
 	}
 
-	if (b->level) {
-		struct btree_iter iter;
-
-		for_each_key_filter(b, k, &iter, bch_ptr_bad) {
-			int ret = btree(bset_stats, k, b, op, stats);
-			if (ret)
-				return ret;
-		}
-	}
-
-	return 0;
+	return MAP_CONTINUE;
 }
 
 int bch_bset_print_stats(struct cache_set *c, char *buf)
 {
-	struct btree_op op;
 	struct bset_stats t;
 	int ret;
 
-	bch_btree_op_init_stack(&op);
 	memset(&t, 0, sizeof(struct bset_stats));
+	bch_btree_op_init_stack(&t.op);
+	t.op.c = c;
 
-	ret = btree_root(bset_stats, c, &op, &t);
-	if (ret)
+	ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats);
+	if (ret < 0)
 		return ret;
 
 	return snprintf(buf, PAGE_SIZE,
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 17bfd87fc8f4..cfbdcf349b7f 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -2296,6 +2296,82 @@ int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
 	return ret;
 }
 
+/* Map across nodes or keys */
+
+static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
+				       struct bkey *from,
+				       btree_map_nodes_fn *fn, int flags)
+{
+	int ret = MAP_CONTINUE;
+
+	if (b->level) {
+		struct bkey *k;
+		struct btree_iter iter;
+
+		bch_btree_iter_init(b, &iter, from);
+
+		while ((k = bch_btree_iter_next_filter(&iter, b,
+						       bch_ptr_bad))) {
+			ret = btree(map_nodes_recurse, k, b,
+				    op, from, fn, flags);
+			from = NULL;
+
+			if (ret != MAP_CONTINUE)
+				return ret;
+		}
+	}
+
+	if (!b->level || flags == MAP_ALL_NODES)
+		ret = fn(op, b);
+
+	return ret;
+}
+
+int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
+			  struct bkey *from, btree_map_nodes_fn *fn, int flags)
+{
+	int ret = btree_root(map_nodes_recurse, c, op, from, fn, flags);
+	if (closure_blocking(&op->cl))
+		closure_sync(&op->cl);
+	return ret;
+}
+
+static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
+				      struct bkey *from, btree_map_keys_fn *fn,
+				      int flags)
+{
+	int ret = MAP_CONTINUE;
+	struct bkey *k;
+	struct btree_iter iter;
+
+	bch_btree_iter_init(b, &iter, from);
+
+	while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) {
+		ret = !b->level
+			? fn(op, b, k)
+			: btree(map_keys_recurse, k, b, op, from, fn, flags);
+		from = NULL;
+
+		if (ret != MAP_CONTINUE)
+			return ret;
+	}
+
+	if (!b->level && (flags & MAP_END_KEY))
+		ret = fn(op, b, &KEY(KEY_INODE(&b->key),
+				     KEY_OFFSET(&b->key), 0));
+
+	return ret;
+}
+
+int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
+		       struct bkey *from, btree_map_keys_fn *fn, int flags)
+{
+	int ret = btree_root(map_keys_recurse, c, op, from, fn, flags);
+	if (closure_blocking(&op->cl))
+		closure_sync(&op->cl);
+	return ret;
+}
+
 /* Keybuf code */
 
 static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
@@ -2314,74 +2390,70 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
 	return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
 }
 
-static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
-				   struct keybuf *buf, struct bkey *end,
-				   keybuf_pred_fn *pred)
+struct refill {
+	struct btree_op	op;
+	struct keybuf	*buf;
+	struct bkey	*end;
+	keybuf_pred_fn	*pred;
+};
+
+static int refill_keybuf_fn(struct btree_op *op, struct btree *b,
+			    struct bkey *k)
 {
-	struct btree_iter iter;
-	bch_btree_iter_init(b, &iter, &buf->last_scanned);
+	struct refill *refill = container_of(op, struct refill, op);
+	struct keybuf *buf = refill->buf;
+	int ret = MAP_CONTINUE;
 
-	while (!array_freelist_empty(&buf->freelist)) {
-		struct bkey *k = bch_btree_iter_next_filter(&iter, b,
-							    bch_ptr_bad);
-
-		if (!b->level) {
-			if (!k) {
-				buf->last_scanned = b->key;
-				break;
-			}
-
-			buf->last_scanned = *k;
-			if (bkey_cmp(&buf->last_scanned, end) >= 0)
-				break;
-
-			if (pred(buf, k)) {
-				struct keybuf_key *w;
-
-				spin_lock(&buf->lock);
-
-				w = array_alloc(&buf->freelist);
-
-				w->private = NULL;
-				bkey_copy(&w->key, k);
-
-				if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
-					array_free(&buf->freelist, w);
-
-				spin_unlock(&buf->lock);
-			}
-		} else {
-			if (!k)
-				break;
-
-			btree(refill_keybuf, k, b, op, buf, end, pred);
-			/*
-			 * Might get an error here, but can't really do anything
-			 * and it'll get logged elsewhere. Just read what we
-			 * can.
-			 */
-
-			if (bkey_cmp(&buf->last_scanned, end) >= 0)
-				break;
-
-			cond_resched();
-		}
+	if (bkey_cmp(k, refill->end) >= 0) {
+		ret = MAP_DONE;
+		goto out;
 	}
 
-	return 0;
+	if (!KEY_SIZE(k)) /* end key */
+		goto out;
+
+	if (refill->pred(buf, k)) {
+		struct keybuf_key *w;
+
+		spin_lock(&buf->lock);
+
+		w = array_alloc(&buf->freelist);
+		if (!w) {
+			spin_unlock(&buf->lock);
+			return MAP_DONE;
+		}
+
+		w->private = NULL;
+		bkey_copy(&w->key, k);
+
+		if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
+			array_free(&buf->freelist, w);
+
+		if (array_freelist_empty(&buf->freelist))
+			ret = MAP_DONE;
+
+		spin_unlock(&buf->lock);
+	}
+out:
+	buf->last_scanned = *k;
+	return ret;
 }
 
 void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
 		       struct bkey *end, keybuf_pred_fn *pred)
 {
 	struct bkey start = buf->last_scanned;
-	struct btree_op op;
-	bch_btree_op_init_stack(&op);
+	struct refill refill;
 
 	cond_resched();
 
-	btree_root(refill_keybuf, c, &op, buf, end, pred);
-	closure_sync(&op.cl);
+	bch_btree_op_init_stack(&refill.op);
+	refill.buf = buf;
+	refill.end = end;
+	refill.pred = pred;
+
+	bch_btree_map_keys(&refill.op, c, &buf->last_scanned,
+			   refill_keybuf_fn, MAP_END_KEY);
 
 	pr_debug("found %s keys from %llu:%llu to %llu:%llu",
 		 RB_EMPTY_ROOT(&buf->keys) ? "no" :
@@ -2465,9 +2537,9 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
 }
 
 struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
-					     struct keybuf *buf,
-					     struct bkey *end,
-					     keybuf_pred_fn *pred)
+					  struct keybuf *buf,
+					  struct bkey *end,
+					  keybuf_pred_fn *pred)
 {
 	struct keybuf_key *ret;
 
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index fa9641aaed39..cafdeb01e219 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -400,9 +400,42 @@ static inline void wake_up_gc(struct cache_set *c)
 		wake_up_process(c->gc_thread);
 }
 
+#define MAP_DONE	0
+#define MAP_CONTINUE	1
+
+#define MAP_ALL_NODES	0
+#define MAP_LEAF_NODES	1
+
+#define MAP_END_KEY	1
+
+typedef int (btree_map_nodes_fn)(struct btree_op *, struct btree *);
+int __bch_btree_map_nodes(struct btree_op *, struct cache_set *,
+			  struct bkey *, btree_map_nodes_fn *, int);
+
+static inline int bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
+				      struct bkey *from, btree_map_nodes_fn *fn)
+{
+	return __bch_btree_map_nodes(op, c, from, fn, MAP_ALL_NODES);
+}
+
+static inline int bch_btree_map_leaf_nodes(struct btree_op *op,
+					   struct cache_set *c,
+					   struct bkey *from,
+					   btree_map_nodes_fn *fn)
+{
+	return __bch_btree_map_nodes(op, c, from, fn, MAP_LEAF_NODES);
+}
+
+typedef int (btree_map_keys_fn)(struct btree_op *, struct btree *,
+				struct bkey *);
+int bch_btree_map_keys(struct btree_op *, struct cache_set *,
+		       struct bkey *, btree_map_keys_fn *, int);
+
+typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);
+
 void bch_keybuf_init(struct keybuf *);
-void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *,
-		       keybuf_pred_fn *);
+void bch_refill_keybuf(struct cache_set *, struct keybuf *,
+		       struct bkey *, keybuf_pred_fn *);
 bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *,
 				  struct bkey *);
 void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 4392f3f38d62..c68de9f12618 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -433,31 +433,17 @@ static int bch_writeback_thread(void *arg)
 
 /* Init */
 
-static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op,
-					struct cached_dev *dc)
+static int sectors_dirty_init_fn(struct btree_op *op, struct btree *b,
+				 struct bkey *k)
 {
-	struct bkey *k;
-	struct btree_iter iter;
+	if (KEY_INODE(k) > op->inode)
+		return MAP_DONE;
 
-	bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0));
-	while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad)))
-		if (!b->level) {
-			if (KEY_INODE(k) > dc->disk.id)
-				break;
+	if (KEY_DIRTY(k))
+		bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
+					     KEY_START(k), KEY_SIZE(k));
 
-			if (KEY_DIRTY(k))
-				bcache_dev_sectors_dirty_add(b->c, dc->disk.id,
-							     KEY_START(k),
-							     KEY_SIZE(k));
-		} else {
-			btree(sectors_dirty_init, k, b, op, dc);
-			if (KEY_INODE(k) > dc->disk.id)
-				break;
-
-			cond_resched();
-		}
-
-	return 0;
+	return MAP_CONTINUE;
 }
 
 void bch_sectors_dirty_init(struct cached_dev *dc)
@@ -465,7 +451,10 @@ void bch_sectors_dirty_init(struct cached_dev *dc)
 	struct btree_op op;
 
 	bch_btree_op_init_stack(&op);
-	btree_root(sectors_dirty_init, dc->disk.c, &op, dc);
+	op.inode = dc->disk.id;
+
+	bch_btree_map_keys(&op, dc->disk.c, &KEY(op.inode, 0, 0),
+			   sectors_dirty_init_fn, 0);
 }
 
 int bch_cached_dev_writeback_init(struct cached_dev *dc)

From df8e89701fb02cba6e09c5f46f002778b5b52dd2 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 24 Jul 2013 17:37:59 -0700
Subject: [PATCH 64/94] bcache: Move some stuff to btree.c

With the new btree_map() functions, we don't need to export the stuff
needed for traversing the btree anymore.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/btree.c   | 95 ++++++++++++++++++++++++++++++++++++-
 drivers/md/bcache/btree.h   | 82 +-------------------------------
 drivers/md/bcache/request.c | 16 +------
 3 files changed, 96 insertions(+), 97 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index cfbdcf349b7f..bfd60e6a2312 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -99,6 +99,13 @@ static const char *op_type(struct btree_op *op)
 	return op_types[op->type];
 }
 
+enum {
+	BTREE_INSERT_STATUS_INSERT,
+	BTREE_INSERT_STATUS_BACK_MERGE,
+	BTREE_INSERT_STATUS_OVERWROTE,
+	BTREE_INSERT_STATUS_FRONT_MERGE,
+};
+
 #define MAX_NEED_GC		64
 #define MAX_SAVE_PRIO		72
 
@@ -116,6 +123,78 @@ void bch_btree_op_init_stack(struct btree_op *op)
 	op->lock = -1;
 }
 
+static inline bool should_split(struct btree *b)
+{
+	struct bset *i = write_block(b);
+	return b->written >= btree_blocks(b) ||
+		(b->written + __set_blocks(i, i->keys + 15, b->c)
+		 > btree_blocks(b));
+}
+
+#define insert_lock(s, b)	((b)->level <= (s)->lock)
+
+/*
+ * These macros are for recursing down the btree - they handle the details of
+ * locking and looking up nodes in the cache for you. They're best treated as
+ * mere syntax when reading code that uses them.
+ *
+ * op->lock determines whether we take a read or a write lock at a given depth.
+ * If you've got a read lock and find that you need a write lock (i.e. you're
+ * going to have to split), set op->lock and return -EINTR; btree_root() will
+ * call you again and you'll have the correct lock.
+ */
+
+/**
+ * btree - recurse down the btree on a specified key
+ * @fn:		function to call, which will be passed the child node
+ * @key:	key to recurse on
+ * @b:		parent btree node
+ * @op:		pointer to struct btree_op
+ */
+#define btree(fn, key, b, op, ...)					\
+({									\
+	int _r, l = (b)->level - 1;					\
+	bool _w = l <= (op)->lock;					\
+	struct btree *_child = bch_btree_node_get((b)->c, key, l, _w);	\
+	if (!IS_ERR(_child)) {						\
+		_child->parent = (b);					\
+		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
+		rw_unlock(_w, _child);					\
+	} else								\
+		_r = PTR_ERR(_child);					\
+	_r;								\
+})
+
+/**
+ * btree_root - call a function on the root of the btree
+ * @fn:		function to call, which will be passed the child node
+ * @c:		cache set
+ * @op:		pointer to struct btree_op
+ */
+#define btree_root(fn, c, op, ...)					\
+({									\
+	int _r = -EINTR;						\
+	do {								\
+		struct btree *_b = (c)->root;				\
+		bool _w = insert_lock(op, _b);				\
+		rw_lock(_w, _b, _b->level);				\
+		if (_b == (c)->root &&					\
+		    _w == insert_lock(op, _b)) {			\
+			_b->parent = NULL;				\
+			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
+		}							\
+		rw_unlock(_w, _b);					\
+		bch_cannibalize_unlock(c);				\
+		if (_r == -ENOSPC) {					\
+			wait_event((c)->try_wait,			\
+				   !(c)->try_harder);			\
+			_r = -EINTR;					\
+		}							\
+	} while (_r == -EINTR);						\
+									\
+	_r;								\
+})
+
 /* Btree key manipulation */
 
 void __bkey_put(struct cache_set *c, struct bkey *k)
@@ -811,7 +890,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
  * cannibalize_bucket() will take. This means every time we unlock the root of
  * the btree, we need to release this lock if we have it held.
  */
-void bch_cannibalize_unlock(struct cache_set *c)
+static void bch_cannibalize_unlock(struct cache_set *c)
 {
 	if (c->try_harder == current) {
 		bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
@@ -2262,7 +2341,7 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
 	return 0;
 }
 
-int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
+static int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
 {
 	struct search *s = container_of(op, struct search, op);
 	struct bio *bio = &s->bio.bio;
@@ -2296,6 +2375,18 @@ int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
 	return ret;
 }
 
+void bch_btree_search_async(struct closure *cl)
+{
+	struct btree_op *op = container_of(cl, struct btree_op, cl);
+
+	int ret = btree_root(search_recurse, op->c, op);
+
+	if (ret == -EAGAIN)
+		continue_at(cl, bch_btree_search_async, bcache_wq);
+
+	closure_return(cl);
+}
+
 /* Map across nodes or keys */
 
 static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index cafdeb01e219..1690f4731c1e 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -270,13 +270,6 @@ struct btree_op {
 	BKEY_PADDED(replace);
 };
 
-enum {
-	BTREE_INSERT_STATUS_INSERT,
-	BTREE_INSERT_STATUS_BACK_MERGE,
-	BTREE_INSERT_STATUS_OVERWROTE,
-	BTREE_INSERT_STATUS_FRONT_MERGE,
-};
-
 void bch_btree_op_init_stack(struct btree_op *);
 
 static inline void rw_lock(bool w, struct btree *b, int level)
@@ -302,82 +295,9 @@ static inline void rw_unlock(bool w, struct btree *b)
 	(w ? up_write : up_read)(&b->lock);
 }
 
-#define insert_lock(s, b)	((b)->level <= (s)->lock)
-
-/*
- * These macros are for recursing down the btree - they handle the details of
- * locking and looking up nodes in the cache for you. They're best treated as
- * mere syntax when reading code that uses them.
- *
- * op->lock determines whether we take a read or a write lock at a given depth.
- * If you've got a read lock and find that you need a write lock (i.e. you're
- * going to have to split), set op->lock and return -EINTR; btree_root() will
- * call you again and you'll have the correct lock.
- */
-
-/**
- * btree - recurse down the btree on a specified key
- * @fn:		function to call, which will be passed the child node
- * @key:	key to recurse on
- * @b:		parent btree node
- * @op:		pointer to struct btree_op
- */
-#define btree(fn, key, b, op, ...)					\
-({									\
-	int _r, l = (b)->level - 1;					\
-	bool _w = l <= (op)->lock;					\
-	struct btree *_child = bch_btree_node_get((b)->c, key, l, _w);	\
-	if (!IS_ERR(_child)) {						\
-		_child->parent = (b);					\
-		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
-		rw_unlock(_w, _child);					\
-	} else								\
-		_r = PTR_ERR(_child);					\
-	_r;								\
-})
-
-/**
- * btree_root - call a function on the root of the btree
- * @fn:		function to call, which will be passed the child node
- * @c:		cache set
- * @op:		pointer to struct btree_op
- */
-#define btree_root(fn, c, op, ...)					\
-({									\
-	int _r = -EINTR;						\
-	do {								\
-		struct btree *_b = (c)->root;				\
-		bool _w = insert_lock(op, _b);				\
-		rw_lock(_w, _b, _b->level);				\
-		if (_b == (c)->root &&					\
-		    _w == insert_lock(op, _b)) {			\
-			_b->parent = NULL;				\
-			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
-		}							\
-		rw_unlock(_w, _b);					\
-		bch_cannibalize_unlock(c);				\
-		if (_r == -ENOSPC) {					\
-			wait_event((c)->try_wait,			\
-				   !(c)->try_harder);			\
-			_r = -EINTR;					\
-		}							\
-	} while (_r == -EINTR);						\
-									\
-	_r;								\
-})
-
-static inline bool should_split(struct btree *b)
-{
-	struct bset *i = write_block(b);
-	return b->written >= btree_blocks(b) ||
-		(b->written + __set_blocks(i, i->keys + 15, b->c)
-		 > btree_blocks(b));
-}
-
 void bch_btree_node_read(struct btree *);
 void bch_btree_node_write(struct btree *, struct closure *);
 
-void bch_cannibalize_unlock(struct cache_set *);
 void bch_btree_set_root(struct btree *);
 struct btree *bch_btree_node_alloc(struct cache_set *, int);
 struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool);
@@ -386,7 +306,7 @@ int bch_btree_insert_check_key(struct btree *, struct btree_op *,
 			       struct bkey *);
 int bch_btree_insert(struct btree_op *, struct cache_set *, struct keylist *);
 
-int bch_btree_search_recurse(struct btree *, struct btree_op *);
+void bch_btree_search_async(struct closure *);
 
 int bch_gc_thread_start(struct cache_set *);
 size_t bch_btree_gc_finish(struct cache_set *);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index f779eb420d69..a16872541038 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -754,18 +754,6 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
 	return s;
 }
 
-static void btree_read_async(struct closure *cl)
-{
-	struct btree_op *op = container_of(cl, struct btree_op, cl);
-
-	int ret = btree_root(search_recurse, op->c, op);
-
-	if (ret == -EAGAIN)
-		continue_at(cl, btree_read_async, bcache_wq);
-
-	closure_return(cl);
-}
-
 /* Cached devices */
 
 static void cached_dev_bio_complete(struct closure *cl)
@@ -1087,7 +1075,7 @@ static void cached_dev_read(struct cached_dev *dc, struct search *s)
 {
 	struct closure *cl = &s->cl;
 
-	closure_call(&s->op.cl, btree_read_async, NULL, cl);
+	closure_call(&s->op.cl, bch_btree_search_async, NULL, cl);
 	continue_at(cl, cached_dev_read_done_bh, NULL);
 }
 
@@ -1351,7 +1339,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
 
 		closure_call(&s->op.cl, bch_data_insert, NULL, cl);
 	} else {
-		closure_call(&s->op.cl, btree_read_async, NULL, cl);
+		closure_call(&s->op.cl, bch_btree_search_async, NULL, cl);
 	}
 
 	continue_at(cl, search_free, NULL);

From 2c1953e201a05ddfb1ea53f23d81a492c6513028 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 24 Jul 2013 17:41:08 -0700
Subject: [PATCH 65/94] bcache: Convert bch_btree_read_async() to
 bch_btree_map_keys()

This is a fairly straightforward conversion, mostly reshuffling -
op->lookup_done goes away, replaced by MAP_DONE/MAP_CONTINUE. And the
code for handling cache hits and misses wasn't really btree code, so it
gets moved to request.c.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/btree.c   | 133 -------------------------------
 drivers/md/bcache/btree.h   |   3 -
 drivers/md/bcache/journal.c |   1 -
 drivers/md/bcache/request.c | 154 +++++++++++++++++++++++++++++-------
 drivers/md/bcache/request.h |   2 -
 5 files changed, 125 insertions(+), 168 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index bfd60e6a2312..3949673cb1b0 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -23,7 +23,6 @@
 #include "bcache.h"
 #include "btree.h"
 #include "debug.h"
-#include "request.h"
 #include "writeback.h"
 
 #include <linux/slab.h>
@@ -2255,138 +2254,6 @@ void bch_btree_set_root(struct btree *b)
 	closure_sync(&cl);
 }
 
-/* Cache lookup */
-
-static int submit_partial_cache_miss(struct btree *b, struct btree_op *op,
-				     struct bkey *k)
-{
-	struct search *s = container_of(op, struct search, op);
-	struct bio *bio = &s->bio.bio;
-	int ret = 0;
-
-	while (!ret &&
-	       !op->lookup_done) {
-		unsigned sectors = INT_MAX;
-
-		if (KEY_INODE(k) == op->inode) {
-			if (KEY_START(k) <= bio->bi_sector)
-				break;
-
-			sectors = min_t(uint64_t, sectors,
-					KEY_START(k) - bio->bi_sector);
-		}
-
-		ret = s->d->cache_miss(b, s, bio, sectors);
-	}
-
-	return ret;
-}
-
-/*
- * Read from a single key, handling the initial cache miss if the key starts in
- * the middle of the bio
- */
-static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
-				    struct bkey *k)
-{
-	struct search *s = container_of(op, struct search, op);
-	struct bio *bio = &s->bio.bio;
-	unsigned ptr;
-	struct bio *n;
-
-	int ret = submit_partial_cache_miss(b, op, k);
-	if (ret || op->lookup_done)
-		return ret;
-
-	/* XXX: figure out best pointer - for multiple cache devices */
-	ptr = 0;
-
-	PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
-
-	while (!op->lookup_done &&
-	       KEY_INODE(k) == op->inode &&
-	       bio->bi_sector < KEY_OFFSET(k)) {
-		struct bkey *bio_key;
-		sector_t sector = PTR_OFFSET(k, ptr) +
-			(bio->bi_sector - KEY_START(k));
-		unsigned sectors = min_t(uint64_t, INT_MAX,
-					 KEY_OFFSET(k) - bio->bi_sector);
-
-		n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
-		if (n == bio)
-			op->lookup_done = true;
-
-		bio_key = &container_of(n, struct bbio, bio)->key;
-
-		/*
-		 * The bucket we're reading from might be reused while our bio
-		 * is in flight, and we could then end up reading the wrong
-		 * data.
-		 *
-		 * We guard against this by checking (in cache_read_endio()) if
-		 * the pointer is stale again; if so, we treat it as an error
-		 * and reread from the backing device (but we don't pass that
-		 * error up anywhere).
-		 */
-
-		bch_bkey_copy_single_ptr(bio_key, k, ptr);
-		SET_PTR_OFFSET(bio_key, 0, sector);
-
-		n->bi_end_io	= bch_cache_read_endio;
-		n->bi_private	= &s->cl;
-
-		__bch_submit_bbio(n, b->c);
-	}
-
-	return 0;
-}
-
-static int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
-{
-	struct search *s = container_of(op, struct search, op);
-	struct bio *bio = &s->bio.bio;
-
-	int ret = 0;
-	struct bkey *k;
-	struct btree_iter iter;
-	bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0));
-
-	do {
-		k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
-		if (!k) {
-			/*
-			 * b->key would be exactly what we want, except that
-			 * pointers to btree nodes have nonzero size - we
-			 * wouldn't go far enough
-			 */
-
-			ret = submit_partial_cache_miss(b, op,
-					&KEY(KEY_INODE(&b->key),
-					     KEY_OFFSET(&b->key), 0));
-			break;
-		}
-
-		ret = b->level
-			? btree(search_recurse, k, b, op)
-			: submit_partial_cache_hit(b, op, k);
-	} while (!ret &&
-		 !op->lookup_done);
-
-	return ret;
-}
-
-void bch_btree_search_async(struct closure *cl)
-{
-	struct btree_op *op = container_of(cl, struct btree_op, cl);
-
-	int ret = btree_root(search_recurse, op->c, op);
-
-	if (ret == -EAGAIN)
-		continue_at(cl, bch_btree_search_async, bcache_wq);
-
-	closure_return(cl);
-}
-
 /* Map across nodes or keys */
 
 static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 1690f4731c1e..60dadd722ace 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -264,7 +264,6 @@ struct btree_op {
 	unsigned		flush_journal:1;
 
 	unsigned		insert_data_done:1;
-	unsigned		lookup_done:1;
 	unsigned		insert_collision:1;
 
 	BKEY_PADDED(replace);
@@ -306,8 +305,6 @@ int bch_btree_insert_check_key(struct btree *, struct btree_op *,
 			       struct bkey *);
 int bch_btree_insert(struct btree_op *, struct cache_set *, struct keylist *);
 
-void bch_btree_search_async(struct closure *);
-
 int bch_gc_thread_start(struct cache_set *);
 size_t bch_btree_gc_finish(struct cache_set *);
 void bch_moving_gc(struct cache_set *);
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 8866f8ee3a07..6f4daf031410 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -7,7 +7,6 @@
 #include "bcache.h"
 #include "btree.h"
 #include "debug.h"
-#include "request.h"
 
 #include <trace/events/bcache.h>
 
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index a16872541038..854743e85e76 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -638,24 +638,9 @@ void bch_data_insert(struct closure *cl)
 	bch_data_insert_start(cl);
 }
 
-/* Common code for the make_request functions */
+/* Cache lookup */
 
-static void request_endio(struct bio *bio, int error)
-{
-	struct closure *cl = bio->bi_private;
-
-	if (error) {
-		struct search *s = container_of(cl, struct search, cl);
-		s->error = error;
-		/* Only cache read errors are recoverable */
-		s->recoverable = false;
-	}
-
-	bio_put(bio);
-	closure_put(cl);
-}
-
-void bch_cache_read_endio(struct bio *bio, int error)
+static void bch_cache_read_endio(struct bio *bio, int error)
 {
 	struct bbio *b = container_of(bio, struct bbio, bio);
 	struct closure *cl = bio->bi_private;
@@ -678,6 +663,120 @@ void bch_cache_read_endio(struct bio *bio, int error)
 	bch_bbio_endio(s->op.c, bio, error, "reading from cache");
 }
 
+static int submit_partial_cache_miss(struct btree *b, struct search *s,
+				     struct bkey *k)
+{
+	struct bio *bio = &s->bio.bio;
+	int ret = MAP_CONTINUE;
+
+	do {
+		unsigned sectors = INT_MAX;
+
+		if (KEY_INODE(k) == s->op.inode) {
+			if (KEY_START(k) <= bio->bi_sector)
+				break;
+
+			sectors = min_t(uint64_t, sectors,
+					KEY_START(k) - bio->bi_sector);
+		}
+
+		ret = s->d->cache_miss(b, s, bio, sectors);
+	} while (ret == MAP_CONTINUE);
+
+	return ret;
+}
+
+/*
+ * Read from a single key, handling the initial cache miss if the key starts in
+ * the middle of the bio
+ */
+static int submit_partial_cache_hit(struct btree_op *op, struct btree *b,
+				    struct bkey *k)
+{
+	struct search *s = container_of(op, struct search, op);
+	struct bio *bio = &s->bio.bio;
+	unsigned ptr;
+	struct bio *n;
+
+	int ret = submit_partial_cache_miss(b, s, k);
+	if (ret != MAP_CONTINUE || !KEY_SIZE(k))
+		return ret;
+
+	/* XXX: figure out best pointer - for multiple cache devices */
+	ptr = 0;
+
+	PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
+
+	while (ret == MAP_CONTINUE &&
+	       KEY_INODE(k) == op->inode &&
+	       bio->bi_sector < KEY_OFFSET(k)) {
+		struct bkey *bio_key;
+		sector_t sector = PTR_OFFSET(k, ptr) +
+			(bio->bi_sector - KEY_START(k));
+		unsigned sectors = min_t(uint64_t, INT_MAX,
+					 KEY_OFFSET(k) - bio->bi_sector);
+
+		n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+		if (n == bio)
+			ret = MAP_DONE;
+
+		bio_key = &container_of(n, struct bbio, bio)->key;
+
+		/*
+		 * The bucket we're reading from might be reused while our bio
+		 * is in flight, and we could then end up reading the wrong
+		 * data.
+		 *
+		 * We guard against this by checking (in cache_read_endio()) if
+		 * the pointer is stale again; if so, we treat it as an error
+		 * and reread from the backing device (but we don't pass that
+		 * error up anywhere).
+		 */
+
+		bch_bkey_copy_single_ptr(bio_key, k, ptr);
+		SET_PTR_OFFSET(bio_key, 0, sector);
+
+		n->bi_end_io	= bch_cache_read_endio;
+		n->bi_private	= &s->cl;
+
+		__bch_submit_bbio(n, b->c);
+	}
+
+	return ret;
+}
+
+static void cache_lookup(struct closure *cl)
+{
+	struct btree_op *op = container_of(cl, struct btree_op, cl);
+	struct search *s = container_of(op, struct search, op);
+	struct bio *bio = &s->bio.bio;
+
+	int ret = bch_btree_map_keys(op, op->c,
+				     &KEY(op->inode, bio->bi_sector, 0),
+				     submit_partial_cache_hit, 1);
+	if (ret == -EAGAIN)
+		continue_at(cl, cache_lookup, bcache_wq);
+
+	closure_return(cl);
+}
+
+/* Common code for the make_request functions */
+
+static void request_endio(struct bio *bio, int error)
+{
+	struct closure *cl = bio->bi_private;
+
+	if (error) {
+		struct search *s = container_of(cl, struct search, cl);
+		s->error = error;
+		/* Only cache read errors are recoverable */
+		s->recoverable = false;
+	}
+
+	bio_put(bio);
+	closure_put(cl);
+}
+
 static void bio_complete(struct search *s)
 {
 	if (s->orig_bio) {
@@ -1005,15 +1104,14 @@ static void cached_dev_read_done_bh(struct closure *cl)
 static int cached_dev_cache_miss(struct btree *b, struct search *s,
 				 struct bio *bio, unsigned sectors)
 {
-	int ret = 0;
+	int ret = MAP_CONTINUE;
 	unsigned reada = 0;
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 	struct bio *miss, *cache_bio;
 
 	if (s->cache_miss || s->op.bypass) {
 		miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
-		if (miss == bio)
-			s->op.lookup_done = true;
+		ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
 		goto out_submit;
 	}
 
@@ -1033,11 +1131,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 		return ret;
 
 	miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
-	if (miss == bio)
-		s->op.lookup_done = true;
-	else
-		/* btree_search_recurse()'s btree iterator is no good anymore */
-		ret = -EINTR;
+
+	/* btree_search_recurse()'s btree iterator is no good anymore */
+	ret = miss == bio ? MAP_DONE : -EINTR;
 
 	cache_bio = bio_alloc_bioset(GFP_NOWAIT,
 			DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS),
@@ -1075,7 +1171,7 @@ static void cached_dev_read(struct cached_dev *dc, struct search *s)
 {
 	struct closure *cl = &s->cl;
 
-	closure_call(&s->op.cl, bch_btree_search_async, NULL, cl);
+	closure_call(&s->op.cl, cache_lookup, NULL, cl);
 	continue_at(cl, cached_dev_read_done_bh, NULL);
 }
 
@@ -1287,9 +1383,9 @@ static int flash_dev_cache_miss(struct btree *b, struct search *s,
 	bio_advance(bio, min(sectors << 9, bio->bi_size));
 
 	if (!bio->bi_size)
-		s->op.lookup_done = true;
+		return MAP_DONE;
 
-	return 0;
+	return MAP_CONTINUE;
 }
 
 static void flash_dev_nodata(struct closure *cl)
@@ -1339,7 +1435,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
 
 		closure_call(&s->op.cl, bch_data_insert, NULL, cl);
 	} else {
-		closure_call(&s->op.cl, bch_btree_search_async, NULL, cl);
+		closure_call(&s->op.cl, cache_lookup, NULL, cl);
 	}
 
 	continue_at(cl, search_free, NULL);
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 7d02ac5f936e..b0b4b0b5b7e9 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -31,10 +31,8 @@ struct search {
 	struct keylist		insert_keys;
 };
 
-void bch_cache_read_endio(struct bio *, int);
 unsigned bch_get_congested(struct cache_set *);
 void bch_data_insert(struct closure *cl);
-void bch_cache_read_endio(struct bio *, int);
 
 void bch_open_buckets_free(struct cache_set *);
 int bch_open_buckets_alloc(struct cache_set *);

From cc23196631fbcd1bc3eafedbb712413fdbf946a3 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 24 Jul 2013 17:41:13 -0700
Subject: [PATCH 66/94] bcache: Clean up cache_lookup_fn

There was some looping in submit_partial_cache_hit() and
submit_partial_cache_hit() that isn't needed anymore - originally, we
wouldn't necessarily process the full hit or miss all at once because
when splitting the bio, we took into account the restrictions of the
device we were sending it to.

But, device bio size restrictions are now handled elsewhere, with a
wrapper around generic_make_request() - so that looping has been
unnecessary for awhile now and we can now do quite a bit of cleanup.

And if we trim the key we're reading from to match the subset we're
actually reading, we don't have to explicitly calculate bi_sector
anymore. Neat.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/request.c | 108 +++++++++++++++---------------------
 1 file changed, 46 insertions(+), 62 deletions(-)

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 854743e85e76..de3fc76ffcfc 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -663,86 +663,70 @@ static void bch_cache_read_endio(struct bio *bio, int error)
 	bch_bbio_endio(s->op.c, bio, error, "reading from cache");
 }
 
-static int submit_partial_cache_miss(struct btree *b, struct search *s,
-				     struct bkey *k)
-{
-	struct bio *bio = &s->bio.bio;
-	int ret = MAP_CONTINUE;
-
-	do {
-		unsigned sectors = INT_MAX;
-
-		if (KEY_INODE(k) == s->op.inode) {
-			if (KEY_START(k) <= bio->bi_sector)
-				break;
-
-			sectors = min_t(uint64_t, sectors,
-					KEY_START(k) - bio->bi_sector);
-		}
-
-		ret = s->d->cache_miss(b, s, bio, sectors);
-	} while (ret == MAP_CONTINUE);
-
-	return ret;
-}
-
 /*
  * Read from a single key, handling the initial cache miss if the key starts in
  * the middle of the bio
  */
-static int submit_partial_cache_hit(struct btree_op *op, struct btree *b,
-				    struct bkey *k)
+static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
 {
 	struct search *s = container_of(op, struct search, op);
-	struct bio *bio = &s->bio.bio;
+	struct bio *n, *bio = &s->bio.bio;
+	struct bkey *bio_key;
 	unsigned ptr;
-	struct bio *n;
 
-	int ret = submit_partial_cache_miss(b, s, k);
-	if (ret != MAP_CONTINUE || !KEY_SIZE(k))
-		return ret;
+	if (bkey_cmp(k, &KEY(op->inode, bio->bi_sector, 0)) <= 0)
+		return MAP_CONTINUE;
+
+	if (KEY_INODE(k) != s->op.inode ||
+	    KEY_START(k) > bio->bi_sector) {
+		unsigned bio_sectors = bio_sectors(bio);
+		unsigned sectors = KEY_INODE(k) == s->op.inode
+			? min_t(uint64_t, INT_MAX,
+				KEY_START(k) - bio->bi_sector)
+			: INT_MAX;
+
+		int ret = s->d->cache_miss(b, s, bio, sectors);
+		if (ret != MAP_CONTINUE)
+			return ret;
+
+		/* if this was a complete miss we shouldn't get here */
+		BUG_ON(bio_sectors <= sectors);
+	}
+
+	if (!KEY_SIZE(k))
+		return MAP_CONTINUE;
 
 	/* XXX: figure out best pointer - for multiple cache devices */
 	ptr = 0;
 
 	PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
 
-	while (ret == MAP_CONTINUE &&
-	       KEY_INODE(k) == op->inode &&
-	       bio->bi_sector < KEY_OFFSET(k)) {
-		struct bkey *bio_key;
-		sector_t sector = PTR_OFFSET(k, ptr) +
-			(bio->bi_sector - KEY_START(k));
-		unsigned sectors = min_t(uint64_t, INT_MAX,
-					 KEY_OFFSET(k) - bio->bi_sector);
+	n = bch_bio_split(bio, min_t(uint64_t, INT_MAX,
+				     KEY_OFFSET(k) - bio->bi_sector),
+			  GFP_NOIO, s->d->bio_split);
 
-		n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
-		if (n == bio)
-			ret = MAP_DONE;
+	bio_key = &container_of(n, struct bbio, bio)->key;
+	bch_bkey_copy_single_ptr(bio_key, k, ptr);
 
-		bio_key = &container_of(n, struct bbio, bio)->key;
+	bch_cut_front(&KEY(s->op.inode, n->bi_sector, 0), bio_key);
+	bch_cut_back(&KEY(s->op.inode, bio_end_sector(n), 0), bio_key);
 
-		/*
-		 * The bucket we're reading from might be reused while our bio
-		 * is in flight, and we could then end up reading the wrong
-		 * data.
-		 *
-		 * We guard against this by checking (in cache_read_endio()) if
-		 * the pointer is stale again; if so, we treat it as an error
-		 * and reread from the backing device (but we don't pass that
-		 * error up anywhere).
-		 */
+	n->bi_end_io	= bch_cache_read_endio;
+	n->bi_private	= &s->cl;
 
-		bch_bkey_copy_single_ptr(bio_key, k, ptr);
-		SET_PTR_OFFSET(bio_key, 0, sector);
+	/*
+	 * The bucket we're reading from might be reused while our bio
+	 * is in flight, and we could then end up reading the wrong
+	 * data.
+	 *
+	 * We guard against this by checking (in cache_read_endio()) if
+	 * the pointer is stale again; if so, we treat it as an error
+	 * and reread from the backing device (but we don't pass that
+	 * error up anywhere).
+	 */
 
-		n->bi_end_io	= bch_cache_read_endio;
-		n->bi_private	= &s->cl;
-
-		__bch_submit_bbio(n, b->c);
-	}
-
-	return ret;
+	__bch_submit_bbio(n, b->c);
+	return n == bio ? MAP_DONE : MAP_CONTINUE;
 }
 
 static void cache_lookup(struct closure *cl)
@@ -753,7 +737,7 @@ static void cache_lookup(struct closure *cl)
 
 	int ret = bch_btree_map_keys(op, op->c,
 				     &KEY(op->inode, bio->bi_sector, 0),
-				     submit_partial_cache_hit, 1);
+				     cache_lookup_fn, MAP_END_KEY);
 	if (ret == -EAGAIN)
 		continue_at(cl, cache_lookup, bcache_wq);
 

From c18536a72ddd7fe30d63e6c1500b5c930ac14594 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 24 Jul 2013 17:44:17 -0700
Subject: [PATCH 67/94] bcache: Prune struct btree_op

Eventual goal is for struct btree_op to contain only what is necessary
for traversing the btree.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bset.c      |   1 -
 drivers/md/bcache/btree.c     |  32 +++---
 drivers/md/bcache/btree.h     |  21 +---
 drivers/md/bcache/journal.c   |  32 +++---
 drivers/md/bcache/journal.h   |   9 +-
 drivers/md/bcache/movinggc.c  |  18 ++--
 drivers/md/bcache/request.c   | 177 +++++++++++++++++-----------------
 drivers/md/bcache/request.h   |  14 +++
 drivers/md/bcache/stats.c     |   8 +-
 drivers/md/bcache/super.c     |  21 ++--
 drivers/md/bcache/writeback.c |  17 +++-
 11 files changed, 179 insertions(+), 171 deletions(-)

diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 14c2a23d3884..fae5b7b3f5ab 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -1197,7 +1197,6 @@ int bch_bset_print_stats(struct cache_set *c, char *buf)
 
 	memset(&t, 0, sizeof(struct bset_stats));
 	bch_btree_op_init_stack(&t.op);
-	t.op.c = c;
 
 	ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats);
 	if (ret < 0)
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 3949673cb1b0..5cb59c313dc3 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -503,7 +503,7 @@ static void btree_node_write_work(struct work_struct *w)
 	rw_unlock(true, b);
 }
 
-static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op)
+static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
 {
 	struct bset *i = b->sets[b->nsets].data;
 	struct btree_write *w = btree_current_write(b);
@@ -516,15 +516,15 @@ static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op)
 
 	set_btree_node_dirty(b);
 
-	if (op->journal) {
+	if (journal_ref) {
 		if (w->journal &&
-		    journal_pin_cmp(b->c, w, op)) {
+		    journal_pin_cmp(b->c, w->journal, journal_ref)) {
 			atomic_dec_bug(w->journal);
 			w->journal = NULL;
 		}
 
 		if (!w->journal) {
-			w->journal = op->journal;
+			w->journal = journal_ref;
 			atomic_inc(w->journal);
 		}
 	}
@@ -1663,13 +1663,16 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
 	return 0;
 }
 
-int bch_btree_check(struct cache_set *c, struct btree_op *op)
+int bch_btree_check(struct cache_set *c)
 {
 	int ret = -ENOMEM;
 	unsigned i;
 	unsigned long *seen[MAX_CACHES_PER_SET];
+	struct btree_op op;
 
 	memset(seen, 0, sizeof(seen));
+	bch_btree_op_init_stack(&op);
+	op.lock = SHRT_MAX;
 
 	for (i = 0; c->cache[i]; i++) {
 		size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8);
@@ -1681,7 +1684,7 @@ int bch_btree_check(struct cache_set *c, struct btree_op *op)
 		memset(seen[i], 0xFF, n);
 	}
 
-	ret = btree_root(check_recurse, c, op, seen);
+	ret = btree_root(check_recurse, c, &op, seen);
 err:
 	for (i = 0; i < MAX_CACHES_PER_SET; i++)
 		kfree(seen[i]);
@@ -2091,7 +2094,8 @@ err:
 }
 
 static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
-				 struct keylist *insert_keys)
+				 struct keylist *insert_keys,
+				 atomic_t *journal_ref)
 {
 	int ret = 0;
 	struct keylist split_keys;
@@ -2123,7 +2127,7 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
 
 			if (bch_btree_insert_keys(b, op, insert_keys)) {
 				if (!b->level)
-					bch_btree_leaf_dirty(b, op);
+					bch_btree_leaf_dirty(b, journal_ref);
 				else
 					bch_btree_node_write(b, &op->cl);
 			}
@@ -2162,7 +2166,7 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
 
 	BUG_ON(op->type != BTREE_INSERT);
 
-	ret = bch_btree_insert_node(b, op, &insert);
+	ret = bch_btree_insert_node(b, op, &insert, NULL);
 
 	BUG_ON(!ret && !bch_keylist_empty(&insert));
 out:
@@ -2172,7 +2176,7 @@ out:
 }
 
 static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
-				    struct keylist *keys)
+				    struct keylist *keys, atomic_t *journal_ref)
 {
 	if (bch_keylist_empty(keys))
 		return 0;
@@ -2189,14 +2193,14 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
 			return -EIO;
 		}
 
-		return btree(insert_recurse, k, b, op, keys);
+		return btree(insert_recurse, k, b, op, keys, journal_ref);
 	} else {
-		return bch_btree_insert_node(b, op, keys);
+		return bch_btree_insert_node(b, op, keys, journal_ref);
 	}
 }
 
 int bch_btree_insert(struct btree_op *op, struct cache_set *c,
-		     struct keylist *keys)
+		     struct keylist *keys, atomic_t *journal_ref)
 {
 	int ret = 0;
 
@@ -2210,7 +2214,7 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c,
 
 	while (!bch_keylist_empty(keys)) {
 		op->lock = 0;
-		ret = btree_root(insert_recurse, c, op, keys);
+		ret = btree_root(insert_recurse, c, op, keys, journal_ref);
 
 		if (ret == -EAGAIN) {
 			ret = 0;
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 60dadd722ace..3f820b67150c 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -238,17 +238,6 @@ void __bkey_put(struct cache_set *c, struct bkey *k);
 
 struct btree_op {
 	struct closure		cl;
-	struct cache_set	*c;
-
-	/* Journal entry we have a refcount on */
-	atomic_t		*journal;
-
-	/* Bio to be inserted into the cache */
-	struct bio		*cache_bio;
-
-	unsigned		inode;
-
-	uint16_t		write_prio;
 
 	/* Btree level at which we start taking write locks */
 	short			lock;
@@ -259,11 +248,6 @@ struct btree_op {
 		BTREE_REPLACE
 	} type:8;
 
-	unsigned		csum:1;
-	unsigned		bypass:1;
-	unsigned		flush_journal:1;
-
-	unsigned		insert_data_done:1;
 	unsigned		insert_collision:1;
 
 	BKEY_PADDED(replace);
@@ -303,12 +287,13 @@ struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool);
 
 int bch_btree_insert_check_key(struct btree *, struct btree_op *,
 			       struct bkey *);
-int bch_btree_insert(struct btree_op *, struct cache_set *, struct keylist *);
+int bch_btree_insert(struct btree_op *, struct cache_set *,
+		     struct keylist *, atomic_t *);
 
 int bch_gc_thread_start(struct cache_set *);
 size_t bch_btree_gc_finish(struct cache_set *);
 void bch_moving_gc(struct cache_set *);
-int bch_btree_check(struct cache_set *, struct btree_op *);
+int bch_btree_check(struct cache_set *);
 uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *);
 
 static inline void wake_up_gc(struct cache_set *c)
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 6f4daf031410..725c8eb9a62a 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -30,17 +30,20 @@ static void journal_read_endio(struct bio *bio, int error)
 }
 
 static int journal_read_bucket(struct cache *ca, struct list_head *list,
-			       struct btree_op *op, unsigned bucket_index)
+			       unsigned bucket_index)
 {
 	struct journal_device *ja = &ca->journal;
 	struct bio *bio = &ja->bio;
 
 	struct journal_replay *i;
 	struct jset *j, *data = ca->set->journal.w[0].data;
+	struct closure cl;
 	unsigned len, left, offset = 0;
 	int ret = 0;
 	sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);
 
+	closure_init_stack(&cl);
+
 	pr_debug("reading %llu", (uint64_t) bucket);
 
 	while (offset < ca->sb.bucket_size) {
@@ -54,11 +57,11 @@ reread:		left = ca->sb.bucket_size - offset;
 		bio->bi_size	= len << 9;
 
 		bio->bi_end_io	= journal_read_endio;
-		bio->bi_private = &op->cl;
+		bio->bi_private = &cl;
 		bch_bio_map(bio, data);
 
-		closure_bio_submit(bio, &op->cl, ca);
-		closure_sync(&op->cl);
+		closure_bio_submit(bio, &cl, ca);
+		closure_sync(&cl);
 
 		/* This function could be simpler now since we no longer write
 		 * journal entries that overlap bucket boundaries; this means
@@ -128,12 +131,11 @@ next_set:
 	return ret;
 }
 
-int bch_journal_read(struct cache_set *c, struct list_head *list,
-			struct btree_op *op)
+int bch_journal_read(struct cache_set *c, struct list_head *list)
 {
 #define read_bucket(b)							\
 	({								\
-		int ret = journal_read_bucket(ca, list, op, b);		\
+		int ret = journal_read_bucket(ca, list, b);		\
 		__set_bit(b, bitmap);					\
 		if (ret < 0)						\
 			return ret;					\
@@ -291,8 +293,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
 	}
 }
 
-int bch_journal_replay(struct cache_set *s, struct list_head *list,
-			  struct btree_op *op)
+int bch_journal_replay(struct cache_set *s, struct list_head *list)
 {
 	int ret = 0, keys = 0, entries = 0;
 	struct bkey *k;
@@ -301,8 +302,11 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list,
 
 	uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
 	struct keylist keylist;
+	struct btree_op op;
 
 	bch_keylist_init(&keylist);
+	bch_btree_op_init_stack(&op);
+	op.lock = SHRT_MAX;
 
 	list_for_each_entry(i, list, list) {
 		BUG_ON(i->pin && atomic_read(i->pin) != 1);
@@ -319,9 +323,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list,
 			bkey_copy(keylist.top, k);
 			bch_keylist_push(&keylist);
 
-			op->journal = i->pin;
-
-			ret = bch_btree_insert(op, s, &keylist);
+			ret = bch_btree_insert(&op, s, &keylist, i->pin);
 			if (ret)
 				goto err;
 
@@ -346,7 +348,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list,
 		kfree(i);
 	}
 err:
-	closure_sync(&op->cl);
+	closure_sync(&op.cl);
 	return ret;
 }
 
@@ -368,8 +370,8 @@ retry:
 			if (!best)
 				best = b;
 			else if (journal_pin_cmp(c,
-						 btree_current_write(best),
-						 btree_current_write(b))) {
+					btree_current_write(best)->journal,
+					btree_current_write(b)->journal)) {
 				best = b;
 			}
 		}
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index 7045e6fd2d5a..5e9edb9ef376 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -189,8 +189,7 @@ struct journal_device {
 };
 
 #define journal_pin_cmp(c, l, r)				\
-	(fifo_idx(&(c)->journal.pin, (l)->journal) >		\
-	 fifo_idx(&(c)->journal.pin, (r)->journal))
+	(fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r)))
 
 #define JOURNAL_PIN	20000
 
@@ -206,10 +205,8 @@ atomic_t *bch_journal(struct cache_set *, struct keylist *, struct closure *);
 void bch_journal_next(struct journal *);
 void bch_journal_mark(struct cache_set *, struct list_head *);
 void bch_journal_meta(struct cache_set *, struct closure *);
-int bch_journal_read(struct cache_set *, struct list_head *,
-			struct btree_op *);
-int bch_journal_replay(struct cache_set *, struct list_head *,
-			  struct btree_op *);
+int bch_journal_read(struct cache_set *, struct list_head *);
+int bch_journal_replay(struct cache_set *, struct list_head *);
 
 void bch_journal_free(struct cache_set *);
 int bch_journal_alloc(struct cache_set *);
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 6ba050456ec8..80e30d77221e 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -55,9 +55,9 @@ static void write_moving_finish(struct closure *cl)
 	if (io->s.op.insert_collision)
 		trace_bcache_gc_copy_collision(&io->w->key);
 
-	bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w);
+	bch_keybuf_del(&io->s.c->moving_gc_keys, io->w);
 
-	up(&io->s.op.c->moving_in_flight);
+	up(&io->s.c->moving_in_flight);
 
 	closure_return_with_destructor(cl, moving_io_destructor);
 }
@@ -70,7 +70,7 @@ static void read_moving_endio(struct bio *bio, int error)
 	if (error)
 		io->s.error = error;
 
-	bch_bbio_endio(io->s.op.c, bio, error, "reading data to move");
+	bch_bbio_endio(io->s.c, bio, error, "reading data to move");
 }
 
 static void moving_init(struct moving_io *io)
@@ -99,11 +99,11 @@ static void write_moving(struct closure *cl)
 
 		io->bio.bio.bi_sector	= KEY_START(&io->w->key);
 		s->op.lock		= -1;
-		s->op.write_prio	= 1;
-		s->op.cache_bio		= &io->bio.bio;
+		s->write_prio		= 1;
+		s->cache_bio		= &io->bio.bio;
 
 		s->writeback		= KEY_DIRTY(&io->w->key);
-		s->op.csum		= KEY_CSUM(&io->w->key);
+		s->csum			= KEY_CSUM(&io->w->key);
 
 		s->op.type = BTREE_REPLACE;
 		bkey_copy(&s->op.replace, &io->w->key);
@@ -121,7 +121,7 @@ static void read_moving_submit(struct closure *cl)
 	struct moving_io *io = container_of(s, struct moving_io, s);
 	struct bio *bio = &io->bio.bio;
 
-	bch_submit_bbio(bio, s->op.c, &io->w->key, 0);
+	bch_submit_bbio(bio, s->c, &io->w->key, 0);
 
 	continue_at(cl, write_moving, system_wq);
 }
@@ -151,8 +151,8 @@ static void read_moving(struct cache_set *c)
 
 		w->private	= io;
 		io->w		= w;
-		io->s.op.inode	= KEY_INODE(&w->key);
-		io->s.op.c	= c;
+		io->s.inode	= KEY_INODE(&w->key);
+		io->s.c		= c;
 
 		moving_init(io);
 		bio = &io->bio.bio;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index de3fc76ffcfc..818e2e39e71f 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -217,6 +217,7 @@ static void bch_data_insert_keys(struct closure *cl)
 {
 	struct btree_op *op = container_of(cl, struct btree_op, cl);
 	struct search *s = container_of(op, struct search, op);
+	atomic_t *journal_ref = NULL;
 
 	/*
 	 * If we're looping, might already be waiting on
@@ -231,20 +232,19 @@ static void bch_data_insert_keys(struct closure *cl)
 #endif
 
 	if (s->write)
-		op->journal = bch_journal(op->c, &s->insert_keys,
-					  op->flush_journal
+		journal_ref = bch_journal(s->c, &s->insert_keys,
+					  s->flush_journal
 					  ? &s->cl : NULL);
 
-	if (bch_btree_insert(op, op->c, &s->insert_keys)) {
+	if (bch_btree_insert(op, s->c, &s->insert_keys, journal_ref)) {
 		s->error		= -ENOMEM;
-		op->insert_data_done	= true;
+		s->insert_data_done	= true;
 	}
 
-	if (op->journal)
-		atomic_dec_bug(op->journal);
-	op->journal = NULL;
+	if (journal_ref)
+		atomic_dec_bug(journal_ref);
 
-	if (!op->insert_data_done)
+	if (!s->insert_data_done)
 		continue_at(cl, bch_data_insert_start, bcache_wq);
 
 	bch_keylist_free(&s->insert_keys);
@@ -347,7 +347,7 @@ found:
 static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
 			      struct search *s)
 {
-	struct cache_set *c = s->op.c;
+	struct cache_set *c = s->c;
 	struct open_bucket *b;
 	BKEY_PADDED(key) alloc;
 	unsigned i;
@@ -363,7 +363,7 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
 	spin_lock(&c->data_bucket_lock);
 
 	while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) {
-		unsigned watermark = s->op.write_prio
+		unsigned watermark = s->write_prio
 			? WATERMARK_MOVINGGC
 			: WATERMARK_NONE;
 
@@ -435,7 +435,7 @@ static void bch_data_invalidate(struct closure *cl)
 {
 	struct btree_op *op = container_of(cl, struct btree_op, cl);
 	struct search *s = container_of(op, struct search, op);
-	struct bio *bio = op->cache_bio;
+	struct bio *bio = s->cache_bio;
 
 	pr_debug("invalidating %i sectors from %llu",
 		 bio_sectors(bio), (uint64_t) bio->bi_sector);
@@ -443,17 +443,17 @@ static void bch_data_invalidate(struct closure *cl)
 	while (bio_sectors(bio)) {
 		unsigned len = min(bio_sectors(bio), 1U << 14);
 
-		if (bch_keylist_realloc(&s->insert_keys, 0, op->c))
+		if (bch_keylist_realloc(&s->insert_keys, 0, s->c))
 			goto out;
 
 		bio->bi_sector	+= len;
 		bio->bi_size	-= len << 9;
 
 		bch_keylist_add(&s->insert_keys,
-				&KEY(op->inode, bio->bi_sector, len));
+				&KEY(s->inode, bio->bi_sector, len));
 	}
 
-	op->insert_data_done = true;
+	s->insert_data_done = true;
 	bio_put(bio);
 out:
 	continue_at(cl, bch_data_insert_keys, bcache_wq);
@@ -506,21 +506,21 @@ static void bch_data_insert_endio(struct bio *bio, int error)
 			set_closure_fn(cl, NULL, NULL);
 	}
 
-	bch_bbio_endio(op->c, bio, error, "writing data to cache");
+	bch_bbio_endio(s->c, bio, error, "writing data to cache");
 }
 
 static void bch_data_insert_start(struct closure *cl)
 {
 	struct btree_op *op = container_of(cl, struct btree_op, cl);
 	struct search *s = container_of(op, struct search, op);
-	struct bio *bio = op->cache_bio, *n;
+	struct bio *bio = s->cache_bio, *n;
 
-	if (op->bypass)
+	if (s->bypass)
 		return bch_data_invalidate(cl);
 
-	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
-		set_gc_sectors(op->c);
-		wake_up_gc(op->c);
+	if (atomic_sub_return(bio_sectors(bio), &s->c->sectors_to_gc) < 0) {
+		set_gc_sectors(s->c);
+		wake_up_gc(s->c);
 	}
 
 	/*
@@ -533,17 +533,17 @@ static void bch_data_insert_start(struct closure *cl)
 		unsigned i;
 		struct bkey *k;
 		struct bio_set *split = s->d
-			? s->d->bio_split : op->c->bio_split;
+			? s->d->bio_split : s->c->bio_split;
 
 		/* 1 for the device pointer and 1 for the chksum */
 		if (bch_keylist_realloc(&s->insert_keys,
-					1 + (op->csum ? 1 : 0),
-					op->c))
+					1 + (s->csum ? 1 : 0),
+					s->c))
 			continue_at(cl, bch_data_insert_keys, bcache_wq);
 
 		k = s->insert_keys.top;
 		bkey_init(k);
-		SET_KEY_INODE(k, op->inode);
+		SET_KEY_INODE(k, s->inode);
 		SET_KEY_OFFSET(k, bio->bi_sector);
 
 		if (!bch_alloc_sectors(k, bio_sectors(bio), s))
@@ -558,11 +558,11 @@ static void bch_data_insert_start(struct closure *cl)
 			SET_KEY_DIRTY(k, true);
 
 			for (i = 0; i < KEY_PTRS(k); i++)
-				SET_GC_MARK(PTR_BUCKET(op->c, k, i),
+				SET_GC_MARK(PTR_BUCKET(s->c, k, i),
 					    GC_MARK_DIRTY);
 		}
 
-		SET_KEY_CSUM(k, op->csum);
+		SET_KEY_CSUM(k, s->csum);
 		if (KEY_CSUM(k))
 			bio_csum(n, k);
 
@@ -570,10 +570,10 @@ static void bch_data_insert_start(struct closure *cl)
 		bch_keylist_push(&s->insert_keys);
 
 		n->bi_rw |= REQ_WRITE;
-		bch_submit_bbio(n, op->c, k, 0);
+		bch_submit_bbio(n, s->c, k, 0);
 	} while (n != bio);
 
-	op->insert_data_done = true;
+	s->insert_data_done = true;
 	continue_at(cl, bch_data_insert_keys, bcache_wq);
 err:
 	/* bch_alloc_sectors() blocks if s->writeback = true */
@@ -592,14 +592,14 @@ err:
 		 * we wait for buckets to be freed up, so just invalidate the
 		 * rest of the write.
 		 */
-		op->bypass = true;
+		s->bypass = true;
 		return bch_data_invalidate(cl);
 	} else {
 		/*
 		 * From a cache miss, we can just insert the keys for the data
 		 * we have written or bail out if we didn't do anything.
 		 */
-		op->insert_data_done = true;
+		s->insert_data_done = true;
 		bio_put(bio);
 
 		if (!bch_keylist_empty(&s->insert_keys))
@@ -622,11 +622,11 @@ err:
  * data is written it calls bch_journal, and after the keys have been added to
  * the next journal write they're inserted into the btree.
  *
- * It inserts the data in op->cache_bio; bi_sector is used for the key offset,
+ * It inserts the data in s->cache_bio; bi_sector is used for the key offset,
  * and op->inode is used for the key inode.
  *
- * If op->bypass is true, instead of inserting the data it invalidates the
- * region of the cache represented by op->cache_bio and op->inode.
+ * If s->bypass is true, instead of inserting the data it invalidates the
+ * region of the cache represented by s->cache_bio and op->inode.
  */
 void bch_data_insert(struct closure *cl)
 {
@@ -634,7 +634,7 @@ void bch_data_insert(struct closure *cl)
 	struct search *s = container_of(op, struct search, op);
 
 	bch_keylist_init(&s->insert_keys);
-	bio_get(op->cache_bio);
+	bio_get(s->cache_bio);
 	bch_data_insert_start(cl);
 }
 
@@ -655,12 +655,12 @@ static void bch_cache_read_endio(struct bio *bio, int error)
 
 	if (error)
 		s->error = error;
-	else if (ptr_stale(s->op.c, &b->key, 0)) {
-		atomic_long_inc(&s->op.c->cache_read_races);
+	else if (ptr_stale(s->c, &b->key, 0)) {
+		atomic_long_inc(&s->c->cache_read_races);
 		s->error = -EINTR;
 	}
 
-	bch_bbio_endio(s->op.c, bio, error, "reading from cache");
+	bch_bbio_endio(s->c, bio, error, "reading from cache");
 }
 
 /*
@@ -674,13 +674,13 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
 	struct bkey *bio_key;
 	unsigned ptr;
 
-	if (bkey_cmp(k, &KEY(op->inode, bio->bi_sector, 0)) <= 0)
+	if (bkey_cmp(k, &KEY(s->inode, bio->bi_sector, 0)) <= 0)
 		return MAP_CONTINUE;
 
-	if (KEY_INODE(k) != s->op.inode ||
+	if (KEY_INODE(k) != s->inode ||
 	    KEY_START(k) > bio->bi_sector) {
 		unsigned bio_sectors = bio_sectors(bio);
-		unsigned sectors = KEY_INODE(k) == s->op.inode
+		unsigned sectors = KEY_INODE(k) == s->inode
 			? min_t(uint64_t, INT_MAX,
 				KEY_START(k) - bio->bi_sector)
 			: INT_MAX;
@@ -708,8 +708,8 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
 	bio_key = &container_of(n, struct bbio, bio)->key;
 	bch_bkey_copy_single_ptr(bio_key, k, ptr);
 
-	bch_cut_front(&KEY(s->op.inode, n->bi_sector, 0), bio_key);
-	bch_cut_back(&KEY(s->op.inode, bio_end_sector(n), 0), bio_key);
+	bch_cut_front(&KEY(s->inode, n->bi_sector, 0), bio_key);
+	bch_cut_back(&KEY(s->inode, bio_end_sector(n), 0), bio_key);
 
 	n->bi_end_io	= bch_cache_read_endio;
 	n->bi_private	= &s->cl;
@@ -735,8 +735,8 @@ static void cache_lookup(struct closure *cl)
 	struct search *s = container_of(op, struct search, op);
 	struct bio *bio = &s->bio.bio;
 
-	int ret = bch_btree_map_keys(op, op->c,
-				     &KEY(op->inode, bio->bi_sector, 0),
+	int ret = bch_btree_map_keys(op, s->c,
+				     &KEY(s->inode, bio->bi_sector, 0),
 				     cache_lookup_fn, MAP_END_KEY);
 	if (ret == -EAGAIN)
 		continue_at(cl, cache_lookup, bcache_wq);
@@ -793,8 +793,8 @@ static void search_free(struct closure *cl)
 	struct search *s = container_of(cl, struct search, cl);
 	bio_complete(s);
 
-	if (s->op.cache_bio)
-		bio_put(s->op.cache_bio);
+	if (s->cache_bio)
+		bio_put(s->cache_bio);
 
 	if (s->unaligned_bvec)
 		mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec);
@@ -813,14 +813,14 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
 
 	__closure_init(&s->cl, NULL);
 
-	s->op.inode		= d->id;
-	s->op.c			= d->c;
+	s->inode		= d->id;
+	s->c			= d->c;
 	s->d			= d;
 	s->op.lock		= -1;
 	s->task			= current;
 	s->orig_bio		= bio;
 	s->write		= (bio->bi_rw & REQ_WRITE) != 0;
-	s->op.flush_journal	= (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
+	s->flush_journal	= (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
 	s->recoverable		= 1;
 	s->start_time		= jiffies;
 	do_bio_hook(s);
@@ -891,7 +891,7 @@ static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
 
 static bool check_should_bypass(struct cached_dev *dc, struct search *s)
 {
-	struct cache_set *c = s->op.c;
+	struct cache_set *c = s->c;
 	struct bio *bio = &s->bio.bio;
 	unsigned mode = cache_mode(dc, bio);
 	unsigned sectors, congested = bch_get_congested(c);
@@ -985,11 +985,11 @@ static void cached_dev_cache_miss_done(struct closure *cl)
 	if (s->op.insert_collision)
 		bch_mark_cache_miss_collision(s);
 
-	if (s->op.cache_bio) {
+	if (s->cache_bio) {
 		int i;
 		struct bio_vec *bv;
 
-		__bio_for_each_segment(bv, s->op.cache_bio, i, 0)
+		bio_for_each_segment_all(bv, s->cache_bio, i)
 			__free_page(bv->bv_page);
 	}
 
@@ -1042,14 +1042,15 @@ static void cached_dev_read_done(struct closure *cl)
 	 * to the buffers the original bio pointed to:
 	 */
 
-	if (s->op.cache_bio) {
-		bio_reset(s->op.cache_bio);
-		s->op.cache_bio->bi_sector	= s->cache_miss->bi_sector;
-		s->op.cache_bio->bi_bdev	= s->cache_miss->bi_bdev;
-		s->op.cache_bio->bi_size	= s->cache_bio_sectors << 9;
-		bch_bio_map(s->op.cache_bio, NULL);
+	if (s->cache_bio) {
+		bio_reset(s->cache_bio);
+		s->cache_bio->bi_sector =
+			s->cache_miss->bi_sector;
+		s->cache_bio->bi_bdev = s->cache_miss->bi_bdev;
+		s->cache_bio->bi_size = s->cache_bio_sectors << 9;
+		bch_bio_map(s->cache_bio, NULL);
 
-		bio_copy_data(s->cache_miss, s->op.cache_bio);
+		bio_copy_data(s->cache_miss, s->cache_bio);
 
 		bio_put(s->cache_miss);
 		s->cache_miss = NULL;
@@ -1060,8 +1061,8 @@ static void cached_dev_read_done(struct closure *cl)
 
 	bio_complete(s);
 
-	if (s->op.cache_bio &&
-	    !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) {
+	if (s->cache_bio &&
+	    !test_bit(CACHE_SET_STOPPING, &s->c->flags)) {
 		s->op.type = BTREE_REPLACE;
 		closure_call(&s->op.cl, bch_data_insert, NULL, cl);
 	}
@@ -1074,12 +1075,12 @@ static void cached_dev_read_done_bh(struct closure *cl)
 	struct search *s = container_of(cl, struct search, cl);
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 
-	bch_mark_cache_accounting(s, !s->cache_miss, s->op.bypass);
-	trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.bypass);
+	bch_mark_cache_accounting(s, !s->cache_miss, s->bypass);
+	trace_bcache_read(s->orig_bio, !s->cache_miss, s->bypass);
 
 	if (s->error)
 		continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq);
-	else if (s->op.cache_bio || verify(dc, &s->bio.bio))
+	else if (s->cache_bio || verify(dc, &s->bio.bio))
 		continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq);
 	else
 		continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
@@ -1093,7 +1094,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 	struct bio *miss, *cache_bio;
 
-	if (s->cache_miss || s->op.bypass) {
+	if (s->cache_miss || s->bypass) {
 		miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
 		ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
 		goto out_submit;
@@ -1101,13 +1102,13 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 
 	if (!(bio->bi_rw & REQ_RAHEAD) &&
 	    !(bio->bi_rw & REQ_META) &&
-	    s->op.c->gc_stats.in_use < CUTOFF_CACHE_READA)
+	    s->c->gc_stats.in_use < CUTOFF_CACHE_READA)
 		reada = min_t(sector_t, dc->readahead >> 9,
 			      bdev_sectors(bio->bi_bdev) - bio_end_sector(bio));
 
 	s->cache_bio_sectors = min(sectors, bio_sectors(bio) + reada);
 
-	s->op.replace = KEY(s->op.inode, bio->bi_sector +
+	s->op.replace = KEY(s->inode, bio->bi_sector +
 			    s->cache_bio_sectors, s->cache_bio_sectors);
 
 	ret = bch_btree_insert_check_key(b, &s->op, &s->op.replace);
@@ -1137,7 +1138,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 		goto out_put;
 
 	s->cache_miss	= miss;
-	s->op.cache_bio = cache_bio;
+	s->cache_bio = cache_bio;
 	bio_get(cache_bio);
 	closure_bio_submit(cache_bio, &s->cl, s->d);
 
@@ -1177,7 +1178,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
 	struct bkey start = KEY(dc->disk.id, bio->bi_sector, 0);
 	struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0);
 
-	bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end);
+	bch_keybuf_check_overlapping(&s->c->moving_gc_keys, &start, &end);
 
 	down_read_non_owner(&dc->writeback_lock);
 	if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) {
@@ -1185,7 +1186,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
 		 * We overlap with some dirty data undergoing background
 		 * writeback, force this write to writeback
 		 */
-		s->op.bypass	= false;
+		s->bypass	= false;
 		s->writeback	= true;
 	}
 
@@ -1197,27 +1198,27 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
 	 * so we still want to call it.
 	 */
 	if (bio->bi_rw & REQ_DISCARD)
-		s->op.bypass = true;
+		s->bypass = true;
 
 	if (should_writeback(dc, s->orig_bio,
 			     cache_mode(dc, bio),
-			     s->op.bypass)) {
-		s->op.bypass = false;
+			     s->bypass)) {
+		s->bypass = false;
 		s->writeback = true;
 	}
 
-	trace_bcache_write(s->orig_bio, s->writeback, s->op.bypass);
+	trace_bcache_write(s->orig_bio, s->writeback, s->bypass);
 
-	if (s->op.bypass) {
-		s->op.cache_bio = s->orig_bio;
-		bio_get(s->op.cache_bio);
+	if (s->bypass) {
+		s->cache_bio = s->orig_bio;
+		bio_get(s->cache_bio);
 
 		if (!(bio->bi_rw & REQ_DISCARD) ||
 		    blk_queue_discard(bdev_get_queue(dc->bdev)))
 			closure_bio_submit(bio, cl, s->d);
 	} else if (s->writeback) {
 		bch_writeback_add(dc);
-		s->op.cache_bio = bio;
+		s->cache_bio = bio;
 
 		if (bio->bi_rw & REQ_FLUSH) {
 			/* Also need to send a flush to the backing device */
@@ -1232,8 +1233,8 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
 			closure_bio_submit(flush, cl, s->d);
 		}
 	} else {
-		s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
-						   dc->disk.bio_split);
+		s->cache_bio = bio_clone_bioset(bio, GFP_NOIO,
+						dc->disk.bio_split);
 
 		closure_bio_submit(bio, cl, s->d);
 	}
@@ -1247,8 +1248,8 @@ static void cached_dev_nodata(struct closure *cl)
 	struct search *s = container_of(cl, struct search, cl);
 	struct bio *bio = &s->bio.bio;
 
-	if (s->op.flush_journal)
-		bch_journal_meta(s->op.c, cl);
+	if (s->flush_journal)
+		bch_journal_meta(s->c, cl);
 
 	/* If it's a flush, we send the flush to the backing device too */
 	closure_bio_submit(bio, cl, s->d);
@@ -1286,7 +1287,7 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
 					      cached_dev_nodata,
 					      bcache_wq);
 		} else {
-			s->op.bypass = check_should_bypass(dc, s);
+			s->bypass = check_should_bypass(dc, s);
 
 			if (rw)
 				cached_dev_write(dc, s);
@@ -1376,8 +1377,8 @@ static void flash_dev_nodata(struct closure *cl)
 {
 	struct search *s = container_of(cl, struct search, cl);
 
-	if (s->op.flush_journal)
-		bch_journal_meta(s->op.c, cl);
+	if (s->flush_journal)
+		bch_journal_meta(s->c, cl);
 
 	continue_at(cl, search_free, NULL);
 }
@@ -1409,13 +1410,13 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
 				      flash_dev_nodata,
 				      bcache_wq);
 	} else if (rw) {
-		bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys,
+		bch_keybuf_check_overlapping(&s->c->moving_gc_keys,
 					&KEY(d->id, bio->bi_sector, 0),
 					&KEY(d->id, bio_end_sector(bio), 0));
 
-		s->op.bypass	= (bio->bi_rw & REQ_DISCARD) != 0;
+		s->bypass	= (bio->bi_rw & REQ_DISCARD) != 0;
 		s->writeback	= true;
-		s->op.cache_bio	= bio;
+		s->cache_bio	= bio;
 
 		closure_call(&s->op.cl, bch_data_insert, NULL, cl);
 	} else {
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index b0b4b0b5b7e9..0f79177c4f33 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -8,19 +8,33 @@ struct search {
 	struct closure		cl;
 
 	struct bcache_device	*d;
+	struct cache_set	*c;
 	struct task_struct	*task;
 
 	struct bbio		bio;
 	struct bio		*orig_bio;
 	struct bio		*cache_miss;
+
+	/* Bio to be inserted into the cache */
+	struct bio		*cache_bio;
 	unsigned		cache_bio_sectors;
 
+	unsigned		inode;
+
 	unsigned		recoverable:1;
 	unsigned		unaligned_bvec:1;
 
 	unsigned		write:1;
 	unsigned		writeback:1;
 
+	unsigned		csum:1;
+	unsigned		bypass:1;
+	unsigned		flush_journal:1;
+
+	unsigned		insert_data_done:1;
+
+	uint16_t		write_prio;
+
 	/* IO error returned to s->bio */
 	short			error;
 	unsigned long		start_time;
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
index b8730e714d69..ea77263cf7ef 100644
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@@ -200,7 +200,7 @@ void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass)
 {
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 	mark_cache_stats(&dc->accounting.collector, hit, bypass);
-	mark_cache_stats(&s->op.c->accounting.collector, hit, bypass);
+	mark_cache_stats(&s->c->accounting.collector, hit, bypass);
 #ifdef CONFIG_CGROUP_BCACHE
 	mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass);
 #endif
@@ -210,21 +210,21 @@ void bch_mark_cache_readahead(struct search *s)
 {
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 	atomic_inc(&dc->accounting.collector.cache_readaheads);
-	atomic_inc(&s->op.c->accounting.collector.cache_readaheads);
+	atomic_inc(&s->c->accounting.collector.cache_readaheads);
 }
 
 void bch_mark_cache_miss_collision(struct search *s)
 {
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 	atomic_inc(&dc->accounting.collector.cache_miss_collisions);
-	atomic_inc(&s->op.c->accounting.collector.cache_miss_collisions);
+	atomic_inc(&s->c->accounting.collector.cache_miss_collisions);
 }
 
 void bch_mark_sectors_bypassed(struct search *s, int sectors)
 {
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 	atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
-	atomic_add(sectors, &s->op.c->accounting.collector.sectors_bypassed);
+	atomic_add(sectors, &s->c->accounting.collector.sectors_bypassed);
 }
 
 void bch_cache_accounting_init(struct cache_accounting *acc,
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index b79dd5a6679e..a314c771263f 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1493,11 +1493,10 @@ static void run_cache_set(struct cache_set *c)
 	const char *err = "cannot allocate memory";
 	struct cached_dev *dc, *t;
 	struct cache *ca;
+	struct closure cl;
 	unsigned i;
 
-	struct btree_op op;
-	bch_btree_op_init_stack(&op);
-	op.lock = SHRT_MAX;
+	closure_init_stack(&cl);
 
 	for_each_cache(ca, c, i)
 		c->nbuckets += ca->sb.nbuckets;
@@ -1508,7 +1507,7 @@ static void run_cache_set(struct cache_set *c)
 		struct jset *j;
 
 		err = "cannot allocate memory for journal";
-		if (bch_journal_read(c, &journal, &op))
+		if (bch_journal_read(c, &journal))
 			goto err;
 
 		pr_debug("btree_journal_read() done");
@@ -1543,12 +1542,12 @@ static void run_cache_set(struct cache_set *c)
 		list_del_init(&c->root->list);
 		rw_unlock(true, c->root);
 
-		err = uuid_read(c, j, &op.cl);
+		err = uuid_read(c, j, &cl);
 		if (err)
 			goto err;
 
 		err = "error in recovery";
-		if (bch_btree_check(c, &op))
+		if (bch_btree_check(c))
 			goto err;
 
 		bch_journal_mark(c, &journal);
@@ -1580,7 +1579,7 @@ static void run_cache_set(struct cache_set *c)
 		if (j->version < BCACHE_JSET_VERSION_UUID)
 			__uuid_write(c);
 
-		bch_journal_replay(c, &journal, &op);
+		bch_journal_replay(c, &journal);
 	} else {
 		pr_notice("invalidating existing data");
 
@@ -1616,7 +1615,7 @@ static void run_cache_set(struct cache_set *c)
 			goto err;
 
 		bkey_copy_key(&c->root->key, &MAX_KEY);
-		bch_btree_node_write(c->root, &op.cl);
+		bch_btree_node_write(c->root, &cl);
 
 		bch_btree_set_root(c->root);
 		rw_unlock(true, c->root);
@@ -1629,14 +1628,14 @@ static void run_cache_set(struct cache_set *c)
 		SET_CACHE_SYNC(&c->sb, true);
 
 		bch_journal_next(&c->journal);
-		bch_journal_meta(c, &op.cl);
+		bch_journal_meta(c, &cl);
 	}
 
 	err = "error starting gc thread";
 	if (bch_gc_thread_start(c))
 		goto err;
 
-	closure_sync(&op.cl);
+	closure_sync(&cl);
 	c->sb.last_mount = get_seconds();
 	bcache_write_super(c);
 
@@ -1647,7 +1646,7 @@ static void run_cache_set(struct cache_set *c)
 
 	return;
 err:
-	closure_sync(&op.cl);
+	closure_sync(&cl);
 	/* XXX: test this, it's broken */
 	bch_cache_set_error(c, err);
 }
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index c68de9f12618..b58c2bc91e3f 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -155,7 +155,7 @@ static void write_dirty_finish(struct closure *cl)
 		for (i = 0; i < KEY_PTRS(&w->key); i++)
 			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
 
-		bch_btree_insert(&op, dc->disk.c, &keys);
+		bch_btree_insert(&op, dc->disk.c, &keys, NULL);
 		closure_sync(&op.cl);
 
 		if (op.insert_collision)
@@ -433,9 +433,16 @@ static int bch_writeback_thread(void *arg)
 
 /* Init */
 
-static int sectors_dirty_init_fn(struct btree_op *op, struct btree *b,
+struct sectors_dirty_init {
+	struct btree_op	op;
+	unsigned	inode;
+};
+
+static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
 				 struct bkey *k)
 {
+	struct sectors_dirty_init *op = container_of(_op,
+						struct sectors_dirty_init, op);
 	if (KEY_INODE(k) > op->inode)
 		return MAP_DONE;
 
@@ -448,12 +455,12 @@ static int sectors_dirty_init_fn(struct btree_op *op, struct btree *b,
 
 void bch_sectors_dirty_init(struct cached_dev *dc)
 {
-	struct btree_op op;
+	struct sectors_dirty_init op;
 
-	bch_btree_op_init_stack(&op);
+	bch_btree_op_init_stack(&op.op);
 	op.inode = dc->disk.id;
 
-	bch_btree_map_keys(&op, dc->disk.c, &KEY(op.inode, 0, 0),
+	bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
 			   sectors_dirty_init_fn, 0);
 }
 

From b54d6934da7857f87b092df9b77dc1f42818ba94 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 24 Jul 2013 18:04:18 -0700
Subject: [PATCH 68/94] bcache: Kill op->cl

This isn't used for waiting asynchronously anymore - so this is a fairly
trivial refactoring.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bset.c      |  2 +-
 drivers/md/bcache/btree.c     | 81 +++++++++++++++--------------------
 drivers/md/bcache/btree.h     |  8 ++--
 drivers/md/bcache/journal.c   |  8 ++--
 drivers/md/bcache/movinggc.c  |  4 +-
 drivers/md/bcache/request.c   | 35 ++++++---------
 drivers/md/bcache/request.h   |  1 +
 drivers/md/bcache/writeback.c |  5 +--
 8 files changed, 63 insertions(+), 81 deletions(-)

diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index fae5b7b3f5ab..f7b5525ddafa 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -1196,7 +1196,7 @@ int bch_bset_print_stats(struct cache_set *c, char *buf)
 	int ret;
 
 	memset(&t, 0, sizeof(struct bset_stats));
-	bch_btree_op_init_stack(&t.op);
+	bch_btree_op_init(&t.op, -1);
 
 	ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats);
 	if (ret < 0)
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 5cb59c313dc3..cb1a490f7f86 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -115,13 +115,6 @@ enum {
 
 static struct workqueue_struct *btree_io_wq;
 
-void bch_btree_op_init_stack(struct btree_op *op)
-{
-	memset(op, 0, sizeof(struct btree_op));
-	closure_init_stack(&op->cl);
-	op->lock = -1;
-}
-
 static inline bool should_split(struct btree *b)
 {
 	struct bset *i = write_block(b);
@@ -965,8 +958,7 @@ err:
  * bch_btree_node_get - find a btree node in the cache and lock it, reading it
  * in from disk if necessary.
  *
- * If IO is necessary, it uses the closure embedded in struct btree_op to wait;
- * if that closure is in non blocking mode, will return -EAGAIN.
+ * If IO is necessary and running under generic_make_request, returns -EAGAIN.
  *
  * The btree node will have either a read or a write lock held, depending on
  * level and op->lock.
@@ -1260,6 +1252,9 @@ static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc,
 {
 	unsigned nodes = 0, keys = 0, blocks;
 	int i;
+	struct closure cl;
+
+	closure_init_stack(&cl);
 
 	while (nodes < GC_MERGE_NODES && r[nodes].b)
 		keys += r[nodes++].keys;
@@ -1353,9 +1348,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 {
 	void write(struct btree *r)
 	{
-		if (!r->written)
-			bch_btree_node_write(r, &op->cl);
-		else if (btree_node_dirty(r))
+		if (!r->written || btree_node_dirty(r))
 			bch_btree_node_write(r, writes);
 
 		up_write(&r->lock);
@@ -1431,6 +1424,9 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
 	struct btree *n = NULL;
 	unsigned keys = 0;
 	int ret = 0, stale = btree_gc_mark_node(b, &keys, gc);
+	struct closure cl;
+
+	closure_init_stack(&cl);
 
 	if (b->level || stale > 10)
 		n = btree_node_alloc_replacement(b);
@@ -1442,11 +1438,11 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
 		ret = btree_gc_recurse(b, op, writes, gc);
 
 	if (!b->written || btree_node_dirty(b)) {
-		bch_btree_node_write(b, n ? &op->cl : NULL);
+		bch_btree_node_write(b, n ? &cl : NULL);
 	}
 
 	if (!IS_ERR_OR_NULL(n)) {
-		closure_sync(&op->cl);
+		closure_sync(&cl);
 		bch_btree_set_root(b);
 		btree_node_free(n);
 		rw_unlock(true, b);
@@ -1545,15 +1541,13 @@ static void bch_btree_gc(struct cache_set *c)
 
 	memset(&stats, 0, sizeof(struct gc_stat));
 	closure_init_stack(&writes);
-	bch_btree_op_init_stack(&op);
-	op.lock = SHRT_MAX;
+	bch_btree_op_init(&op, SHRT_MAX);
 
 	btree_gc_start(c);
 
 	atomic_inc(&c->prio_blocked);
 
 	ret = btree_root(gc_root, c, &op, &writes, &stats);
-	closure_sync(&op.cl);
 	closure_sync(&writes);
 
 	if (ret) {
@@ -1562,8 +1556,8 @@ static void bch_btree_gc(struct cache_set *c)
 	}
 
 	/* Possibly wait for new UUIDs or whatever to hit disk */
-	bch_journal_meta(c, &op.cl);
-	closure_sync(&op.cl);
+	bch_journal_meta(c, &writes);
+	closure_sync(&writes);
 
 	available = bch_btree_gc_finish(c);
 
@@ -1671,8 +1665,7 @@ int bch_btree_check(struct cache_set *c)
 	struct btree_op op;
 
 	memset(seen, 0, sizeof(seen));
-	bch_btree_op_init_stack(&op);
-	op.lock = SHRT_MAX;
+	bch_btree_op_init(&op, SHRT_MAX);
 
 	for (i = 0; c->cache[i]; i++) {
 		size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8);
@@ -1980,6 +1973,9 @@ static int btree_split(struct btree *b, struct btree_op *op,
 	bool split;
 	struct btree *n1, *n2 = NULL, *n3 = NULL;
 	uint64_t start_time = local_clock();
+	struct closure cl;
+
+	closure_init_stack(&cl);
 
 	n1 = btree_node_alloc_replacement(b);
 	if (IS_ERR(n1))
@@ -2025,7 +2021,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
 		bkey_copy_key(&n2->key, &b->key);
 
 		bch_keylist_add(parent_keys, &n2->key);
-		bch_btree_node_write(n2, &op->cl);
+		bch_btree_node_write(n2, &cl);
 		rw_unlock(true, n2);
 	} else {
 		trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
@@ -2034,23 +2030,23 @@ static int btree_split(struct btree *b, struct btree_op *op,
 	}
 
 	bch_keylist_add(parent_keys, &n1->key);
-	bch_btree_node_write(n1, &op->cl);
+	bch_btree_node_write(n1, &cl);
 
 	if (n3) {
 		/* Depth increases, make a new root */
 
 		bkey_copy_key(&n3->key, &MAX_KEY);
 		bch_btree_insert_keys(n3, op, parent_keys);
-		bch_btree_node_write(n3, &op->cl);
+		bch_btree_node_write(n3, &cl);
 
-		closure_sync(&op->cl);
+		closure_sync(&cl);
 		bch_btree_set_root(n3);
 		rw_unlock(true, n3);
 	} else if (!b->parent) {
 		/* Root filled up but didn't need to be split */
 
 		bch_keylist_reset(parent_keys);
-		closure_sync(&op->cl);
+		closure_sync(&cl);
 		bch_btree_set_root(n1);
 	} else {
 		unsigned i;
@@ -2065,7 +2061,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
 		}
 
 		bch_keylist_push(parent_keys);
-		closure_sync(&op->cl);
+		closure_sync(&cl);
 		atomic_inc(&b->c->prio_blocked);
 	}
 
@@ -2126,10 +2122,15 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
 			BUG_ON(write_block(b) != b->sets[b->nsets].data);
 
 			if (bch_btree_insert_keys(b, op, insert_keys)) {
-				if (!b->level)
+				if (!b->level) {
 					bch_btree_leaf_dirty(b, journal_ref);
-				else
-					bch_btree_node_write(b, &op->cl);
+				} else {
+					struct closure cl;
+
+					closure_init_stack(&cl);
+					bch_btree_node_write(b, &cl);
+					closure_sync(&cl);
+				}
 			}
 		}
 	} while (!bch_keylist_empty(&split_keys));
@@ -2204,12 +2205,6 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c,
 {
 	int ret = 0;
 
-	/*
-	 * Don't want to block with the btree locked unless we have to,
-	 * otherwise we get deadlocks with try_harder and between split/gc
-	 */
-	clear_closure_blocking(&op->cl);
-
 	BUG_ON(bch_keylist_empty(keys));
 
 	while (!bch_keylist_empty(keys)) {
@@ -2217,8 +2212,8 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c,
 		ret = btree_root(insert_recurse, c, op, keys, journal_ref);
 
 		if (ret == -EAGAIN) {
+			BUG();
 			ret = 0;
-			closure_sync(&op->cl);
 		} else if (ret) {
 			struct bkey *k;
 
@@ -2292,10 +2287,7 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
 int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
 			  struct bkey *from, btree_map_nodes_fn *fn, int flags)
 {
-	int ret = btree_root(map_nodes_recurse, c, op, from, fn, flags);
-	if (closure_blocking(&op->cl))
-		closure_sync(&op->cl);
-	return ret;
+	return btree_root(map_nodes_recurse, c, op, from, fn, flags);
 }
 
 static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
@@ -2328,10 +2320,7 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
 int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
 		       struct bkey *from, btree_map_keys_fn *fn, int flags)
 {
-	int ret = btree_root(map_keys_recurse, c, op, from, fn, flags);
-	if (closure_blocking(&op->cl))
-		closure_sync(&op->cl);
-	return ret;
+	return btree_root(map_keys_recurse, c, op, from, fn, flags);
 }
 
 /* Keybuf code */
@@ -2409,7 +2398,7 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
 
 	cond_resched();
 
-	bch_btree_op_init_stack(&refill.op);
+	bch_btree_op_init(&refill.op, -1);
 	refill.buf = buf;
 	refill.end = end;
 	refill.pred = pred;
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 3f820b67150c..34ee5359b262 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -237,8 +237,6 @@ void __bkey_put(struct cache_set *c, struct bkey *k);
 /* Recursing down the btree */
 
 struct btree_op {
-	struct closure		cl;
-
 	/* Btree level at which we start taking write locks */
 	short			lock;
 
@@ -253,7 +251,11 @@ struct btree_op {
 	BKEY_PADDED(replace);
 };
 
-void bch_btree_op_init_stack(struct btree_op *);
+static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
+{
+	memset(op, 0, sizeof(struct btree_op));
+	op->lock = write_lock_level;
+}
 
 static inline void rw_lock(bool w, struct btree *b, int level)
 {
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 725c8eb9a62a..20e900ad5010 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -305,8 +305,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
 	struct btree_op op;
 
 	bch_keylist_init(&keylist);
-	bch_btree_op_init_stack(&op);
-	op.lock = SHRT_MAX;
+	bch_btree_op_init(&op, SHRT_MAX);
 
 	list_for_each_entry(i, list, list) {
 		BUG_ON(i->pin && atomic_read(i->pin) != 1);
@@ -341,14 +340,13 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
 
 	pr_info("journal replay done, %i keys in %i entries, seq %llu",
 		keys, entries, end);
-
+err:
 	while (!list_empty(list)) {
 		i = list_first_entry(list, struct journal_replay, list);
 		list_del(&i->list);
 		kfree(i);
 	}
-err:
-	closure_sync(&op.cl);
+
 	return ret;
 }
 
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 80e30d77221e..219356f6159d 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -108,8 +108,8 @@ static void write_moving(struct closure *cl)
 		s->op.type = BTREE_REPLACE;
 		bkey_copy(&s->op.replace, &io->w->key);
 
-		closure_init(&s->op.cl, cl);
-		bch_data_insert(&s->op.cl);
+		closure_init(&s->btree, cl);
+		bch_data_insert(&s->btree);
 	}
 
 	continue_at(cl, write_moving_finish, system_wq);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 818e2e39e71f..5df44fbc9e1d 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -215,8 +215,7 @@ static void bio_csum(struct bio *bio, struct bkey *k)
 
 static void bch_data_insert_keys(struct closure *cl)
 {
-	struct btree_op *op = container_of(cl, struct btree_op, cl);
-	struct search *s = container_of(op, struct search, op);
+	struct search *s = container_of(cl, struct search, btree);
 	atomic_t *journal_ref = NULL;
 
 	/*
@@ -236,7 +235,7 @@ static void bch_data_insert_keys(struct closure *cl)
 					  s->flush_journal
 					  ? &s->cl : NULL);
 
-	if (bch_btree_insert(op, s->c, &s->insert_keys, journal_ref)) {
+	if (bch_btree_insert(&s->op, s->c, &s->insert_keys, journal_ref)) {
 		s->error		= -ENOMEM;
 		s->insert_data_done	= true;
 	}
@@ -433,8 +432,7 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
 
 static void bch_data_invalidate(struct closure *cl)
 {
-	struct btree_op *op = container_of(cl, struct btree_op, cl);
-	struct search *s = container_of(op, struct search, op);
+	struct search *s = container_of(cl, struct search, btree);
 	struct bio *bio = s->cache_bio;
 
 	pr_debug("invalidating %i sectors from %llu",
@@ -461,8 +459,7 @@ out:
 
 static void bch_data_insert_error(struct closure *cl)
 {
-	struct btree_op *op = container_of(cl, struct btree_op, cl);
-	struct search *s = container_of(op, struct search, op);
+	struct search *s = container_of(cl, struct search, btree);
 
 	/*
 	 * Our data write just errored, which means we've got a bunch of keys to
@@ -493,8 +490,7 @@ static void bch_data_insert_error(struct closure *cl)
 static void bch_data_insert_endio(struct bio *bio, int error)
 {
 	struct closure *cl = bio->bi_private;
-	struct btree_op *op = container_of(cl, struct btree_op, cl);
-	struct search *s = container_of(op, struct search, op);
+	struct search *s = container_of(cl, struct search, btree);
 
 	if (error) {
 		/* TODO: We could try to recover from this. */
@@ -511,8 +507,7 @@ static void bch_data_insert_endio(struct bio *bio, int error)
 
 static void bch_data_insert_start(struct closure *cl)
 {
-	struct btree_op *op = container_of(cl, struct btree_op, cl);
-	struct search *s = container_of(op, struct search, op);
+	struct search *s = container_of(cl, struct search, btree);
 	struct bio *bio = s->cache_bio, *n;
 
 	if (s->bypass)
@@ -630,8 +625,7 @@ err:
  */
 void bch_data_insert(struct closure *cl)
 {
-	struct btree_op *op = container_of(cl, struct btree_op, cl);
-	struct search *s = container_of(op, struct search, op);
+	struct search *s = container_of(cl, struct search, btree);
 
 	bch_keylist_init(&s->insert_keys);
 	bio_get(s->cache_bio);
@@ -731,11 +725,10 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
 
 static void cache_lookup(struct closure *cl)
 {
-	struct btree_op *op = container_of(cl, struct btree_op, cl);
-	struct search *s = container_of(op, struct search, op);
+	struct search *s = container_of(cl, struct search, btree);
 	struct bio *bio = &s->bio.bio;
 
-	int ret = bch_btree_map_keys(op, s->c,
+	int ret = bch_btree_map_keys(&s->op, s->c,
 				     &KEY(s->inode, bio->bi_sector, 0),
 				     cache_lookup_fn, MAP_END_KEY);
 	if (ret == -EAGAIN)
@@ -1064,7 +1057,7 @@ static void cached_dev_read_done(struct closure *cl)
 	if (s->cache_bio &&
 	    !test_bit(CACHE_SET_STOPPING, &s->c->flags)) {
 		s->op.type = BTREE_REPLACE;
-		closure_call(&s->op.cl, bch_data_insert, NULL, cl);
+		closure_call(&s->btree, bch_data_insert, NULL, cl);
 	}
 
 	continue_at(cl, cached_dev_cache_miss_done, NULL);
@@ -1156,7 +1149,7 @@ static void cached_dev_read(struct cached_dev *dc, struct search *s)
 {
 	struct closure *cl = &s->cl;
 
-	closure_call(&s->op.cl, cache_lookup, NULL, cl);
+	closure_call(&s->btree, cache_lookup, NULL, cl);
 	continue_at(cl, cached_dev_read_done_bh, NULL);
 }
 
@@ -1239,7 +1232,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
 		closure_bio_submit(bio, cl, s->d);
 	}
 
-	closure_call(&s->op.cl, bch_data_insert, NULL, cl);
+	closure_call(&s->btree, bch_data_insert, NULL, cl);
 	continue_at(cl, cached_dev_write_complete, NULL);
 }
 
@@ -1418,9 +1411,9 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
 		s->writeback	= true;
 		s->cache_bio	= bio;
 
-		closure_call(&s->op.cl, bch_data_insert, NULL, cl);
+		closure_call(&s->btree, bch_data_insert, NULL, cl);
 	} else {
-		closure_call(&s->op.cl, cache_lookup, NULL, cl);
+		closure_call(&s->btree, cache_lookup, NULL, cl);
 	}
 
 	continue_at(cl, search_free, NULL);
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 0f79177c4f33..ed578aa53ee2 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -6,6 +6,7 @@
 struct search {
 	/* Stack frame for bio_complete */
 	struct closure		cl;
+	struct closure		btree;
 
 	struct bcache_device	*d;
 	struct cache_set	*c;
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index b58c2bc91e3f..d0968e8938f7 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -143,7 +143,7 @@ static void write_dirty_finish(struct closure *cl)
 		struct btree_op op;
 		struct keylist keys;
 
-		bch_btree_op_init_stack(&op);
+		bch_btree_op_init(&op, -1);
 		bch_keylist_init(&keys);
 
 		op.type = BTREE_REPLACE;
@@ -156,7 +156,6 @@ static void write_dirty_finish(struct closure *cl)
 			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
 
 		bch_btree_insert(&op, dc->disk.c, &keys, NULL);
-		closure_sync(&op.cl);
 
 		if (op.insert_collision)
 			trace_bcache_writeback_collision(&w->key);
@@ -457,7 +456,7 @@ void bch_sectors_dirty_init(struct cached_dev *dc)
 {
 	struct sectors_dirty_init op;
 
-	bch_btree_op_init_stack(&op.op);
+	bch_btree_op_init(&op.op, -1);
 	op.inode = dc->disk.id;
 
 	bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),

From faadf0c96547ec8277ad0abd6959f2ef48522f31 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Fri, 1 Nov 2013 18:03:08 -0700
Subject: [PATCH 69/94] bcache: Drop some closure stuff

With a the recent bcache refactoring, some of the closure code isn't
needed anymore.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/btree.c   |   6 +-
 drivers/md/bcache/closure.c | 103 ++++----------------
 drivers/md/bcache/closure.h | 181 +++++-------------------------------
 3 files changed, 40 insertions(+), 250 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index cb1a490f7f86..3e0c90130c2e 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -407,7 +407,7 @@ static void do_btree_node_write(struct btree *b)
 	b->bio = bch_bbio_alloc(b->c);
 
 	b->bio->bi_end_io	= btree_node_write_endio;
-	b->bio->bi_private	= &b->io.cl;
+	b->bio->bi_private	= cl;
 	b->bio->bi_rw		= REQ_META|WRITE_SYNC|REQ_FUA;
 	b->bio->bi_size		= set_blocks(i, b->c) * block_bytes(b->c);
 	bch_bio_map(b->bio, i);
@@ -672,8 +672,8 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush)
 	}
 
 	/* wait for any in flight btree write */
-	closure_wait_event_sync(&b->io.wait, &cl,
-		atomic_read(&b->io.cl.remaining) == -1);
+	closure_wait_event(&b->io.wait, &cl,
+			   atomic_read(&b->io.cl.remaining) == -1);
 
 	return 0;
 }
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 9aba2017f0d1..dfff2410322e 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -11,17 +11,6 @@
 
 #include "closure.h"
 
-void closure_queue(struct closure *cl)
-{
-	struct workqueue_struct *wq = cl->wq;
-	if (wq) {
-		INIT_WORK(&cl->work, cl->work.func);
-		BUG_ON(!queue_work(wq, &cl->work));
-	} else
-		cl->fn(cl);
-}
-EXPORT_SYMBOL_GPL(closure_queue);
-
 #define CL_FIELD(type, field)					\
 	case TYPE_ ## type:					\
 	return &container_of(cl, struct type, cl)->field
@@ -30,17 +19,6 @@ static struct closure_waitlist *closure_waitlist(struct closure *cl)
 {
 	switch (cl->type) {
 		CL_FIELD(closure_with_waitlist, wait);
-		CL_FIELD(closure_with_waitlist_and_timer, wait);
-	default:
-		return NULL;
-	}
-}
-
-static struct timer_list *closure_timer(struct closure *cl)
-{
-	switch (cl->type) {
-		CL_FIELD(closure_with_timer, timer);
-		CL_FIELD(closure_with_waitlist_and_timer, timer);
 	default:
 		return NULL;
 	}
@@ -51,7 +29,7 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
 	int r = flags & CLOSURE_REMAINING_MASK;
 
 	BUG_ON(flags & CLOSURE_GUARD_MASK);
-	BUG_ON(!r && (flags & ~(CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING)));
+	BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
 
 	/* Must deliver precisely one wakeup */
 	if (r == 1 && (flags & CLOSURE_SLEEPING))
@@ -59,7 +37,6 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
 
 	if (!r) {
 		if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
-			/* CLOSURE_BLOCKING might be set - clear it */
 			atomic_set(&cl->remaining,
 				   CLOSURE_REMAINING_INITIALIZER);
 			closure_queue(cl);
@@ -90,13 +67,13 @@ void closure_sub(struct closure *cl, int v)
 {
 	closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
 }
-EXPORT_SYMBOL_GPL(closure_sub);
+EXPORT_SYMBOL(closure_sub);
 
 void closure_put(struct closure *cl)
 {
 	closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
 }
-EXPORT_SYMBOL_GPL(closure_put);
+EXPORT_SYMBOL(closure_put);
 
 static void set_waiting(struct closure *cl, unsigned long f)
 {
@@ -133,7 +110,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list)
 		closure_sub(cl, CLOSURE_WAITING + 1);
 	}
 }
-EXPORT_SYMBOL_GPL(__closure_wake_up);
+EXPORT_SYMBOL(__closure_wake_up);
 
 bool closure_wait(struct closure_waitlist *list, struct closure *cl)
 {
@@ -146,7 +123,7 @@ bool closure_wait(struct closure_waitlist *list, struct closure *cl)
 
 	return true;
 }
-EXPORT_SYMBOL_GPL(closure_wait);
+EXPORT_SYMBOL(closure_wait);
 
 /**
  * closure_sync() - sleep until a closure a closure has nothing left to wait on
@@ -169,7 +146,7 @@ void closure_sync(struct closure *cl)
 
 	__closure_end_sleep(cl);
 }
-EXPORT_SYMBOL_GPL(closure_sync);
+EXPORT_SYMBOL(closure_sync);
 
 /**
  * closure_trylock() - try to acquire the closure, without waiting
@@ -183,17 +160,17 @@ bool closure_trylock(struct closure *cl, struct closure *parent)
 			   CLOSURE_REMAINING_INITIALIZER) != -1)
 		return false;
 
-	closure_set_ret_ip(cl);
-
 	smp_mb();
+
 	cl->parent = parent;
 	if (parent)
 		closure_get(parent);
 
+	closure_set_ret_ip(cl);
 	closure_debug_create(cl);
 	return true;
 }
-EXPORT_SYMBOL_GPL(closure_trylock);
+EXPORT_SYMBOL(closure_trylock);
 
 void __closure_lock(struct closure *cl, struct closure *parent,
 		    struct closure_waitlist *wait_list)
@@ -205,57 +182,11 @@ void __closure_lock(struct closure *cl, struct closure *parent,
 		if (closure_trylock(cl, parent))
 			return;
 
-		closure_wait_event_sync(wait_list, &wait,
-					atomic_read(&cl->remaining) == -1);
+		closure_wait_event(wait_list, &wait,
+				   atomic_read(&cl->remaining) == -1);
 	}
 }
-EXPORT_SYMBOL_GPL(__closure_lock);
-
-static void closure_delay_timer_fn(unsigned long data)
-{
-	struct closure *cl = (struct closure *) data;
-	closure_sub(cl, CLOSURE_TIMER + 1);
-}
-
-void do_closure_timer_init(struct closure *cl)
-{
-	struct timer_list *timer = closure_timer(cl);
-
-	init_timer(timer);
-	timer->data	= (unsigned long) cl;
-	timer->function = closure_delay_timer_fn;
-}
-EXPORT_SYMBOL_GPL(do_closure_timer_init);
-
-bool __closure_delay(struct closure *cl, unsigned long delay,
-		     struct timer_list *timer)
-{
-	if (atomic_read(&cl->remaining) & CLOSURE_TIMER)
-		return false;
-
-	BUG_ON(timer_pending(timer));
-
-	timer->expires	= jiffies + delay;
-
-	atomic_add(CLOSURE_TIMER + 1, &cl->remaining);
-	add_timer(timer);
-	return true;
-}
-EXPORT_SYMBOL_GPL(__closure_delay);
-
-void __closure_flush(struct closure *cl, struct timer_list *timer)
-{
-	if (del_timer(timer))
-		closure_sub(cl, CLOSURE_TIMER + 1);
-}
-EXPORT_SYMBOL_GPL(__closure_flush);
-
-void __closure_flush_sync(struct closure *cl, struct timer_list *timer)
-{
-	if (del_timer_sync(timer))
-		closure_sub(cl, CLOSURE_TIMER + 1);
-}
-EXPORT_SYMBOL_GPL(__closure_flush_sync);
+EXPORT_SYMBOL(__closure_lock);
 
 #ifdef CONFIG_BCACHE_CLOSURES_DEBUG
 
@@ -273,7 +204,7 @@ void closure_debug_create(struct closure *cl)
 	list_add(&cl->all, &closure_list);
 	spin_unlock_irqrestore(&closure_list_lock, flags);
 }
-EXPORT_SYMBOL_GPL(closure_debug_create);
+EXPORT_SYMBOL(closure_debug_create);
 
 void closure_debug_destroy(struct closure *cl)
 {
@@ -286,7 +217,7 @@ void closure_debug_destroy(struct closure *cl)
 	list_del(&cl->all);
 	spin_unlock_irqrestore(&closure_list_lock, flags);
 }
-EXPORT_SYMBOL_GPL(closure_debug_destroy);
+EXPORT_SYMBOL(closure_debug_destroy);
 
 static struct dentry *debug;
 
@@ -304,14 +235,12 @@ static int debug_seq_show(struct seq_file *f, void *data)
 			   cl, (void *) cl->ip, cl->fn, cl->parent,
 			   r & CLOSURE_REMAINING_MASK);
 
-		seq_printf(f, "%s%s%s%s%s%s\n",
+		seq_printf(f, "%s%s%s%s\n",
 			   test_bit(WORK_STRUCT_PENDING,
 				    work_data_bits(&cl->work)) ? "Q" : "",
 			   r & CLOSURE_RUNNING	? "R" : "",
-			   r & CLOSURE_BLOCKING	? "B" : "",
 			   r & CLOSURE_STACK	? "S" : "",
-			   r & CLOSURE_SLEEPING	? "Sl" : "",
-			   r & CLOSURE_TIMER	? "T" : "");
+			   r & CLOSURE_SLEEPING	? "Sl" : "");
 
 		if (r & CLOSURE_WAITING)
 			seq_printf(f, " W %pF\n",
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index ab011f03801f..9762f1be3304 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -155,21 +155,6 @@
  * delayed_work embeds a work item and a timer_list. The important thing is, use
  * it exactly like you would a regular closure and closure_put() will magically
  * handle everything for you.
- *
- * We've got closures that embed timers, too. They're called, appropriately
- * enough:
- * struct closure_with_timer;
- *
- * This gives you access to closure_delay(). It takes a refcount for a specified
- * number of jiffies - you could then call closure_sync() (for a slightly
- * convoluted version of msleep()) or continue_at() - which gives you the same
- * effect as using a delayed work item, except you can reuse the work_struct
- * already embedded in struct closure.
- *
- * Lastly, there's struct closure_with_waitlist_and_timer. It does what you
- * probably expect, if you happen to need the features of both. (You don't
- * really want to know how all this is implemented, but if I've done my job
- * right you shouldn't have to care).
  */
 
 struct closure;
@@ -182,16 +167,11 @@ struct closure_waitlist {
 enum closure_type {
 	TYPE_closure				= 0,
 	TYPE_closure_with_waitlist		= 1,
-	TYPE_closure_with_timer			= 2,
-	TYPE_closure_with_waitlist_and_timer	= 3,
-	MAX_CLOSURE_TYPE			= 3,
+	MAX_CLOSURE_TYPE			= 1,
 };
 
 enum closure_state {
 	/*
-	 * CLOSURE_BLOCKING: Causes closure_wait_event() to block, instead of
-	 * waiting asynchronously
-	 *
 	 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
 	 * the thread that owns the closure, and cleared by the thread that's
 	 * waking up the closure.
@@ -200,10 +180,6 @@ enum closure_state {
 	 * - indicates that cl->task is valid and closure_put() may wake it up.
 	 * Only set or cleared by the thread that owns the closure.
 	 *
-	 * CLOSURE_TIMER: Analagous to CLOSURE_WAITING, indicates that a closure
-	 * has an outstanding timer. Must be set by the thread that owns the
-	 * closure, and cleared by the timer function when the timer goes off.
-	 *
 	 * The rest are for debugging and don't affect behaviour:
 	 *
 	 * CLOSURE_RUNNING: Set when a closure is running (i.e. by
@@ -218,19 +194,17 @@ enum closure_state {
 	 * closure with this flag set
 	 */
 
-	CLOSURE_BITS_START	= (1 << 19),
-	CLOSURE_DESTRUCTOR	= (1 << 19),
-	CLOSURE_BLOCKING	= (1 << 21),
-	CLOSURE_WAITING		= (1 << 23),
-	CLOSURE_SLEEPING	= (1 << 25),
-	CLOSURE_TIMER		= (1 << 27),
+	CLOSURE_BITS_START	= (1 << 23),
+	CLOSURE_DESTRUCTOR	= (1 << 23),
+	CLOSURE_WAITING		= (1 << 25),
+	CLOSURE_SLEEPING	= (1 << 27),
 	CLOSURE_RUNNING		= (1 << 29),
 	CLOSURE_STACK		= (1 << 31),
 };
 
 #define CLOSURE_GUARD_MASK					\
-	((CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING|CLOSURE_WAITING|	\
-	  CLOSURE_SLEEPING|CLOSURE_TIMER|CLOSURE_RUNNING|CLOSURE_STACK) << 1)
+	((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_SLEEPING|	\
+	  CLOSURE_RUNNING|CLOSURE_STACK) << 1)
 
 #define CLOSURE_REMAINING_MASK		(CLOSURE_BITS_START - 1)
 #define CLOSURE_REMAINING_INITIALIZER	(1|CLOSURE_RUNNING)
@@ -268,17 +242,6 @@ struct closure_with_waitlist {
 	struct closure_waitlist	wait;
 };
 
-struct closure_with_timer {
-	struct closure		cl;
-	struct timer_list	timer;
-};
-
-struct closure_with_waitlist_and_timer {
-	struct closure		cl;
-	struct closure_waitlist	wait;
-	struct timer_list	timer;
-};
-
 extern unsigned invalid_closure_type(void);
 
 #define __CLOSURE_TYPE(cl, _t)						\
@@ -289,14 +252,11 @@ extern unsigned invalid_closure_type(void);
 (									\
 	__CLOSURE_TYPE(cl, closure)					\
 	__CLOSURE_TYPE(cl, closure_with_waitlist)			\
-	__CLOSURE_TYPE(cl, closure_with_timer)				\
-	__CLOSURE_TYPE(cl, closure_with_waitlist_and_timer)		\
 	invalid_closure_type()						\
 )
 
 void closure_sub(struct closure *cl, int v);
 void closure_put(struct closure *cl);
-void closure_queue(struct closure *cl);
 void __closure_wake_up(struct closure_waitlist *list);
 bool closure_wait(struct closure_waitlist *list, struct closure *cl);
 void closure_sync(struct closure *cl);
@@ -305,12 +265,6 @@ bool closure_trylock(struct closure *cl, struct closure *parent);
 void __closure_lock(struct closure *cl, struct closure *parent,
 		    struct closure_waitlist *wait_list);
 
-void do_closure_timer_init(struct closure *cl);
-bool __closure_delay(struct closure *cl, unsigned long delay,
-		     struct timer_list *timer);
-void __closure_flush(struct closure *cl, struct timer_list *timer);
-void __closure_flush_sync(struct closure *cl, struct timer_list *timer);
-
 #ifdef CONFIG_BCACHE_CLOSURES_DEBUG
 
 void closure_debug_init(void);
@@ -354,11 +308,6 @@ static inline void closure_set_stopped(struct closure *cl)
 	atomic_sub(CLOSURE_RUNNING, &cl->remaining);
 }
 
-static inline bool closure_is_stopped(struct closure *cl)
-{
-	return !(atomic_read(&cl->remaining) & CLOSURE_RUNNING);
-}
-
 static inline bool closure_is_unlocked(struct closure *cl)
 {
 	return atomic_read(&cl->remaining) == -1;
@@ -367,14 +316,6 @@ static inline bool closure_is_unlocked(struct closure *cl)
 static inline void do_closure_init(struct closure *cl, struct closure *parent,
 				   bool running)
 {
-	switch (cl->type) {
-	case TYPE_closure_with_timer:
-	case TYPE_closure_with_waitlist_and_timer:
-		do_closure_timer_init(cl);
-	default:
-		break;
-	}
-
 	cl->parent = parent;
 	if (parent)
 		closure_get(parent);
@@ -429,8 +370,7 @@ do {								\
 static inline void closure_init_stack(struct closure *cl)
 {
 	memset(cl, 0, sizeof(struct closure));
-	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|
-		   CLOSURE_BLOCKING|CLOSURE_STACK);
+	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK);
 }
 
 /**
@@ -461,24 +401,6 @@ do {								\
 #define closure_lock(cl, parent)				\
 	__closure_lock(__to_internal_closure(cl), parent, &(cl)->wait)
 
-/**
- * closure_delay() - delay some number of jiffies
- * @cl:		the closure that will sleep
- * @delay:	the delay in jiffies
- *
- * Takes a refcount on @cl which will be released after @delay jiffies; this may
- * be used to have a function run after a delay with continue_at(), or
- * closure_sync() may be used for a convoluted version of msleep().
- */
-#define closure_delay(cl, delay)			\
-	__closure_delay(__to_internal_closure(cl), delay, &(cl)->timer)
-
-#define closure_flush(cl)				\
-	__closure_flush(__to_internal_closure(cl), &(cl)->timer)
-
-#define closure_flush_sync(cl)				\
-	__closure_flush_sync(__to_internal_closure(cl), &(cl)->timer)
-
 static inline void __closure_end_sleep(struct closure *cl)
 {
 	__set_current_state(TASK_RUNNING);
@@ -497,40 +419,6 @@ static inline void __closure_start_sleep(struct closure *cl)
 		atomic_add(CLOSURE_SLEEPING, &cl->remaining);
 }
 
-/**
- * closure_blocking() - returns true if the closure is in blocking mode.
- *
- * If a closure is in blocking mode, closure_wait_event() will sleep until the
- * condition is true instead of waiting asynchronously.
- */
-static inline bool closure_blocking(struct closure *cl)
-{
-	return atomic_read(&cl->remaining) & CLOSURE_BLOCKING;
-}
-
-/**
- * set_closure_blocking() - put a closure in blocking mode.
- *
- * If a closure is in blocking mode, closure_wait_event() will sleep until the
- * condition is true instead of waiting asynchronously.
- *
- * Not thread safe - can only be called by the thread running the closure.
- */
-static inline void set_closure_blocking(struct closure *cl)
-{
-	if (!closure_blocking(cl))
-		atomic_add(CLOSURE_BLOCKING, &cl->remaining);
-}
-
-/*
- * Not thread safe - can only be called by the thread running the closure.
- */
-static inline void clear_closure_blocking(struct closure *cl)
-{
-	if (closure_blocking(cl))
-		atomic_sub(CLOSURE_BLOCKING, &cl->remaining);
-}
-
 /**
  * closure_wake_up() - wake up all closures on a wait list.
  */
@@ -561,63 +449,36 @@ static inline void closure_wake_up(struct closure_waitlist *list)
  * refcount on our closure. If this was a stack allocated closure, that would be
  * bad.
  */
-#define __closure_wait_event(list, cl, condition, _block)		\
+#define closure_wait_event(list, cl, condition)				\
 ({									\
-	bool block = _block;						\
 	typeof(condition) ret;						\
 									\
 	while (1) {							\
 		ret = (condition);					\
 		if (ret) {						\
 			__closure_wake_up(list);			\
-			if (block)					\
-				closure_sync(cl);			\
-									\
+			closure_sync(cl);				\
 			break;						\
 		}							\
 									\
-		if (block)						\
-			__closure_start_sleep(cl);			\
-									\
-		if (!closure_wait(list, cl)) {				\
-			if (!block)					\
-				break;					\
+		__closure_start_sleep(cl);				\
 									\
+		if (!closure_wait(list, cl))				\
 			schedule();					\
-		}							\
 	}								\
 									\
 	ret;								\
 })
 
-/**
- * closure_wait_event() - wait on a condition, synchronously or asynchronously.
- * @list:	the wait list to wait on
- * @cl:		the closure that is doing the waiting
- * @condition:	a C expression for the event to wait for
- *
- * If the closure is in blocking mode, sleeps until the @condition evaluates to
- * true - exactly like wait_event().
- *
- * If the closure is not in blocking mode, waits asynchronously; if the
- * condition is currently false the @cl is put onto @list and returns. @list
- * owns a refcount on @cl; closure_sync() or continue_at() may be used later to
- * wait for another thread to wake up @list, which drops the refcount on @cl.
- *
- * Returns the value of @condition; @cl will be on @list iff @condition was
- * false.
- *
- * closure_wake_up(@list) must be called after changing any variable that could
- * cause @condition to become true.
- */
-#define closure_wait_event(list, cl, condition)				\
-	__closure_wait_event(list, cl, condition, closure_blocking(cl))
-
-#define closure_wait_event_async(list, cl, condition)			\
-	__closure_wait_event(list, cl, condition, false)
-
-#define closure_wait_event_sync(list, cl, condition)			\
-	__closure_wait_event(list, cl, condition, true)
+static inline void closure_queue(struct closure *cl)
+{
+	struct workqueue_struct *wq = cl->wq;
+	if (wq) {
+		INIT_WORK(&cl->work, cl->work.func);
+		BUG_ON(!queue_work(wq, &cl->work));
+	} else
+		cl->fn(cl);
+}
 
 static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
 				  struct workqueue_struct *wq)

From 1b207d80d5b986fb305bc899357435d319319513 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Tue, 10 Sep 2013 18:52:54 -0700
Subject: [PATCH 70/94] bcache: Kill op->replace

This is prep work for converting bch_btree_insert to
bch_btree_map_leaf_nodes() - we have to convert all its arguments to
actual arguments. Bunch of churn, but should be straightforward.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/btree.c     | 102 +++++++++++++++++-----------------
 drivers/md/bcache/btree.h     |  10 +---
 drivers/md/bcache/journal.c   |   2 +-
 drivers/md/bcache/movinggc.c  |   4 +-
 drivers/md/bcache/request.c   |  14 +++--
 drivers/md/bcache/request.h   |   2 +
 drivers/md/bcache/writeback.c |  10 ++--
 7 files changed, 71 insertions(+), 73 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 3e0c90130c2e..7a1d8dc19e61 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -89,15 +89,6 @@
  * Test module load/unload
  */
 
-static const char * const op_types[] = {
-	"insert", "replace"
-};
-
-static const char *op_type(struct btree_op *op)
-{
-	return op_types[op->type];
-}
-
 enum {
 	BTREE_INSERT_STATUS_INSERT,
 	BTREE_INSERT_STATUS_BACK_MERGE,
@@ -1699,10 +1690,9 @@ static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert)
 	bch_bset_fix_lookup_table(b, where);
 }
 
-static bool fix_overlapping_extents(struct btree *b,
-				    struct bkey *insert,
+static bool fix_overlapping_extents(struct btree *b, struct bkey *insert,
 				    struct btree_iter *iter,
-				    struct btree_op *op)
+				    struct bkey *replace_key)
 {
 	void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
 	{
@@ -1730,39 +1720,38 @@ static bool fix_overlapping_extents(struct btree *b,
 		 * We might overlap with 0 size extents; we can't skip these
 		 * because if they're in the set we're inserting to we have to
 		 * adjust them so they don't overlap with the key we're
-		 * inserting. But we don't want to check them for BTREE_REPLACE
+		 * inserting. But we don't want to check them for replace
 		 * operations.
 		 */
 
-		if (op->type == BTREE_REPLACE &&
-		    KEY_SIZE(k)) {
+		if (replace_key && KEY_SIZE(k)) {
 			/*
 			 * k might have been split since we inserted/found the
 			 * key we're replacing
 			 */
 			unsigned i;
 			uint64_t offset = KEY_START(k) -
-				KEY_START(&op->replace);
+				KEY_START(replace_key);
 
 			/* But it must be a subset of the replace key */
-			if (KEY_START(k) < KEY_START(&op->replace) ||
-			    KEY_OFFSET(k) > KEY_OFFSET(&op->replace))
+			if (KEY_START(k) < KEY_START(replace_key) ||
+			    KEY_OFFSET(k) > KEY_OFFSET(replace_key))
 				goto check_failed;
 
 			/* We didn't find a key that we were supposed to */
 			if (KEY_START(k) > KEY_START(insert) + sectors_found)
 				goto check_failed;
 
-			if (KEY_PTRS(&op->replace) != KEY_PTRS(k))
+			if (KEY_PTRS(replace_key) != KEY_PTRS(k))
 				goto check_failed;
 
 			/* skip past gen */
 			offset <<= 8;
 
-			BUG_ON(!KEY_PTRS(&op->replace));
+			BUG_ON(!KEY_PTRS(replace_key));
 
-			for (i = 0; i < KEY_PTRS(&op->replace); i++)
-				if (k->ptr[i] != op->replace.ptr[i] + offset)
+			for (i = 0; i < KEY_PTRS(replace_key); i++)
+				if (k->ptr[i] != replace_key->ptr[i] + offset)
 					goto check_failed;
 
 			sectors_found = KEY_OFFSET(k) - KEY_START(insert);
@@ -1833,9 +1822,8 @@ static bool fix_overlapping_extents(struct btree *b,
 	}
 
 check_failed:
-	if (op->type == BTREE_REPLACE) {
+	if (replace_key) {
 		if (!sectors_found) {
-			op->insert_collision = true;
 			return true;
 		} else if (sectors_found < KEY_SIZE(insert)) {
 			SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
@@ -1848,7 +1836,7 @@ check_failed:
 }
 
 static bool btree_insert_key(struct btree *b, struct btree_op *op,
-			     struct bkey *k)
+			     struct bkey *k, struct bkey *replace_key)
 {
 	struct bset *i = b->sets[b->nsets].data;
 	struct bkey *m, *prev;
@@ -1874,8 +1862,10 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
 		prev = NULL;
 		m = bch_btree_iter_init(b, &iter, &search);
 
-		if (fix_overlapping_extents(b, k, &iter, op))
+		if (fix_overlapping_extents(b, k, &iter, replace_key)) {
+			op->insert_collision = true;
 			return false;
+		}
 
 		if (KEY_DIRTY(k))
 			bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
@@ -1903,24 +1893,28 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
 		if (m != end(i) &&
 		    bch_bkey_try_merge(b, k, m))
 			goto copy;
-	} else
+	} else {
+		BUG_ON(replace_key);
 		m = bch_bset_search(b, &b->sets[b->nsets], k);
+	}
 
 insert:	shift_keys(b, m, k);
 copy:	bkey_copy(m, k);
 merged:
-	bch_check_keys(b, "%u for %s", status, op_type(op));
+	bch_check_keys(b, "%u for %s", status,
+		       replace_key ? "replace" : "insert");
 
 	if (b->level && !KEY_OFFSET(k))
 		btree_current_write(b)->prio_blocked++;
 
-	trace_bcache_btree_insert_key(b, k, op->type, status);
+	trace_bcache_btree_insert_key(b, k, replace_key != NULL, status);
 
 	return true;
 }
 
 static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
-				  struct keylist *insert_keys)
+				  struct keylist *insert_keys,
+				  struct bkey *replace_key)
 {
 	bool ret = false;
 	unsigned oldsize = bch_count_data(b);
@@ -1936,11 +1930,11 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
 		if (bkey_cmp(k, &b->key) <= 0) {
 			bkey_put(b->c, k, b->level);
 
-			ret |= btree_insert_key(b, op, k);
+			ret |= btree_insert_key(b, op, k, replace_key);
 			bch_keylist_pop_front(insert_keys);
 		} else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
 #if 0
-			if (op->type == BTREE_REPLACE) {
+			if (replace_key) {
 				bkey_put(b->c, k, b->level);
 				bch_keylist_pop_front(insert_keys);
 				op->insert_collision = true;
@@ -1953,7 +1947,7 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
 			bch_cut_back(&b->key, &temp.key);
 			bch_cut_front(&b->key, insert_keys->keys);
 
-			ret |= btree_insert_key(b, op, &temp.key);
+			ret |= btree_insert_key(b, op, &temp.key, replace_key);
 			break;
 		} else {
 			break;
@@ -1968,7 +1962,8 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
 
 static int btree_split(struct btree *b, struct btree_op *op,
 		       struct keylist *insert_keys,
-		       struct keylist *parent_keys)
+		       struct keylist *parent_keys,
+		       struct bkey *replace_key)
 {
 	bool split;
 	struct btree *n1, *n2 = NULL, *n3 = NULL;
@@ -1998,7 +1993,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
 				goto err_free2;
 		}
 
-		bch_btree_insert_keys(n1, op, insert_keys);
+		bch_btree_insert_keys(n1, op, insert_keys, replace_key);
 
 		/*
 		 * Has to be a linear search because we don't have an auxiliary
@@ -2026,7 +2021,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
 	} else {
 		trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
 
-		bch_btree_insert_keys(n1, op, insert_keys);
+		bch_btree_insert_keys(n1, op, insert_keys, replace_key);
 	}
 
 	bch_keylist_add(parent_keys, &n1->key);
@@ -2036,7 +2031,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
 		/* Depth increases, make a new root */
 
 		bkey_copy_key(&n3->key, &MAX_KEY);
-		bch_btree_insert_keys(n3, op, parent_keys);
+		bch_btree_insert_keys(n3, op, parent_keys, NULL);
 		bch_btree_node_write(n3, &cl);
 
 		closure_sync(&cl);
@@ -2091,7 +2086,8 @@ err:
 
 static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
 				 struct keylist *insert_keys,
-				 atomic_t *journal_ref)
+				 atomic_t *journal_ref,
+				 struct bkey *replace_key)
 {
 	int ret = 0;
 	struct keylist split_keys;
@@ -2101,6 +2097,8 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
 	BUG_ON(b->level);
 
 	do {
+		BUG_ON(b->level && replace_key);
+
 		if (should_split(b)) {
 			if (current->bio_list) {
 				op->lock = b->c->root->level + 1;
@@ -2112,8 +2110,9 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
 				struct btree *parent = b->parent;
 
 				ret = btree_split(b, op, insert_keys,
-						  &split_keys);
+						  &split_keys, replace_key);
 				insert_keys = &split_keys;
+				replace_key = NULL;
 				b = parent;
 				if (!ret)
 					ret = -EINTR;
@@ -2121,7 +2120,8 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
 		} else {
 			BUG_ON(write_block(b) != b->sets[b->nsets].data);
 
-			if (bch_btree_insert_keys(b, op, insert_keys)) {
+			if (bch_btree_insert_keys(b, op, insert_keys,
+						  replace_key)) {
 				if (!b->level) {
 					bch_btree_leaf_dirty(b, journal_ref);
 				} else {
@@ -2165,9 +2165,7 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
 
 	bch_keylist_add(&insert, check_key);
 
-	BUG_ON(op->type != BTREE_INSERT);
-
-	ret = bch_btree_insert_node(b, op, &insert, NULL);
+	ret = bch_btree_insert_node(b, op, &insert, NULL, NULL);
 
 	BUG_ON(!ret && !bch_keylist_empty(&insert));
 out:
@@ -2177,7 +2175,8 @@ out:
 }
 
 static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
-				    struct keylist *keys, atomic_t *journal_ref)
+				    struct keylist *keys, atomic_t *journal_ref,
+				    struct bkey *replace_key)
 {
 	if (bch_keylist_empty(keys))
 		return 0;
@@ -2194,14 +2193,17 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
 			return -EIO;
 		}
 
-		return btree(insert_recurse, k, b, op, keys, journal_ref);
+		return btree(insert_recurse, k, b, op, keys,
+			     journal_ref, replace_key);
 	} else {
-		return bch_btree_insert_node(b, op, keys, journal_ref);
+		return bch_btree_insert_node(b, op, keys,
+					     journal_ref, replace_key);
 	}
 }
 
 int bch_btree_insert(struct btree_op *op, struct cache_set *c,
-		     struct keylist *keys, atomic_t *journal_ref)
+		     struct keylist *keys, atomic_t *journal_ref,
+		     struct bkey *replace_key)
 {
 	int ret = 0;
 
@@ -2209,7 +2211,8 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c,
 
 	while (!bch_keylist_empty(keys)) {
 		op->lock = 0;
-		ret = btree_root(insert_recurse, c, op, keys, journal_ref);
+		ret = btree_root(insert_recurse, c, op, keys,
+				 journal_ref, replace_key);
 
 		if (ret == -EAGAIN) {
 			BUG();
@@ -2217,8 +2220,7 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c,
 		} else if (ret) {
 			struct bkey *k;
 
-			pr_err("error %i trying to insert key for %s",
-			       ret, op_type(op));
+			pr_err("error %i", ret);
 
 			while ((k = bch_keylist_pop(keys)))
 				bkey_put(c, k, 0);
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 34ee5359b262..6ff08be3e0c9 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -240,15 +240,7 @@ struct btree_op {
 	/* Btree level at which we start taking write locks */
 	short			lock;
 
-	/* Btree insertion type */
-	enum {
-		BTREE_INSERT,
-		BTREE_REPLACE
-	} type:8;
-
 	unsigned		insert_collision:1;
-
-	BKEY_PADDED(replace);
 };
 
 static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
@@ -290,7 +282,7 @@ struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool);
 int bch_btree_insert_check_key(struct btree *, struct btree_op *,
 			       struct bkey *);
 int bch_btree_insert(struct btree_op *, struct cache_set *,
-		     struct keylist *, atomic_t *);
+		     struct keylist *, atomic_t *, struct bkey *);
 
 int bch_gc_thread_start(struct cache_set *);
 size_t bch_btree_gc_finish(struct cache_set *);
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 20e900ad5010..592adf51128f 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -322,7 +322,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
 			bkey_copy(keylist.top, k);
 			bch_keylist_push(&keylist);
 
-			ret = bch_btree_insert(&op, s, &keylist, i->pin);
+			ret = bch_btree_insert(&op, s, &keylist, i->pin, NULL);
 			if (ret)
 				goto err;
 
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 219356f6159d..c45ba4f21bae 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -105,8 +105,8 @@ static void write_moving(struct closure *cl)
 		s->writeback		= KEY_DIRTY(&io->w->key);
 		s->csum			= KEY_CSUM(&io->w->key);
 
-		s->op.type = BTREE_REPLACE;
-		bkey_copy(&s->op.replace, &io->w->key);
+		bkey_copy(&s->replace_key, &io->w->key);
+		s->replace = true;
 
 		closure_init(&s->btree, cl);
 		bch_data_insert(&s->btree);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 5df44fbc9e1d..16a3e16f3ff4 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -217,6 +217,7 @@ static void bch_data_insert_keys(struct closure *cl)
 {
 	struct search *s = container_of(cl, struct search, btree);
 	atomic_t *journal_ref = NULL;
+	struct bkey *replace_key = s->replace ? &s->replace_key : NULL;
 
 	/*
 	 * If we're looping, might already be waiting on
@@ -235,7 +236,8 @@ static void bch_data_insert_keys(struct closure *cl)
 					  s->flush_journal
 					  ? &s->cl : NULL);
 
-	if (bch_btree_insert(&s->op, s->c, &s->insert_keys, journal_ref)) {
+	if (bch_btree_insert(&s->op, s->c, &s->insert_keys,
+			     journal_ref, replace_key)) {
 		s->error		= -ENOMEM;
 		s->insert_data_done	= true;
 	}
@@ -1056,7 +1058,7 @@ static void cached_dev_read_done(struct closure *cl)
 
 	if (s->cache_bio &&
 	    !test_bit(CACHE_SET_STOPPING, &s->c->flags)) {
-		s->op.type = BTREE_REPLACE;
+		BUG_ON(!s->replace);
 		closure_call(&s->btree, bch_data_insert, NULL, cl);
 	}
 
@@ -1101,13 +1103,15 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 
 	s->cache_bio_sectors = min(sectors, bio_sectors(bio) + reada);
 
-	s->op.replace = KEY(s->inode, bio->bi_sector +
-			    s->cache_bio_sectors, s->cache_bio_sectors);
+	s->replace_key = KEY(s->inode, bio->bi_sector +
+			     s->cache_bio_sectors, s->cache_bio_sectors);
 
-	ret = bch_btree_insert_check_key(b, &s->op, &s->op.replace);
+	ret = bch_btree_insert_check_key(b, &s->op, &s->replace_key);
 	if (ret)
 		return ret;
 
+	s->replace = true;
+
 	miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
 
 	/* btree_search_recurse()'s btree iterator is no good anymore */
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index ed578aa53ee2..5ea7a0e6fca0 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -33,6 +33,7 @@ struct search {
 	unsigned		flush_journal:1;
 
 	unsigned		insert_data_done:1;
+	unsigned		replace:1;
 
 	uint16_t		write_prio;
 
@@ -44,6 +45,7 @@ struct search {
 
 	/* Anything past this point won't get zeroed in search_alloc() */
 	struct keylist		insert_keys;
+	BKEY_PADDED(replace_key);
 };
 
 unsigned bch_get_congested(struct cache_set *);
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index d0968e8938f7..346a5341faca 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -146,16 +146,14 @@ static void write_dirty_finish(struct closure *cl)
 		bch_btree_op_init(&op, -1);
 		bch_keylist_init(&keys);
 
-		op.type = BTREE_REPLACE;
-		bkey_copy(&op.replace, &w->key);
-
-		SET_KEY_DIRTY(&w->key, false);
-		bch_keylist_add(&keys, &w->key);
+		bkey_copy(keys.top, &w->key);
+		SET_KEY_DIRTY(keys.top, false);
+		bch_keylist_push(&keys);
 
 		for (i = 0; i < KEY_PTRS(&w->key); i++)
 			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
 
-		bch_btree_insert(&op, dc->disk.c, &keys, NULL);
+		bch_btree_insert(&op, dc->disk.c, &keys, NULL, &w->key);
 
 		if (op.insert_collision)
 			trace_bcache_writeback_collision(&w->key);

From 6054c6d4da1940c7bf8870c6393773aa794f53d8 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 24 Jul 2013 18:06:22 -0700
Subject: [PATCH 71/94] bcache: Don't use op->insert_collision

When we convert bch_btree_insert() to bch_btree_map_leaf_nodes(), we
won't be passing struct btree_op to bch_btree_insert() anymore - so we
need a different way of returning whether there was a collision (really,
a replace collision).

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/btree.c     |  3 +++
 drivers/md/bcache/movinggc.c  |  2 +-
 drivers/md/bcache/request.c   | 10 +++++++---
 drivers/md/bcache/request.h   |  1 +
 drivers/md/bcache/writeback.c |  7 ++++---
 5 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 7a1d8dc19e61..441524dd2d77 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -2227,6 +2227,9 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c,
 		}
 	}
 
+	if (op->insert_collision)
+		return -ESRCH;
+
 	return ret;
 }
 
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index c45ba4f21bae..601c96a62b30 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -52,7 +52,7 @@ static void write_moving_finish(struct closure *cl)
 	bio_for_each_segment_all(bv, bio, i)
 		__free_page(bv->bv_page);
 
-	if (io->s.op.insert_collision)
+	if (io->s.insert_collision)
 		trace_bcache_gc_copy_collision(&io->w->key);
 
 	bch_keybuf_del(&io->s.c->moving_gc_keys, io->w);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 16a3e16f3ff4..bcce06a1e466 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -218,6 +218,7 @@ static void bch_data_insert_keys(struct closure *cl)
 	struct search *s = container_of(cl, struct search, btree);
 	atomic_t *journal_ref = NULL;
 	struct bkey *replace_key = s->replace ? &s->replace_key : NULL;
+	int ret;
 
 	/*
 	 * If we're looping, might already be waiting on
@@ -236,8 +237,11 @@ static void bch_data_insert_keys(struct closure *cl)
 					  s->flush_journal
 					  ? &s->cl : NULL);
 
-	if (bch_btree_insert(&s->op, s->c, &s->insert_keys,
-			     journal_ref, replace_key)) {
+	ret = bch_btree_insert(&s->op, s->c, &s->insert_keys,
+			       journal_ref, replace_key);
+	if (ret == -ESRCH) {
+		s->insert_collision = true;
+	} else if (ret) {
 		s->error		= -ENOMEM;
 		s->insert_data_done	= true;
 	}
@@ -977,7 +981,7 @@ static void cached_dev_cache_miss_done(struct closure *cl)
 {
 	struct search *s = container_of(cl, struct search, cl);
 
-	if (s->op.insert_collision)
+	if (s->insert_collision)
 		bch_mark_cache_miss_collision(s);
 
 	if (s->cache_bio) {
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 5ea7a0e6fca0..f0e930b4ca89 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -34,6 +34,7 @@ struct search {
 
 	unsigned		insert_data_done:1;
 	unsigned		replace:1;
+	unsigned		insert_collision:1;
 
 	uint16_t		write_prio;
 
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 346a5341faca..312032ef34f7 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -142,6 +142,7 @@ static void write_dirty_finish(struct closure *cl)
 		unsigned i;
 		struct btree_op op;
 		struct keylist keys;
+		int ret;
 
 		bch_btree_op_init(&op, -1);
 		bch_keylist_init(&keys);
@@ -153,12 +154,12 @@ static void write_dirty_finish(struct closure *cl)
 		for (i = 0; i < KEY_PTRS(&w->key); i++)
 			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
 
-		bch_btree_insert(&op, dc->disk.c, &keys, NULL, &w->key);
+		ret = bch_btree_insert(&op, dc->disk.c, &keys, NULL, &w->key);
 
-		if (op.insert_collision)
+		if (ret)
 			trace_bcache_writeback_collision(&w->key);
 
-		atomic_long_inc(op.insert_collision
+		atomic_long_inc(ret
 				? &dc->disk.c->writeback_keys_failed
 				: &dc->disk.c->writeback_keys_done);
 	}

From cc7b8819212f437fc82f0f9cdc24deb0fb5d775f Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 24 Jul 2013 18:07:22 -0700
Subject: [PATCH 72/94] bcache: Convert bch_btree_insert() to
 bch_btree_map_leaf_nodes()

Last of the btree_map() conversions. Main visible effect is
bch_btree_insert() is no longer taking a struct btree_op as an argument
anymore - there's no fancy state machine stuff going on, it's just a
normal function.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/btree.c     | 81 ++++++++++++++++-------------------
 drivers/md/bcache/btree.h     |  4 +-
 drivers/md/bcache/journal.c   |  4 +-
 drivers/md/bcache/request.c   |  2 +-
 drivers/md/bcache/writeback.c |  8 ++--
 5 files changed, 45 insertions(+), 54 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 441524dd2d77..f5aa4adadf1d 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -2174,61 +2174,56 @@ out:
 	return ret;
 }
 
-static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
-				    struct keylist *keys, atomic_t *journal_ref,
-				    struct bkey *replace_key)
+struct btree_insert_op {
+	struct btree_op	op;
+	struct keylist	*keys;
+	atomic_t	*journal_ref;
+	struct bkey	*replace_key;
+};
+
+int btree_insert_fn(struct btree_op *b_op, struct btree *b)
 {
-	if (bch_keylist_empty(keys))
-		return 0;
+	struct btree_insert_op *op = container_of(b_op,
+					struct btree_insert_op, op);
 
-	if (b->level) {
-		struct bkey *k;
-
-		k = bch_next_recurse_key(b, &START_KEY(keys->keys));
-		if (!k) {
-			btree_bug(b, "no key to recurse on at level %i/%i",
-				  b->level, b->c->root->level);
-
-			bch_keylist_reset(keys);
-			return -EIO;
-		}
-
-		return btree(insert_recurse, k, b, op, keys,
-			     journal_ref, replace_key);
-	} else {
-		return bch_btree_insert_node(b, op, keys,
-					     journal_ref, replace_key);
-	}
+	int ret = bch_btree_insert_node(b, &op->op, op->keys,
+					op->journal_ref, op->replace_key);
+	if (ret && !bch_keylist_empty(op->keys))
+		return ret;
+	else
+		return MAP_DONE;
 }
 
-int bch_btree_insert(struct btree_op *op, struct cache_set *c,
-		     struct keylist *keys, atomic_t *journal_ref,
-		     struct bkey *replace_key)
+int bch_btree_insert(struct cache_set *c, struct keylist *keys,
+		     atomic_t *journal_ref, struct bkey *replace_key)
 {
+	struct btree_insert_op op;
 	int ret = 0;
 
+	BUG_ON(current->bio_list);
 	BUG_ON(bch_keylist_empty(keys));
 
-	while (!bch_keylist_empty(keys)) {
-		op->lock = 0;
-		ret = btree_root(insert_recurse, c, op, keys,
-				 journal_ref, replace_key);
+	bch_btree_op_init(&op.op, 0);
+	op.keys		= keys;
+	op.journal_ref	= journal_ref;
+	op.replace_key	= replace_key;
 
-		if (ret == -EAGAIN) {
-			BUG();
-			ret = 0;
-		} else if (ret) {
-			struct bkey *k;
-
-			pr_err("error %i", ret);
-
-			while ((k = bch_keylist_pop(keys)))
-				bkey_put(c, k, 0);
-		}
+	while (!ret && !bch_keylist_empty(keys)) {
+		op.op.lock = 0;
+		ret = bch_btree_map_leaf_nodes(&op.op, c,
+					       &START_KEY(keys->keys),
+					       btree_insert_fn);
 	}
 
-	if (op->insert_collision)
-		return -ESRCH;
+	if (ret) {
+		struct bkey *k;
+
+		pr_err("error %i", ret);
+
+		while ((k = bch_keylist_pop(keys)))
+			bkey_put(c, k, 0);
+	} else if (op.op.insert_collision)
+		ret = -ESRCH;
 
 	return ret;
 }
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 6ff08be3e0c9..8fc1e8925399 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -281,8 +281,8 @@ struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool);
 
 int bch_btree_insert_check_key(struct btree *, struct btree_op *,
 			       struct bkey *);
-int bch_btree_insert(struct btree_op *, struct cache_set *,
-		     struct keylist *, atomic_t *, struct bkey *);
+int bch_btree_insert(struct cache_set *, struct keylist *,
+		     atomic_t *, struct bkey *);
 
 int bch_gc_thread_start(struct cache_set *);
 size_t bch_btree_gc_finish(struct cache_set *);
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 592adf51128f..86de64a6bf26 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -302,10 +302,8 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
 
 	uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
 	struct keylist keylist;
-	struct btree_op op;
 
 	bch_keylist_init(&keylist);
-	bch_btree_op_init(&op, SHRT_MAX);
 
 	list_for_each_entry(i, list, list) {
 		BUG_ON(i->pin && atomic_read(i->pin) != 1);
@@ -322,7 +320,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
 			bkey_copy(keylist.top, k);
 			bch_keylist_push(&keylist);
 
-			ret = bch_btree_insert(&op, s, &keylist, i->pin, NULL);
+			ret = bch_btree_insert(s, &keylist, i->pin, NULL);
 			if (ret)
 				goto err;
 
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index bcce06a1e466..6cee2ae1d87f 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -237,7 +237,7 @@ static void bch_data_insert_keys(struct closure *cl)
 					  s->flush_journal
 					  ? &s->cl : NULL);
 
-	ret = bch_btree_insert(&s->op, s->c, &s->insert_keys,
+	ret = bch_btree_insert(s->c, &s->insert_keys,
 			       journal_ref, replace_key);
 	if (ret == -ESRCH) {
 		s->insert_collision = true;
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 312032ef34f7..ab0f6b449111 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -139,12 +139,10 @@ static void write_dirty_finish(struct closure *cl)
 
 	/* This is kind of a dumb way of signalling errors. */
 	if (KEY_DIRTY(&w->key)) {
-		unsigned i;
-		struct btree_op op;
-		struct keylist keys;
 		int ret;
+		unsigned i;
+		struct keylist keys;
 
-		bch_btree_op_init(&op, -1);
 		bch_keylist_init(&keys);
 
 		bkey_copy(keys.top, &w->key);
@@ -154,7 +152,7 @@ static void write_dirty_finish(struct closure *cl)
 		for (i = 0; i < KEY_PTRS(&w->key); i++)
 			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
 
-		ret = bch_btree_insert(&op, dc->disk.c, &keys, NULL, &w->key);
+		ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key);
 
 		if (ret)
 			trace_bcache_writeback_collision(&w->key);

From 220bb38c21b83e2f7b842f33220bf727093eca89 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Tue, 10 Sep 2013 19:02:45 -0700
Subject: [PATCH 73/94] bcache: Break up struct search

With all the recent refactoring around struct btree op struct search has
gotten rather large.

But we can now easily break it up in a different way - we break out
struct btree_insert_op which is for inserting data into the cache, and
that's now what the copying gc code uses - struct search is now specific
to request.c

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/debug.c     |  38 +-
 drivers/md/bcache/debug.h     |   4 +-
 drivers/md/bcache/movinggc.c  |  56 ++-
 drivers/md/bcache/request.c   | 718 ++++++++++++++++++----------------
 drivers/md/bcache/request.h   |  39 +-
 drivers/md/bcache/stats.c     |  26 +-
 drivers/md/bcache/stats.h     |  13 +-
 drivers/md/bcache/trace.c     |   1 -
 include/trace/events/bcache.h |  18 +-
 9 files changed, 448 insertions(+), 465 deletions(-)

diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 88e6411eab4f..d9ccb3169aa2 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -8,7 +8,6 @@
 #include "bcache.h"
 #include "btree.h"
 #include "debug.h"
-#include "request.h"
 
 #include <linux/console.h>
 #include <linux/debugfs.h>
@@ -176,42 +175,25 @@ void bch_btree_verify(struct btree *b, struct bset *new)
 	mutex_unlock(&b->c->verify_lock);
 }
 
-static void data_verify_endio(struct bio *bio, int error)
-{
-	struct closure *cl = bio->bi_private;
-	closure_put(cl);
-}
-
-void bch_data_verify(struct search *s)
+void bch_data_verify(struct cached_dev *dc, struct bio *bio)
 {
 	char name[BDEVNAME_SIZE];
-	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
-	struct closure *cl = &s->cl;
 	struct bio *check;
 	struct bio_vec *bv;
 	int i;
 
-	if (!s->unaligned_bvec)
-		bio_for_each_segment(bv, s->orig_bio, i)
-			bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
-
-	check = bio_clone(s->orig_bio, GFP_NOIO);
+	check = bio_clone(bio, GFP_NOIO);
 	if (!check)
 		return;
 
 	if (bio_alloc_pages(check, GFP_NOIO))
 		goto out_put;
 
-	check->bi_rw		= READ_SYNC;
-	check->bi_private	= cl;
-	check->bi_end_io	= data_verify_endio;
+	submit_bio_wait(READ_SYNC, check);
 
-	closure_bio_submit(check, cl, &dc->disk);
-	closure_sync(cl);
-
-	bio_for_each_segment(bv, s->orig_bio, i) {
-		void *p1 = kmap(bv->bv_page);
-		void *p2 = kmap(check->bi_io_vec[i].bv_page);
+	bio_for_each_segment(bv, bio, i) {
+		void *p1 = kmap_atomic(bv->bv_page);
+		void *p2 = page_address(check->bi_io_vec[i].bv_page);
 
 		if (memcmp(p1 + bv->bv_offset,
 			   p2 + bv->bv_offset,
@@ -219,13 +201,11 @@ void bch_data_verify(struct search *s)
 			printk(KERN_ERR
 			       "bcache (%s): verify failed at sector %llu\n",
 			       bdevname(dc->bdev, name),
-			       (uint64_t) s->orig_bio->bi_sector);
-
-		kunmap(bv->bv_page);
-		kunmap(check->bi_io_vec[i].bv_page);
+			       (uint64_t) bio->bi_sector);
+		kunmap_atomic(p1);
 	}
 
-	__bio_for_each_segment(bv, check, i, 0)
+	bio_for_each_segment_all(bv, check, i)
 		__free_page(bv->bv_page);
 out_put:
 	bio_put(check);
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h
index 1c39b5a2489b..0f4b3440512c 100644
--- a/drivers/md/bcache/debug.h
+++ b/drivers/md/bcache/debug.h
@@ -29,12 +29,12 @@ void bch_check_keys(struct btree *, const char *, ...);
 #ifdef CONFIG_BCACHE_DEBUG
 
 void bch_btree_verify(struct btree *, struct bset *);
-void bch_data_verify(struct search *);
+void bch_data_verify(struct cached_dev *, struct bio *);
 
 #else /* DEBUG */
 
 static inline void bch_btree_verify(struct btree *b, struct bset *i) {}
-static inline void bch_data_verify(struct search *s) {};
+static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {};
 
 #endif
 
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 601c96a62b30..7c1275e66025 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -12,8 +12,9 @@
 #include <trace/events/bcache.h>
 
 struct moving_io {
+	struct closure		cl;
 	struct keybuf_key	*w;
-	struct search		s;
+	struct data_insert_op	op;
 	struct bbio		bio;
 };
 
@@ -38,13 +39,13 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k)
 
 static void moving_io_destructor(struct closure *cl)
 {
-	struct moving_io *io = container_of(cl, struct moving_io, s.cl);
+	struct moving_io *io = container_of(cl, struct moving_io, cl);
 	kfree(io);
 }
 
 static void write_moving_finish(struct closure *cl)
 {
-	struct moving_io *io = container_of(cl, struct moving_io, s.cl);
+	struct moving_io *io = container_of(cl, struct moving_io, cl);
 	struct bio *bio = &io->bio.bio;
 	struct bio_vec *bv;
 	int i;
@@ -52,12 +53,12 @@ static void write_moving_finish(struct closure *cl)
 	bio_for_each_segment_all(bv, bio, i)
 		__free_page(bv->bv_page);
 
-	if (io->s.insert_collision)
+	if (io->op.replace_collision)
 		trace_bcache_gc_copy_collision(&io->w->key);
 
-	bch_keybuf_del(&io->s.c->moving_gc_keys, io->w);
+	bch_keybuf_del(&io->op.c->moving_gc_keys, io->w);
 
-	up(&io->s.c->moving_in_flight);
+	up(&io->op.c->moving_in_flight);
 
 	closure_return_with_destructor(cl, moving_io_destructor);
 }
@@ -65,12 +66,12 @@ static void write_moving_finish(struct closure *cl)
 static void read_moving_endio(struct bio *bio, int error)
 {
 	struct moving_io *io = container_of(bio->bi_private,
-					    struct moving_io, s.cl);
+					    struct moving_io, cl);
 
 	if (error)
-		io->s.error = error;
+		io->op.error = error;
 
-	bch_bbio_endio(io->s.c, bio, error, "reading data to move");
+	bch_bbio_endio(io->op.c, bio, error, "reading data to move");
 }
 
 static void moving_init(struct moving_io *io)
@@ -84,32 +85,30 @@ static void moving_init(struct moving_io *io)
 	bio->bi_size		= KEY_SIZE(&io->w->key) << 9;
 	bio->bi_max_vecs	= DIV_ROUND_UP(KEY_SIZE(&io->w->key),
 					       PAGE_SECTORS);
-	bio->bi_private		= &io->s.cl;
+	bio->bi_private		= &io->cl;
 	bio->bi_io_vec		= bio->bi_inline_vecs;
 	bch_bio_map(bio, NULL);
 }
 
 static void write_moving(struct closure *cl)
 {
-	struct search *s = container_of(cl, struct search, cl);
-	struct moving_io *io = container_of(s, struct moving_io, s);
+	struct moving_io *io = container_of(cl, struct moving_io, cl);
+	struct data_insert_op *op = &io->op;
 
-	if (!s->error) {
+	if (!op->error) {
 		moving_init(io);
 
-		io->bio.bio.bi_sector	= KEY_START(&io->w->key);
-		s->op.lock		= -1;
-		s->write_prio		= 1;
-		s->cache_bio		= &io->bio.bio;
+		io->bio.bio.bi_sector = KEY_START(&io->w->key);
+		op->write_prio		= 1;
+		op->bio			= &io->bio.bio;
 
-		s->writeback		= KEY_DIRTY(&io->w->key);
-		s->csum			= KEY_CSUM(&io->w->key);
+		op->writeback		= KEY_DIRTY(&io->w->key);
+		op->csum		= KEY_CSUM(&io->w->key);
 
-		bkey_copy(&s->replace_key, &io->w->key);
-		s->replace = true;
+		bkey_copy(&op->replace_key, &io->w->key);
+		op->replace		= true;
 
-		closure_init(&s->btree, cl);
-		bch_data_insert(&s->btree);
+		closure_call(&op->cl, bch_data_insert, NULL, cl);
 	}
 
 	continue_at(cl, write_moving_finish, system_wq);
@@ -117,11 +116,10 @@ static void write_moving(struct closure *cl)
 
 static void read_moving_submit(struct closure *cl)
 {
-	struct search *s = container_of(cl, struct search, cl);
-	struct moving_io *io = container_of(s, struct moving_io, s);
+	struct moving_io *io = container_of(cl, struct moving_io, cl);
 	struct bio *bio = &io->bio.bio;
 
-	bch_submit_bbio(bio, s->c, &io->w->key, 0);
+	bch_submit_bbio(bio, io->op.c, &io->w->key, 0);
 
 	continue_at(cl, write_moving, system_wq);
 }
@@ -151,8 +149,8 @@ static void read_moving(struct cache_set *c)
 
 		w->private	= io;
 		io->w		= w;
-		io->s.inode	= KEY_INODE(&w->key);
-		io->s.c		= c;
+		io->op.inode	= KEY_INODE(&w->key);
+		io->op.c	= c;
 
 		moving_init(io);
 		bio = &io->bio.bio;
@@ -166,7 +164,7 @@ static void read_moving(struct cache_set *c)
 		trace_bcache_gc_copy(&w->key);
 
 		down(&c->moving_in_flight);
-		closure_call(&io->s.cl, read_moving_submit, NULL, &cl);
+		closure_call(&io->cl, read_moving_submit, NULL, &cl);
 	}
 
 	if (0) {
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 6cee2ae1d87f..05c7c216f65e 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -215,9 +215,9 @@ static void bio_csum(struct bio *bio, struct bkey *k)
 
 static void bch_data_insert_keys(struct closure *cl)
 {
-	struct search *s = container_of(cl, struct search, btree);
+	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
 	atomic_t *journal_ref = NULL;
-	struct bkey *replace_key = s->replace ? &s->replace_key : NULL;
+	struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
 	int ret;
 
 	/*
@@ -232,27 +232,26 @@ static void bch_data_insert_keys(struct closure *cl)
 		closure_sync(&s->cl);
 #endif
 
-	if (s->write)
-		journal_ref = bch_journal(s->c, &s->insert_keys,
-					  s->flush_journal
-					  ? &s->cl : NULL);
+	if (!op->replace)
+		journal_ref = bch_journal(op->c, &op->insert_keys,
+					  op->flush_journal ? cl : NULL);
 
-	ret = bch_btree_insert(s->c, &s->insert_keys,
+	ret = bch_btree_insert(op->c, &op->insert_keys,
 			       journal_ref, replace_key);
 	if (ret == -ESRCH) {
-		s->insert_collision = true;
+		op->replace_collision = true;
 	} else if (ret) {
-		s->error		= -ENOMEM;
-		s->insert_data_done	= true;
+		op->error		= -ENOMEM;
+		op->insert_data_done	= true;
 	}
 
 	if (journal_ref)
 		atomic_dec_bug(journal_ref);
 
-	if (!s->insert_data_done)
+	if (!op->insert_data_done)
 		continue_at(cl, bch_data_insert_start, bcache_wq);
 
-	bch_keylist_free(&s->insert_keys);
+	bch_keylist_free(&op->insert_keys);
 	closure_return(cl);
 }
 
@@ -349,10 +348,10 @@ found:
  *
  * If s->writeback is true, will not fail.
  */
-static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
-			      struct search *s)
+static bool bch_alloc_sectors(struct data_insert_op *op,
+			      struct bkey *k, unsigned sectors)
 {
-	struct cache_set *c = s->c;
+	struct cache_set *c = op->c;
 	struct open_bucket *b;
 	BKEY_PADDED(key) alloc;
 	unsigned i;
@@ -367,15 +366,15 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
 	bkey_init(&alloc.key);
 	spin_lock(&c->data_bucket_lock);
 
-	while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) {
-		unsigned watermark = s->write_prio
+	while (!(b = pick_data_bucket(c, k, op->task, &alloc.key))) {
+		unsigned watermark = op->write_prio
 			? WATERMARK_MOVINGGC
 			: WATERMARK_NONE;
 
 		spin_unlock(&c->data_bucket_lock);
 
 		if (bch_bucket_alloc_set(c, watermark, &alloc.key,
-					 1, s->writeback))
+					 1, op->writeback))
 			return false;
 
 		spin_lock(&c->data_bucket_lock);
@@ -409,7 +408,7 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
 	 */
 	list_move_tail(&b->list, &c->data_buckets);
 	bkey_copy_key(&b->key, k);
-	b->last = s->task;
+	b->last = op->task;
 
 	b->sectors_free	-= sectors;
 
@@ -438,8 +437,8 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
 
 static void bch_data_invalidate(struct closure *cl)
 {
-	struct search *s = container_of(cl, struct search, btree);
-	struct bio *bio = s->cache_bio;
+	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+	struct bio *bio = op->bio;
 
 	pr_debug("invalidating %i sectors from %llu",
 		 bio_sectors(bio), (uint64_t) bio->bi_sector);
@@ -447,17 +446,17 @@ static void bch_data_invalidate(struct closure *cl)
 	while (bio_sectors(bio)) {
 		unsigned len = min(bio_sectors(bio), 1U << 14);
 
-		if (bch_keylist_realloc(&s->insert_keys, 0, s->c))
+		if (bch_keylist_realloc(&op->insert_keys, 0, op->c))
 			goto out;
 
 		bio->bi_sector	+= len;
 		bio->bi_size	-= len << 9;
 
-		bch_keylist_add(&s->insert_keys,
-				&KEY(s->inode, bio->bi_sector, len));
+		bch_keylist_add(&op->insert_keys,
+				&KEY(op->inode, bio->bi_sector, len));
 	}
 
-	s->insert_data_done = true;
+	op->insert_data_done = true;
 	bio_put(bio);
 out:
 	continue_at(cl, bch_data_insert_keys, bcache_wq);
@@ -465,7 +464,7 @@ out:
 
 static void bch_data_insert_error(struct closure *cl)
 {
-	struct search *s = container_of(cl, struct search, btree);
+	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
 
 	/*
 	 * Our data write just errored, which means we've got a bunch of keys to
@@ -476,9 +475,9 @@ static void bch_data_insert_error(struct closure *cl)
 	 * from the keys we'll accomplish just that.
 	 */
 
-	struct bkey *src = s->insert_keys.keys, *dst = s->insert_keys.keys;
+	struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys;
 
-	while (src != s->insert_keys.top) {
+	while (src != op->insert_keys.top) {
 		struct bkey *n = bkey_next(src);
 
 		SET_KEY_PTRS(src, 0);
@@ -488,7 +487,7 @@ static void bch_data_insert_error(struct closure *cl)
 		src = n;
 	}
 
-	s->insert_keys.top = dst;
+	op->insert_keys.top = dst;
 
 	bch_data_insert_keys(cl);
 }
@@ -496,32 +495,32 @@ static void bch_data_insert_error(struct closure *cl)
 static void bch_data_insert_endio(struct bio *bio, int error)
 {
 	struct closure *cl = bio->bi_private;
-	struct search *s = container_of(cl, struct search, btree);
+	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
 
 	if (error) {
 		/* TODO: We could try to recover from this. */
-		if (s->writeback)
-			s->error = error;
-		else if (s->write)
+		if (op->writeback)
+			op->error = error;
+		else if (!op->replace)
 			set_closure_fn(cl, bch_data_insert_error, bcache_wq);
 		else
 			set_closure_fn(cl, NULL, NULL);
 	}
 
-	bch_bbio_endio(s->c, bio, error, "writing data to cache");
+	bch_bbio_endio(op->c, bio, error, "writing data to cache");
 }
 
 static void bch_data_insert_start(struct closure *cl)
 {
-	struct search *s = container_of(cl, struct search, btree);
-	struct bio *bio = s->cache_bio, *n;
+	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+	struct bio *bio = op->bio, *n;
 
-	if (s->bypass)
+	if (op->bypass)
 		return bch_data_invalidate(cl);
 
-	if (atomic_sub_return(bio_sectors(bio), &s->c->sectors_to_gc) < 0) {
-		set_gc_sectors(s->c);
-		wake_up_gc(s->c);
+	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
+		set_gc_sectors(op->c);
+		wake_up_gc(op->c);
 	}
 
 	/*
@@ -533,21 +532,20 @@ static void bch_data_insert_start(struct closure *cl)
 	do {
 		unsigned i;
 		struct bkey *k;
-		struct bio_set *split = s->d
-			? s->d->bio_split : s->c->bio_split;
+		struct bio_set *split = op->c->bio_split;
 
 		/* 1 for the device pointer and 1 for the chksum */
-		if (bch_keylist_realloc(&s->insert_keys,
-					1 + (s->csum ? 1 : 0),
-					s->c))
+		if (bch_keylist_realloc(&op->insert_keys,
+					1 + (op->csum ? 1 : 0),
+					op->c))
 			continue_at(cl, bch_data_insert_keys, bcache_wq);
 
-		k = s->insert_keys.top;
+		k = op->insert_keys.top;
 		bkey_init(k);
-		SET_KEY_INODE(k, s->inode);
+		SET_KEY_INODE(k, op->inode);
 		SET_KEY_OFFSET(k, bio->bi_sector);
 
-		if (!bch_alloc_sectors(k, bio_sectors(bio), s))
+		if (!bch_alloc_sectors(op, k, bio_sectors(bio)))
 			goto err;
 
 		n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
@@ -555,30 +553,30 @@ static void bch_data_insert_start(struct closure *cl)
 		n->bi_end_io	= bch_data_insert_endio;
 		n->bi_private	= cl;
 
-		if (s->writeback) {
+		if (op->writeback) {
 			SET_KEY_DIRTY(k, true);
 
 			for (i = 0; i < KEY_PTRS(k); i++)
-				SET_GC_MARK(PTR_BUCKET(s->c, k, i),
+				SET_GC_MARK(PTR_BUCKET(op->c, k, i),
 					    GC_MARK_DIRTY);
 		}
 
-		SET_KEY_CSUM(k, s->csum);
+		SET_KEY_CSUM(k, op->csum);
 		if (KEY_CSUM(k))
 			bio_csum(n, k);
 
 		trace_bcache_cache_insert(k);
-		bch_keylist_push(&s->insert_keys);
+		bch_keylist_push(&op->insert_keys);
 
 		n->bi_rw |= REQ_WRITE;
-		bch_submit_bbio(n, s->c, k, 0);
+		bch_submit_bbio(n, op->c, k, 0);
 	} while (n != bio);
 
-	s->insert_data_done = true;
+	op->insert_data_done = true;
 	continue_at(cl, bch_data_insert_keys, bcache_wq);
 err:
 	/* bch_alloc_sectors() blocks if s->writeback = true */
-	BUG_ON(s->writeback);
+	BUG_ON(op->writeback);
 
 	/*
 	 * But if it's not a writeback write we'd rather just bail out if
@@ -586,24 +584,24 @@ err:
 	 * we might be starving btree writes for gc or something.
 	 */
 
-	if (s->write) {
+	if (!op->replace) {
 		/*
 		 * Writethrough write: We can't complete the write until we've
 		 * updated the index. But we don't want to delay the write while
 		 * we wait for buckets to be freed up, so just invalidate the
 		 * rest of the write.
 		 */
-		s->bypass = true;
+		op->bypass = true;
 		return bch_data_invalidate(cl);
 	} else {
 		/*
 		 * From a cache miss, we can just insert the keys for the data
 		 * we have written or bail out if we didn't do anything.
 		 */
-		s->insert_data_done = true;
+		op->insert_data_done = true;
 		bio_put(bio);
 
-		if (!bch_keylist_empty(&s->insert_keys))
+		if (!bch_keylist_empty(&op->insert_keys))
 			continue_at(cl, bch_data_insert_keys, bcache_wq);
 		else
 			closure_return(cl);
@@ -631,221 +629,16 @@ err:
  */
 void bch_data_insert(struct closure *cl)
 {
-	struct search *s = container_of(cl, struct search, btree);
+	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
 
-	bch_keylist_init(&s->insert_keys);
-	bio_get(s->cache_bio);
+	trace_bcache_write(op->bio, op->writeback, op->bypass);
+
+	bch_keylist_init(&op->insert_keys);
+	bio_get(op->bio);
 	bch_data_insert_start(cl);
 }
 
-/* Cache lookup */
-
-static void bch_cache_read_endio(struct bio *bio, int error)
-{
-	struct bbio *b = container_of(bio, struct bbio, bio);
-	struct closure *cl = bio->bi_private;
-	struct search *s = container_of(cl, struct search, cl);
-
-	/*
-	 * If the bucket was reused while our bio was in flight, we might have
-	 * read the wrong data. Set s->error but not error so it doesn't get
-	 * counted against the cache device, but we'll still reread the data
-	 * from the backing device.
-	 */
-
-	if (error)
-		s->error = error;
-	else if (ptr_stale(s->c, &b->key, 0)) {
-		atomic_long_inc(&s->c->cache_read_races);
-		s->error = -EINTR;
-	}
-
-	bch_bbio_endio(s->c, bio, error, "reading from cache");
-}
-
-/*
- * Read from a single key, handling the initial cache miss if the key starts in
- * the middle of the bio
- */
-static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
-{
-	struct search *s = container_of(op, struct search, op);
-	struct bio *n, *bio = &s->bio.bio;
-	struct bkey *bio_key;
-	unsigned ptr;
-
-	if (bkey_cmp(k, &KEY(s->inode, bio->bi_sector, 0)) <= 0)
-		return MAP_CONTINUE;
-
-	if (KEY_INODE(k) != s->inode ||
-	    KEY_START(k) > bio->bi_sector) {
-		unsigned bio_sectors = bio_sectors(bio);
-		unsigned sectors = KEY_INODE(k) == s->inode
-			? min_t(uint64_t, INT_MAX,
-				KEY_START(k) - bio->bi_sector)
-			: INT_MAX;
-
-		int ret = s->d->cache_miss(b, s, bio, sectors);
-		if (ret != MAP_CONTINUE)
-			return ret;
-
-		/* if this was a complete miss we shouldn't get here */
-		BUG_ON(bio_sectors <= sectors);
-	}
-
-	if (!KEY_SIZE(k))
-		return MAP_CONTINUE;
-
-	/* XXX: figure out best pointer - for multiple cache devices */
-	ptr = 0;
-
-	PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
-
-	n = bch_bio_split(bio, min_t(uint64_t, INT_MAX,
-				     KEY_OFFSET(k) - bio->bi_sector),
-			  GFP_NOIO, s->d->bio_split);
-
-	bio_key = &container_of(n, struct bbio, bio)->key;
-	bch_bkey_copy_single_ptr(bio_key, k, ptr);
-
-	bch_cut_front(&KEY(s->inode, n->bi_sector, 0), bio_key);
-	bch_cut_back(&KEY(s->inode, bio_end_sector(n), 0), bio_key);
-
-	n->bi_end_io	= bch_cache_read_endio;
-	n->bi_private	= &s->cl;
-
-	/*
-	 * The bucket we're reading from might be reused while our bio
-	 * is in flight, and we could then end up reading the wrong
-	 * data.
-	 *
-	 * We guard against this by checking (in cache_read_endio()) if
-	 * the pointer is stale again; if so, we treat it as an error
-	 * and reread from the backing device (but we don't pass that
-	 * error up anywhere).
-	 */
-
-	__bch_submit_bbio(n, b->c);
-	return n == bio ? MAP_DONE : MAP_CONTINUE;
-}
-
-static void cache_lookup(struct closure *cl)
-{
-	struct search *s = container_of(cl, struct search, btree);
-	struct bio *bio = &s->bio.bio;
-
-	int ret = bch_btree_map_keys(&s->op, s->c,
-				     &KEY(s->inode, bio->bi_sector, 0),
-				     cache_lookup_fn, MAP_END_KEY);
-	if (ret == -EAGAIN)
-		continue_at(cl, cache_lookup, bcache_wq);
-
-	closure_return(cl);
-}
-
-/* Common code for the make_request functions */
-
-static void request_endio(struct bio *bio, int error)
-{
-	struct closure *cl = bio->bi_private;
-
-	if (error) {
-		struct search *s = container_of(cl, struct search, cl);
-		s->error = error;
-		/* Only cache read errors are recoverable */
-		s->recoverable = false;
-	}
-
-	bio_put(bio);
-	closure_put(cl);
-}
-
-static void bio_complete(struct search *s)
-{
-	if (s->orig_bio) {
-		int cpu, rw = bio_data_dir(s->orig_bio);
-		unsigned long duration = jiffies - s->start_time;
-
-		cpu = part_stat_lock();
-		part_round_stats(cpu, &s->d->disk->part0);
-		part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration);
-		part_stat_unlock();
-
-		trace_bcache_request_end(s, s->orig_bio);
-		bio_endio(s->orig_bio, s->error);
-		s->orig_bio = NULL;
-	}
-}
-
-static void do_bio_hook(struct search *s)
-{
-	struct bio *bio = &s->bio.bio;
-	memcpy(bio, s->orig_bio, sizeof(struct bio));
-
-	bio->bi_end_io		= request_endio;
-	bio->bi_private		= &s->cl;
-	atomic_set(&bio->bi_cnt, 3);
-}
-
-static void search_free(struct closure *cl)
-{
-	struct search *s = container_of(cl, struct search, cl);
-	bio_complete(s);
-
-	if (s->cache_bio)
-		bio_put(s->cache_bio);
-
-	if (s->unaligned_bvec)
-		mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec);
-
-	closure_debug_destroy(cl);
-	mempool_free(s, s->d->c->search);
-}
-
-static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
-{
-	struct search *s;
-	struct bio_vec *bv;
-
-	s = mempool_alloc(d->c->search, GFP_NOIO);
-	memset(s, 0, offsetof(struct search, insert_keys));
-
-	__closure_init(&s->cl, NULL);
-
-	s->inode		= d->id;
-	s->c			= d->c;
-	s->d			= d;
-	s->op.lock		= -1;
-	s->task			= current;
-	s->orig_bio		= bio;
-	s->write		= (bio->bi_rw & REQ_WRITE) != 0;
-	s->flush_journal	= (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
-	s->recoverable		= 1;
-	s->start_time		= jiffies;
-	do_bio_hook(s);
-
-	if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) {
-		bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO);
-		memcpy(bv, bio_iovec(bio),
-		       sizeof(struct bio_vec) * bio_segments(bio));
-
-		s->bio.bio.bi_io_vec	= bv;
-		s->unaligned_bvec	= 1;
-	}
-
-	return s;
-}
-
-/* Cached devices */
-
-static void cached_dev_bio_complete(struct closure *cl)
-{
-	struct search *s = container_of(cl, struct search, cl);
-	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
-
-	search_free(cl);
-	cached_dev_put(dc);
-}
+/* Congested? */
 
 unsigned bch_get_congested(struct cache_set *c)
 {
@@ -888,12 +681,12 @@ static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
 	return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
 }
 
-static bool check_should_bypass(struct cached_dev *dc, struct search *s)
+static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 {
-	struct cache_set *c = s->c;
-	struct bio *bio = &s->bio.bio;
+	struct cache_set *c = dc->disk.c;
 	unsigned mode = cache_mode(dc, bio);
 	unsigned sectors, congested = bch_get_congested(c);
+	struct task_struct *task = current;
 
 	if (atomic_read(&dc->disk.detaching) ||
 	    c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
@@ -932,7 +725,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct search *s)
 
 		i = list_first_entry(&dc->io_lru, struct io, lru);
 
-		add_sequential(s->task);
+		add_sequential(task);
 		i->sequential = 0;
 found:
 		if (i->sequential + bio->bi_size > i->sequential)
@@ -940,7 +733,7 @@ found:
 
 		i->last			 = bio_end_sector(bio);
 		i->jiffies		 = jiffies + msecs_to_jiffies(5000);
-		s->task->sequential_io	 = i->sequential;
+		task->sequential_io	 = i->sequential;
 
 		hlist_del(&i->hash);
 		hlist_add_head(&i->hash, iohash(dc, i->last));
@@ -948,22 +741,22 @@ found:
 
 		spin_unlock(&dc->io_lock);
 	} else {
-		s->task->sequential_io = bio->bi_size;
+		task->sequential_io = bio->bi_size;
 
-		add_sequential(s->task);
+		add_sequential(task);
 	}
 
-	sectors = max(s->task->sequential_io,
-		      s->task->sequential_io_avg) >> 9;
+	sectors = max(task->sequential_io,
+		      task->sequential_io_avg) >> 9;
 
 	if (dc->sequential_cutoff &&
 	    sectors >= dc->sequential_cutoff >> 9) {
-		trace_bcache_bypass_sequential(s->orig_bio);
+		trace_bcache_bypass_sequential(bio);
 		goto skip;
 	}
 
 	if (congested && sectors >= congested) {
-		trace_bcache_bypass_congested(s->orig_bio);
+		trace_bcache_bypass_congested(bio);
 		goto skip;
 	}
 
@@ -971,24 +764,255 @@ rescale:
 	bch_rescale_priorities(c, bio_sectors(bio));
 	return false;
 skip:
-	bch_mark_sectors_bypassed(s, bio_sectors(bio));
+	bch_mark_sectors_bypassed(c, dc, bio_sectors(bio));
 	return true;
 }
 
+/* Cache lookup */
+
+struct search {
+	/* Stack frame for bio_complete */
+	struct closure		cl;
+
+	struct bcache_device	*d;
+
+	struct bbio		bio;
+	struct bio		*orig_bio;
+	struct bio		*cache_miss;
+
+	unsigned		insert_bio_sectors;
+
+	unsigned		recoverable:1;
+	unsigned		unaligned_bvec:1;
+	unsigned		write:1;
+
+	unsigned long		start_time;
+
+	struct btree_op		op;
+	struct data_insert_op	iop;
+};
+
+static void bch_cache_read_endio(struct bio *bio, int error)
+{
+	struct bbio *b = container_of(bio, struct bbio, bio);
+	struct closure *cl = bio->bi_private;
+	struct search *s = container_of(cl, struct search, cl);
+
+	/*
+	 * If the bucket was reused while our bio was in flight, we might have
+	 * read the wrong data. Set s->error but not error so it doesn't get
+	 * counted against the cache device, but we'll still reread the data
+	 * from the backing device.
+	 */
+
+	if (error)
+		s->iop.error = error;
+	else if (ptr_stale(s->iop.c, &b->key, 0)) {
+		atomic_long_inc(&s->iop.c->cache_read_races);
+		s->iop.error = -EINTR;
+	}
+
+	bch_bbio_endio(s->iop.c, bio, error, "reading from cache");
+}
+
+/*
+ * Read from a single key, handling the initial cache miss if the key starts in
+ * the middle of the bio
+ */
+static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
+{
+	struct search *s = container_of(op, struct search, op);
+	struct bio *n, *bio = &s->bio.bio;
+	struct bkey *bio_key;
+	unsigned ptr;
+
+	if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_sector, 0)) <= 0)
+		return MAP_CONTINUE;
+
+	if (KEY_INODE(k) != s->iop.inode ||
+	    KEY_START(k) > bio->bi_sector) {
+		unsigned bio_sectors = bio_sectors(bio);
+		unsigned sectors = KEY_INODE(k) == s->iop.inode
+			? min_t(uint64_t, INT_MAX,
+				KEY_START(k) - bio->bi_sector)
+			: INT_MAX;
+
+		int ret = s->d->cache_miss(b, s, bio, sectors);
+		if (ret != MAP_CONTINUE)
+			return ret;
+
+		/* if this was a complete miss we shouldn't get here */
+		BUG_ON(bio_sectors <= sectors);
+	}
+
+	if (!KEY_SIZE(k))
+		return MAP_CONTINUE;
+
+	/* XXX: figure out best pointer - for multiple cache devices */
+	ptr = 0;
+
+	PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
+
+	n = bch_bio_split(bio, min_t(uint64_t, INT_MAX,
+				     KEY_OFFSET(k) - bio->bi_sector),
+			  GFP_NOIO, s->d->bio_split);
+
+	bio_key = &container_of(n, struct bbio, bio)->key;
+	bch_bkey_copy_single_ptr(bio_key, k, ptr);
+
+	bch_cut_front(&KEY(s->iop.inode, n->bi_sector, 0), bio_key);
+	bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key);
+
+	n->bi_end_io	= bch_cache_read_endio;
+	n->bi_private	= &s->cl;
+
+	/*
+	 * The bucket we're reading from might be reused while our bio
+	 * is in flight, and we could then end up reading the wrong
+	 * data.
+	 *
+	 * We guard against this by checking (in cache_read_endio()) if
+	 * the pointer is stale again; if so, we treat it as an error
+	 * and reread from the backing device (but we don't pass that
+	 * error up anywhere).
+	 */
+
+	__bch_submit_bbio(n, b->c);
+	return n == bio ? MAP_DONE : MAP_CONTINUE;
+}
+
+static void cache_lookup(struct closure *cl)
+{
+	struct search *s = container_of(cl, struct search, iop.cl);
+	struct bio *bio = &s->bio.bio;
+
+	int ret = bch_btree_map_keys(&s->op, s->iop.c,
+				     &KEY(s->iop.inode, bio->bi_sector, 0),
+				     cache_lookup_fn, MAP_END_KEY);
+	if (ret == -EAGAIN)
+		continue_at(cl, cache_lookup, bcache_wq);
+
+	closure_return(cl);
+}
+
+/* Common code for the make_request functions */
+
+static void request_endio(struct bio *bio, int error)
+{
+	struct closure *cl = bio->bi_private;
+
+	if (error) {
+		struct search *s = container_of(cl, struct search, cl);
+		s->iop.error = error;
+		/* Only cache read errors are recoverable */
+		s->recoverable = false;
+	}
+
+	bio_put(bio);
+	closure_put(cl);
+}
+
+static void bio_complete(struct search *s)
+{
+	if (s->orig_bio) {
+		int cpu, rw = bio_data_dir(s->orig_bio);
+		unsigned long duration = jiffies - s->start_time;
+
+		cpu = part_stat_lock();
+		part_round_stats(cpu, &s->d->disk->part0);
+		part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration);
+		part_stat_unlock();
+
+		trace_bcache_request_end(s->d, s->orig_bio);
+		bio_endio(s->orig_bio, s->iop.error);
+		s->orig_bio = NULL;
+	}
+}
+
+static void do_bio_hook(struct search *s)
+{
+	struct bio *bio = &s->bio.bio;
+	memcpy(bio, s->orig_bio, sizeof(struct bio));
+
+	bio->bi_end_io		= request_endio;
+	bio->bi_private		= &s->cl;
+	atomic_set(&bio->bi_cnt, 3);
+}
+
+static void search_free(struct closure *cl)
+{
+	struct search *s = container_of(cl, struct search, cl);
+	bio_complete(s);
+
+	if (s->iop.bio)
+		bio_put(s->iop.bio);
+
+	if (s->unaligned_bvec)
+		mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec);
+
+	closure_debug_destroy(cl);
+	mempool_free(s, s->d->c->search);
+}
+
+static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
+{
+	struct search *s;
+	struct bio_vec *bv;
+
+	s = mempool_alloc(d->c->search, GFP_NOIO);
+	memset(s, 0, offsetof(struct search, iop.insert_keys));
+
+	__closure_init(&s->cl, NULL);
+
+	s->iop.inode		= d->id;
+	s->iop.c		= d->c;
+	s->d			= d;
+	s->op.lock		= -1;
+	s->iop.task		= current;
+	s->orig_bio		= bio;
+	s->write		= (bio->bi_rw & REQ_WRITE) != 0;
+	s->iop.flush_journal	= (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
+	s->recoverable		= 1;
+	s->start_time		= jiffies;
+	do_bio_hook(s);
+
+	if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) {
+		bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO);
+		memcpy(bv, bio_iovec(bio),
+		       sizeof(struct bio_vec) * bio_segments(bio));
+
+		s->bio.bio.bi_io_vec	= bv;
+		s->unaligned_bvec	= 1;
+	}
+
+	return s;
+}
+
+/* Cached devices */
+
+static void cached_dev_bio_complete(struct closure *cl)
+{
+	struct search *s = container_of(cl, struct search, cl);
+	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+	search_free(cl);
+	cached_dev_put(dc);
+}
+
 /* Process reads */
 
 static void cached_dev_cache_miss_done(struct closure *cl)
 {
 	struct search *s = container_of(cl, struct search, cl);
 
-	if (s->insert_collision)
-		bch_mark_cache_miss_collision(s);
+	if (s->iop.replace_collision)
+		bch_mark_cache_miss_collision(s->iop.c, s->d);
 
-	if (s->cache_bio) {
+	if (s->iop.bio) {
 		int i;
 		struct bio_vec *bv;
 
-		bio_for_each_segment_all(bv, s->cache_bio, i)
+		bio_for_each_segment_all(bv, s->iop.bio, i)
 			__free_page(bv->bv_page);
 	}
 
@@ -1006,7 +1030,7 @@ static void cached_dev_read_error(struct closure *cl)
 		/* Retry from the backing device: */
 		trace_bcache_read_retry(s->orig_bio);
 
-		s->error = 0;
+		s->iop.error = 0;
 		bv = s->bio.bio.bi_io_vec;
 		do_bio_hook(s);
 		s->bio.bio.bi_io_vec = bv;
@@ -1041,29 +1065,28 @@ static void cached_dev_read_done(struct closure *cl)
 	 * to the buffers the original bio pointed to:
 	 */
 
-	if (s->cache_bio) {
-		bio_reset(s->cache_bio);
-		s->cache_bio->bi_sector =
-			s->cache_miss->bi_sector;
-		s->cache_bio->bi_bdev = s->cache_miss->bi_bdev;
-		s->cache_bio->bi_size = s->cache_bio_sectors << 9;
-		bch_bio_map(s->cache_bio, NULL);
+	if (s->iop.bio) {
+		bio_reset(s->iop.bio);
+		s->iop.bio->bi_sector = s->cache_miss->bi_sector;
+		s->iop.bio->bi_bdev = s->cache_miss->bi_bdev;
+		s->iop.bio->bi_size = s->insert_bio_sectors << 9;
+		bch_bio_map(s->iop.bio, NULL);
 
-		bio_copy_data(s->cache_miss, s->cache_bio);
+		bio_copy_data(s->cache_miss, s->iop.bio);
 
 		bio_put(s->cache_miss);
 		s->cache_miss = NULL;
 	}
 
-	if (verify(dc, &s->bio.bio) && s->recoverable)
-		bch_data_verify(s);
+	if (verify(dc, &s->bio.bio) && s->recoverable && !s->unaligned_bvec)
+		bch_data_verify(dc, s->orig_bio);
 
 	bio_complete(s);
 
-	if (s->cache_bio &&
-	    !test_bit(CACHE_SET_STOPPING, &s->c->flags)) {
-		BUG_ON(!s->replace);
-		closure_call(&s->btree, bch_data_insert, NULL, cl);
+	if (s->iop.bio &&
+	    !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) {
+		BUG_ON(!s->iop.replace);
+		closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
 	}
 
 	continue_at(cl, cached_dev_cache_miss_done, NULL);
@@ -1074,12 +1097,13 @@ static void cached_dev_read_done_bh(struct closure *cl)
 	struct search *s = container_of(cl, struct search, cl);
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 
-	bch_mark_cache_accounting(s, !s->cache_miss, s->bypass);
-	trace_bcache_read(s->orig_bio, !s->cache_miss, s->bypass);
+	bch_mark_cache_accounting(s->iop.c, s->d,
+				  !s->cache_miss, s->iop.bypass);
+	trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass);
 
-	if (s->error)
+	if (s->iop.error)
 		continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq);
-	else if (s->cache_bio || verify(dc, &s->bio.bio))
+	else if (s->iop.bio || verify(dc, &s->bio.bio))
 		continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq);
 	else
 		continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
@@ -1093,7 +1117,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 	struct bio *miss, *cache_bio;
 
-	if (s->cache_miss || s->bypass) {
+	if (s->cache_miss || s->iop.bypass) {
 		miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
 		ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
 		goto out_submit;
@@ -1101,20 +1125,21 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 
 	if (!(bio->bi_rw & REQ_RAHEAD) &&
 	    !(bio->bi_rw & REQ_META) &&
-	    s->c->gc_stats.in_use < CUTOFF_CACHE_READA)
+	    s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA)
 		reada = min_t(sector_t, dc->readahead >> 9,
 			      bdev_sectors(bio->bi_bdev) - bio_end_sector(bio));
 
-	s->cache_bio_sectors = min(sectors, bio_sectors(bio) + reada);
+	s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada);
 
-	s->replace_key = KEY(s->inode, bio->bi_sector +
-			     s->cache_bio_sectors, s->cache_bio_sectors);
+	s->iop.replace_key = KEY(s->iop.inode,
+				 bio->bi_sector + s->insert_bio_sectors,
+				 s->insert_bio_sectors);
 
-	ret = bch_btree_insert_check_key(b, &s->op, &s->replace_key);
+	ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key);
 	if (ret)
 		return ret;
 
-	s->replace = true;
+	s->iop.replace = true;
 
 	miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
 
@@ -1122,14 +1147,14 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 	ret = miss == bio ? MAP_DONE : -EINTR;
 
 	cache_bio = bio_alloc_bioset(GFP_NOWAIT,
-			DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS),
+			DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS),
 			dc->disk.bio_split);
 	if (!cache_bio)
 		goto out_submit;
 
 	cache_bio->bi_sector	= miss->bi_sector;
 	cache_bio->bi_bdev	= miss->bi_bdev;
-	cache_bio->bi_size	= s->cache_bio_sectors << 9;
+	cache_bio->bi_size	= s->insert_bio_sectors << 9;
 
 	cache_bio->bi_end_io	= request_endio;
 	cache_bio->bi_private	= &s->cl;
@@ -1138,8 +1163,11 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 	if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
 		goto out_put;
 
+	if (reada)
+		bch_mark_cache_readahead(s->iop.c, s->d);
+
 	s->cache_miss	= miss;
-	s->cache_bio = cache_bio;
+	s->iop.bio	= cache_bio;
 	bio_get(cache_bio);
 	closure_bio_submit(cache_bio, &s->cl, s->d);
 
@@ -1157,7 +1185,7 @@ static void cached_dev_read(struct cached_dev *dc, struct search *s)
 {
 	struct closure *cl = &s->cl;
 
-	closure_call(&s->btree, cache_lookup, NULL, cl);
+	closure_call(&s->iop.cl, cache_lookup, NULL, cl);
 	continue_at(cl, cached_dev_read_done_bh, NULL);
 }
 
@@ -1179,7 +1207,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
 	struct bkey start = KEY(dc->disk.id, bio->bi_sector, 0);
 	struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0);
 
-	bch_keybuf_check_overlapping(&s->c->moving_gc_keys, &start, &end);
+	bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end);
 
 	down_read_non_owner(&dc->writeback_lock);
 	if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) {
@@ -1187,8 +1215,8 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
 		 * We overlap with some dirty data undergoing background
 		 * writeback, force this write to writeback
 		 */
-		s->bypass	= false;
-		s->writeback	= true;
+		s->iop.bypass = false;
+		s->iop.writeback = true;
 	}
 
 	/*
@@ -1199,27 +1227,25 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
 	 * so we still want to call it.
 	 */
 	if (bio->bi_rw & REQ_DISCARD)
-		s->bypass = true;
+		s->iop.bypass = true;
 
 	if (should_writeback(dc, s->orig_bio,
 			     cache_mode(dc, bio),
-			     s->bypass)) {
-		s->bypass = false;
-		s->writeback = true;
+			     s->iop.bypass)) {
+		s->iop.bypass = false;
+		s->iop.writeback = true;
 	}
 
-	trace_bcache_write(s->orig_bio, s->writeback, s->bypass);
-
-	if (s->bypass) {
-		s->cache_bio = s->orig_bio;
-		bio_get(s->cache_bio);
+	if (s->iop.bypass) {
+		s->iop.bio = s->orig_bio;
+		bio_get(s->iop.bio);
 
 		if (!(bio->bi_rw & REQ_DISCARD) ||
 		    blk_queue_discard(bdev_get_queue(dc->bdev)))
 			closure_bio_submit(bio, cl, s->d);
-	} else if (s->writeback) {
+	} else if (s->iop.writeback) {
 		bch_writeback_add(dc);
-		s->cache_bio = bio;
+		s->iop.bio = bio;
 
 		if (bio->bi_rw & REQ_FLUSH) {
 			/* Also need to send a flush to the backing device */
@@ -1234,13 +1260,13 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
 			closure_bio_submit(flush, cl, s->d);
 		}
 	} else {
-		s->cache_bio = bio_clone_bioset(bio, GFP_NOIO,
-						dc->disk.bio_split);
+		s->iop.bio = bio_clone_bioset(bio, GFP_NOIO,
+					      dc->disk.bio_split);
 
 		closure_bio_submit(bio, cl, s->d);
 	}
 
-	closure_call(&s->btree, bch_data_insert, NULL, cl);
+	closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
 	continue_at(cl, cached_dev_write_complete, NULL);
 }
 
@@ -1249,8 +1275,8 @@ static void cached_dev_nodata(struct closure *cl)
 	struct search *s = container_of(cl, struct search, cl);
 	struct bio *bio = &s->bio.bio;
 
-	if (s->flush_journal)
-		bch_journal_meta(s->c, cl);
+	if (s->iop.flush_journal)
+		bch_journal_meta(s->iop.c, cl);
 
 	/* If it's a flush, we send the flush to the backing device too */
 	closure_bio_submit(bio, cl, s->d);
@@ -1277,7 +1303,7 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
 
 	if (cached_dev_get(dc)) {
 		s = search_alloc(bio, d);
-		trace_bcache_request_start(s, bio);
+		trace_bcache_request_start(s->d, bio);
 
 		if (!bio->bi_size) {
 			/*
@@ -1288,7 +1314,7 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
 					      cached_dev_nodata,
 					      bcache_wq);
 		} else {
-			s->bypass = check_should_bypass(dc, s);
+			s->iop.bypass = check_should_bypass(dc, bio);
 
 			if (rw)
 				cached_dev_write(dc, s);
@@ -1378,8 +1404,8 @@ static void flash_dev_nodata(struct closure *cl)
 {
 	struct search *s = container_of(cl, struct search, cl);
 
-	if (s->flush_journal)
-		bch_journal_meta(s->c, cl);
+	if (s->iop.flush_journal)
+		bch_journal_meta(s->iop.c, cl);
 
 	continue_at(cl, search_free, NULL);
 }
@@ -1400,7 +1426,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
 	cl = &s->cl;
 	bio = &s->bio.bio;
 
-	trace_bcache_request_start(s, bio);
+	trace_bcache_request_start(s->d, bio);
 
 	if (!bio->bi_size) {
 		/*
@@ -1411,17 +1437,17 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
 				      flash_dev_nodata,
 				      bcache_wq);
 	} else if (rw) {
-		bch_keybuf_check_overlapping(&s->c->moving_gc_keys,
+		bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
 					&KEY(d->id, bio->bi_sector, 0),
 					&KEY(d->id, bio_end_sector(bio), 0));
 
-		s->bypass	= (bio->bi_rw & REQ_DISCARD) != 0;
-		s->writeback	= true;
-		s->cache_bio	= bio;
+		s->iop.bypass		= (bio->bi_rw & REQ_DISCARD) != 0;
+		s->iop.writeback	= true;
+		s->iop.bio		= bio;
 
-		closure_call(&s->btree, bch_data_insert, NULL, cl);
+		closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
 	} else {
-		closure_call(&s->btree, cache_lookup, NULL, cl);
+		closure_call(&s->iop.cl, cache_lookup, NULL, cl);
 	}
 
 	continue_at(cl, search_free, NULL);
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index f0e930b4ca89..54d7de27356f 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -3,46 +3,25 @@
 
 #include <linux/cgroup.h>
 
-struct search {
-	/* Stack frame for bio_complete */
+struct data_insert_op {
 	struct closure		cl;
-	struct closure		btree;
-
-	struct bcache_device	*d;
 	struct cache_set	*c;
 	struct task_struct	*task;
-
-	struct bbio		bio;
-	struct bio		*orig_bio;
-	struct bio		*cache_miss;
-
-	/* Bio to be inserted into the cache */
-	struct bio		*cache_bio;
-	unsigned		cache_bio_sectors;
+	struct bio		*bio;
 
 	unsigned		inode;
+	uint16_t		write_prio;
+	short			error;
 
-	unsigned		recoverable:1;
-	unsigned		unaligned_bvec:1;
-
-	unsigned		write:1;
-	unsigned		writeback:1;
-
-	unsigned		csum:1;
 	unsigned		bypass:1;
+	unsigned		writeback:1;
 	unsigned		flush_journal:1;
+	unsigned		csum:1;
+
+	unsigned		replace:1;
+	unsigned		replace_collision:1;
 
 	unsigned		insert_data_done:1;
-	unsigned		replace:1;
-	unsigned		insert_collision:1;
-
-	uint16_t		write_prio;
-
-	/* IO error returned to s->bio */
-	short			error;
-	unsigned long		start_time;
-
-	struct btree_op		op;
 
 	/* Anything past this point won't get zeroed in search_alloc() */
 	struct keylist		insert_keys;
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
index ea77263cf7ef..84d0782f702e 100644
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@@ -7,7 +7,6 @@
 #include "bcache.h"
 #include "stats.h"
 #include "btree.h"
-#include "request.h"
 #include "sysfs.h"
 
 /*
@@ -196,35 +195,36 @@ static void mark_cache_stats(struct cache_stat_collector *stats,
 			atomic_inc(&stats->cache_bypass_misses);
 }
 
-void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass)
+void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d,
+			       bool hit, bool bypass)
 {
-	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
 	mark_cache_stats(&dc->accounting.collector, hit, bypass);
-	mark_cache_stats(&s->c->accounting.collector, hit, bypass);
+	mark_cache_stats(&c->accounting.collector, hit, bypass);
 #ifdef CONFIG_CGROUP_BCACHE
 	mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass);
 #endif
 }
 
-void bch_mark_cache_readahead(struct search *s)
+void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d)
 {
-	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
 	atomic_inc(&dc->accounting.collector.cache_readaheads);
-	atomic_inc(&s->c->accounting.collector.cache_readaheads);
+	atomic_inc(&c->accounting.collector.cache_readaheads);
 }
 
-void bch_mark_cache_miss_collision(struct search *s)
+void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d)
 {
-	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
 	atomic_inc(&dc->accounting.collector.cache_miss_collisions);
-	atomic_inc(&s->c->accounting.collector.cache_miss_collisions);
+	atomic_inc(&c->accounting.collector.cache_miss_collisions);
 }
 
-void bch_mark_sectors_bypassed(struct search *s, int sectors)
+void bch_mark_sectors_bypassed(struct cache_set *c, struct cached_dev *dc,
+			       int sectors)
 {
-	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 	atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
-	atomic_add(sectors, &s->c->accounting.collector.sectors_bypassed);
+	atomic_add(sectors, &c->accounting.collector.sectors_bypassed);
 }
 
 void bch_cache_accounting_init(struct cache_accounting *acc,
diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h
index c7c7a8fd29fe..adbff141c887 100644
--- a/drivers/md/bcache/stats.h
+++ b/drivers/md/bcache/stats.h
@@ -38,7 +38,9 @@ struct cache_accounting {
 	struct cache_stats day;
 };
 
-struct search;
+struct cache_set;
+struct cached_dev;
+struct bcache_device;
 
 void bch_cache_accounting_init(struct cache_accounting *acc,
 			       struct closure *parent);
@@ -50,9 +52,10 @@ void bch_cache_accounting_clear(struct cache_accounting *acc);
 
 void bch_cache_accounting_destroy(struct cache_accounting *acc);
 
-void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass);
-void bch_mark_cache_readahead(struct search *s);
-void bch_mark_cache_miss_collision(struct search *s);
-void bch_mark_sectors_bypassed(struct search *s, int sectors);
+void bch_mark_cache_accounting(struct cache_set *, struct bcache_device *,
+			       bool, bool);
+void bch_mark_cache_readahead(struct cache_set *, struct bcache_device *);
+void bch_mark_cache_miss_collision(struct cache_set *, struct bcache_device *);
+void bch_mark_sectors_bypassed(struct cache_set *, struct cached_dev *, int);
 
 #endif /* _BCACHE_STATS_H_ */
diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c
index f7b6c197f90f..adbc3df17a80 100644
--- a/drivers/md/bcache/trace.c
+++ b/drivers/md/bcache/trace.c
@@ -1,6 +1,5 @@
 #include "bcache.h"
 #include "btree.h"
-#include "request.h"
 
 #include <linux/blktrace_api.h>
 #include <linux/module.h>
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index 5ebda976ea93..32c89b33c391 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -6,11 +6,9 @@
 
 #include <linux/tracepoint.h>
 
-struct search;
-
 DECLARE_EVENT_CLASS(bcache_request,
-	TP_PROTO(struct search *s, struct bio *bio),
-	TP_ARGS(s, bio),
+	TP_PROTO(struct bcache_device *d, struct bio *bio),
+	TP_ARGS(d, bio),
 
 	TP_STRUCT__entry(
 		__field(dev_t,		dev			)
@@ -24,8 +22,8 @@ DECLARE_EVENT_CLASS(bcache_request,
 
 	TP_fast_assign(
 		__entry->dev		= bio->bi_bdev->bd_dev;
-		__entry->orig_major	= s->d->disk->major;
-		__entry->orig_minor	= s->d->disk->first_minor;
+		__entry->orig_major	= d->disk->major;
+		__entry->orig_minor	= d->disk->first_minor;
 		__entry->sector		= bio->bi_sector;
 		__entry->orig_sector	= bio->bi_sector - 16;
 		__entry->nr_sector	= bio->bi_size >> 9;
@@ -79,13 +77,13 @@ DECLARE_EVENT_CLASS(btree_node,
 /* request.c */
 
 DEFINE_EVENT(bcache_request, bcache_request_start,
-	TP_PROTO(struct search *s, struct bio *bio),
-	TP_ARGS(s, bio)
+	TP_PROTO(struct bcache_device *d, struct bio *bio),
+	TP_ARGS(d, bio)
 );
 
 DEFINE_EVENT(bcache_request, bcache_request_end,
-	TP_PROTO(struct search *s, struct bio *bio),
-	TP_ARGS(s, bio)
+	TP_PROTO(struct bcache_device *d, struct bio *bio),
+	TP_ARGS(d, bio)
 );
 
 DECLARE_EVENT_CLASS(bcache_bio,

From 2599b53b7b0ea6103d1661dca74d35480cb8fa1f Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 24 Jul 2013 18:11:11 -0700
Subject: [PATCH 74/94] bcache: Move sector allocator to alloc.c

Just reorganizing things a bit.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/alloc.c   | 180 ++++++++++++++++++++++++++++++++++
 drivers/md/bcache/bcache.h  |   4 +
 drivers/md/bcache/request.c | 186 +-----------------------------------
 drivers/md/bcache/request.h |   5 +-
 4 files changed, 189 insertions(+), 186 deletions(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index b9bd5866055d..4970ddc6a7f6 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -487,8 +487,188 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
 	return ret;
 }
 
+/* Sector allocator */
+
+struct open_bucket {
+	struct list_head	list;
+	unsigned		last_write_point;
+	unsigned		sectors_free;
+	BKEY_PADDED(key);
+};
+
+/*
+ * We keep multiple buckets open for writes, and try to segregate different
+ * write streams for better cache utilization: first we look for a bucket where
+ * the last write to it was sequential with the current write, and failing that
+ * we look for a bucket that was last used by the same task.
+ *
+ * The ideas is if you've got multiple tasks pulling data into the cache at the
+ * same time, you'll get better cache utilization if you try to segregate their
+ * data and preserve locality.
+ *
+ * For example, say you've starting Firefox at the same time you're copying a
+ * bunch of files. Firefox will likely end up being fairly hot and stay in the
+ * cache awhile, but the data you copied might not be; if you wrote all that
+ * data to the same buckets it'd get invalidated at the same time.
+ *
+ * Both of those tasks will be doing fairly random IO so we can't rely on
+ * detecting sequential IO to segregate their data, but going off of the task
+ * should be a sane heuristic.
+ */
+static struct open_bucket *pick_data_bucket(struct cache_set *c,
+					    const struct bkey *search,
+					    unsigned write_point,
+					    struct bkey *alloc)
+{
+	struct open_bucket *ret, *ret_task = NULL;
+
+	list_for_each_entry_reverse(ret, &c->data_buckets, list)
+		if (!bkey_cmp(&ret->key, search))
+			goto found;
+		else if (ret->last_write_point == write_point)
+			ret_task = ret;
+
+	ret = ret_task ?: list_first_entry(&c->data_buckets,
+					   struct open_bucket, list);
+found:
+	if (!ret->sectors_free && KEY_PTRS(alloc)) {
+		ret->sectors_free = c->sb.bucket_size;
+		bkey_copy(&ret->key, alloc);
+		bkey_init(alloc);
+	}
+
+	if (!ret->sectors_free)
+		ret = NULL;
+
+	return ret;
+}
+
+/*
+ * Allocates some space in the cache to write to, and k to point to the newly
+ * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the
+ * end of the newly allocated space).
+ *
+ * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many
+ * sectors were actually allocated.
+ *
+ * If s->writeback is true, will not fail.
+ */
+bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors,
+		       unsigned write_point, unsigned write_prio, bool wait)
+{
+	struct open_bucket *b;
+	BKEY_PADDED(key) alloc;
+	unsigned i;
+
+	/*
+	 * We might have to allocate a new bucket, which we can't do with a
+	 * spinlock held. So if we have to allocate, we drop the lock, allocate
+	 * and then retry. KEY_PTRS() indicates whether alloc points to
+	 * allocated bucket(s).
+	 */
+
+	bkey_init(&alloc.key);
+	spin_lock(&c->data_bucket_lock);
+
+	while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) {
+		unsigned watermark = write_prio
+			? WATERMARK_MOVINGGC
+			: WATERMARK_NONE;
+
+		spin_unlock(&c->data_bucket_lock);
+
+		if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait))
+			return false;
+
+		spin_lock(&c->data_bucket_lock);
+	}
+
+	/*
+	 * If we had to allocate, we might race and not need to allocate the
+	 * second time we call find_data_bucket(). If we allocated a bucket but
+	 * didn't use it, drop the refcount bch_bucket_alloc_set() took:
+	 */
+	if (KEY_PTRS(&alloc.key))
+		__bkey_put(c, &alloc.key);
+
+	for (i = 0; i < KEY_PTRS(&b->key); i++)
+		EBUG_ON(ptr_stale(c, &b->key, i));
+
+	/* Set up the pointer to the space we're allocating: */
+
+	for (i = 0; i < KEY_PTRS(&b->key); i++)
+		k->ptr[i] = b->key.ptr[i];
+
+	sectors = min(sectors, b->sectors_free);
+
+	SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors);
+	SET_KEY_SIZE(k, sectors);
+	SET_KEY_PTRS(k, KEY_PTRS(&b->key));
+
+	/*
+	 * Move b to the end of the lru, and keep track of what this bucket was
+	 * last used for:
+	 */
+	list_move_tail(&b->list, &c->data_buckets);
+	bkey_copy_key(&b->key, k);
+	b->last_write_point = write_point;
+
+	b->sectors_free	-= sectors;
+
+	for (i = 0; i < KEY_PTRS(&b->key); i++) {
+		SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors);
+
+		atomic_long_add(sectors,
+				&PTR_CACHE(c, &b->key, i)->sectors_written);
+	}
+
+	if (b->sectors_free < c->sb.block_size)
+		b->sectors_free = 0;
+
+	/*
+	 * k takes refcounts on the buckets it points to until it's inserted
+	 * into the btree, but if we're done with this bucket we just transfer
+	 * get_data_bucket()'s refcount.
+	 */
+	if (b->sectors_free)
+		for (i = 0; i < KEY_PTRS(&b->key); i++)
+			atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin);
+
+	spin_unlock(&c->data_bucket_lock);
+	return true;
+}
+
 /* Init */
 
+void bch_open_buckets_free(struct cache_set *c)
+{
+	struct open_bucket *b;
+
+	while (!list_empty(&c->data_buckets)) {
+		b = list_first_entry(&c->data_buckets,
+				     struct open_bucket, list);
+		list_del(&b->list);
+		kfree(b);
+	}
+}
+
+int bch_open_buckets_alloc(struct cache_set *c)
+{
+	int i;
+
+	spin_lock_init(&c->data_bucket_lock);
+
+	for (i = 0; i < 6; i++) {
+		struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
+		if (!b)
+			return -ENOMEM;
+
+		list_add(&b->list, &c->data_buckets);
+	}
+
+	return 0;
+}
+
 int bch_cache_allocator_start(struct cache *ca)
 {
 	struct task_struct *k = kthread_run(bch_allocator_thread,
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 20fe96c121d9..e32f6fd91755 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -1170,6 +1170,8 @@ int __bch_bucket_alloc_set(struct cache_set *, unsigned,
 			   struct bkey *, int, bool);
 int bch_bucket_alloc_set(struct cache_set *, unsigned,
 			 struct bkey *, int, bool);
+bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned,
+		       unsigned, unsigned, bool);
 
 __printf(2, 3)
 bool bch_cache_set_error(struct cache_set *, const char *, ...);
@@ -1210,6 +1212,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *);
 void bch_btree_cache_free(struct cache_set *);
 int bch_btree_cache_alloc(struct cache_set *);
 void bch_moving_init_cache_set(struct cache_set *);
+int bch_open_buckets_alloc(struct cache_set *);
+void bch_open_buckets_free(struct cache_set *);
 
 int bch_cache_allocator_start(struct cache *ca);
 int bch_cache_allocator_init(struct cache *ca);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 05c7c216f65e..cf7850a7592c 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -255,186 +255,6 @@ static void bch_data_insert_keys(struct closure *cl)
 	closure_return(cl);
 }
 
-struct open_bucket {
-	struct list_head	list;
-	struct task_struct	*last;
-	unsigned		sectors_free;
-	BKEY_PADDED(key);
-};
-
-void bch_open_buckets_free(struct cache_set *c)
-{
-	struct open_bucket *b;
-
-	while (!list_empty(&c->data_buckets)) {
-		b = list_first_entry(&c->data_buckets,
-				     struct open_bucket, list);
-		list_del(&b->list);
-		kfree(b);
-	}
-}
-
-int bch_open_buckets_alloc(struct cache_set *c)
-{
-	int i;
-
-	spin_lock_init(&c->data_bucket_lock);
-
-	for (i = 0; i < 6; i++) {
-		struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
-		if (!b)
-			return -ENOMEM;
-
-		list_add(&b->list, &c->data_buckets);
-	}
-
-	return 0;
-}
-
-/*
- * We keep multiple buckets open for writes, and try to segregate different
- * write streams for better cache utilization: first we look for a bucket where
- * the last write to it was sequential with the current write, and failing that
- * we look for a bucket that was last used by the same task.
- *
- * The ideas is if you've got multiple tasks pulling data into the cache at the
- * same time, you'll get better cache utilization if you try to segregate their
- * data and preserve locality.
- *
- * For example, say you've starting Firefox at the same time you're copying a
- * bunch of files. Firefox will likely end up being fairly hot and stay in the
- * cache awhile, but the data you copied might not be; if you wrote all that
- * data to the same buckets it'd get invalidated at the same time.
- *
- * Both of those tasks will be doing fairly random IO so we can't rely on
- * detecting sequential IO to segregate their data, but going off of the task
- * should be a sane heuristic.
- */
-static struct open_bucket *pick_data_bucket(struct cache_set *c,
-					    const struct bkey *search,
-					    struct task_struct *task,
-					    struct bkey *alloc)
-{
-	struct open_bucket *ret, *ret_task = NULL;
-
-	list_for_each_entry_reverse(ret, &c->data_buckets, list)
-		if (!bkey_cmp(&ret->key, search))
-			goto found;
-		else if (ret->last == task)
-			ret_task = ret;
-
-	ret = ret_task ?: list_first_entry(&c->data_buckets,
-					   struct open_bucket, list);
-found:
-	if (!ret->sectors_free && KEY_PTRS(alloc)) {
-		ret->sectors_free = c->sb.bucket_size;
-		bkey_copy(&ret->key, alloc);
-		bkey_init(alloc);
-	}
-
-	if (!ret->sectors_free)
-		ret = NULL;
-
-	return ret;
-}
-
-/*
- * Allocates some space in the cache to write to, and k to point to the newly
- * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the
- * end of the newly allocated space).
- *
- * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many
- * sectors were actually allocated.
- *
- * If s->writeback is true, will not fail.
- */
-static bool bch_alloc_sectors(struct data_insert_op *op,
-			      struct bkey *k, unsigned sectors)
-{
-	struct cache_set *c = op->c;
-	struct open_bucket *b;
-	BKEY_PADDED(key) alloc;
-	unsigned i;
-
-	/*
-	 * We might have to allocate a new bucket, which we can't do with a
-	 * spinlock held. So if we have to allocate, we drop the lock, allocate
-	 * and then retry. KEY_PTRS() indicates whether alloc points to
-	 * allocated bucket(s).
-	 */
-
-	bkey_init(&alloc.key);
-	spin_lock(&c->data_bucket_lock);
-
-	while (!(b = pick_data_bucket(c, k, op->task, &alloc.key))) {
-		unsigned watermark = op->write_prio
-			? WATERMARK_MOVINGGC
-			: WATERMARK_NONE;
-
-		spin_unlock(&c->data_bucket_lock);
-
-		if (bch_bucket_alloc_set(c, watermark, &alloc.key,
-					 1, op->writeback))
-			return false;
-
-		spin_lock(&c->data_bucket_lock);
-	}
-
-	/*
-	 * If we had to allocate, we might race and not need to allocate the
-	 * second time we call find_data_bucket(). If we allocated a bucket but
-	 * didn't use it, drop the refcount bch_bucket_alloc_set() took:
-	 */
-	if (KEY_PTRS(&alloc.key))
-		__bkey_put(c, &alloc.key);
-
-	for (i = 0; i < KEY_PTRS(&b->key); i++)
-		EBUG_ON(ptr_stale(c, &b->key, i));
-
-	/* Set up the pointer to the space we're allocating: */
-
-	for (i = 0; i < KEY_PTRS(&b->key); i++)
-		k->ptr[i] = b->key.ptr[i];
-
-	sectors = min(sectors, b->sectors_free);
-
-	SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors);
-	SET_KEY_SIZE(k, sectors);
-	SET_KEY_PTRS(k, KEY_PTRS(&b->key));
-
-	/*
-	 * Move b to the end of the lru, and keep track of what this bucket was
-	 * last used for:
-	 */
-	list_move_tail(&b->list, &c->data_buckets);
-	bkey_copy_key(&b->key, k);
-	b->last = op->task;
-
-	b->sectors_free	-= sectors;
-
-	for (i = 0; i < KEY_PTRS(&b->key); i++) {
-		SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors);
-
-		atomic_long_add(sectors,
-				&PTR_CACHE(c, &b->key, i)->sectors_written);
-	}
-
-	if (b->sectors_free < c->sb.block_size)
-		b->sectors_free = 0;
-
-	/*
-	 * k takes refcounts on the buckets it points to until it's inserted
-	 * into the btree, but if we're done with this bucket we just transfer
-	 * get_data_bucket()'s refcount.
-	 */
-	if (b->sectors_free)
-		for (i = 0; i < KEY_PTRS(&b->key); i++)
-			atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin);
-
-	spin_unlock(&c->data_bucket_lock);
-	return true;
-}
-
 static void bch_data_invalidate(struct closure *cl)
 {
 	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
@@ -545,7 +365,9 @@ static void bch_data_insert_start(struct closure *cl)
 		SET_KEY_INODE(k, op->inode);
 		SET_KEY_OFFSET(k, bio->bi_sector);
 
-		if (!bch_alloc_sectors(op, k, bio_sectors(bio)))
+		if (!bch_alloc_sectors(op->c, k, bio_sectors(bio),
+				       op->write_point, op->write_prio,
+				       op->writeback))
 			goto err;
 
 		n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
@@ -968,7 +790,7 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
 	s->iop.c		= d->c;
 	s->d			= d;
 	s->op.lock		= -1;
-	s->iop.task		= current;
+	s->iop.write_point	= hash_long((unsigned long) current, 16);
 	s->orig_bio		= bio;
 	s->write		= (bio->bi_rw & REQ_WRITE) != 0;
 	s->iop.flush_journal	= (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 54d7de27356f..2cd65bf073c2 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -6,10 +6,10 @@
 struct data_insert_op {
 	struct closure		cl;
 	struct cache_set	*c;
-	struct task_struct	*task;
 	struct bio		*bio;
 
 	unsigned		inode;
+	uint16_t		write_point;
 	uint16_t		write_prio;
 	short			error;
 
@@ -31,9 +31,6 @@ struct data_insert_op {
 unsigned bch_get_congested(struct cache_set *);
 void bch_data_insert(struct closure *cl);
 
-void bch_open_buckets_free(struct cache_set *);
-int bch_open_buckets_alloc(struct cache_set *);
-
 void bch_cached_dev_request_init(struct cached_dev *dc);
 void bch_flash_dev_request_init(struct bcache_device *d);
 

From 81ab4190ac17df41686a37c97f701623276b652a Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Thu, 31 Oct 2013 15:46:42 -0700
Subject: [PATCH 75/94] bcache: Pull on disk data structures out into a
 separate header

Now, the on disk data structures are in a header that can be exported to
userspace - and having them all centralized is nice too.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bcache.h  | 244 +----------------------
 drivers/md/bcache/bset.c    |   4 +-
 drivers/md/bcache/bset.h    |  31 ---
 drivers/md/bcache/btree.c   |   2 +-
 drivers/md/bcache/journal.c |   4 +-
 drivers/md/bcache/journal.h |  37 ----
 drivers/md/bcache/request.c |   9 +-
 drivers/md/bcache/super.c   |  13 +-
 drivers/md/bcache/util.h    |  10 -
 include/uapi/linux/bcache.h | 373 ++++++++++++++++++++++++++++++++++++
 10 files changed, 387 insertions(+), 340 deletions(-)
 create mode 100644 include/uapi/linux/bcache.h

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index e32f6fd91755..045cb99f1ca6 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -177,6 +177,7 @@
 
 #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
 
+#include <linux/bcache.h>
 #include <linux/bio.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
@@ -210,168 +211,6 @@ BITMASK(GC_MARK,	 struct bucket, gc_mark, 0, 2);
 #define GC_MARK_METADATA	2
 BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14);
 
-struct bkey {
-	uint64_t	high;
-	uint64_t	low;
-	uint64_t	ptr[];
-};
-
-/* Enough for a key with 6 pointers */
-#define BKEY_PAD		8
-
-#define BKEY_PADDED(key)					\
-	union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; }
-
-/* Version 0: Cache device
- * Version 1: Backing device
- * Version 2: Seed pointer into btree node checksum
- * Version 3: Cache device with new UUID format
- * Version 4: Backing device with data offset
- */
-#define BCACHE_SB_VERSION_CDEV			0
-#define BCACHE_SB_VERSION_BDEV			1
-#define BCACHE_SB_VERSION_CDEV_WITH_UUID	3
-#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET	4
-#define BCACHE_SB_MAX_VERSION			4
-
-#define SB_SECTOR		8
-#define SB_SIZE			4096
-#define SB_LABEL_SIZE		32
-#define SB_JOURNAL_BUCKETS	256U
-/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */
-#define MAX_CACHES_PER_SET	8
-
-#define BDEV_DATA_START_DEFAULT	16	/* sectors */
-
-struct cache_sb {
-	uint64_t		csum;
-	uint64_t		offset;	/* sector where this sb was written */
-	uint64_t		version;
-
-	uint8_t			magic[16];
-
-	uint8_t			uuid[16];
-	union {
-		uint8_t		set_uuid[16];
-		uint64_t	set_magic;
-	};
-	uint8_t			label[SB_LABEL_SIZE];
-
-	uint64_t		flags;
-	uint64_t		seq;
-	uint64_t		pad[8];
-
-	union {
-	struct {
-		/* Cache devices */
-		uint64_t	nbuckets;	/* device size */
-
-		uint16_t	block_size;	/* sectors */
-		uint16_t	bucket_size;	/* sectors */
-
-		uint16_t	nr_in_set;
-		uint16_t	nr_this_dev;
-	};
-	struct {
-		/* Backing devices */
-		uint64_t	data_offset;
-
-		/*
-		 * block_size from the cache device section is still used by
-		 * backing devices, so don't add anything here until we fix
-		 * things to not need it for backing devices anymore
-		 */
-	};
-	};
-
-	uint32_t		last_mount;	/* time_t */
-
-	uint16_t		first_bucket;
-	union {
-		uint16_t	njournal_buckets;
-		uint16_t	keys;
-	};
-	uint64_t		d[SB_JOURNAL_BUCKETS];	/* journal buckets */
-};
-
-BITMASK(CACHE_SYNC,		struct cache_sb, flags, 0, 1);
-BITMASK(CACHE_DISCARD,		struct cache_sb, flags, 1, 1);
-BITMASK(CACHE_REPLACEMENT,	struct cache_sb, flags, 2, 3);
-#define CACHE_REPLACEMENT_LRU	0U
-#define CACHE_REPLACEMENT_FIFO	1U
-#define CACHE_REPLACEMENT_RANDOM 2U
-
-BITMASK(BDEV_CACHE_MODE,	struct cache_sb, flags, 0, 4);
-#define CACHE_MODE_WRITETHROUGH	0U
-#define CACHE_MODE_WRITEBACK	1U
-#define CACHE_MODE_WRITEAROUND	2U
-#define CACHE_MODE_NONE		3U
-BITMASK(BDEV_STATE,		struct cache_sb, flags, 61, 2);
-#define BDEV_STATE_NONE		0U
-#define BDEV_STATE_CLEAN	1U
-#define BDEV_STATE_DIRTY	2U
-#define BDEV_STATE_STALE	3U
-
-/* Version 1: Seed pointer into btree node checksum
- */
-#define BCACHE_BSET_VERSION	1
-
-/*
- * This is the on disk format for btree nodes - a btree node on disk is a list
- * of these; within each set the keys are sorted
- */
-struct bset {
-	uint64_t		csum;
-	uint64_t		magic;
-	uint64_t		seq;
-	uint32_t		version;
-	uint32_t		keys;
-
-	union {
-		struct bkey	start[0];
-		uint64_t	d[0];
-	};
-};
-
-/*
- * On disk format for priorities and gens - see super.c near prio_write() for
- * more.
- */
-struct prio_set {
-	uint64_t		csum;
-	uint64_t		magic;
-	uint64_t		seq;
-	uint32_t		version;
-	uint32_t		pad;
-
-	uint64_t		next_bucket;
-
-	struct bucket_disk {
-		uint16_t	prio;
-		uint8_t		gen;
-	} __attribute((packed)) data[];
-};
-
-struct uuid_entry {
-	union {
-		struct {
-			uint8_t		uuid[16];
-			uint8_t		label[32];
-			uint32_t	first_reg;
-			uint32_t	last_reg;
-			uint32_t	invalidated;
-
-			uint32_t	flags;
-			/* Size of flash only volumes */
-			uint64_t	sectors;
-		};
-
-		uint8_t	pad[128];
-	};
-};
-
-BITMASK(UUID_FLASH_ONLY,	struct uuid_entry, flags, 0, 1);
-
 #include "journal.h"
 #include "stats.h"
 struct search;
@@ -868,12 +707,6 @@ static inline bool key_merging_disabled(struct cache_set *c)
 #endif
 }
 
-static inline bool SB_IS_BDEV(const struct cache_sb *sb)
-{
-	return sb->version == BCACHE_SB_VERSION_BDEV
-		|| sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
-}
-
 struct bbio {
 	unsigned		submit_time_us;
 	union {
@@ -927,59 +760,6 @@ static inline unsigned local_clock_us(void)
 #define prio_buckets(c)					\
 	DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c))
 
-#define JSET_MAGIC		0x245235c1a3625032ULL
-#define PSET_MAGIC		0x6750e15f87337f91ULL
-#define BSET_MAGIC		0x90135c78b99e07f5ULL
-
-#define jset_magic(c)		((c)->sb.set_magic ^ JSET_MAGIC)
-#define pset_magic(c)		((c)->sb.set_magic ^ PSET_MAGIC)
-#define bset_magic(c)		((c)->sb.set_magic ^ BSET_MAGIC)
-
-/* Bkey fields: all units are in sectors */
-
-#define KEY_FIELD(name, field, offset, size)				\
-	BITMASK(name, struct bkey, field, offset, size)
-
-#define PTR_FIELD(name, offset, size)					\
-	static inline uint64_t name(const struct bkey *k, unsigned i)	\
-	{ return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); }	\
-									\
-	static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\
-	{								\
-		k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset);	\
-		k->ptr[i] |= v << offset;				\
-	}
-
-KEY_FIELD(KEY_PTRS,	high, 60, 3)
-KEY_FIELD(HEADER_SIZE,	high, 58, 2)
-KEY_FIELD(KEY_CSUM,	high, 56, 2)
-KEY_FIELD(KEY_PINNED,	high, 55, 1)
-KEY_FIELD(KEY_DIRTY,	high, 36, 1)
-
-KEY_FIELD(KEY_SIZE,	high, 20, 16)
-KEY_FIELD(KEY_INODE,	high, 0,  20)
-
-/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */
-
-static inline uint64_t KEY_OFFSET(const struct bkey *k)
-{
-	return k->low;
-}
-
-static inline void SET_KEY_OFFSET(struct bkey *k, uint64_t v)
-{
-	k->low = v;
-}
-
-PTR_FIELD(PTR_DEV,		51, 12)
-PTR_FIELD(PTR_OFFSET,		8,  43)
-PTR_FIELD(PTR_GEN,		0,  8)
-
-#define PTR_CHECK_DEV		((1 << 12) - 1)
-
-#define PTR(gen, offset, dev)						\
-	((((uint64_t) dev) << 51) | ((uint64_t) offset) << 8 | gen)
-
 static inline size_t sector_to_bucket(struct cache_set *c, sector_t s)
 {
 	return s >> c->bucket_bits;
@@ -1018,31 +798,11 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c,
 
 /* Btree key macros */
 
-/*
- * The high bit being set is a relic from when we used it to do binary
- * searches - it told you where a key started. It's not used anymore,
- * and can probably be safely dropped.
- */
-#define KEY(dev, sector, len)						\
-((struct bkey) {							\
-	.high = (1ULL << 63) | ((uint64_t) (len) << 20) | (dev),	\
-	.low = (sector)							\
-})
-
 static inline void bkey_init(struct bkey *k)
 {
-	*k = KEY(0, 0, 0);
+	*k = ZERO_KEY;
 }
 
-#define KEY_START(k)		(KEY_OFFSET(k) - KEY_SIZE(k))
-#define START_KEY(k)		KEY(KEY_INODE(k), KEY_START(k), 0)
-
-#define MAX_KEY_INODE		(~(~0 << 20))
-#define MAX_KEY_OFFSET		(((uint64_t) ~0) >> 1)
-#define MAX_KEY			KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0)
-
-#define ZERO_KEY		KEY(0, 0, 0)
-
 /*
  * This is used for various on disk data structures - cache_sb, prio_set, bset,
  * jset: The checksum is _always_ the first 8 bytes of these structs
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index f7b5525ddafa..7b8713c66050 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -684,7 +684,7 @@ void bch_bset_init_next(struct btree *b)
 	} else
 		get_random_bytes(&i->seq, sizeof(uint64_t));
 
-	i->magic	= bset_magic(b->c);
+	i->magic	= bset_magic(&b->c->sb);
 	i->version	= 0;
 	i->keys		= 0;
 
@@ -1034,7 +1034,7 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter,
 		 * memcpy()
 		 */
 
-		out->magic	= bset_magic(b->c);
+		out->magic	= bset_magic(&b->c->sb);
 		out->seq	= b->sets[0].data->seq;
 		out->version	= b->sets[0].data->version;
 		swap(out, b->sets[0].data);
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 8a9305685b7e..5cd90565dfe2 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -193,37 +193,6 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l,
 		: (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
 }
 
-static inline size_t bkey_u64s(const struct bkey *k)
-{
-	BUG_ON(KEY_CSUM(k) > 1);
-	return 2 + KEY_PTRS(k) + (KEY_CSUM(k) ? 1 : 0);
-}
-
-static inline size_t bkey_bytes(const struct bkey *k)
-{
-	return bkey_u64s(k) * sizeof(uint64_t);
-}
-
-static inline void bkey_copy(struct bkey *dest, const struct bkey *src)
-{
-	memcpy(dest, src, bkey_bytes(src));
-}
-
-static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src)
-{
-	if (!src)
-		src = &KEY(0, 0, 0);
-
-	SET_KEY_INODE(dest, KEY_INODE(src));
-	SET_KEY_OFFSET(dest, KEY_OFFSET(src));
-}
-
-static inline struct bkey *bkey_next(const struct bkey *k)
-{
-	uint64_t *d = (void *) k;
-	return (struct bkey *) (d + bkey_u64s(k));
-}
-
 /* Keylists */
 
 struct keylist {
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index f5aa4adadf1d..aba787d954e5 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -231,7 +231,7 @@ static void bch_btree_node_read_done(struct btree *b)
 			goto err;
 
 		err = "bad magic";
-		if (i->magic != bset_magic(b->c))
+		if (i->magic != bset_magic(&b->c->sb))
 			goto err;
 
 		err = "bad checksum";
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 86de64a6bf26..ecdaa671bd50 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -74,7 +74,7 @@ reread:		left = ca->sb.bucket_size - offset;
 			struct list_head *where;
 			size_t blocks, bytes = set_bytes(j);
 
-			if (j->magic != jset_magic(ca->set))
+			if (j->magic != jset_magic(&ca->sb))
 				return ret;
 
 			if (bytes > left << 9)
@@ -596,7 +596,7 @@ static void journal_write_unlocked(struct closure *cl)
 	for_each_cache(ca, c, i)
 		w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
 
-	w->data->magic		= jset_magic(c);
+	w->data->magic		= jset_magic(&c->sb);
 	w->data->version	= BCACHE_JSET_VERSION;
 	w->data->last_seq	= last_seq(&c->journal);
 	w->data->csum		= csum_set(w->data);
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index 5e9edb9ef376..a6472fda94b2 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -75,43 +75,6 @@
  * nodes that are pinning the oldest journal entries first.
  */
 
-#define BCACHE_JSET_VERSION_UUIDv1	1
-/* Always latest UUID format */
-#define BCACHE_JSET_VERSION_UUID	1
-#define BCACHE_JSET_VERSION		1
-
-/*
- * On disk format for a journal entry:
- * seq is monotonically increasing; every journal entry has its own unique
- * sequence number.
- *
- * last_seq is the oldest journal entry that still has keys the btree hasn't
- * flushed to disk yet.
- *
- * version is for on disk format changes.
- */
-struct jset {
-	uint64_t		csum;
-	uint64_t		magic;
-	uint64_t		seq;
-	uint32_t		version;
-	uint32_t		keys;
-
-	uint64_t		last_seq;
-
-	BKEY_PADDED(uuid_bucket);
-	BKEY_PADDED(btree_root);
-	uint16_t		btree_level;
-	uint16_t		pad[3];
-
-	uint64_t		prio_bucket[MAX_CACHES_PER_SET];
-
-	union {
-		struct bkey	start[0];
-		uint64_t	d[0];
-	};
-};
-
 /*
  * Only used for holding the journal entries we read in btree_journal_read()
  * during cache_registration
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index cf7850a7592c..932300f18973 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -264,16 +264,17 @@ static void bch_data_invalidate(struct closure *cl)
 		 bio_sectors(bio), (uint64_t) bio->bi_sector);
 
 	while (bio_sectors(bio)) {
-		unsigned len = min(bio_sectors(bio), 1U << 14);
+		unsigned sectors = min(bio_sectors(bio),
+				       1U << (KEY_SIZE_BITS - 1));
 
 		if (bch_keylist_realloc(&op->insert_keys, 0, op->c))
 			goto out;
 
-		bio->bi_sector	+= len;
-		bio->bi_size	-= len << 9;
+		bio->bi_sector	+= sectors;
+		bio->bi_size	-= sectors << 9;
 
 		bch_keylist_add(&op->insert_keys,
-				&KEY(op->inode, bio->bi_sector, len));
+				&KEY(op->inode, bio->bi_sector, sectors));
 	}
 
 	op->insert_data_done = true;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index a314c771263f..c67d19a8913d 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -45,15 +45,6 @@ const char * const bch_cache_modes[] = {
 	NULL
 };
 
-struct uuid_entry_v0 {
-	uint8_t		uuid[16];
-	uint8_t		label[32];
-	uint32_t	first_reg;
-	uint32_t	last_reg;
-	uint32_t	invalidated;
-	uint32_t	pad;
-};
-
 static struct kobject *bcache_kobj;
 struct mutex bch_register_lock;
 LIST_HEAD(bch_cache_sets);
@@ -562,7 +553,7 @@ void bch_prio_write(struct cache *ca)
 		}
 
 		p->next_bucket	= ca->prio_buckets[i + 1];
-		p->magic	= pset_magic(ca);
+		p->magic	= pset_magic(&ca->sb);
 		p->csum		= bch_crc64(&p->magic, bucket_bytes(ca) - 8);
 
 		bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true);
@@ -613,7 +604,7 @@ static void prio_read(struct cache *ca, uint64_t bucket)
 			if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8))
 				pr_warn("bad csum reading priorities");
 
-			if (p->magic != pset_magic(ca))
+			if (p->magic != pset_magic(&ca->sb))
 				pr_warn("bad magic reading priorities");
 
 			bucket = p->next_bucket;
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index ea345c6896f4..38ae7a4ce928 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -27,16 +27,6 @@ struct closure;
 
 #endif
 
-#define BITMASK(name, type, field, offset, size)		\
-static inline uint64_t name(const type *k)			\
-{ return (k->field >> offset) & ~(((uint64_t) ~0) << size); }	\
-								\
-static inline void SET_##name(type *k, uint64_t v)		\
-{								\
-	k->field &= ~(~((uint64_t) ~0 << size) << offset);	\
-	k->field |= v << offset;				\
-}
-
 #define DECLARE_HEAP(type, name)					\
 	struct {							\
 		size_t size, used;					\
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
new file mode 100644
index 000000000000..164a7e263988
--- /dev/null
+++ b/include/uapi/linux/bcache.h
@@ -0,0 +1,373 @@
+#ifndef _LINUX_BCACHE_H
+#define _LINUX_BCACHE_H
+
+/*
+ * Bcache on disk data structures
+ */
+
+#include <asm/types.h>
+
+#define BITMASK(name, type, field, offset, size)		\
+static inline __u64 name(const type *k)				\
+{ return (k->field >> offset) & ~(~0ULL << size); }		\
+								\
+static inline void SET_##name(type *k, __u64 v)			\
+{								\
+	k->field &= ~(~(~0ULL << size) << offset);		\
+	k->field |= (v & ~(~0ULL << size)) << offset;		\
+}
+
+/* Btree keys - all units are in sectors */
+
+struct bkey {
+	__u64	high;
+	__u64	low;
+	__u64	ptr[];
+};
+
+#define KEY_FIELD(name, field, offset, size)				\
+	BITMASK(name, struct bkey, field, offset, size)
+
+#define PTR_FIELD(name, offset, size)					\
+static inline __u64 name(const struct bkey *k, unsigned i)		\
+{ return (k->ptr[i] >> offset) & ~(~0ULL << size); }			\
+									\
+static inline void SET_##name(struct bkey *k, unsigned i, __u64 v)	\
+{									\
+	k->ptr[i] &= ~(~(~0ULL << size) << offset);			\
+	k->ptr[i] |= (v & ~(~0ULL << size)) << offset;			\
+}
+
+#define KEY_SIZE_BITS		16
+
+KEY_FIELD(KEY_PTRS,	high, 60, 3)
+KEY_FIELD(HEADER_SIZE,	high, 58, 2)
+KEY_FIELD(KEY_CSUM,	high, 56, 2)
+KEY_FIELD(KEY_PINNED,	high, 55, 1)
+KEY_FIELD(KEY_DIRTY,	high, 36, 1)
+
+KEY_FIELD(KEY_SIZE,	high, 20, KEY_SIZE_BITS)
+KEY_FIELD(KEY_INODE,	high, 0,  20)
+
+/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */
+
+static inline __u64 KEY_OFFSET(const struct bkey *k)
+{
+	return k->low;
+}
+
+static inline void SET_KEY_OFFSET(struct bkey *k, __u64 v)
+{
+	k->low = v;
+}
+
+/*
+ * The high bit being set is a relic from when we used it to do binary
+ * searches - it told you where a key started. It's not used anymore,
+ * and can probably be safely dropped.
+ */
+#define KEY(inode, offset, size)					\
+((struct bkey) {							\
+	.high = (1ULL << 63) | ((__u64) (size) << 20) | (inode),	\
+	.low = (offset)							\
+})
+
+#define ZERO_KEY			KEY(0, 0, 0)
+
+#define MAX_KEY_INODE			(~(~0 << 20))
+#define MAX_KEY_OFFSET			(~0ULL >> 1)
+#define MAX_KEY				KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0)
+
+#define KEY_START(k)			(KEY_OFFSET(k) - KEY_SIZE(k))
+#define START_KEY(k)			KEY(KEY_INODE(k), KEY_START(k), 0)
+
+#define PTR_DEV_BITS			12
+
+PTR_FIELD(PTR_DEV,			51, PTR_DEV_BITS)
+PTR_FIELD(PTR_OFFSET,			8,  43)
+PTR_FIELD(PTR_GEN,			0,  8)
+
+#define PTR_CHECK_DEV			((1 << PTR_DEV_BITS) - 1)
+
+#define PTR(gen, offset, dev)						\
+	((((__u64) dev) << 51) | ((__u64) offset) << 8 | gen)
+
+/* Bkey utility code */
+
+static inline unsigned long bkey_u64s(const struct bkey *k)
+{
+	return (sizeof(struct bkey) / sizeof(__u64)) + KEY_PTRS(k);
+}
+
+static inline unsigned long bkey_bytes(const struct bkey *k)
+{
+	return bkey_u64s(k) * sizeof(__u64);
+}
+
+#define bkey_copy(_dest, _src)	memcpy(_dest, _src, bkey_bytes(_src))
+
+static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src)
+{
+	SET_KEY_INODE(dest, KEY_INODE(src));
+	SET_KEY_OFFSET(dest, KEY_OFFSET(src));
+}
+
+static inline struct bkey *bkey_next(const struct bkey *k)
+{
+	__u64 *d = (void *) k;
+	return (struct bkey *) (d + bkey_u64s(k));
+}
+
+static inline struct bkey *bkey_last(const struct bkey *k, unsigned nr_keys)
+{
+	__u64 *d = (void *) k;
+	return (struct bkey *) (d + nr_keys);
+}
+/* Enough for a key with 6 pointers */
+#define BKEY_PAD		8
+
+#define BKEY_PADDED(key)					\
+	union { struct bkey key; __u64 key ## _pad[BKEY_PAD]; }
+
+/* Superblock */
+
+/* Version 0: Cache device
+ * Version 1: Backing device
+ * Version 2: Seed pointer into btree node checksum
+ * Version 3: Cache device with new UUID format
+ * Version 4: Backing device with data offset
+ */
+#define BCACHE_SB_VERSION_CDEV		0
+#define BCACHE_SB_VERSION_BDEV		1
+#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3
+#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4
+#define BCACHE_SB_MAX_VERSION		4
+
+#define SB_SECTOR			8
+#define SB_SIZE				4096
+#define SB_LABEL_SIZE			32
+#define SB_JOURNAL_BUCKETS		256U
+/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */
+#define MAX_CACHES_PER_SET		8
+
+#define BDEV_DATA_START_DEFAULT		16	/* sectors */
+
+struct cache_sb {
+	__u64			csum;
+	__u64			offset;	/* sector where this sb was written */
+	__u64			version;
+
+	__u8			magic[16];
+
+	__u8			uuid[16];
+	union {
+		__u8		set_uuid[16];
+		__u64		set_magic;
+	};
+	__u8			label[SB_LABEL_SIZE];
+
+	__u64			flags;
+	__u64			seq;
+	__u64			pad[8];
+
+	union {
+	struct {
+		/* Cache devices */
+		__u64		nbuckets;	/* device size */
+
+		__u16		block_size;	/* sectors */
+		__u16		bucket_size;	/* sectors */
+
+		__u16		nr_in_set;
+		__u16		nr_this_dev;
+	};
+	struct {
+		/* Backing devices */
+		__u64		data_offset;
+
+		/*
+		 * block_size from the cache device section is still used by
+		 * backing devices, so don't add anything here until we fix
+		 * things to not need it for backing devices anymore
+		 */
+	};
+	};
+
+	__u32			last_mount;	/* time_t */
+
+	__u16			first_bucket;
+	union {
+		__u16		njournal_buckets;
+		__u16		keys;
+	};
+	__u64			d[SB_JOURNAL_BUCKETS];	/* journal buckets */
+};
+
+static inline _Bool SB_IS_BDEV(const struct cache_sb *sb)
+{
+	return sb->version == BCACHE_SB_VERSION_BDEV
+		|| sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
+}
+
+BITMASK(CACHE_SYNC,			struct cache_sb, flags, 0, 1);
+BITMASK(CACHE_DISCARD,			struct cache_sb, flags, 1, 1);
+BITMASK(CACHE_REPLACEMENT,		struct cache_sb, flags, 2, 3);
+#define CACHE_REPLACEMENT_LRU		0U
+#define CACHE_REPLACEMENT_FIFO		1U
+#define CACHE_REPLACEMENT_RANDOM	2U
+
+BITMASK(BDEV_CACHE_MODE,		struct cache_sb, flags, 0, 4);
+#define CACHE_MODE_WRITETHROUGH		0U
+#define CACHE_MODE_WRITEBACK		1U
+#define CACHE_MODE_WRITEAROUND		2U
+#define CACHE_MODE_NONE			3U
+BITMASK(BDEV_STATE,			struct cache_sb, flags, 61, 2);
+#define BDEV_STATE_NONE			0U
+#define BDEV_STATE_CLEAN		1U
+#define BDEV_STATE_DIRTY		2U
+#define BDEV_STATE_STALE		3U
+
+/*
+ * Magic numbers
+ *
+ * The various other data structures have their own magic numbers, which are
+ * xored with the first part of the cache set's UUID
+ */
+
+#define JSET_MAGIC			0x245235c1a3625032ULL
+#define PSET_MAGIC			0x6750e15f87337f91ULL
+#define BSET_MAGIC			0x90135c78b99e07f5ULL
+
+static inline __u64 jset_magic(struct cache_sb *sb)
+{
+	return sb->set_magic ^ JSET_MAGIC;
+}
+
+static inline __u64 pset_magic(struct cache_sb *sb)
+{
+	return sb->set_magic ^ PSET_MAGIC;
+}
+
+static inline __u64 bset_magic(struct cache_sb *sb)
+{
+	return sb->set_magic ^ BSET_MAGIC;
+}
+
+/*
+ * Journal
+ *
+ * On disk format for a journal entry:
+ * seq is monotonically increasing; every journal entry has its own unique
+ * sequence number.
+ *
+ * last_seq is the oldest journal entry that still has keys the btree hasn't
+ * flushed to disk yet.
+ *
+ * version is for on disk format changes.
+ */
+
+#define BCACHE_JSET_VERSION_UUIDv1	1
+#define BCACHE_JSET_VERSION_UUID	1	/* Always latest UUID format */
+#define BCACHE_JSET_VERSION		1
+
+struct jset {
+	__u64			csum;
+	__u64			magic;
+	__u64			seq;
+	__u32			version;
+	__u32			keys;
+
+	__u64			last_seq;
+
+	BKEY_PADDED(uuid_bucket);
+	BKEY_PADDED(btree_root);
+	__u16			btree_level;
+	__u16			pad[3];
+
+	__u64			prio_bucket[MAX_CACHES_PER_SET];
+
+	union {
+		struct bkey	start[0];
+		__u64		d[0];
+	};
+};
+
+/* Bucket prios/gens */
+
+struct prio_set {
+	__u64			csum;
+	__u64			magic;
+	__u64			seq;
+	__u32			version;
+	__u32			pad;
+
+	__u64			next_bucket;
+
+	struct bucket_disk {
+		__u16		prio;
+		__u8		gen;
+	} __attribute((packed)) data[];
+};
+
+/* UUIDS - per backing device/flash only volume metadata */
+
+struct uuid_entry {
+	union {
+		struct {
+			__u8	uuid[16];
+			__u8	label[32];
+			__u32	first_reg;
+			__u32	last_reg;
+			__u32	invalidated;
+
+			__u32	flags;
+			/* Size of flash only volumes */
+			__u64	sectors;
+		};
+
+		__u8		pad[128];
+	};
+};
+
+BITMASK(UUID_FLASH_ONLY,	struct uuid_entry, flags, 0, 1);
+
+/* Btree nodes */
+
+/* Version 1: Seed pointer into btree node checksum
+ */
+#define BCACHE_BSET_CSUM		1
+#define BCACHE_BSET_VERSION		1
+
+/*
+ * Btree nodes
+ *
+ * On disk a btree node is a list/log of these; within each set the keys are
+ * sorted
+ */
+struct bset {
+	__u64			csum;
+	__u64			magic;
+	__u64			seq;
+	__u32			version;
+	__u32			keys;
+
+	union {
+		struct bkey	start[0];
+		__u64		d[0];
+	};
+};
+
+/* OBSOLETE */
+
+/* UUIDS - per backing device/flash only volume metadata */
+
+struct uuid_entry_v0 {
+	__u8		uuid[16];
+	__u8		label[32];
+	__u32		first_reg;
+	__u32		last_reg;
+	__u32		invalidated;
+	__u32		pad;
+};
+
+#endif /* _LINUX_BCACHE_H */

From e58ff155034791ed3a5563d24a50fae0a8c1617c Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 24 Jul 2013 18:14:44 -0700
Subject: [PATCH 76/94] bcache: Fix bch_ptr_bad()

Previously, bch_ptr_bad() could return false when there was a pointer to
a nonexistant device... it only filtered out keys with PTR_CHECK_DEV
pointers.

This behaviour was intended for multiple cache device support; for that,
just because the device for one of the pointers has gone away doesn't
mean we want to filter out the rest of the pointers.

But we don't yet explicitly filter/check individual pointers, so without
that this behaviour was wrong - a corrupt bkey with a bad device pointer
could cause us to deref a bad pointer. Doh.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bset.c | 61 ++++++++++++++++++++--------------------
 1 file changed, 30 insertions(+), 31 deletions(-)

diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 7b8713c66050..f32216c75948 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -116,47 +116,46 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k)
 	    bch_ptr_invalid(b, k))
 		return true;
 
-	if (KEY_PTRS(k) && PTR_DEV(k, 0) == PTR_CHECK_DEV)
-		return true;
+	for (i = 0; i < KEY_PTRS(k); i++) {
+		if (!ptr_available(b->c, k, i))
+			return true;
 
-	for (i = 0; i < KEY_PTRS(k); i++)
-		if (ptr_available(b->c, k, i)) {
-			g = PTR_BUCKET(b->c, k, i);
-			stale = ptr_stale(b->c, k, i);
+		g = PTR_BUCKET(b->c, k, i);
+		stale = ptr_stale(b->c, k, i);
 
-			btree_bug_on(stale > 96, b,
-				     "key too stale: %i, need_gc %u",
-				     stale, b->c->need_gc);
+		btree_bug_on(stale > 96, b,
+			     "key too stale: %i, need_gc %u",
+			     stale, b->c->need_gc);
 
-			btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
-				     b, "stale dirty pointer");
+		btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
+			     b, "stale dirty pointer");
 
-			if (stale)
-				return true;
+		if (stale)
+			return true;
 
 #ifdef CONFIG_BCACHE_EDEBUG
-			if (!mutex_trylock(&b->c->bucket_lock))
-				continue;
+		if (!mutex_trylock(&b->c->bucket_lock))
+			continue;
 
-			if (b->level) {
-				if (KEY_DIRTY(k) ||
-				    g->prio != BTREE_PRIO ||
-				    (b->c->gc_mark_valid &&
-				     GC_MARK(g) != GC_MARK_METADATA))
-					goto bug;
+		if (b->level) {
+			if (KEY_DIRTY(k) ||
+			    g->prio != BTREE_PRIO ||
+			    (b->c->gc_mark_valid &&
+			     GC_MARK(g) != GC_MARK_METADATA))
+				goto bug;
 
-			} else {
-				if (g->prio == BTREE_PRIO)
-					goto bug;
+		} else {
+			if (g->prio == BTREE_PRIO)
+				goto bug;
 
-				if (KEY_DIRTY(k) &&
-				    b->c->gc_mark_valid &&
-				    GC_MARK(g) != GC_MARK_DIRTY)
-					goto bug;
-			}
-			mutex_unlock(&b->c->bucket_lock);
-#endif
+			if (KEY_DIRTY(k) &&
+			    b->c->gc_mark_valid &&
+			    GC_MARK(g) != GC_MARK_DIRTY)
+				goto bug;
 		}
+		mutex_unlock(&b->c->bucket_lock);
+#endif
+	}
 
 	return false;
 #ifdef CONFIG_BCACHE_EDEBUG

From 280481d06c8a683d9aaa26125476222e76b733c5 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Thu, 24 Oct 2013 16:36:03 -0700
Subject: [PATCH 77/94] bcache: Debug code improvements

Couple changes:
 * Consolidate bch_check_keys() and bch_check_key_order(), and move the
   checks that only check_key_order() could do to bch_btree_iter_next().

 * Get rid of CONFIG_BCACHE_EDEBUG - now, all that code is compiled in
   when CONFIG_BCACHE_DEBUG is enabled, and there's now a sysfs file to
   flip on the EDEBUG checks at runtime.

 * Dropped an old not terribly useful check in rw_unlock(), and
   refactored/improved a some of the other debug code.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/Kconfig  |  11 +---
 drivers/md/bcache/alloc.c  |   5 +-
 drivers/md/bcache/bcache.h |  10 +--
 drivers/md/bcache/bset.c   | 112 ++++++++++++++++---------------
 drivers/md/bcache/bset.h   |   3 +
 drivers/md/bcache/btree.c  |   8 ++-
 drivers/md/bcache/btree.h  |   8 ---
 drivers/md/bcache/debug.c  | 132 ++++++++++++++++---------------------
 drivers/md/bcache/debug.h  |  46 ++++++-------
 drivers/md/bcache/sysfs.c  |   5 ++
 drivers/md/bcache/util.h   |   4 +-
 11 files changed, 160 insertions(+), 184 deletions(-)

diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index f950c9d29f3e..2638417b19aa 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -13,15 +13,8 @@ config BCACHE_DEBUG
 	---help---
 	Don't select this option unless you're a developer
 
-	Enables extra debugging tools (primarily a fuzz tester)
-
-config BCACHE_EDEBUG
-	bool "Extended runtime checks"
-	depends on BCACHE
-	---help---
-	Don't select this option unless you're a developer
-
-	Enables extra runtime checks which significantly affect performance
+	Enables extra debugging tools, allows expensive runtime checks to be
+	turned on.
 
 config BCACHE_CLOSURES_DEBUG
 	bool "Debug closures"
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 4970ddc6a7f6..ed5920b20c61 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -398,8 +398,7 @@ long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait)
 out:
 	wake_up_process(ca->alloc_thread);
 
-#ifdef CONFIG_BCACHE_EDEBUG
-	{
+	if (expensive_debug_checks(ca->set)) {
 		size_t iter;
 		long i;
 
@@ -413,7 +412,7 @@ out:
 		fifo_for_each(i, &ca->unused, iter)
 			BUG_ON(i == r);
 	}
-#endif
+
 	b = ca->buckets + r;
 
 	BUG_ON(atomic_read(&b->pin) != 1);
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 045cb99f1ca6..d03bc6f66493 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -690,6 +690,7 @@ struct cache_set {
 	unsigned short		journal_delay_ms;
 	unsigned		verify:1;
 	unsigned		key_merging_disabled:1;
+	unsigned		expensive_debug_checks:1;
 	unsigned		gc_always_rewrite:1;
 	unsigned		shrinker_disabled:1;
 	unsigned		copy_gc_enabled:1;
@@ -698,15 +699,6 @@ struct cache_set {
 	struct hlist_head	bucket_hash[1 << BUCKET_HASH_BITS];
 };
 
-static inline bool key_merging_disabled(struct cache_set *c)
-{
-#ifdef CONFIG_BCACHE_DEBUG
-	return c->key_merging_disabled;
-#else
-	return 0;
-#endif
-}
-
 struct bbio {
 	unsigned		submit_time_us;
 	union {
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index f32216c75948..6bffde478926 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -106,6 +106,43 @@ bad:
 	return true;
 }
 
+static bool ptr_bad_expensive_checks(struct btree *b, const struct bkey *k,
+				     unsigned ptr)
+{
+	struct bucket *g = PTR_BUCKET(b->c, k, ptr);
+	char buf[80];
+
+	if (mutex_trylock(&b->c->bucket_lock)) {
+		if (b->level) {
+			if (KEY_DIRTY(k) ||
+			    g->prio != BTREE_PRIO ||
+			    (b->c->gc_mark_valid &&
+			     GC_MARK(g) != GC_MARK_METADATA))
+				goto err;
+
+		} else {
+			if (g->prio == BTREE_PRIO)
+				goto err;
+
+			if (KEY_DIRTY(k) &&
+			    b->c->gc_mark_valid &&
+			    GC_MARK(g) != GC_MARK_DIRTY)
+				goto err;
+		}
+		mutex_unlock(&b->c->bucket_lock);
+	}
+
+	return false;
+err:
+	mutex_unlock(&b->c->bucket_lock);
+	bch_bkey_to_text(buf, sizeof(buf), k);
+	btree_bug(b,
+"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
+		  buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
+		  g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
+	return true;
+}
+
 bool bch_ptr_bad(struct btree *b, const struct bkey *k)
 {
 	struct bucket *g;
@@ -133,46 +170,12 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k)
 		if (stale)
 			return true;
 
-#ifdef CONFIG_BCACHE_EDEBUG
-		if (!mutex_trylock(&b->c->bucket_lock))
-			continue;
-
-		if (b->level) {
-			if (KEY_DIRTY(k) ||
-			    g->prio != BTREE_PRIO ||
-			    (b->c->gc_mark_valid &&
-			     GC_MARK(g) != GC_MARK_METADATA))
-				goto bug;
-
-		} else {
-			if (g->prio == BTREE_PRIO)
-				goto bug;
-
-			if (KEY_DIRTY(k) &&
-			    b->c->gc_mark_valid &&
-			    GC_MARK(g) != GC_MARK_DIRTY)
-				goto bug;
-		}
-		mutex_unlock(&b->c->bucket_lock);
-#endif
+		if (expensive_debug_checks(b->c) &&
+		    ptr_bad_expensive_checks(b, k, i))
+			return true;
 	}
 
 	return false;
-#ifdef CONFIG_BCACHE_EDEBUG
-bug:
-	mutex_unlock(&b->c->bucket_lock);
-
-	{
-		char buf[80];
-
-		bch_bkey_to_text(buf, sizeof(buf), k);
-		btree_bug(b,
-"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
-			  buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
-			  g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
-	}
-	return true;
-#endif
 }
 
 /* Key/pointer manipulation */
@@ -821,16 +824,16 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
 	} else
 		i = bset_search_write_set(b, t, search);
 
-#ifdef CONFIG_BCACHE_EDEBUG
-	BUG_ON(bset_written(b, t) &&
-	       i.l != t->data->start &&
-	       bkey_cmp(tree_to_prev_bkey(t,
-		  inorder_to_tree(bkey_to_cacheline(t, i.l), t)),
-			search) > 0);
+	if (expensive_debug_checks(b->c)) {
+		BUG_ON(bset_written(b, t) &&
+		       i.l != t->data->start &&
+		       bkey_cmp(tree_to_prev_bkey(t,
+			  inorder_to_tree(bkey_to_cacheline(t, i.l), t)),
+				search) > 0);
 
-	BUG_ON(i.r != end(t->data) &&
-	       bkey_cmp(i.r, search) <= 0);
-#endif
+		BUG_ON(i.r != end(t->data) &&
+		       bkey_cmp(i.r, search) <= 0);
+	}
 
 	while (likely(i.l != i.r) &&
 	       bkey_cmp(i.l, search) <= 0)
@@ -871,12 +874,16 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
 }
 
 struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter,
-			       struct bkey *search, struct bset_tree *start)
+				   struct bkey *search, struct bset_tree *start)
 {
 	struct bkey *ret = NULL;
 	iter->size = ARRAY_SIZE(iter->data);
 	iter->used = 0;
 
+#ifdef CONFIG_BCACHE_DEBUG
+	iter->b = b;
+#endif
+
 	for (; start <= &b->sets[b->nsets]; start++) {
 		ret = bch_bset_search(b, start, search);
 		bch_btree_iter_push(iter, ret, end(start->data));
@@ -891,6 +898,8 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter)
 	struct bkey *ret = NULL;
 
 	if (!btree_iter_end(iter)) {
+		bch_btree_iter_next_check(iter);
+
 		ret = iter->data->k;
 		iter->data->k = bkey_next(iter->data->k);
 
@@ -1002,7 +1011,6 @@ static void btree_mergesort(struct btree *b, struct bset *out,
 	out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0;
 
 	pr_debug("sorted %i keys", out->keys);
-	bch_check_key_order(b, out);
 }
 
 static void __btree_sort(struct btree *b, struct btree_iter *iter,
@@ -1063,15 +1071,15 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter,
 
 void bch_btree_sort_partial(struct btree *b, unsigned start)
 {
-	size_t oldsize = 0, order = b->page_order, keys = 0;
+	size_t order = b->page_order, keys = 0;
 	struct btree_iter iter;
+	int oldsize = bch_count_data(b);
+
 	__bch_btree_iter_init(b, &iter, NULL, &b->sets[start]);
 
 	BUG_ON(b->sets[b->nsets].data == write_block(b) &&
 	       (b->sets[b->nsets].size || b->nsets));
 
-	if (b->written)
-		oldsize = bch_count_data(b);
 
 	if (start) {
 		unsigned i;
@@ -1087,7 +1095,7 @@ void bch_btree_sort_partial(struct btree *b, unsigned start)
 
 	__btree_sort(b, &iter, start, order, false);
 
-	EBUG_ON(b->written && bch_count_data(b) != oldsize);
+	EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize);
 }
 
 void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter)
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 5cd90565dfe2..a043a92d4dc9 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -148,6 +148,9 @@
 
 struct btree_iter {
 	size_t size, used;
+#ifdef CONFIG_BCACHE_DEBUG
+	struct btree *b;
+#endif
 	struct btree_iter_set {
 		struct bkey *k, *end;
 	} data[MAX_BSETS];
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index aba787d954e5..fa4d0b1f6d75 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -216,6 +216,10 @@ static void bch_btree_node_read_done(struct btree *b)
 	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
 	iter->used = 0;
 
+#ifdef CONFIG_BCACHE_DEBUG
+	iter->b = b;
+#endif
+
 	if (!i->seq)
 		goto err;
 
@@ -454,7 +458,7 @@ void bch_btree_node_write(struct btree *b, struct closure *parent)
 	BUG_ON(b->written >= btree_blocks(b));
 	BUG_ON(b->written && !i->keys);
 	BUG_ON(b->sets->data->seq != i->seq);
-	bch_check_key_order(b, i);
+	bch_check_keys(b, "writing");
 
 	cancel_delayed_work(&b->work);
 
@@ -1917,7 +1921,7 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
 				  struct bkey *replace_key)
 {
 	bool ret = false;
-	unsigned oldsize = bch_count_data(b);
+	int oldsize = bch_count_data(b);
 
 	while (!bch_keylist_empty(insert_keys)) {
 		struct bset *i = write_block(b);
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 8fc1e8925399..27e90b189112 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -259,14 +259,6 @@ static inline void rw_lock(bool w, struct btree *b, int level)
 
 static inline void rw_unlock(bool w, struct btree *b)
 {
-#ifdef CONFIG_BCACHE_EDEBUG
-	unsigned i;
-
-	if (w && b->key.ptr[0])
-		for (i = 0; i <= b->nsets; i++)
-			bch_check_key_order(b, b->sets[i].data);
-#endif
-
 	if (w)
 		b->seq++;
 	(w ? up_write : up_read)(&b->lock);
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index d9ccb3169aa2..e99e6b8852b2 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -76,29 +76,17 @@ int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
 	return out - buf;
 }
 
-int bch_btree_to_text(char *buf, size_t size, const struct btree *b)
-{
-	return scnprintf(buf, size, "%zu level %i/%i",
-			 PTR_BUCKET_NR(b->c, &b->key, 0),
-			 b->level, b->c->root ? b->c->root->level : -1);
-}
-
-#if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG)
-
-static bool skipped_backwards(struct btree *b, struct bkey *k)
-{
-	return bkey_cmp(k, (!b->level)
-			? &START_KEY(bkey_next(k))
-			: bkey_next(k)) > 0;
-}
+#ifdef CONFIG_BCACHE_DEBUG
 
 static void dump_bset(struct btree *b, struct bset *i)
 {
-	struct bkey *k;
+	struct bkey *k, *next;
 	unsigned j;
 	char buf[80];
 
-	for (k = i->start; k < end(i); k = bkey_next(k)) {
+	for (k = i->start; k < end(i); k = next) {
+		next = bkey_next(k);
+
 		bch_bkey_to_text(buf, sizeof(buf), k);
 		printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
 		       (uint64_t *) k - i->d, i->keys, buf);
@@ -114,15 +102,21 @@ static void dump_bset(struct btree *b, struct bset *i)
 
 		printk(" %s\n", bch_ptr_status(b->c, k));
 
-		if (bkey_next(k) < end(i) &&
-		    skipped_backwards(b, k))
+		if (next < end(i) &&
+		    bkey_cmp(k, !b->level ? &START_KEY(next) : next) > 0)
 			printk(KERN_ERR "Key skipped backwards\n");
 	}
 }
 
-#endif
+static void bch_dump_bucket(struct btree *b)
+{
+	unsigned i;
 
-#ifdef CONFIG_BCACHE_DEBUG
+	console_lock();
+	for (i = 0; i <= b->nsets; i++)
+		dump_bset(b, b->sets[i].data);
+	console_unlock();
+}
 
 void bch_btree_verify(struct btree *b, struct bset *new)
 {
@@ -211,11 +205,7 @@ out_put:
 	bio_put(check);
 }
 
-#endif
-
-#ifdef CONFIG_BCACHE_EDEBUG
-
-unsigned bch_count_data(struct btree *b)
+int __bch_count_data(struct btree *b)
 {
 	unsigned ret = 0;
 	struct btree_iter iter;
@@ -227,72 +217,60 @@ unsigned bch_count_data(struct btree *b)
 	return ret;
 }
 
-static void vdump_bucket_and_panic(struct btree *b, const char *fmt,
-				   va_list args)
-{
-	unsigned i;
-	char buf[80];
-
-	console_lock();
-
-	for (i = 0; i <= b->nsets; i++)
-		dump_bset(b, b->sets[i].data);
-
-	vprintk(fmt, args);
-
-	console_unlock();
-
-	bch_btree_to_text(buf, sizeof(buf), b);
-	panic("at %s\n", buf);
-}
-
-void bch_check_key_order_msg(struct btree *b, struct bset *i,
-			     const char *fmt, ...)
-{
-	struct bkey *k;
-
-	if (!i->keys)
-		return;
-
-	for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k))
-		if (skipped_backwards(b, k)) {
-			va_list args;
-			va_start(args, fmt);
-
-			vdump_bucket_and_panic(b, fmt, args);
-			va_end(args);
-		}
-}
-
-void bch_check_keys(struct btree *b, const char *fmt, ...)
+void __bch_check_keys(struct btree *b, const char *fmt, ...)
 {
 	va_list args;
 	struct bkey *k, *p = NULL;
 	struct btree_iter iter;
-
-	if (b->level)
-		return;
+	const char *err;
 
 	for_each_key(b, k, &iter) {
-		if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) {
-			printk(KERN_ERR "Keys out of order:\n");
-			goto bug;
-		}
+		if (!b->level) {
+			err = "Keys out of order";
+			if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0)
+				goto bug;
 
-		if (bch_ptr_invalid(b, k))
-			continue;
+			if (bch_ptr_invalid(b, k))
+				continue;
 
-		if (p && bkey_cmp(p, &START_KEY(k)) > 0) {
-			printk(KERN_ERR "Overlapping keys:\n");
-			goto bug;
+			err =  "Overlapping keys";
+			if (p && bkey_cmp(p, &START_KEY(k)) > 0)
+				goto bug;
+		} else {
+			if (bch_ptr_bad(b, k))
+				continue;
+
+			err = "Duplicate keys";
+			if (p && !bkey_cmp(p, k))
+				goto bug;
 		}
 		p = k;
 	}
+
+	err = "Key larger than btree node key";
+	if (p && bkey_cmp(p, &b->key) > 0)
+		goto bug;
+
 	return;
 bug:
+	bch_dump_bucket(b);
+
 	va_start(args, fmt);
-	vdump_bucket_and_panic(b, fmt, args);
+	vprintk(fmt, args);
 	va_end(args);
+
+	panic("bcache error: %s:\n", err);
+}
+
+void bch_btree_iter_next_check(struct btree_iter *iter)
+{
+	struct bkey *k = iter->data->k, *next = bkey_next(k);
+
+	if (next < iter->data->end &&
+	    bkey_cmp(k, iter->b->level ? next : &START_KEY(next)) > 0) {
+		bch_dump_bucket(iter->b);
+		panic("Key skipped backwards\n");
+	}
 }
 
 #endif
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h
index 0f4b3440512c..7914ba0ff316 100644
--- a/drivers/md/bcache/debug.h
+++ b/drivers/md/bcache/debug.h
@@ -4,40 +4,42 @@
 /* Btree/bkey debug printing */
 
 int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k);
-int bch_btree_to_text(char *buf, size_t size, const struct btree *b);
-
-#ifdef CONFIG_BCACHE_EDEBUG
-
-unsigned bch_count_data(struct btree *);
-void bch_check_key_order_msg(struct btree *, struct bset *, const char *, ...);
-void bch_check_keys(struct btree *, const char *, ...);
-
-#define bch_check_key_order(b, i)			\
-	bch_check_key_order_msg(b, i, "keys out of order")
-#define EBUG_ON(cond)		BUG_ON(cond)
-
-#else /* EDEBUG */
-
-#define bch_count_data(b)				0
-#define bch_check_key_order(b, i)			do {} while (0)
-#define bch_check_key_order_msg(b, i, ...)		do {} while (0)
-#define bch_check_keys(b, ...)				do {} while (0)
-#define EBUG_ON(cond)					do {} while (0)
-
-#endif
 
 #ifdef CONFIG_BCACHE_DEBUG
 
 void bch_btree_verify(struct btree *, struct bset *);
 void bch_data_verify(struct cached_dev *, struct bio *);
+int __bch_count_data(struct btree *);
+void __bch_check_keys(struct btree *, const char *, ...);
+void bch_btree_iter_next_check(struct btree_iter *);
+
+#define EBUG_ON(cond)			BUG_ON(cond)
+#define expensive_debug_checks(c)	((c)->expensive_debug_checks)
+#define key_merging_disabled(c)		((c)->key_merging_disabled)
 
 #else /* DEBUG */
 
 static inline void bch_btree_verify(struct btree *b, struct bset *i) {}
-static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {};
+static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {}
+static inline int __bch_count_data(struct btree *b) { return -1; }
+static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {}
+static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}
+
+#define EBUG_ON(cond)			do { if (cond); } while (0)
+#define expensive_debug_checks(c)	0
+#define key_merging_disabled(c)		0
 
 #endif
 
+#define bch_count_data(b)						\
+	(expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1)
+
+#define bch_check_keys(b, ...)						\
+do {									\
+	if (expensive_debug_checks((b)->c))				\
+		__bch_check_keys(b, __VA_ARGS__);			\
+} while (0)
+
 #ifdef CONFIG_DEBUG_FS
 void bch_debug_init_cache_set(struct cache_set *);
 #else
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index ab286b9b5e40..9687771ec6f3 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -102,6 +102,7 @@ rw_attribute(io_error_halflife);
 rw_attribute(verify);
 rw_attribute(key_merging_disabled);
 rw_attribute(gc_always_rewrite);
+rw_attribute(expensive_debug_checks);
 rw_attribute(freelist_percent);
 rw_attribute(cache_replacement_policy);
 rw_attribute(btree_shrinker_disabled);
@@ -517,6 +518,8 @@ lock_root:
 	sysfs_print(active_journal_entries,	fifo_used(&c->journal.pin));
 	sysfs_printf(verify,			"%i", c->verify);
 	sysfs_printf(key_merging_disabled,	"%i", c->key_merging_disabled);
+	sysfs_printf(expensive_debug_checks,
+		     "%i", c->expensive_debug_checks);
 	sysfs_printf(gc_always_rewrite,		"%i", c->gc_always_rewrite);
 	sysfs_printf(btree_shrinker_disabled,	"%i", c->shrinker_disabled);
 	sysfs_printf(copy_gc_enabled,		"%i", c->copy_gc_enabled);
@@ -599,6 +602,7 @@ STORE(__bch_cache_set)
 	sysfs_strtoul(journal_delay_ms,		c->journal_delay_ms);
 	sysfs_strtoul(verify,			c->verify);
 	sysfs_strtoul(key_merging_disabled,	c->key_merging_disabled);
+	sysfs_strtoul(expensive_debug_checks,	c->expensive_debug_checks);
 	sysfs_strtoul(gc_always_rewrite,	c->gc_always_rewrite);
 	sysfs_strtoul(btree_shrinker_disabled,	c->shrinker_disabled);
 	sysfs_strtoul(copy_gc_enabled,		c->copy_gc_enabled);
@@ -674,6 +678,7 @@ static struct attribute *bch_cache_set_internal_files[] = {
 #ifdef CONFIG_BCACHE_DEBUG
 	&sysfs_verify,
 	&sysfs_key_merging_disabled,
+	&sysfs_expensive_debug_checks,
 #endif
 	&sysfs_gc_always_rewrite,
 	&sysfs_btree_shrinker_disabled,
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 38ae7a4ce928..8ce5aab55962 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -15,12 +15,12 @@
 
 struct closure;
 
-#ifdef CONFIG_BCACHE_EDEBUG
+#ifdef CONFIG_BCACHE_DEBUG
 
 #define atomic_dec_bug(v)	BUG_ON(atomic_dec_return(v) < 0)
 #define atomic_inc_bug(v, i)	BUG_ON(atomic_inc_return(v) <= i)
 
-#else /* EDEBUG */
+#else /* DEBUG */
 
 #define atomic_dec_bug(v)	atomic_dec(v)
 #define atomic_inc_bug(v, i)	atomic_inc(v)

From 3a3b6a4e075188342b58d4b6560f5540af64cac0 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 24 Jul 2013 16:46:42 -0700
Subject: [PATCH 78/94] bcache: Don't bother with bucket refcount for btree
 node allocations

The bucket refcount (dropped with bkey_put()) is only needed to prevent
the newly allocated bucket from being garbage collected until we've
added a pointer to it somewhere. But for btree node allocations, the
fact that we have btree nodes locked is enough to guard against races
with garbage collection.

Eventually the per bucket refcount is going to be replaced with
something specific to bch_alloc_sectors().

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/alloc.c |  4 ++--
 drivers/md/bcache/btree.c | 28 +++++-----------------------
 drivers/md/bcache/btree.h |  2 +-
 drivers/md/bcache/super.c |  2 +-
 4 files changed, 9 insertions(+), 27 deletions(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index ed5920b20c61..2b46bf1d7e40 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -472,7 +472,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
 	return 0;
 err:
 	bch_bucket_free(c, k);
-	__bkey_put(c, k);
+	bkey_put(c, k);
 	return -1;
 }
 
@@ -588,7 +588,7 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors,
 	 * didn't use it, drop the refcount bch_bucket_alloc_set() took:
 	 */
 	if (KEY_PTRS(&alloc.key))
-		__bkey_put(c, &alloc.key);
+		bkey_put(c, &alloc.key);
 
 	for (i = 0; i < KEY_PTRS(&b->key); i++)
 		EBUG_ON(ptr_stale(c, &b->key, i));
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index fa4d0b1f6d75..7dff73ba1b71 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -180,7 +180,7 @@ static inline bool should_split(struct btree *b)
 
 /* Btree key manipulation */
 
-void __bkey_put(struct cache_set *c, struct bkey *k)
+void bkey_put(struct cache_set *c, struct bkey *k)
 {
 	unsigned i;
 
@@ -189,12 +189,6 @@ void __bkey_put(struct cache_set *c, struct bkey *k)
 			atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
 }
 
-static void bkey_put(struct cache_set *c, struct bkey *k, int level)
-{
-	if ((level && KEY_OFFSET(k)) || !level)
-		__bkey_put(c, k);
-}
-
 /* Btree IO */
 
 static uint64_t btree_csum_set(struct btree *b, struct bset *i)
@@ -1068,6 +1062,7 @@ retry:
 	if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true))
 		goto err;
 
+	bkey_put(c, &k.key);
 	SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
 
 	b = mca_alloc(c, &k.key, level);
@@ -1077,7 +1072,6 @@ retry:
 	if (!b) {
 		cache_bug(c,
 			"Tried to allocate bucket that was in btree cache");
-		__bkey_put(c, &k.key);
 		goto retry;
 	}
 
@@ -1090,7 +1084,6 @@ retry:
 	return b;
 err_free:
 	bch_bucket_free(c, &k.key);
-	__bkey_put(c, &k.key);
 err:
 	mutex_unlock(&c->bucket_lock);
 
@@ -1217,7 +1210,6 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k)
 
 	if (!IS_ERR_OR_NULL(n)) {
 		swap(b, n);
-		__bkey_put(b->c, &b->key);
 
 		memcpy(k->ptr, b->key.ptr,
 		       sizeof(uint64_t) * KEY_PTRS(&b->key));
@@ -1932,19 +1924,12 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
 			break;
 
 		if (bkey_cmp(k, &b->key) <= 0) {
-			bkey_put(b->c, k, b->level);
+			if (!b->level)
+				bkey_put(b->c, k);
 
 			ret |= btree_insert_key(b, op, k, replace_key);
 			bch_keylist_pop_front(insert_keys);
 		} else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
-#if 0
-			if (replace_key) {
-				bkey_put(b->c, k, b->level);
-				bch_keylist_pop_front(insert_keys);
-				op->insert_collision = true;
-				break;
-			}
-#endif
 			BKEY_PADDED(key) temp;
 			bkey_copy(&temp.key, insert_keys->keys);
 
@@ -2071,11 +2056,9 @@ static int btree_split(struct btree *b, struct btree_op *op,
 
 	return 0;
 err_free2:
-	__bkey_put(n2->c, &n2->key);
 	btree_node_free(n2);
 	rw_unlock(true, n2);
 err_free1:
-	__bkey_put(n1->c, &n1->key);
 	btree_node_free(n1);
 	rw_unlock(true, n1);
 err:
@@ -2225,7 +2208,7 @@ int bch_btree_insert(struct cache_set *c, struct keylist *keys,
 		pr_err("error %i", ret);
 
 		while ((k = bch_keylist_pop(keys)))
-			bkey_put(c, k, 0);
+			bkey_put(c, k);
 	} else if (op.op.insert_collision)
 		ret = -ESRCH;
 
@@ -2251,7 +2234,6 @@ void bch_btree_set_root(struct btree *b)
 	mutex_unlock(&b->c->bucket_lock);
 
 	b->c->root = b;
-	__bkey_put(b->c, &b->key);
 
 	bch_journal_meta(b->c, &cl);
 	closure_sync(&cl);
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 27e90b189112..d4b705eeec24 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -216,7 +216,7 @@ static inline struct bkey *bch_btree_iter_init(struct btree *b,
 	return __bch_btree_iter_init(b, iter, search, b->sets);
 }
 
-void __bkey_put(struct cache_set *c, struct bkey *k);
+void bkey_put(struct cache_set *c, struct bkey *k);
 
 /* Looping macros */
 
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index c67d19a8913d..05f8ccb9f8ca 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -426,7 +426,7 @@ static int __uuid_write(struct cache_set *c)
 	closure_sync(&cl);
 
 	bkey_copy(&c->uuid_bucket, &k.key);
-	__bkey_put(c, &k.key);
+	bkey_put(c, &k.key);
 	return 0;
 }
 

From d5cc66e95744065f96024add4bf7d7e019be54ac Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 24 Jul 2013 23:06:40 -0700
Subject: [PATCH 79/94] bcache: bch_(btree|extent)_ptr_invalid()

Trying to treat btree pointers and leaf node pointers the same way was a
mistake - going to start being more explicit about the type of
key/pointer we're dealing with. This is the first part of that
refactoring; this patch shouldn't change any actual behaviour.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bset.c  | 49 ++++++++++++++++++++++++++++-----------
 drivers/md/bcache/bset.h  |  4 +++-
 drivers/md/bcache/btree.h | 13 +++++++----
 drivers/md/bcache/super.c |  4 ++--
 4 files changed, 49 insertions(+), 21 deletions(-)

diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 6bffde478926..b615348c45fc 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -73,19 +73,9 @@ void bch_keylist_pop_front(struct keylist *l)
 
 /* Pointer validation */
 
-bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
+static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
 {
 	unsigned i;
-	char buf[80];
-
-	if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)))
-		goto bad;
-
-	if (!level && KEY_SIZE(k) > KEY_OFFSET(k))
-		goto bad;
-
-	if (!KEY_SIZE(k))
-		return true;
 
 	for (i = 0; i < KEY_PTRS(k); i++)
 		if (ptr_available(c, k, i)) {
@@ -96,13 +86,46 @@ bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
 			if (KEY_SIZE(k) + r > c->sb.bucket_size ||
 			    bucket <  ca->sb.first_bucket ||
 			    bucket >= ca->sb.nbuckets)
-				goto bad;
+				return true;
 		}
 
+	return false;
+}
+
+bool bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
+{
+	char buf[80];
+
+	if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
+		goto bad;
+
+	if (__ptr_invalid(c, k))
+		goto bad;
+
 	return false;
 bad:
 	bch_bkey_to_text(buf, sizeof(buf), k);
-	cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k));
+	cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
+	return true;
+}
+
+bool bch_extent_ptr_invalid(struct cache_set *c, const struct bkey *k)
+{
+	char buf[80];
+
+	if (!KEY_SIZE(k))
+		return true;
+
+	if (KEY_SIZE(k) > KEY_OFFSET(k))
+		goto bad;
+
+	if (__ptr_invalid(c, k))
+		goto bad;
+
+	return false;
+bad:
+	bch_bkey_to_text(buf, sizeof(buf), k);
+	cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k));
 	return true;
 }
 
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index a043a92d4dc9..e67386001814 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -277,7 +277,9 @@ static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
 }
 
 const char *bch_ptr_status(struct cache_set *, const struct bkey *);
-bool __bch_ptr_invalid(struct cache_set *, int level, const struct bkey *);
+bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
+bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *);
+
 bool bch_ptr_bad(struct btree *, const struct bkey *);
 
 static inline uint8_t gen_after(uint8_t a, uint8_t b)
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index d4b705eeec24..e11bb8571d24 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -204,11 +204,6 @@ static inline void set_gc_sectors(struct cache_set *c)
 	atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8);
 }
 
-static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k)
-{
-	return __bch_ptr_invalid(b->c, b->level, k);
-}
-
 static inline struct bkey *bch_btree_iter_init(struct btree *b,
 					       struct btree_iter *iter,
 					       struct bkey *search)
@@ -216,6 +211,14 @@ static inline struct bkey *bch_btree_iter_init(struct btree *b,
 	return __bch_btree_iter_init(b, iter, search, b->sets);
 }
 
+static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k)
+{
+	if (b->level)
+		return bch_btree_ptr_invalid(b->c, k);
+	else
+		return bch_extent_ptr_invalid(b->c, k);
+}
+
 void bkey_put(struct cache_set *c, struct bkey *k);
 
 /* Looping macros */
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 05f8ccb9f8ca..7ab926d94d81 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -373,7 +373,7 @@ static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
 {
 	struct bkey *k = &j->uuid_bucket;
 
-	if (__bch_ptr_invalid(c, 1, k))
+	if (bch_btree_ptr_invalid(c, k))
 		return "bad uuid pointer";
 
 	bkey_copy(&c->uuid_bucket, k);
@@ -1522,7 +1522,7 @@ static void run_cache_set(struct cache_set *c)
 		k = &j->btree_root;
 
 		err = "bad btree root";
-		if (__bch_ptr_invalid(c, j->btree_level + 1, k))
+		if (bch_btree_ptr_invalid(c, k))
 			goto err;
 
 		err = "error reading btree root";

From 0eacac22034ca21c73fe49e800d0b938b2047250 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Mon, 1 Jul 2013 19:29:05 -0700
Subject: [PATCH 80/94] bcache: PRECEDING_KEY()

btree_insert_key() was open coding this, this is just refactoring.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bset.h  | 18 ++++++++++++++++++
 drivers/md/bcache/btree.c |  9 ++-------
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index e67386001814..bc9975526aa1 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -353,12 +353,30 @@ void bch_bset_fix_lookup_table(struct btree *, struct bkey *);
 struct bkey *__bch_bset_search(struct btree *, struct bset_tree *,
 			   const struct bkey *);
 
+/*
+ * Returns the first key that is strictly greater than search
+ */
 static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t,
 					   const struct bkey *search)
 {
 	return search ? __bch_bset_search(b, t, search) : t->data->start;
 }
 
+#define PRECEDING_KEY(_k)					\
+({								\
+	struct bkey *_ret = NULL;				\
+								\
+	if (KEY_INODE(_k) || KEY_OFFSET(_k)) {			\
+		_ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0);	\
+								\
+		if (!_ret->low)					\
+			_ret->high--;				\
+		_ret->low--;					\
+	}							\
+								\
+	_ret;							\
+})
+
 bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *);
 void bch_btree_sort_lazy(struct btree *);
 void bch_btree_sort_into(struct btree *, struct btree *);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 7dff73ba1b71..773b0e929ff4 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1844,19 +1844,14 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
 
 	if (!b->level) {
 		struct btree_iter iter;
-		struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0);
 
 		/*
 		 * bset_search() returns the first key that is strictly greater
 		 * than the search key - but for back merging, we want to find
-		 * the first key that is greater than or equal to KEY_START(k) -
-		 * unless KEY_START(k) is 0.
+		 * the previous key.
 		 */
-		if (KEY_OFFSET(&search))
-			SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1);
-
 		prev = NULL;
-		m = bch_btree_iter_init(b, &iter, &search);
+		m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k)));
 
 		if (fix_overlapping_extents(b, k, &iter, replace_key)) {
 			op->insert_collision = true;

From f269af5a078302712de8ee70d273eba2eb4485ca Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Tue, 23 Jul 2013 20:48:29 -0700
Subject: [PATCH 81/94] bcache: Add btree_node_write_sync()

More refactoring - mostly making the interfaces more explicit about what
we actually want to do.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/btree.c | 35 ++++++++++++++++-------------------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 773b0e929ff4..7d6204c41840 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -474,6 +474,15 @@ void bch_btree_node_write(struct btree *b, struct closure *parent)
 		bch_bset_init_next(b);
 }
 
+static void bch_btree_node_write_sync(struct btree *b)
+{
+	struct closure cl;
+
+	closure_init_stack(&cl);
+	bch_btree_node_write(b, &cl);
+	closure_sync(&cl);
+}
+
 static void btree_node_write_work(struct work_struct *w)
 {
 	struct btree *b = container_of(to_delayed_work(w), struct btree, work);
@@ -655,10 +664,8 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush)
 		return -ENOMEM;
 	}
 
-	if (btree_node_dirty(b)) {
-		bch_btree_node_write(b, &cl);
-		closure_sync(&cl);
-	}
+	if (btree_node_dirty(b))
+		bch_btree_node_write_sync(b);
 
 	/* wait for any in flight btree write */
 	closure_wait_event(&b->io.wait, &cl,
@@ -1411,9 +1418,6 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
 	struct btree *n = NULL;
 	unsigned keys = 0;
 	int ret = 0, stale = btree_gc_mark_node(b, &keys, gc);
-	struct closure cl;
-
-	closure_init_stack(&cl);
 
 	if (b->level || stale > 10)
 		n = btree_node_alloc_replacement(b);
@@ -1424,12 +1428,10 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
 	if (b->level)
 		ret = btree_gc_recurse(b, op, writes, gc);
 
-	if (!b->written || btree_node_dirty(b)) {
-		bch_btree_node_write(b, n ? &cl : NULL);
-	}
+	if (!b->written || btree_node_dirty(b))
+		bch_btree_node_write_sync(b);
 
 	if (!IS_ERR_OR_NULL(n)) {
-		closure_sync(&cl);
 		bch_btree_set_root(b);
 		btree_node_free(n);
 		rw_unlock(true, b);
@@ -2104,15 +2106,10 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
 
 			if (bch_btree_insert_keys(b, op, insert_keys,
 						  replace_key)) {
-				if (!b->level) {
+				if (!b->level)
 					bch_btree_leaf_dirty(b, journal_ref);
-				} else {
-					struct closure cl;
-
-					closure_init_stack(&cl);
-					bch_btree_node_write(b, &cl);
-					closure_sync(&cl);
-				}
+				else
+					bch_btree_node_write_sync(b);
 			}
 		}
 	} while (!bch_keylist_empty(&split_keys));

From 8835c1234dd9a838993a2d5cb7572f57992ebbee Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 24 Jul 2013 23:18:05 -0700
Subject: [PATCH 82/94] bcache: Add make_btree_freeing_key()

Refactoring, prep work for incremental garbage collection.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/btree.c | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 7d6204c41840..a3f8ca4ee6e0 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1107,6 +1107,22 @@ static struct btree *btree_node_alloc_replacement(struct btree *b)
 	return n;
 }
 
+static void make_btree_freeing_key(struct btree *b, struct bkey *k)
+{
+	unsigned i;
+
+	bkey_copy(k, &b->key);
+	bkey_copy_key(k, &ZERO_KEY);
+
+	for (i = 0; i < KEY_PTRS(k); i++) {
+		uint8_t g = PTR_BUCKET(b->c, k, i)->gen + 1;
+
+		SET_PTR_GEN(k, i, g);
+	}
+
+	atomic_inc(&b->c->prio_blocked);
+}
+
 /* Garbage collection */
 
 uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
@@ -2030,20 +2046,9 @@ static int btree_split(struct btree *b, struct btree_op *op,
 		closure_sync(&cl);
 		bch_btree_set_root(n1);
 	} else {
-		unsigned i;
-
-		bkey_copy(parent_keys->top, &b->key);
-		bkey_copy_key(parent_keys->top, &ZERO_KEY);
-
-		for (i = 0; i < KEY_PTRS(&b->key); i++) {
-			uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1;
-
-			SET_PTR_GEN(parent_keys->top, i, g);
-		}
-
-		bch_keylist_push(parent_keys);
 		closure_sync(&cl);
-		atomic_inc(&b->c->prio_blocked);
+		make_btree_freeing_key(b, parent_keys->top);
+		bch_keylist_push(parent_keys);
 	}
 
 	rw_unlock(true, n1);

From a1f0358b2bf69be216cb6e4ea40fe7ae4d38b8a6 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Tue, 10 Sep 2013 19:07:00 -0700
Subject: [PATCH 83/94] bcache: Incremental gc

Big garbage collection rewrite; now, garbage collection uses the same
mechanisms as used elsewhere for inserting/updating btree node pointers,
instead of rewriting interior btree nodes in place.

This makes the code significantly cleaner and less fragile, and means we
can now make garbage collection incremental - it doesn't have to hold a
write lock on the root of the btree for the entire duration of garbage
collection.

This means that there's less of a latency hit for doing garbage
collection, which means we can gc more frequently (and do a better job
of reclaiming from the cache), and we can coalesce across more btree
nodes (improving our space efficiency).

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bcache.h |   1 -
 drivers/md/bcache/btree.c  | 410 +++++++++++++++++++++----------------
 drivers/md/bcache/btree.h  |   2 +-
 drivers/md/bcache/sysfs.c  |   2 -
 4 files changed, 237 insertions(+), 178 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d03bc6f66493..d6970a651e42 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -477,7 +477,6 @@ struct gc_stat {
 
 	size_t			nkeys;
 	uint64_t		data;	/* sectors */
-	uint64_t		dirty;	/* sectors */
 	unsigned		in_use; /* percent */
 };
 
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index a3f8ca4ee6e0..7d283d217438 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1176,12 +1176,10 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
 
 #define btree_mark_key(b, k)	__bch_btree_mark_key(b->c, b->level, k)
 
-static int btree_gc_mark_node(struct btree *b, unsigned *keys,
-			      struct gc_stat *gc)
+static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
 {
 	uint8_t stale = 0;
-	unsigned last_dev = -1;
-	struct bcache_device *d = NULL;
+	unsigned keys = 0, good_keys = 0;
 	struct bkey *k;
 	struct btree_iter iter;
 	struct bset_tree *t;
@@ -1189,27 +1187,17 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys,
 	gc->nodes++;
 
 	for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
-		if (last_dev != KEY_INODE(k)) {
-			last_dev = KEY_INODE(k);
-
-			d = KEY_INODE(k) < b->c->nr_uuids
-				? b->c->devices[last_dev]
-				: NULL;
-		}
-
 		stale = max(stale, btree_mark_key(b, k));
+		keys++;
 
 		if (bch_ptr_bad(b, k))
 			continue;
 
-		*keys += bkey_u64s(k);
-
 		gc->key_bytes += bkey_u64s(k);
 		gc->nkeys++;
+		good_keys++;
 
 		gc->data += KEY_SIZE(k);
-		if (KEY_DIRTY(k))
-			gc->dirty += KEY_SIZE(k);
 	}
 
 	for (t = b->sets; t <= &b->sets[b->nsets]; t++)
@@ -1218,94 +1206,63 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys,
 			     bkey_cmp(&b->key, &t->end) < 0,
 			     b, "found short btree key in gc");
 
-	return stale;
+	if (b->c->gc_always_rewrite)
+		return true;
+
+	if (stale > 10)
+		return true;
+
+	if ((keys - good_keys) * 2 > keys)
+		return true;
+
+	return false;
 }
 
-static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k)
-{
-	/*
-	 * We block priorities from being written for the duration of garbage
-	 * collection, so we can't sleep in btree_alloc() ->
-	 * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it
-	 * our closure.
-	 */
-	struct btree *n = btree_node_alloc_replacement(b);
-
-	if (!IS_ERR_OR_NULL(n)) {
-		swap(b, n);
-
-		memcpy(k->ptr, b->key.ptr,
-		       sizeof(uint64_t) * KEY_PTRS(&b->key));
-
-		btree_node_free(n);
-		up_write(&n->lock);
-	}
-
-	return b;
-}
-
-/*
- * Leaving this at 2 until we've got incremental garbage collection done; it
- * could be higher (and has been tested with 4) except that garbage collection
- * could take much longer, adversely affecting latency.
- */
-#define GC_MERGE_NODES	2U
+#define GC_MERGE_NODES	4U
 
 struct gc_merge_info {
 	struct btree	*b;
-	struct bkey	*k;
 	unsigned	keys;
 };
 
-static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc,
-			      struct gc_merge_info *r)
-{
-	unsigned nodes = 0, keys = 0, blocks;
-	int i;
-	struct closure cl;
+static int bch_btree_insert_node(struct btree *, struct btree_op *,
+				 struct keylist *, atomic_t *, struct bkey *);
 
+static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
+			     struct keylist *keylist, struct gc_stat *gc,
+			     struct gc_merge_info *r)
+{
+	unsigned i, nodes = 0, keys = 0, blocks;
+	struct btree *new_nodes[GC_MERGE_NODES];
+	struct closure cl;
+	struct bkey *k;
+
+	memset(new_nodes, 0, sizeof(new_nodes));
 	closure_init_stack(&cl);
 
-	while (nodes < GC_MERGE_NODES && r[nodes].b)
+	while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
 		keys += r[nodes++].keys;
 
 	blocks = btree_default_blocks(b->c) * 2 / 3;
 
 	if (nodes < 2 ||
 	    __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
-		return;
+		return 0;
 
-	for (i = nodes - 1; i >= 0; --i) {
-		if (r[i].b->written)
-			r[i].b = btree_gc_alloc(r[i].b, r[i].k);
-
-		if (r[i].b->written)
-			return;
+	for (i = 0; i < nodes; i++) {
+		new_nodes[i] = btree_node_alloc_replacement(r[i].b);
+		if (IS_ERR_OR_NULL(new_nodes[i]))
+			goto out_nocoalesce;
 	}
 
 	for (i = nodes - 1; i > 0; --i) {
-		struct bset *n1 = r[i].b->sets->data;
-		struct bset *n2 = r[i - 1].b->sets->data;
+		struct bset *n1 = new_nodes[i]->sets->data;
+		struct bset *n2 = new_nodes[i - 1]->sets->data;
 		struct bkey *k, *last = NULL;
 
 		keys = 0;
 
-		if (i == 1) {
-			/*
-			 * Last node we're not getting rid of - we're getting
-			 * rid of the node at r[0]. Have to try and fit all of
-			 * the remaining keys into this node; we can't ensure
-			 * they will always fit due to rounding and variable
-			 * length keys (shouldn't be possible in practice,
-			 * though)
-			 */
-			if (__set_blocks(n1, n1->keys + r->keys,
-					 b->c) > btree_blocks(r[i].b))
-				return;
-
-			keys = n2->keys;
-			last = &r->b->key;
-		} else
+		if (i > 1) {
 			for (k = n2->start;
 			     k < end(n2);
 			     k = bkey_next(k)) {
@@ -1316,20 +1273,36 @@ static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc,
 				last = k;
 				keys += bkey_u64s(k);
 			}
+		} else {
+			/*
+			 * Last node we're not getting rid of - we're getting
+			 * rid of the node at r[0]. Have to try and fit all of
+			 * the remaining keys into this node; we can't ensure
+			 * they will always fit due to rounding and variable
+			 * length keys (shouldn't be possible in practice,
+			 * though)
+			 */
+			if (__set_blocks(n1, n1->keys + n2->keys,
+					 b->c) > btree_blocks(new_nodes[i]))
+				goto out_nocoalesce;
+
+			keys = n2->keys;
+			/* Take the key of the node we're getting rid of */
+			last = &r->b->key;
+		}
 
 		BUG_ON(__set_blocks(n1, n1->keys + keys,
-				    b->c) > btree_blocks(r[i].b));
+				    b->c) > btree_blocks(new_nodes[i]));
 
-		if (last) {
-			bkey_copy_key(&r[i].b->key, last);
-			bkey_copy_key(r[i].k, last);
-		}
+		if (last)
+			bkey_copy_key(&new_nodes[i]->key, last);
 
 		memcpy(end(n1),
 		       n2->start,
 		       (void *) node(n2, keys) - (void *) n2->start);
 
 		n1->keys += keys;
+		r[i].keys = n1->keys;
 
 		memmove(n2->start,
 			node(n2, keys),
@@ -1337,93 +1310,175 @@ static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc,
 
 		n2->keys -= keys;
 
-		r[i].keys	= n1->keys;
-		r[i - 1].keys	= n2->keys;
+		if (bch_keylist_realloc(keylist,
+					KEY_PTRS(&new_nodes[i]->key), b->c))
+			goto out_nocoalesce;
+
+		bch_btree_node_write(new_nodes[i], &cl);
+		bch_keylist_add(keylist, &new_nodes[i]->key);
 	}
 
-	btree_node_free(r->b);
-	up_write(&r->b->lock);
+	for (i = 0; i < nodes; i++) {
+		if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c))
+			goto out_nocoalesce;
+
+		make_btree_freeing_key(r[i].b, keylist->top);
+		bch_keylist_push(keylist);
+	}
+
+	/* We emptied out this node */
+	BUG_ON(new_nodes[0]->sets->data->keys);
+	btree_node_free(new_nodes[0]);
+	rw_unlock(true, new_nodes[0]);
+
+	closure_sync(&cl);
+
+	for (i = 0; i < nodes; i++) {
+		btree_node_free(r[i].b);
+		rw_unlock(true, r[i].b);
+
+		r[i].b = new_nodes[i];
+	}
+
+	bch_btree_insert_node(b, op, keylist, NULL, NULL);
+	BUG_ON(!bch_keylist_empty(keylist));
+
+	memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
+	r[nodes - 1].b = ERR_PTR(-EINTR);
 
 	trace_bcache_btree_gc_coalesce(nodes);
-
 	gc->nodes--;
-	nodes--;
 
-	memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes);
-	memset(&r[nodes], 0, sizeof(struct gc_merge_info));
+	/* Invalidated our iterator */
+	return -EINTR;
+
+out_nocoalesce:
+	closure_sync(&cl);
+
+	while ((k = bch_keylist_pop(keylist)))
+		if (!bkey_cmp(k, &ZERO_KEY))
+			atomic_dec(&b->c->prio_blocked);
+
+	for (i = 0; i < nodes; i++)
+		if (!IS_ERR_OR_NULL(new_nodes[i])) {
+			btree_node_free(new_nodes[i]);
+			rw_unlock(true, new_nodes[i]);
+		}
+	return 0;
+}
+
+static unsigned btree_gc_count_keys(struct btree *b)
+{
+	struct bkey *k;
+	struct btree_iter iter;
+	unsigned ret = 0;
+
+	for_each_key_filter(b, k, &iter, bch_ptr_bad)
+		ret += bkey_u64s(k);
+
+	return ret;
 }
 
 static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 			    struct closure *writes, struct gc_stat *gc)
 {
-	void write(struct btree *r)
-	{
-		if (!r->written || btree_node_dirty(r))
-			bch_btree_node_write(r, writes);
-
-		up_write(&r->lock);
-	}
-
-	int ret = 0, stale;
 	unsigned i;
+	int ret = 0;
+	bool should_rewrite;
+	struct btree *n;
+	struct bkey *k;
+	struct keylist keys;
+	struct btree_iter iter;
 	struct gc_merge_info r[GC_MERGE_NODES];
+	struct gc_merge_info *last = r + GC_MERGE_NODES - 1;
 
-	memset(r, 0, sizeof(r));
+	bch_keylist_init(&keys);
+	bch_btree_iter_init(b, &iter, &b->c->gc_done);
 
-	while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) {
-		r->b = bch_btree_node_get(b->c, r->k, b->level - 1, true);
+	for (i = 0; i < GC_MERGE_NODES; i++)
+		r[i].b = ERR_PTR(-EINTR);
 
-		if (IS_ERR(r->b)) {
-			ret = PTR_ERR(r->b);
-			break;
+	while (1) {
+		k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
+		if (k) {
+			r->b = bch_btree_node_get(b->c, k, b->level - 1, true);
+			if (IS_ERR(r->b)) {
+				ret = PTR_ERR(r->b);
+				break;
+			}
+
+			r->keys = btree_gc_count_keys(r->b);
+
+			ret = btree_gc_coalesce(b, op, &keys, gc, r);
+			if (ret)
+				break;
 		}
 
-		r->keys	= 0;
-		stale = btree_gc_mark_node(r->b, &r->keys, gc);
-
-		if (!b->written &&
-		    (r->b->level || stale > 10 ||
-		     b->c->gc_always_rewrite))
-			r->b = btree_gc_alloc(r->b, r->k);
-
-		if (r->b->level)
-			ret = btree_gc_recurse(r->b, op, writes, gc);
-
-		if (ret) {
-			write(r->b);
+		if (!last->b)
 			break;
+
+		if (!IS_ERR(last->b)) {
+			should_rewrite = btree_gc_mark_node(last->b, gc);
+			if (should_rewrite) {
+				n = btree_node_alloc_replacement(last->b);
+
+				if (!IS_ERR_OR_NULL(n)) {
+					bch_btree_node_write_sync(n);
+					bch_keylist_add(&keys, &n->key);
+
+					make_btree_freeing_key(last->b,
+							       keys.top);
+					bch_keylist_push(&keys);
+
+					btree_node_free(last->b);
+
+					bch_btree_insert_node(b, op, &keys,
+							      NULL, NULL);
+					BUG_ON(!bch_keylist_empty(&keys));
+
+					rw_unlock(true, last->b);
+					last->b = n;
+
+					/* Invalidated our iterator */
+					ret = -EINTR;
+					break;
+				}
+			}
+
+			if (last->b->level) {
+				ret = btree_gc_recurse(last->b, op, writes, gc);
+				if (ret)
+					break;
+			}
+
+			bkey_copy_key(&b->c->gc_done, &last->b->key);
+
+			/*
+			 * Must flush leaf nodes before gc ends, since replace
+			 * operations aren't journalled
+			 */
+			if (btree_node_dirty(last->b))
+				bch_btree_node_write(last->b, writes);
+			rw_unlock(true, last->b);
 		}
 
-		bkey_copy_key(&b->c->gc_done, r->k);
+		memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
+		r->b = NULL;
 
-		if (!b->written)
-			btree_gc_coalesce(b, gc, r);
-
-		if (r[GC_MERGE_NODES - 1].b)
-			write(r[GC_MERGE_NODES - 1].b);
-
-		memmove(&r[1], &r[0],
-			sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1));
-
-		/* When we've got incremental GC working, we'll want to do
-		 * if (should_resched())
-		 *	return -EAGAIN;
-		 */
-		cond_resched();
-#if 0
 		if (need_resched()) {
 			ret = -EAGAIN;
 			break;
 		}
-#endif
 	}
 
-	for (i = 1; i < GC_MERGE_NODES && r[i].b; i++)
-		write(r[i].b);
+	for (i = 0; i < GC_MERGE_NODES; i++)
+		if (!IS_ERR_OR_NULL(r[i].b)) {
+			if (btree_node_dirty(r[i].b))
+				bch_btree_node_write(r[i].b, writes);
+			rw_unlock(true, r[i].b);
+		}
 
-	/* Might have freed some children, must remove their keys */
-	if (!b->written)
-		bch_btree_sort(b);
+	bch_keylist_free(&keys);
 
 	return ret;
 }
@@ -1432,27 +1487,31 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
 			     struct closure *writes, struct gc_stat *gc)
 {
 	struct btree *n = NULL;
-	unsigned keys = 0;
-	int ret = 0, stale = btree_gc_mark_node(b, &keys, gc);
+	int ret = 0;
+	bool should_rewrite;
 
-	if (b->level || stale > 10)
+	should_rewrite = btree_gc_mark_node(b, gc);
+	if (should_rewrite) {
 		n = btree_node_alloc_replacement(b);
 
-	if (!IS_ERR_OR_NULL(n))
-		swap(b, n);
+		if (!IS_ERR_OR_NULL(n)) {
+			bch_btree_node_write_sync(n);
+			bch_btree_set_root(n);
+			btree_node_free(b);
+			rw_unlock(true, n);
 
-	if (b->level)
-		ret = btree_gc_recurse(b, op, writes, gc);
-
-	if (!b->written || btree_node_dirty(b))
-		bch_btree_node_write_sync(b);
-
-	if (!IS_ERR_OR_NULL(n)) {
-		bch_btree_set_root(b);
-		btree_node_free(n);
-		rw_unlock(true, b);
+			return -EINTR;
+		}
 	}
 
+	if (b->level) {
+		ret = btree_gc_recurse(b, op, writes, gc);
+		if (ret)
+			return ret;
+	}
+
+	bkey_copy_key(&b->c->gc_done, &b->key);
+
 	return ret;
 }
 
@@ -1550,29 +1609,20 @@ static void bch_btree_gc(struct cache_set *c)
 
 	btree_gc_start(c);
 
-	atomic_inc(&c->prio_blocked);
+	do {
+		ret = btree_root(gc_root, c, &op, &writes, &stats);
+		closure_sync(&writes);
 
-	ret = btree_root(gc_root, c, &op, &writes, &stats);
-	closure_sync(&writes);
-
-	if (ret) {
-		pr_warn("gc failed!");
-		return;
-	}
-
-	/* Possibly wait for new UUIDs or whatever to hit disk */
-	bch_journal_meta(c, &writes);
-	closure_sync(&writes);
+		if (ret && ret != -EAGAIN)
+			pr_warn("gc failed!");
+	} while (ret);
 
 	available = bch_btree_gc_finish(c);
-
-	atomic_dec(&c->prio_blocked);
 	wake_up_allocators(c);
 
 	bch_time_stats_update(&c->btree_gc_time, start_time);
 
 	stats.key_bytes *= sizeof(uint64_t);
-	stats.dirty	<<= 9;
 	stats.data	<<= 9;
 	stats.in_use	= (c->nbuckets - available) * 100 / c->nbuckets;
 	memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
@@ -1585,14 +1635,28 @@ static void bch_btree_gc(struct cache_set *c)
 static int bch_gc_thread(void *arg)
 {
 	struct cache_set *c = arg;
+	struct cache *ca;
+	unsigned i;
 
 	while (1) {
+again:
 		bch_btree_gc(c);
 
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (kthread_should_stop())
 			break;
 
+		mutex_lock(&c->bucket_lock);
+
+		for_each_cache(ca, c, i)
+			if (ca->invalidate_needs_gc) {
+				mutex_unlock(&c->bucket_lock);
+				set_current_state(TASK_RUNNING);
+				goto again;
+			}
+
+		mutex_unlock(&c->bucket_lock);
+
 		try_to_freeze();
 		schedule();
 	}
@@ -2083,8 +2147,6 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
 
 	bch_keylist_init(&split_keys);
 
-	BUG_ON(b->level);
-
 	do {
 		BUG_ON(b->level && replace_key);
 
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index e11bb8571d24..b5a46affe8eb 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -201,7 +201,7 @@ static inline bool bkey_written(struct btree *b, struct bkey *k)
 
 static inline void set_gc_sectors(struct cache_set *c)
 {
-	atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8);
+	atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16);
 }
 
 static inline struct bkey *bch_btree_iter_init(struct btree *b,
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 9687771ec6f3..c5f73e34d016 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -489,7 +489,6 @@ lock_root:
 
 	sysfs_print(btree_used_percent,	btree_used(c));
 	sysfs_print(btree_nodes,	c->gc_stats.nodes);
-	sysfs_hprint(dirty_data,	c->gc_stats.dirty);
 	sysfs_hprint(average_key_size,	average_key_size(c));
 
 	sysfs_print(cache_read_races,
@@ -642,7 +641,6 @@ static struct attribute *bch_cache_set_files[] = {
 	&sysfs_cache_available_percent,
 
 	&sysfs_average_key_size,
-	&sysfs_dirty_data,
 
 	&sysfs_errors,
 	&sysfs_io_error_limit,

From bc9389eefe479b7b7b323c2729b61a7155d2d0ea Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Tue, 10 Sep 2013 19:07:35 -0700
Subject: [PATCH 84/94] bcache: Avoid deadlocking in garbage collection

Not a complete fix - we could still deadlock if btree_insert_node() has
to split...

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/btree.c | 21 +++++++++++----------
 drivers/md/bcache/btree.h |  2 +-
 drivers/md/bcache/super.c |  2 +-
 3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 7d283d217438..bf8f5174b253 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1059,14 +1059,14 @@ static void btree_node_free(struct btree *b)
 	mutex_unlock(&b->c->bucket_lock);
 }
 
-struct btree *bch_btree_node_alloc(struct cache_set *c, int level)
+struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
 {
 	BKEY_PADDED(key) k;
 	struct btree *b = ERR_PTR(-EAGAIN);
 
 	mutex_lock(&c->bucket_lock);
 retry:
-	if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true))
+	if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait))
 		goto err;
 
 	bkey_put(c, &k.key);
@@ -1098,9 +1098,9 @@ err:
 	return b;
 }
 
-static struct btree *btree_node_alloc_replacement(struct btree *b)
+static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait)
 {
-	struct btree *n = bch_btree_node_alloc(b->c, b->level);
+	struct btree *n = bch_btree_node_alloc(b->c, b->level, wait);
 	if (!IS_ERR_OR_NULL(n))
 		bch_btree_sort_into(b, n);
 
@@ -1250,7 +1250,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 		return 0;
 
 	for (i = 0; i < nodes; i++) {
-		new_nodes[i] = btree_node_alloc_replacement(r[i].b);
+		new_nodes[i] = btree_node_alloc_replacement(r[i].b, false);
 		if (IS_ERR_OR_NULL(new_nodes[i]))
 			goto out_nocoalesce;
 	}
@@ -1420,7 +1420,8 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 		if (!IS_ERR(last->b)) {
 			should_rewrite = btree_gc_mark_node(last->b, gc);
 			if (should_rewrite) {
-				n = btree_node_alloc_replacement(last->b);
+				n = btree_node_alloc_replacement(last->b,
+								 false);
 
 				if (!IS_ERR_OR_NULL(n)) {
 					bch_btree_node_write_sync(n);
@@ -1492,7 +1493,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
 
 	should_rewrite = btree_gc_mark_node(b, gc);
 	if (should_rewrite) {
-		n = btree_node_alloc_replacement(b);
+		n = btree_node_alloc_replacement(b, false);
 
 		if (!IS_ERR_OR_NULL(n)) {
 			bch_btree_node_write_sync(n);
@@ -2038,7 +2039,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
 
 	closure_init_stack(&cl);
 
-	n1 = btree_node_alloc_replacement(b);
+	n1 = btree_node_alloc_replacement(b, true);
 	if (IS_ERR(n1))
 		goto err;
 
@@ -2049,12 +2050,12 @@ static int btree_split(struct btree *b, struct btree_op *op,
 
 		trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
 
-		n2 = bch_btree_node_alloc(b->c, b->level);
+		n2 = bch_btree_node_alloc(b->c, b->level, true);
 		if (IS_ERR(n2))
 			goto err_free1;
 
 		if (!b->parent) {
-			n3 = bch_btree_node_alloc(b->c, b->level + 1);
+			n3 = bch_btree_node_alloc(b->c, b->level + 1, true);
 			if (IS_ERR(n3))
 				goto err_free2;
 		}
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index b5a46affe8eb..767e75570896 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -271,7 +271,7 @@ void bch_btree_node_read(struct btree *);
 void bch_btree_node_write(struct btree *, struct closure *);
 
 void bch_btree_set_root(struct btree *);
-struct btree *bch_btree_node_alloc(struct cache_set *, int);
+struct btree *bch_btree_node_alloc(struct cache_set *, int, bool);
 struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool);
 
 int bch_btree_insert_check_key(struct btree *, struct btree_op *,
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 7ab926d94d81..e21200e98da6 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1601,7 +1601,7 @@ static void run_cache_set(struct cache_set *c)
 			goto err;
 
 		err = "cannot allocate new btree root";
-		c->root = bch_btree_node_alloc(c, 0);
+		c->root = bch_btree_node_alloc(c, 0, true);
 		if (IS_ERR_OR_NULL(c->root))
 			goto err;
 

From 50310164bcd789eb3690f45a9baf8a507bf93358 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Tue, 10 Sep 2013 17:18:59 -0700
Subject: [PATCH 85/94] bcache: Kill bch_next_recurse_key()

This dates from before the btree iterator, and now it's finally gone

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bset.c  |  8 --------
 drivers/md/bcache/bset.h  |  1 -
 drivers/md/bcache/btree.c | 23 +++++++++++------------
 3 files changed, 11 insertions(+), 21 deletions(-)

diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index b615348c45fc..b0fe0dc59ee6 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -952,14 +952,6 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
 	return ret;
 }
 
-struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search)
-{
-	struct btree_iter iter;
-
-	bch_btree_iter_init(b, &iter, search);
-	return bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
-}
-
 /* Mergesort */
 
 static void sort_key_next(struct btree_iter *iter,
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index bc9975526aa1..1d3c24f9fa0e 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -303,7 +303,6 @@ static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
 
 typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *);
 
-struct bkey *bch_next_recurse_key(struct btree *, struct bkey *);
 struct bkey *bch_btree_iter_next(struct btree_iter *);
 struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
 					struct btree *, ptr_filter_fn);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index bf8f5174b253..adc5bb0d8e92 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1680,9 +1680,9 @@ int bch_gc_thread_start(struct cache_set *c)
 static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
 				   unsigned long **seen)
 {
-	int ret;
+	int ret = 0;
 	unsigned i;
-	struct bkey *k;
+	struct bkey *k, *p = NULL;
 	struct bucket *g;
 	struct btree_iter iter;
 
@@ -1709,19 +1709,18 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
 	}
 
 	if (b->level) {
-		k = bch_next_recurse_key(b, &ZERO_KEY);
+		bch_btree_iter_init(b, &iter, NULL);
+
+		do {
+			k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
+			if (k)
+				btree_node_prefetch(b->c, k, b->level - 1);
 
-		while (k) {
-			struct bkey *p = bch_next_recurse_key(b, k);
 			if (p)
-				btree_node_prefetch(b->c, p, b->level - 1);
+				ret = btree(check_recurse, p, b, op, seen);
 
-			ret = btree(check_recurse, k, b, op, seen);
-			if (ret)
-				return ret;
-
-			k = p;
-		}
+			p = k;
+		} while (p && !ret);
 	}
 
 	return 0;

From 8aee122071a69ca6fa3314da7713bdf0b61dc07c Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Tue, 30 Jul 2013 22:34:40 -0700
Subject: [PATCH 86/94] bcache: Kill sequential_merge option

It never really made sense to expose this, so just kill it.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bcache.h  |  1 -
 drivers/md/bcache/request.c | 43 ++++++++++++++++---------------------
 drivers/md/bcache/super.c   |  1 -
 drivers/md/bcache/sysfs.c   |  4 ----
 4 files changed, 18 insertions(+), 31 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d6970a651e42..322735547eab 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -364,7 +364,6 @@ struct cached_dev {
 	unsigned		sequential_cutoff;
 	unsigned		readahead;
 
-	unsigned		sequential_merge:1;
 	unsigned		verify:1;
 
 	unsigned		partial_stripes_expensive:1;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 932300f18973..f645da61189a 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -510,6 +510,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 	unsigned mode = cache_mode(dc, bio);
 	unsigned sectors, congested = bch_get_congested(c);
 	struct task_struct *task = current;
+	struct io *i;
 
 	if (atomic_read(&dc->disk.detaching) ||
 	    c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
@@ -536,38 +537,30 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 	    (bio->bi_rw & REQ_SYNC))
 		goto rescale;
 
-	if (dc->sequential_merge) {
-		struct io *i;
+	spin_lock(&dc->io_lock);
 
-		spin_lock(&dc->io_lock);
+	hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash)
+		if (i->last == bio->bi_sector &&
+		    time_before(jiffies, i->jiffies))
+			goto found;
 
-		hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash)
-			if (i->last == bio->bi_sector &&
-			    time_before(jiffies, i->jiffies))
-				goto found;
+	i = list_first_entry(&dc->io_lru, struct io, lru);
 
-		i = list_first_entry(&dc->io_lru, struct io, lru);
-
-		add_sequential(task);
-		i->sequential = 0;
+	add_sequential(task);
+	i->sequential = 0;
 found:
-		if (i->sequential + bio->bi_size > i->sequential)
-			i->sequential	+= bio->bi_size;
+	if (i->sequential + bio->bi_size > i->sequential)
+		i->sequential	+= bio->bi_size;
 
-		i->last			 = bio_end_sector(bio);
-		i->jiffies		 = jiffies + msecs_to_jiffies(5000);
-		task->sequential_io	 = i->sequential;
+	i->last			 = bio_end_sector(bio);
+	i->jiffies		 = jiffies + msecs_to_jiffies(5000);
+	task->sequential_io	 = i->sequential;
 
-		hlist_del(&i->hash);
-		hlist_add_head(&i->hash, iohash(dc, i->last));
-		list_move_tail(&i->lru, &dc->io_lru);
+	hlist_del(&i->hash);
+	hlist_add_head(&i->hash, iohash(dc, i->last));
+	list_move_tail(&i->lru, &dc->io_lru);
 
-		spin_unlock(&dc->io_lock);
-	} else {
-		task->sequential_io = bio->bi_size;
-
-		add_sequential(task);
-	}
+	spin_unlock(&dc->io_lock);
 
 	sectors = max(task->sequential_io,
 		      task->sequential_io_avg) >> 9;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index e21200e98da6..041dd9d1d882 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1079,7 +1079,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
 	spin_lock_init(&dc->io_lock);
 	bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
 
-	dc->sequential_merge		= true;
 	dc->sequential_cutoff		= 4 << 20;
 
 	for (io = dc->io; io < dc->io + RECENT_IO; io++) {
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index c5f73e34d016..4b672449ffaf 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -72,7 +72,6 @@ rw_attribute(congested_read_threshold_us);
 rw_attribute(congested_write_threshold_us);
 
 rw_attribute(sequential_cutoff);
-rw_attribute(sequential_merge);
 rw_attribute(data_csum);
 rw_attribute(cache_mode);
 rw_attribute(writeback_metadata);
@@ -161,7 +160,6 @@ SHOW(__bch_cached_dev)
 	sysfs_hprint(stripe_size,	dc->disk.stripe_size << 9);
 	var_printf(partial_stripes_expensive,	"%u");
 
-	var_printf(sequential_merge,	"%i");
 	var_hprint(sequential_cutoff);
 	var_hprint(readahead);
 
@@ -207,7 +205,6 @@ STORE(__cached_dev)
 			    dc->writeback_rate_p_term_inverse, 1, INT_MAX);
 	d_strtoul(writeback_rate_d_smooth);
 
-	d_strtoul(sequential_merge);
 	d_strtoi_h(sequential_cutoff);
 	d_strtoi_h(readahead);
 
@@ -319,7 +316,6 @@ static struct attribute *bch_cached_dev_files[] = {
 	&sysfs_stripe_size,
 	&sysfs_partial_stripes_expensive,
 	&sysfs_sequential_cutoff,
-	&sysfs_sequential_merge,
 	&sysfs_clear_stats,
 	&sysfs_running,
 	&sysfs_state,

From 65d22e911bfc4f46cda4751f1b1926b43c316c14 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 31 Jul 2013 00:03:54 -0700
Subject: [PATCH 87/94] bcache: Move spinlock into struct time_stats

Minor cleanup.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bcache.h |  2 --
 drivers/md/bcache/bset.c   |  7 +------
 drivers/md/bcache/btree.c  |  3 ---
 drivers/md/bcache/super.c  |  8 ++++++--
 drivers/md/bcache/util.c   | 12 +++++++++---
 drivers/md/bcache/util.h   |  1 +
 6 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 322735547eab..816d07958fac 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -666,11 +666,9 @@ struct cache_set {
 	unsigned		congested_read_threshold_us;
 	unsigned		congested_write_threshold_us;
 
-	spinlock_t		sort_time_lock;
 	struct time_stats	sort_time;
 	struct time_stats	btree_gc_time;
 	struct time_stats	btree_split_time;
-	spinlock_t		btree_read_time_lock;
 	struct time_stats	btree_read_time;
 	struct time_stats	try_harder_time;
 
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index b0fe0dc59ee6..14573391206b 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -1077,11 +1077,8 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter,
 	if (b->written)
 		bset_build_written_tree(b);
 
-	if (!start) {
-		spin_lock(&b->c->sort_time_lock);
+	if (!start)
 		bch_time_stats_update(&b->c->sort_time, start_time);
-		spin_unlock(&b->c->sort_time_lock);
-	}
 }
 
 void bch_btree_sort_partial(struct btree *b, unsigned start)
@@ -1128,9 +1125,7 @@ void bch_btree_sort_into(struct btree *b, struct btree *new)
 
 	btree_mergesort(b, new->sets->data, &iter, false, true);
 
-	spin_lock(&b->c->sort_time_lock);
 	bch_time_stats_update(&b->c->sort_time, start_time);
-	spin_unlock(&b->c->sort_time_lock);
 
 	bkey_copy_key(&new->key, &b->key);
 	new->sets->size = 0;
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index adc5bb0d8e92..1a7530cd1407 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -317,10 +317,7 @@ void bch_btree_node_read(struct btree *b)
 		goto err;
 
 	bch_btree_node_read_done(b);
-
-	spin_lock(&b->c->btree_read_time_lock);
 	bch_time_stats_update(&b->c->btree_read_time, start_time);
-	spin_unlock(&b->c->btree_read_time_lock);
 
 	return;
 err:
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 041dd9d1d882..4813ef67cef5 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1435,9 +1435,13 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 	init_waitqueue_head(&c->try_wait);
 	init_waitqueue_head(&c->bucket_wait);
 	closure_init_unlocked(&c->uuid_write);
-	spin_lock_init(&c->sort_time_lock);
 	mutex_init(&c->sort_lock);
-	spin_lock_init(&c->btree_read_time_lock);
+
+	spin_lock_init(&c->sort_time.lock);
+	spin_lock_init(&c->btree_gc_time.lock);
+	spin_lock_init(&c->btree_split_time.lock);
+	spin_lock_init(&c->btree_read_time.lock);
+	spin_lock_init(&c->try_harder_time.lock);
 
 	bch_moving_init_cache_set(c);
 
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 420dad545c7d..462214eeacbe 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -168,10 +168,14 @@ int bch_parse_uuid(const char *s, char *uuid)
 
 void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
 {
-	uint64_t now		= local_clock();
-	uint64_t duration	= time_after64(now, start_time)
+	uint64_t now, duration, last;
+
+	spin_lock(&stats->lock);
+
+	now		= local_clock();
+	duration	= time_after64(now, start_time)
 		? now - start_time : 0;
-	uint64_t last		= time_after64(now, stats->last)
+	last		= time_after64(now, stats->last)
 		? now - stats->last : 0;
 
 	stats->max_duration = max(stats->max_duration, duration);
@@ -188,6 +192,8 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
 	}
 
 	stats->last = now ?: 1;
+
+	spin_unlock(&stats->lock);
 }
 
 /**
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 8ce5aab55962..362c4b3f8b4a 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -378,6 +378,7 @@ ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[
 ssize_t bch_read_string_list(const char *buf, const char * const list[]);
 
 struct time_stats {
+	spinlock_t	lock;
 	/*
 	 * all fields are in nanoseconds, averages are ewmas stored left shifted
 	 * by 8

From 17e21a9f248d3d330acdfb2405c23b8d84c9c23a Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Fri, 26 Jul 2013 12:32:38 -0700
Subject: [PATCH 88/94] bcache: Have btree_split() insert into parent directly

The flow control in btree_insert_node() was... fragile... before,
this'll use more stack (but since our btrees are never more than depth
1, that shouldn't matter) and it should be significantly clearer and
less fragile.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/btree.c | 85 ++++++++++++++++++---------------------
 1 file changed, 39 insertions(+), 46 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 1a7530cd1407..6def7c9a1228 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -2025,15 +2025,16 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
 
 static int btree_split(struct btree *b, struct btree_op *op,
 		       struct keylist *insert_keys,
-		       struct keylist *parent_keys,
 		       struct bkey *replace_key)
 {
 	bool split;
 	struct btree *n1, *n2 = NULL, *n3 = NULL;
 	uint64_t start_time = local_clock();
 	struct closure cl;
+	struct keylist parent_keys;
 
 	closure_init_stack(&cl);
+	bch_keylist_init(&parent_keys);
 
 	n1 = btree_node_alloc_replacement(b, true);
 	if (IS_ERR(n1))
@@ -2078,7 +2079,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
 
 		bkey_copy_key(&n2->key, &b->key);
 
-		bch_keylist_add(parent_keys, &n2->key);
+		bch_keylist_add(&parent_keys, &n2->key);
 		bch_btree_node_write(n2, &cl);
 		rw_unlock(true, n2);
 	} else {
@@ -2087,33 +2088,39 @@ static int btree_split(struct btree *b, struct btree_op *op,
 		bch_btree_insert_keys(n1, op, insert_keys, replace_key);
 	}
 
-	bch_keylist_add(parent_keys, &n1->key);
+	bch_keylist_add(&parent_keys, &n1->key);
 	bch_btree_node_write(n1, &cl);
 
 	if (n3) {
 		/* Depth increases, make a new root */
-
 		bkey_copy_key(&n3->key, &MAX_KEY);
-		bch_btree_insert_keys(n3, op, parent_keys, NULL);
+		bch_btree_insert_keys(n3, op, &parent_keys, NULL);
 		bch_btree_node_write(n3, &cl);
 
 		closure_sync(&cl);
 		bch_btree_set_root(n3);
 		rw_unlock(true, n3);
+
+		btree_node_free(b);
 	} else if (!b->parent) {
 		/* Root filled up but didn't need to be split */
-
-		bch_keylist_reset(parent_keys);
 		closure_sync(&cl);
 		bch_btree_set_root(n1);
+
+		btree_node_free(b);
 	} else {
+		/* Split a non root node */
 		closure_sync(&cl);
-		make_btree_freeing_key(b, parent_keys->top);
-		bch_keylist_push(parent_keys);
+		make_btree_freeing_key(b, parent_keys.top);
+		bch_keylist_push(&parent_keys);
+
+		btree_node_free(b);
+
+		bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
+		BUG_ON(!bch_keylist_empty(&parent_keys));
 	}
 
 	rw_unlock(true, n1);
-	btree_node_free(b);
 
 	bch_time_stats_update(&b->c->btree_split_time, start_time);
 
@@ -2139,46 +2146,32 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
 				 atomic_t *journal_ref,
 				 struct bkey *replace_key)
 {
-	int ret = 0;
-	struct keylist split_keys;
+	BUG_ON(b->level && replace_key);
 
-	bch_keylist_init(&split_keys);
-
-	do {
-		BUG_ON(b->level && replace_key);
-
-		if (should_split(b)) {
-			if (current->bio_list) {
-				op->lock = b->c->root->level + 1;
-				ret = -EAGAIN;
-			} else if (op->lock <= b->c->root->level) {
-				op->lock = b->c->root->level + 1;
-				ret = -EINTR;
-			} else {
-				struct btree *parent = b->parent;
-
-				ret = btree_split(b, op, insert_keys,
-						  &split_keys, replace_key);
-				insert_keys = &split_keys;
-				replace_key = NULL;
-				b = parent;
-				if (!ret)
-					ret = -EINTR;
-			}
+	if (should_split(b)) {
+		if (current->bio_list) {
+			op->lock = b->c->root->level + 1;
+			return -EAGAIN;
+		} else if (op->lock <= b->c->root->level) {
+			op->lock = b->c->root->level + 1;
+			return -EINTR;
 		} else {
-			BUG_ON(write_block(b) != b->sets[b->nsets].data);
-
-			if (bch_btree_insert_keys(b, op, insert_keys,
-						  replace_key)) {
-				if (!b->level)
-					bch_btree_leaf_dirty(b, journal_ref);
-				else
-					bch_btree_node_write_sync(b);
-			}
+			/* Invalidated all iterators */
+			return btree_split(b, op, insert_keys, replace_key) ?:
+				-EINTR;
 		}
-	} while (!bch_keylist_empty(&split_keys));
+	} else {
+		BUG_ON(write_block(b) != b->sets[b->nsets].data);
 
-	return ret;
+		if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
+			if (!b->level)
+				bch_btree_leaf_dirty(b, journal_ref);
+			else
+				bch_btree_node_write_sync(b);
+		}
+
+		return 0;
+	}
 }
 
 int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,

From 48a915a87f0bd98c3d68d029acf223a2e5116f07 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Thu, 31 Oct 2013 15:43:22 -0700
Subject: [PATCH 89/94] bcache: Better full stripe scanning

The old scanning-by-stripe code burned too much CPU, this should be
better.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bcache.h    |  5 +-
 drivers/md/bcache/btree.c     | 19 ++++---
 drivers/md/bcache/super.c     | 17 ++++++-
 drivers/md/bcache/writeback.c | 94 +++++++++++++++++++++--------------
 drivers/md/bcache/writeback.h | 21 +++++---
 include/trace/events/bcache.h | 29 +++++++++++
 6 files changed, 128 insertions(+), 57 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 816d07958fac..ab0b2150fed6 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -237,7 +237,7 @@ struct keybuf {
 
 	struct rb_root		keys;
 
-#define KEYBUF_NR		100
+#define KEYBUF_NR		500
 	DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR);
 };
 
@@ -273,9 +273,10 @@ struct bcache_device {
 	atomic_t		detaching;
 	int			flush_done;
 
-	uint64_t		nr_stripes;
+	unsigned		nr_stripes;
 	unsigned		stripe_size;
 	atomic_t		*stripe_sectors_dirty;
+	unsigned long		*full_dirty_stripes;
 
 	unsigned long		sectors_dirty_last;
 	long			sectors_dirty_derivative;
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 6def7c9a1228..5e2765aadce1 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -2378,6 +2378,7 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
 
 struct refill {
 	struct btree_op	op;
+	unsigned	nr_found;
 	struct keybuf	*buf;
 	struct bkey	*end;
 	keybuf_pred_fn	*pred;
@@ -2414,6 +2415,8 @@ static int refill_keybuf_fn(struct btree_op *op, struct btree *b,
 
 		if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
 			array_free(&buf->freelist, w);
+		else
+			refill->nr_found++;
 
 		if (array_freelist_empty(&buf->freelist))
 			ret = MAP_DONE;
@@ -2434,18 +2437,18 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
 	cond_resched();
 
 	bch_btree_op_init(&refill.op, -1);
-	refill.buf = buf;
-	refill.end = end;
-	refill.pred = pred;
+	refill.nr_found	= 0;
+	refill.buf	= buf;
+	refill.end	= end;
+	refill.pred	= pred;
 
 	bch_btree_map_keys(&refill.op, c, &buf->last_scanned,
 			   refill_keybuf_fn, MAP_END_KEY);
 
-	pr_debug("found %s keys from %llu:%llu to %llu:%llu",
-		 RB_EMPTY_ROOT(&buf->keys) ? "no" :
-		 array_freelist_empty(&buf->freelist) ? "some" : "a few",
-		 KEY_INODE(&start), KEY_OFFSET(&start),
-		 KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned));
+	trace_bcache_keyscan(refill.nr_found,
+			     KEY_INODE(&start), KEY_OFFSET(&start),
+			     KEY_INODE(&buf->last_scanned),
+			     KEY_OFFSET(&buf->last_scanned));
 
 	spin_lock(&buf->lock);
 
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 4813ef67cef5..43fcfe38be11 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -738,6 +738,10 @@ static void bcache_device_free(struct bcache_device *d)
 		mempool_destroy(d->unaligned_bvec);
 	if (d->bio_split)
 		bioset_free(d->bio_split);
+	if (is_vmalloc_addr(d->full_dirty_stripes))
+		vfree(d->full_dirty_stripes);
+	else
+		kfree(d->full_dirty_stripes);
 	if (is_vmalloc_addr(d->stripe_sectors_dirty))
 		vfree(d->stripe_sectors_dirty);
 	else
@@ -757,8 +761,12 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 
 	d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
 
-	if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t))
+	if (!d->nr_stripes ||
+	    d->nr_stripes > INT_MAX ||
+	    d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) {
+		pr_err("nr_stripes too large");
 		return -ENOMEM;
+	}
 
 	n = d->nr_stripes * sizeof(atomic_t);
 	d->stripe_sectors_dirty = n < PAGE_SIZE << 6
@@ -767,6 +775,13 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 	if (!d->stripe_sectors_dirty)
 		return -ENOMEM;
 
+	n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
+	d->full_dirty_stripes = n < PAGE_SIZE << 6
+		? kzalloc(n, GFP_KERNEL)
+		: vzalloc(n);
+	if (!d->full_dirty_stripes)
+		return -ENOMEM;
+
 	if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
 	    !(d->unaligned_bvec = mempool_create_kmalloc_pool(1,
 				sizeof(struct bio_vec) * BIO_MAX_PAGES)) ||
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index ab0f6b449111..22e21dc9a037 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -292,14 +292,12 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
 				  uint64_t offset, int nr_sectors)
 {
 	struct bcache_device *d = c->devices[inode];
-	unsigned stripe_offset;
-	uint64_t stripe = offset;
+	unsigned stripe_offset, stripe, sectors_dirty;
 
 	if (!d)
 		return;
 
-	do_div(stripe, d->stripe_size);
-
+	stripe = offset_to_stripe(d, offset);
 	stripe_offset = offset & (d->stripe_size - 1);
 
 	while (nr_sectors) {
@@ -309,7 +307,16 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
 		if (nr_sectors < 0)
 			s = -s;
 
-		atomic_add(s, d->stripe_sectors_dirty + stripe);
+		if (stripe >= d->nr_stripes)
+			return;
+
+		sectors_dirty = atomic_add_return(s,
+					d->stripe_sectors_dirty + stripe);
+		if (sectors_dirty == d->stripe_size)
+			set_bit(stripe, d->full_dirty_stripes);
+		else
+			clear_bit(stripe, d->full_dirty_stripes);
+
 		nr_sectors -= s;
 		stripe_offset = 0;
 		stripe++;
@@ -321,59 +328,70 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k)
 	return KEY_DIRTY(k);
 }
 
-static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k)
+static void refill_full_stripes(struct cached_dev *dc)
 {
-	uint64_t stripe = KEY_START(k);
-	unsigned nr_sectors = KEY_SIZE(k);
-	struct cached_dev *dc = container_of(buf, struct cached_dev,
-					     writeback_keys);
+	struct keybuf *buf = &dc->writeback_keys;
+	unsigned start_stripe, stripe, next_stripe;
+	bool wrapped = false;
 
-	if (!KEY_DIRTY(k))
-		return false;
+	stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));
 
-	do_div(stripe, dc->disk.stripe_size);
+	if (stripe >= dc->disk.nr_stripes)
+		stripe = 0;
+
+	start_stripe = stripe;
 
 	while (1) {
-		if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) ==
-		    dc->disk.stripe_size)
-			return true;
+		stripe = find_next_bit(dc->disk.full_dirty_stripes,
+				       dc->disk.nr_stripes, stripe);
 
-		if (nr_sectors <= dc->disk.stripe_size)
-			return false;
+		if (stripe == dc->disk.nr_stripes)
+			goto next;
 
-		nr_sectors -= dc->disk.stripe_size;
-		stripe++;
+		next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
+						 dc->disk.nr_stripes, stripe);
+
+		buf->last_scanned = KEY(dc->disk.id,
+					stripe * dc->disk.stripe_size, 0);
+
+		bch_refill_keybuf(dc->disk.c, buf,
+				  &KEY(dc->disk.id,
+				       next_stripe * dc->disk.stripe_size, 0),
+				  dirty_pred);
+
+		if (array_freelist_empty(&buf->freelist))
+			return;
+
+		stripe = next_stripe;
+next:
+		if (wrapped && stripe > start_stripe)
+			return;
+
+		if (stripe == dc->disk.nr_stripes) {
+			stripe = 0;
+			wrapped = true;
+		}
 	}
 }
 
 static bool refill_dirty(struct cached_dev *dc)
 {
 	struct keybuf *buf = &dc->writeback_keys;
-	bool searched_from_start = false;
 	struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
+	bool searched_from_start = false;
+
+	if (dc->partial_stripes_expensive) {
+		refill_full_stripes(dc);
+		if (array_freelist_empty(&buf->freelist))
+			return false;
+	}
 
 	if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
 		buf->last_scanned = KEY(dc->disk.id, 0, 0);
 		searched_from_start = true;
 	}
 
-	if (dc->partial_stripes_expensive) {
-		uint64_t i;
-
-		for (i = 0; i < dc->disk.nr_stripes; i++)
-			if (atomic_read(dc->disk.stripe_sectors_dirty + i) ==
-			    dc->disk.stripe_size)
-				goto full_stripes;
-
-		goto normal_refill;
-full_stripes:
-		searched_from_start = false;	/* not searching entire btree */
-		bch_refill_keybuf(dc->disk.c, buf, &end,
-				  dirty_full_stripe_pred);
-	} else {
-normal_refill:
-		bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
-	}
+	bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
 
 	return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start;
 }
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 60516bfa6052..fe7d9d56492b 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -14,22 +14,27 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
 	return ret;
 }
 
-static inline bool bcache_dev_stripe_dirty(struct bcache_device *d,
+static inline unsigned offset_to_stripe(struct bcache_device *d,
+					uint64_t offset)
+{
+	do_div(offset, d->stripe_size);
+	return offset;
+}
+
+static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc,
 					   uint64_t offset,
 					   unsigned nr_sectors)
 {
-	uint64_t stripe = offset;
-
-	do_div(stripe, d->stripe_size);
+	unsigned stripe = offset_to_stripe(&dc->disk, offset);
 
 	while (1) {
-		if (atomic_read(d->stripe_sectors_dirty + stripe))
+		if (atomic_read(dc->disk.stripe_sectors_dirty + stripe))
 			return true;
 
-		if (nr_sectors <= d->stripe_size)
+		if (nr_sectors <= dc->disk.stripe_size)
 			return false;
 
-		nr_sectors -= d->stripe_size;
+		nr_sectors -= dc->disk.stripe_size;
 		stripe++;
 	}
 }
@@ -45,7 +50,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 		return false;
 
 	if (dc->partial_stripes_expensive &&
-	    bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector,
+	    bcache_dev_stripe_dirty(dc, bio->bi_sector,
 				    bio_sectors(bio)))
 		return true;
 
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index 32c89b33c391..e2b9576d00e2 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -368,6 +368,35 @@ DEFINE_EVENT(btree_node, bcache_btree_set_root,
 	TP_ARGS(b)
 );
 
+TRACE_EVENT(bcache_keyscan,
+	TP_PROTO(unsigned nr_found,
+		 unsigned start_inode, uint64_t start_offset,
+		 unsigned end_inode, uint64_t end_offset),
+	TP_ARGS(nr_found,
+		start_inode, start_offset,
+		end_inode, end_offset),
+
+	TP_STRUCT__entry(
+		__field(__u32,	nr_found			)
+		__field(__u32,	start_inode			)
+		__field(__u64,	start_offset			)
+		__field(__u32,	end_inode			)
+		__field(__u64,	end_offset			)
+	),
+
+	TP_fast_assign(
+		__entry->nr_found	= nr_found;
+		__entry->start_inode	= start_inode;
+		__entry->start_offset	= start_offset;
+		__entry->end_inode	= end_inode;
+		__entry->end_offset	= end_offset;
+	),
+
+	TP_printk("found %u keys from %u:%llu to %u:%llu", __entry->nr_found,
+		  __entry->start_inode, __entry->start_offset,
+		  __entry->end_inode, __entry->end_offset)
+);
+
 /* Allocator */
 
 TRACE_EVENT(bcache_alloc_invalidate,

From c4d951ddb66fe1d087447b0ba65c4fa4446f1083 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 21 Aug 2013 17:49:09 -0700
Subject: [PATCH 90/94] bcache: Fix sysfs splat on shutdown with flash only
 devs

Whoops.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bcache.h    | 10 ++++-----
 drivers/md/bcache/request.c   |  2 +-
 drivers/md/bcache/super.c     | 41 +++++++++++++++++------------------
 drivers/md/bcache/sysfs.c     |  2 +-
 drivers/md/bcache/writeback.c |  6 ++---
 drivers/md/bcache/writeback.h |  2 +-
 6 files changed, 30 insertions(+), 33 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index ab0b2150fed6..97ef126b68bb 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -266,12 +266,10 @@ struct bcache_device {
 
 	struct gendisk		*disk;
 
-	/* If nonzero, we're closing */
-	atomic_t		closing;
-
-	/* If nonzero, we're detaching/unregistering from cache set */
-	atomic_t		detaching;
-	int			flush_done;
+	unsigned long		flags;
+#define BCACHE_DEV_CLOSING	0
+#define BCACHE_DEV_DETACHING	1
+#define BCACHE_DEV_UNLINK_DONE	2
 
 	unsigned		nr_stripes;
 	unsigned		stripe_size;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index f645da61189a..9f5a1386f77a 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -512,7 +512,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 	struct task_struct *task = current;
 	struct io *i;
 
-	if (atomic_read(&dc->disk.detaching) ||
+	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
 	    c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
 	    (bio->bi_rw & REQ_DISCARD))
 		goto skip;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 43fcfe38be11..fa1d53087f88 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -621,7 +621,7 @@ static void prio_read(struct cache *ca, uint64_t bucket)
 static int open_dev(struct block_device *b, fmode_t mode)
 {
 	struct bcache_device *d = b->bd_disk->private_data;
-	if (atomic_read(&d->closing))
+	if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
 		return -ENXIO;
 
 	closure_get(&d->cl);
@@ -650,20 +650,24 @@ static const struct block_device_operations bcache_ops = {
 
 void bcache_device_stop(struct bcache_device *d)
 {
-	if (!atomic_xchg(&d->closing, 1))
+	if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
 		closure_queue(&d->cl);
 }
 
 static void bcache_device_unlink(struct bcache_device *d)
 {
-	unsigned i;
-	struct cache *ca;
+	lockdep_assert_held(&bch_register_lock);
 
-	sysfs_remove_link(&d->c->kobj, d->name);
-	sysfs_remove_link(&d->kobj, "cache");
+	if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
+		unsigned i;
+		struct cache *ca;
 
-	for_each_cache(ca, d->c, i)
-		bd_unlink_disk_holder(ca->bdev, d->disk);
+		sysfs_remove_link(&d->c->kobj, d->name);
+		sysfs_remove_link(&d->kobj, "cache");
+
+		for_each_cache(ca, d->c, i)
+			bd_unlink_disk_holder(ca->bdev, d->disk);
+	}
 }
 
 static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
@@ -687,19 +691,16 @@ static void bcache_device_detach(struct bcache_device *d)
 {
 	lockdep_assert_held(&bch_register_lock);
 
-	if (atomic_read(&d->detaching)) {
+	if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
 		struct uuid_entry *u = d->c->uuids + d->id;
 
 		SET_UUID_FLASH_ONLY(u, 0);
 		memcpy(u->uuid, invalid_uuid, 16);
 		u->invalidated = cpu_to_le32(get_seconds());
 		bch_uuid_write(d->c);
-
-		atomic_set(&d->detaching, 0);
 	}
 
-	if (!d->flush_done)
-		bcache_device_unlink(d);
+	bcache_device_unlink(d);
 
 	d->c->devices[d->id] = NULL;
 	closure_put(&d->c->caching);
@@ -879,7 +880,7 @@ static void cached_dev_detach_finish(struct work_struct *w)
 	struct closure cl;
 	closure_init_stack(&cl);
 
-	BUG_ON(!atomic_read(&dc->disk.detaching));
+	BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
 	BUG_ON(atomic_read(&dc->count));
 
 	mutex_lock(&bch_register_lock);
@@ -893,6 +894,8 @@ static void cached_dev_detach_finish(struct work_struct *w)
 	bcache_device_detach(&dc->disk);
 	list_move(&dc->list, &uncached_devices);
 
+	clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
+
 	mutex_unlock(&bch_register_lock);
 
 	pr_info("Caching disabled for %s", bdevname(dc->bdev, buf));
@@ -905,10 +908,10 @@ void bch_cached_dev_detach(struct cached_dev *dc)
 {
 	lockdep_assert_held(&bch_register_lock);
 
-	if (atomic_read(&dc->disk.closing))
+	if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
 		return;
 
-	if (atomic_xchg(&dc->disk.detaching, 1))
+	if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
 		return;
 
 	/*
@@ -1064,11 +1067,7 @@ static void cached_dev_flush(struct closure *cl)
 	struct bcache_device *d = &dc->disk;
 
 	mutex_lock(&bch_register_lock);
-	d->flush_done = 1;
-
-	if (d->c)
-		bcache_device_unlink(d);
-
+	bcache_device_unlink(d);
 	mutex_unlock(&bch_register_lock);
 
 	bch_cache_accounting_destroy(&dc->accounting);
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 4b672449ffaf..194d43782ea4 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -370,7 +370,7 @@ STORE(__bch_flash_dev)
 	}
 
 	if (attr == &sysfs_unregister) {
-		atomic_set(&d->detaching, 1);
+		set_bit(BCACHE_DEV_DETACHING, &d->flags);
 		bcache_device_stop(d);
 	}
 
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 22e21dc9a037..99053b1251be 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -89,7 +89,7 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
 {
 	uint64_t ret;
 
-	if (atomic_read(&dc->disk.detaching) ||
+	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
 	    !dc->writeback_percent)
 		return 0;
 
@@ -404,7 +404,7 @@ static int bch_writeback_thread(void *arg)
 	while (!kthread_should_stop()) {
 		down_write(&dc->writeback_lock);
 		if (!atomic_read(&dc->has_dirty) ||
-		    (!atomic_read(&dc->disk.detaching) &&
+		    (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
 		     !dc->writeback_running)) {
 			up_write(&dc->writeback_lock);
 			set_current_state(TASK_INTERRUPTIBLE);
@@ -437,7 +437,7 @@ static int bch_writeback_thread(void *arg)
 
 			while (delay &&
 			       !kthread_should_stop() &&
-			       !atomic_read(&dc->disk.detaching))
+			       !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
 				delay = schedule_timeout_interruptible(delay);
 		}
 	}
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index fe7d9d56492b..c9ddcf4614b9 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -45,7 +45,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 	unsigned in_use = dc->disk.c->gc_stats.in_use;
 
 	if (cache_mode != CACHE_MODE_WRITEBACK ||
-	    atomic_read(&dc->disk.detaching) ||
+	    test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
 	    in_use > CUTOFF_WRITEBACK_SYNC)
 		return false;
 

From 28935ab5163c49ca5c199d67335e5e3c72c50853 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 31 Jul 2013 01:12:02 -0700
Subject: [PATCH 91/94] bcache: Use ida for bcache block dev minor

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/super.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index fa1d53087f88..70708ab0b8f9 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -16,6 +16,7 @@
 #include <linux/buffer_head.h>
 #include <linux/debugfs.h>
 #include <linux/genhd.h>
+#include <linux/idr.h>
 #include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/random.h>
@@ -50,7 +51,8 @@ struct mutex bch_register_lock;
 LIST_HEAD(bch_cache_sets);
 static LIST_HEAD(uncached_devices);
 
-static int bcache_major, bcache_minor;
+static int bcache_major;
+static DEFINE_IDA(bcache_minor);
 static wait_queue_head_t unregister_wait;
 struct workqueue_struct *bcache_wq;
 
@@ -731,8 +733,10 @@ static void bcache_device_free(struct bcache_device *d)
 		del_gendisk(d->disk);
 	if (d->disk && d->disk->queue)
 		blk_cleanup_queue(d->disk->queue);
-	if (d->disk)
+	if (d->disk) {
+		ida_simple_remove(&bcache_minor, d->disk->first_minor);
 		put_disk(d->disk);
+	}
 
 	bio_split_pool_free(&d->bio_split_hook);
 	if (d->unaligned_bvec)
@@ -756,6 +760,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 {
 	struct request_queue *q;
 	size_t n;
+	int minor;
 
 	if (!d->stripe_size)
 		d->stripe_size = 1 << 31;
@@ -783,22 +788,31 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 	if (!d->full_dirty_stripes)
 		return -ENOMEM;
 
+	minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL);
+	if (minor < 0)
+		return minor;
+
 	if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
 	    !(d->unaligned_bvec = mempool_create_kmalloc_pool(1,
 				sizeof(struct bio_vec) * BIO_MAX_PAGES)) ||
 	    bio_split_pool_init(&d->bio_split_hook) ||
-	    !(d->disk = alloc_disk(1)) ||
-	    !(q = blk_alloc_queue(GFP_KERNEL)))
+	    !(d->disk = alloc_disk(1))) {
+		ida_simple_remove(&bcache_minor, minor);
 		return -ENOMEM;
+	}
 
 	set_capacity(d->disk, sectors);
-	snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor);
+	snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor);
 
 	d->disk->major		= bcache_major;
-	d->disk->first_minor	= bcache_minor++;
+	d->disk->first_minor	= minor;
 	d->disk->fops		= &bcache_ops;
 	d->disk->private_data	= d;
 
+	q = blk_alloc_queue(GFP_KERNEL);
+	if (!q)
+		return -ENOMEM;
+
 	blk_queue_make_request(q, NULL);
 	d->disk->queue			= q;
 	q->queuedata			= d;

From 098fb25498214069e6bbf908515f2952dd7654d0 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Wed, 21 Aug 2013 18:36:45 -0700
Subject: [PATCH 92/94] bcache: Delete some slower inline asm

Never saw a profile of bset_search_tree() where it wasn't bottlenecked
on memory until I got my new Haswell machine, but when I tried it there
it was suddenly burning 20% of the cpu in the inner loop on shrd...

Turns out, the version of shrd that takes 64 bit operands has a 9 cycle
latency. hah.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bset.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 14573391206b..7d388b8bb50e 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -481,16 +481,8 @@ static struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline)
 
 static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
 {
-#ifdef CONFIG_X86_64
-	asm("shrd %[shift],%[high],%[low]"
-	    : [low] "+Rm" (low)
-	    : [high] "R" (high),
-	    [shift] "ci" (shift)
-	    : "cc");
-#else
 	low >>= shift;
 	low  |= (high << 1) << (63U - shift);
-#endif
 	return low;
 }
 

From 5ceaaad7047745c1c02150c39d3fb623b7948d48 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Tue, 10 Sep 2013 14:27:42 -0700
Subject: [PATCH 93/94] bcache: Bypass torture test

More testing ftw! Also, now verify mode doesn't break if you read dirty
data.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/bcache.h  |  1 +
 drivers/md/bcache/debug.c   | 15 ++++++++-------
 drivers/md/bcache/debug.h   |  2 ++
 drivers/md/bcache/request.c | 14 +++++++++++++-
 drivers/md/bcache/sysfs.c   |  4 ++++
 5 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 97ef126b68bb..4beb55a0ff30 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -364,6 +364,7 @@ struct cached_dev {
 	unsigned		readahead;
 
 	unsigned		verify:1;
+	unsigned		bypass_torture_test:1;
 
 	unsigned		partial_stripes_expensive:1;
 	unsigned		writeback_metadata:1;
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index e99e6b8852b2..264fcfbd6290 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -189,13 +189,14 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
 		void *p1 = kmap_atomic(bv->bv_page);
 		void *p2 = page_address(check->bi_io_vec[i].bv_page);
 
-		if (memcmp(p1 + bv->bv_offset,
-			   p2 + bv->bv_offset,
-			   bv->bv_len))
-			printk(KERN_ERR
-			       "bcache (%s): verify failed at sector %llu\n",
-			       bdevname(dc->bdev, name),
-			       (uint64_t) bio->bi_sector);
+		cache_set_err_on(memcmp(p1 + bv->bv_offset,
+					p2 + bv->bv_offset,
+					bv->bv_len),
+				 dc->disk.c,
+				 "verify failed at dev %s sector %llu",
+				 bdevname(dc->bdev, name),
+				 (uint64_t) bio->bi_sector);
+
 		kunmap_atomic(p1);
 	}
 
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h
index 7914ba0ff316..2ede60e31874 100644
--- a/drivers/md/bcache/debug.h
+++ b/drivers/md/bcache/debug.h
@@ -16,6 +16,7 @@ void bch_btree_iter_next_check(struct btree_iter *);
 #define EBUG_ON(cond)			BUG_ON(cond)
 #define expensive_debug_checks(c)	((c)->expensive_debug_checks)
 #define key_merging_disabled(c)		((c)->key_merging_disabled)
+#define bypass_torture_test(d)		((d)->bypass_torture_test)
 
 #else /* DEBUG */
 
@@ -28,6 +29,7 @@ static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}
 #define EBUG_ON(cond)			do { if (cond); } while (0)
 #define expensive_debug_checks(c)	0
 #define key_merging_disabled(c)		0
+#define bypass_torture_test(d)		0
 
 #endif
 
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 9f5a1386f77a..fbcc851ed5a5 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -528,6 +528,13 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 		goto skip;
 	}
 
+	if (bypass_torture_test(dc)) {
+		if ((get_random_int() & 3) == 3)
+			goto skip;
+		else
+			goto rescale;
+	}
+
 	if (!congested && !dc->sequential_cutoff)
 		goto rescale;
 
@@ -601,6 +608,7 @@ struct search {
 	unsigned		recoverable:1;
 	unsigned		unaligned_bvec:1;
 	unsigned		write:1;
+	unsigned		read_dirty_data:1;
 
 	unsigned long		start_time;
 
@@ -669,6 +677,9 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
 
 	PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
 
+	if (KEY_DIRTY(k))
+		s->read_dirty_data = true;
+
 	n = bch_bio_split(bio, min_t(uint64_t, INT_MAX,
 				     KEY_OFFSET(k) - bio->bi_sector),
 			  GFP_NOIO, s->d->bio_split);
@@ -894,7 +905,8 @@ static void cached_dev_read_done(struct closure *cl)
 		s->cache_miss = NULL;
 	}
 
-	if (verify(dc, &s->bio.bio) && s->recoverable && !s->unaligned_bvec)
+	if (verify(dc, &s->bio.bio) && s->recoverable &&
+	    !s->unaligned_bvec && !s->read_dirty_data)
 		bch_data_verify(dc, s->orig_bio);
 
 	bio_complete(s);
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 194d43782ea4..80d4c2bee18a 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -99,6 +99,7 @@ rw_attribute(errors);
 rw_attribute(io_error_limit);
 rw_attribute(io_error_halflife);
 rw_attribute(verify);
+rw_attribute(bypass_torture_test);
 rw_attribute(key_merging_disabled);
 rw_attribute(gc_always_rewrite);
 rw_attribute(expensive_debug_checks);
@@ -123,6 +124,7 @@ SHOW(__bch_cached_dev)
 
 	sysfs_printf(data_csum,		"%i", dc->disk.data_csum);
 	var_printf(verify,		"%i");
+	var_printf(bypass_torture_test,	"%i");
 	var_printf(writeback_metadata,	"%i");
 	var_printf(writeback_running,	"%i");
 	var_print(writeback_delay);
@@ -191,6 +193,7 @@ STORE(__cached_dev)
 
 	sysfs_strtoul(data_csum,	dc->disk.data_csum);
 	d_strtoul(verify);
+	d_strtoul(bypass_torture_test);
 	d_strtoul(writeback_metadata);
 	d_strtoul(writeback_running);
 	d_strtoul(writeback_delay);
@@ -323,6 +326,7 @@ static struct attribute *bch_cached_dev_files[] = {
 	&sysfs_readahead,
 #ifdef CONFIG_BCACHE_DEBUG
 	&sysfs_verify,
+	&sysfs_bypass_torture_test,
 #endif
 	NULL
 };

From c86949486d41d9e7d7681fc72923555114fd702f Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 10 Sep 2013 21:41:34 -0700
Subject: [PATCH 94/94] bcache: defensively handle format strings

Just to be safe, call the error reporting function with "%s" to avoid
any possible future format string leak.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Kent Overstreet <kmo@daterainc.com>
---
 drivers/md/bcache/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 70708ab0b8f9..dec15cd2d797 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1670,7 +1670,7 @@ static void run_cache_set(struct cache_set *c)
 err:
 	closure_sync(&cl);
 	/* XXX: test this, it's broken */
-	bch_cache_set_error(c, err);
+	bch_cache_set_error(c, "%s", err);
 }
 
 static bool can_attach_cache(struct cache *ca, struct cache_set *c)