From f8b6e31e4e46bf514c27fce38783ed5615cca01d Mon Sep 17 00:00:00 2001 From: David Dillow Date: Fri, 26 Nov 2010 13:02:21 -0500 Subject: [PATCH 01/42] IB/srp: allow task management without a previous request We can only have one task management command outstanding, so move the completion and status to the target port. This allows us to handle resets of a LUN without a corresponding request having been sent. Meanwhile, we don't need to play games with host_scribble, just use it as the pointer it is. This fixes a crash when we issue a bus reset using sg_reset. Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=13893 Reported-by: Bart Van Assche Reviewed-by: Bart Van Assche Signed-off-by: David Dillow --- drivers/infiniband/ulp/srp/ib_srp.c | 90 ++++++++++------------------- drivers/infiniband/ulp/srp/ib_srp.h | 10 ++-- 2 files changed, 37 insertions(+), 63 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 1e1e347a7715..29429a13fd90 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -542,6 +542,7 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd, static void srp_remove_req(struct srp_target_port *target, struct srp_request *req) { srp_unmap_data(req->scmnd, target, req); + req->scmnd = NULL; list_move_tail(&req->list, &target->free_reqs); } @@ -925,15 +926,13 @@ static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) target->req_lim += delta; - req = &target->req_ring[rsp->tag & ~SRP_TAG_TSK_MGMT]; - if (unlikely(rsp->tag & SRP_TAG_TSK_MGMT)) { - if (be32_to_cpu(rsp->resp_data_len) < 4) - req->tsk_status = -1; - else - req->tsk_status = rsp->data[3]; - complete(&req->done); + target->tsk_mgmt_status = -1; + if (be32_to_cpu(rsp->resp_data_len) >= 4) + target->tsk_mgmt_status = rsp->data[3]; + complete(&target->tsk_mgmt_done); } else { + req = &target->req_ring[rsp->tag]; scmnd = req->scmnd; if (!scmnd) shost_printk(KERN_ERR, target->scsi_host, 
@@ -953,13 +952,9 @@ static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) else if (rsp->flags & (SRP_RSP_FLAG_DIOVER | SRP_RSP_FLAG_DIUNDER)) scsi_set_resid(scmnd, be32_to_cpu(rsp->data_in_res_cnt)); - if (!req->tsk_mgmt) { - scmnd->host_scribble = (void *) -1L; - scmnd->scsi_done(scmnd); - - srp_remove_req(target, req); - } else - req->cmd_done = 1; + scmnd->host_scribble = NULL; + scmnd->scsi_done(scmnd); + srp_remove_req(target, req); } spin_unlock_irqrestore(target->scsi_host->host_lock, flags); @@ -1155,7 +1150,7 @@ static int srp_queuecommand_lck(struct scsi_cmnd *scmnd, scmnd->scsi_done = done; scmnd->result = 0; - scmnd->host_scribble = (void *) (long) req->index; + scmnd->host_scribble = (void *) req; cmd = iu->buf; memset(cmd, 0, sizeof *cmd); @@ -1167,8 +1162,6 @@ static int srp_queuecommand_lck(struct scsi_cmnd *scmnd, req->scmnd = scmnd; req->cmd = iu; - req->cmd_done = 0; - req->tsk_mgmt = NULL; len = srp_map_data(scmnd, target, req); if (len < 0) { @@ -1442,7 +1435,7 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) } static int srp_send_tsk_mgmt(struct srp_target_port *target, - struct srp_request *req, u8 func) + u64 req_tag, unsigned int lun, u8 func) { struct ib_device *dev = target->srp_host->srp_dev->dev; struct srp_iu *iu; @@ -1451,12 +1444,10 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target, spin_lock_irq(target->scsi_host->host_lock); if (target->state == SRP_TARGET_DEAD || - target->state == SRP_TARGET_REMOVED) { - req->scmnd->result = DID_BAD_TARGET << 16; + target->state == SRP_TARGET_REMOVED) goto out; - } - init_completion(&req->done); + init_completion(&target->tsk_mgmt_done); iu = __srp_get_tx_iu(target, SRP_IU_TSK_MGMT); if (!iu) @@ -1468,21 +1459,19 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target, memset(tsk_mgmt, 0, sizeof *tsk_mgmt); tsk_mgmt->opcode = SRP_TSK_MGMT; - tsk_mgmt->lun = cpu_to_be64((u64) req->scmnd->device->lun << 48); - 
tsk_mgmt->tag = req->index | SRP_TAG_TSK_MGMT; + tsk_mgmt->lun = cpu_to_be64((u64) lun << 48); + tsk_mgmt->tag = req_tag | SRP_TAG_TSK_MGMT; tsk_mgmt->tsk_mgmt_func = func; - tsk_mgmt->task_tag = req->index; + tsk_mgmt->task_tag = req_tag; ib_dma_sync_single_for_device(dev, iu->dma, sizeof *tsk_mgmt, DMA_TO_DEVICE); if (__srp_post_send(target, iu, sizeof *tsk_mgmt)) goto out; - req->tsk_mgmt = iu; - spin_unlock_irq(target->scsi_host->host_lock); - if (!wait_for_completion_timeout(&req->done, + if (!wait_for_completion_timeout(&target->tsk_mgmt_done, msecs_to_jiffies(SRP_ABORT_TIMEOUT_MS))) return -1; @@ -1493,43 +1482,29 @@ out: return -1; } -static int srp_find_req(struct srp_target_port *target, - struct scsi_cmnd *scmnd, - struct srp_request **req) -{ - if (scmnd->host_scribble == (void *) -1L) - return -1; - - *req = &target->req_ring[(long) scmnd->host_scribble]; - - return 0; -} - static int srp_abort(struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(scmnd->device->host); - struct srp_request *req; + struct srp_request *req = (struct srp_request *) scmnd->host_scribble; int ret = SUCCESS; shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n"); - if (target->qp_in_error) + if (!req || target->qp_in_error) return FAILED; - if (srp_find_req(target, scmnd, &req)) - return FAILED; - if (srp_send_tsk_mgmt(target, req, SRP_TSK_ABORT_TASK)) + if (srp_send_tsk_mgmt(target, req->index, scmnd->device->lun, + SRP_TSK_ABORT_TASK)) return FAILED; spin_lock_irq(target->scsi_host->host_lock); - if (req->cmd_done) { - srp_remove_req(target, req); - scmnd->scsi_done(scmnd); - } else if (!req->tsk_status) { - srp_remove_req(target, req); - scmnd->result = DID_ABORT << 16; - } else - ret = FAILED; + if (req->scmnd) { + if (!target->tsk_mgmt_status) { + srp_remove_req(target, req); + scmnd->result = DID_ABORT << 16; + } else + ret = FAILED; + } spin_unlock_irq(target->scsi_host->host_lock); @@ -1545,17 +1520,16 @@ static int 
srp_reset_device(struct scsi_cmnd *scmnd) if (target->qp_in_error) return FAILED; - if (srp_find_req(target, scmnd, &req)) + if (srp_send_tsk_mgmt(target, SRP_TAG_NO_REQ, scmnd->device->lun, + SRP_TSK_LUN_RESET)) return FAILED; - if (srp_send_tsk_mgmt(target, req, SRP_TSK_LUN_RESET)) - return FAILED; - if (req->tsk_status) + if (target->tsk_mgmt_status) return FAILED; spin_lock_irq(target->scsi_host->host_lock); list_for_each_entry_safe(req, tmp, &target->req_queue, list) - if (req->scmnd->device == scmnd->device) + if (req->scmnd && req->scmnd->device == scmnd->device) srp_reset_req(target, req); spin_unlock_irq(target->scsi_host->host_lock); diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index ed0dce9e479f..f8b689a644b7 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -68,7 +68,8 @@ enum { SRP_TSK_MGMT_SQ_SIZE = 1, SRP_CMD_SQ_SIZE = SRP_REQ_SQ_SIZE - SRP_TSK_MGMT_SQ_SIZE, - SRP_TAG_TSK_MGMT = 1 << (SRP_RQ_SHIFT + 1), + SRP_TAG_NO_REQ = ~0U, + SRP_TAG_TSK_MGMT = 1U << 31, SRP_FMR_SIZE = 256, SRP_FMR_POOL_SIZE = 1024, @@ -113,12 +114,8 @@ struct srp_request { struct list_head list; struct scsi_cmnd *scmnd; struct srp_iu *cmd; - struct srp_iu *tsk_mgmt; struct ib_pool_fmr *fmr; - struct completion done; short index; - u8 cmd_done; - u8 tsk_status; }; struct srp_target_port { @@ -165,6 +162,9 @@ struct srp_target_port { int status; enum srp_target_state state; int qp_in_error; + + struct completion tsk_mgmt_done; + u8 tsk_mgmt_status; }; struct srp_iu { From 9709f0e05b827049733f439de82a4a1688b37b86 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 26 Nov 2010 13:13:06 -0500 Subject: [PATCH 02/42] IB/srp: consolidate state change code Signed-off-by: Bart Van Assche [ broken out and small cleanups by David Dillow ] Signed-off-by: David Dillow --- drivers/infiniband/ulp/srp/ib_srp.c | 45 +++++++++++++++-------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git 
a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 29429a13fd90..def9e6b38459 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -441,18 +441,28 @@ static void srp_disconnect_target(struct srp_target_port *target) wait_for_completion(&target->done); } +static bool srp_change_state(struct srp_target_port *target, + enum srp_target_state old, + enum srp_target_state new) +{ + bool changed = false; + + spin_lock_irq(target->scsi_host->host_lock); + if (target->state == old) { + target->state = new; + changed = true; + } + spin_unlock_irq(target->scsi_host->host_lock); + return changed; +} + static void srp_remove_work(struct work_struct *work) { struct srp_target_port *target = container_of(work, struct srp_target_port, work); - spin_lock_irq(target->scsi_host->host_lock); - if (target->state != SRP_TARGET_DEAD) { - spin_unlock_irq(target->scsi_host->host_lock); + if (!srp_change_state(target, SRP_TARGET_DEAD, SRP_TARGET_REMOVED)) return; - } - target->state = SRP_TARGET_REMOVED; - spin_unlock_irq(target->scsi_host->host_lock); spin_lock(&target->srp_host->target_lock); list_del(&target->list); @@ -560,13 +570,8 @@ static int srp_reconnect_target(struct srp_target_port *target) struct ib_wc wc; int ret; - spin_lock_irq(target->scsi_host->host_lock); - if (target->state != SRP_TARGET_LIVE) { - spin_unlock_irq(target->scsi_host->host_lock); + if (!srp_change_state(target, SRP_TARGET_LIVE, SRP_TARGET_CONNECTING)) return -EAGAIN; - } - target->state = SRP_TARGET_CONNECTING; - spin_unlock_irq(target->scsi_host->host_lock); srp_disconnect_target(target); /* @@ -605,13 +610,8 @@ static int srp_reconnect_target(struct srp_target_port *target) if (ret) goto err; - spin_lock_irq(target->scsi_host->host_lock); - if (target->state == SRP_TARGET_CONNECTING) { - ret = 0; - target->state = SRP_TARGET_LIVE; - } else + if (!srp_change_state(target, SRP_TARGET_CONNECTING, SRP_TARGET_LIVE)) ret = -EAGAIN; - 
spin_unlock_irq(target->scsi_host->host_lock); return ret; @@ -621,9 +621,12 @@ err: /* * We couldn't reconnect, so kill our target port off. - * However, we have to defer the real removal because we might - * be in the context of the SCSI error handler now, which - * would deadlock if we call scsi_remove_host(). + * However, we have to defer the real removal because we + * are in the context of the SCSI error handler now, which + * will deadlock if we call scsi_remove_host(). + * + * Schedule our work inside the lock to avoid a race with + * the flush_scheduled_work() in srp_remove_one(). */ spin_lock_irq(target->scsi_host->host_lock); if (target->state == SRP_TARGET_CONNECTING) { From dcb4cb85f4b7caac9769bce464fef16306a4758c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 26 Nov 2010 13:22:48 -0500 Subject: [PATCH 03/42] IB/srp: allow lockless work posting Only one CPU at a time will own an RX IU, so using the address of the IU as the work request cookie allows us to avoid taking a lock. We can similarly prepare the TX path for lockless posting by moving the free TX IUs to a list. This also removes the requirement that the queue sizes be a power of 2. 
Signed-off-by: Bart Van Assche [ broken out, small cleanups, and modified to avoid needing an extra field in the IU by David Dillow] Signed-off-by: David Dillow --- drivers/infiniband/ulp/srp/ib_srp.c | 65 ++++++++++++----------------- drivers/infiniband/ulp/srp/ib_srp.h | 7 +--- 2 files changed, 28 insertions(+), 44 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index def9e6b38459..aa78d2615c8d 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -568,7 +568,7 @@ static int srp_reconnect_target(struct srp_target_port *target) struct ib_qp_attr qp_attr; struct srp_request *req, *tmp; struct ib_wc wc; - int ret; + int i, ret; if (!srp_change_state(target, SRP_TARGET_LIVE, SRP_TARGET_CONNECTING)) return -EAGAIN; @@ -601,9 +601,9 @@ static int srp_reconnect_target(struct srp_target_port *target) srp_reset_req(target, req); spin_unlock_irq(target->scsi_host->host_lock); - target->rx_head = 0; - target->tx_head = 0; - target->tx_tail = 0; + list_del_init(&target->free_tx); + for (i = 0; i < SRP_SQ_SIZE; ++i) + list_move(&target->tx_ring[i]->list, &target->free_tx); target->qp_in_error = 0; ret = srp_connect_target(target); @@ -817,7 +817,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, /* * Must be called with target->scsi_host->host_lock held to protect - * req_lim and tx_head. Lock cannot be dropped between call here and + * req_lim and free_tx. Lock cannot be dropped between call here and * call to __srp_post_send(). 
* * Note: @@ -837,7 +837,7 @@ static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target, srp_send_completion(target->send_cq, target); - if (target->tx_head - target->tx_tail >= SRP_SQ_SIZE) + if (list_empty(&target->free_tx)) return NULL; /* Initiator responses to target requests do not consume credits */ @@ -846,14 +846,14 @@ static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target, return NULL; } - iu = target->tx_ring[target->tx_head & SRP_SQ_MASK]; + iu = list_first_entry(&target->free_tx, struct srp_iu, list); iu->type = iu_type; return iu; } /* * Must be called with target->scsi_host->host_lock held to protect - * req_lim and tx_head. + * req_lim and free_tx. */ static int __srp_post_send(struct srp_target_port *target, struct srp_iu *iu, int len) @@ -867,7 +867,7 @@ static int __srp_post_send(struct srp_target_port *target, list.lkey = target->srp_host->srp_dev->mr->lkey; wr.next = NULL; - wr.wr_id = target->tx_head & SRP_SQ_MASK; + wr.wr_id = (uintptr_t) iu; wr.sg_list = &list; wr.num_sge = 1; wr.opcode = IB_WR_SEND; @@ -876,7 +876,7 @@ static int __srp_post_send(struct srp_target_port *target, ret = ib_post_send(target->qp, &wr, &bad_wr); if (!ret) { - ++target->tx_head; + list_del(&iu->list); if (iu->type != SRP_IU_RSP) --target->req_lim; } @@ -884,36 +884,21 @@ static int __srp_post_send(struct srp_target_port *target, return ret; } -static int srp_post_recv(struct srp_target_port *target) +static int srp_post_recv(struct srp_target_port *target, struct srp_iu *iu) { - unsigned long flags; - struct srp_iu *iu; - struct ib_sge list; struct ib_recv_wr wr, *bad_wr; - unsigned int next; - int ret; - - spin_lock_irqsave(target->scsi_host->host_lock, flags); - - next = target->rx_head & SRP_RQ_MASK; - wr.wr_id = next; - iu = target->rx_ring[next]; + struct ib_sge list; list.addr = iu->dma; list.length = iu->size; list.lkey = target->srp_host->srp_dev->mr->lkey; wr.next = NULL; + wr.wr_id = (uintptr_t) iu; wr.sg_list = &list; wr.num_sge 
= 1; - ret = ib_post_recv(target->qp, &wr, &bad_wr); - if (!ret) - ++target->rx_head; - - spin_unlock_irqrestore(target->scsi_host->host_lock, flags); - - return ret; + return ib_post_recv(target->qp, &wr, &bad_wr); } static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) @@ -1030,14 +1015,11 @@ static void srp_process_aer_req(struct srp_target_port *target, static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc) { - struct ib_device *dev; - struct srp_iu *iu; + struct ib_device *dev = target->srp_host->srp_dev->dev; + struct srp_iu *iu = (struct srp_iu *) wc->wr_id; int res; u8 opcode; - iu = target->rx_ring[wc->wr_id]; - - dev = target->srp_host->srp_dev->dev; ib_dma_sync_single_for_cpu(dev, iu->dma, target->max_ti_iu_len, DMA_FROM_DEVICE); @@ -1078,7 +1060,7 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc) ib_dma_sync_single_for_device(dev, iu->dma, target->max_ti_iu_len, DMA_FROM_DEVICE); - res = srp_post_recv(target); + res = srp_post_recv(target, iu); if (res != 0) shost_printk(KERN_ERR, target->scsi_host, PFX "Recv failed with error code %d\n", res); @@ -1107,6 +1089,7 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr) { struct srp_target_port *target = target_ptr; struct ib_wc wc; + struct srp_iu *iu; while (ib_poll_cq(cq, 1, &wc) > 0) { if (wc.status) { @@ -1117,7 +1100,8 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr) break; } - ++target->tx_tail; + iu = (struct srp_iu *) wc.wr_id; + list_add(&iu->list, &target->free_tx); } } @@ -1212,6 +1196,8 @@ static int srp_alloc_iu_bufs(struct srp_target_port *target) GFP_KERNEL, DMA_TO_DEVICE); if (!target->tx_ring[i]) goto err; + + list_add(&target->tx_ring[i]->list, &target->free_tx); } return 0; @@ -1373,7 +1359,8 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) break; for (i = 0; i < SRP_RQ_SIZE; i++) { - target->status = srp_post_recv(target); + struct srp_iu 
*iu = target->rx_ring[i]; + target->status = srp_post_recv(target, iu); if (target->status) break; } @@ -1965,6 +1952,7 @@ static ssize_t srp_create_target(struct device *dev, target->scsi_host = target_host; target->srp_host = host; + INIT_LIST_HEAD(&target->free_tx); INIT_LIST_HEAD(&target->free_reqs); INIT_LIST_HEAD(&target->req_queue); for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { @@ -2235,8 +2223,7 @@ static int __init srp_init_module(void) { int ret; - BUILD_BUG_ON_NOT_POWER_OF_2(SRP_SQ_SIZE); - BUILD_BUG_ON_NOT_POWER_OF_2(SRP_RQ_SIZE); + BUILD_BUG_ON(FIELD_SIZEOF(struct ib_wc, wr_id) < sizeof(void *)); if (srp_sg_tablesize > 255) { printk(KERN_WARNING PFX "Clamping srp_sg_tablesize to 255\n"); diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index f8b689a644b7..41ecb46adf15 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -59,10 +59,8 @@ enum { SRP_RQ_SHIFT = 6, SRP_RQ_SIZE = 1 << SRP_RQ_SHIFT, - SRP_RQ_MASK = SRP_RQ_SIZE - 1, SRP_SQ_SIZE = SRP_RQ_SIZE, - SRP_SQ_MASK = SRP_SQ_SIZE - 1, SRP_RSP_SQ_SIZE = 1, SRP_REQ_SQ_SIZE = SRP_SQ_SIZE - SRP_RSP_SQ_SIZE, SRP_TSK_MGMT_SQ_SIZE = 1, @@ -144,11 +142,9 @@ struct srp_target_port { int zero_req_lim; - unsigned rx_head; struct srp_iu *rx_ring[SRP_RQ_SIZE]; - unsigned tx_head; - unsigned tx_tail; + struct list_head free_tx; struct srp_iu *tx_ring[SRP_SQ_SIZE]; struct list_head free_reqs; @@ -168,6 +164,7 @@ struct srp_target_port { }; struct srp_iu { + struct list_head list; u64 dma; void *buf; size_t size; From 536ae14e7588e85203d4b4147c041309be5b3efb Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 26 Nov 2010 13:58:27 -0500 Subject: [PATCH 04/42] IB/srp: don't move active requests to their own list We use req->scmnd != NULL to indicate an active request, so there's no need to keep a separate list for them. We can afford the array iteration during error handling, and dropping it gives us one less item that needs lock protection. 
Signed-off-by: Bart Van Assche [ broken out and small cleanups by David Dillow ] Signed-off-by: David Dillow --- drivers/infiniband/ulp/srp/ib_srp.c | 23 +++++++++++++---------- drivers/infiniband/ulp/srp/ib_srp.h | 1 - 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index aa78d2615c8d..2aff8814f2c5 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -553,7 +553,7 @@ static void srp_remove_req(struct srp_target_port *target, struct srp_request *r { srp_unmap_data(req->scmnd, target, req); req->scmnd = NULL; - list_move_tail(&req->list, &target->free_reqs); + list_add_tail(&req->list, &target->free_reqs); } static void srp_reset_req(struct srp_target_port *target, struct srp_request *req) @@ -566,7 +566,6 @@ static void srp_reset_req(struct srp_target_port *target, struct srp_request *re static int srp_reconnect_target(struct srp_target_port *target) { struct ib_qp_attr qp_attr; - struct srp_request *req, *tmp; struct ib_wc wc; int i, ret; @@ -597,13 +596,16 @@ static int srp_reconnect_target(struct srp_target_port *target) ; /* nothing */ spin_lock_irq(target->scsi_host->host_lock); - list_for_each_entry_safe(req, tmp, &target->req_queue, list) - srp_reset_req(target, req); + for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { + struct srp_request *req = &target->req_ring[i]; + if (req->scmnd) + srp_reset_req(target, req); + } spin_unlock_irq(target->scsi_host->host_lock); - list_del_init(&target->free_tx); + INIT_LIST_HEAD(&target->free_tx); for (i = 0; i < SRP_SQ_SIZE; ++i) - list_move(&target->tx_ring[i]->list, &target->free_tx); + list_add(&target->tx_ring[i]->list, &target->free_tx); target->qp_in_error = 0; ret = srp_connect_target(target); @@ -1165,7 +1167,7 @@ static int srp_queuecommand_lck(struct scsi_cmnd *scmnd, goto err_unmap; } - list_move_tail(&req->list, &target->req_queue); + list_del(&req->list); return 0; @@ -1504,7 +1506,7 @@ 
static int srp_abort(struct scsi_cmnd *scmnd) static int srp_reset_device(struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(scmnd->device->host); - struct srp_request *req, *tmp; + int i; shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n"); @@ -1518,9 +1520,11 @@ static int srp_reset_device(struct scsi_cmnd *scmnd) spin_lock_irq(target->scsi_host->host_lock); - list_for_each_entry_safe(req, tmp, &target->req_queue, list) + for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { + struct srp_request *req = &target->req_ring[i]; if (req->scmnd && req->scmnd->device == scmnd->device) srp_reset_req(target, req); + } spin_unlock_irq(target->scsi_host->host_lock); @@ -1954,7 +1958,6 @@ static ssize_t srp_create_target(struct device *dev, INIT_LIST_HEAD(&target->free_tx); INIT_LIST_HEAD(&target->free_reqs); - INIT_LIST_HEAD(&target->req_queue); for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { target->req_ring[i].index = i; list_add_tail(&target->req_ring[i].list, &target->free_reqs); diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 41ecb46adf15..924d8e9c6672 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -148,7 +148,6 @@ struct srp_target_port { struct srp_iu *tx_ring[SRP_SQ_SIZE]; struct list_head free_reqs; - struct list_head req_queue; struct srp_request req_ring[SRP_CMD_SQ_SIZE]; struct work_struct work; From 76c75b258f1fe6abac6af2356989ad4d6518886e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 26 Nov 2010 14:37:47 -0500 Subject: [PATCH 05/42] IB/srp: reduce lock coverage for command submission and EH We only need locks to protect our lists and number of credits available. By pre-consuming the credit for the request, we can reduce our lock coverage to just those areas. If we don't actually send the request, we'll need to put the credit back into the pool. 
Signed-off-by: Bart Van Assche [ broken out and small cleanups by David Dillow ] Signed-off-by: David Dillow --- drivers/infiniband/ulp/srp/ib_srp.c | 124 +++++++++++++++------------- drivers/infiniband/ulp/srp/ib_srp.h | 1 - 2 files changed, 67 insertions(+), 58 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 2aff8814f2c5..e5bd181dbce5 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -817,10 +817,25 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, return len; } +/* + * Return an IU and possible credit to the free pool + */ +static void srp_put_tx_iu(struct srp_target_port *target, struct srp_iu *iu, + enum srp_iu_type iu_type) +{ + unsigned long flags; + + spin_lock_irqsave(target->scsi_host->host_lock, flags); + list_add(&iu->list, &target->free_tx); + if (iu_type != SRP_IU_RSP) + ++target->req_lim; + spin_unlock_irqrestore(target->scsi_host->host_lock, flags); +} + /* * Must be called with target->scsi_host->host_lock held to protect - * req_lim and free_tx. Lock cannot be dropped between call here and - * call to __srp_post_send(). + * req_lim and free_tx. If IU is not sent, it must be returned using + * srp_put_tx_iu(). * * Note: * An upper limit for the number of allocated information units for each @@ -843,26 +858,25 @@ static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target, return NULL; /* Initiator responses to target requests do not consume credits */ - if (target->req_lim <= rsv && iu_type != SRP_IU_RSP) { - ++target->zero_req_lim; - return NULL; + if (iu_type != SRP_IU_RSP) { + if (target->req_lim <= rsv) { + ++target->zero_req_lim; + return NULL; + } + + --target->req_lim; } iu = list_first_entry(&target->free_tx, struct srp_iu, list); - iu->type = iu_type; + list_del(&iu->list); return iu; } -/* - * Must be called with target->scsi_host->host_lock held to protect - * req_lim and free_tx. 
- */ -static int __srp_post_send(struct srp_target_port *target, - struct srp_iu *iu, int len) +static int srp_post_send(struct srp_target_port *target, + struct srp_iu *iu, int len) { struct ib_sge list; struct ib_send_wr wr, *bad_wr; - int ret = 0; list.addr = iu->dma; list.length = len; @@ -875,15 +889,7 @@ static int __srp_post_send(struct srp_target_port *target, wr.opcode = IB_WR_SEND; wr.send_flags = IB_SEND_SIGNALED; - ret = ib_post_send(target->qp, &wr, &bad_wr); - - if (!ret) { - list_del(&iu->list); - if (iu->type != SRP_IU_RSP) - --target->req_lim; - } - - return ret; + return ib_post_send(target->qp, &wr, &bad_wr); } static int srp_post_recv(struct srp_target_port *target, struct srp_iu *iu) @@ -953,34 +959,33 @@ static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) static int srp_response_common(struct srp_target_port *target, s32 req_delta, void *rsp, int len) { - struct ib_device *dev; + struct ib_device *dev = target->srp_host->srp_dev->dev; unsigned long flags; struct srp_iu *iu; - int err = 1; - - dev = target->srp_host->srp_dev->dev; + int err; spin_lock_irqsave(target->scsi_host->host_lock, flags); target->req_lim += req_delta; - iu = __srp_get_tx_iu(target, SRP_IU_RSP); + spin_unlock_irqrestore(target->scsi_host->host_lock, flags); + if (!iu) { shost_printk(KERN_ERR, target->scsi_host, PFX "no IU available to send response\n"); - goto out; + return 1; } ib_dma_sync_single_for_cpu(dev, iu->dma, len, DMA_TO_DEVICE); memcpy(iu->buf, rsp, len); ib_dma_sync_single_for_device(dev, iu->dma, len, DMA_TO_DEVICE); - err = __srp_post_send(target, iu, len); - if (err) + err = srp_post_send(target, iu, len); + if (err) { shost_printk(KERN_ERR, target->scsi_host, PFX "unable to post response: %d\n", err); + srp_put_tx_iu(target, iu, SRP_IU_RSP); + } -out: - spin_unlock_irqrestore(target->scsi_host->host_lock, flags); return err; } @@ -1107,14 +1112,14 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr) } } 
-static int srp_queuecommand_lck(struct scsi_cmnd *scmnd, - void (*done)(struct scsi_cmnd *)) +static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) { - struct srp_target_port *target = host_to_target(scmnd->device->host); + struct srp_target_port *target = host_to_target(shost); struct srp_request *req; struct srp_iu *iu; struct srp_cmd *cmd; struct ib_device *dev; + unsigned long flags; int len; if (target->state == SRP_TARGET_CONNECTING) @@ -1123,11 +1128,19 @@ static int srp_queuecommand_lck(struct scsi_cmnd *scmnd, if (target->state == SRP_TARGET_DEAD || target->state == SRP_TARGET_REMOVED) { scmnd->result = DID_BAD_TARGET << 16; - done(scmnd); + scmnd->scsi_done(scmnd); return 0; } + spin_lock_irqsave(shost->host_lock, flags); iu = __srp_get_tx_iu(target, SRP_IU_CMD); + if (iu) { + req = list_first_entry(&target->free_reqs, struct srp_request, + list); + list_del(&req->list); + } + spin_unlock_irqrestore(shost->host_lock, flags); + if (!iu) goto err; @@ -1135,9 +1148,6 @@ static int srp_queuecommand_lck(struct scsi_cmnd *scmnd, ib_dma_sync_single_for_cpu(dev, iu->dma, srp_max_iu_len, DMA_TO_DEVICE); - req = list_first_entry(&target->free_reqs, struct srp_request, list); - - scmnd->scsi_done = done; scmnd->result = 0; scmnd->host_scribble = (void *) req; @@ -1156,30 +1166,33 @@ static int srp_queuecommand_lck(struct scsi_cmnd *scmnd, if (len < 0) { shost_printk(KERN_ERR, target->scsi_host, PFX "Failed to map data\n"); - goto err; + goto err_iu; } ib_dma_sync_single_for_device(dev, iu->dma, srp_max_iu_len, DMA_TO_DEVICE); - if (__srp_post_send(target, iu, len)) { + if (srp_post_send(target, iu, len)) { shost_printk(KERN_ERR, target->scsi_host, PFX "Send failed\n"); goto err_unmap; } - list_del(&req->list); - return 0; err_unmap: srp_unmap_data(scmnd, target, req); +err_iu: + srp_put_tx_iu(target, iu, SRP_IU_CMD); + + spin_lock_irqsave(shost->host_lock, flags); + list_add(&req->list, &target->free_reqs); + 
spin_unlock_irqrestore(shost->host_lock, flags); + err: return SCSI_MLQUEUE_HOST_BUSY; } -static DEF_SCSI_QCMD(srp_queuecommand) - static int srp_alloc_iu_bufs(struct srp_target_port *target) { int i; @@ -1433,17 +1446,18 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target, struct srp_iu *iu; struct srp_tsk_mgmt *tsk_mgmt; - spin_lock_irq(target->scsi_host->host_lock); - if (target->state == SRP_TARGET_DEAD || target->state == SRP_TARGET_REMOVED) - goto out; + return -1; init_completion(&target->tsk_mgmt_done); + spin_lock_irq(target->scsi_host->host_lock); iu = __srp_get_tx_iu(target, SRP_IU_TSK_MGMT); + spin_unlock_irq(target->scsi_host->host_lock); + if (!iu) - goto out; + return -1; ib_dma_sync_single_for_cpu(dev, iu->dma, sizeof *tsk_mgmt, DMA_TO_DEVICE); @@ -1458,20 +1472,16 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target, ib_dma_sync_single_for_device(dev, iu->dma, sizeof *tsk_mgmt, DMA_TO_DEVICE); - if (__srp_post_send(target, iu, sizeof *tsk_mgmt)) - goto out; - - spin_unlock_irq(target->scsi_host->host_lock); + if (srp_post_send(target, iu, sizeof *tsk_mgmt)) { + srp_put_tx_iu(target, iu, SRP_IU_TSK_MGMT); + return -1; + } if (!wait_for_completion_timeout(&target->tsk_mgmt_done, msecs_to_jiffies(SRP_ABORT_TIMEOUT_MS))) return -1; return 0; - -out: - spin_unlock_irq(target->scsi_host->host_lock); - return -1; } static int srp_abort(struct scsi_cmnd *scmnd) diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 924d8e9c6672..81686eee7e62 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -168,7 +168,6 @@ struct srp_iu { void *buf; size_t size; enum dma_data_direction direction; - enum srp_iu_type type; }; #endif /* IB_SRP_H */ From 94a9174c630c8465ed9e97ecd242993429930c05 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 26 Nov 2010 14:50:09 -0500 Subject: [PATCH 06/42] IB/srp: reduce lock coverage of command completion We only need the lock to 
cover list and credit manipulations, so push those into srp_remove_req() and update the call chains. We reorder the request removal and command completion in srp_process_rsp() to avoid the SCSI mid-layer sending another command before we've released our request and added any credits returned by the target. This prevents us from returning HOST_BUSY unnecessarily. Signed-off-by: Bart Van Assche [ broken out, small cleanups, and modified to avoid potential extraneous HOST_BUSY returns by David Dillow ] Signed-off-by: David Dillow --- drivers/infiniband/ulp/srp/ib_srp.c | 37 +++++++++++------------------ 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index e5bd181dbce5..e76fe54faeea 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -549,18 +549,24 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd, scsi_sg_count(scmnd), scmnd->sc_data_direction); } -static void srp_remove_req(struct srp_target_port *target, struct srp_request *req) +static void srp_remove_req(struct srp_target_port *target, + struct srp_request *req, s32 req_lim_delta) { + unsigned long flags; + srp_unmap_data(req->scmnd, target, req); + spin_lock_irqsave(target->scsi_host->host_lock, flags); + target->req_lim += req_lim_delta; req->scmnd = NULL; list_add_tail(&req->list, &target->free_reqs); + spin_unlock_irqrestore(target->scsi_host->host_lock, flags); } static void srp_reset_req(struct srp_target_port *target, struct srp_request *req) { req->scmnd->result = DID_RESET << 16; req->scmnd->scsi_done(req->scmnd); - srp_remove_req(target, req); + srp_remove_req(target, req, 0); } static int srp_reconnect_target(struct srp_target_port *target) @@ -595,13 +601,11 @@ static int srp_reconnect_target(struct srp_target_port *target) while (ib_poll_cq(target->send_cq, 1, &wc) > 0) ; /* nothing */ - spin_lock_irq(target->scsi_host->host_lock); for (i = 0; i < SRP_CMD_SQ_SIZE; 
++i) { struct srp_request *req = &target->req_ring[i]; if (req->scmnd) srp_reset_req(target, req); } - spin_unlock_irq(target->scsi_host->host_lock); INIT_LIST_HEAD(&target->free_tx); for (i = 0; i < SRP_SQ_SIZE; ++i) @@ -914,15 +918,12 @@ static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) struct srp_request *req; struct scsi_cmnd *scmnd; unsigned long flags; - s32 delta; - - delta = (s32) be32_to_cpu(rsp->req_lim_delta); - - spin_lock_irqsave(target->scsi_host->host_lock, flags); - - target->req_lim += delta; if (unlikely(rsp->tag & SRP_TAG_TSK_MGMT)) { + spin_lock_irqsave(target->scsi_host->host_lock, flags); + target->req_lim += be32_to_cpu(rsp->req_lim_delta); + spin_unlock_irqrestore(target->scsi_host->host_lock, flags); + target->tsk_mgmt_status = -1; if (be32_to_cpu(rsp->resp_data_len) >= 4) target->tsk_mgmt_status = rsp->data[3]; @@ -948,12 +949,10 @@ static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) else if (rsp->flags & (SRP_RSP_FLAG_DIOVER | SRP_RSP_FLAG_DIUNDER)) scsi_set_resid(scmnd, be32_to_cpu(rsp->data_in_res_cnt)); + srp_remove_req(target, req, be32_to_cpu(rsp->req_lim_delta)); scmnd->host_scribble = NULL; scmnd->scsi_done(scmnd); - srp_remove_req(target, req); } - - spin_unlock_irqrestore(target->scsi_host->host_lock, flags); } static int srp_response_common(struct srp_target_port *target, s32 req_delta, @@ -1498,18 +1497,14 @@ static int srp_abort(struct scsi_cmnd *scmnd) SRP_TSK_ABORT_TASK)) return FAILED; - spin_lock_irq(target->scsi_host->host_lock); - if (req->scmnd) { if (!target->tsk_mgmt_status) { - srp_remove_req(target, req); + srp_remove_req(target, req, 0); scmnd->result = DID_ABORT << 16; } else ret = FAILED; } - spin_unlock_irq(target->scsi_host->host_lock); - return ret; } @@ -1528,16 +1523,12 @@ static int srp_reset_device(struct scsi_cmnd *scmnd) if (target->tsk_mgmt_status) return FAILED; - spin_lock_irq(target->scsi_host->host_lock); - for (i = 0; i < SRP_CMD_SQ_SIZE; 
++i) { struct srp_request *req = &target->req_ring[i]; if (req->scmnd && req->scmnd->device == scmnd->device) srp_reset_req(target, req); } - spin_unlock_irq(target->scsi_host->host_lock); - return SUCCESS; } From e9684678221441f886b4d7c74f8770bb0981737a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 26 Nov 2010 15:08:38 -0500 Subject: [PATCH 07/42] IB/srp: stop sharing the host lock with SCSI We don't need protection against the SCSI stack, so use our own lock to allow parallel progress on separate CPUs. Signed-off-by: Bart Van Assche [ broken out and small cleanups by David Dillow ] Signed-off-by: David Dillow --- drivers/infiniband/ulp/srp/ib_srp.c | 46 ++++++++++++++--------------- drivers/infiniband/ulp/srp/ib_srp.h | 2 ++ 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index e76fe54faeea..8691fc83f70b 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -447,12 +447,12 @@ static bool srp_change_state(struct srp_target_port *target, { bool changed = false; - spin_lock_irq(target->scsi_host->host_lock); + spin_lock_irq(&target->lock); if (target->state == old) { target->state = new; changed = true; } - spin_unlock_irq(target->scsi_host->host_lock); + spin_unlock_irq(&target->lock); return changed; } @@ -555,11 +555,11 @@ static void srp_remove_req(struct srp_target_port *target, unsigned long flags; srp_unmap_data(req->scmnd, target, req); - spin_lock_irqsave(target->scsi_host->host_lock, flags); + spin_lock_irqsave(&target->lock, flags); target->req_lim += req_lim_delta; req->scmnd = NULL; list_add_tail(&req->list, &target->free_reqs); - spin_unlock_irqrestore(target->scsi_host->host_lock, flags); + spin_unlock_irqrestore(&target->lock, flags); } static void srp_reset_req(struct srp_target_port *target, struct srp_request *req) @@ -634,13 +634,13 @@ err: * Schedule our work inside the lock to avoid a race with * the 
flush_scheduled_work() in srp_remove_one(). */ - spin_lock_irq(target->scsi_host->host_lock); + spin_lock_irq(&target->lock); if (target->state == SRP_TARGET_CONNECTING) { target->state = SRP_TARGET_DEAD; INIT_WORK(&target->work, srp_remove_work); schedule_work(&target->work); } - spin_unlock_irq(target->scsi_host->host_lock); + spin_unlock_irq(&target->lock); return ret; } @@ -829,17 +829,16 @@ static void srp_put_tx_iu(struct srp_target_port *target, struct srp_iu *iu, { unsigned long flags; - spin_lock_irqsave(target->scsi_host->host_lock, flags); + spin_lock_irqsave(&target->lock, flags); list_add(&iu->list, &target->free_tx); if (iu_type != SRP_IU_RSP) ++target->req_lim; - spin_unlock_irqrestore(target->scsi_host->host_lock, flags); + spin_unlock_irqrestore(&target->lock, flags); } /* - * Must be called with target->scsi_host->host_lock held to protect - * req_lim and free_tx. If IU is not sent, it must be returned using - * srp_put_tx_iu(). + * Must be called with target->lock held to protect req_lim and free_tx. + * If IU is not sent, it must be returned using srp_put_tx_iu(). 
* * Note: * An upper limit for the number of allocated information units for each @@ -920,9 +919,9 @@ static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) unsigned long flags; if (unlikely(rsp->tag & SRP_TAG_TSK_MGMT)) { - spin_lock_irqsave(target->scsi_host->host_lock, flags); + spin_lock_irqsave(&target->lock, flags); target->req_lim += be32_to_cpu(rsp->req_lim_delta); - spin_unlock_irqrestore(target->scsi_host->host_lock, flags); + spin_unlock_irqrestore(&target->lock, flags); target->tsk_mgmt_status = -1; if (be32_to_cpu(rsp->resp_data_len) >= 4) @@ -963,10 +962,10 @@ static int srp_response_common(struct srp_target_port *target, s32 req_delta, struct srp_iu *iu; int err; - spin_lock_irqsave(target->scsi_host->host_lock, flags); + spin_lock_irqsave(&target->lock, flags); target->req_lim += req_delta; iu = __srp_get_tx_iu(target, SRP_IU_RSP); - spin_unlock_irqrestore(target->scsi_host->host_lock, flags); + spin_unlock_irqrestore(&target->lock, flags); if (!iu) { shost_printk(KERN_ERR, target->scsi_host, PFX @@ -1131,14 +1130,14 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) return 0; } - spin_lock_irqsave(shost->host_lock, flags); + spin_lock_irqsave(&target->lock, flags); iu = __srp_get_tx_iu(target, SRP_IU_CMD); if (iu) { req = list_first_entry(&target->free_reqs, struct srp_request, list); list_del(&req->list); } - spin_unlock_irqrestore(shost->host_lock, flags); + spin_unlock_irqrestore(&target->lock, flags); if (!iu) goto err; @@ -1184,9 +1183,9 @@ err_unmap: err_iu: srp_put_tx_iu(target, iu, SRP_IU_CMD); - spin_lock_irqsave(shost->host_lock, flags); + spin_lock_irqsave(&target->lock, flags); list_add(&req->list, &target->free_reqs); - spin_unlock_irqrestore(shost->host_lock, flags); + spin_unlock_irqrestore(&target->lock, flags); err: return SCSI_MLQUEUE_HOST_BUSY; @@ -1451,9 +1450,9 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target, init_completion(&target->tsk_mgmt_done); - 
spin_lock_irq(target->scsi_host->host_lock); + spin_lock_irq(&target->lock); iu = __srp_get_tx_iu(target, SRP_IU_TSK_MGMT); - spin_unlock_irq(target->scsi_host->host_lock); + spin_unlock_irq(&target->lock); if (!iu) return -1; @@ -1957,6 +1956,7 @@ static ssize_t srp_create_target(struct device *dev, target->scsi_host = target_host; target->srp_host = host; + spin_lock_init(&target->lock); INIT_LIST_HEAD(&target->free_tx); INIT_LIST_HEAD(&target->free_reqs); for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { @@ -2186,9 +2186,9 @@ static void srp_remove_one(struct ib_device *device) */ spin_lock(&host->target_lock); list_for_each_entry(target, &host->target_list, list) { - spin_lock_irq(target->scsi_host->host_lock); + spin_lock_irq(&target->lock); target->state = SRP_TARGET_REMOVED; - spin_unlock_irq(target->scsi_host->host_lock); + spin_unlock_irq(&target->lock); } spin_unlock(&host->target_lock); diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 81686eee7e62..acb435d3c1e3 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -144,6 +144,8 @@ struct srp_target_port { struct srp_iu *rx_ring[SRP_RQ_SIZE]; + spinlock_t lock; + struct list_head free_tx; struct srp_iu *tx_ring[SRP_SQ_SIZE]; From 9af762719e8f8fa282de02997dced593030eb238 Mon Sep 17 00:00:00 2001 From: David Dillow Date: Fri, 26 Nov 2010 15:34:46 -0500 Subject: [PATCH 08/42] IB/srp: consolidate hot-path variables into cache lines Put the variables accessed together in the hot-path into common cachelines, and separate them by RW vs RO to avoid false dirtying. We keep a local copy of the lkey and rkey in the target to avoid traversing pointers (and associated cache lines) to find them. 
Reviewed-by: Bart Van Assche Signed-off-by: David Dillow --- drivers/infiniband/ulp/srp/ib_srp.c | 12 ++++++----- drivers/infiniband/ulp/srp/ib_srp.h | 31 ++++++++++++++++++----------- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 8691fc83f70b..4b62105ed1e8 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -768,7 +768,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, struct srp_direct_buf *buf = (void *) cmd->add_data; buf->va = cpu_to_be64(ib_sg_dma_address(ibdev, scat)); - buf->key = cpu_to_be32(dev->mr->rkey); + buf->key = cpu_to_be32(target->rkey); buf->len = cpu_to_be32(ib_sg_dma_len(ibdev, scat)); } else if (srp_map_fmr(target, scat, count, req, (void *) cmd->add_data)) { @@ -793,7 +793,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, buf->desc_list[i].va = cpu_to_be64(ib_sg_dma_address(ibdev, sg)); buf->desc_list[i].key = - cpu_to_be32(dev->mr->rkey); + cpu_to_be32(target->rkey); buf->desc_list[i].len = cpu_to_be32(dma_len); datalen += dma_len; } @@ -806,7 +806,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, buf->table_desc.va = cpu_to_be64(req->cmd->dma + sizeof *cmd + sizeof *buf); buf->table_desc.key = - cpu_to_be32(target->srp_host->srp_dev->mr->rkey); + cpu_to_be32(target->rkey); buf->table_desc.len = cpu_to_be32(count * sizeof (struct srp_direct_buf)); @@ -883,7 +883,7 @@ static int srp_post_send(struct srp_target_port *target, list.addr = iu->dma; list.length = len; - list.lkey = target->srp_host->srp_dev->mr->lkey; + list.lkey = target->lkey; wr.next = NULL; wr.wr_id = (uintptr_t) iu; @@ -902,7 +902,7 @@ static int srp_post_recv(struct srp_target_port *target, struct srp_iu *iu) list.addr = iu->dma; list.length = iu->size; - list.lkey = target->srp_host->srp_dev->mr->lkey; + list.lkey = target->lkey; 
wr.next = NULL; wr.wr_id = (uintptr_t) iu; @@ -1955,6 +1955,8 @@ static ssize_t srp_create_target(struct device *dev, target->io_class = SRP_REV16A_IB_IO_CLASS; target->scsi_host = target_host; target->srp_host = host; + target->lkey = host->srp_dev->mr->lkey; + target->rkey = host->srp_dev->mr->rkey; spin_lock_init(&target->lock); INIT_LIST_HEAD(&target->free_tx); diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index acb435d3c1e3..9dc6fc3fd894 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -117,6 +117,24 @@ struct srp_request { }; struct srp_target_port { + /* These are RW in the hot path, and commonly used together */ + struct list_head free_tx; + struct list_head free_reqs; + spinlock_t lock; + s32 req_lim; + + /* These are read-only in the hot path */ + struct ib_cq *send_cq ____cacheline_aligned_in_smp; + struct ib_cq *recv_cq; + struct ib_qp *qp; + u32 lkey; + u32 rkey; + enum srp_target_state state; + + /* Everything above this point is used in the hot path of + * command processing. Try to keep them packed into cachelines. 
+ */ + __be64 id_ext; __be64 ioc_guid; __be64 service_id; @@ -133,23 +151,13 @@ struct srp_target_port { int path_query_id; struct ib_cm_id *cm_id; - struct ib_cq *recv_cq; - struct ib_cq *send_cq; - struct ib_qp *qp; int max_ti_iu_len; - s32 req_lim; int zero_req_lim; - struct srp_iu *rx_ring[SRP_RQ_SIZE]; - - spinlock_t lock; - - struct list_head free_tx; struct srp_iu *tx_ring[SRP_SQ_SIZE]; - - struct list_head free_reqs; + struct srp_iu *rx_ring[SRP_RQ_SIZE]; struct srp_request req_ring[SRP_CMD_SQ_SIZE]; struct work_struct work; @@ -157,7 +165,6 @@ struct srp_target_port { struct list_head list; struct completion done; int status; - enum srp_target_state state; int qp_in_error; struct completion tsk_mgmt_done; From c94310916390ac162e70c8c134bf70680c944a02 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 10 Jan 2011 17:41:43 -0800 Subject: [PATCH 09/42] RDMA/cxgb3,cxgb4: Remove dead code This removes unused code found by running 'make namespacecheck'; compile tested only. Signed-off-by: Stephen Hemminger Acked-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb3/cxio_hal.c | 2 + drivers/infiniband/hw/cxgb3/iwch_provider.h | 2 - drivers/infiniband/hw/cxgb3/iwch_qp.c | 56 --------------------- drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 1 - drivers/infiniband/hw/cxgb4/qp.c | 30 ----------- 5 files changed, 2 insertions(+), 89 deletions(-) diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c index 09dda0b8740e..c3f5aca4ef00 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.c +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -189,6 +189,7 @@ int cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq, int kernel) return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); } +#ifdef notyet int cxio_resize_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) { struct rdma_cq_setup setup; @@ -200,6 +201,7 @@ int cxio_resize_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) setup.ovfl_mode = 1; 
return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); } +#endif static u32 get_qpid(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) { diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.h b/drivers/infiniband/hw/cxgb3/iwch_provider.h index a237d49bdcc9..c5406da3f4cd 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.h +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.h @@ -335,8 +335,6 @@ int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg); int iwch_post_zb_read(struct iwch_qp *qhp); int iwch_register_device(struct iwch_dev *dev); void iwch_unregister_device(struct iwch_dev *dev); -int iwch_quiesce_qps(struct iwch_cq *chp); -int iwch_resume_qps(struct iwch_cq *chp); void stop_read_rep_timer(struct iwch_qp *qhp); int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, struct iwch_mr *mhp, int shift); diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c index 0993137181d7..1b4cd09f74dc 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_qp.c +++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c @@ -1149,59 +1149,3 @@ out: PDBG("%s exit state %d\n", __func__, qhp->attr.state); return ret; } - -static int quiesce_qp(struct iwch_qp *qhp) -{ - spin_lock_irq(&qhp->lock); - iwch_quiesce_tid(qhp->ep); - qhp->flags |= QP_QUIESCED; - spin_unlock_irq(&qhp->lock); - return 0; -} - -static int resume_qp(struct iwch_qp *qhp) -{ - spin_lock_irq(&qhp->lock); - iwch_resume_tid(qhp->ep); - qhp->flags &= ~QP_QUIESCED; - spin_unlock_irq(&qhp->lock); - return 0; -} - -int iwch_quiesce_qps(struct iwch_cq *chp) -{ - int i; - struct iwch_qp *qhp; - - for (i=0; i < T3_MAX_NUM_QP; i++) { - qhp = get_qhp(chp->rhp, i); - if (!qhp) - continue; - if ((qhp->attr.rcq == chp->cq.cqid) && !qp_quiesced(qhp)) { - quiesce_qp(qhp); - continue; - } - if ((qhp->attr.scq == chp->cq.cqid) && !qp_quiesced(qhp)) - quiesce_qp(qhp); - } - return 0; -} - -int iwch_resume_qps(struct iwch_cq *chp) -{ - int i; - struct iwch_qp *qhp; 
- - for (i=0; i < T3_MAX_NUM_QP; i++) { - qhp = get_qhp(chp->rhp, i); - if (!qhp) - continue; - if ((qhp->attr.rcq == chp->cq.cqid) && qp_quiesced(qhp)) { - resume_qp(qhp); - continue; - } - if ((qhp->attr.scq == chp->cq.cqid) && qp_quiesced(qhp)) - resume_qp(qhp); - } - return 0; -} diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 16032cdb4337..cc600c2dd0b3 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -760,7 +760,6 @@ int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count); int c4iw_flush_sq(struct t4_wq *wq, struct t4_cq *cq, int count); int c4iw_ev_handler(struct c4iw_dev *rnicp, u32 qid); u16 c4iw_rqes_posted(struct c4iw_qp *qhp); -int c4iw_post_zb_read(struct c4iw_qp *qhp); int c4iw_post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe); u32 c4iw_get_cqid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx); void c4iw_put_cqid(struct c4iw_rdev *rdev, u32 qid, diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 057cb2505ea1..565a0612a125 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -892,36 +892,6 @@ static inline void build_term_codes(struct t4_cqe *err_cqe, u8 *layer_type, } } -int c4iw_post_zb_read(struct c4iw_qp *qhp) -{ - union t4_wr *wqe; - struct sk_buff *skb; - u8 len16; - - PDBG("%s enter\n", __func__); - skb = alloc_skb(40, GFP_KERNEL); - if (!skb) { - printk(KERN_ERR "%s cannot send zb_read!!\n", __func__); - return -ENOMEM; - } - set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx); - - wqe = (union t4_wr *)skb_put(skb, sizeof wqe->read); - memset(wqe, 0, sizeof wqe->read); - wqe->read.r2 = cpu_to_be64(0); - wqe->read.stag_sink = cpu_to_be32(1); - wqe->read.to_sink_hi = cpu_to_be32(0); - wqe->read.to_sink_lo = cpu_to_be32(1); - wqe->read.stag_src = cpu_to_be32(1); - wqe->read.plen = cpu_to_be32(0); - wqe->read.to_src_hi = cpu_to_be32(0); - 
wqe->read.to_src_lo = cpu_to_be32(1); - len16 = DIV_ROUND_UP(sizeof wqe->read, 16); - init_wr_hdr(wqe, 0, FW_RI_RDMA_READ_WR, FW_RI_COMPLETION_FLAG, len16); - - return c4iw_ofld_send(&qhp->rhp->rdev, skb); -} - static void post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe, gfp_t gfp) { From db8b10167126d72829653690f57b9c7ca53c4d54 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Mon, 10 Jan 2011 17:41:43 -0800 Subject: [PATCH 10/42] RDMA/cxgb4: Don't re-init wait object in init/fini paths Re-initializing the wait object in rdma_init()/rdma_fini() causes a timing window which can lead to a deadlock during close. Once this deadlock hits, all RDMA activity over the T4 device will be stuck. There's no need to re-init the wait object, so remove it. Signed-off-by: Steve Wise Cc: Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb4/qp.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 565a0612a125..20800900ef3f 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -999,7 +999,6 @@ static int rdma_fini(struct c4iw_dev *rhp, struct c4iw_qp *qhp, wqe->cookie = (unsigned long) &ep->com.wr_wait; wqe->u.fini.type = FW_RI_TYPE_FINI; - c4iw_init_wr_wait(&ep->com.wr_wait); ret = c4iw_ofld_send(&rhp->rdev, skb); if (ret) goto out; @@ -1095,7 +1094,6 @@ static int rdma_init(struct c4iw_dev *rhp, struct c4iw_qp *qhp) if (qhp->attr.mpa_attr.initiator) build_rtr_msg(qhp->attr.mpa_attr.p2p_type, &wqe->u.init); - c4iw_init_wr_wait(&qhp->ep->com.wr_wait); ret = c4iw_ofld_send(&rhp->rdev, skb); if (ret) goto out; From 1eba27e87a85f6ed68905055bc9a7dbfb024c255 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 10 Jan 2011 17:41:50 -0800 Subject: [PATCH 11/42] IB/ipath: Use printf extension %pR for struct resource Using %pR standardizes the struct resource output. 
Signed-off-by: Joe Perches Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_driver.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index 765f0fc1da76..b33f0457a1ff 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -530,9 +530,8 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, for (j = 0; j < 6; j++) { if (!pdev->resource[j].start) continue; - ipath_cdbg(VERBOSE, "BAR %d start %llx, end %llx, len %llx\n", - j, (unsigned long long)pdev->resource[j].start, - (unsigned long long)pdev->resource[j].end, + ipath_cdbg(VERBOSE, "BAR %d %pR, len %llx\n", + j, &pdev->resource[j], (unsigned long long)pci_resource_len(pdev, j)); } From 19e364f6801e38972673278adedaab1abf6f854c Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Mon, 10 Jan 2011 17:41:54 -0800 Subject: [PATCH 12/42] IPoIB: Remove LRO support As a first step in moving from LRO to GRO, revert commit af40da894e9 ("IPoIB: add LRO support"). Also eliminate the ethtool set_flags callback which isn't needed anymore. Finally, we need to include <linux/sched.h> directly to get the declaration of restart_syscall() (which used to be included implicitly through <linux/inet_lro.h>). Cc: Ben Hutchings Cc: Eric W. 
Biederman Cc: Vladimir Sokolovsky Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/Kconfig | 1 - drivers/infiniband/ulp/ipoib/ipoib.h | 12 +--- drivers/infiniband/ulp/ipoib/ipoib_ethtool.c | 51 ---------------- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 8 +-- drivers/infiniband/ulp/ipoib/ipoib_main.c | 62 -------------------- 5 files changed, 2 insertions(+), 132 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig index 9d9a9dc51f18..55855eeabae7 100644 --- a/drivers/infiniband/ulp/ipoib/Kconfig +++ b/drivers/infiniband/ulp/ipoib/Kconfig @@ -1,7 +1,6 @@ config INFINIBAND_IPOIB tristate "IP-over-InfiniBand" depends on NETDEVICES && INET && (IPV6 || IPV6=n) - select INET_LRO ---help--- Support for the IP-over-InfiniBand protocol (IPoIB). This transports IP packets over InfiniBand so you can use your IB diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 753a983a5fdc..ab97f92fc257 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -50,7 +50,7 @@ #include #include #include -#include +#include /* constants */ @@ -100,9 +100,6 @@ enum { IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ IPOIB_MCAST_FLAG_ATTACHED = 3, - IPOIB_MAX_LRO_DESCRIPTORS = 8, - IPOIB_LRO_MAX_AGGR = 64, - MAX_SEND_CQE = 16, IPOIB_CM_COPYBREAK = 256, }; @@ -262,11 +259,6 @@ struct ipoib_ethtool_st { u16 max_coalesced_frames; }; -struct ipoib_lro { - struct net_lro_mgr lro_mgr; - struct net_lro_desc lro_desc[IPOIB_MAX_LRO_DESCRIPTORS]; -}; - /* * Device private locking: network stack tx_lock protects members used * in TX fast path, lock protects everything else. 
lock nests inside @@ -352,8 +344,6 @@ struct ipoib_dev_priv { int hca_caps; struct ipoib_ethtool_st ethtool; struct timer_list poll_timer; - - struct ipoib_lro lro; }; struct ipoib_ah { diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index 1a1657c82edd..19f7f5206f78 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -106,63 +106,12 @@ static int ipoib_set_coalesce(struct net_device *dev, return 0; } -static const char ipoib_stats_keys[][ETH_GSTRING_LEN] = { - "LRO aggregated", "LRO flushed", - "LRO avg aggr", "LRO no desc" -}; - -static void ipoib_get_strings(struct net_device *netdev, u32 stringset, u8 *data) -{ - switch (stringset) { - case ETH_SS_STATS: - memcpy(data, *ipoib_stats_keys, sizeof(ipoib_stats_keys)); - break; - } -} - -static int ipoib_get_sset_count(struct net_device *dev, int sset) -{ - switch (sset) { - case ETH_SS_STATS: - return ARRAY_SIZE(ipoib_stats_keys); - default: - return -EOPNOTSUPP; - } -} - -static void ipoib_get_ethtool_stats(struct net_device *dev, - struct ethtool_stats *stats, uint64_t *data) -{ - struct ipoib_dev_priv *priv = netdev_priv(dev); - int index = 0; - - /* Get LRO statistics */ - data[index++] = priv->lro.lro_mgr.stats.aggregated; - data[index++] = priv->lro.lro_mgr.stats.flushed; - if (priv->lro.lro_mgr.stats.flushed) - data[index++] = priv->lro.lro_mgr.stats.aggregated / - priv->lro.lro_mgr.stats.flushed; - else - data[index++] = 0; - data[index++] = priv->lro.lro_mgr.stats.no_desc; -} - -static int ipoib_set_flags(struct net_device *dev, u32 flags) -{ - return ethtool_op_set_flags(dev, flags, ETH_FLAG_LRO); -} - static const struct ethtool_ops ipoib_ethtool_ops = { .get_drvinfo = ipoib_get_drvinfo, .get_rx_csum = ipoib_get_rx_csum, .set_tso = ipoib_set_tso, .get_coalesce = ipoib_get_coalesce, .set_coalesce = ipoib_set_coalesce, - .get_flags = ethtool_op_get_flags, - .set_flags = ipoib_set_flags, - 
.get_strings = ipoib_get_strings, - .get_sset_count = ipoib_get_sset_count, - .get_ethtool_stats = ipoib_get_ethtool_stats, }; void ipoib_set_ethtool_ops(struct net_device *dev) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index dfa71903d6e4..44c33bd97952 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -295,10 +295,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok)) skb->ip_summed = CHECKSUM_UNNECESSARY; - if (dev->features & NETIF_F_LRO) - lro_receive_skb(&priv->lro.lro_mgr, skb, NULL); - else - netif_receive_skb(skb); + netif_receive_skb(skb); repost: if (unlikely(ipoib_ib_post_receive(dev, wr_id))) @@ -450,9 +447,6 @@ poll_more: } if (done < budget) { - if (dev->features & NETIF_F_LRO) - lro_flush_all(&priv->lro.lro_mgr); - napi_complete(napi); if (unlikely(ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP | diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 9ff7bc73ed95..c434a856a787 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -60,15 +60,6 @@ MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); -static int lro; -module_param(lro, bool, 0444); -MODULE_PARM_DESC(lro, "Enable LRO (Large Receive Offload)"); - -static int lro_max_aggr = IPOIB_LRO_MAX_AGGR; -module_param(lro_max_aggr, int, 0644); -MODULE_PARM_DESC(lro_max_aggr, "LRO: Max packets to be aggregated " - "(default = 64)"); - #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG int ipoib_debug_level; @@ -976,54 +967,6 @@ static const struct header_ops ipoib_header_ops = { .create = ipoib_hard_header, }; -static int get_skb_hdr(struct sk_buff *skb, void 
**iphdr, - void **tcph, u64 *hdr_flags, void *priv) -{ - unsigned int ip_len; - struct iphdr *iph; - - if (unlikely(skb->protocol != htons(ETH_P_IP))) - return -1; - - /* - * In the future we may add an else clause that verifies the - * checksum and allows devices which do not calculate checksum - * to use LRO. - */ - if (unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY)) - return -1; - - /* Check for non-TCP packet */ - skb_reset_network_header(skb); - iph = ip_hdr(skb); - if (iph->protocol != IPPROTO_TCP) - return -1; - - ip_len = ip_hdrlen(skb); - skb_set_transport_header(skb, ip_len); - *tcph = tcp_hdr(skb); - - /* check if IP header and TCP header are complete */ - if (ntohs(iph->tot_len) < ip_len + tcp_hdrlen(skb)) - return -1; - - *hdr_flags = LRO_IPV4 | LRO_TCP; - *iphdr = iph; - - return 0; -} - -static void ipoib_lro_setup(struct ipoib_dev_priv *priv) -{ - priv->lro.lro_mgr.max_aggr = lro_max_aggr; - priv->lro.lro_mgr.max_desc = IPOIB_MAX_LRO_DESCRIPTORS; - priv->lro.lro_mgr.lro_arr = priv->lro.lro_desc; - priv->lro.lro_mgr.get_skb_header = get_skb_hdr; - priv->lro.lro_mgr.features = LRO_F_NAPI; - priv->lro.lro_mgr.dev = priv->dev; - priv->lro.lro_mgr.ip_summed_aggr = CHECKSUM_UNNECESSARY; -} - static const struct net_device_ops ipoib_netdev_ops = { .ndo_open = ipoib_open, .ndo_stop = ipoib_stop, @@ -1067,8 +1010,6 @@ static void ipoib_setup(struct net_device *dev) priv->dev = dev; - ipoib_lro_setup(priv); - spin_lock_init(&priv->lock); mutex_init(&priv->vlan_mutex); @@ -1218,9 +1159,6 @@ int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) priv->dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM; } - if (lro) - priv->dev->features |= NETIF_F_LRO; - if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO) priv->dev->features |= NETIF_F_TSO; From 8ae31e5b1fc73751d800d551fb30340caa53c7dd Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Mon, 10 Jan 2011 17:41:55 -0800 Subject: [PATCH 13/42] IPoIB: Add GRO support 
Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 1 + drivers/infiniband/ulp/ipoib/ipoib_ib.c | 2 +- drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index bb1004114dec..c1c49f2d35b5 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -1480,6 +1480,7 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr, if (test_bit(IPOIB_FLAG_CSUM, &priv->flags)) { dev->features |= NETIF_F_IP_CSUM | NETIF_F_SG; + priv->dev->features |= NETIF_F_GRO; if (priv->hca_caps & IB_DEVICE_UD_TSO) dev->features |= NETIF_F_TSO; } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 44c33bd97952..806d0292dc39 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -295,7 +295,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok)) skb->ip_summed = CHECKSUM_UNNECESSARY; - netif_receive_skb(skb); + napi_gro_receive(&priv->napi, skb); repost: if (unlikely(ipoib_ib_post_receive(dev, wr_id))) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index c434a856a787..7a07a728fe0d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1159,6 +1159,8 @@ int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) priv->dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM; } + priv->dev->features |= NETIF_F_GRO; + if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO) priv->dev->features |= NETIF_F_TSO; From f5a49539a64ae5e3833fddec54a3c2524c7fe333 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 10 Jan 2011 17:42:05 -0800 
Subject: [PATCH 14/42] mlx4_core: Remove warning message about firmware bug The kernel warning message added in commit 58d74bb1d9f7 ("mlx4_core: Workaround firmware bug in query dev cap") about mlx4 reporting the wrong number of "blue flame registers" doesn't really help anyone, since the firmware bug is known and fixed and the bug is pretty much harmless to users. So just get rid of the warning. Signed-off-by: Roland Dreier --- drivers/net/mlx4/fw.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c index 7a7e18ba278a..5de1db897835 100644 --- a/drivers/net/mlx4/fw.c +++ b/drivers/net/mlx4/fw.c @@ -289,10 +289,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_BF_REG_SZ_OFFSET); dev_cap->bf_reg_size = 1 << (field & 0x1f); MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_MAX_BF_REGS_PER_PAGE_OFFSET); - if ((1 << (field & 0x3f)) > (PAGE_SIZE / dev_cap->bf_reg_size)) { - mlx4_warn(dev, "firmware bug: log2 # of blue flame regs is invalid (%d), forcing 3\n", field & 0x1f); + if ((1 << (field & 0x3f)) > (PAGE_SIZE / dev_cap->bf_reg_size)) field = 3; - } dev_cap->bf_regs_per_page = 1 << (field & 0x3f); mlx4_dbg(dev, "BlueFlame available (reg size %d, regs/page %d)\n", dev_cap->bf_reg_size, dev_cap->bf_regs_per_page); From 3afa9f19e5bd16abed998b7bf1b178206403286f Mon Sep 17 00:00:00 2001 From: Vladimir Sokolovsky Date: Mon, 10 Jan 2011 17:42:06 -0800 Subject: [PATCH 15/42] IB/mlx4: Don't call dma_free_coherent() with irqs disabled mlx4_ib_free_cq_buf() should not be called under spin_lock_irq() since it calls dma_free_coherent(), which needs irqs enabled. Fix this by deferring the free to outside the locked region. This was found due to the WARN_ON(irqs_disabled()); in swiotlb_free_coherent(). 
Signed-off-by: Vladimir Sokolovsky Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/cq.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 5a219a2fdf16..e8df155bc3b0 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -397,10 +397,14 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) cq->resize_buf = NULL; cq->resize_umem = NULL; } else { + struct mlx4_ib_cq_buf tmp_buf; + int tmp_cqe = 0; + spin_lock_irq(&cq->lock); if (cq->resize_buf) { mlx4_ib_cq_resize_copy_cqes(cq); - mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe); + tmp_buf = cq->buf; + tmp_cqe = cq->ibcq.cqe; cq->buf = cq->resize_buf->buf; cq->ibcq.cqe = cq->resize_buf->cqe; @@ -408,6 +412,9 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) cq->resize_buf = NULL; } spin_unlock_irq(&cq->lock); + + if (tmp_cqe) + mlx4_ib_free_cq_buf(dev, &tmp_buf, tmp_cqe); } goto out; From 030b4b3309e29c6c857d8521c3076743663c259e Mon Sep 17 00:00:00 2001 From: Ali Ayoub Date: Mon, 10 Jan 2011 17:42:06 -0800 Subject: [PATCH 16/42] mlx4_core: Avoid vunmap() of invalid pointer if allocation fails Signed-off-by: Ali Ayoub Signed-off-by: Roland Dreier --- drivers/net/mlx4/alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/mlx4/alloc.c b/drivers/net/mlx4/alloc.c index 8f4bf1f07c11..3a4277f6fac4 100644 --- a/drivers/net/mlx4/alloc.c +++ b/drivers/net/mlx4/alloc.c @@ -178,6 +178,7 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, } else { int i; + buf->direct.buf = NULL; buf->nbufs = (size + PAGE_SIZE - 1) / PAGE_SIZE; buf->npages = buf->nbufs; buf->page_shift = PAGE_SHIFT; @@ -229,7 +230,7 @@ void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf) dma_free_coherent(&dev->pdev->dev, size, buf->direct.buf, buf->direct.map); else { - if (BITS_PER_LONG == 64) + 
if (BITS_PER_LONG == 64 && buf->direct.buf) vunmap(buf->direct.buf); for (i = 0; i < buf->nbufs; ++i) From 1397490938aa0aca39001c3fd5a9fc9387110d86 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 10 Jan 2011 17:42:06 -0800 Subject: [PATCH 17/42] IB/mlx4: Handle -ENOMEM in forward_trap() ib_create_send_mad() can return ERR_PTR(-ENOMEM) here. Signed-off-by: Dan Carpenter Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/mad.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index c9a8dd63b9e2..57ffa50f509e 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -211,6 +211,8 @@ static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, struct ib_mad *ma if (agent) { send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, GFP_ATOMIC); + if (IS_ERR(send_buf)) + return; /* * We rely here on the fact that MLX QPs don't use the * address handle after the send is posted (this is From d0444f1527f22d193c209d4fdad0dbabbf2be2ee Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 10 Jan 2011 17:42:10 -0800 Subject: [PATCH 18/42] IB/mthca: Handle -ENOMEM in forward_trap() ib_create_send_mad() can return ERR_PTR(-ENOMEM) here. 
Signed-off-by: Dan Carpenter Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_mad.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c index 5648659ff0b0..03a59534f59e 100644 --- a/drivers/infiniband/hw/mthca/mthca_mad.c +++ b/drivers/infiniband/hw/mthca/mthca_mad.c @@ -171,6 +171,8 @@ static void forward_trap(struct mthca_dev *dev, if (agent) { send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, GFP_ATOMIC); + if (IS_ERR(send_buf)) + return; /* * We rely here on the fact that MLX QPs don't use the * address handle after the send is posted (this is From 601d87b0795c5affe5b73318c1088edbf10ce72f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 10 Jan 2011 17:42:14 -0800 Subject: [PATCH 19/42] RDMA/nes: Fix string continuation line Signed-off-by: Joe Perches Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_nic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c index 3892e2c0e95a..5a4c36484722 100644 --- a/drivers/infiniband/hw/nes/nes_nic.c +++ b/drivers/infiniband/hw/nes/nes_nic.c @@ -908,8 +908,8 @@ static void nes_netdev_set_multicast_list(struct net_device *netdev) nesvnic->nic_index && mc_index < max_pft_entries_avaiable) { nes_debug(NES_DBG_NIC_RX, - "mc_index=%d skipping nic_index=%d,\ - used for=%d \n", mc_index, + "mc_index=%d skipping nic_index=%d, " + "used for=%d \n", mc_index, nesvnic->nic_index, nesadapter->pft_mcast_map[mc_index]); mc_index++; From 9d5b243f24212966d6d06058f96b2b1a22482f59 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Jan 2011 17:42:19 -0800 Subject: [PATCH 20/42] IB/qib: Remove IB latency turnoff This is required for hardware testing. 
Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_iba7322.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 584d443b5335..9031cd82e879 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -7271,8 +7271,6 @@ static int serdes_7322_init(struct qib_pportdata *ppd) ibsd_wr_allchans(ppd, 20, (4 << 13), BMASK(15, 13)); /* SDR */ data = qib_read_kreg_port(ppd, krp_serdesctrl); - /* Turn off IB latency mode */ - data &= ~SYM_MASK(IBSerdesCtrl_0, IB_LAT_MODE); qib_write_kreg_port(ppd, krp_serdesctrl, data | SYM_MASK(IBSerdesCtrl_0, RXLOSEN)); From 0a43e117221702b08a023d6aa1a31ac30e40866b Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Jan 2011 17:42:19 -0800 Subject: [PATCH 21/42] IB/qib: Add receive header queue size module parameters The receive header queue sizes need to be modified for performance tuning. Three module parameters are added to support this. 
Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_iba7322.c | 28 +++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 9031cd82e879..b35676f44373 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -111,6 +111,21 @@ static ushort qib_singleport; module_param_named(singleport, qib_singleport, ushort, S_IRUGO); MODULE_PARM_DESC(singleport, "Use only IB port 1; more per-port buffer space"); +/* + * Receive header queue sizes + */ +static unsigned qib_rcvhdrcnt; +module_param_named(rcvhdrcnt, qib_rcvhdrcnt, uint, S_IRUGO); +MODULE_PARM_DESC(rcvhdrcnt, "receive header count"); + +static unsigned qib_rcvhdrsize; +module_param_named(rcvhdrsize, qib_rcvhdrsize, uint, S_IRUGO); +MODULE_PARM_DESC(rcvhdrsize, "receive header size in 32-bit words"); + +static unsigned qib_rcvhdrentsize; +module_param_named(rcvhdrentsize, qib_rcvhdrentsize, uint, S_IRUGO); +MODULE_PARM_DESC(rcvhdrentsize, "receive header entry size in 32-bit words"); + #define MAX_ATTEN_LEN 64 /* plenty for any real system */ /* for read back, default index is ~5m copper cable */ static char txselect_list[MAX_ATTEN_LEN] = "10"; @@ -3530,8 +3545,11 @@ static void qib_7322_config_ctxts(struct qib_devdata *dd) /* kr_rcvegrcnt changes based on the number of contexts enabled */ dd->cspec->rcvegrcnt = qib_read_kreg32(dd, kr_rcvegrcnt); - dd->rcvhdrcnt = max(dd->cspec->rcvegrcnt, - dd->num_pports > 1 ? 1024U : 2048U); + if (qib_rcvhdrcnt) + dd->rcvhdrcnt = max(dd->cspec->rcvegrcnt, qib_rcvhdrcnt); + else + dd->rcvhdrcnt = max(dd->cspec->rcvegrcnt, + dd->num_pports > 1 ? 
1024U : 2048U); } static int qib_7322_get_ib_cfg(struct qib_pportdata *ppd, int which) @@ -6097,8 +6115,10 @@ static int qib_init_7322_variables(struct qib_devdata *dd) ppd++; } - dd->rcvhdrentsize = QIB_RCVHDR_ENTSIZE; - dd->rcvhdrsize = QIB_DFLT_RCVHDRSIZE; + dd->rcvhdrentsize = qib_rcvhdrentsize ? + qib_rcvhdrentsize : QIB_RCVHDR_ENTSIZE; + dd->rcvhdrsize = qib_rcvhdrsize ? + qib_rcvhdrsize : QIB_DFLT_RCVHDRSIZE; dd->rhf_offset = dd->rcvhdrentsize - sizeof(u64) / sizeof(u32); /* we always allocate at least 2048 bytes for eager buffers */ From f509f9c14d3f70834f964189293bed3e0e1fc839 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Jan 2011 17:42:19 -0800 Subject: [PATCH 22/42] IB/qib: Add support for the new QME7362 card Add support to recognize another board variation named QME7362. Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_iba7322.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index b35676f44373..40f4a2353320 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -3172,6 +3172,10 @@ static unsigned qib_7322_boardname(struct qib_devdata *dd) case BOARD_QME7342: n = "InfiniPath_QME7342"; break; + case 8: + n = "InfiniPath_QME7362"; + dd->flags |= QIB_HAS_QSFP; + break; case 15: n = "InfiniPath_QLE7342_TEST"; dd->flags |= QIB_HAS_QSFP; From a377acd15188cf11e47a67af5feea098cd8b25d2 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Jan 2011 17:42:19 -0800 Subject: [PATCH 23/42] IB/qib: Generate completion callback on errors According to IBTA vol. 1, C11-30.1.1, a notification callback is invoked if the CQ is armed for the next solicited completion event or an error completion. The error case wasn't being generated correctly. 
Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_cq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/qib/qib_cq.c b/drivers/infiniband/hw/qib/qib_cq.c index a86cbf880f98..5246aa486bbe 100644 --- a/drivers/infiniband/hw/qib/qib_cq.c +++ b/drivers/infiniband/hw/qib/qib_cq.c @@ -100,7 +100,8 @@ void qib_cq_enter(struct qib_cq *cq, struct ib_wc *entry, int solicited) wc->head = next; if (cq->notify == IB_CQ_NEXT_COMP || - (cq->notify == IB_CQ_SOLICITED && solicited)) { + (cq->notify == IB_CQ_SOLICITED && + (solicited || entry->status != IB_WC_SUCCESS))) { cq->notify = IB_CQ_NONE; cq->triggered++; /* From 3c9e5f4d657170c2fbc3d382d2daae3820713a6c Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Jan 2011 17:42:19 -0800 Subject: [PATCH 24/42] IB/qib: Set port physical state even if other fields are invalid The IBTA vol. 1 release 1.2.1 spec. says: C14-24.2.1: If PortInfo:Portstate=Down, then a SubnSet(PortInfo) shall make any changes it specifies to PortInfo:PortPhysicalState; any other result is vendor-dependent. The patch changes the error handling so that the reply says there are invalid fields but still attempts to set fields that are in range including PortInfo:PortPhysicalState. Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_mad.c | 45 ++++++++++++++++------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c index 94b0d1f3a8f0..5ad224e4a38b 100644 --- a/drivers/infiniband/hw/qib/qib_mad.c +++ b/drivers/infiniband/hw/qib/qib_mad.c @@ -668,8 +668,8 @@ static int subn_set_portinfo(struct ib_smp *smp, struct ib_device *ibdev, lid = be16_to_cpu(pip->lid); /* Must be a valid unicast LID address. 
*/ if (lid == 0 || lid >= QIB_MULTICAST_LID_BASE) - goto err; - if (ppd->lid != lid || ppd->lmc != (pip->mkeyprot_resv_lmc & 7)) { + smp->status |= IB_SMP_INVALID_FIELD; + else if (ppd->lid != lid || ppd->lmc != (pip->mkeyprot_resv_lmc & 7)) { if (ppd->lid != lid) qib_set_uevent_bits(ppd, _QIB_EVENT_LID_CHANGE_BIT); if (ppd->lmc != (pip->mkeyprot_resv_lmc & 7)) @@ -683,8 +683,8 @@ static int subn_set_portinfo(struct ib_smp *smp, struct ib_device *ibdev, msl = pip->neighbormtu_mastersmsl & 0xF; /* Must be a valid unicast LID address. */ if (smlid == 0 || smlid >= QIB_MULTICAST_LID_BASE) - goto err; - if (smlid != ibp->sm_lid || msl != ibp->sm_sl) { + smp->status |= IB_SMP_INVALID_FIELD; + else if (smlid != ibp->sm_lid || msl != ibp->sm_sl) { spin_lock_irqsave(&ibp->lock, flags); if (ibp->sm_ah) { if (smlid != ibp->sm_lid) @@ -707,8 +707,9 @@ static int subn_set_portinfo(struct ib_smp *smp, struct ib_device *ibdev, if (lwe == 0xFF) lwe = ppd->link_width_supported; else if (lwe >= 16 || (lwe & ~ppd->link_width_supported)) - goto err; - set_link_width_enabled(ppd, lwe); + smp->status |= IB_SMP_INVALID_FIELD; + else if (lwe != ppd->link_width_enabled) + set_link_width_enabled(ppd, lwe); } lse = pip->linkspeedactive_enabled & 0xF; @@ -721,8 +722,9 @@ static int subn_set_portinfo(struct ib_smp *smp, struct ib_device *ibdev, if (lse == 15) lse = ppd->link_speed_supported; else if (lse >= 8 || (lse & ~ppd->link_speed_supported)) - goto err; - set_link_speed_enabled(ppd, lse); + smp->status |= IB_SMP_INVALID_FIELD; + else if (lse != ppd->link_speed_enabled) + set_link_speed_enabled(ppd, lse); } /* Set link down default state. 
*/ @@ -738,7 +740,7 @@ static int subn_set_portinfo(struct ib_smp *smp, struct ib_device *ibdev, IB_LINKINITCMD_POLL); break; default: - goto err; + smp->status |= IB_SMP_INVALID_FIELD; } ibp->mkeyprot = pip->mkeyprot_resv_lmc >> 6; @@ -748,15 +750,17 @@ static int subn_set_portinfo(struct ib_smp *smp, struct ib_device *ibdev, mtu = ib_mtu_enum_to_int((pip->neighbormtu_mastersmsl >> 4) & 0xF); if (mtu == -1) - goto err; - qib_set_mtu(ppd, mtu); + smp->status |= IB_SMP_INVALID_FIELD; + else + qib_set_mtu(ppd, mtu); /* Set operational VLs */ vls = (pip->operationalvl_pei_peo_fpi_fpo >> 4) & 0xF; if (vls) { if (vls > ppd->vls_supported) - goto err; - (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_OP_VLS, vls); + smp->status |= IB_SMP_INVALID_FIELD; + else + (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_OP_VLS, vls); } if (pip->mkey_violations == 0) @@ -770,10 +774,10 @@ static int subn_set_portinfo(struct ib_smp *smp, struct ib_device *ibdev, ore = pip->localphyerrors_overrunerrors; if (set_phyerrthreshold(ppd, (ore >> 4) & 0xF)) - goto err; + smp->status |= IB_SMP_INVALID_FIELD; if (set_overrunthreshold(ppd, (ore & 0xF))) - goto err; + smp->status |= IB_SMP_INVALID_FIELD; ibp->subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F; @@ -792,7 +796,7 @@ static int subn_set_portinfo(struct ib_smp *smp, struct ib_device *ibdev, state = pip->linkspeed_portstate & 0xF; lstate = (pip->portphysstate_linkdown >> 4) & 0xF; if (lstate && !(state == IB_PORT_DOWN || state == IB_PORT_NOP)) - goto err; + smp->status |= IB_SMP_INVALID_FIELD; /* * Only state changes of DOWN, ARM, and ACTIVE are valid @@ -812,8 +816,10 @@ static int subn_set_portinfo(struct ib_smp *smp, struct ib_device *ibdev, lstate = QIB_IB_LINKDOWN; else if (lstate == 3) lstate = QIB_IB_LINKDOWN_DISABLE; - else - goto err; + else { + smp->status |= IB_SMP_INVALID_FIELD; + break; + } spin_lock_irqsave(&ppd->lflags_lock, flags); ppd->lflags &= ~QIBL_LINKV; spin_unlock_irqrestore(&ppd->lflags_lock, flags); @@ -835,8 +841,7 @@ 
static int subn_set_portinfo(struct ib_smp *smp, struct ib_device *ibdev, qib_set_linkstate(ppd, QIB_IB_LINKACTIVE); break; default: - /* XXX We have already partially updated our state! */ - goto err; + smp->status |= IB_SMP_INVALID_FIELD; } ret = subn_get_portinfo(smp, ibdev, port); From c7665e5a6988450a6cc19c2dc1dff4d290077614 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Jan 2011 17:42:20 -0800 Subject: [PATCH 25/42] IB/qib: UD send with immediate receive completion has wrong size The code to generate receive completion entries for UD send with immediate contains the wrong payload length. This is because when the code to compute the payload size was moved, the value of hdrsize didn't get moved too. The fix is to update tlen directly. Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_ud.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index e1b3da2a1f85..a4b945d9a303 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -519,7 +519,7 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) { wc.ex.imm_data = ohdr->u.ud.imm_data; wc.wc_flags = IB_WC_WITH_IMM; - hdrsize += sizeof(u32); + tlen -= sizeof(u32); } else if (opcode == IB_OPCODE_UD_SEND_ONLY) { wc.ex.imm_data = 0; wc.wc_flags = 0; From b3d5cb2f2067b30da53aa67e42fdd733030fb411 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Jan 2011 17:42:20 -0800 Subject: [PATCH 26/42] IB/qib: Handle transitions from ACTIVE_DEFERRED to ACTIVE better When the link transitions from ACTIVE_DEFERRED to ACTIVE, the driver only sees the ACTIVE state. With this change, it will check whether the state was already ACTIVE and if so, it will not generate IB events and will not clear symbol error counts. 
Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_intr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/qib/qib_intr.c b/drivers/infiniband/hw/qib/qib_intr.c index 54a40828a106..a693c56ec8a6 100644 --- a/drivers/infiniband/hw/qib/qib_intr.c +++ b/drivers/infiniband/hw/qib/qib_intr.c @@ -131,7 +131,8 @@ void qib_handle_e_ibstatuschanged(struct qib_pportdata *ppd, u64 ibcs) /* start a 75msec timer to clear symbol errors */ mod_timer(&ppd->symerr_clear_timer, msecs_to_jiffies(75)); - } else if (ltstate == IB_PHYSPORTSTATE_LINKUP) { + } else if (ltstate == IB_PHYSPORTSTATE_LINKUP && + !(ppd->lflags & QIBL_LINKACTIVE)) { /* active, but not active defered */ qib_hol_up(ppd); /* useful only for 6120 now */ *ppd->statusp |= From 5dbbcb97cc55dd6e0c34cf06f9e1ee181247d054 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Jan 2011 17:42:20 -0800 Subject: [PATCH 27/42] IB/qib: Fix multi-Florida HCA host panic on reboot Add check when setting configured contexts that the value does not exceed the number of contexts allocated for the card. If the value exceeds the already allocated count, set it to what is already allocated. 
Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_init.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index f3b503936043..7896afbb9ce8 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -92,9 +92,11 @@ unsigned long *qib_cpulist; /* set number of contexts we'll actually use */ void qib_set_ctxtcnt(struct qib_devdata *dd) { - if (!qib_cfgctxts) + if (!qib_cfgctxts) { dd->cfgctxts = dd->first_user_ctxt + num_online_cpus(); - else if (qib_cfgctxts < dd->num_pports) + if (dd->cfgctxts > dd->ctxtcnt) + dd->cfgctxts = dd->ctxtcnt; + } else if (qib_cfgctxts < dd->num_pports) dd->cfgctxts = dd->ctxtcnt; else if (qib_cfgctxts <= dd->ctxtcnt) dd->cfgctxts = qib_cfgctxts; From 6676b3f746ff164a4a367c9aa5ae4b5b1145083c Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Jan 2011 17:42:20 -0800 Subject: [PATCH 28/42] IB/qib: Fix context allocation with multiple HCAs The driver was incorrectly choosing HCAs on which to allocate new user contexts based on overall count of usable ports regardless of whether the usable port was on the currently selected HCA. 
Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_file_ops.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 79d9971aff1f..75bfad16c114 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -1379,17 +1379,17 @@ static int get_a_ctxt(struct file *fp, const struct qib_user_info *uinfo, /* find device (with ACTIVE ports) with fewest ctxts in use */ for (ndev = 0; ndev < devmax; ndev++) { struct qib_devdata *dd = qib_lookup(ndev); - unsigned cused = 0, cfree = 0; + unsigned cused = 0, cfree = 0, pusable = 0; if (!dd) continue; if (port && port <= dd->num_pports && usable(dd->pport + port - 1)) - dusable = 1; + pusable = 1; else for (i = 0; i < dd->num_pports; i++) if (usable(dd->pport + i)) - dusable++; - if (!dusable) + pusable++; + if (!pusable) continue; for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts; ctxt++) @@ -1397,7 +1397,7 @@ static int get_a_ctxt(struct file *fp, const struct qib_user_info *uinfo, cused++; else cfree++; - if (cfree && cused < inuse) { + if (pusable && cfree && cused < inuse) { udd = dd; inuse = cused; } From 16028f27778cb6439516c36c0a72446d29805691 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Jan 2011 17:42:20 -0800 Subject: [PATCH 29/42] IB/qib: Clear WAIT_SEND flags when setting QP to error state If these flags are set when the QP is transitioned to the error state, it will wait until the flags are cleared, which may never happen if the error transition is due to a link going down. 
Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_qp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index 6c39851d2ded..32dacd444158 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -468,6 +468,10 @@ int qib_error_qp(struct qib_qp *qp, enum ib_wc_status err) qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR); del_timer(&qp->s_timer); } + + if (qp->s_flags & QIB_S_ANY_WAIT_SEND) + qp->s_flags &= ~QIB_S_ANY_WAIT_SEND; + spin_lock(&dev->pending_lock); if (!list_empty(&qp->iowait) && !(qp->s_flags & QIB_S_BUSY)) { qp->s_flags &= ~QIB_S_ANY_WAIT_IO; From a0a234d47dcacfdb0a8dfcb861e0bd8300702674 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Jan 2011 17:42:20 -0800 Subject: [PATCH 30/42] IB/qib: New SERDES init routine and improvements to SI quality Implement new SERDES initialization routine and improvements to signal integrity -- disable LE1 adaptation, disable LOS after link-up, set better SERDES parameters. 
Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_iba7322.c | 270 ++++++++++++++++++++++-- 1 file changed, 255 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 40f4a2353320..d23297a307aa 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -71,6 +71,9 @@ static void qib_7322_mini_pcs_reset(struct qib_pportdata *); static u32 ahb_mod(struct qib_devdata *, int, int, int, u32, u32); static void ibsd_wr_allchans(struct qib_pportdata *, int, unsigned, unsigned); +static void serdes_7322_los_enable(struct qib_pportdata *, int); +static int serdes_7322_init_old(struct qib_pportdata *); +static int serdes_7322_init_new(struct qib_pportdata *); #define BMASK(msb, lsb) (((1 << ((msb) + 1 - (lsb))) - 1) << (lsb)) @@ -1692,6 +1695,8 @@ static void handle_serdes_issues(struct qib_pportdata *ppd, u64 ibcst) (ibcst & SYM_MASK(IBCStatusA_0, LinkSpeedQDR))) { force_h1(ppd); ppd->cpspec->qdr_reforce = 1; + if (!ppd->dd->cspec->r1) + serdes_7322_los_enable(ppd, 0); } else if (ppd->cpspec->qdr_reforce && (ibcst & SYM_MASK(IBCStatusA_0, LinkSpeedQDR)) && (ibclt == IB_7322_LT_STATE_CFGENH || @@ -1707,15 +1712,32 @@ static void handle_serdes_issues(struct qib_pportdata *ppd, u64 ibcst) ibclt <= IB_7322_LT_STATE_SLEEPQUIET))) adj_tx_serdes(ppd); - if (!ppd->cpspec->qdr_dfe_on && ibclt != IB_7322_LT_STATE_LINKUP && - ibclt <= IB_7322_LT_STATE_SLEEPQUIET) { - ppd->cpspec->qdr_dfe_on = 1; - ppd->cpspec->qdr_dfe_time = 0; - /* On link down, reenable QDR adaptation */ - qib_write_kreg_port(ppd, krp_static_adapt_dis(2), - ppd->dd->cspec->r1 ? 
- QDR_STATIC_ADAPT_DOWN_R1 : - QDR_STATIC_ADAPT_DOWN); + if (ibclt != IB_7322_LT_STATE_LINKUP) { + u8 ltstate = qib_7322_phys_portstate(ibcst); + u8 pibclt = (u8)SYM_FIELD(ppd->lastibcstat, IBCStatusA_0, + LinkTrainingState); + if (!ppd->dd->cspec->r1 && + pibclt == IB_7322_LT_STATE_LINKUP && + ltstate != IB_PHYSPORTSTATE_LINK_ERR_RECOVER && + ltstate != IB_PHYSPORTSTATE_RECOVERY_RETRAIN && + ltstate != IB_PHYSPORTSTATE_RECOVERY_WAITRMT && + ltstate != IB_PHYSPORTSTATE_RECOVERY_IDLE) + /* If the link went down (but no into recovery, + * turn LOS back on */ + serdes_7322_los_enable(ppd, 1); + if (!ppd->cpspec->qdr_dfe_on && + ibclt <= IB_7322_LT_STATE_SLEEPQUIET) { + ppd->cpspec->qdr_dfe_on = 1; + ppd->cpspec->qdr_dfe_time = 0; + /* On link down, reenable QDR adaptation */ + qib_write_kreg_port(ppd, krp_static_adapt_dis(2), + ppd->dd->cspec->r1 ? + QDR_STATIC_ADAPT_DOWN_R1 : + QDR_STATIC_ADAPT_DOWN); + printk(KERN_INFO QIB_DRV_NAME + " IB%u:%u re-enabled QDR adaptation " + "ibclt %x\n", ppd->dd->unit, ppd->port, ibclt); + } } } @@ -5544,7 +5566,7 @@ static void qsfp_7322_event(struct work_struct *work) u64 now = get_jiffies_64(); if (time_after64(now, pwrup)) break; - msleep(1); + msleep(20); } ret = qib_refresh_qsfp_cache(ppd, &qd->cache); /* @@ -6519,7 +6541,7 @@ static void qib_7322_txchk_change(struct qib_devdata *dd, u32 start, /* make sure we see an updated copy next time around */ sendctrl_7322_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP); sleeps++; - msleep(1); + msleep(20); } switch (which) { @@ -7234,9 +7256,30 @@ static void ibsd_wr_allchans(struct qib_pportdata *ppd, int addr, unsigned data, } } +static void serdes_7322_los_enable(struct qib_pportdata *ppd, int enable) +{ + u64 data = qib_read_kreg_port(ppd, krp_serdesctrl); + printk(KERN_INFO QIB_DRV_NAME " Turning LOS %s for port %d\n", + (enable ? 
"on" : "off"), ppd->port); + if (enable) + data |= SYM_MASK(IBSerdesCtrl_0, RXLOSEN); + else + data &= ~SYM_MASK(IBSerdesCtrl_0, RXLOSEN); + qib_write_kreg_port(ppd, krp_serdesctrl, data); +} + static int serdes_7322_init(struct qib_pportdata *ppd) { - u64 data; + int ret = 0; + if (ppd->dd->cspec->r1) + ret = serdes_7322_init_old(ppd); + else + ret = serdes_7322_init_new(ppd); + return ret; +} + +static int serdes_7322_init_old(struct qib_pportdata *ppd) +{ u32 le_val; /* @@ -7294,9 +7337,7 @@ static int serdes_7322_init(struct qib_pportdata *ppd) ibsd_wr_allchans(ppd, 20, (2 << 10), BMASK(12, 10)); /* DDR */ ibsd_wr_allchans(ppd, 20, (4 << 13), BMASK(15, 13)); /* SDR */ - data = qib_read_kreg_port(ppd, krp_serdesctrl); - qib_write_kreg_port(ppd, krp_serdesctrl, data | - SYM_MASK(IBSerdesCtrl_0, RXLOSEN)); + serdes_7322_los_enable(ppd, 1); /* rxbistena; set 0 to avoid effects of it switch later */ ibsd_wr_allchans(ppd, 9, 0 << 15, 1 << 15); @@ -7336,6 +7377,205 @@ static int serdes_7322_init(struct qib_pportdata *ppd) return 0; } +static int serdes_7322_init_new(struct qib_pportdata *ppd) +{ + u64 tstart; + u32 le_val, rxcaldone; + int chan, chan_done = (1 << SERDES_CHANS) - 1; + + /* + * Initialize the Tx DDS tables. 
Also done every QSFP event, + * for adapters with QSFP + */ + init_txdds_table(ppd, 0); + + /* Clear cmode-override, may be set from older driver */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 0 << 14, 1 << 14); + + /* ensure no tx overrides from earlier driver loads */ + qib_write_kreg_port(ppd, krp_tx_deemph_override, + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + reset_tx_deemphasis_override)); + + /* START OF LSI SUGGESTED SERDES BRINGUP */ + /* Reset - Calibration Setup */ + /* Stop DFE adaptaion */ + ibsd_wr_allchans(ppd, 1, 0, BMASK(9, 1)); + /* Disable LE1 */ + ibsd_wr_allchans(ppd, 13, 0, BMASK(5, 5)); + /* Disable autoadapt for LE1 */ + ibsd_wr_allchans(ppd, 1, 0, BMASK(15, 15)); + /* Disable LE2 */ + ibsd_wr_allchans(ppd, 13, 0, BMASK(6, 6)); + /* Disable VGA */ + ibsd_wr_allchans(ppd, 5, 0, BMASK(0, 0)); + /* Disable AFE Offset Cancel */ + ibsd_wr_allchans(ppd, 12, 0, BMASK(12, 12)); + /* Disable Timing Loop */ + ibsd_wr_allchans(ppd, 2, 0, BMASK(3, 3)); + /* Disable Frequency Loop */ + ibsd_wr_allchans(ppd, 2, 0, BMASK(4, 4)); + /* Disable Baseline Wander Correction */ + ibsd_wr_allchans(ppd, 13, 0, BMASK(13, 13)); + /* Disable RX Calibration */ + ibsd_wr_allchans(ppd, 4, 0, BMASK(10, 10)); + /* Disable RX Offset Calibration */ + ibsd_wr_allchans(ppd, 12, 0, BMASK(4, 4)); + /* Select BB CDR */ + ibsd_wr_allchans(ppd, 2, (1 << 15), BMASK(15, 15)); + /* CDR Step Size */ + ibsd_wr_allchans(ppd, 5, 0, BMASK(9, 8)); + /* Enable phase Calibration */ + ibsd_wr_allchans(ppd, 12, (1 << 5), BMASK(5, 5)); + /* DFE Bandwidth [2:14-12] */ + ibsd_wr_allchans(ppd, 2, (4 << 12), BMASK(14, 12)); + /* DFE Config (4 taps only) */ + ibsd_wr_allchans(ppd, 16, 0, BMASK(1, 0)); + /* Gain Loop Bandwidth */ + if (!ppd->dd->cspec->r1) { + ibsd_wr_allchans(ppd, 12, 1 << 12, BMASK(12, 12)); + ibsd_wr_allchans(ppd, 12, 2 << 8, BMASK(11, 8)); + } else { + ibsd_wr_allchans(ppd, 19, (3 << 11), BMASK(13, 11)); + } + /* Baseline Wander Correction Gain [13:4-0] (leave as default) */ + /* 
Baseline Wander Correction Gain [3:7-5] (leave as default) */ + /* Data Rate Select [5:7-6] (leave as default) */ + /* RX Parralel Word Width [3:10-8] (leave as default) */ + + /* RX REST */ + /* Single- or Multi-channel reset */ + /* RX Analog reset */ + /* RX Digital reset */ + ibsd_wr_allchans(ppd, 0, 0, BMASK(15, 13)); + msleep(20); + /* RX Analog reset */ + ibsd_wr_allchans(ppd, 0, (1 << 14), BMASK(14, 14)); + msleep(20); + /* RX Digital reset */ + ibsd_wr_allchans(ppd, 0, (1 << 13), BMASK(13, 13)); + msleep(20); + + /* setup LoS params; these are subsystem, so chan == 5 */ + /* LoS filter threshold_count on, ch 0-3, set to 8 */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 5, 8 << 11, BMASK(14, 11)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 7, 8 << 4, BMASK(7, 4)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 8, 8 << 11, BMASK(14, 11)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 8 << 4, BMASK(7, 4)); + + /* LoS filter threshold_count off, ch 0-3, set to 4 */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 6, 4 << 0, BMASK(3, 0)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 7, 4 << 8, BMASK(11, 8)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 9, 4 << 0, BMASK(3, 0)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 4 << 8, BMASK(11, 8)); + + /* LoS filter select enabled */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 9, 1 << 15, 1 << 15); + + /* LoS target data: SDR=4, DDR=2, QDR=1 */ + ibsd_wr_allchans(ppd, 14, (1 << 3), BMASK(5, 3)); /* QDR */ + ibsd_wr_allchans(ppd, 20, (2 << 10), BMASK(12, 10)); /* DDR */ + ibsd_wr_allchans(ppd, 20, (4 << 13), BMASK(15, 13)); /* SDR */ + + /* Turn on LOS on initial SERDES init */ + serdes_7322_los_enable(ppd, 1); + /* FLoop LOS gate: PPM filter enabled */ + ibsd_wr_allchans(ppd, 38, 0 << 10, 1 << 10); + + /* RX LATCH CALIBRATION */ + /* Enable Eyefinder Phase Calibration latch */ + ibsd_wr_allchans(ppd, 15, 1, BMASK(0, 0)); + /* Enable RX Offset Calibration latch */ + ibsd_wr_allchans(ppd, 12, (1 << 4), BMASK(4, 4)); + msleep(20); + /* Start 
Calibration */ + ibsd_wr_allchans(ppd, 4, (1 << 10), BMASK(10, 10)); + tstart = get_jiffies_64(); + while (chan_done && + !time_after64(tstart, tstart + msecs_to_jiffies(500))) { + msleep(20); + for (chan = 0; chan < SERDES_CHANS; ++chan) { + rxcaldone = ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), + (chan + (chan >> 1)), + 25, 0, 0); + if ((~rxcaldone & (u32)BMASK(9, 9)) == 0 && + (~chan_done & (1 << chan)) == 0) + chan_done &= ~(1 << chan); + } + } + if (chan_done) { + printk(KERN_INFO QIB_DRV_NAME + " Serdes %d calibration not done after .5 sec: 0x%x\n", + IBSD(ppd->hw_pidx), chan_done); + } else { + for (chan = 0; chan < SERDES_CHANS; ++chan) { + rxcaldone = ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), + (chan + (chan >> 1)), + 25, 0, 0); + if ((~rxcaldone & (u32)BMASK(10, 10)) == 0) + printk(KERN_INFO QIB_DRV_NAME + " Serdes %d chan %d calibration " + "failed\n", IBSD(ppd->hw_pidx), chan); + } + } + + /* Turn off Calibration */ + ibsd_wr_allchans(ppd, 4, 0, BMASK(10, 10)); + msleep(20); + + /* BRING RX UP */ + /* Set LE2 value (May be overridden in qsfp_7322_event) */ + le_val = IS_QME(ppd->dd) ? LE2_QME : LE2_DEFAULT; + ibsd_wr_allchans(ppd, 13, (le_val << 7), BMASK(9, 7)); + /* Set LE2 Loop bandwidth */ + ibsd_wr_allchans(ppd, 3, (7 << 5), BMASK(7, 5)); + /* Enable LE2 */ + ibsd_wr_allchans(ppd, 13, (1 << 6), BMASK(6, 6)); + msleep(20); + /* Enable H0 only */ + ibsd_wr_allchans(ppd, 1, 1, BMASK(9, 1)); + /* gain hi stop 32 (22) (6:1) lo stop 7 (10:7) target 22 (13) (15:11) */ + le_val = (ppd->dd->cspec->r1 || IS_QME(ppd->dd)) ? 
0xb6c0 : 0x6bac; + ibsd_wr_allchans(ppd, 21, le_val, 0xfffe); + /* Enable VGA */ + ibsd_wr_allchans(ppd, 5, 0, BMASK(0, 0)); + msleep(20); + /* Set Frequency Loop Bandwidth */ + ibsd_wr_allchans(ppd, 2, (7 << 5), BMASK(8, 5)); + /* Enable Frequency Loop */ + ibsd_wr_allchans(ppd, 2, (1 << 4), BMASK(4, 4)); + /* Set Timing Loop Bandwidth */ + ibsd_wr_allchans(ppd, 2, 0, BMASK(11, 9)); + /* Enable Timing Loop */ + ibsd_wr_allchans(ppd, 2, (1 << 3), BMASK(3, 3)); + msleep(50); + /* Enable DFE + * Set receive adaptation mode. SDR and DDR adaptation are + * always on, and QDR is initially enabled; later disabled. + */ + qib_write_kreg_port(ppd, krp_static_adapt_dis(0), 0ULL); + qib_write_kreg_port(ppd, krp_static_adapt_dis(1), 0ULL); + qib_write_kreg_port(ppd, krp_static_adapt_dis(2), + ppd->dd->cspec->r1 ? + QDR_STATIC_ADAPT_DOWN_R1 : QDR_STATIC_ADAPT_DOWN); + ppd->cpspec->qdr_dfe_on = 1; + /* Disable LE1 */ + ibsd_wr_allchans(ppd, 13, (0 << 5), (1 << 5)); + /* Disable auto adapt for LE1 */ + ibsd_wr_allchans(ppd, 1, (0 << 15), BMASK(15, 15)); + msleep(20); + /* Enable AFE Offset Cancel */ + ibsd_wr_allchans(ppd, 12, (1 << 12), BMASK(12, 12)); + /* Enable Baseline Wander Correction */ + ibsd_wr_allchans(ppd, 12, (1 << 13), BMASK(13, 13)); + /* Termination: rxtermctrl_r2d addr 11 bits [12:11] = 1 */ + ibsd_wr_allchans(ppd, 11, (1 << 11), BMASK(12, 11)); + /* VGA output common mode */ + ibsd_wr_allchans(ppd, 12, (3 << 2), BMASK(3, 2)); + + return 0; +} + /* start adjust QMH serdes parameters */ static void set_man_code(struct qib_pportdata *ppd, int chan, int code) From f73df408b299e4bc5ed66cc50058585af64a8f9e Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Jan 2011 17:42:21 -0800 Subject: [PATCH 31/42] IB/qib: Reset packet list after freeing Reset the list pointers after freeing the SDMA packet list. This is done to avoid any potential double-free cases. 
Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_user_sdma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c index 4c19e06b5e85..66208bcd7c13 100644 --- a/drivers/infiniband/hw/qib/qib_user_sdma.c +++ b/drivers/infiniband/hw/qib/qib_user_sdma.c @@ -382,6 +382,7 @@ static void qib_user_sdma_free_pkt_list(struct device *dev, kmem_cache_free(pq->pkt_slab, pkt); } + INIT_LIST_HEAD(list); } /* From e706203c7c1cff8c27f9ce6d58911014a6bd826c Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Jan 2011 17:42:21 -0800 Subject: [PATCH 32/42] IB/qib: Add a few new SERDES tunings Add new SERDES tuning to aid manufacturing. Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_iba7322.c | 29 ++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index d23297a307aa..a9c8c7235fcd 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -562,6 +562,7 @@ static void write_tx_serdes_param(struct qib_pportdata *, struct txdds_ent *); #define TXDDS_TABLE_SZ 16 /* number of entries per speed in onchip table */ #define TXDDS_EXTRA_SZ 13 /* number of extra tx settings entries */ +#define TXDDS_MFG_SZ 2 /* number of mfg tx settings entries */ #define SERDES_CHANS 4 /* yes, it's obvious, but one less magic number */ #define H1_FORCE_VAL 8 @@ -5623,6 +5624,7 @@ static void set_no_qsfp_atten(struct qib_devdata *dd, int change) u32 pidx, unit, port, deflt, h1; unsigned long val; int any = 0, seth1; + int txdds_size; str = txselect_list; @@ -5631,6 +5633,10 @@ static void set_no_qsfp_atten(struct qib_devdata *dd, int change) for (pidx = 0; pidx < dd->num_pports; ++pidx) dd->pport[pidx].cpspec->no_eep = deflt; + txdds_size = TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ; 
+ if (IS_QME(dd) || IS_QMH(dd)) + txdds_size += TXDDS_MFG_SZ; + while (*nxt && nxt[1]) { str = ++nxt; unit = simple_strtoul(str, &nxt, 0); @@ -5653,7 +5659,7 @@ static void set_no_qsfp_atten(struct qib_devdata *dd, int change) ; continue; } - if (val >= TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ) + if (val >= txdds_size) continue; seth1 = 0; h1 = 0; /* gcc thinks it might be used uninitted */ @@ -5705,10 +5711,11 @@ static int setup_txselect(const char *str, struct kernel_param *kp) return -ENOSPC; } val = simple_strtoul(str, &n, 0); - if (n == str || val >= (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ)) { + if (n == str || val >= (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + + TXDDS_MFG_SZ)) { printk(KERN_INFO QIB_DRV_NAME "txselect_values must start with a number < %d\n", - TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ); + TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + TXDDS_MFG_SZ); return -EINVAL; } strcpy(txselect_list, str); @@ -7039,6 +7046,12 @@ static const struct txdds_ent txdds_extra_qdr[TXDDS_EXTRA_SZ] = { { 0, 1, 0, 12 }, /* QMH7342 backplane settings */ }; +static const struct txdds_ent txdds_extra_mfg[TXDDS_MFG_SZ] = { + /* amp, pre, main, post */ + { 0, 0, 0, 0 }, /* QME7342 mfg settings */ + { 0, 0, 0, 6 }, /* QME7342 P2 mfg settings */ +}; + static const struct txdds_ent *get_atten_table(const struct txdds_ent *txdds, unsigned atten) { @@ -7112,6 +7125,16 @@ static void find_best_ent(struct qib_pportdata *ppd, *sdr_dds = &txdds_extra_sdr[idx]; *ddr_dds = &txdds_extra_ddr[idx]; *qdr_dds = &txdds_extra_qdr[idx]; + } else if ((IS_QME(ppd->dd) || IS_QMH(ppd->dd)) && + ppd->cpspec->no_eep < (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + + TXDDS_MFG_SZ)) { + idx = ppd->cpspec->no_eep - (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ); + printk(KERN_INFO QIB_DRV_NAME + " IB%u:%u use idx %u into txdds_mfg\n", + ppd->dd->unit, ppd->port, idx); + *sdr_dds = &txdds_extra_mfg[idx]; + *ddr_dds = &txdds_extra_mfg[idx]; + *qdr_dds = &txdds_extra_mfg[idx]; } else { /* this shouldn't happen, it's range checked */ *sdr_dds = txdds_sdr + 
qib_long_atten; From aa7374ac19ca08715693b0a2d9f88f479af3ea7c Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Jan 2011 17:42:21 -0800 Subject: [PATCH 33/42] IB/qib: Avoid duplicate writes to the rcv head register Avoid duplicate writes to the head register as this can lead to lost interrupts if the context goes full before the second write is done. Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_driver.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/qib/qib_driver.c b/drivers/infiniband/hw/qib/qib_driver.c index 9cd193603fb1..3ed0d5a9a566 100644 --- a/drivers/infiniband/hw/qib/qib_driver.c +++ b/drivers/infiniband/hw/qib/qib_driver.c @@ -71,6 +71,11 @@ MODULE_DESCRIPTION("QLogic IB driver"); */ #define QIB_PIO_MAXIBHDR 128 +/* + * QIB_MAX_PKT_RCV is the max # if packets processed per receive interrupt. + */ +#define QIB_MAX_PKT_RECV 64 + struct qlogic_ib_stats qib_stats; const char *qib_get_unit_name(int unit) @@ -335,7 +340,7 @@ u32 qib_kreceive(struct qib_ctxtdata *rcd, u32 *llic, u32 *npkts) smp_rmb(); /* prevent speculative reads of dma'ed hdrq */ } - for (last = 0, i = 1; !last && i <= 64; i += !last) { + for (last = 0, i = 1; !last; i += !last) { hdr = dd->f_get_msgheader(dd, rhf_addr); eflags = qib_hdrget_err_flags(rhf_addr); etype = qib_hdrget_rcv_type(rhf_addr); @@ -384,6 +389,9 @@ move_along: l += rsize; if (l >= maxcnt) l = 0; + if (i == QIB_MAX_PKT_RECV) + last = 1; + rhf_addr = (__le32 *) rcd->rcvhdrq + l + dd->rhf_offset; if (dd->flags & QIB_NODMA_RTAIL) { u32 seq = qib_hdrget_seq(rhf_addr); From 19ede2e422496b2a064b9b22823c6afb66ff927b Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Jan 2011 17:42:21 -0800 Subject: [PATCH 34/42] IB/qib: Fix interrupt mitigation For SusieQ we need to write to the interrupt timer register before updating the header queue head with interrupt count. 
This is to ensure that the timer is enabled properly and a receive available interrupt is delivered. Otherwise this interrupt can be lost if the receiver header/eager queues are full before the timer is enabled. Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib.h | 2 +- drivers/infiniband/hw/qib/qib_driver.c | 4 ++-- drivers/infiniband/hw/qib/qib_iba6120.c | 2 +- drivers/infiniband/hw/qib/qib_iba7220.c | 2 +- drivers/infiniband/hw/qib/qib_iba7322.c | 10 +++++++--- 5 files changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index 64c9e7d02d4a..73225eee3cc6 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -766,7 +766,7 @@ struct qib_devdata { void (*f_sdma_hw_start_up)(struct qib_pportdata *); void (*f_sdma_init_early)(struct qib_pportdata *); void (*f_set_cntr_sample)(struct qib_pportdata *, u32, u32); - void (*f_update_usrhead)(struct qib_ctxtdata *, u64, u32, u32); + void (*f_update_usrhead)(struct qib_ctxtdata *, u64, u32, u32, u32); u32 (*f_hdrqempty)(struct qib_ctxtdata *); u64 (*f_portcntr)(struct qib_pportdata *, u32); u32 (*f_read_cntrs)(struct qib_devdata *, loff_t, char **, diff --git a/drivers/infiniband/hw/qib/qib_driver.c b/drivers/infiniband/hw/qib/qib_driver.c index 3ed0d5a9a566..816a6bdc0b1c 100644 --- a/drivers/infiniband/hw/qib/qib_driver.c +++ b/drivers/infiniband/hw/qib/qib_driver.c @@ -410,7 +410,7 @@ move_along: */ lval = l; if (!last && !(i & 0xf)) { - dd->f_update_usrhead(rcd, lval, updegr, etail); + dd->f_update_usrhead(rcd, lval, updegr, etail, i); updegr = 0; } } @@ -452,7 +452,7 @@ bail: * if no packets were processed. 
*/ lval = (u64)rcd->head | dd->rhdrhead_intr_off; - dd->f_update_usrhead(rcd, lval, updegr, etail); + dd->f_update_usrhead(rcd, lval, updegr, etail, i); return crcs; } diff --git a/drivers/infiniband/hw/qib/qib_iba6120.c b/drivers/infiniband/hw/qib/qib_iba6120.c index a5e29dbb9537..774dea897e9c 100644 --- a/drivers/infiniband/hw/qib/qib_iba6120.c +++ b/drivers/infiniband/hw/qib/qib_iba6120.c @@ -2074,7 +2074,7 @@ static void qib_6120_config_ctxts(struct qib_devdata *dd) } static void qib_update_6120_usrhead(struct qib_ctxtdata *rcd, u64 hd, - u32 updegr, u32 egrhd) + u32 updegr, u32 egrhd, u32 npkts) { qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt); if (updegr) diff --git a/drivers/infiniband/hw/qib/qib_iba7220.c b/drivers/infiniband/hw/qib/qib_iba7220.c index 6fd8d74e7392..df49e8e7cc2a 100644 --- a/drivers/infiniband/hw/qib/qib_iba7220.c +++ b/drivers/infiniband/hw/qib/qib_iba7220.c @@ -2703,7 +2703,7 @@ static int qib_7220_set_loopback(struct qib_pportdata *ppd, const char *what) } static void qib_update_7220_usrhead(struct qib_ctxtdata *rcd, u64 hd, - u32 updegr, u32 egrhd) + u32 updegr, u32 egrhd, u32 npkts) { qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt); if (updegr) diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index a9c8c7235fcd..9bc6d0835e30 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -2823,7 +2823,6 @@ static irqreturn_t qib_7322intr(int irq, void *data) ctxtrbits &= ~rmask; if (dd->rcd[i]) { qib_kreceive(dd->rcd[i], NULL, &npkts); - adjust_rcv_timeout(dd->rcd[i], npkts); } } rmask <<= 1; @@ -2873,7 +2872,6 @@ static irqreturn_t qib_7322pintr(int irq, void *data) (1ULL << QIB_I_RCVURG_LSB)) << rcd->ctxt); qib_kreceive(rcd, NULL, &npkts); - adjust_rcv_timeout(rcd, npkts); return IRQ_HANDLED; } @@ -4047,8 +4045,14 @@ static int qib_7322_set_ib_table(struct qib_pportdata *ppd, int which, void *t) } static void 
qib_update_7322_usrhead(struct qib_ctxtdata *rcd, u64 hd, - u32 updegr, u32 egrhd) + u32 updegr, u32 egrhd, u32 npkts) { + /* + * Need to write timeout register before updating rcvhdrhead to ensure + * that the timer is enabled on reception of a packet. + */ + if (hd >> IBA7322_HDRHEAD_PKTINT_SHIFT) + adjust_rcv_timeout(rcd, npkts); qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt); qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt); if (updegr) From 2528ea60f94ef9e1e1cd82066d55f62a1d19fde1 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Jan 2011 17:42:21 -0800 Subject: [PATCH 35/42] IB/qib: Change receive queue/QPN selection The basic idea is that on SusieQ, the difficult part of mapping QPN to context is handled by the mapping registers so the generic QPN allocation doesn't need to worry about chip specifics. For Monty and Linda, there is no mapping table so the qpt->mask (same as dd->qpn_mask), is used to see if the QPN to context falls within [zero..dd->n_krcv_queues). 
Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_iba7220.c | 2 +- drivers/infiniband/hw/qib/qib_iba7322.c | 8 ++------ drivers/infiniband/hw/qib/qib_qp.c | 26 ++++++++++--------------- drivers/infiniband/hw/qib/qib_verbs.h | 10 ++-------- 4 files changed, 15 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_iba7220.c b/drivers/infiniband/hw/qib/qib_iba7220.c index df49e8e7cc2a..127a0d5069f0 100644 --- a/drivers/infiniband/hw/qib/qib_iba7220.c +++ b/drivers/infiniband/hw/qib/qib_iba7220.c @@ -2297,7 +2297,7 @@ static void qib_7220_config_ctxts(struct qib_devdata *dd) nchipctxts = qib_read_kreg32(dd, kr_portcnt); dd->cspec->numctxts = nchipctxts; if (qib_n_krcv_queues > 1) { - dd->qpn_mask = 0x3f; + dd->qpn_mask = 0x3e; dd->first_user_ctxt = qib_n_krcv_queues * dd->num_pports; if (dd->first_user_ctxt > nchipctxts) dd->first_user_ctxt = nchipctxts; diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 9bc6d0835e30..d3b493824cdc 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -3515,11 +3515,6 @@ static void qib_7322_config_ctxts(struct qib_devdata *dd) nchipctxts = qib_read_kreg32(dd, kr_contextcnt); dd->cspec->numctxts = nchipctxts; if (qib_n_krcv_queues > 1 && dd->num_pports) { - /* - * Set the mask for which bits from the QPN are used - * to select a context number. 
- */ - dd->qpn_mask = 0x3f; dd->first_user_ctxt = NUM_IB_PORTS + (qib_n_krcv_queues - 1) * dd->num_pports; if (dd->first_user_ctxt > nchipctxts) @@ -5865,7 +5860,8 @@ static void write_7322_initregs(struct qib_devdata *dd) unsigned n, regno; unsigned long flags; - if (!dd->qpn_mask || !dd->pport[pidx].link_speed_supported) + if (dd->n_krcv_queues < 2 || + !dd->pport[pidx].link_speed_supported) continue; ppd = &dd->pport[pidx]; diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index 32dacd444158..eaab008466ca 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -48,13 +48,12 @@ static inline unsigned mk_qpn(struct qib_qpn_table *qpt, static inline unsigned find_next_offset(struct qib_qpn_table *qpt, struct qpn_map *map, unsigned off, - unsigned r) + unsigned n) { if (qpt->mask) { off++; - if ((off & qpt->mask) >> 1 != r) - off = ((off & qpt->mask) ? - (off | qpt->mask) + 1 : off) | (r << 1); + if (((off & qpt->mask) >> 1) >= n) + off = (off | qpt->mask) + 2; } else off = find_next_zero_bit(map->page, BITS_PER_PAGE, off); return off; @@ -123,7 +122,6 @@ static int alloc_qpn(struct qib_devdata *dd, struct qib_qpn_table *qpt, u32 i, offset, max_scan, qpn; struct qpn_map *map; u32 ret; - int r; if (type == IB_QPT_SMI || type == IB_QPT_GSI) { unsigned n; @@ -139,15 +137,11 @@ static int alloc_qpn(struct qib_devdata *dd, struct qib_qpn_table *qpt, goto bail; } - r = smp_processor_id(); - if (r >= dd->n_krcv_queues) - r %= dd->n_krcv_queues; qpn = qpt->last + 1; if (qpn >= QPN_MAX) qpn = 2; - if (qpt->mask && ((qpn & qpt->mask) >> 1) != r) - qpn = ((qpn & qpt->mask) ? 
(qpn | qpt->mask) + 1 : qpn) | - (r << 1); + if (qpt->mask && ((qpn & qpt->mask) >> 1) >= dd->n_krcv_queues) + qpn = (qpn | qpt->mask) + 2; offset = qpn & BITS_PER_PAGE_MASK; map = &qpt->map[qpn / BITS_PER_PAGE]; max_scan = qpt->nmaps - !offset; @@ -163,7 +157,8 @@ static int alloc_qpn(struct qib_devdata *dd, struct qib_qpn_table *qpt, ret = qpn; goto bail; } - offset = find_next_offset(qpt, map, offset, r); + offset = find_next_offset(qpt, map, offset, + dd->n_krcv_queues); qpn = mk_qpn(qpt, map, offset); /* * This test differs from alloc_pidmap(). @@ -183,13 +178,13 @@ static int alloc_qpn(struct qib_devdata *dd, struct qib_qpn_table *qpt, if (qpt->nmaps == QPNMAP_ENTRIES) break; map = &qpt->map[qpt->nmaps++]; - offset = qpt->mask ? (r << 1) : 0; + offset = 0; } else if (map < &qpt->map[qpt->nmaps]) { ++map; - offset = qpt->mask ? (r << 1) : 0; + offset = 0; } else { map = &qpt->map[0]; - offset = qpt->mask ? (r << 1) : 2; + offset = 2; } qpn = mk_qpn(qpt, map, offset); } @@ -1065,7 +1060,6 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd, } qp->ibqp.qp_num = err; qp->port_num = init_attr->port_num; - qp->processor_id = smp_processor_id(); qib_reset_qp(qp, init_attr->qp_type); break; diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index bd57c1273225..a08ceab510e1 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -435,7 +435,6 @@ struct qib_qp { spinlock_t r_lock; /* used for APM */ spinlock_t s_lock; atomic_t s_dma_busy; - unsigned processor_id; /* Processor ID QP is bound to */ u32 s_flags; u32 s_cur_size; /* size of send packet in bytes */ u32 s_len; /* total length of s_sge */ @@ -813,13 +812,8 @@ extern struct workqueue_struct *qib_cq_wq; */ static inline void qib_schedule_send(struct qib_qp *qp) { - if (qib_send_ok(qp)) { - if (qp->processor_id == smp_processor_id()) - queue_work(qib_wq, &qp->s_work); - else - queue_work_on(qp->processor_id, - qib_wq, &qp->s_work); - 
} + if (qib_send_ok(qp)) + queue_work(qib_wq, &qp->s_work); } static inline int qib_pkey_ok(u16 pkey1, u16 pkey2) From 057ae62faceccb50b0c2387af60dbbbcd40aab84 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Jan 2011 17:42:21 -0800 Subject: [PATCH 36/42] IB/qib: Add fix missing from earlier patch The upstream code was missing part of a receive/error race fix from the internal tree. Add the missing part, which makes future merges possible. Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_ud.c | 55 +++++++++++++----------------- 1 file changed, 24 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index a4b945d9a303..4a51fd1e9cb7 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -445,13 +445,14 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, qkey = be32_to_cpu(ohdr->u.ud.deth[0]); src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & QIB_QPN_MASK; - /* Get the number of bytes the message was padded by. */ + /* + * Get the number of bytes the message was padded by + * and drop incomplete packets. + */ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; - if (unlikely(tlen < (hdrsize + pad + 4))) { - /* Drop incomplete packets. 
*/ - ibp->n_pkt_drops++; - goto bail; - } + if (unlikely(tlen < (hdrsize + pad + 4))) + goto drop; + tlen -= hdrsize + pad + 4; /* @@ -460,10 +461,8 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, */ if (qp->ibqp.qp_num) { if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE || - hdr->lrh[3] == IB_LID_PERMISSIVE)) { - ibp->n_pkt_drops++; - goto bail; - } + hdr->lrh[3] == IB_LID_PERMISSIVE)) + goto drop; if (qp->ibqp.qp_num > 1) { u16 pkey1, pkey2; @@ -476,7 +475,7 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, 0xF, src_qp, qp->ibqp.qp_num, hdr->lrh[3], hdr->lrh[1]); - goto bail; + return; } } if (unlikely(qkey != qp->qkey)) { @@ -484,30 +483,24 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF, src_qp, qp->ibqp.qp_num, hdr->lrh[3], hdr->lrh[1]); - goto bail; + return; } /* Drop invalid MAD packets (see 13.5.3.1). */ if (unlikely(qp->ibqp.qp_num == 1 && (tlen != 256 || - (be16_to_cpu(hdr->lrh[0]) >> 12) == 15))) { - ibp->n_pkt_drops++; - goto bail; - } + (be16_to_cpu(hdr->lrh[0]) >> 12) == 15))) + goto drop; } else { struct ib_smp *smp; /* Drop invalid MAD packets (see 13.5.3.1). 
*/ - if (tlen != 256 || (be16_to_cpu(hdr->lrh[0]) >> 12) != 15) { - ibp->n_pkt_drops++; - goto bail; - } + if (tlen != 256 || (be16_to_cpu(hdr->lrh[0]) >> 12) != 15) + goto drop; smp = (struct ib_smp *) data; if ((hdr->lrh[1] == IB_LID_PERMISSIVE || hdr->lrh[3] == IB_LID_PERMISSIVE) && - smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { - ibp->n_pkt_drops++; - goto bail; - } + smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + goto drop; } /* @@ -523,10 +516,8 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, } else if (opcode == IB_OPCODE_UD_SEND_ONLY) { wc.ex.imm_data = 0; wc.wc_flags = 0; - } else { - ibp->n_pkt_drops++; - goto bail; - } + } else + goto drop; /* * A GRH is expected to preceed the data even if not @@ -556,8 +547,7 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, /* Silently drop packets which are too big. */ if (unlikely(wc.byte_len > qp->r_len)) { qp->r_flags |= QIB_R_REUSE_SGE; - ibp->n_pkt_drops++; - return; + goto drop; } if (has_grh) { qib_copy_sge(&qp->r_sge, &hdr->u.l.grh, @@ -594,5 +584,8 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, (ohdr->bth[0] & cpu_to_be32(IB_BTH_SOLICITED)) != 0); -bail:; + return; + +drop: + ibp->n_pkt_drops++; } From 7c3edd3ff3098093e594dbcbc9dbeeae09b1b4a0 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Jan 2011 17:42:22 -0800 Subject: [PATCH 37/42] IB/qib: Change QPN increment Changing from +1 to +2 allows for better QP distribution across receive contexts. 
Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_qp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index eaab008466ca..e16751f8639e 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -137,7 +137,7 @@ static int alloc_qpn(struct qib_devdata *dd, struct qib_qpn_table *qpt, goto bail; } - qpn = qpt->last + 1; + qpn = qpt->last + 2; if (qpn >= QPN_MAX) qpn = 2; if (qpt->mask && ((qpn & qpt->mask) >> 1) >= dd->n_krcv_queues) From 2a600f14d25fda341b5633c75cc50a7574fc1007 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Jan 2011 17:42:22 -0800 Subject: [PATCH 38/42] IB/qib: RDMA lkey/rkey validation is inefficient for large MRs The current code loops during rkey/lkey validation to isolate the MR for the RDMA, which is expensive when the current operation is inside a very large memory region. This fix optimizes rkey/lkey validation routines for user memory regions and fast memory regions. The MR entry can be isolated by shifts/mods instead of looping. The existing loop is preserved for phys memory regions for now. 
Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_keys.c | 74 +++++++++++++++++++-------- drivers/infiniband/hw/qib/qib_mr.c | 8 +-- drivers/infiniband/hw/qib/qib_verbs.h | 1 + 3 files changed, 60 insertions(+), 23 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_keys.c b/drivers/infiniband/hw/qib/qib_keys.c index 4b80eb153d57..756d16098e73 100644 --- a/drivers/infiniband/hw/qib/qib_keys.c +++ b/drivers/infiniband/hw/qib/qib_keys.c @@ -158,31 +158,47 @@ int qib_lkey_ok(struct qib_lkey_table *rkt, struct qib_pd *pd, isge->sge_length = sge->length; isge->m = 0; isge->n = 0; + spin_unlock_irqrestore(&rkt->lock, flags); goto ok; } mr = rkt->table[(sge->lkey >> (32 - ib_qib_lkey_table_size))]; if (unlikely(mr == NULL || mr->lkey != sge->lkey || mr->pd != &pd->ibpd)) goto bail; + atomic_inc(&mr->refcount); + spin_unlock_irqrestore(&rkt->lock, flags); off = sge->addr - mr->user_base; if (unlikely(sge->addr < mr->user_base || off + sge->length > mr->length || (mr->access_flags & acc) != acc)) - goto bail; + return ret; off += mr->offset; - m = 0; - n = 0; - while (off >= mr->map[m]->segs[n].length) { - off -= mr->map[m]->segs[n].length; - n++; - if (n >= QIB_SEGSZ) { - m++; - n = 0; + if (mr->page_shift) { + /* + page sizes are uniform power of 2 so no loop is necessary + entries_spanned_by_off is the number of times the loop below + would have executed. 
+ */ + size_t entries_spanned_by_off; + + entries_spanned_by_off = off >> mr->page_shift; + off -= (entries_spanned_by_off << mr->page_shift); + m = entries_spanned_by_off/QIB_SEGSZ; + n = entries_spanned_by_off%QIB_SEGSZ; + } else { + m = 0; + n = 0; + while (off >= mr->map[m]->segs[n].length) { + off -= mr->map[m]->segs[n].length; + n++; + if (n >= QIB_SEGSZ) { + m++; + n = 0; + } } } - atomic_inc(&mr->refcount); isge->mr = mr; isge->vaddr = mr->map[m]->segs[n].vaddr + off; isge->length = mr->map[m]->segs[n].length - off; @@ -191,6 +207,7 @@ int qib_lkey_ok(struct qib_lkey_table *rkt, struct qib_pd *pd, isge->n = n; ok: ret = 1; + return ret; bail: spin_unlock_irqrestore(&rkt->lock, flags); return ret; @@ -237,30 +254,46 @@ int qib_rkey_ok(struct qib_qp *qp, struct qib_sge *sge, sge->sge_length = len; sge->m = 0; sge->n = 0; + spin_unlock_irqrestore(&rkt->lock, flags); goto ok; } mr = rkt->table[(rkey >> (32 - ib_qib_lkey_table_size))]; if (unlikely(mr == NULL || mr->lkey != rkey || qp->ibqp.pd != mr->pd)) goto bail; + atomic_inc(&mr->refcount); + spin_unlock_irqrestore(&rkt->lock, flags); off = vaddr - mr->iova; if (unlikely(vaddr < mr->iova || off + len > mr->length || (mr->access_flags & acc) == 0)) - goto bail; + return ret; off += mr->offset; - m = 0; - n = 0; - while (off >= mr->map[m]->segs[n].length) { - off -= mr->map[m]->segs[n].length; - n++; - if (n >= QIB_SEGSZ) { - m++; - n = 0; + if (mr->page_shift) { + /* + page sizes are uniform power of 2 so no loop is necessary + entries_spanned_by_off is the number of times the loop below + would have executed. 
+ */ + size_t entries_spanned_by_off; + + entries_spanned_by_off = off >> mr->page_shift; + off -= (entries_spanned_by_off << mr->page_shift); + m = entries_spanned_by_off/QIB_SEGSZ; + n = entries_spanned_by_off%QIB_SEGSZ; + } else { + m = 0; + n = 0; + while (off >= mr->map[m]->segs[n].length) { + off -= mr->map[m]->segs[n].length; + n++; + if (n >= QIB_SEGSZ) { + m++; + n = 0; + } } } - atomic_inc(&mr->refcount); sge->mr = mr; sge->vaddr = mr->map[m]->segs[n].vaddr + off; sge->length = mr->map[m]->segs[n].length - off; @@ -269,6 +302,7 @@ int qib_rkey_ok(struct qib_qp *qp, struct qib_sge *sge, sge->n = n; ok: ret = 1; + return ret; bail: spin_unlock_irqrestore(&rkt->lock, flags); return ret; diff --git a/drivers/infiniband/hw/qib/qib_mr.c b/drivers/infiniband/hw/qib/qib_mr.c index 5f95f0f6385d..08944e2ee334 100644 --- a/drivers/infiniband/hw/qib/qib_mr.c +++ b/drivers/infiniband/hw/qib/qib_mr.c @@ -39,7 +39,6 @@ /* Fast memory region */ struct qib_fmr { struct ib_fmr ibfmr; - u8 page_shift; struct qib_mregion mr; /* must be last */ }; @@ -107,6 +106,7 @@ static struct qib_mr *alloc_mr(int count, struct qib_lkey_table *lk_table) goto bail; } mr->mr.mapsz = m; + mr->mr.page_shift = 0; mr->mr.max_segs = count; /* @@ -231,6 +231,8 @@ struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mr->mr.access_flags = mr_access_flags; mr->umem = umem; + if (is_power_of_2(umem->page_size)) + mr->mr.page_shift = ilog2(umem->page_size); m = 0; n = 0; list_for_each_entry(chunk, &umem->chunk_list, list) { @@ -390,7 +392,7 @@ struct ib_fmr *qib_alloc_fmr(struct ib_pd *pd, int mr_access_flags, fmr->mr.offset = 0; fmr->mr.access_flags = mr_access_flags; fmr->mr.max_segs = fmr_attr->max_pages; - fmr->page_shift = fmr_attr->page_shift; + fmr->mr.page_shift = fmr_attr->page_shift; atomic_set(&fmr->mr.refcount, 0); ret = &fmr->ibfmr; @@ -437,7 +439,7 @@ int qib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, spin_lock_irqsave(&rkt->lock, flags); fmr->mr.user_base = 
iova; fmr->mr.iova = iova; - ps = 1 << fmr->page_shift; + ps = 1 << fmr->mr.page_shift; fmr->mr.length = list_len * ps; m = 0; n = 0; diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index a08ceab510e1..63b22a9a7feb 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -301,6 +301,7 @@ struct qib_mregion { int access_flags; u32 max_segs; /* number of qib_segs in all the arrays */ u32 mapsz; /* size of the map array */ + u8 page_shift; /* 0 - non unform/non powerof2 sizes */ atomic_t refcount; struct qib_segarray *map[0]; /* the segments */ }; From 994bcd28a36af1413381dfe0aac065e2cbc2af40 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Jan 2011 17:42:22 -0800 Subject: [PATCH 39/42] IB/qib: Issue pre-emptive NAKs on eager buffer overflow Under congestion resulting in eager buffer overflow attempt to send pre-emptive NAKs if header queue entries with TID errors are generated and a valid header is present. This prevents long timeouts and flow restarts if a trailing set of packets are dropped due to eager overflows. Pre-emptive NAKs are currently only supported for RDMA writes. Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_driver.c | 141 ++++++++++++++++++++++++- 1 file changed, 137 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_driver.c b/drivers/infiniband/hw/qib/qib_driver.c index 816a6bdc0b1c..23e584f4c36c 100644 --- a/drivers/infiniband/hw/qib/qib_driver.c +++ b/drivers/infiniband/hw/qib/qib_driver.c @@ -289,14 +289,147 @@ static inline void *qib_get_egrbuf(const struct qib_ctxtdata *rcd, u32 etail) * Returns 1 if error was a CRC, else 0. * Needed for some chip's synthesized error counters. 
*/ -static u32 qib_rcv_hdrerr(struct qib_pportdata *ppd, u32 ctxt, - u32 eflags, u32 l, u32 etail, __le32 *rhf_addr, - struct qib_message_header *hdr) +static u32 qib_rcv_hdrerr(struct qib_ctxtdata *rcd, struct qib_pportdata *ppd, + u32 ctxt, u32 eflags, u32 l, u32 etail, + __le32 *rhf_addr, struct qib_message_header *rhdr) { u32 ret = 0; if (eflags & (QLOGIC_IB_RHF_H_ICRCERR | QLOGIC_IB_RHF_H_VCRCERR)) ret = 1; + else if (eflags == QLOGIC_IB_RHF_H_TIDERR) { + /* For TIDERR and RC QPs premptively schedule a NAK */ + struct qib_ib_header *hdr = (struct qib_ib_header *) rhdr; + struct qib_other_headers *ohdr = NULL; + struct qib_ibport *ibp = &ppd->ibport_data; + struct qib_qp *qp = NULL; + u32 tlen = qib_hdrget_length_in_bytes(rhf_addr); + u16 lid = be16_to_cpu(hdr->lrh[1]); + int lnh = be16_to_cpu(hdr->lrh[0]) & 3; + u32 qp_num; + u32 opcode; + u32 psn; + int diff; + unsigned long flags; + + /* Sanity check packet */ + if (tlen < 24) + goto drop; + + if (lid < QIB_MULTICAST_LID_BASE) { + lid &= ~((1 << ppd->lmc) - 1); + if (unlikely(lid != ppd->lid)) + goto drop; + } + + /* Check for GRH */ + if (lnh == QIB_LRH_BTH) + ohdr = &hdr->u.oth; + else if (lnh == QIB_LRH_GRH) { + u32 vtf; + + ohdr = &hdr->u.l.oth; + if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR) + goto drop; + vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow); + if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) + goto drop; + } else + goto drop; + + /* Get opcode and PSN from packet */ + opcode = be32_to_cpu(ohdr->bth[0]); + opcode >>= 24; + psn = be32_to_cpu(ohdr->bth[2]); + + /* Get the destination QP number. */ + qp_num = be32_to_cpu(ohdr->bth[1]) & QIB_QPN_MASK; + if (qp_num != QIB_MULTICAST_QPN) { + int ruc_res; + qp = qib_lookup_qpn(ibp, qp_num); + if (!qp) + goto drop; + + /* + * Handle only RC QPs - for other QP types drop error + * packet. + */ + spin_lock(&qp->r_lock); + + /* Check for valid receive state. 
*/ + if (!(ib_qib_state_ops[qp->state] & + QIB_PROCESS_RECV_OK)) { + ibp->n_pkt_drops++; + goto unlock; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + spin_lock_irqsave(&qp->s_lock, flags); + ruc_res = + qib_ruc_check_hdr( + ibp, hdr, + lnh == QIB_LRH_GRH, + qp, + be32_to_cpu(ohdr->bth[0])); + if (ruc_res) { + spin_unlock_irqrestore(&qp->s_lock, + flags); + goto unlock; + } + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Only deal with RDMA Writes for now */ + if (opcode < + IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) { + diff = qib_cmp24(psn, qp->r_psn); + if (!qp->r_nak_state && diff >= 0) { + ibp->n_rc_seqnak++; + qp->r_nak_state = + IB_NAK_PSN_ERROR; + /* Use the expected PSN. */ + qp->r_ack_psn = qp->r_psn; + /* + * Wait to send the sequence + * NAK until all packets + * in the receive queue have + * been processed. + * Otherwise, we end up + * propagating congestion. + */ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= + QIB_R_RSP_NAK; + atomic_inc( + &qp->refcount); + list_add_tail( + &qp->rspwait, + &rcd->qp_wait_list); + } + } /* Out of sequence NAK */ + } /* QP Request NAKs */ + break; + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + case IB_QPT_UC: + default: + /* For now don't handle any other QP types */ + break; + } + +unlock: + spin_unlock(&qp->r_lock); + /* + * Notify qib_destroy_qp() if it is waiting + * for us to finish. + */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } /* Unicast QP */ + } /* Valid packet with TIDErr */ + +drop: return ret; } @@ -376,7 +509,7 @@ u32 qib_kreceive(struct qib_ctxtdata *rcd, u32 *llic, u32 *npkts) * packets; only qibhdrerr should be set. 
*/ if (unlikely(eflags)) - crcs += qib_rcv_hdrerr(ppd, rcd->ctxt, eflags, l, + crcs += qib_rcv_hdrerr(rcd, ppd, rcd->ctxt, eflags, l, etail, rhf_addr, hdr); else if (etype == RCVHQ_RCV_TYPE_NON_KD) { qib_ib_rcv(rcd, hdr, ebuf, tlen); From dd04e43d46ad7a4e625a9ff3b270dc0db9abe81d Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Jan 2011 17:42:22 -0800 Subject: [PATCH 40/42] IB/qib: Unnecessary delayed completions on RC connection Currently on receipt of a response message (ACKs, RDMA Response, Atomic Responses etc.) if the SDMA completion counter is not advanced the driver delays the completion of the WQE. In most cases this is overly pessimistic as the response (ACK) to a previously transmitted send implies that the send is complete. Ensure that SDMA queue is progressed appropriately before determining if a send has delayed completions. Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_rc.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index 955fb7157793..8245237b67ce 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -1407,6 +1407,7 @@ static void qib_rc_rcv_resp(struct qib_ibport *ibp, struct qib_ctxtdata *rcd) { struct qib_swqe *wqe; + struct qib_pportdata *ppd = ppd_from_ibp(ibp); enum ib_wc_status status; unsigned long flags; int diff; @@ -1414,6 +1415,29 @@ static void qib_rc_rcv_resp(struct qib_ibport *ibp, u32 aeth; u64 val; + if (opcode != OP(RDMA_READ_RESPONSE_MIDDLE)) { + /* + * If ACK'd PSN on SDMA busy list try to make progress to + * reclaim SDMA credits. + */ + if ((qib_cmp24(psn, qp->s_sending_psn) >= 0) && + (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) { + + /* + * If send tasklet not running attempt to progress + * SDMA queue. 
+ */ + if (!(qp->s_flags & QIB_S_BUSY)) { + /* Acquire SDMA Lock */ + spin_lock_irqsave(&ppd->sdma_lock, flags); + /* Invoke sdma make progress */ + qib_sdma_make_progress(ppd); + /* Release SDMA Lock */ + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + } + } + } + spin_lock_irqsave(&qp->s_lock, flags); /* Ignore invalid responses. */ From f2d255a0787119f7f4dc0e6093a0bd2700a49402 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Jan 2011 17:42:22 -0800 Subject: [PATCH 41/42] IB/qib: Improve SERDES tuning on QMH boards Improve the QMH SERDES tuning on initial driver load by having the driver go through a link state change. Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_iba7322.c | 26 ++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index d3b493824cdc..dbbb0e85afe4 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -623,6 +623,7 @@ struct qib_chippport_specific { u8 ibmalfusesnap; struct qib_qsfp_data qsfp_data; char epmsgbuf[192]; /* for port error interrupt msg buffer */ + u8 bounced; }; static struct { @@ -1742,6 +1743,8 @@ static void handle_serdes_issues(struct qib_pportdata *ppd, u64 ibcst) } } +static int qib_7322_set_ib_cfg(struct qib_pportdata *, int, u32); + /* * This is per-pport error handling. * will likely get it's own MSIx interrupt (one for each port, @@ -1878,7 +1881,23 @@ static noinline void handle_7322_p_errors(struct qib_pportdata *ppd) IB_PHYSPORTSTATE_DISABLED) qib_set_ib_7322_lstate(ppd, 0, QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); - else + else { + u32 lstate; + /* + * We need the current logical link state before + * lflags are set in handle_e_ibstatuschanged.
+ */ + lstate = qib_7322_iblink_state(ibcs); + + if (IS_QMH(dd) && !ppd->cpspec->bounced && + ltstate == IB_PHYSPORTSTATE_LINKUP && + (lstate >= IB_PORT_INIT && + lstate <= IB_PORT_ACTIVE)) { + ppd->cpspec->bounced = 1; + qib_7322_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE, + IB_LINKCMD_DOWN | IB_LINKINITCMD_POLL); + } + /* * Since going into a recovery state causes the link * state to go down and since recovery is transitory, @@ -1892,6 +1911,7 @@ static noinline void handle_7322_p_errors(struct qib_pportdata *ppd) ltstate != IB_PHYSPORTSTATE_RECOVERY_WAITRMT && ltstate != IB_PHYSPORTSTATE_RECOVERY_IDLE) qib_handle_e_ibstatuschanged(ppd, ibcs); + } } if (*msg && iserr) qib_dev_porterr(dd, ppd->port, "%s error\n", msg); @@ -7282,8 +7302,8 @@ static void ibsd_wr_allchans(struct qib_pportdata *ppd, int addr, unsigned data, static void serdes_7322_los_enable(struct qib_pportdata *ppd, int enable) { u64 data = qib_read_kreg_port(ppd, krp_serdesctrl); - printk(KERN_INFO QIB_DRV_NAME " Turning LOS %s for port %d\n", - (enable ? "on" : "off"), ppd->port); + printk(KERN_INFO QIB_DRV_NAME " IB%u:%u Turning LOS %s\n", + ppd->dd->unit, ppd->port, (enable ? "on" : "off")); if (enable) data |= SYM_MASK(IBSerdesCtrl_0, RXLOSEN); else From 4db62d4786e946e6fc8c2bb1f9201508f7f46c41 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Jan 2011 17:42:23 -0800 Subject: [PATCH 42/42] IB/qib: Fix refcount leak in lkey/rkey validation The mr optimization introduced a reference count leak on an exception test. The lock/refcount manipulation is moved down and the problematic exception test now calls bail to ensure that the lock is released. Additional fixes as suggested by Ralph Campbell : - reduce lock scope of dma regions - use explicit values on returns vs.
automatic ret value Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_keys.c | 30 +++++++++++++--------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_keys.c b/drivers/infiniband/hw/qib/qib_keys.c index 756d16098e73..8fd19a47df0c 100644 --- a/drivers/infiniband/hw/qib/qib_keys.c +++ b/drivers/infiniband/hw/qib/qib_keys.c @@ -136,7 +136,6 @@ int qib_lkey_ok(struct qib_lkey_table *rkt, struct qib_pd *pd, struct qib_mregion *mr; unsigned n, m; size_t off; - int ret = 0; unsigned long flags; /* @@ -152,27 +151,28 @@ int qib_lkey_ok(struct qib_lkey_table *rkt, struct qib_pd *pd, if (!dev->dma_mr) goto bail; atomic_inc(&dev->dma_mr->refcount); + spin_unlock_irqrestore(&rkt->lock, flags); + isge->mr = dev->dma_mr; isge->vaddr = (void *) sge->addr; isge->length = sge->length; isge->sge_length = sge->length; isge->m = 0; isge->n = 0; - spin_unlock_irqrestore(&rkt->lock, flags); goto ok; } mr = rkt->table[(sge->lkey >> (32 - ib_qib_lkey_table_size))]; if (unlikely(mr == NULL || mr->lkey != sge->lkey || mr->pd != &pd->ibpd)) goto bail; - atomic_inc(&mr->refcount); - spin_unlock_irqrestore(&rkt->lock, flags); off = sge->addr - mr->user_base; if (unlikely(sge->addr < mr->user_base || off + sge->length > mr->length || (mr->access_flags & acc) != acc)) - return ret; + goto bail; + atomic_inc(&mr->refcount); + spin_unlock_irqrestore(&rkt->lock, flags); off += mr->offset; if (mr->page_shift) { @@ -206,11 +206,10 @@ int qib_lkey_ok(struct qib_lkey_table *rkt, struct qib_pd *pd, isge->m = m; isge->n = n; ok: - ret = 1; - return ret; + return 1; bail: spin_unlock_irqrestore(&rkt->lock, flags); - return ret; + return 0; } /** @@ -231,7 +230,6 @@ int qib_rkey_ok(struct qib_qp *qp, struct qib_sge *sge, struct qib_mregion *mr; unsigned n, m; size_t off; - int ret = 0; unsigned long flags; /* @@ -248,26 +246,27 @@ int qib_rkey_ok(struct qib_qp *qp, struct qib_sge *sge, if (!dev->dma_mr) goto 
bail; atomic_inc(&dev->dma_mr->refcount); + spin_unlock_irqrestore(&rkt->lock, flags); + sge->mr = dev->dma_mr; sge->vaddr = (void *) vaddr; sge->length = len; sge->sge_length = len; sge->m = 0; sge->n = 0; - spin_unlock_irqrestore(&rkt->lock, flags); goto ok; } mr = rkt->table[(rkey >> (32 - ib_qib_lkey_table_size))]; if (unlikely(mr == NULL || mr->lkey != rkey || qp->ibqp.pd != mr->pd)) goto bail; - atomic_inc(&mr->refcount); - spin_unlock_irqrestore(&rkt->lock, flags); off = vaddr - mr->iova; if (unlikely(vaddr < mr->iova || off + len > mr->length || (mr->access_flags & acc) == 0)) - return ret; + goto bail; + atomic_inc(&mr->refcount); + spin_unlock_irqrestore(&rkt->lock, flags); off += mr->offset; if (mr->page_shift) { @@ -301,11 +300,10 @@ int qib_rkey_ok(struct qib_qp *qp, struct qib_sge *sge, sge->m = m; sge->n = n; ok: - ret = 1; - return ret; + return 1; bail: spin_unlock_irqrestore(&rkt->lock, flags); - return ret; + return 0; } /*