From f8181697fd09bb3b6d857cef31d9af93ea2b0758 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Fri, 1 Jul 2016 16:00:49 -0700 Subject: [PATCH 01/84] IB/hfi1: Clean up port state structure definition The definition of port state changed mid development and the old structure was kept accidentally. Remove this dead code. Reviewed-by: Dennis Dalessandro Signed-off-by: Ira Weiny Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/mad.c | 12 ------------ drivers/infiniband/hw/hfi1/mad.h | 7 ------- include/rdma/opa_port_info.h | 16 ---------------- 3 files changed, 35 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index fca07a1d6c28..223dd46cf2aa 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -588,7 +588,6 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, pi->port_phys_conf = (ppd->port_type & 0xf); -#if PI_LED_ENABLE_SUP pi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4; pi->port_states.ledenable_offlinereason |= ppd->is_sm_config_started << 5; @@ -602,11 +601,6 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, pi->port_states.ledenable_offlinereason |= is_beaconing_active << 6; pi->port_states.ledenable_offlinereason |= ppd->offline_disabled_reason; -#else - pi->port_states.offline_reason = ppd->neighbor_normal << 4; - pi->port_states.offline_reason |= ppd->is_sm_config_started << 5; - pi->port_states.offline_reason |= ppd->offline_disabled_reason; -#endif /* PI_LED_ENABLE_SUP */ pi->port_states.portphysstate_portstate = (hfi1_ibphys_portstate(ppd) << 4) | state; @@ -1752,17 +1746,11 @@ static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data, if (start_of_sm_config && (lstate == IB_PORT_INIT)) ppd->is_sm_config_started = 1; -#if PI_LED_ENABLE_SUP psi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4; psi->port_states.ledenable_offlinereason |= 
ppd->is_sm_config_started << 5; psi->port_states.ledenable_offlinereason |= ppd->offline_disabled_reason; -#else - psi->port_states.offline_reason = ppd->neighbor_normal << 4; - psi->port_states.offline_reason |= ppd->is_sm_config_started << 5; - psi->port_states.offline_reason |= ppd->offline_disabled_reason; -#endif /* PI_LED_ENABLE_SUP */ psi->port_states.portphysstate_portstate = (hfi1_ibphys_portstate(ppd) << 4) | (lstate & 0xf); diff --git a/drivers/infiniband/hw/hfi1/mad.h b/drivers/infiniband/hw/hfi1/mad.h index 8b734aaae88a..5aa3fd1be653 100644 --- a/drivers/infiniband/hw/hfi1/mad.h +++ b/drivers/infiniband/hw/hfi1/mad.h @@ -48,15 +48,8 @@ #define _HFI1_MAD_H #include -#define USE_PI_LED_ENABLE 1 /* - * use led enabled bit in struct - * opa_port_states, if available - */ #include #include -#ifndef PI_LED_ENABLE_SUP -#define PI_LED_ENABLE_SUP 0 -#endif #include "opa_compat.h" /* diff --git a/include/rdma/opa_port_info.h b/include/rdma/opa_port_info.h index 2b95c2c336eb..9303e0e4f508 100644 --- a/include/rdma/opa_port_info.h +++ b/include/rdma/opa_port_info.h @@ -33,11 +33,6 @@ #if !defined(OPA_PORT_INFO_H) #define OPA_PORT_INFO_H -/* Temporary until HFI driver is updated */ -#ifndef USE_PI_LED_ENABLE -#define USE_PI_LED_ENABLE 0 -#endif - #define OPA_PORT_LINK_MODE_NOP 0 /* No change */ #define OPA_PORT_LINK_MODE_OPA 4 /* Port mode is OPA */ @@ -274,23 +269,12 @@ enum port_info_field_masks { OPA_PI_MASK_MTU_CAP = 0x0F, }; -#if USE_PI_LED_ENABLE struct opa_port_states { u8 reserved; u8 ledenable_offlinereason; /* 1 res, 1 bit, 6 bits */ u8 reserved2; u8 portphysstate_portstate; /* 4 bits, 4 bits */ }; -#define PI_LED_ENABLE_SUP 1 -#else -struct opa_port_states { - u8 reserved; - u8 offline_reason; /* 2 res, 6 bits */ - u8 reserved2; - u8 portphysstate_portstate; /* 4 bits, 4 bits */ -}; -#define PI_LED_ENABLE_SUP 0 -#endif struct opa_port_state_info { struct opa_port_states port_states; From 0904f32796d4bb2d8102cd0056d8634f247ce45a Mon Sep 17 00:00:00 2001 
From: Ira Weiny Date: Fri, 1 Jul 2016 16:00:55 -0700 Subject: [PATCH 02/84] IB/hfi1: Remove unnecessary done label in hfi1_write_iter Simple code clean up of hfi1_write_iter. Reviewed-by: Dennis Dalessandro Signed-off-by: Ira Weiny Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/file_ops.c | 31 ++++++++++++--------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index c702a009608f..2f097d942f9c 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -392,41 +392,38 @@ static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from) struct hfi1_filedata *fd = kiocb->ki_filp->private_data; struct hfi1_user_sdma_pkt_q *pq = fd->pq; struct hfi1_user_sdma_comp_q *cq = fd->cq; - int ret = 0, done = 0, reqs = 0; + int done = 0, reqs = 0; unsigned long dim = from->nr_segs; - if (!cq || !pq) { - ret = -EIO; - goto done; - } + if (!cq || !pq) + return -EIO; - if (!iter_is_iovec(from) || !dim) { - ret = -EINVAL; - goto done; - } + if (!iter_is_iovec(from) || !dim) + return -EINVAL; hfi1_cdbg(SDMA, "SDMA request from %u:%u (%lu)", fd->uctxt->ctxt, fd->subctxt, dim); - if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) { - ret = -ENOSPC; - goto done; - } + if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) + return -ENOSPC; while (dim) { + int ret; unsigned long count = 0; ret = hfi1_user_sdma_process_request( kiocb->ki_filp, (struct iovec *)(from->iov + done), dim, &count); - if (ret) - goto done; + if (ret) { + reqs = ret; + break; + } dim -= count; done += count; reqs++; } -done: - return ret ? 
ret : reqs; + + return reqs; } static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) From 21a4c95d3fe8a0aacc4682f46b892cb4048b36b7 Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Fri, 1 Jul 2016 16:01:00 -0700 Subject: [PATCH 03/84] IB/hfi1: Fix typo Fix a copy and paste typo in comment. Reviewed-by: Dennis Dalessandro Signed-off-by: Tadeusz Struk Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/qsfp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/qsfp.c b/drivers/infiniband/hw/hfi1/qsfp.c index 9fb561682c66..6fca2a09b5f1 100644 --- a/drivers/infiniband/hw/hfi1/qsfp.c +++ b/drivers/infiniband/hw/hfi1/qsfp.c @@ -243,7 +243,7 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, /* * Perform a stand-alone single QSFP write. Acquire the resource, do the - * read, then release the resource. + * write, then release the resource. */ int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int len) From 462b6b21709fb65ee42ddea722d5cf745251417a Mon Sep 17 00:00:00 2001 From: Sebastian Sanchez Date: Fri, 1 Jul 2016 16:01:06 -0700 Subject: [PATCH 04/84] IB/hfi1: Separate tracepoints into specific headers The ftrace infrastructure used to evaluate the TRACE_SYSTEM macro on every DEFINE_EVENT() macro. Now the TRACE_SYSTEM macro only gets evaluated when trace/define_trace.h is included, so the group event information is lost. This was introduced in commit acd388fd3af3 ("tracing: Give system name a pointer") Therefore, each system tracepoint must be on its own file. 
Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Sebastian Sanchez Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/hfi.h | 51 + drivers/infiniband/hw/hfi1/rc.c | 8 +- drivers/infiniband/hw/hfi1/trace.h | 1333 +-------------------- drivers/infiniband/hw/hfi1/trace_ctxts.h | 141 +++ drivers/infiniband/hw/hfi1/trace_dbg.h | 155 +++ drivers/infiniband/hw/hfi1/trace_ibhdrs.h | 209 ++++ drivers/infiniband/hw/hfi1/trace_misc.h | 81 ++ drivers/infiniband/hw/hfi1/trace_rc.h | 123 ++ drivers/infiniband/hw/hfi1/trace_rx.h | 322 +++++ drivers/infiniband/hw/hfi1/trace_tx.h | 642 ++++++++++ 10 files changed, 1735 insertions(+), 1330 deletions(-) create mode 100644 drivers/infiniband/hw/hfi1/trace_ctxts.h create mode 100644 drivers/infiniband/hw/hfi1/trace_dbg.h create mode 100644 drivers/infiniband/hw/hfi1/trace_ibhdrs.h create mode 100644 drivers/infiniband/hw/hfi1/trace_misc.h create mode 100644 drivers/infiniband/hw/hfi1/trace_rc.h create mode 100644 drivers/infiniband/hw/hfi1/trace_rx.h create mode 100644 drivers/infiniband/hw/hfi1/trace_tx.h diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 4417a0fd3ef9..1dd48efb5b61 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1947,4 +1947,55 @@ static inline u32 qsfp_resource(struct hfi1_devdata *dd) int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp); +#define DD_DEV_ENTRY(dd) __string(dev, dev_name(&(dd)->pcidev->dev)) +#define DD_DEV_ASSIGN(dd) __assign_str(dev, dev_name(&(dd)->pcidev->dev)) + +#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype } +#define show_packettype(etype) \ +__print_symbolic(etype, \ + packettype_name(EXPECTED), \ + packettype_name(EAGER), \ + packettype_name(IB), \ + packettype_name(ERROR), \ + packettype_name(BYPASS)) + +#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode } +#define show_ib_opcode(opcode) \ 
+__print_symbolic(opcode, \ + ib_opcode_name(RC_SEND_FIRST), \ + ib_opcode_name(RC_SEND_MIDDLE), \ + ib_opcode_name(RC_SEND_LAST), \ + ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(RC_SEND_ONLY), \ + ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_WRITE_FIRST), \ + ib_opcode_name(RC_RDMA_WRITE_MIDDLE), \ + ib_opcode_name(RC_RDMA_WRITE_LAST), \ + ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_WRITE_ONLY), \ + ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_READ_REQUEST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY), \ + ib_opcode_name(RC_ACKNOWLEDGE), \ + ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \ + ib_opcode_name(RC_COMPARE_SWAP), \ + ib_opcode_name(RC_FETCH_ADD), \ + ib_opcode_name(UC_SEND_FIRST), \ + ib_opcode_name(UC_SEND_MIDDLE), \ + ib_opcode_name(UC_SEND_LAST), \ + ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(UC_SEND_ONLY), \ + ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(UC_RDMA_WRITE_FIRST), \ + ib_opcode_name(UC_RDMA_WRITE_MIDDLE), \ + ib_opcode_name(UC_RDMA_WRITE_LAST), \ + ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(UC_RDMA_WRITE_ONLY), \ + ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(UD_SEND_ONLY), \ + ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(CNP)) #endif /* _HFI1_KERNEL_H */ diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 792f15eb8efe..3aeb83297408 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -1047,7 +1047,7 @@ void hfi1_rc_timeout(unsigned long arg) ibp->rvp.n_rc_timeouts++; qp->s_flags &= ~RVT_S_TIMER; del_timer(&qp->s_timer); - trace_hfi1_rc_timeout(qp, qp->s_last_psn + 1); + trace_hfi1_timeout(qp, qp->s_last_psn + 
1); restart_rc(qp, qp->s_last_psn + 1, 1); hfi1_schedule_send(qp); } @@ -1171,7 +1171,7 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_ib_header *hdr) * If we were waiting for sends to complete before re-sending, * and they are now complete, restart sending. */ - trace_hfi1_rc_sendcomplete(qp, psn); + trace_hfi1_sendcomplete(qp, psn); if (qp->s_flags & RVT_S_WAIT_PSN && cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { qp->s_flags &= ~RVT_S_WAIT_PSN; @@ -1567,7 +1567,7 @@ static void rc_rcv_resp(struct hfi1_ibport *ibp, spin_lock_irqsave(&qp->s_lock, flags); - trace_hfi1_rc_ack(qp, psn); + trace_hfi1_ack(qp, psn); /* Ignore invalid responses. */ smp_read_barrier_depends(); /* see post_one_send */ @@ -1782,7 +1782,7 @@ static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data, u8 i, prev; int old_req; - trace_hfi1_rc_rcv_error(qp, psn); + trace_hfi1_rcv_error(qp, psn); if (diff > 0) { /* * Packet sequence error. diff --git a/drivers/infiniband/hw/hfi1/trace.h b/drivers/infiniband/hw/hfi1/trace.h index 28c1d0832886..92dc88f013c9 100644 --- a/drivers/infiniband/hw/hfi1/trace.h +++ b/drivers/infiniband/hw/hfi1/trace.h @@ -44,1329 +44,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* */ -#undef TRACE_SYSTEM_VAR -#define TRACE_SYSTEM_VAR hfi1 - -#if !defined(__HFI1_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) -#define __HFI1_TRACE_H - -#include -#include - -#include "hfi.h" -#include "mad.h" -#include "sdma.h" - -#define DD_DEV_ENTRY(dd) __string(dev, dev_name(&(dd)->pcidev->dev)) -#define DD_DEV_ASSIGN(dd) __assign_str(dev, dev_name(&(dd)->pcidev->dev)) - -#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype } -#define show_packettype(etype) \ -__print_symbolic(etype, \ - packettype_name(EXPECTED), \ - packettype_name(EAGER), \ - packettype_name(IB), \ - packettype_name(ERROR), \ - packettype_name(BYPASS)) - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM hfi1_rx - -TRACE_EVENT(hfi1_rcvhdr, - TP_PROTO(struct hfi1_devdata *dd, - u32 ctxt, - u64 eflags, - u32 etype, - u32 hlen, - u32 tlen, - u32 updegr, - u32 etail - ), - TP_ARGS(dd, ctxt, eflags, etype, hlen, tlen, updegr, etail), - TP_STRUCT__entry(DD_DEV_ENTRY(dd) - __field(u64, eflags) - __field(u32, ctxt) - __field(u32, etype) - __field(u32, hlen) - __field(u32, tlen) - __field(u32, updegr) - __field(u32, etail) - ), - TP_fast_assign(DD_DEV_ASSIGN(dd); - __entry->eflags = eflags; - __entry->ctxt = ctxt; - __entry->etype = etype; - __entry->hlen = hlen; - __entry->tlen = tlen; - __entry->updegr = updegr; - __entry->etail = etail; - ), - TP_printk( - "[%s] ctxt %d eflags 0x%llx etype %d,%s hlen %d tlen %d updegr %d etail %d", - __get_str(dev), - __entry->ctxt, - __entry->eflags, - __entry->etype, show_packettype(__entry->etype), - __entry->hlen, - __entry->tlen, - __entry->updegr, - __entry->etail - ) -); - -TRACE_EVENT(hfi1_receive_interrupt, - TP_PROTO(struct hfi1_devdata *dd, u32 ctxt), - TP_ARGS(dd, ctxt), - TP_STRUCT__entry(DD_DEV_ENTRY(dd) - __field(u32, ctxt) - __field(u8, slow_path) - __field(u8, dma_rtail) - ), - TP_fast_assign(DD_DEV_ASSIGN(dd); - __entry->ctxt = ctxt; - if (dd->rcd[ctxt]->do_interrupt == - &handle_receive_interrupt) { - __entry->slow_path = 1; - 
__entry->dma_rtail = 0xFF; - } else if (dd->rcd[ctxt]->do_interrupt == - &handle_receive_interrupt_dma_rtail){ - __entry->dma_rtail = 1; - __entry->slow_path = 0; - } else if (dd->rcd[ctxt]->do_interrupt == - &handle_receive_interrupt_nodma_rtail) { - __entry->dma_rtail = 0; - __entry->slow_path = 0; - } - ), - TP_printk("[%s] ctxt %d SlowPath: %d DmaRtail: %d", - __get_str(dev), - __entry->ctxt, - __entry->slow_path, - __entry->dma_rtail - ) -); - -TRACE_EVENT(hfi1_exp_tid_reg, - TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr, - u32 npages, unsigned long va, unsigned long pa, - dma_addr_t dma), - TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma), - TP_STRUCT__entry( - __field(unsigned, ctxt) - __field(u16, subctxt) - __field(u32, rarr) - __field(u32, npages) - __field(unsigned long, va) - __field(unsigned long, pa) - __field(dma_addr_t, dma) - ), - TP_fast_assign( - __entry->ctxt = ctxt; - __entry->subctxt = subctxt; - __entry->rarr = rarr; - __entry->npages = npages; - __entry->va = va; - __entry->pa = pa; - __entry->dma = dma; - ), - TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx", - __entry->ctxt, - __entry->subctxt, - __entry->rarr, - __entry->npages, - __entry->pa, - __entry->va, - __entry->dma - ) - ); - -TRACE_EVENT(hfi1_exp_tid_unreg, - TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr, u32 npages, - unsigned long va, unsigned long pa, dma_addr_t dma), - TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma), - TP_STRUCT__entry( - __field(unsigned, ctxt) - __field(u16, subctxt) - __field(u32, rarr) - __field(u32, npages) - __field(unsigned long, va) - __field(unsigned long, pa) - __field(dma_addr_t, dma) - ), - TP_fast_assign( - __entry->ctxt = ctxt; - __entry->subctxt = subctxt; - __entry->rarr = rarr; - __entry->npages = npages; - __entry->va = va; - __entry->pa = pa; - __entry->dma = dma; - ), - TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx", - __entry->ctxt, - __entry->subctxt, - __entry->rarr, - __entry->npages, - 
__entry->pa, - __entry->va, - __entry->dma - ) - ); - -TRACE_EVENT(hfi1_exp_tid_inval, - TP_PROTO(unsigned ctxt, u16 subctxt, unsigned long va, u32 rarr, - u32 npages, dma_addr_t dma), - TP_ARGS(ctxt, subctxt, va, rarr, npages, dma), - TP_STRUCT__entry( - __field(unsigned, ctxt) - __field(u16, subctxt) - __field(unsigned long, va) - __field(u32, rarr) - __field(u32, npages) - __field(dma_addr_t, dma) - ), - TP_fast_assign( - __entry->ctxt = ctxt; - __entry->subctxt = subctxt; - __entry->va = va; - __entry->rarr = rarr; - __entry->npages = npages; - __entry->dma = dma; - ), - TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx", - __entry->ctxt, - __entry->subctxt, - __entry->rarr, - __entry->npages, - __entry->va, - __entry->dma - ) - ); - -TRACE_EVENT(hfi1_mmu_invalidate, - TP_PROTO(unsigned ctxt, u16 subctxt, const char *type, - unsigned long start, unsigned long end), - TP_ARGS(ctxt, subctxt, type, start, end), - TP_STRUCT__entry( - __field(unsigned, ctxt) - __field(u16, subctxt) - __string(type, type) - __field(unsigned long, start) - __field(unsigned long, end) - ), - TP_fast_assign( - __entry->ctxt = ctxt; - __entry->subctxt = subctxt; - __assign_str(type, type); - __entry->start = start; - __entry->end = end; - ), - TP_printk("[%3u:%02u] MMU Invalidate (%s) 0x%lx - 0x%lx", - __entry->ctxt, - __entry->subctxt, - __get_str(type), - __entry->start, - __entry->end - ) - ); - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM hfi1_tx - -TRACE_EVENT(hfi1_piofree, - TP_PROTO(struct send_context *sc, int extra), - TP_ARGS(sc, extra), - TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd) - __field(u32, sw_index) - __field(u32, hw_context) - __field(int, extra) - ), - TP_fast_assign(DD_DEV_ASSIGN(sc->dd); - __entry->sw_index = sc->sw_index; - __entry->hw_context = sc->hw_context; - __entry->extra = extra; - ), - TP_printk("[%s] ctxt %u(%u) extra %d", - __get_str(dev), - __entry->sw_index, - __entry->hw_context, - __entry->extra - ) -); - -TRACE_EVENT(hfi1_wantpiointr, - 
TP_PROTO(struct send_context *sc, u32 needint, u64 credit_ctrl), - TP_ARGS(sc, needint, credit_ctrl), - TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd) - __field(u32, sw_index) - __field(u32, hw_context) - __field(u32, needint) - __field(u64, credit_ctrl) - ), - TP_fast_assign(DD_DEV_ASSIGN(sc->dd); - __entry->sw_index = sc->sw_index; - __entry->hw_context = sc->hw_context; - __entry->needint = needint; - __entry->credit_ctrl = credit_ctrl; - ), - TP_printk("[%s] ctxt %u(%u) on %d credit_ctrl 0x%llx", - __get_str(dev), - __entry->sw_index, - __entry->hw_context, - __entry->needint, - (unsigned long long)__entry->credit_ctrl - ) -); - -DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template, - TP_PROTO(struct rvt_qp *qp, u32 flags), - TP_ARGS(qp, flags), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) - __field(u32, qpn) - __field(u32, flags) - __field(u32, s_flags) - ), - TP_fast_assign( - DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) - __entry->flags = flags; - __entry->qpn = qp->ibqp.qp_num; - __entry->s_flags = qp->s_flags; - ), - TP_printk( - "[%s] qpn 0x%x flags 0x%x s_flags 0x%x", - __get_str(dev), - __entry->qpn, - __entry->flags, - __entry->s_flags - ) -); - -DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpwakeup, - TP_PROTO(struct rvt_qp *qp, u32 flags), - TP_ARGS(qp, flags)); - -DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpsleep, - TP_PROTO(struct rvt_qp *qp, u32 flags), - TP_ARGS(qp, flags)); - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM hfi1_ibhdrs - -u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr); -const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, void *ehdrs); - -#define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs) - -const char *parse_sdma_flags(struct trace_seq *p, u64 desc0, u64 desc1); - -#define __parse_sdma_flags(desc0, desc1) parse_sdma_flags(p, desc0, desc1) - -#define lrh_name(lrh) { HFI1_##lrh, #lrh } -#define show_lnh(lrh) \ -__print_symbolic(lrh, \ - lrh_name(LRH_BTH), \ - lrh_name(LRH_GRH)) - -#define 
ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode } -#define show_ib_opcode(opcode) \ -__print_symbolic(opcode, \ - ib_opcode_name(RC_SEND_FIRST), \ - ib_opcode_name(RC_SEND_MIDDLE), \ - ib_opcode_name(RC_SEND_LAST), \ - ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE), \ - ib_opcode_name(RC_SEND_ONLY), \ - ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE), \ - ib_opcode_name(RC_RDMA_WRITE_FIRST), \ - ib_opcode_name(RC_RDMA_WRITE_MIDDLE), \ - ib_opcode_name(RC_RDMA_WRITE_LAST), \ - ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \ - ib_opcode_name(RC_RDMA_WRITE_ONLY), \ - ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \ - ib_opcode_name(RC_RDMA_READ_REQUEST), \ - ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST), \ - ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE), \ - ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST), \ - ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY), \ - ib_opcode_name(RC_ACKNOWLEDGE), \ - ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \ - ib_opcode_name(RC_COMPARE_SWAP), \ - ib_opcode_name(RC_FETCH_ADD), \ - ib_opcode_name(RC_SEND_LAST_WITH_INVALIDATE), \ - ib_opcode_name(RC_SEND_ONLY_WITH_INVALIDATE), \ - ib_opcode_name(UC_SEND_FIRST), \ - ib_opcode_name(UC_SEND_MIDDLE), \ - ib_opcode_name(UC_SEND_LAST), \ - ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE), \ - ib_opcode_name(UC_SEND_ONLY), \ - ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE), \ - ib_opcode_name(UC_RDMA_WRITE_FIRST), \ - ib_opcode_name(UC_RDMA_WRITE_MIDDLE), \ - ib_opcode_name(UC_RDMA_WRITE_LAST), \ - ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \ - ib_opcode_name(UC_RDMA_WRITE_ONLY), \ - ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \ - ib_opcode_name(UD_SEND_ONLY), \ - ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE), \ - ib_opcode_name(CNP)) - -#define LRH_PRN "vl %d lver %d sl %d lnh %d,%s dlid %.4x len %d slid %.4x" -#define BTH_PRN \ - "op 0x%.2x,%s se %d m %d pad %d tver %d pkey 0x%.4x " \ - "f %d b %d qpn 0x%.6x a %d psn 0x%.8x" -#define EHDR_PRN "%s" - -DECLARE_EVENT_CLASS(hfi1_ibhdr_template, - 
TP_PROTO(struct hfi1_devdata *dd, - struct hfi1_ib_header *hdr), - TP_ARGS(dd, hdr), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd) - /* LRH */ - __field(u8, vl) - __field(u8, lver) - __field(u8, sl) - __field(u8, lnh) - __field(u16, dlid) - __field(u16, len) - __field(u16, slid) - /* BTH */ - __field(u8, opcode) - __field(u8, se) - __field(u8, m) - __field(u8, pad) - __field(u8, tver) - __field(u16, pkey) - __field(u8, f) - __field(u8, b) - __field(u32, qpn) - __field(u8, a) - __field(u32, psn) - /* extended headers */ - __dynamic_array(u8, ehdrs, ibhdr_exhdr_len(hdr)) - ), - TP_fast_assign( - struct hfi1_other_headers *ohdr; - - DD_DEV_ASSIGN(dd); - /* LRH */ - __entry->vl = - (u8)(be16_to_cpu(hdr->lrh[0]) >> 12); - __entry->lver = - (u8)(be16_to_cpu(hdr->lrh[0]) >> 8) & 0xf; - __entry->sl = - (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf; - __entry->lnh = - (u8)(be16_to_cpu(hdr->lrh[0]) & 3); - __entry->dlid = - be16_to_cpu(hdr->lrh[1]); - /* allow for larger len */ - __entry->len = - be16_to_cpu(hdr->lrh[2]); - __entry->slid = - be16_to_cpu(hdr->lrh[3]); - /* BTH */ - if (__entry->lnh == HFI1_LRH_BTH) - ohdr = &hdr->u.oth; - else - ohdr = &hdr->u.l.oth; - __entry->opcode = - (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; - __entry->se = - (be32_to_cpu(ohdr->bth[0]) >> 23) & 1; - __entry->m = - (be32_to_cpu(ohdr->bth[0]) >> 22) & 1; - __entry->pad = - (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; - __entry->tver = - (be32_to_cpu(ohdr->bth[0]) >> 16) & 0xf; - __entry->pkey = - be32_to_cpu(ohdr->bth[0]) & 0xffff; - __entry->f = - (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT) & - HFI1_FECN_MASK; - __entry->b = - (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT) & - HFI1_BECN_MASK; - __entry->qpn = - be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; - __entry->a = - (be32_to_cpu(ohdr->bth[2]) >> 31) & 1; - /* allow for larger PSN */ - __entry->psn = - be32_to_cpu(ohdr->bth[2]) & 0x7fffffff; - /* extended headers */ - memcpy(__get_dynamic_array(ehdrs), &ohdr->u, - ibhdr_exhdr_len(hdr)); - ), - 
TP_printk("[%s] " LRH_PRN " " BTH_PRN " " EHDR_PRN, - __get_str(dev), - /* LRH */ - __entry->vl, - __entry->lver, - __entry->sl, - __entry->lnh, show_lnh(__entry->lnh), - __entry->dlid, - __entry->len, - __entry->slid, - /* BTH */ - __entry->opcode, show_ib_opcode(__entry->opcode), - __entry->se, - __entry->m, - __entry->pad, - __entry->tver, - __entry->pkey, - __entry->f, - __entry->b, - __entry->qpn, - __entry->a, - __entry->psn, - /* extended headers */ - __parse_ib_ehdrs( - __entry->opcode, - (void *)__get_dynamic_array(ehdrs)) - ) -); - -DEFINE_EVENT(hfi1_ibhdr_template, input_ibhdr, - TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), - TP_ARGS(dd, hdr)); - -DEFINE_EVENT(hfi1_ibhdr_template, pio_output_ibhdr, - TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), - TP_ARGS(dd, hdr)); - -DEFINE_EVENT(hfi1_ibhdr_template, ack_output_ibhdr, - TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), - TP_ARGS(dd, hdr)); - -DEFINE_EVENT(hfi1_ibhdr_template, sdma_output_ibhdr, - TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), - TP_ARGS(dd, hdr)); - -#define SNOOP_PRN \ - "slid %.4x dlid %.4x qpn 0x%.6x opcode 0x%.2x,%s " \ - "svc lvl %d pkey 0x%.4x [header = %d bytes] [data = %d bytes]" - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM hfi1_snoop - -TRACE_EVENT(snoop_capture, - TP_PROTO(struct hfi1_devdata *dd, - int hdr_len, - struct hfi1_ib_header *hdr, - int data_len, - void *data), - TP_ARGS(dd, hdr_len, hdr, data_len, data), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd) - __field(u16, slid) - __field(u16, dlid) - __field(u32, qpn) - __field(u8, opcode) - __field(u8, sl) - __field(u16, pkey) - __field(u32, hdr_len) - __field(u32, data_len) - __field(u8, lnh) - __dynamic_array(u8, raw_hdr, hdr_len) - __dynamic_array(u8, raw_pkt, data_len) - ), - TP_fast_assign( - struct hfi1_other_headers *ohdr; - - __entry->lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3); - if (__entry->lnh == HFI1_LRH_BTH) - ohdr = &hdr->u.oth; - else - ohdr = 
&hdr->u.l.oth; - DD_DEV_ASSIGN(dd); - __entry->slid = be16_to_cpu(hdr->lrh[3]); - __entry->dlid = be16_to_cpu(hdr->lrh[1]); - __entry->qpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; - __entry->opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; - __entry->sl = (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf; - __entry->pkey = be32_to_cpu(ohdr->bth[0]) & 0xffff; - __entry->hdr_len = hdr_len; - __entry->data_len = data_len; - memcpy(__get_dynamic_array(raw_hdr), hdr, hdr_len); - memcpy(__get_dynamic_array(raw_pkt), data, data_len); - ), - TP_printk( - "[%s] " SNOOP_PRN, - __get_str(dev), - __entry->slid, - __entry->dlid, - __entry->qpn, - __entry->opcode, - show_ib_opcode(__entry->opcode), - __entry->sl, - __entry->pkey, - __entry->hdr_len, - __entry->data_len - ) -); - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM hfi1_ctxts - -#define UCTXT_FMT \ - "cred:%u, credaddr:0x%llx, piobase:0x%llx, rcvhdr_cnt:%u, " \ - "rcvbase:0x%llx, rcvegrc:%u, rcvegrb:0x%llx" -TRACE_EVENT(hfi1_uctxtdata, - TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt), - TP_ARGS(dd, uctxt), - TP_STRUCT__entry(DD_DEV_ENTRY(dd) - __field(unsigned, ctxt) - __field(u32, credits) - __field(u64, hw_free) - __field(u64, piobase) - __field(u16, rcvhdrq_cnt) - __field(u64, rcvhdrq_phys) - __field(u32, eager_cnt) - __field(u64, rcvegr_phys) - ), - TP_fast_assign(DD_DEV_ASSIGN(dd); - __entry->ctxt = uctxt->ctxt; - __entry->credits = uctxt->sc->credits; - __entry->hw_free = (u64)uctxt->sc->hw_free; - __entry->piobase = (u64)uctxt->sc->base_addr; - __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt; - __entry->rcvhdrq_phys = uctxt->rcvhdrq_phys; - __entry->eager_cnt = uctxt->egrbufs.alloced; - __entry->rcvegr_phys = - uctxt->egrbufs.rcvtids[0].phys; - ), - TP_printk("[%s] ctxt %u " UCTXT_FMT, - __get_str(dev), - __entry->ctxt, - __entry->credits, - __entry->hw_free, - __entry->piobase, - __entry->rcvhdrq_cnt, - __entry->rcvhdrq_phys, - __entry->eager_cnt, - __entry->rcvegr_phys - ) -); - -#define CINFO_FMT \ - 
"egrtids:%u, egr_size:%u, hdrq_cnt:%u, hdrq_size:%u, sdma_ring_size:%u" -TRACE_EVENT(hfi1_ctxt_info, - TP_PROTO(struct hfi1_devdata *dd, unsigned ctxt, unsigned subctxt, - struct hfi1_ctxt_info cinfo), - TP_ARGS(dd, ctxt, subctxt, cinfo), - TP_STRUCT__entry(DD_DEV_ENTRY(dd) - __field(unsigned, ctxt) - __field(unsigned, subctxt) - __field(u16, egrtids) - __field(u16, rcvhdrq_cnt) - __field(u16, rcvhdrq_size) - __field(u16, sdma_ring_size) - __field(u32, rcvegr_size) - ), - TP_fast_assign(DD_DEV_ASSIGN(dd); - __entry->ctxt = ctxt; - __entry->subctxt = subctxt; - __entry->egrtids = cinfo.egrtids; - __entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt; - __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize; - __entry->sdma_ring_size = cinfo.sdma_ring_size; - __entry->rcvegr_size = cinfo.rcvegr_size; - ), - TP_printk("[%s] ctxt %u:%u " CINFO_FMT, - __get_str(dev), - __entry->ctxt, - __entry->subctxt, - __entry->egrtids, - __entry->rcvegr_size, - __entry->rcvhdrq_cnt, - __entry->rcvhdrq_size, - __entry->sdma_ring_size - ) -); - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM hfi1_sma - -#define BCT_FORMAT \ - "shared_limit %x vls 0-7 [%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x] 15 [%x,%x]" - -#define BCT(field) \ - be16_to_cpu( \ - ((struct buffer_control *)__get_dynamic_array(bct))->field \ - ) - -DECLARE_EVENT_CLASS(hfi1_bct_template, - TP_PROTO(struct hfi1_devdata *dd, - struct buffer_control *bc), - TP_ARGS(dd, bc), - TP_STRUCT__entry(DD_DEV_ENTRY(dd) - __dynamic_array(u8, bct, sizeof(*bc)) - ), - TP_fast_assign(DD_DEV_ASSIGN(dd); - memcpy(__get_dynamic_array(bct), bc, - sizeof(*bc)); - ), - TP_printk(BCT_FORMAT, - BCT(overall_shared_limit), - - BCT(vl[0].dedicated), - BCT(vl[0].shared), - - BCT(vl[1].dedicated), - BCT(vl[1].shared), - - BCT(vl[2].dedicated), - BCT(vl[2].shared), - - BCT(vl[3].dedicated), - BCT(vl[3].shared), - - BCT(vl[4].dedicated), - BCT(vl[4].shared), - - BCT(vl[5].dedicated), - BCT(vl[5].shared), - - BCT(vl[6].dedicated), - BCT(vl[6].shared), - - 
BCT(vl[7].dedicated), - BCT(vl[7].shared), - - BCT(vl[15].dedicated), - BCT(vl[15].shared) - ) -); - -DEFINE_EVENT(hfi1_bct_template, bct_set, - TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc), - TP_ARGS(dd, bc)); - -DEFINE_EVENT(hfi1_bct_template, bct_get, - TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc), - TP_ARGS(dd, bc)); - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM hfi1_sdma - -TRACE_EVENT(hfi1_sdma_descriptor, - TP_PROTO(struct sdma_engine *sde, - u64 desc0, - u64 desc1, - u16 e, - void *descp), - TP_ARGS(sde, desc0, desc1, e, descp), - TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) - __field(void *, descp) - __field(u64, desc0) - __field(u64, desc1) - __field(u16, e) - __field(u8, idx) - ), - TP_fast_assign(DD_DEV_ASSIGN(sde->dd); - __entry->desc0 = desc0; - __entry->desc1 = desc1; - __entry->idx = sde->this_idx; - __entry->descp = descp; - __entry->e = e; - ), - TP_printk( - "[%s] SDE(%u) flags:%s addr:0x%016llx gen:%u len:%u d0:%016llx d1:%016llx to %p,%u", - __get_str(dev), - __entry->idx, - __parse_sdma_flags(__entry->desc0, __entry->desc1), - (__entry->desc0 >> SDMA_DESC0_PHY_ADDR_SHIFT) & - SDMA_DESC0_PHY_ADDR_MASK, - (u8)((__entry->desc1 >> SDMA_DESC1_GENERATION_SHIFT) & - SDMA_DESC1_GENERATION_MASK), - (u16)((__entry->desc0 >> SDMA_DESC0_BYTE_COUNT_SHIFT) & - SDMA_DESC0_BYTE_COUNT_MASK), - __entry->desc0, - __entry->desc1, - __entry->descp, - __entry->e - ) -); - -TRACE_EVENT(hfi1_sdma_engine_select, - TP_PROTO(struct hfi1_devdata *dd, u32 sel, u8 vl, u8 idx), - TP_ARGS(dd, sel, vl, idx), - TP_STRUCT__entry(DD_DEV_ENTRY(dd) - __field(u32, sel) - __field(u8, vl) - __field(u8, idx) - ), - TP_fast_assign(DD_DEV_ASSIGN(dd); - __entry->sel = sel; - __entry->vl = vl; - __entry->idx = idx; - ), - TP_printk("[%s] selecting SDE %u sel 0x%x vl %u", - __get_str(dev), - __entry->idx, - __entry->sel, - __entry->vl - ) -); - -DECLARE_EVENT_CLASS(hfi1_sdma_engine_class, - TP_PROTO(struct sdma_engine *sde, u64 status), - TP_ARGS(sde, status), - 
TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) - __field(u64, status) - __field(u8, idx) - ), - TP_fast_assign(DD_DEV_ASSIGN(sde->dd); - __entry->status = status; - __entry->idx = sde->this_idx; - ), - TP_printk("[%s] SDE(%u) status %llx", - __get_str(dev), - __entry->idx, - (unsigned long long)__entry->status - ) -); - -DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_interrupt, - TP_PROTO(struct sdma_engine *sde, u64 status), - TP_ARGS(sde, status) -); - -DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_progress, - TP_PROTO(struct sdma_engine *sde, u64 status), - TP_ARGS(sde, status) -); - -DECLARE_EVENT_CLASS(hfi1_sdma_ahg_ad, - TP_PROTO(struct sdma_engine *sde, int aidx), - TP_ARGS(sde, aidx), - TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) - __field(int, aidx) - __field(u8, idx) - ), - TP_fast_assign(DD_DEV_ASSIGN(sde->dd); - __entry->idx = sde->this_idx; - __entry->aidx = aidx; - ), - TP_printk("[%s] SDE(%u) aidx %d", - __get_str(dev), - __entry->idx, - __entry->aidx - ) -); - -DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_allocate, - TP_PROTO(struct sdma_engine *sde, int aidx), - TP_ARGS(sde, aidx)); - -DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_deallocate, - TP_PROTO(struct sdma_engine *sde, int aidx), - TP_ARGS(sde, aidx)); - -#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER -TRACE_EVENT(hfi1_sdma_progress, - TP_PROTO(struct sdma_engine *sde, - u16 hwhead, - u16 swhead, - struct sdma_txreq *txp - ), - TP_ARGS(sde, hwhead, swhead, txp), - TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) - __field(u64, sn) - __field(u16, hwhead) - __field(u16, swhead) - __field(u16, txnext) - __field(u16, tx_tail) - __field(u16, tx_head) - __field(u8, idx) - ), - TP_fast_assign(DD_DEV_ASSIGN(sde->dd); - __entry->hwhead = hwhead; - __entry->swhead = swhead; - __entry->tx_tail = sde->tx_tail; - __entry->tx_head = sde->tx_head; - __entry->txnext = txp ? txp->next_descq_idx : ~0; - __entry->idx = sde->this_idx; - __entry->sn = txp ? 
txp->sn : ~0; - ), - TP_printk( - "[%s] SDE(%u) sn %llu hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u", - __get_str(dev), - __entry->idx, - __entry->sn, - __entry->hwhead, - __entry->swhead, - __entry->txnext, - __entry->tx_head, - __entry->tx_tail - ) -); -#else -TRACE_EVENT(hfi1_sdma_progress, - TP_PROTO(struct sdma_engine *sde, - u16 hwhead, u16 swhead, - struct sdma_txreq *txp - ), - TP_ARGS(sde, hwhead, swhead, txp), - TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) - __field(u16, hwhead) - __field(u16, swhead) - __field(u16, txnext) - __field(u16, tx_tail) - __field(u16, tx_head) - __field(u8, idx) - ), - TP_fast_assign(DD_DEV_ASSIGN(sde->dd); - __entry->hwhead = hwhead; - __entry->swhead = swhead; - __entry->tx_tail = sde->tx_tail; - __entry->tx_head = sde->tx_head; - __entry->txnext = txp ? txp->next_descq_idx : ~0; - __entry->idx = sde->this_idx; - ), - TP_printk( - "[%s] SDE(%u) hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u", - __get_str(dev), - __entry->idx, - __entry->hwhead, - __entry->swhead, - __entry->txnext, - __entry->tx_head, - __entry->tx_tail - ) -); -#endif - -DECLARE_EVENT_CLASS(hfi1_sdma_sn, - TP_PROTO(struct sdma_engine *sde, u64 sn), - TP_ARGS(sde, sn), - TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) - __field(u64, sn) - __field(u8, idx) - ), - TP_fast_assign(DD_DEV_ASSIGN(sde->dd); - __entry->sn = sn; - __entry->idx = sde->this_idx; - ), - TP_printk("[%s] SDE(%u) sn %llu", - __get_str(dev), - __entry->idx, - __entry->sn - ) -); - -DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_out_sn, - TP_PROTO( - struct sdma_engine *sde, - u64 sn - ), - TP_ARGS(sde, sn) -); - -DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_in_sn, - TP_PROTO(struct sdma_engine *sde, u64 sn), - TP_ARGS(sde, sn) -); - -#define USDMA_HDR_FORMAT \ - "[%s:%u:%u:%u] PBC=(0x%x 0x%x) LRH=(0x%x 0x%x) BTH=(0x%x 0x%x 0x%x) KDETH=(0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x) TIDVal=0x%x" - -TRACE_EVENT(hfi1_sdma_user_header, - TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 
req, - struct hfi1_pkt_header *hdr, u32 tidval), - TP_ARGS(dd, ctxt, subctxt, req, hdr, tidval), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd) - __field(u16, ctxt) - __field(u8, subctxt) - __field(u16, req) - __field(__le32, pbc0) - __field(__le32, pbc1) - __field(__be32, lrh0) - __field(__be32, lrh1) - __field(__be32, bth0) - __field(__be32, bth1) - __field(__be32, bth2) - __field(__le32, kdeth0) - __field(__le32, kdeth1) - __field(__le32, kdeth2) - __field(__le32, kdeth3) - __field(__le32, kdeth4) - __field(__le32, kdeth5) - __field(__le32, kdeth6) - __field(__le32, kdeth7) - __field(__le32, kdeth8) - __field(u32, tidval) - ), - TP_fast_assign( - __le32 *pbc = (__le32 *)hdr->pbc; - __be32 *lrh = (__be32 *)hdr->lrh; - __be32 *bth = (__be32 *)hdr->bth; - __le32 *kdeth = (__le32 *)&hdr->kdeth; - - DD_DEV_ASSIGN(dd); - __entry->ctxt = ctxt; - __entry->subctxt = subctxt; - __entry->req = req; - __entry->pbc0 = pbc[0]; - __entry->pbc1 = pbc[1]; - __entry->lrh0 = be32_to_cpu(lrh[0]); - __entry->lrh1 = be32_to_cpu(lrh[1]); - __entry->bth0 = be32_to_cpu(bth[0]); - __entry->bth1 = be32_to_cpu(bth[1]); - __entry->bth2 = be32_to_cpu(bth[2]); - __entry->kdeth0 = kdeth[0]; - __entry->kdeth1 = kdeth[1]; - __entry->kdeth2 = kdeth[2]; - __entry->kdeth3 = kdeth[3]; - __entry->kdeth4 = kdeth[4]; - __entry->kdeth5 = kdeth[5]; - __entry->kdeth6 = kdeth[6]; - __entry->kdeth7 = kdeth[7]; - __entry->kdeth8 = kdeth[8]; - __entry->tidval = tidval; - ), - TP_printk(USDMA_HDR_FORMAT, - __get_str(dev), - __entry->ctxt, - __entry->subctxt, - __entry->req, - __entry->pbc1, - __entry->pbc0, - __entry->lrh0, - __entry->lrh1, - __entry->bth0, - __entry->bth1, - __entry->bth2, - __entry->kdeth0, - __entry->kdeth1, - __entry->kdeth2, - __entry->kdeth3, - __entry->kdeth4, - __entry->kdeth5, - __entry->kdeth6, - __entry->kdeth7, - __entry->kdeth8, - __entry->tidval - ) - ); - -#define SDMA_UREQ_FMT \ - "[%s:%u:%u] ver/op=0x%x, iovcnt=%u, npkts=%u, frag=%u, idx=%u" -TRACE_EVENT(hfi1_sdma_user_reqinfo, - 
TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 *i), - TP_ARGS(dd, ctxt, subctxt, i), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd); - __field(u16, ctxt) - __field(u8, subctxt) - __field(u8, ver_opcode) - __field(u8, iovcnt) - __field(u16, npkts) - __field(u16, fragsize) - __field(u16, comp_idx) - ), - TP_fast_assign( - DD_DEV_ASSIGN(dd); - __entry->ctxt = ctxt; - __entry->subctxt = subctxt; - __entry->ver_opcode = i[0] & 0xff; - __entry->iovcnt = (i[0] >> 8) & 0xff; - __entry->npkts = i[1]; - __entry->fragsize = i[2]; - __entry->comp_idx = i[3]; - ), - TP_printk(SDMA_UREQ_FMT, - __get_str(dev), - __entry->ctxt, - __entry->subctxt, - __entry->ver_opcode, - __entry->iovcnt, - __entry->npkts, - __entry->fragsize, - __entry->comp_idx - ) - ); - -#define usdma_complete_name(st) { st, #st } -#define show_usdma_complete_state(st) \ - __print_symbolic(st, \ - usdma_complete_name(FREE), \ - usdma_complete_name(QUEUED), \ - usdma_complete_name(COMPLETE), \ - usdma_complete_name(ERROR)) - -TRACE_EVENT(hfi1_sdma_user_completion, - TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 idx, - u8 state, int code), - TP_ARGS(dd, ctxt, subctxt, idx, state, code), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd) - __field(u16, ctxt) - __field(u8, subctxt) - __field(u16, idx) - __field(u8, state) - __field(int, code) - ), - TP_fast_assign( - DD_DEV_ASSIGN(dd); - __entry->ctxt = ctxt; - __entry->subctxt = subctxt; - __entry->idx = idx; - __entry->state = state; - __entry->code = code; - ), - TP_printk("[%s:%u:%u:%u] SDMA completion state %s (%d)", - __get_str(dev), __entry->ctxt, __entry->subctxt, - __entry->idx, show_usdma_complete_state(__entry->state), - __entry->code) - ); - -const char *print_u32_array(struct trace_seq *, u32 *, int); -#define __print_u32_hex(arr, len) print_u32_array(p, arr, len) - -TRACE_EVENT(hfi1_sdma_user_header_ahg, - TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req, - u8 sde, u8 ahgidx, u32 *ahg, int len, u32 tidval), - TP_ARGS(dd, ctxt, 
subctxt, req, sde, ahgidx, ahg, len, tidval), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd) - __field(u16, ctxt) - __field(u8, subctxt) - __field(u16, req) - __field(u8, sde) - __field(u8, idx) - __field(int, len) - __field(u32, tidval) - __array(u32, ahg, 10) - ), - TP_fast_assign( - DD_DEV_ASSIGN(dd); - __entry->ctxt = ctxt; - __entry->subctxt = subctxt; - __entry->req = req; - __entry->sde = sde; - __entry->idx = ahgidx; - __entry->len = len; - __entry->tidval = tidval; - memcpy(__entry->ahg, ahg, len * sizeof(u32)); - ), - TP_printk("[%s:%u:%u:%u] (SDE%u/AHG%u) ahg[0-%d]=(%s) TIDVal=0x%x", - __get_str(dev), - __entry->ctxt, - __entry->subctxt, - __entry->req, - __entry->sde, - __entry->idx, - __entry->len - 1, - __print_u32_hex(__entry->ahg, __entry->len), - __entry->tidval - ) - ); - -TRACE_EVENT(hfi1_sdma_state, - TP_PROTO(struct sdma_engine *sde, - const char *cstate, - const char *nstate - ), - TP_ARGS(sde, cstate, nstate), - TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) - __string(curstate, cstate) - __string(newstate, nstate) - ), - TP_fast_assign(DD_DEV_ASSIGN(sde->dd); - __assign_str(curstate, cstate); - __assign_str(newstate, nstate); - ), - TP_printk("[%s] current state %s new state %s", - __get_str(dev), - __get_str(curstate), - __get_str(newstate) - ) -); - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM hfi1_rc - -DECLARE_EVENT_CLASS(hfi1_rc_template, - TP_PROTO(struct rvt_qp *qp, u32 psn), - TP_ARGS(qp, psn), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) - __field(u32, qpn) - __field(u32, s_flags) - __field(u32, psn) - __field(u32, s_psn) - __field(u32, s_next_psn) - __field(u32, s_sending_psn) - __field(u32, s_sending_hpsn) - __field(u32, r_psn) - ), - TP_fast_assign( - DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) - __entry->qpn = qp->ibqp.qp_num; - __entry->s_flags = qp->s_flags; - __entry->psn = psn; - __entry->s_psn = qp->s_psn; - __entry->s_next_psn = qp->s_next_psn; - __entry->s_sending_psn = qp->s_sending_psn; - __entry->s_sending_hpsn = 
qp->s_sending_hpsn; - __entry->r_psn = qp->r_psn; - ), - TP_printk( - "[%s] qpn 0x%x s_flags 0x%x psn 0x%x s_psn 0x%x s_next_psn 0x%x s_sending_psn 0x%x sending_hpsn 0x%x r_psn 0x%x", - __get_str(dev), - __entry->qpn, - __entry->s_flags, - __entry->psn, - __entry->s_psn, - __entry->s_next_psn, - __entry->s_sending_psn, - __entry->s_sending_hpsn, - __entry->r_psn - ) -); - -DEFINE_EVENT(hfi1_rc_template, hfi1_rc_sendcomplete, - TP_PROTO(struct rvt_qp *qp, u32 psn), - TP_ARGS(qp, psn) -); - -DEFINE_EVENT(hfi1_rc_template, hfi1_rc_ack, - TP_PROTO(struct rvt_qp *qp, u32 psn), - TP_ARGS(qp, psn) -); - -DEFINE_EVENT(hfi1_rc_template, hfi1_rc_timeout, - TP_PROTO(struct rvt_qp *qp, u32 psn), - TP_ARGS(qp, psn) -); - -DEFINE_EVENT(hfi1_rc_template, hfi1_rc_rcv_error, - TP_PROTO(struct rvt_qp *qp, u32 psn), - TP_ARGS(qp, psn) -); - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM hfi1_misc - -TRACE_EVENT(hfi1_interrupt, - TP_PROTO(struct hfi1_devdata *dd, const struct is_table *is_entry, - int src), - TP_ARGS(dd, is_entry, src), - TP_STRUCT__entry(DD_DEV_ENTRY(dd) - __array(char, buf, 64) - __field(int, src) - ), - TP_fast_assign(DD_DEV_ASSIGN(dd) - is_entry->is_name(__entry->buf, 64, - src - is_entry->start); - __entry->src = src; - ), - TP_printk("[%s] source: %s [%d]", __get_str(dev), __entry->buf, - __entry->src) -); - -/* - * Note: - * This produces a REALLY ugly trace in the console output when the string is - * too long. 
- */ - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM hfi1_trace - -#define MAX_MSG_LEN 512 - -DECLARE_EVENT_CLASS(hfi1_trace_template, - TP_PROTO(const char *function, struct va_format *vaf), - TP_ARGS(function, vaf), - TP_STRUCT__entry(__string(function, function) - __dynamic_array(char, msg, MAX_MSG_LEN) - ), - TP_fast_assign(__assign_str(function, function); - WARN_ON_ONCE(vsnprintf - (__get_dynamic_array(msg), - MAX_MSG_LEN, vaf->fmt, - *vaf->va) >= - MAX_MSG_LEN); - ), - TP_printk("(%s) %s", - __get_str(function), - __get_str(msg)) -); - -/* - * It may be nice to macroize the __hfi1_trace but the va_* stuff requires an - * actual function to work and can not be in a macro. - */ -#define __hfi1_trace_def(lvl) \ -void __hfi1_trace_##lvl(const char *funct, char *fmt, ...); \ - \ -DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl, \ - TP_PROTO(const char *function, struct va_format *vaf), \ - TP_ARGS(function, vaf)) - -#define __hfi1_trace_fn(lvl) \ -void __hfi1_trace_##lvl(const char *func, char *fmt, ...) \ -{ \ - struct va_format vaf = { \ - .fmt = fmt, \ - }; \ - va_list args; \ - \ - va_start(args, fmt); \ - vaf.va = &args; \ - trace_hfi1_ ##lvl(func, &vaf); \ - va_end(args); \ - return; \ -} - -/* - * To create a new trace level simply define it below and as a __hfi1_trace_fn - * in trace.c. This will create all the hooks for calling - * hfi1_cdbg(LVL, fmt, ...); as well as take care of all - * the debugfs stuff. - */ -__hfi1_trace_def(PKT); -__hfi1_trace_def(PROC); -__hfi1_trace_def(SDMA); -__hfi1_trace_def(LINKVERB); -__hfi1_trace_def(DEBUG); -__hfi1_trace_def(SNOOP); -__hfi1_trace_def(CNTR); -__hfi1_trace_def(PIO); -__hfi1_trace_def(DC8051); -__hfi1_trace_def(FIRMWARE); -__hfi1_trace_def(RCVCTRL); -__hfi1_trace_def(TID); -__hfi1_trace_def(MMU); -__hfi1_trace_def(IOCTL); - -#define hfi1_cdbg(which, fmt, ...) \ - __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__) - -#define hfi1_dbg(fmt, ...) 
\ - hfi1_cdbg(DEBUG, fmt, ##__VA_ARGS__) - -/* - * Define HFI1_EARLY_DBG at compile time or here to enable early trace - * messages. Do not check in an enablement for this. - */ - -#ifdef HFI1_EARLY_DBG -#define hfi1_dbg_early(fmt, ...) \ - trace_printk(fmt, ##__VA_ARGS__) -#else -#define hfi1_dbg_early(fmt, ...) -#endif - -#endif /* __HFI1_TRACE_H */ - -#undef TRACE_INCLUDE_PATH -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_PATH . -#define TRACE_INCLUDE_FILE trace -#include <trace/define_trace.h> +#include "trace_dbg.h" +#include "trace_misc.h" +#include "trace_ctxts.h" +#include "trace_ibhdrs.h" +#include "trace_rc.h" +#include "trace_rx.h" +#include "trace_tx.h" diff --git a/drivers/infiniband/hw/hfi1/trace_ctxts.h b/drivers/infiniband/hw/hfi1/trace_ctxts.h new file mode 100644 index 000000000000..5052d497df19 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_ctxts.h @@ -0,0 +1,141 @@ +/* +* Copyright(c) 2015, 2016 Intel Corporation. +* +* This file is provided under a dual BSD/GPLv2 license. When using or +* redistributing this file, you may do so under either license. +* +* GPL LICENSE SUMMARY +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of version 2 of the GNU General Public License as +* published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* BSD LICENSE +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer.
+* - Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution. +* - Neither the name of Intel Corporation nor the names of its +* contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+* +*/ +#if !defined(__HFI1_TRACE_CTXTS_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_CTXTS_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "hfi.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_ctxts + +#define UCTXT_FMT \ + "cred:%u, credaddr:0x%llx, piobase:0x%llx, rcvhdr_cnt:%u, " \ + "rcvbase:0x%llx, rcvegrc:%u, rcvegrb:0x%llx" +TRACE_EVENT(hfi1_uctxtdata, + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt), + TP_ARGS(dd, uctxt), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(unsigned int, ctxt) + __field(u32, credits) + __field(u64, hw_free) + __field(u64, piobase) + __field(u16, rcvhdrq_cnt) + __field(u64, rcvhdrq_phys) + __field(u32, eager_cnt) + __field(u64, rcvegr_phys) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = uctxt->ctxt; + __entry->credits = uctxt->sc->credits; + __entry->hw_free = (u64)uctxt->sc->hw_free; + __entry->piobase = (u64)uctxt->sc->base_addr; + __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt; + __entry->rcvhdrq_phys = uctxt->rcvhdrq_phys; + __entry->eager_cnt = uctxt->egrbufs.alloced; + __entry->rcvegr_phys = + uctxt->egrbufs.rcvtids[0].phys; + ), + TP_printk("[%s] ctxt %u " UCTXT_FMT, + __get_str(dev), + __entry->ctxt, + __entry->credits, + __entry->hw_free, + __entry->piobase, + __entry->rcvhdrq_cnt, + __entry->rcvhdrq_phys, + __entry->eager_cnt, + __entry->rcvegr_phys + ) +); + +#define CINFO_FMT \ + "egrtids:%u, egr_size:%u, hdrq_cnt:%u, hdrq_size:%u, sdma_ring_size:%u" +TRACE_EVENT(hfi1_ctxt_info, + TP_PROTO(struct hfi1_devdata *dd, unsigned int ctxt, + unsigned int subctxt, + struct hfi1_ctxt_info cinfo), + TP_ARGS(dd, ctxt, subctxt, cinfo), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(unsigned int, ctxt) + __field(unsigned int, subctxt) + __field(u16, egrtids) + __field(u16, rcvhdrq_cnt) + __field(u16, rcvhdrq_size) + __field(u16, sdma_ring_size) + __field(u32, rcvegr_size) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->egrtids = cinfo.egrtids; + 
__entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt; + __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize; + __entry->sdma_ring_size = cinfo.sdma_ring_size; + __entry->rcvegr_size = cinfo.rcvegr_size; + ), + TP_printk("[%s] ctxt %u:%u " CINFO_FMT, + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->egrtids, + __entry->rcvegr_size, + __entry->rcvhdrq_cnt, + __entry->rcvhdrq_size, + __entry->sdma_ring_size + ) +); + +#endif /* __HFI1_TRACE_CTXTS_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_ctxts +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/trace_dbg.h b/drivers/infiniband/hw/hfi1/trace_dbg.h new file mode 100644 index 000000000000..0e7d929530c5 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_dbg.h @@ -0,0 +1,155 @@ +/* +* Copyright(c) 2015, 2016 Intel Corporation. +* +* This file is provided under a dual BSD/GPLv2 license. When using or +* redistributing this file, you may do so under either license. +* +* GPL LICENSE SUMMARY +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of version 2 of the GNU General Public License as +* published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* BSD LICENSE +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* - Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution.
+* - Neither the name of Intel Corporation nor the names of its +* contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ +#if !defined(__HFI1_TRACE_EXTRA_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_EXTRA_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "hfi.h" + +/* + * Note: + * This produces a REALLY ugly trace in the console output when the string is + * too long. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_dbg + +#define MAX_MSG_LEN 512 + +DECLARE_EVENT_CLASS(hfi1_trace_template, + TP_PROTO(const char *function, struct va_format *vaf), + TP_ARGS(function, vaf), + TP_STRUCT__entry(__string(function, function) + __dynamic_array(char, msg, MAX_MSG_LEN) + ), + TP_fast_assign(__assign_str(function, function); + WARN_ON_ONCE(vsnprintf + (__get_dynamic_array(msg), + MAX_MSG_LEN, vaf->fmt, + *vaf->va) >= + MAX_MSG_LEN); + ), + TP_printk("(%s) %s", + __get_str(function), + __get_str(msg)) +); + +/* + * It may be nice to macroize the __hfi1_trace but the va_* stuff requires an + * actual function to work and can not be in a macro.
+ */ +#define __hfi1_trace_def(lvl) \ +void __hfi1_trace_##lvl(const char *funct, char *fmt, ...); \ + \ +DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl, \ + TP_PROTO(const char *function, struct va_format *vaf), \ + TP_ARGS(function, vaf)) + +#define __hfi1_trace_fn(lvl) \ +void __hfi1_trace_##lvl(const char *func, char *fmt, ...) \ +{ \ + struct va_format vaf = { \ + .fmt = fmt, \ + }; \ + va_list args; \ + \ + va_start(args, fmt); \ + vaf.va = &args; \ + trace_hfi1_ ##lvl(func, &vaf); \ + va_end(args); \ + return; \ +} + +/* + * To create a new trace level simply define it below and as a __hfi1_trace_fn + * in trace.c. This will create all the hooks for calling + * hfi1_cdbg(LVL, fmt, ...); as well as take care of all + * the debugfs stuff. + */ +__hfi1_trace_def(PKT); +__hfi1_trace_def(PROC); +__hfi1_trace_def(SDMA); +__hfi1_trace_def(LINKVERB); +__hfi1_trace_def(DEBUG); +__hfi1_trace_def(SNOOP); +__hfi1_trace_def(CNTR); +__hfi1_trace_def(PIO); +__hfi1_trace_def(DC8051); +__hfi1_trace_def(FIRMWARE); +__hfi1_trace_def(RCVCTRL); +__hfi1_trace_def(TID); +__hfi1_trace_def(MMU); +__hfi1_trace_def(IOCTL); + +#define hfi1_cdbg(which, fmt, ...) \ + __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__) + +#define hfi1_dbg(fmt, ...) \ + hfi1_cdbg(DEBUG, fmt, ##__VA_ARGS__) + +/* + * Define HFI1_EARLY_DBG at compile time or here to enable early trace + * messages. Do not check in an enablement for this. + */ + +#ifdef HFI1_EARLY_DBG +#define hfi1_dbg_early(fmt, ...) \ + trace_printk(fmt, ##__VA_ARGS__) +#else +#define hfi1_dbg_early(fmt, ...) +#endif + +#endif /* __HFI1_TRACE_EXTRA_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_FILE trace_dbg +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h new file mode 100644 index 000000000000..c3e41aed0034 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h @@ -0,0 +1,209 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#if !defined(__HFI1_TRACE_IBHDRS_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_IBHDRS_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "hfi.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_ibhdrs + +u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr); +const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, void *ehdrs); + +#define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs) + +#define lrh_name(lrh) { HFI1_##lrh, #lrh } +#define show_lnh(lrh) \ +__print_symbolic(lrh, \ + lrh_name(LRH_BTH), \ + lrh_name(LRH_GRH)) + +#define LRH_PRN "vl %d lver %d sl %d lnh %d,%s dlid %.4x len %d slid %.4x" +#define BTH_PRN \ + "op 0x%.2x,%s se %d m %d pad %d tver %d pkey 0x%.4x " \ + "f %d b %d qpn 0x%.6x a %d psn 0x%.8x" +#define EHDR_PRN "%s" + +DECLARE_EVENT_CLASS(hfi1_ibhdr_template, + TP_PROTO(struct hfi1_devdata *dd, + struct hfi1_ib_header *hdr), + TP_ARGS(dd, hdr), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + /* LRH */ + __field(u8, vl) + __field(u8, lver) + __field(u8, sl) + __field(u8, lnh) + __field(u16, dlid) + __field(u16, len) + __field(u16, slid) + /* BTH */ + __field(u8, opcode) + __field(u8, se) + __field(u8, m) + __field(u8, pad) + __field(u8, tver) + __field(u16, pkey) + __field(u8, f) + __field(u8, b) + __field(u32, qpn) + __field(u8, a) + __field(u32, psn) + /* extended headers */ + __dynamic_array(u8, ehdrs, ibhdr_exhdr_len(hdr)) + ), + TP_fast_assign( + struct hfi1_other_headers *ohdr; + + 
DD_DEV_ASSIGN(dd); + /* LRH */ + __entry->vl = + (u8)(be16_to_cpu(hdr->lrh[0]) >> 12); + __entry->lver = + (u8)(be16_to_cpu(hdr->lrh[0]) >> 8) & 0xf; + __entry->sl = + (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf; + __entry->lnh = + (u8)(be16_to_cpu(hdr->lrh[0]) & 3); + __entry->dlid = + be16_to_cpu(hdr->lrh[1]); + /* allow for larger len */ + __entry->len = + be16_to_cpu(hdr->lrh[2]); + __entry->slid = + be16_to_cpu(hdr->lrh[3]); + /* BTH */ + if (__entry->lnh == HFI1_LRH_BTH) + ohdr = &hdr->u.oth; + else + ohdr = &hdr->u.l.oth; + __entry->opcode = + (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; + __entry->se = + (be32_to_cpu(ohdr->bth[0]) >> 23) & 1; + __entry->m = + (be32_to_cpu(ohdr->bth[0]) >> 22) & 1; + __entry->pad = + (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + __entry->tver = + (be32_to_cpu(ohdr->bth[0]) >> 16) & 0xf; + __entry->pkey = + be32_to_cpu(ohdr->bth[0]) & 0xffff; + __entry->f = + (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT) & + HFI1_FECN_MASK; + __entry->b = + (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT) & + HFI1_BECN_MASK; + __entry->qpn = + be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; + __entry->a = + (be32_to_cpu(ohdr->bth[2]) >> 31) & 1; + /* allow for larger PSN */ + __entry->psn = + be32_to_cpu(ohdr->bth[2]) & 0x7fffffff; + /* extended headers */ + memcpy(__get_dynamic_array(ehdrs), &ohdr->u, + ibhdr_exhdr_len(hdr)); + ), + TP_printk("[%s] " LRH_PRN " " BTH_PRN " " EHDR_PRN, + __get_str(dev), + /* LRH */ + __entry->vl, + __entry->lver, + __entry->sl, + __entry->lnh, show_lnh(__entry->lnh), + __entry->dlid, + __entry->len, + __entry->slid, + /* BTH */ + __entry->opcode, show_ib_opcode(__entry->opcode), + __entry->se, + __entry->m, + __entry->pad, + __entry->tver, + __entry->pkey, + __entry->f, + __entry->b, + __entry->qpn, + __entry->a, + __entry->psn, + /* extended headers */ + __parse_ib_ehdrs( + __entry->opcode, + (void *)__get_dynamic_array(ehdrs)) + ) +); + +DEFINE_EVENT(hfi1_ibhdr_template, input_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, 
struct hfi1_ib_header *hdr), + TP_ARGS(dd, hdr)); + +DEFINE_EVENT(hfi1_ibhdr_template, pio_output_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), + TP_ARGS(dd, hdr)); + +DEFINE_EVENT(hfi1_ibhdr_template, ack_output_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), + TP_ARGS(dd, hdr)); + +DEFINE_EVENT(hfi1_ibhdr_template, sdma_output_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), + TP_ARGS(dd, hdr)); + +#endif /* __HFI1_TRACE_IBHDRS_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_ibhdrs +#include diff --git a/drivers/infiniband/hw/hfi1/trace_misc.h b/drivers/infiniband/hw/hfi1/trace_misc.h new file mode 100644 index 000000000000..d308454af7fd --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_misc.h @@ -0,0 +1,81 @@ +/* +* Copyright(c) 2015, 2016 Intel Corporation. +* +* This file is provided under a dual BSD/GPLv2 license. When using or +* redistributing this file, you may do so under either license. +* +* GPL LICENSE SUMMARY +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of version 2 of the GNU General Public License as +* published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* BSD LICENSE +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. 
+* - Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in
+* the documentation and/or other materials provided with the
+* distribution.
+* - Neither the name of Intel Corporation nor the names of its
+* contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+#if !defined(__HFI1_TRACE_MISC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_MISC_H
+
+#include
+#include
+
+#include "hfi.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_misc
+/*
+ * hfi1_interrupt - records one interrupt source.
+ *
+ * @dd:       device data, used only for the device string
+ * @is_entry: table entry; its is_name() callback is handed the 64-byte buf
+ *            and the offset src - is_entry->start, and buf is later printed
+ *            as the source string
+ * @src:      source number, stored and printed unchanged
+ *
+ * The buffer size 64 appears twice (the __array declaration and the
+ * is_name() call); the two must be kept equal.
+ */
+TRACE_EVENT(hfi1_interrupt,
+	TP_PROTO(struct hfi1_devdata *dd, const struct is_table *is_entry,
+		int src),
+	TP_ARGS(dd, is_entry, src),
+	TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+		__array(char, buf, 64)
+		__field(int, src)
+	),
+	TP_fast_assign(DD_DEV_ASSIGN(dd)
+		is_entry->is_name(__entry->buf, 64,
+			src - is_entry->start);
+		__entry->src = src;
+	),
+	TP_printk("[%s] source: %s [%d]", __get_str(dev), __entry->buf,
+		__entry->src)
+);
+
+#endif /* __HFI1_TRACE_MISC_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_misc +#include diff --git a/drivers/infiniband/hw/hfi1/trace_rc.h b/drivers/infiniband/hw/hfi1/trace_rc.h new file mode 100644 index 000000000000..5ea5005f9f41 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_rc.h @@ -0,0 +1,123 @@ +/* +* Copyright(c) 2015, 2016 Intel Corporation. +* +* This file is provided under a dual BSD/GPLv2 license. When using or +* redistributing this file, you may do so under either license. +* +* GPL LICENSE SUMMARY +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of version 2 of the GNU General Public License as +* published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* BSD LICENSE +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* - Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution. +* - Neither the name of Intel Corporation nor the names of its +* contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+#if !defined(__HFI1_TRACE_RC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_RC_H
+
+#include
+#include
+
+#include "hfi.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_rc
+/*
+ * hfi1_rc_template - event class that snapshots the PSN state of a QP.
+ *
+ * @qp:  queue pair; qp->ibqp.device selects the device string,
+ *       qp->ibqp.qp_num is stored as qpn, and s_flags, s_psn, s_next_psn,
+ *       s_sending_psn, s_sending_hpsn and r_psn are copied unchanged
+ * @psn: caller-supplied PSN stored next to the QP state
+ *
+ * NOTE(review): the format string labels the s_sending_hpsn field as
+ * sending_hpsn (no s_ prefix), unlike every other label; check whether any
+ * consumer parses this output before aligning it.
+ */
+DECLARE_EVENT_CLASS(hfi1_rc_template,
+	TP_PROTO(struct rvt_qp *qp, u32 psn),
+	TP_ARGS(qp, psn),
+	TP_STRUCT__entry(
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(u32, s_flags)
+		__field(u32, psn)
+		__field(u32, s_psn)
+		__field(u32, s_next_psn)
+		__field(u32, s_sending_psn)
+		__field(u32, s_sending_hpsn)
+		__field(u32, r_psn)
+	),
+	TP_fast_assign(
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->s_flags = qp->s_flags;
+		__entry->psn = psn;
+		__entry->s_psn = qp->s_psn;
+		__entry->s_next_psn = qp->s_next_psn;
+		__entry->s_sending_psn = qp->s_sending_psn;
+		__entry->s_sending_hpsn = qp->s_sending_hpsn;
+		__entry->r_psn = qp->r_psn;
+	),
+	TP_printk(
+		"[%s] qpn 0x%x s_flags 0x%x psn 0x%x s_psn 0x%x s_next_psn 0x%x s_sending_psn 0x%x sending_hpsn 0x%x r_psn 0x%x",
+		__get_str(dev),
+		__entry->qpn,
+		__entry->s_flags,
+		__entry->psn,
+		__entry->s_psn,
+		__entry->s_next_psn,
+		__entry->s_sending_psn,
+		__entry->s_sending_hpsn,
+		__entry->r_psn
+	)
+);
+/* Events instantiating hfi1_rc_template; all four take (qp, psn). */
+DEFINE_EVENT(hfi1_rc_template, hfi1_sendcomplete,
+	TP_PROTO(struct rvt_qp *qp, u32 psn),
+	TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(hfi1_rc_template, hfi1_ack,
+	TP_PROTO(struct rvt_qp *qp, u32 psn),
+	TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(hfi1_rc_template, hfi1_timeout,
+	TP_PROTO(struct rvt_qp *qp, u32 psn),
+	TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(hfi1_rc_template, hfi1_rcv_error,
+	TP_PROTO(struct rvt_qp *qp, u32 psn),
+	TP_ARGS(qp, psn)
+);
+
+#endif /* __HFI1_TRACE_RC_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_rc
+#include
diff --git a/drivers/infiniband/hw/hfi1/trace_rx.h b/drivers/infiniband/hw/hfi1/trace_rx.h
new file mode 100644
index 000000000000..9ba1f615ec95
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/trace_rx.h
@@ -0,0 +1,322 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ */
+#if !defined(__HFI1_TRACE_RX_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_RX_H
+
+#include
+#include
+
+#include "hfi.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_rx
+/*
+ * hfi1_rcvhdr - records the caller-supplied receive header values.
+ *
+ * Every argument after @dd is stored unchanged in a field of the same name.
+ * At print time etype is shown both as a number and symbolically through
+ * show_packettype(), which is not defined in this header.
+ */
+TRACE_EVENT(hfi1_rcvhdr,
+	TP_PROTO(struct hfi1_devdata *dd,
+		u32 ctxt,
+		u64 eflags,
+		u32 etype,
+		u32 hlen,
+		u32 tlen,
+		u32 updegr,
+		u32 etail
+		),
+	TP_ARGS(dd, ctxt, eflags, etype, hlen, tlen, updegr, etail),
+	TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+		__field(u64, eflags)
+		__field(u32, ctxt)
+		__field(u32, etype)
+		__field(u32, hlen)
+		__field(u32, tlen)
+		__field(u32, updegr)
+		__field(u32, etail)
+	),
+	TP_fast_assign(DD_DEV_ASSIGN(dd);
+		__entry->eflags = eflags;
+		__entry->ctxt = ctxt;
+		__entry->etype = etype;
+		__entry->hlen = hlen;
+		__entry->tlen = tlen;
+		__entry->updegr = updegr;
+		__entry->etail = etail;
+	),
+	TP_printk(
+		"[%s] ctxt %d eflags 0x%llx etype %d,%s hlen %d tlen %d updegr %d etail %d",
+		__get_str(dev),
+		__entry->ctxt,
+		__entry->eflags,
+		__entry->etype, show_packettype(__entry->etype),
+		__entry->hlen,
+		__entry->tlen,
+		__entry->updegr,
+		__entry->etail
+	)
+);
+/*
+ * hfi1_receive_interrupt - records which receive handler a context uses.
+ *
+ * The handler currently installed in dd->rcd[ctxt]->do_interrupt is compared
+ * against three known functions:
+ *   handle_receive_interrupt             -> slow_path 1, dma_rtail 0xFF
+ *   handle_receive_interrupt_dma_rtail   -> slow_path 0, dma_rtail 1
+ *   handle_receive_interrupt_nodma_rtail -> slow_path 0, dma_rtail 0
+ *
+ * NOTE(review): there is no final else, so when do_interrupt matches none of
+ * the three, slow_path and dma_rtail are not assigned by this event; confirm
+ * that no other handler can be installed.
+ */
+TRACE_EVENT(hfi1_receive_interrupt,
+	TP_PROTO(struct hfi1_devdata *dd, u32 ctxt),
+	TP_ARGS(dd, ctxt),
+	TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+		__field(u32, ctxt)
+		__field(u8, slow_path)
+		__field(u8, dma_rtail)
+	),
+	TP_fast_assign(DD_DEV_ASSIGN(dd);
+		__entry->ctxt = ctxt;
+		if (dd->rcd[ctxt]->do_interrupt ==
+			&handle_receive_interrupt) {
+			__entry->slow_path = 1;
+			__entry->dma_rtail = 0xFF;
+		} else if (dd->rcd[ctxt]->do_interrupt ==
+			&handle_receive_interrupt_dma_rtail){
+			__entry->dma_rtail = 1;
+			__entry->slow_path = 0;
+		} else if (dd->rcd[ctxt]->do_interrupt ==
+			&handle_receive_interrupt_nodma_rtail) {
+			__entry->dma_rtail = 0;
+			__entry->slow_path = 0;
+		}
+	),
+	TP_printk("[%s] ctxt %d SlowPath: %d DmaRtail: %d",
+		__get_str(dev),
+		__entry->ctxt,
+		__entry->slow_path,
+		__entry->dma_rtail
+	)
+);
+/*
+ * hfi1_exp_tid_reg - records the arguments of an expected-TID registration.
+ *
+ * All seven arguments are stored unchanged; no device string is recorded.
+ * In the output the address following the pages count is pa, and the one
+ * labelled va: is va.
+ */
+TRACE_EVENT(hfi1_exp_tid_reg,
+	TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr,
+		u32 npages, unsigned long va, unsigned long pa,
+		dma_addr_t dma),
+	TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
+	TP_STRUCT__entry(
+		__field(unsigned int, ctxt)
+		__field(u16, subctxt)
+		__field(u32, rarr)
+		__field(u32, npages)
+		__field(unsigned long, va)
+		__field(unsigned long, pa)
+		__field(dma_addr_t, dma)
+	),
+	TP_fast_assign(
+		__entry->ctxt = ctxt;
+		__entry->subctxt = subctxt;
+		__entry->rarr = rarr;
+		__entry->npages = npages;
+		__entry->va = va;
+		__entry->pa = pa;
+		__entry->dma = dma;
+	),
+	TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
+		__entry->ctxt,
+		__entry->subctxt,
+		__entry->rarr,
+		__entry->npages,
+		__entry->pa,
+		__entry->va,
+		__entry->dma
+	)
+	);
+/*
+ * hfi1_exp_tid_unreg - same fields and output layout as hfi1_exp_tid_reg,
+ * emitted under a separate event name.
+ */
+TRACE_EVENT(hfi1_exp_tid_unreg,
+	TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
+		unsigned long va, unsigned long pa, dma_addr_t dma),
+	TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
+	TP_STRUCT__entry(
+		__field(unsigned int, ctxt)
+		__field(u16, subctxt)
+		__field(u32, rarr)
+		__field(u32, npages)
+		__field(unsigned long, va)
+		__field(unsigned long, pa)
+		__field(dma_addr_t, dma)
+	),
+	TP_fast_assign(
+		__entry->ctxt = ctxt;
+		__entry->subctxt = subctxt;
+		__entry->rarr = rarr;
+		__entry->npages = npages;
+		__entry->va = va;
+		__entry->pa = pa;
+		__entry->dma = dma;
+	),
+	TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
+		__entry->ctxt,
+		__entry->subctxt,
+		__entry->rarr,
+		__entry->npages,
+		__entry->pa,
+		__entry->va,
+		__entry->dma
+	)
+	);
+/*
+ * hfi1_exp_tid_inval - like the two events above but without a pa argument;
+ * here the address printed after the pages count is va.
+ */
+TRACE_EVENT(hfi1_exp_tid_inval,
+	TP_PROTO(unsigned int ctxt, u16 subctxt, unsigned long va, u32 rarr,
+		u32 npages, dma_addr_t dma),
+	TP_ARGS(ctxt, subctxt, va, rarr, npages, dma),
+	TP_STRUCT__entry(
+		__field(unsigned int, ctxt)
+		__field(u16, subctxt)
+		__field(unsigned long, va)
+		__field(u32, rarr)
+		__field(u32, npages)
+		__field(dma_addr_t, dma)
+	),
+	TP_fast_assign(
+		__entry->ctxt = ctxt;
+		__entry->subctxt = subctxt;
+		__entry->va = va;
+		__entry->rarr = rarr;
+		__entry->npages = npages;
+		__entry->dma = dma;
+	),
+	TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx",
+		__entry->ctxt,
+		__entry->subctxt,
+		__entry->rarr,
+		__entry->npages,
+		__entry->va,
+		__entry->dma
+	)
+	);
+/*
+ * hfi1_mmu_invalidate - records an invalidation of the range start..end.
+ *
+ * @type is copied into the event with __string()/__assign_str(), so the
+ * caller-owned string does not need to outlive the call.
+ */
+TRACE_EVENT(hfi1_mmu_invalidate,
+	TP_PROTO(unsigned int ctxt, u16 subctxt, const char *type,
+		unsigned long start, unsigned long end),
+	TP_ARGS(ctxt, subctxt, type, start, end),
+	TP_STRUCT__entry(
+		__field(unsigned int, ctxt)
+		__field(u16, subctxt)
+		__string(type, type)
+		__field(unsigned long, start)
+		__field(unsigned long, end)
+	),
+	TP_fast_assign(
+		__entry->ctxt = ctxt;
+		__entry->subctxt = subctxt;
+		__assign_str(type, type);
+		__entry->start = start;
+		__entry->end = end;
+	),
+	TP_printk("[%3u:%02u] MMU Invalidate (%s) 0x%lx - 0x%lx",
+		__entry->ctxt,
+		__entry->subctxt,
+		__get_str(type),
+		__entry->start,
+		__entry->end
+	)
+	);
+
+#define SNOOP_PRN \
+	"slid %.4x dlid %.4x qpn 0x%.6x opcode 0x%.2x,%s " \
+	"svc lvl %d pkey 0x%.4x [header = %d bytes] [data = %d bytes]"
+/*
+ * snoop_capture - records a summary of a packet plus raw copies of it.
+ *
+ * @hdr_len bytes of @hdr and @data_len bytes of @data are copied into the
+ * raw_hdr and raw_pkt dynamic arrays. The text output defined by SNOOP_PRN
+ * shows only the decoded summary fields and the two lengths; the raw arrays
+ * are not formatted here. lnh is stored but not printed.
+ */
+TRACE_EVENT(snoop_capture,
+	TP_PROTO(struct hfi1_devdata *dd,
+		int hdr_len,
+		struct hfi1_ib_header *hdr,
+		int data_len,
+		void *data),
+	TP_ARGS(dd, hdr_len, hdr, data_len, data),
+	TP_STRUCT__entry(
+		DD_DEV_ENTRY(dd)
+		__field(u16, slid)
+		__field(u16, dlid)
+		__field(u32, qpn)
+		__field(u8, opcode)
+		__field(u8, sl)
+		__field(u16, pkey)
+		__field(u32, hdr_len)
+		__field(u32, data_len)
+		__field(u8, lnh)
+		__dynamic_array(u8, raw_hdr, hdr_len)
+		__dynamic_array(u8, raw_pkt, data_len)
+	),
+	TP_fast_assign(
+		struct hfi1_other_headers *ohdr;
+
+		__entry->lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
+		if (__entry->lnh == HFI1_LRH_BTH)
+			ohdr = &hdr->u.oth;
+		else
+			ohdr = &hdr->u.l.oth;
+		DD_DEV_ASSIGN(dd);
+		__entry->slid = be16_to_cpu(hdr->lrh[3]);
+		__entry->dlid = be16_to_cpu(hdr->lrh[1]);
+		__entry->qpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
+		__entry->opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+		__entry->sl = (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
+		__entry->pkey = be32_to_cpu(ohdr->bth[0]) & 0xffff;
+		__entry->hdr_len = hdr_len;
+		__entry->data_len = data_len;
+		memcpy(__get_dynamic_array(raw_hdr), hdr, hdr_len);
+		memcpy(__get_dynamic_array(raw_pkt), data, data_len);
+	),
+	TP_printk(
+		"[%s] " SNOOP_PRN,
+		__get_str(dev),
+		__entry->slid,
+		__entry->dlid,
+		__entry->qpn,
+		__entry->opcode,
+		show_ib_opcode(__entry->opcode),
+		__entry->sl,
+		__entry->pkey,
+		__entry->hdr_len,
+		__entry->data_len
+	)
+);
+
+#endif /* __HFI1_TRACE_RX_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_rx
+#include
diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h
new file mode 100644
index 000000000000..79c93ecea3d3
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/trace_tx.h
@@ -0,0 +1,642 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ +#if !defined(__HFI1_TRACE_TX_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_TX_H + +#include +#include + +#include "hfi.h" +#include "mad.h" +#include "sdma.h" + +const char *parse_sdma_flags(struct trace_seq *p, u64 desc0, u64 desc1); + +#define __parse_sdma_flags(desc0, desc1) parse_sdma_flags(p, desc0, desc1) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_tx + +TRACE_EVENT(hfi1_piofree, + TP_PROTO(struct send_context *sc, int extra), + TP_ARGS(sc, extra), + TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd) + __field(u32, sw_index) + __field(u32, hw_context) + __field(int, extra) + ), + TP_fast_assign(DD_DEV_ASSIGN(sc->dd); + __entry->sw_index = sc->sw_index; + __entry->hw_context = sc->hw_context; + __entry->extra = extra; + ), + TP_printk("[%s] ctxt %u(%u) extra %d", + __get_str(dev), + __entry->sw_index, + __entry->hw_context, + __entry->extra + ) +); + +TRACE_EVENT(hfi1_wantpiointr, + TP_PROTO(struct send_context *sc, u32 needint, u64 credit_ctrl), + TP_ARGS(sc, needint, credit_ctrl), + TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd) + __field(u32, sw_index) + __field(u32, hw_context) + __field(u32, needint) + __field(u64, credit_ctrl) + ), + TP_fast_assign(DD_DEV_ASSIGN(sc->dd); + __entry->sw_index = sc->sw_index; + __entry->hw_context = sc->hw_context; + __entry->needint = needint; + __entry->credit_ctrl = credit_ctrl; + ), + TP_printk("[%s] ctxt %u(%u) on %d credit_ctrl 0x%llx", + __get_str(dev), + __entry->sw_index, + __entry->hw_context, + __entry->needint, + (unsigned long long)__entry->credit_ctrl + ) +); + +DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template, + TP_PROTO(struct rvt_qp *qp, u32 flags), + TP_ARGS(qp, flags), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, flags) + __field(u32, s_flags) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->flags = flags; + __entry->qpn = qp->ibqp.qp_num; + __entry->s_flags = qp->s_flags; + ), + TP_printk( + "[%s] qpn 0x%x 
flags 0x%x s_flags 0x%x", + __get_str(dev), + __entry->qpn, + __entry->flags, + __entry->s_flags + ) +); + +DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpwakeup, + TP_PROTO(struct rvt_qp *qp, u32 flags), + TP_ARGS(qp, flags)); + +DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpsleep, + TP_PROTO(struct rvt_qp *qp, u32 flags), + TP_ARGS(qp, flags)); + +TRACE_EVENT(hfi1_sdma_descriptor, + TP_PROTO(struct sdma_engine *sde, + u64 desc0, + u64 desc1, + u16 e, + void *descp), + TP_ARGS(sde, desc0, desc1, e, descp), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(void *, descp) + __field(u64, desc0) + __field(u64, desc1) + __field(u16, e) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->desc0 = desc0; + __entry->desc1 = desc1; + __entry->idx = sde->this_idx; + __entry->descp = descp; + __entry->e = e; + ), + TP_printk( + "[%s] SDE(%u) flags:%s addr:0x%016llx gen:%u len:%u d0:%016llx d1:%016llx to %p,%u", + __get_str(dev), + __entry->idx, + __parse_sdma_flags(__entry->desc0, __entry->desc1), + (__entry->desc0 >> SDMA_DESC0_PHY_ADDR_SHIFT) & + SDMA_DESC0_PHY_ADDR_MASK, + (u8)((__entry->desc1 >> SDMA_DESC1_GENERATION_SHIFT) & + SDMA_DESC1_GENERATION_MASK), + (u16)((__entry->desc0 >> SDMA_DESC0_BYTE_COUNT_SHIFT) & + SDMA_DESC0_BYTE_COUNT_MASK), + __entry->desc0, + __entry->desc1, + __entry->descp, + __entry->e + ) +); + +TRACE_EVENT(hfi1_sdma_engine_select, + TP_PROTO(struct hfi1_devdata *dd, u32 sel, u8 vl, u8 idx), + TP_ARGS(dd, sel, vl, idx), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u32, sel) + __field(u8, vl) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->sel = sel; + __entry->vl = vl; + __entry->idx = idx; + ), + TP_printk("[%s] selecting SDE %u sel 0x%x vl %u", + __get_str(dev), + __entry->idx, + __entry->sel, + __entry->vl + ) +); + +DECLARE_EVENT_CLASS(hfi1_sdma_engine_class, + TP_PROTO(struct sdma_engine *sde, u64 status), + TP_ARGS(sde, status), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u64, 
status) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->status = status; + __entry->idx = sde->this_idx; + ), + TP_printk("[%s] SDE(%u) status %llx", + __get_str(dev), + __entry->idx, + (unsigned long long)__entry->status + ) +); + +DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_interrupt, + TP_PROTO(struct sdma_engine *sde, u64 status), + TP_ARGS(sde, status) +); + +DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_progress, + TP_PROTO(struct sdma_engine *sde, u64 status), + TP_ARGS(sde, status) +); + +DECLARE_EVENT_CLASS(hfi1_sdma_ahg_ad, + TP_PROTO(struct sdma_engine *sde, int aidx), + TP_ARGS(sde, aidx), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(int, aidx) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->idx = sde->this_idx; + __entry->aidx = aidx; + ), + TP_printk("[%s] SDE(%u) aidx %d", + __get_str(dev), + __entry->idx, + __entry->aidx + ) +); + +DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_allocate, + TP_PROTO(struct sdma_engine *sde, int aidx), + TP_ARGS(sde, aidx)); + +DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_deallocate, + TP_PROTO(struct sdma_engine *sde, int aidx), + TP_ARGS(sde, aidx)); + +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER +TRACE_EVENT(hfi1_sdma_progress, + TP_PROTO(struct sdma_engine *sde, + u16 hwhead, + u16 swhead, + struct sdma_txreq *txp + ), + TP_ARGS(sde, hwhead, swhead, txp), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u64, sn) + __field(u16, hwhead) + __field(u16, swhead) + __field(u16, txnext) + __field(u16, tx_tail) + __field(u16, tx_head) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->hwhead = hwhead; + __entry->swhead = swhead; + __entry->tx_tail = sde->tx_tail; + __entry->tx_head = sde->tx_head; + __entry->txnext = txp ? txp->next_descq_idx : ~0; + __entry->idx = sde->this_idx; + __entry->sn = txp ? 
txp->sn : ~0; + ), + TP_printk( + "[%s] SDE(%u) sn %llu hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u", + __get_str(dev), + __entry->idx, + __entry->sn, + __entry->hwhead, + __entry->swhead, + __entry->txnext, + __entry->tx_head, + __entry->tx_tail + ) +); +#else +TRACE_EVENT(hfi1_sdma_progress, + TP_PROTO(struct sdma_engine *sde, + u16 hwhead, u16 swhead, + struct sdma_txreq *txp + ), + TP_ARGS(sde, hwhead, swhead, txp), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u16, hwhead) + __field(u16, swhead) + __field(u16, txnext) + __field(u16, tx_tail) + __field(u16, tx_head) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->hwhead = hwhead; + __entry->swhead = swhead; + __entry->tx_tail = sde->tx_tail; + __entry->tx_head = sde->tx_head; + __entry->txnext = txp ? txp->next_descq_idx : ~0; + __entry->idx = sde->this_idx; + ), + TP_printk( + "[%s] SDE(%u) hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u", + __get_str(dev), + __entry->idx, + __entry->hwhead, + __entry->swhead, + __entry->txnext, + __entry->tx_head, + __entry->tx_tail + ) +); +#endif + +DECLARE_EVENT_CLASS(hfi1_sdma_sn, + TP_PROTO(struct sdma_engine *sde, u64 sn), + TP_ARGS(sde, sn), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u64, sn) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->sn = sn; + __entry->idx = sde->this_idx; + ), + TP_printk("[%s] SDE(%u) sn %llu", + __get_str(dev), + __entry->idx, + __entry->sn + ) +); + +DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_out_sn, + TP_PROTO( + struct sdma_engine *sde, + u64 sn + ), + TP_ARGS(sde, sn) +); + +DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_in_sn, + TP_PROTO(struct sdma_engine *sde, u64 sn), + TP_ARGS(sde, sn) +); + +#define USDMA_HDR_FORMAT \ + "[%s:%u:%u:%u] PBC=(0x%x 0x%x) LRH=(0x%x 0x%x) BTH=(0x%x 0x%x 0x%x) KDETH=(0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x) TIDVal=0x%x" + +TRACE_EVENT(hfi1_sdma_user_header, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 
req, + struct hfi1_pkt_header *hdr, u32 tidval), + TP_ARGS(dd, ctxt, subctxt, req, hdr, tidval), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u8, subctxt) + __field(u16, req) + __field(__le32, pbc0) + __field(__le32, pbc1) + __field(__be32, lrh0) + __field(__be32, lrh1) + __field(__be32, bth0) + __field(__be32, bth1) + __field(__be32, bth2) + __field(__le32, kdeth0) + __field(__le32, kdeth1) + __field(__le32, kdeth2) + __field(__le32, kdeth3) + __field(__le32, kdeth4) + __field(__le32, kdeth5) + __field(__le32, kdeth6) + __field(__le32, kdeth7) + __field(__le32, kdeth8) + __field(u32, tidval) + ), + TP_fast_assign( + __le32 *pbc = (__le32 *)hdr->pbc; + __be32 *lrh = (__be32 *)hdr->lrh; + __be32 *bth = (__be32 *)hdr->bth; + __le32 *kdeth = (__le32 *)&hdr->kdeth; + + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->req = req; + __entry->pbc0 = pbc[0]; + __entry->pbc1 = pbc[1]; + __entry->lrh0 = be32_to_cpu(lrh[0]); + __entry->lrh1 = be32_to_cpu(lrh[1]); + __entry->bth0 = be32_to_cpu(bth[0]); + __entry->bth1 = be32_to_cpu(bth[1]); + __entry->bth2 = be32_to_cpu(bth[2]); + __entry->kdeth0 = kdeth[0]; + __entry->kdeth1 = kdeth[1]; + __entry->kdeth2 = kdeth[2]; + __entry->kdeth3 = kdeth[3]; + __entry->kdeth4 = kdeth[4]; + __entry->kdeth5 = kdeth[5]; + __entry->kdeth6 = kdeth[6]; + __entry->kdeth7 = kdeth[7]; + __entry->kdeth8 = kdeth[8]; + __entry->tidval = tidval; + ), + TP_printk(USDMA_HDR_FORMAT, + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->req, + __entry->pbc1, + __entry->pbc0, + __entry->lrh0, + __entry->lrh1, + __entry->bth0, + __entry->bth1, + __entry->bth2, + __entry->kdeth0, + __entry->kdeth1, + __entry->kdeth2, + __entry->kdeth3, + __entry->kdeth4, + __entry->kdeth5, + __entry->kdeth6, + __entry->kdeth7, + __entry->kdeth8, + __entry->tidval + ) +); + +#define SDMA_UREQ_FMT \ + "[%s:%u:%u] ver/op=0x%x, iovcnt=%u, npkts=%u, frag=%u, idx=%u" +TRACE_EVENT(hfi1_sdma_user_reqinfo, + 
TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 *i), + TP_ARGS(dd, ctxt, subctxt, i), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd); + __field(u16, ctxt) + __field(u8, subctxt) + __field(u8, ver_opcode) + __field(u8, iovcnt) + __field(u16, npkts) + __field(u16, fragsize) + __field(u16, comp_idx) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->ver_opcode = i[0] & 0xff; + __entry->iovcnt = (i[0] >> 8) & 0xff; + __entry->npkts = i[1]; + __entry->fragsize = i[2]; + __entry->comp_idx = i[3]; + ), + TP_printk(SDMA_UREQ_FMT, + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->ver_opcode, + __entry->iovcnt, + __entry->npkts, + __entry->fragsize, + __entry->comp_idx + ) +); + +#define usdma_complete_name(st) { st, #st } +#define show_usdma_complete_state(st) \ + __print_symbolic(st, \ + usdma_complete_name(FREE), \ + usdma_complete_name(QUEUED), \ + usdma_complete_name(COMPLETE), \ + usdma_complete_name(ERROR)) + +TRACE_EVENT(hfi1_sdma_user_completion, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 idx, + u8 state, int code), + TP_ARGS(dd, ctxt, subctxt, idx, state, code), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u8, subctxt) + __field(u16, idx) + __field(u8, state) + __field(int, code) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->idx = idx; + __entry->state = state; + __entry->code = code; + ), + TP_printk("[%s:%u:%u:%u] SDMA completion state %s (%d)", + __get_str(dev), __entry->ctxt, __entry->subctxt, + __entry->idx, show_usdma_complete_state(__entry->state), + __entry->code) +); + +const char *print_u32_array(struct trace_seq *, u32 *, int); +#define __print_u32_hex(arr, len) print_u32_array(p, arr, len) + +TRACE_EVENT(hfi1_sdma_user_header_ahg, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req, + u8 sde, u8 ahgidx, u32 *ahg, int len, u32 tidval), + TP_ARGS(dd, ctxt, 
subctxt, req, sde, ahgidx, ahg, len, tidval), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u8, subctxt) + __field(u16, req) + __field(u8, sde) + __field(u8, idx) + __field(int, len) + __field(u32, tidval) + __array(u32, ahg, 10) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->req = req; + __entry->sde = sde; + __entry->idx = ahgidx; + __entry->len = len; + __entry->tidval = tidval; + memcpy(__entry->ahg, ahg, len * sizeof(u32)); + ), + TP_printk("[%s:%u:%u:%u] (SDE%u/AHG%u) ahg[0-%d]=(%s) TIDVal=0x%x", + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->req, + __entry->sde, + __entry->idx, + __entry->len - 1, + __print_u32_hex(__entry->ahg, __entry->len), + __entry->tidval + ) +); + +TRACE_EVENT(hfi1_sdma_state, + TP_PROTO(struct sdma_engine *sde, + const char *cstate, + const char *nstate + ), + TP_ARGS(sde, cstate, nstate), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __string(curstate, cstate) + __string(newstate, nstate) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __assign_str(curstate, cstate); + __assign_str(newstate, nstate); + ), + TP_printk("[%s] current state %s new state %s", + __get_str(dev), + __get_str(curstate), + __get_str(newstate) + ) +); + +#define BCT_FORMAT \ + "shared_limit %x vls 0-7 [%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x] 15 [%x,%x]" + +#define BCT(field) \ + be16_to_cpu( \ + ((struct buffer_control *)__get_dynamic_array(bct))->field \ + ) + +DECLARE_EVENT_CLASS(hfi1_bct_template, + TP_PROTO(struct hfi1_devdata *dd, + struct buffer_control *bc), + TP_ARGS(dd, bc), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __dynamic_array(u8, bct, sizeof(*bc)) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + memcpy(__get_dynamic_array(bct), bc, + sizeof(*bc)); + ), + TP_printk(BCT_FORMAT, + BCT(overall_shared_limit), + + BCT(vl[0].dedicated), + BCT(vl[0].shared), + + BCT(vl[1].dedicated), + BCT(vl[1].shared), + + BCT(vl[2].dedicated), + 
BCT(vl[2].shared), + + BCT(vl[3].dedicated), + BCT(vl[3].shared), + + BCT(vl[4].dedicated), + BCT(vl[4].shared), + + BCT(vl[5].dedicated), + BCT(vl[5].shared), + + BCT(vl[6].dedicated), + BCT(vl[6].shared), + + BCT(vl[7].dedicated), + BCT(vl[7].shared), + + BCT(vl[15].dedicated), + BCT(vl[15].shared) + ) +); + +DEFINE_EVENT(hfi1_bct_template, bct_set, + TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc), + TP_ARGS(dd, bc)); + +DEFINE_EVENT(hfi1_bct_template, bct_get, + TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc), + TP_ARGS(dd, bc)); + +#endif /* __HFI1_TRACE_TX_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_tx +#include <trace/define_trace.h> From ad4210823bf379660ebfeb85100b7743094ca7be Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 1 Jul 2016 16:01:11 -0700 Subject: [PATCH 05/84] IB/hfi1: Fix trace sparse errors Fix sparse errors by making sure the fast assign destinations are host cpu typed. For the void __iomem *, just make the field match source data. Fix a bug where the hw_free trace printed the pointer vs. the dereferenced value.
Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/trace_ctxts.h | 8 ++-- drivers/infiniband/hw/hfi1/trace_tx.h | 54 ++++++++++++------------ 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/trace_ctxts.h b/drivers/infiniband/hw/hfi1/trace_ctxts.h index 5052d497df19..31654bbac1cf 100644 --- a/drivers/infiniband/hw/hfi1/trace_ctxts.h +++ b/drivers/infiniband/hw/hfi1/trace_ctxts.h @@ -56,7 +56,7 @@ #define TRACE_SYSTEM hfi1_ctxts #define UCTXT_FMT \ - "cred:%u, credaddr:0x%llx, piobase:0x%llx, rcvhdr_cnt:%u, " \ + "cred:%u, credaddr:0x%llx, piobase:0x%p, rcvhdr_cnt:%u, " \ "rcvbase:0x%llx, rcvegrc:%u, rcvegrb:0x%llx" TRACE_EVENT(hfi1_uctxtdata, TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt), @@ -65,7 +65,7 @@ TRACE_EVENT(hfi1_uctxtdata, __field(unsigned int, ctxt) __field(u32, credits) __field(u64, hw_free) - __field(u64, piobase) + __field(void __iomem *, piobase) __field(u16, rcvhdrq_cnt) __field(u64, rcvhdrq_phys) __field(u32, eager_cnt) @@ -74,8 +74,8 @@ TRACE_EVENT(hfi1_uctxtdata, TP_fast_assign(DD_DEV_ASSIGN(dd); __entry->ctxt = uctxt->ctxt; __entry->credits = uctxt->sc->credits; - __entry->hw_free = (u64)uctxt->sc->hw_free; - __entry->piobase = (u64)uctxt->sc->base_addr; + __entry->hw_free = le64_to_cpu(*uctxt->sc->hw_free); + __entry->piobase = uctxt->sc->base_addr; __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt; __entry->rcvhdrq_phys = uctxt->rcvhdrq_phys; __entry->eager_cnt = uctxt->egrbufs.alloced; diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h index 79c93ecea3d3..415d6be42c5d 100644 --- a/drivers/infiniband/hw/hfi1/trace_tx.h +++ b/drivers/infiniband/hw/hfi1/trace_tx.h @@ -369,22 +369,22 @@ TRACE_EVENT(hfi1_sdma_user_header, __field(u16, ctxt) __field(u8, subctxt) __field(u16, req) - __field(__le32, pbc0) - __field(__le32, pbc1) - __field(__be32, lrh0) - 
__field(__be32, lrh1) - __field(__be32, bth0) - __field(__be32, bth1) - __field(__be32, bth2) - __field(__le32, kdeth0) - __field(__le32, kdeth1) - __field(__le32, kdeth2) - __field(__le32, kdeth3) - __field(__le32, kdeth4) - __field(__le32, kdeth5) - __field(__le32, kdeth6) - __field(__le32, kdeth7) - __field(__le32, kdeth8) + __field(u32, pbc0) + __field(u32, pbc1) + __field(u32, lrh0) + __field(u32, lrh1) + __field(u32, bth0) + __field(u32, bth1) + __field(u32, bth2) + __field(u32, kdeth0) + __field(u32, kdeth1) + __field(u32, kdeth2) + __field(u32, kdeth3) + __field(u32, kdeth4) + __field(u32, kdeth5) + __field(u32, kdeth6) + __field(u32, kdeth7) + __field(u32, kdeth8) __field(u32, tidval) ), TP_fast_assign( @@ -397,22 +397,22 @@ TRACE_EVENT(hfi1_sdma_user_header, __entry->ctxt = ctxt; __entry->subctxt = subctxt; __entry->req = req; - __entry->pbc0 = pbc[0]; - __entry->pbc1 = pbc[1]; + __entry->pbc0 = le32_to_cpu(pbc[0]); + __entry->pbc1 = le32_to_cpu(pbc[1]); __entry->lrh0 = be32_to_cpu(lrh[0]); __entry->lrh1 = be32_to_cpu(lrh[1]); __entry->bth0 = be32_to_cpu(bth[0]); __entry->bth1 = be32_to_cpu(bth[1]); __entry->bth2 = be32_to_cpu(bth[2]); - __entry->kdeth0 = kdeth[0]; - __entry->kdeth1 = kdeth[1]; - __entry->kdeth2 = kdeth[2]; - __entry->kdeth3 = kdeth[3]; - __entry->kdeth4 = kdeth[4]; - __entry->kdeth5 = kdeth[5]; - __entry->kdeth6 = kdeth[6]; - __entry->kdeth7 = kdeth[7]; - __entry->kdeth8 = kdeth[8]; + __entry->kdeth0 = le32_to_cpu(kdeth[0]); + __entry->kdeth1 = le32_to_cpu(kdeth[1]); + __entry->kdeth2 = le32_to_cpu(kdeth[2]); + __entry->kdeth3 = le32_to_cpu(kdeth[3]); + __entry->kdeth4 = le32_to_cpu(kdeth[4]); + __entry->kdeth5 = le32_to_cpu(kdeth[5]); + __entry->kdeth6 = le32_to_cpu(kdeth[6]); + __entry->kdeth7 = le32_to_cpu(kdeth[7]); + __entry->kdeth8 = le32_to_cpu(kdeth[8]); __entry->tidval = tidval; ), TP_printk(USDMA_HDR_FORMAT, From 583eb8b8a155dd6fb50a0c7846aa85ba8381cfed Mon Sep 17 00:00:00 2001 From: Jakub Pawlak Date: Fri, 1 Jul 2016 16:01:17 
-0700 Subject: [PATCH 06/84] IB/hfi1: Add VL XmitDiscards counters to the opapmaquery Add per VL XmitDiscards counters to the opapmaquery status and error response. Reviewed-by: Dean Luick Signed-off-by: Jakub Pawlak Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/mad.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 223dd46cf2aa..349a138cd2dc 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -2487,6 +2487,9 @@ static int pma_get_opa_portstatus(struct opa_pma_mad *pmp, cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL, idx_from_vl(vl))); + rsp->vls[vfi].port_vl_xmit_discards = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD_VL, + idx_from_vl(vl))); vlinfo++; vfi++; } @@ -2878,7 +2881,9 @@ static int pma_get_opa_porterrors(struct opa_pma_mad *pmp, for_each_set_bit(vl, (unsigned long *)&(vl_select_mask), 8 * sizeof(req->vl_select_mask)) { memset(vlinfo, 0, sizeof(*vlinfo)); - /* vlinfo->vls[vfi].port_vl_xmit_discards ??? */ + rsp->vls[vfi].port_vl_xmit_discards = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD_VL, + idx_from_vl(vl))); vlinfo += 1; vfi++; } @@ -3211,7 +3216,9 @@ static int pma_set_opa_portstatus(struct opa_pma_mad *pmp, /* if (counter_select & CS_PORT_MARK_FECN) * write_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + offset, 0); */ - /* port_vl_xmit_discards ??? */ + if (counter_select & C_SW_XMIT_DSCD_VL) + write_port_cntr(ppd, C_SW_XMIT_DSCD_VL, + idx_from_vl(vl), 0); } if (resp_len) From 2b719046743d35b452f17956a5f19e1aa0fc3e8a Mon Sep 17 00:00:00 2001 From: Jakub Pawlak Date: Fri, 1 Jul 2016 16:01:22 -0700 Subject: [PATCH 07/84] IB/hfi1: Add counter to track unsupported packets drop Add sw counter to track dropped unsupported packets. Report unsupported packets drop as the RcvError. 
Reviewed-by: Dennis Dalessandro Signed-off-by: Jakub Pawlak Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 31 +++++++++++++++++++++++++---- drivers/infiniband/hw/hfi1/driver.c | 1 + drivers/infiniband/hw/hfi1/hfi.h | 3 ++- drivers/infiniband/hw/hfi1/mad.c | 3 ++- 4 files changed, 32 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index dad4d0ebbdff..97ce886bb171 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -238,6 +238,9 @@ struct flag_table { /* all CceStatus sub-block RXE pause bits */ #define ALL_RXE_PAUSE CCE_STATUS_RXE_PAUSED_SMASK +#define CNTR_MAX 0xFFFFFFFFFFFFFFFFULL +#define CNTR_32BIT_MAX 0x00000000FFFFFFFF + /* * CCE Error flags. */ @@ -3947,6 +3950,28 @@ static u64 access_sdma_wrong_dw_err_cnt(const struct cntr_entry *entry, return dd->sw_send_dma_eng_err_status_cnt[0]; } +static u64 access_dc_rcv_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + u64 val = 0; + u64 csr = entry->csr; + + val = read_write_csr(dd, csr, mode, data); + if (mode == CNTR_MODE_R) { + val = val > CNTR_MAX - dd->sw_rcv_bypass_packet_errors ? 
+ CNTR_MAX : val + dd->sw_rcv_bypass_packet_errors; + } else if (mode == CNTR_MODE_W) { + dd->sw_rcv_bypass_packet_errors = 0; + } else { + dd_dev_err(dd, "Invalid cntr register access mode"); + return 0; + } + return val; +} + #define def_access_sw_cpu(cntr) \ static u64 access_sw_cpu_##cntr(const struct cntr_entry *entry, \ void *context, int vl, int mode, u64 data) \ @@ -4020,7 +4045,8 @@ static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = { CCE_SEND_CREDIT_INT_CNT, CNTR_NORMAL), [C_DC_UNC_ERR] = DC_PERF_CNTR(DcUnctblErr, DCC_ERR_UNCORRECTABLE_CNT, CNTR_SYNTH), -[C_DC_RCV_ERR] = DC_PERF_CNTR(DcRecvErr, DCC_ERR_PORTRCV_ERR_CNT, CNTR_SYNTH), +[C_DC_RCV_ERR] = CNTR_ELEM("DcRecvErr", DCC_ERR_PORTRCV_ERR_CNT, 0, CNTR_SYNTH, + access_dc_rcv_err_cnt), [C_DC_FM_CFG_ERR] = DC_PERF_CNTR(DcFmCfgErr, DCC_ERR_FMCONFIG_ERR_CNT, CNTR_SYNTH), [C_DC_RMT_PHY_ERR] = DC_PERF_CNTR(DcRmtPhyErr, DCC_ERR_RCVREMOTE_PHY_ERR_CNT, @@ -11668,9 +11694,6 @@ static void free_cntrs(struct hfi1_devdata *dd) dd->cntrnames = NULL; } -#define CNTR_MAX 0xFFFFFFFFFFFFFFFFULL -#define CNTR_32BIT_MAX 0x00000000FFFFFFFF - static u64 read_dev_port_cntr(struct hfi1_devdata *dd, struct cntr_entry *entry, u64 *psval, void *context, int vl) { diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index c75b0ae688f8..6c81d155665d 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -1362,6 +1362,7 @@ int process_receive_bypass(struct hfi1_packet *packet) dd_dev_err(packet->rcd->dd, "Bypass packets are not supported in normal operation. 
Dropping\n"); + incr_cntr64(&packet->rcd->dd->sw_rcv_bypass_packet_errors); return RHF_RCV_CONTINUE; } diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 1dd48efb5b61..748e235b828e 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1128,7 +1128,8 @@ struct hfi1_devdata { NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS]; /* Software counter that aggregates all cce_err_status errors */ u64 sw_cce_err_status_aggregate; - + /* Software counter that aggregates all bypass packet rcv errors */ + u64 sw_rcv_bypass_packet_errors; /* receive interrupt functions */ rhf_rcv_function_ptr *rhf_rcv_function_map; rhf_rcv_function_ptr normal_rhf_rcv_functions[8]; diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 349a138cd2dc..962bb11074d9 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -2874,7 +2874,8 @@ static int pma_get_opa_porterrors(struct opa_pma_mad *pmp, tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL); rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff; - + rsp->port_rcv_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL)); vlinfo = &rsp->vls[0]; vfi = 0; vl_select_mask = be32_to_cpu(req->vl_select_mask); From eaa74ec7329a48a4b724d8de440b3a2cbaabf7c8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 21 Jul 2016 13:03:09 -0700 Subject: [PATCH 08/84] IB/core: Make rdma_rw_ctx_init() initialize all used fields Some but not all callers of rdma_rw_ctx_init() zero-initialize struct rdma_rw_ctx. Hence make rdma_rw_ctx_init() initialize all work request fields that will be read by ib_post_send(). 
Fixes: a060b5629ab0 ("IB/core: generic RDMA READ/WRITE API") Signed-off-by: Bart Van Assche Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Tested-by: Steve Wise Tested-by: Laurence Oberman Cc: Parav Pandit Cc: Nicholas Bellinger Cc: <stable@vger.kernel.org> #v4.7+ Signed-off-by: Doug Ledford --- drivers/infiniband/core/rw.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index 1eb9b1294a63..1ad2baaa6c8c 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -71,6 +71,7 @@ static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev) return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256); } +/* Caller must have zero-initialized *reg. */ static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num, struct rdma_rw_reg_ctx *reg, struct scatterlist *sg, u32 sg_cnt, u32 offset) @@ -114,6 +115,7 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { + struct rdma_rw_reg_ctx *prev = NULL; u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); int i, j, ret = 0, count = 0; @@ -125,7 +127,6 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, } for (i = 0; i < ctx->nr_ops; i++) { - struct rdma_rw_reg_ctx *prev = i ?
&ctx->reg[i - 1] : NULL; struct rdma_rw_reg_ctx *reg = &ctx->reg[i]; u32 nents = min(sg_cnt, pages_per_mr); @@ -162,9 +163,13 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, sg_cnt -= nents; for (j = 0; j < nents; j++) sg = sg_next(sg); + prev = reg; offset = 0; } + if (prev) + prev->wr.wr.next = NULL; + ctx->type = RDMA_RW_MR; return count; @@ -205,11 +210,10 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, rdma_wr->wr.opcode = IB_WR_RDMA_READ; rdma_wr->remote_addr = remote_addr + total_len; rdma_wr->rkey = rkey; + rdma_wr->wr.num_sge = nr_sge; rdma_wr->wr.sg_list = sge; for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) { - rdma_wr->wr.num_sge++; - sge->addr = ib_sg_dma_address(dev, sg) + offset; sge->length = ib_sg_dma_len(dev, sg) - offset; sge->lkey = qp->pd->local_dma_lkey; @@ -220,8 +224,8 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, offset = 0; } - if (i + 1 < ctx->nr_ops) - rdma_wr->wr.next = &ctx->map.wrs[i + 1].wr; + rdma_wr->wr.next = i + 1 < ctx->nr_ops ? + &ctx->map.wrs[i + 1].wr : NULL; } ctx->type = RDMA_RW_MULTI_WR; From 632bc3f65081dd1e2e5394a9161580a0f78e8839 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 21 Jul 2016 13:03:30 -0700 Subject: [PATCH 09/84] IB/core, RDMA RW API: Do not exceed QP SGE send limit Compute the SGE limit for RDMA READ and WRITE requests in ib_create_qp(). Use that limit in the RDMA RW API implementation. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Sagi Grimberg Cc: Steve Wise Cc: Parav Pandit Cc: Nicholas Bellinger Cc: Laurence Oberman Cc: <stable@vger.kernel.org> #v4.7+ Reviewed-by: Christoph Hellwig Signed-off-by: Doug Ledford --- drivers/infiniband/core/rw.c | 10 ++-------- drivers/infiniband/core/verbs.c | 9 +++++++++ include/rdma/ib_verbs.h | 6 ++++++ 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index 1ad2baaa6c8c..dbfd854c32c9 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -58,13 +58,6 @@ static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num, return false; } -static inline u32 rdma_rw_max_sge(struct ib_device *dev, - enum dma_data_direction dir) -{ - return dir == DMA_TO_DEVICE ? - dev->attrs.max_sge : dev->attrs.max_sge_rd; -} - static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev) { /* arbitrary limit to avoid allocating gigantic resources */ @@ -186,7 +179,8 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { struct ib_device *dev = qp->pd->device; - u32 max_sge = rdma_rw_max_sge(dev, dir); + u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge : + qp->max_read_sge; struct ib_sge *sge; u32 total_len = 0, i, j; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 6298f54b4137..e39a0b597234 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -814,6 +814,15 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, } } + /* + * Note: all hw drivers guarantee that max_send_sge is lower than + * the device RDMA WRITE SGE limit but not all hw drivers ensure that + * max_send_sge <= max_sge_rd.
+ */ + qp->max_write_sge = qp_init_attr->cap.max_send_sge; + qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge, + device->attrs.max_sge_rd); + return qp; } EXPORT_SYMBOL(ib_create_qp); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 7e440d41487a..e694f02d42e3 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1428,6 +1428,10 @@ struct ib_srq { } ext; }; +/* + * @max_write_sge: Maximum SGE elements per RDMA WRITE request. + * @max_read_sge: Maximum SGE elements per RDMA READ request. + */ struct ib_qp { struct ib_device *device; struct ib_pd *pd; @@ -1449,6 +1453,8 @@ struct ib_qp { void (*event_handler)(struct ib_event *, void *); void *qp_context; u32 qp_num; + u32 max_write_sge; + u32 max_read_sge; enum ib_qp_type qp_type; }; From 30c6d8773de06878f920666d8c945f81cb2081b3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 21 Jul 2016 13:03:47 -0700 Subject: [PATCH 10/84] IB/srpt: Limit the number of SG elements per work request Limit the number of SG elements per work request to what the HCA and the queue pair support. 
Fixes: 34693573fde0 ("IB/srpt: Reduce QP buffer size") Reported-by: Parav Pandit Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Sagi Grimberg Cc: Steve Wise Cc: Parav Pandit Cc: Nicholas Bellinger Cc: Laurence Oberman Cc: <stable@vger.kernel.org> #v4.7+ Reviewed-by: Christoph Hellwig Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 3 ++- drivers/infiniband/ulp/srpt/ib_srpt.h | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 4a4155640d51..9a3b954e862d 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -1601,6 +1601,7 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch) struct ib_qp_init_attr *qp_init; struct srpt_port *sport = ch->sport; struct srpt_device *sdev = sport->sdev; + const struct ib_device_attr *attrs = &sdev->device->attrs; u32 srp_sq_size = sport->port_attrib.srp_sq_size; int ret; @@ -1638,7 +1639,7 @@ retry: */ qp_init->cap.max_send_wr = srp_sq_size / 2; qp_init->cap.max_rdma_ctxs = srp_sq_size / 2; - qp_init->cap.max_send_sge = SRPT_DEF_SG_PER_WQE; + qp_init->cap.max_send_sge = min(attrs->max_sge, SRPT_MAX_SG_PER_WQE); qp_init->port_num = ch->sport->port; ch->qp = ib_create_qp(sdev->pd, qp_init); diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 389030487da7..581878782854 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -106,7 +106,11 @@ enum { SRP_LOGIN_RSP_MULTICHAN_MAINTAINED = 0x2, SRPT_DEF_SG_TABLESIZE = 128, - SRPT_DEF_SG_PER_WQE = 16, + /* + * An experimentally determined value that avoids that QP creation + * fails due to "swiotlb buffer is full" on systems using the swiotlb.
+ */ + SRPT_MAX_SG_PER_WQE = 16, MIN_SRPT_SQ_SIZE = 16, DEF_SRPT_SQ_SIZE = 4096, From 10fce586b20ba27013f0a6ac73b3ac87c95cf8b5 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 21 Jul 2016 13:04:06 -0700 Subject: [PATCH 11/84] IB/srpt: Simplify srpt_queue_response() Initialize first_wr to &send_wr. This allows removing a ternary operator and an else branch. This patch does not change the behavior of srpt_queue_response(). Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Tested-by: Steve Wise Tested-by: Laurence Oberman Cc: Parav Pandit Cc: Nicholas Bellinger Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 9a3b954e862d..dfa23b075a88 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -2262,7 +2262,7 @@ static void srpt_queue_response(struct se_cmd *cmd) container_of(cmd, struct srpt_send_ioctx, cmd); struct srpt_rdma_ch *ch = ioctx->ch; struct srpt_device *sdev = ch->sport->sdev; - struct ib_send_wr send_wr, *first_wr = NULL, *bad_wr; + struct ib_send_wr send_wr, *first_wr = &send_wr, *bad_wr; struct ib_sge sge; enum srpt_command_state state; unsigned long flags; @@ -2303,11 +2303,8 @@ static void srpt_queue_response(struct se_cmd *cmd) struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i]; first_wr = rdma_rw_ctx_wrs(&ctx->rw, ch->qp, - ch->sport->port, NULL, - first_wr ?
first_wr : &send_wr); + ch->sport->port, NULL, first_wr); } - } else { - first_wr = &send_wr; } if (state != SRPT_STATE_MGMT) From e6d66e3eb65f8b083d827f6864e70b8dcea9d9bb Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 21 Jul 2016 13:04:25 -0700 Subject: [PATCH 12/84] IB/isert: Remove an unused member variable Signed-off-by: Bart Van Assche Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Tested-by: Steve Wise Tested-by: Laurence Oberman Cc: Parav Pandit Cc: Nicholas Bellinger Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/isert/ib_isert.c | 2 -- drivers/infiniband/ulp/isert/ib_isert.h | 1 - 2 files changed, 3 deletions(-) diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index a990c04208c9..ba6be060a476 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -137,8 +137,6 @@ isert_create_qp(struct isert_conn *isert_conn, attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS + 1; attr.cap.max_rdma_ctxs = ISCSI_DEF_XMIT_CMDS_MAX; attr.cap.max_send_sge = device->ib_device->attrs.max_sge; - isert_conn->max_sge = min(device->ib_device->attrs.max_sge, - device->ib_device->attrs.max_sge_rd); attr.cap.max_recv_sge = 1; attr.sq_sig_type = IB_SIGNAL_REQ_WR; attr.qp_type = IB_QPT_RC; diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h index e512ba941f2f..fc791efe3a10 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.h +++ b/drivers/infiniband/ulp/isert/ib_isert.h @@ -138,7 +138,6 @@ struct isert_conn { u32 responder_resources; u32 initiator_depth; bool pi_support; - u32 max_sge; struct iser_rx_desc *login_req_buf; char *login_rsp_buf; u64 login_req_dma; From 4197344ba5c2aab24b96f141cb00af9d0471f60b Mon Sep 17 00:00:00 2001 From: Dennis Dalessandro Date: Mon, 25 Jul 2016 07:52:36 -0700 Subject: [PATCH 13/84] IB/hfi1: Add global structure for affinity assignments When HFI units get initialized, they each use their own mask 
copy for affinity assignments. On a multi-HFI system, affinity assignments overbook CPU cores as each HFI doesn't have knowledge of affinity assignments for other HFI units. Therefore, some CPU cores are never used for interrupt handlers in systems with high number of CPU cores per NUMA node. For multi-HFI systems, SDMA engine interrupt assignments start all over from the first CPU in the local NUMA node after the first HFI initialization. This change allows assignments to continue where the last HFI unit left off. Add global structure for affinity assignments for multiple HFIs to share affinity mask. Reviewed-by: Jianxin Xiong Reviewed-by: Jubin John Reviewed-by: Mike Marciniszyn Signed-off-by: Sebastian Sanchez Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/affinity.c | 243 ++++++++++++++++++-------- drivers/infiniband/hw/hfi1/affinity.h | 25 ++- drivers/infiniband/hw/hfi1/chip.c | 20 +-- drivers/infiniband/hw/hfi1/init.c | 5 +- 4 files changed, 197 insertions(+), 96 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index 14d7eeb09be6..164769952ff7 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -53,6 +53,11 @@ #include "sdma.h" #include "trace.h" +struct hfi1_affinity_node_list node_affinity = { + .list = LIST_HEAD_INIT(node_affinity.list), + .lock = __SPIN_LOCK_UNLOCKED(&node_affinity.lock), +}; + /* Name of IRQ types, indexed by enum irq_type */ static const char * const irq_type_names[] = { "SDMA", @@ -69,45 +74,100 @@ static inline void init_cpu_mask_set(struct cpu_mask_set *set) } /* Initialize non-HT cpu cores mask */ -int init_real_cpu_mask(struct hfi1_devdata *dd) +void init_real_cpu_mask(void) { - struct hfi1_affinity *info; int possible, curr_cpu, i, ht; - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) - return -ENOMEM; - - cpumask_clear(&info->real_cpu_mask); + cpumask_clear(&node_affinity.real_cpu_mask); /* Start with cpu 
online mask as the real cpu mask */ - cpumask_copy(&info->real_cpu_mask, cpu_online_mask); + cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask); /* * Remove HT cores from the real cpu mask. Do this in two steps below. */ - possible = cpumask_weight(&info->real_cpu_mask); + possible = cpumask_weight(&node_affinity.real_cpu_mask); ht = cpumask_weight(topology_sibling_cpumask( - cpumask_first(&info->real_cpu_mask))); + cpumask_first(&node_affinity.real_cpu_mask))); /* * Step 1. Skip over the first N HT siblings and use them as the * "real" cores. Assumes that HT cores are not enumerated in * succession (except in the single core case). */ - curr_cpu = cpumask_first(&info->real_cpu_mask); + curr_cpu = cpumask_first(&node_affinity.real_cpu_mask); for (i = 0; i < possible / ht; i++) - curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask); + curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask); /* * Step 2. Remove the remaining HT siblings. Use cpumask_next() to * skip any gaps. */ for (; i < possible; i++) { - cpumask_clear_cpu(curr_cpu, &info->real_cpu_mask); - curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask); + cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask); + curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask); + } +} + +void node_affinity_init(void) +{ + cpumask_copy(&node_affinity.proc.mask, cpu_online_mask); + /* + * The real cpu mask is part of the affinity struct but it has to be + * initialized early. It is needed to calculate the number of user + * contexts in set_up_context_variables(). 
+ */ + init_real_cpu_mask(); +} + +void node_affinity_destroy(void) +{ + struct list_head *pos, *q; + struct hfi1_affinity_node *entry; + + spin_lock(&node_affinity.lock); + list_for_each_safe(pos, q, &node_affinity.list) { + entry = list_entry(pos, struct hfi1_affinity_node, + list); + list_del(pos); + kfree(entry); + } + spin_unlock(&node_affinity.lock); +} + +static struct hfi1_affinity_node *node_affinity_allocate(int node) +{ + struct hfi1_affinity_node *entry; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return NULL; + entry->node = node; + INIT_LIST_HEAD(&entry->list); + + return entry; +} + +/* + * It appends an entry to the list. + * It *must* be called with node_affinity.lock held. + */ +static void node_affinity_add_tail(struct hfi1_affinity_node *entry) +{ + list_add_tail(&entry->list, &node_affinity.list); +} + +/* It must be called with node_affinity.lock held */ +static struct hfi1_affinity_node *node_affinity_lookup(int node) +{ + struct list_head *pos; + struct hfi1_affinity_node *entry; + + list_for_each(pos, &node_affinity.list) { + entry = list_entry(pos, struct hfi1_affinity_node, list); + if (entry->node == node) + return entry; } - dd->affinity = info; - return 0; + return NULL; } /* @@ -121,10 +181,10 @@ int init_real_cpu_mask(struct hfi1_devdata *dd) * to the node relative 1 as necessary. 
* */ -void hfi1_dev_affinity_init(struct hfi1_devdata *dd) +int hfi1_dev_affinity_init(struct hfi1_devdata *dd) { int node = pcibus_to_node(dd->pcidev->bus); - struct hfi1_affinity *info = dd->affinity; + struct hfi1_affinity_node *entry; const struct cpumask *local_mask; int curr_cpu, possible, i; @@ -132,55 +192,75 @@ void hfi1_dev_affinity_init(struct hfi1_devdata *dd) node = numa_node_id(); dd->node = node; - spin_lock_init(&info->lock); - - init_cpu_mask_set(&info->def_intr); - init_cpu_mask_set(&info->rcv_intr); - init_cpu_mask_set(&info->proc); - local_mask = cpumask_of_node(dd->node); if (cpumask_first(local_mask) >= nr_cpu_ids) local_mask = topology_core_cpumask(0); - /* Use the "real" cpu mask of this node as the default */ - cpumask_and(&info->def_intr.mask, &info->real_cpu_mask, local_mask); - /* fill in the receive list */ - possible = cpumask_weight(&info->def_intr.mask); - curr_cpu = cpumask_first(&info->def_intr.mask); - if (possible == 1) { - /* only one CPU, everyone will use it */ - cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask); - } else { - /* - * Retain the first CPU in the default list for the control - * context. - */ - curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask); - /* - * Remove the remaining kernel receive queues from - * the default list and add them to the receive list. - */ - for (i = 0; i < dd->n_krcv_queues - 1; i++) { - cpumask_clear_cpu(curr_cpu, &info->def_intr.mask); - cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask); - curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask); - if (curr_cpu >= nr_cpu_ids) - break; + spin_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + spin_unlock(&node_affinity.lock); + + /* + * If this is the first time this NUMA node's affinity is used, + * create an entry in the global affinity structure and initialize it. 
+ */ + if (!entry) { + entry = node_affinity_allocate(node); + if (!entry) { + dd_dev_err(dd, + "Unable to allocate global affinity node\n"); + return -ENOMEM; } + init_cpu_mask_set(&entry->def_intr); + init_cpu_mask_set(&entry->rcv_intr); + /* Use the "real" cpu mask of this node as the default */ + cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask, + local_mask); + + /* fill in the receive list */ + possible = cpumask_weight(&entry->def_intr.mask); + curr_cpu = cpumask_first(&entry->def_intr.mask); + + if (possible == 1) { + /* only one CPU, everyone will use it */ + cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask); + } else { + /* + * Retain the first CPU in the default list for the + * control context. + */ + curr_cpu = cpumask_next(curr_cpu, + &entry->def_intr.mask); + + /* + * Remove the remaining kernel receive queues from + * the default list and add them to the receive list. + */ + for (i = 0; i < dd->n_krcv_queues - 1; i++) { + cpumask_clear_cpu(curr_cpu, + &entry->def_intr.mask); + cpumask_set_cpu(curr_cpu, + &entry->rcv_intr.mask); + curr_cpu = cpumask_next(curr_cpu, + &entry->def_intr.mask); + if (curr_cpu >= nr_cpu_ids) + break; + } + } + + spin_lock(&node_affinity.lock); + node_affinity_add_tail(entry); + spin_unlock(&node_affinity.lock); } - cpumask_copy(&info->proc.mask, cpu_online_mask); -} - -void hfi1_dev_affinity_free(struct hfi1_devdata *dd) -{ - kfree(dd->affinity); + return 0; } int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix) { int ret; cpumask_var_t diff; + struct hfi1_affinity_node *entry; struct cpu_mask_set *set; struct sdma_engine *sde = NULL; struct hfi1_ctxtdata *rcd = NULL; @@ -194,21 +274,25 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix) if (!ret) return -ENOMEM; + spin_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + spin_unlock(&node_affinity.lock); + switch (msix->type) { case IRQ_SDMA: sde = (struct sdma_engine *)msix->arg; 
scnprintf(extra, 64, "engine %u", sde->this_idx); /* fall through */ case IRQ_GENERAL: - set = &dd->affinity->def_intr; + set = &entry->def_intr; break; case IRQ_RCVCTXT: rcd = (struct hfi1_ctxtdata *)msix->arg; if (rcd->ctxt == HFI1_CTRL_CTXT) { - set = &dd->affinity->def_intr; + set = &entry->def_intr; cpu = cpumask_first(&set->mask); } else { - set = &dd->affinity->rcv_intr; + set = &entry->rcv_intr; } scnprintf(extra, 64, "ctxt %u", rcd->ctxt); break; @@ -222,8 +306,8 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix) * is set above. Skip accounting for it. Everything else finds its * CPU here. */ - if (cpu == -1) { - spin_lock(&dd->affinity->lock); + if (cpu == -1 && set) { + spin_lock(&node_affinity.lock); if (cpumask_equal(&set->mask, &set->used)) { /* * We've used up all the CPUs, bump up the generation @@ -235,7 +319,7 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix) cpumask_andnot(diff, &set->mask, &set->used); cpu = cpumask_first(diff); cpumask_set_cpu(cpu, &set->used); - spin_unlock(&dd->affinity->lock); + spin_unlock(&node_affinity.lock); } switch (msix->type) { @@ -263,30 +347,35 @@ void hfi1_put_irq_affinity(struct hfi1_devdata *dd, { struct cpu_mask_set *set = NULL; struct hfi1_ctxtdata *rcd; + struct hfi1_affinity_node *entry; + + spin_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + spin_unlock(&node_affinity.lock); switch (msix->type) { case IRQ_SDMA: case IRQ_GENERAL: - set = &dd->affinity->def_intr; + set = &entry->def_intr; break; case IRQ_RCVCTXT: rcd = (struct hfi1_ctxtdata *)msix->arg; /* only do accounting for non control contexts */ if (rcd->ctxt != HFI1_CTRL_CTXT) - set = &dd->affinity->rcv_intr; + set = &entry->rcv_intr; break; default: return; } if (set) { - spin_lock(&dd->affinity->lock); + spin_lock(&node_affinity.lock); cpumask_andnot(&set->used, &set->used, &msix->mask); if (cpumask_empty(&set->used) && set->gen) { set->gen--; 
cpumask_copy(&set->used, &set->mask); } - spin_unlock(&dd->affinity->lock); + spin_unlock(&node_affinity.lock); } irq_set_affinity_hint(msix->msix.vector, NULL); @@ -297,9 +386,10 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node) { int cpu = -1, ret; cpumask_var_t diff, mask, intrs; + struct hfi1_affinity_node *entry; const struct cpumask *node_mask, *proc_mask = tsk_cpus_allowed(current); - struct cpu_mask_set *set = &dd->affinity->proc; + struct cpu_mask_set *set = &node_affinity.proc; /* * check whether process/context affinity has already @@ -338,7 +428,7 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node) if (!ret) goto free_mask; - spin_lock(&dd->affinity->lock); + spin_lock(&node_affinity.lock); /* * If we've used all available CPUs, clear the mask and start * overloading. @@ -348,13 +438,14 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node) cpumask_clear(&set->used); } + entry = node_affinity_lookup(dd->node); /* CPUs used by interrupt handlers */ - cpumask_copy(intrs, (dd->affinity->def_intr.gen ? - &dd->affinity->def_intr.mask : - &dd->affinity->def_intr.used)); - cpumask_or(intrs, intrs, (dd->affinity->rcv_intr.gen ? - &dd->affinity->rcv_intr.mask : - &dd->affinity->rcv_intr.used)); + cpumask_copy(intrs, (entry->def_intr.gen ? + &entry->def_intr.mask : + &entry->def_intr.used)); + cpumask_or(intrs, intrs, (entry->rcv_intr.gen ? 
+ &entry->rcv_intr.mask : + &entry->rcv_intr.used)); hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl", cpumask_pr_args(intrs)); @@ -400,7 +491,7 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node) cpu = -1; else cpumask_set_cpu(cpu, &set->used); - spin_unlock(&dd->affinity->lock); + spin_unlock(&node_affinity.lock); free_cpumask_var(intrs); free_mask: @@ -413,16 +504,16 @@ done: void hfi1_put_proc_affinity(struct hfi1_devdata *dd, int cpu) { - struct cpu_mask_set *set = &dd->affinity->proc; + struct cpu_mask_set *set = &node_affinity.proc; if (cpu < 0) return; - spin_lock(&dd->affinity->lock); + spin_lock(&node_affinity.lock); cpumask_clear_cpu(cpu, &set->used); if (cpumask_empty(&set->used) && set->gen) { set->gen--; cpumask_copy(&set->used, &set->mask); } - spin_unlock(&dd->affinity->lock); + spin_unlock(&node_affinity.lock); } diff --git a/drivers/infiniband/hw/hfi1/affinity.h b/drivers/infiniband/hw/hfi1/affinity.h index 20f52fe74091..ad3e730a8d8f 100644 --- a/drivers/infiniband/hw/hfi1/affinity.h +++ b/drivers/infiniband/hw/hfi1/affinity.h @@ -82,11 +82,9 @@ struct hfi1_affinity { struct hfi1_msix_entry; /* Initialize non-HT cpu cores mask */ -int init_real_cpu_mask(struct hfi1_devdata *); +void init_real_cpu_mask(void); /* Initialize driver affinity data */ -void hfi1_dev_affinity_init(struct hfi1_devdata *); -/* Free driver affinity data */ -void hfi1_dev_affinity_free(struct hfi1_devdata *); +int hfi1_dev_affinity_init(struct hfi1_devdata *); /* * Set IRQ affinity to a CPU. The function will determine the * CPU and set the affinity to it. @@ -105,4 +103,23 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *, int); /* Release a CPU used by a user process. 
*/ void hfi1_put_proc_affinity(struct hfi1_devdata *, int); +struct hfi1_affinity_node { + int node; + struct cpu_mask_set def_intr; + struct cpu_mask_set rcv_intr; + struct list_head list; +}; + +struct hfi1_affinity_node_list { + struct list_head list; + struct cpumask real_cpu_mask; + struct cpu_mask_set proc; + /* protect affinity node list */ + spinlock_t lock; +}; + +void node_affinity_init(void); +void node_affinity_destroy(void); +extern struct hfi1_affinity_node_list node_affinity; + #endif /* _HFI1_AFFINITY_H */ diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 97ce886bb171..0de6c0ca7078 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -63,6 +63,7 @@ #include "efivar.h" #include "platform.h" #include "aspm.h" +#include "affinity.h" #define NUM_IB_PORTS 1 @@ -12838,7 +12839,7 @@ static int set_up_context_variables(struct hfi1_devdata *dd) */ if (num_user_contexts < 0) num_user_contexts = - cpumask_weight(&dd->affinity->real_cpu_mask); + cpumask_weight(&node_affinity.real_cpu_mask); total_contexts = num_kernel_contexts + num_user_contexts; @@ -14468,19 +14469,6 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, (dd->revision >> CCE_REVISION_SW_SHIFT) & CCE_REVISION_SW_MASK); - /* - * The real cpu mask is part of the affinity struct but has to be - * initialized earlier than the rest of the affinity struct because it - * is needed to calculate the number of user contexts in - * set_up_context_variables(). However, hfi1_dev_affinity_init(), - * which initializes the rest of the affinity struct members, - * depends on set_up_context_variables() for the number of kernel - * contexts, so it cannot be called before set_up_context_variables(). 
- */ - ret = init_real_cpu_mask(dd); - if (ret) - goto bail_cleanup; - ret = set_up_context_variables(dd); if (ret) goto bail_cleanup; @@ -14494,7 +14482,9 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, /* set up KDETH QP prefix in both RX and TX CSRs */ init_kdeth_qp(dd); - hfi1_dev_affinity_init(dd); + ret = hfi1_dev_affinity_init(dd); + if (ret) + goto bail_cleanup; /* send contexts must be set up before receive contexts */ ret = init_send_contexts(dd); diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index eed971ccd2a1..b0c3e8a97725 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -64,6 +64,7 @@ #include "debugfs.h" #include "verbs.h" #include "aspm.h" +#include "affinity.h" #undef pr_fmt #define pr_fmt(fmt) DRIVER_NAME ": " fmt @@ -1004,7 +1005,6 @@ static void __hfi1_free_devdata(struct kobject *kobj) rcu_barrier(); /* wait for rcu callbacks to complete */ free_percpu(dd->int_counter); free_percpu(dd->rcv_limit); - hfi1_dev_affinity_free(dd); free_percpu(dd->send_schedule); rvt_dealloc_device(&dd->verbs_dev.rdi); } @@ -1198,6 +1198,8 @@ static int __init hfi1_mod_init(void) if (ret) goto bail; + node_affinity_init(); + /* validate max MTU before any devices start */ if (!valid_opa_max_mtu(hfi1_max_mtu)) { pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n", @@ -1278,6 +1280,7 @@ module_init(hfi1_mod_init); static void __exit hfi1_mod_cleanup(void) { pci_unregister_driver(&hfi1_pci_driver); + node_affinity_destroy(); hfi1_wss_exit(); hfi1_dbg_exit(); hfi1_cpulist_count = 0; From d63730192f5914c0f6feec3d45116486be1d36e3 Mon Sep 17 00:00:00 2001 From: Sebastian Sanchez Date: Mon, 25 Jul 2016 07:54:48 -0700 Subject: [PATCH 14/84] IB/hfi1: Reserve and collapse CPU cores for contexts Kernel receive queues oversubscribe CPU cores on multi-HFI systems. 
To prevent this, the kernel receive queues are separated onto different cores, and the SDMA engine interrupts are constrained to a lesser number of cores. hfi1s_on_numa_node*krcvqs is the number of CPU cores that are reserved for kernel receive queues for all HFIs. Each HFI initializes its kernel receive queues to one of the reserved CPU cores. If there ends up being 0 CPU cores leftover for SDMA engines, use the same CPU cores as receive contexts. In addition, general and control contexts are assigned to their own CPU core, however, both types of contexts tend to have low traffic. To save CPU cores, collapse general and control contexts to one CPU core for all HFI units. This change prevents SDMA engine interrupts from wrapping around general contexts. Reviewed-by: Dean Luick Signed-off-by: Sebastian Sanchez Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/affinity.c | 101 +++++++++++++++++++------- drivers/infiniband/hw/hfi1/affinity.h | 3 +- drivers/infiniband/hw/hfi1/hfi.h | 2 + drivers/infiniband/hw/hfi1/init.c | 6 +- 4 files changed, 84 insertions(+), 28 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index 164769952ff7..eb889270fbeb 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -66,6 +66,9 @@ static const char * const irq_type_names[] = { "OTHER", }; +/* Per NUMA node count of HFI devices */ +static unsigned int *hfi1_per_node_cntr; + static inline void init_cpu_mask_set(struct cpu_mask_set *set) { cpumask_clear(&set->mask); @@ -107,8 +110,12 @@ void init_real_cpu_mask(void) } } -void node_affinity_init(void) +int node_affinity_init(void) { + int node; + struct pci_dev *dev = NULL; + const struct pci_device_id *ids = hfi1_pci_tbl; + cpumask_copy(&node_affinity.proc.mask, cpu_online_mask); /* * The real cpu mask is part of the affinity struct but it has to be @@ -116,6 +123,25 @@ void node_affinity_init(void) * contexts in 
set_up_context_variables(). */ init_real_cpu_mask(); + + hfi1_per_node_cntr = kcalloc(num_possible_nodes(), + sizeof(*hfi1_per_node_cntr), GFP_KERNEL); + if (!hfi1_per_node_cntr) + return -ENOMEM; + + while (ids->vendor) { + dev = NULL; + while ((dev = pci_get_device(ids->vendor, ids->device, dev))) { + node = pcibus_to_node(dev->bus); + if (node < 0) + node = numa_node_id(); + + hfi1_per_node_cntr[node]++; + } + ids++; + } + + return 0; } void node_affinity_destroy(void) @@ -131,6 +157,7 @@ void node_affinity_destroy(void) kfree(entry); } spin_unlock(&node_affinity.lock); + kfree(hfi1_per_node_cntr); } static struct hfi1_affinity_node *node_affinity_allocate(int node) @@ -213,6 +240,7 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd) } init_cpu_mask_set(&entry->def_intr); init_cpu_mask_set(&entry->rcv_intr); + cpumask_clear(&entry->general_intr_mask); /* Use the "real" cpu mask of this node as the default */ cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask, local_mask); @@ -224,11 +252,15 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd) if (possible == 1) { /* only one CPU, everyone will use it */ cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask); + cpumask_set_cpu(curr_cpu, &entry->general_intr_mask); } else { /* - * Retain the first CPU in the default list for the - * control context. + * The general/control context will be the first CPU in + * the default list, so it is removed from the default + * list and added to the general interrupt list. */ + cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask); + cpumask_set_cpu(curr_cpu, &entry->general_intr_mask); curr_cpu = cpumask_next(curr_cpu, &entry->def_intr.mask); @@ -236,7 +268,10 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd) * Remove the remaining kernel receive queues from * the default list and add them to the receive list. 
*/ - for (i = 0; i < dd->n_krcv_queues - 1; i++) { + for (i = 0; + i < (dd->n_krcv_queues - 1) * + hfi1_per_node_cntr[dd->node]; + i++) { cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask); cpumask_set_cpu(curr_cpu, @@ -246,6 +281,15 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd) if (curr_cpu >= nr_cpu_ids) break; } + + /* + * If there ends up being 0 CPU cores leftover for SDMA + * engines, use the same CPU cores as general/control + * context. + */ + if (cpumask_weight(&entry->def_intr.mask) == 0) + cpumask_copy(&entry->def_intr.mask, + &entry->general_intr_mask); } spin_lock(&node_affinity.lock); @@ -261,7 +305,7 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix) int ret; cpumask_var_t diff; struct hfi1_affinity_node *entry; - struct cpu_mask_set *set; + struct cpu_mask_set *set = NULL; struct sdma_engine *sde = NULL; struct hfi1_ctxtdata *rcd = NULL; char extra[64]; @@ -282,18 +326,17 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix) case IRQ_SDMA: sde = (struct sdma_engine *)msix->arg; scnprintf(extra, 64, "engine %u", sde->this_idx); - /* fall through */ - case IRQ_GENERAL: set = &entry->def_intr; break; + case IRQ_GENERAL: + cpu = cpumask_first(&entry->general_intr_mask); + break; case IRQ_RCVCTXT: rcd = (struct hfi1_ctxtdata *)msix->arg; - if (rcd->ctxt == HFI1_CTRL_CTXT) { - set = &entry->def_intr; - cpu = cpumask_first(&set->mask); - } else { + if (rcd->ctxt == HFI1_CTRL_CTXT) + cpu = cpumask_first(&entry->general_intr_mask); + else set = &entry->rcv_intr; - } scnprintf(extra, 64, "ctxt %u", rcd->ctxt); break; default: @@ -302,9 +345,9 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix) } /* - * The control receive context is placed on a particular CPU, which - * is set above. Skip accounting for it. Everything else finds its - * CPU here. + * The general and control contexts are placed on a particular + * CPU, which is set above. 
Skip accounting for it. Everything else + * finds its CPU here. */ if (cpu == -1 && set) { spin_lock(&node_affinity.lock); @@ -355,12 +398,14 @@ void hfi1_put_irq_affinity(struct hfi1_devdata *dd, switch (msix->type) { case IRQ_SDMA: - case IRQ_GENERAL: set = &entry->def_intr; break; + case IRQ_GENERAL: + /* Don't accounting for general contexts */ + break; case IRQ_RCVCTXT: rcd = (struct hfi1_ctxtdata *)msix->arg; - /* only do accounting for non control contexts */ + /* Don't do accounting for control contexts */ if (rcd->ctxt != HFI1_CTRL_CTXT) set = &entry->rcv_intr; break; @@ -438,14 +483,20 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node) cpumask_clear(&set->used); } - entry = node_affinity_lookup(dd->node); - /* CPUs used by interrupt handlers */ - cpumask_copy(intrs, (entry->def_intr.gen ? - &entry->def_intr.mask : - &entry->def_intr.used)); - cpumask_or(intrs, intrs, (entry->rcv_intr.gen ? - &entry->rcv_intr.mask : - &entry->rcv_intr.used)); + /* + * If NUMA node has CPUs used by interrupt handlers, include them in the + * interrupt handler mask. + */ + entry = node_affinity_lookup(node); + if (entry) { + cpumask_copy(intrs, (entry->def_intr.gen ? + &entry->def_intr.mask : + &entry->def_intr.used)); + cpumask_or(intrs, intrs, (entry->rcv_intr.gen ? 
+ &entry->rcv_intr.mask : + &entry->rcv_intr.used)); + cpumask_or(intrs, intrs, &entry->general_intr_mask); + } hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl", cpumask_pr_args(intrs)); diff --git a/drivers/infiniband/hw/hfi1/affinity.h b/drivers/infiniband/hw/hfi1/affinity.h index ad3e730a8d8f..003860ed0d25 100644 --- a/drivers/infiniband/hw/hfi1/affinity.h +++ b/drivers/infiniband/hw/hfi1/affinity.h @@ -107,6 +107,7 @@ struct hfi1_affinity_node { int node; struct cpu_mask_set def_intr; struct cpu_mask_set rcv_intr; + struct cpumask general_intr_mask; struct list_head list; }; @@ -118,7 +119,7 @@ struct hfi1_affinity_node_list { spinlock_t lock; }; -void node_affinity_init(void); +int node_affinity_init(void); void node_affinity_destroy(void); extern struct hfi1_affinity_node_list node_affinity; diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 748e235b828e..fd67e98e3178 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1235,6 +1235,8 @@ int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *, int); int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *, int); void set_all_slowpath(struct hfi1_devdata *dd); +extern const struct pci_device_id hfi1_pci_tbl[]; + /* receive packet handler dispositions */ #define RCV_PKT_OK 0x0 /* keep going */ #define RCV_PKT_LIMIT 0x1 /* stop, hit limit, start thread */ diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index b0c3e8a97725..1620d6882d10 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1162,7 +1162,7 @@ static int init_one(struct pci_dev *, const struct pci_device_id *); #define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: " #define PFX DRIVER_NAME ": " -static const struct pci_device_id hfi1_pci_tbl[] = { +const struct pci_device_id hfi1_pci_tbl[] = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 
PCI_DEVICE_ID_INTEL1) }, { 0, } @@ -1198,7 +1198,9 @@ static int __init hfi1_mod_init(void) if (ret) goto bail; - node_affinity_init(); + ret = node_affinity_init(); + if (ret) + goto bail; /* validate max MTU before any devices start */ if (!valid_opa_max_mtu(hfi1_max_mtu)) { From b094a36f90975373c3a241839869217a65f17d81 Mon Sep 17 00:00:00 2001 From: Sebastian Sanchez Date: Mon, 25 Jul 2016 07:54:57 -0700 Subject: [PATCH 15/84] IB/hfi1: Refine user process affinity algorithm When performing process affinity recommendations for MPI ranks, the current algorithm doesn't take into account multiple HFI units. Also, real cores and HT cores are not distinguished from one another. Therefore, all HT cores are recommended to be assigned first within the local NUMA node before recommending the assignments of cores in other NUMA nodes. It's ideal to assign all real cores across all NUMA nodes first, then all HT 1 cores, then all HT 2 cores, and so on to balance CPU workload. CPU cores in other NUMA nodes could be running interrupt handlers, and this is not taken into account. To balance the CPU workload for user processes, the following recommendation algorithm is used: For each user process that is opening a context on HFI Y: a) If all cores are assigned to user processes, start assignments all over from the first core b) Assign real cores first, then HT cores (First set of HT cores on all physical cores, then second set of HT cores, and, so on) in the following order: 1. Same NUMA node as HFI Y and not running an IRQ handler 2. Same NUMA node as HFI Y and running an IRQ handler 3. Different NUMA node to HFI Y and not running an IRQ handler 4. Different NUMA node to HFI Y and running an IRQ handler c) Mark core as assigned in the global affinity structure. As user processes are done, remove core assignments from global affinity structure. This implementation allows an arbitrary number of HT cores and provides support for multiple HFIs. 
This is being included in the kernel rather than user space due to the fact that user space has no way of knowing the CPU recommendations for contexts running as part of other jobs. Reviewed-by: Ira Weiny Reviewed-by: Mitko Haralanov Reviewed-by: Dennis Dalessandro Signed-off-by: Sebastian Sanchez Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/affinity.c | 211 +++++++++++++++++++------- drivers/infiniband/hw/hfi1/affinity.h | 8 +- drivers/infiniband/hw/hfi1/file_ops.c | 15 +- 3 files changed, 173 insertions(+), 61 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index eb889270fbeb..c9dcbd55883a 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -116,7 +116,17 @@ int node_affinity_init(void) struct pci_dev *dev = NULL; const struct pci_device_id *ids = hfi1_pci_tbl; + cpumask_clear(&node_affinity.proc.used); cpumask_copy(&node_affinity.proc.mask, cpu_online_mask); + + node_affinity.proc.gen = 0; + node_affinity.num_core_siblings = + cpumask_weight(topology_sibling_cpumask( + cpumask_first(&node_affinity.proc.mask) + )); + node_affinity.num_online_nodes = num_online_nodes(); + node_affinity.num_online_cpus = num_online_cpus(); + /* * The real cpu mask is part of the affinity struct but it has to be * initialized early. 
It is needed to calculate the number of user @@ -401,7 +411,7 @@ void hfi1_put_irq_affinity(struct hfi1_devdata *dd, set = &entry->def_intr; break; case IRQ_GENERAL: - /* Don't accounting for general contexts */ + /* Don't do accounting for general contexts */ break; case IRQ_RCVCTXT: rcd = (struct hfi1_ctxtdata *)msix->arg; @@ -427,14 +437,47 @@ void hfi1_put_irq_affinity(struct hfi1_devdata *dd, cpumask_clear(&msix->mask); } -int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node) +/* This should be called with node_affinity.lock held */ +static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask, + struct hfi1_affinity_node_list *affinity) { - int cpu = -1, ret; - cpumask_var_t diff, mask, intrs; + int possible, curr_cpu, i; + uint num_cores_per_socket = node_affinity.num_online_cpus / + affinity->num_core_siblings / + node_affinity.num_online_nodes; + + cpumask_copy(hw_thread_mask, &affinity->proc.mask); + if (affinity->num_core_siblings > 0) { + /* Removing other siblings not needed for now */ + possible = cpumask_weight(hw_thread_mask); + curr_cpu = cpumask_first(hw_thread_mask); + for (i = 0; + i < num_cores_per_socket * node_affinity.num_online_nodes; + i++) + curr_cpu = cpumask_next(curr_cpu, hw_thread_mask); + + for (; i < possible; i++) { + cpumask_clear_cpu(curr_cpu, hw_thread_mask); + curr_cpu = cpumask_next(curr_cpu, hw_thread_mask); + } + + /* Identifying correct HW threads within physical cores */ + cpumask_shift_left(hw_thread_mask, hw_thread_mask, + num_cores_per_socket * + node_affinity.num_online_nodes * + hw_thread_no); + } +} + +int hfi1_get_proc_affinity(int node) +{ + int cpu = -1, ret, i; struct hfi1_affinity_node *entry; + cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask; const struct cpumask *node_mask, *proc_mask = tsk_cpus_allowed(current); - struct cpu_mask_set *set = &node_affinity.proc; + struct hfi1_affinity_node_list *affinity = &node_affinity; + struct cpu_mask_set *set = &affinity->proc; 
/* * check whether process/context affinity has already @@ -460,22 +503,41 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node) /* * The process does not have a preset CPU affinity so find one to - * recommend. We prefer CPUs on the same NUMA as the device. + * recommend using the following algorithm: + * + * For each user process that is opening a context on HFI Y: + * a) If all cores are filled, reinitialize the bitmask + * b) Fill real cores first, then HT cores (First set of HT + * cores on all physical cores, then second set of HT core, + * and, so on) in the following order: + * + * 1. Same NUMA node as HFI Y and not running an IRQ + * handler + * 2. Same NUMA node as HFI Y and running an IRQ handler + * 3. Different NUMA node to HFI Y and not running an IRQ + * handler + * 4. Different NUMA node to HFI Y and running an IRQ + * handler + * c) Mark core as filled in the bitmask. As user processes are + * done, clear cores from the bitmask. */ ret = zalloc_cpumask_var(&diff, GFP_KERNEL); if (!ret) goto done; - ret = zalloc_cpumask_var(&mask, GFP_KERNEL); + ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL); if (!ret) goto free_diff; - ret = zalloc_cpumask_var(&intrs, GFP_KERNEL); + ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL); if (!ret) - goto free_mask; + goto free_hw_thread_mask; + ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL); + if (!ret) + goto free_available_mask; - spin_lock(&node_affinity.lock); + spin_lock(&affinity->lock); /* - * If we've used all available CPUs, clear the mask and start + * If we've used all available HW threads, clear the mask and start * overloading. */ if (cpumask_equal(&set->mask, &set->used)) { @@ -489,82 +551,125 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node) */ entry = node_affinity_lookup(node); if (entry) { - cpumask_copy(intrs, (entry->def_intr.gen ? - &entry->def_intr.mask : - &entry->def_intr.used)); - cpumask_or(intrs, intrs, (entry->rcv_intr.gen ? 
- &entry->rcv_intr.mask : - &entry->rcv_intr.used)); - cpumask_or(intrs, intrs, &entry->general_intr_mask); + cpumask_copy(intrs_mask, (entry->def_intr.gen ? + &entry->def_intr.mask : + &entry->def_intr.used)); + cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ? + &entry->rcv_intr.mask : + &entry->rcv_intr.used)); + cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask); } hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl", - cpumask_pr_args(intrs)); + cpumask_pr_args(intrs_mask)); + + cpumask_copy(hw_thread_mask, &set->mask); /* - * If we don't have a NUMA node requested, preference is towards - * device NUMA node + * If HT cores are enabled, identify which HW threads within the + * physical cores should be used. */ - if (node == -1) - node = dd->node; + if (affinity->num_core_siblings > 0) { + for (i = 0; i < affinity->num_core_siblings; i++) { + find_hw_thread_mask(i, hw_thread_mask, affinity); + + /* + * If there's at least one available core for this HW + * thread number, stop looking for a core. + * + * diff will always be not empty at least once in this + * loop as the used mask gets reset when + * (set->mask == set->used) before this loop. 
+ */ + cpumask_andnot(diff, hw_thread_mask, &set->used); + if (!cpumask_empty(diff)) + break; + } + } + hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl", + cpumask_pr_args(hw_thread_mask)); + node_mask = cpumask_of_node(node); - hfi1_cdbg(PROC, "device on NUMA %u, CPUs %*pbl", node, + hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node, cpumask_pr_args(node_mask)); - /* diff will hold all unused cpus */ - cpumask_andnot(diff, &set->mask, &set->used); - hfi1_cdbg(PROC, "unused CPUs (all) %*pbl", cpumask_pr_args(diff)); - - /* get cpumask of available CPUs on preferred NUMA */ - cpumask_and(mask, diff, node_mask); - hfi1_cdbg(PROC, "available cpus on NUMA %*pbl", cpumask_pr_args(mask)); + /* Get cpumask of available CPUs on preferred NUMA */ + cpumask_and(available_mask, hw_thread_mask, node_mask); + cpumask_andnot(available_mask, available_mask, &set->used); + hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node, + cpumask_pr_args(available_mask)); /* * At first, we don't want to place processes on the same - * CPUs as interrupt handlers. + * CPUs as interrupt handlers. Then, CPUs running interrupt + * handlers are used. + * + * 1) If diff is not empty, then there are CPUs not running + * interrupt handlers available, so diff gets copied + * over to available_mask. + * 2) If diff is empty, then all CPUs not running interrupt + * handlers are taken, so available_mask contains all + * available CPUs running interrupt handlers. + * 3) If available_mask is empty, then all CPUs on the + * preferred NUMA node are taken, so other NUMA nodes are + * used for process assignments using the same method as + * the preferred NUMA node.
*/ - cpumask_andnot(diff, mask, intrs); + cpumask_andnot(diff, available_mask, intrs_mask); if (!cpumask_empty(diff)) - cpumask_copy(mask, diff); + cpumask_copy(available_mask, diff); - /* - * if we don't have a cpu on the preferred NUMA, get - * the list of the remaining available CPUs - */ - if (cpumask_empty(mask)) { - cpumask_andnot(diff, &set->mask, &set->used); - cpumask_andnot(mask, diff, node_mask); + /* If we don't have CPUs on the preferred node, use other NUMA nodes */ + if (cpumask_empty(available_mask)) { + cpumask_andnot(available_mask, hw_thread_mask, &set->used); + /* Excluding preferred NUMA cores */ + cpumask_andnot(available_mask, available_mask, node_mask); + hfi1_cdbg(PROC, + "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl", + cpumask_pr_args(available_mask)); + + /* + * At first, we don't want to place processes on the same + * CPUs as interrupt handlers. + */ + cpumask_andnot(diff, available_mask, intrs_mask); + if (!cpumask_empty(diff)) + cpumask_copy(available_mask, diff); } - hfi1_cdbg(PROC, "possible CPUs for process %*pbl", - cpumask_pr_args(mask)); + hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl", + cpumask_pr_args(available_mask)); - cpu = cpumask_first(mask); + cpu = cpumask_first(available_mask); if (cpu >= nr_cpu_ids) /* empty */ cpu = -1; else cpumask_set_cpu(cpu, &set->used); - spin_unlock(&node_affinity.lock); + spin_unlock(&affinity->lock); + hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu); - free_cpumask_var(intrs); -free_mask: - free_cpumask_var(mask); + free_cpumask_var(intrs_mask); +free_available_mask: + free_cpumask_var(available_mask); +free_hw_thread_mask: + free_cpumask_var(hw_thread_mask); free_diff: free_cpumask_var(diff); done: return cpu; } -void hfi1_put_proc_affinity(struct hfi1_devdata *dd, int cpu) +void hfi1_put_proc_affinity(int cpu) { - struct cpu_mask_set *set = &node_affinity.proc; + struct hfi1_affinity_node_list *affinity = &node_affinity; + struct cpu_mask_set *set 
= &affinity->proc; if (cpu < 0) return; - spin_lock(&node_affinity.lock); + spin_lock(&affinity->lock); cpumask_clear_cpu(cpu, &set->used); + hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu); if (cpumask_empty(&set->used) && set->gen) { set->gen--; cpumask_copy(&set->used, &set->mask); } - spin_unlock(&node_affinity.lock); + spin_unlock(&affinity->lock); } - diff --git a/drivers/infiniband/hw/hfi1/affinity.h b/drivers/infiniband/hw/hfi1/affinity.h index 003860ed0d25..f784de52e881 100644 --- a/drivers/infiniband/hw/hfi1/affinity.h +++ b/drivers/infiniband/hw/hfi1/affinity.h @@ -73,7 +73,6 @@ struct cpu_mask_set { struct hfi1_affinity { struct cpu_mask_set def_intr; struct cpu_mask_set rcv_intr; - struct cpu_mask_set proc; struct cpumask real_cpu_mask; /* spin lock to protect affinity struct */ spinlock_t lock; @@ -99,9 +98,9 @@ void hfi1_put_irq_affinity(struct hfi1_devdata *, struct hfi1_msix_entry *); * Determine a CPU affinity for a user process, if the process does not * have an affinity set yet. */ -int hfi1_get_proc_affinity(struct hfi1_devdata *, int); +int hfi1_get_proc_affinity(int); /* Release a CPU used by a user process. 
*/ -void hfi1_put_proc_affinity(struct hfi1_devdata *, int); +void hfi1_put_proc_affinity(int); struct hfi1_affinity_node { int node; @@ -115,6 +114,9 @@ struct hfi1_affinity_node_list { struct list_head list; struct cpumask real_cpu_mask; struct cpu_mask_set proc; + int num_core_siblings; + int num_online_nodes; + int num_online_cpus; /* protect affinity node list */ spinlock_t lock; }; diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 2f097d942f9c..d7c07bc7bd14 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -715,7 +715,7 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) hfi1_user_sdma_free_queues(fdata); /* release the cpu */ - hfi1_put_proc_affinity(dd, fdata->rec_cpu_num); + hfi1_put_proc_affinity(fdata->rec_cpu_num); /* * Clear any left over, unhandled events so the next process that @@ -815,9 +815,10 @@ static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo) ret = find_shared_ctxt(fp, uinfo); if (ret < 0) goto done_unlock; - if (ret) - fd->rec_cpu_num = hfi1_get_proc_affinity( - fd->uctxt->dd, fd->uctxt->numa_id); + if (ret) { + fd->rec_cpu_num = + hfi1_get_proc_affinity(fd->uctxt->numa_id); + } } /* @@ -929,7 +930,11 @@ static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd, if (ctxt == dd->num_rcv_contexts) return -EBUSY; - fd->rec_cpu_num = hfi1_get_proc_affinity(dd, -1); + /* + * If we don't have a NUMA node requested, preference is towards + * device NUMA node. + */ + fd->rec_cpu_num = hfi1_get_proc_affinity(dd->node); if (fd->rec_cpu_num != -1) numa = cpu_to_node(fd->rec_cpu_num); else From dba715f0c8b5daa1fca041c1c9011632c7a83105 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Wed, 6 Jul 2016 17:28:52 -0400 Subject: [PATCH 16/84] IB/hfi1: Use built-in i2c bit-shift bus adapter Use built-in i2c bit-shift bus adapter to control the i2c busses on the chip. 
Cc: Jason Gunthorpe Reviewed-by: Easwar Hariharan Reviewed-by: Dennis Dalessandro Signed-off-by: Dean Luick Signed-off-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/Kconfig | 3 +- drivers/infiniband/hw/hfi1/chip.c | 5 + drivers/infiniband/hw/hfi1/hfi.h | 11 + drivers/infiniband/hw/hfi1/init.c | 27 +- drivers/infiniband/hw/hfi1/qsfp.c | 405 +++++++++++++++++++++-------- drivers/infiniband/hw/hfi1/qsfp.h | 3 + 6 files changed, 340 insertions(+), 114 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/Kconfig b/drivers/infiniband/hw/hfi1/Kconfig index a925fb0db706..bac18607fd5b 100644 --- a/drivers/infiniband/hw/hfi1/Kconfig +++ b/drivers/infiniband/hw/hfi1/Kconfig @@ -1,8 +1,9 @@ config INFINIBAND_HFI1 tristate "Intel OPA Gen1 support" - depends on X86_64 && INFINIBAND_RDMAVT + depends on X86_64 && INFINIBAND_RDMAVT && I2C select MMU_NOTIFIER select CRC32 + select I2C_ALGOBIT default m ---help--- This is a low-level driver for Intel OPA Gen1 adapter. 
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 0de6c0ca7078..22bfe0e0ce4e 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -14165,6 +14165,11 @@ static int init_asic_data(struct hfi1_devdata *dd) } dd->asic_data->dds[dd->hfi1_id] = dd; /* self back-pointer */ spin_unlock_irqrestore(&hfi1_devs_lock, flags); + + /* first one through - set up i2c devices */ + if (!peer) + ret = set_up_i2c(dd, dd->asic_data); + return ret; } diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index fd67e98e3178..c433eb8d5729 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -62,6 +62,8 @@ #include #include #include +#include +#include #include #include "chip_registers.h" @@ -805,10 +807,19 @@ struct hfi1_temp { u8 triggers; /* temperature triggers */ }; +struct hfi1_i2c_bus { + struct hfi1_devdata *controlling_dd; /* current controlling device */ + struct i2c_adapter adapter; /* bus details */ + struct i2c_algo_bit_data algo; /* bus algorithm details */ + int num; /* bus number, 0 or 1 */ +}; + /* common data between shared ASIC HFIs */ struct hfi1_asic_data { struct hfi1_devdata *dds[2]; /* back pointers */ struct mutex asic_resource_mutex; + struct hfi1_i2c_bus *i2c_bus0; + struct hfi1_i2c_bus *i2c_bus1; }; /* device data struct now contains only "general per-device" info. diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 1620d6882d10..ec77c7edb025 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -973,34 +973,45 @@ void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) /* * Release our hold on the shared asic data. If we are the last one, - * free the structure. Must be holding hfi1_devs_lock. + * return the structure to be finalized outside the lock. Must be + * holding hfi1_devs_lock. 
*/ -static void release_asic_data(struct hfi1_devdata *dd) +static struct hfi1_asic_data *release_asic_data(struct hfi1_devdata *dd) { + struct hfi1_asic_data *ad; int other; if (!dd->asic_data) - return; + return NULL; dd->asic_data->dds[dd->hfi1_id] = NULL; other = dd->hfi1_id ? 0 : 1; - if (!dd->asic_data->dds[other]) { - /* we are the last holder, free it */ - kfree(dd->asic_data); - } + ad = dd->asic_data; dd->asic_data = NULL; + /* return NULL if the other dd still has a link */ + return ad->dds[other] ? NULL : ad; +} + +static void finalize_asic_data(struct hfi1_devdata *dd, + struct hfi1_asic_data *ad) +{ + clean_up_i2c(dd, ad); + kfree(ad); } static void __hfi1_free_devdata(struct kobject *kobj) { struct hfi1_devdata *dd = container_of(kobj, struct hfi1_devdata, kobj); + struct hfi1_asic_data *ad; unsigned long flags; spin_lock_irqsave(&hfi1_devs_lock, flags); idr_remove(&hfi1_unit_table, dd->unit); list_del(&dd->list); - release_asic_data(dd); + ad = release_asic_data(dd); spin_unlock_irqrestore(&hfi1_devs_lock, flags); + if (ad) + finalize_asic_data(dd, ad); free_platform_config(dd); rcu_barrier(); /* wait for rcu callbacks to complete */ free_percpu(dd->int_counter); diff --git a/drivers/infiniband/hw/hfi1/qsfp.c b/drivers/infiniband/hw/hfi1/qsfp.c index 6fca2a09b5f1..a207717ade2a 100644 --- a/drivers/infiniband/hw/hfi1/qsfp.c +++ b/drivers/infiniband/hw/hfi1/qsfp.c @@ -50,46 +50,285 @@ #include #include "hfi.h" -#include "twsi.h" + +/* for the given bus number, return the CSR for reading an i2c line */ +static inline u32 i2c_in_csr(u32 bus_num) +{ + return bus_num ? ASIC_QSFP2_IN : ASIC_QSFP1_IN; +} + +/* for the given bus number, return the CSR for writing an i2c line */ +static inline u32 i2c_oe_csr(u32 bus_num) +{ + return bus_num ? 
ASIC_QSFP2_OE : ASIC_QSFP1_OE; +} + +static void hfi1_setsda(void *data, int state) +{ + struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data; + struct hfi1_devdata *dd = bus->controlling_dd; + u64 reg; + u32 target_oe; + + target_oe = i2c_oe_csr(bus->num); + reg = read_csr(dd, target_oe); + /* + * The OE bit value is inverted and connected to the pin. When + * OE is 0 the pin is left to be pulled up, when the OE is 1 + * the pin is driven low. This matches the "open drain" or "open + * collector" convention. + */ + if (state) + reg &= ~QSFP_HFI0_I2CDAT; + else + reg |= QSFP_HFI0_I2CDAT; + write_csr(dd, target_oe, reg); + /* do a read to force the write into the chip */ + (void)read_csr(dd, target_oe); +} + +static void hfi1_setscl(void *data, int state) +{ + struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data; + struct hfi1_devdata *dd = bus->controlling_dd; + u64 reg; + u32 target_oe; + + target_oe = i2c_oe_csr(bus->num); + reg = read_csr(dd, target_oe); + /* + * The OE bit value is inverted and connected to the pin. When + * OE is 0 the pin is left to be pulled up, when the OE is 1 + * the pin is driven low. This matches the "open drain" or "open + * collector" convention. 
+ */ + if (state) + reg &= ~QSFP_HFI0_I2CCLK; + else + reg |= QSFP_HFI0_I2CCLK; + write_csr(dd, target_oe, reg); + /* do a read to force the write into the chip */ + (void)read_csr(dd, target_oe); +} + +static int hfi1_getsda(void *data) +{ + struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data; + u64 reg; + u32 target_in; + + hfi1_setsda(data, 1); /* clear OE so we do not pull line down */ + udelay(2); /* 1us pull up + 250ns hold */ + + target_in = i2c_in_csr(bus->num); + reg = read_csr(bus->controlling_dd, target_in); + return !!(reg & QSFP_HFI0_I2CDAT); +} + +static int hfi1_getscl(void *data) +{ + struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data; + u64 reg; + u32 target_in; + + hfi1_setscl(data, 1); /* clear OE so we do not pull line down */ + udelay(2); /* 1us pull up + 250ns hold */ + + target_in = i2c_in_csr(bus->num); + reg = read_csr(bus->controlling_dd, target_in); + return !!(reg & QSFP_HFI0_I2CCLK); +} /* - * QSFP support for hfi driver, using "Two Wire Serial Interface" driver - * in twsi.c + * Allocate and initialize the given i2c bus number. + * Returns NULL on failure. 
*/ -#define I2C_MAX_RETRY 4 +static struct hfi1_i2c_bus *init_i2c_bus(struct hfi1_devdata *dd, + struct hfi1_asic_data *ad, int num) +{ + struct hfi1_i2c_bus *bus; + int ret; + + bus = kzalloc(sizeof(*bus), GFP_KERNEL); + if (!bus) + return NULL; + + bus->controlling_dd = dd; + bus->num = num; /* our bus number */ + + bus->algo.setsda = hfi1_setsda; + bus->algo.setscl = hfi1_setscl; + bus->algo.getsda = hfi1_getsda; + bus->algo.getscl = hfi1_getscl; + bus->algo.udelay = 5; + bus->algo.timeout = usecs_to_jiffies(50); + bus->algo.data = bus; + + bus->adapter.owner = THIS_MODULE; + bus->adapter.algo_data = &bus->algo; + bus->adapter.dev.parent = &dd->pcidev->dev; + snprintf(bus->adapter.name, sizeof(bus->adapter.name), + "hfi1_i2c%d", num); + + ret = i2c_bit_add_bus(&bus->adapter); + if (ret) { + dd_dev_info(dd, "%s: unable to add i2c bus %d, err %d\n", + __func__, num, ret); + kfree(bus); + return NULL; + } + + return bus; +} + +/* + * Initialize i2c buses. + * Return 0 on success, -errno on error. 
+ */ +int set_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad) +{ + ad->i2c_bus0 = init_i2c_bus(dd, ad, 0); + ad->i2c_bus1 = init_i2c_bus(dd, ad, 1); + if (!ad->i2c_bus0 || !ad->i2c_bus1) + return -ENOMEM; + return 0; +}; + +static void clean_i2c_bus(struct hfi1_i2c_bus *bus) +{ + if (bus) { + i2c_del_adapter(&bus->adapter); + kfree(bus); + } +} + +void clean_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad) +{ + clean_i2c_bus(ad->i2c_bus0); + ad->i2c_bus0 = NULL; + clean_i2c_bus(ad->i2c_bus1); + ad->i2c_bus1 = NULL; +} + +static int i2c_bus_write(struct hfi1_devdata *dd, struct hfi1_i2c_bus *i2c, + u8 slave_addr, int offset, int offset_size, + u8 *data, u16 len) +{ + int ret; + int num_msgs; + u8 offset_bytes[2]; + struct i2c_msg msgs[2]; + + switch (offset_size) { + case 0: + num_msgs = 1; + msgs[0].addr = slave_addr; + msgs[0].flags = 0; + msgs[0].len = len; + msgs[0].buf = data; + break; + case 2: + offset_bytes[1] = (offset >> 8) & 0xff; + /* fall through */ + case 1: + num_msgs = 2; + offset_bytes[0] = offset & 0xff; + + msgs[0].addr = slave_addr; + msgs[0].flags = 0; + msgs[0].len = offset_size; + msgs[0].buf = offset_bytes; + + msgs[1].addr = slave_addr; + msgs[1].flags = I2C_M_NOSTART, + msgs[1].len = len; + msgs[1].buf = data; + break; + default: + return -EINVAL; + } + + i2c->controlling_dd = dd; + ret = i2c_transfer(&i2c->adapter, msgs, num_msgs); + if (ret != num_msgs) { + dd_dev_err(dd, "%s: bus %d, i2c slave 0x%x, offset 0x%x, len 0x%x; write failed, ret %d\n", + __func__, i2c->num, slave_addr, offset, len, ret); + return ret < 0 ? 
ret : -EIO; + } + return 0; +} + +static int i2c_bus_read(struct hfi1_devdata *dd, struct hfi1_i2c_bus *bus, + u8 slave_addr, int offset, int offset_size, + u8 *data, u16 len) +{ + int ret; + int num_msgs; + u8 offset_bytes[2]; + struct i2c_msg msgs[2]; + + switch (offset_size) { + case 0: + num_msgs = 1; + msgs[0].addr = slave_addr; + msgs[0].flags = I2C_M_RD; + msgs[0].len = len; + msgs[0].buf = data; + break; + case 2: + offset_bytes[1] = (offset >> 8) & 0xff; + /* fall through */ + case 1: + num_msgs = 2; + offset_bytes[0] = offset & 0xff; + + msgs[0].addr = slave_addr; + msgs[0].flags = 0; + msgs[0].len = offset_size; + msgs[0].buf = offset_bytes; + + msgs[1].addr = slave_addr; + msgs[1].flags = I2C_M_RD, + msgs[1].len = len; + msgs[1].buf = data; + break; + default: + return -EINVAL; + } + + bus->controlling_dd = dd; + ret = i2c_transfer(&bus->adapter, msgs, num_msgs); + if (ret != num_msgs) { + dd_dev_err(dd, "%s: bus %d, i2c slave 0x%x, offset 0x%x, len 0x%x; read failed, ret %d\n", + __func__, bus->num, slave_addr, offset, len, ret); + return ret < 0 ? ret : -EIO; + } + return 0; +} /* * Raw i2c write. No set-up or lock checking. + * + * Return 0 on success, -errno on error. */ static int __i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, void *bp, int len) { struct hfi1_devdata *dd = ppd->dd; - int ret, cnt; - u8 *buff = bp; + struct hfi1_i2c_bus *bus; + u8 slave_addr; + int offset_size; - cnt = 0; - while (cnt < len) { - int wlen = len - cnt; - - ret = hfi1_twsi_blk_wr(dd, target, i2c_addr, offset, - buff + cnt, wlen); - if (ret) { - /* hfi1_twsi_blk_wr() 1 for error, else 0 */ - return -EIO; - } - offset += wlen; - cnt += wlen; - } - - /* Must wait min 20us between qsfp i2c transactions */ - udelay(20); - - return cnt; + bus = target ? 
dd->asic_data->i2c_bus1 : dd->asic_data->i2c_bus0; + slave_addr = (i2c_addr & 0xff) >> 1; /* convert to 7-bit addr */ + offset_size = (i2c_addr >> 8) & 0x3; + return i2c_bus_write(dd, bus, slave_addr, offset, offset_size, bp, len); } /* * Caller must hold the i2c chain resource. + * + * Return number of bytes written, or -errno. */ int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, void *bp, int len) @@ -99,63 +338,36 @@ int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, if (!check_chip_resource(ppd->dd, i2c_target(target), __func__)) return -EACCES; - /* make sure the TWSI bus is in a sane state */ - ret = hfi1_twsi_reset(ppd->dd, target); - if (ret) { - hfi1_dev_porterr(ppd->dd, ppd->port, - "I2C chain %d write interface reset failed\n", - target); + ret = __i2c_write(ppd, target, i2c_addr, offset, bp, len); + if (ret) return ret; - } - return __i2c_write(ppd, target, i2c_addr, offset, bp, len); + return len; } /* * Raw i2c read. No set-up or lock checking. + * + * Return 0 on success, -errno on error. */ static int __i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, void *bp, int len) { struct hfi1_devdata *dd = ppd->dd; - int ret, cnt, pass = 0; - int orig_offset = offset; + struct hfi1_i2c_bus *bus; + u8 slave_addr; + int offset_size; - cnt = 0; - while (cnt < len) { - int rlen = len - cnt; - - ret = hfi1_twsi_blk_rd(dd, target, i2c_addr, offset, - bp + cnt, rlen); - /* Some QSFP's fail first try. 
Retry as experiment */ - if (ret && cnt == 0 && ++pass < I2C_MAX_RETRY) - continue; - if (ret) { - /* hfi1_twsi_blk_rd() 1 for error, else 0 */ - ret = -EIO; - goto exit; - } - offset += rlen; - cnt += rlen; - } - - ret = cnt; - -exit: - if (ret < 0) { - hfi1_dev_porterr(dd, ppd->port, - "I2C chain %d read failed, addr 0x%x, offset 0x%x, len %d\n", - target, i2c_addr, orig_offset, len); - } - - /* Must wait min 20us between qsfp i2c transactions */ - udelay(20); - - return ret; + bus = target ? dd->asic_data->i2c_bus1 : dd->asic_data->i2c_bus0; + slave_addr = (i2c_addr & 0xff) >> 1; /* convert to 7-bit addr */ + offset_size = (i2c_addr >> 8) & 0x3; + return i2c_bus_read(dd, bus, slave_addr, offset, offset_size, bp, len); } /* * Caller must hold the i2c chain resource. + * + * Return number of bytes read, or -errno. */ int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, void *bp, int len) @@ -165,16 +377,11 @@ int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, if (!check_chip_resource(ppd->dd, i2c_target(target), __func__)) return -EACCES; - /* make sure the TWSI bus is in a sane state */ - ret = hfi1_twsi_reset(ppd->dd, target); - if (ret) { - hfi1_dev_porterr(ppd->dd, ppd->port, - "I2C chain %d read interface reset failed\n", - target); + ret = __i2c_read(ppd, target, i2c_addr, offset, bp, len); + if (ret) return ret; - } - return __i2c_read(ppd, target, i2c_addr, offset, bp, len); + return len; } /* @@ -182,6 +389,8 @@ int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, * by writing @addr = ((256 * n) + m) * * Caller must hold the i2c chain resource. + * + * Return number of bytes written or -errno. 
*/ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int len) @@ -189,21 +398,12 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int count = 0; int offset; int nwrite; - int ret; + int ret = 0; u8 page; if (!check_chip_resource(ppd->dd, i2c_target(target), __func__)) return -EACCES; - /* make sure the TWSI bus is in a sane state */ - ret = hfi1_twsi_reset(ppd->dd, target); - if (ret) { - hfi1_dev_porterr(ppd->dd, ppd->port, - "QSFP chain %d write interface reset failed\n", - target); - return ret; - } - while (count < len) { /* * Set the qsfp page based on a zero-based address @@ -213,11 +413,12 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1); - if (ret != 1) { + /* QSFPs require a 5-10msec delay after write operations */ + mdelay(5); + if (ret) { hfi1_dev_porterr(ppd->dd, ppd->port, "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n", target, ret); - ret = -EIO; break; } @@ -229,11 +430,13 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, offset, bp + count, nwrite); - if (ret <= 0) /* stop on error or nothing written */ + /* QSFPs require a 5-10msec delay after write operations */ + mdelay(5); + if (ret) /* stop on error */ break; - count += ret; - addr += ret; + count += nwrite; + addr += nwrite; } if (ret < 0) @@ -266,6 +469,8 @@ int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, * by reading @addr = ((256 * n) + m) * * Caller must hold the i2c chain resource. + * + * Return the number of bytes read or -errno. 
*/ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int len) @@ -273,21 +478,12 @@ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int count = 0; int offset; int nread; - int ret; + int ret = 0; u8 page; if (!check_chip_resource(ppd->dd, i2c_target(target), __func__)) return -EACCES; - /* make sure the TWSI bus is in a sane state */ - ret = hfi1_twsi_reset(ppd->dd, target); - if (ret) { - hfi1_dev_porterr(ppd->dd, ppd->port, - "QSFP chain %d read interface reset failed\n", - target); - return ret; - } - while (count < len) { /* * Set the qsfp page based on a zero-based address @@ -296,11 +492,12 @@ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, page = (u8)(addr / QSFP_PAGESIZE); ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1); - if (ret != 1) { + /* QSFPs require a 5-10msec delay after write operations */ + mdelay(5); + if (ret) { hfi1_dev_porterr(ppd->dd, ppd->port, "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n", target, ret); - ret = -EIO; break; } @@ -310,15 +507,13 @@ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, if (((addr % QSFP_RW_BOUNDARY) + nread) > QSFP_RW_BOUNDARY) nread = QSFP_RW_BOUNDARY - (addr % QSFP_RW_BOUNDARY); - /* QSFPs require a 5-10msec delay after write operations */ - mdelay(5); ret = __i2c_read(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, offset, bp + count, nread); - if (ret <= 0) /* stop on error or nothing read */ + if (ret) /* stop on error */ break; - count += ret; - addr += ret; + count += nread; + addr += nread; } if (ret < 0) diff --git a/drivers/infiniband/hw/hfi1/qsfp.h b/drivers/infiniband/hw/hfi1/qsfp.h index dadc66c442b9..69275ebd9597 100644 --- a/drivers/infiniband/hw/hfi1/qsfp.h +++ b/drivers/infiniband/hw/hfi1/qsfp.h @@ -238,3 +238,6 @@ int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int len); int one_qsfp_read(struct hfi1_pportdata 
*ppd, u32 target, int addr, void *bp, int len); +struct hfi1_asic_data; +int set_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad); +void clean_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad); From e014991d0735ce4ca7473f9430ce71847fdc7e2f Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Fri, 1 Jul 2016 16:01:50 -0700 Subject: [PATCH 17/84] IB/hfi1: Remove TWSI references Remove the TWSI code. The driver now uses the kernel's built-in i2c bit bus module. Cc: Jason Gunthorpe Reviewed-by: Easwar Hariharan Reviewed-by: Dennis Dalessandro Signed-off-by: Dean Luick Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/Makefile | 2 +- drivers/infiniband/hw/hfi1/chip.c | 31 -- drivers/infiniband/hw/hfi1/chip.h | 2 - drivers/infiniband/hw/hfi1/twsi.c | 489 ---------------------------- drivers/infiniband/hw/hfi1/twsi.h | 65 ---- 5 files changed, 1 insertion(+), 588 deletions(-) delete mode 100644 drivers/infiniband/hw/hfi1/twsi.c delete mode 100644 drivers/infiniband/hw/hfi1/twsi.h diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile index 9b5382c94b0c..0cf97a09b64b 100644 --- a/drivers/infiniband/hw/hfi1/Makefile +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -10,7 +10,7 @@ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o hfi1-y := affinity.o chip.o device.o driver.o efivar.o \ eprom.o file_ops.o firmware.o \ init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \ - qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o twsi.o \ + qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o \ uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \ verbs_txreq.o hfi1-$(CONFIG_DEBUG_FS) += debugfs.o diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 22bfe0e0ce4e..40d485b9cb75 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -12349,37 +12349,6 @@ u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd) return ib_pstate; } -/* - * 
Read/modify/write ASIC_QSFP register bits as selected by mask - * data: 0 or 1 in the positions depending on what needs to be written - * dir: 0 for read, 1 for write - * mask: select by setting - * I2CCLK (bit 0) - * I2CDATA (bit 1) - */ -u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir, - u32 mask) -{ - u64 qsfp_oe, target_oe; - - target_oe = target ? ASIC_QSFP2_OE : ASIC_QSFP1_OE; - if (mask) { - /* We are writing register bits, so lock access */ - dir &= mask; - data &= mask; - - qsfp_oe = read_csr(dd, target_oe); - qsfp_oe = (qsfp_oe & ~(u64)mask) | (u64)dir; - write_csr(dd, target_oe, qsfp_oe); - } - /* We are exclusively reading bits here, but it is unlikely - * we'll get valid data when we set the direction of the pin - * in the same call, so read should call this function again - * to get valid data - */ - return read_csr(dd, target ? ASIC_QSFP2_IN : ASIC_QSFP1_IN); -} - #define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \ (r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK) diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index 66a327978739..d0a4ddb421f7 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -1338,8 +1338,6 @@ struct hfi1_message_header *hfi1_get_msgheader( struct hfi1_devdata *dd, __le32 *rhf_addr); int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd, struct hfi1_ctxt_info *kinfo); -u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir, - u32 mask); int hfi1_init_ctxt(struct send_context *sc); void hfi1_put_tid(struct hfi1_devdata *dd, u32 index, u32 type, unsigned long pa, u16 order); diff --git a/drivers/infiniband/hw/hfi1/twsi.c b/drivers/infiniband/hw/hfi1/twsi.c deleted file mode 100644 index e82e52a63d35..000000000000 --- a/drivers/infiniband/hw/hfi1/twsi.c +++ /dev/null @@ -1,489 +0,0 @@ -/* - * Copyright(c) 2015, 2016 Intel Corporation. - * - * This file is provided under a dual BSD/GPLv2 license. 
When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * BSD LICENSE - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#include -#include -#include - -#include "hfi.h" -#include "twsi.h" - -/* - * "Two Wire Serial Interface" support. - * - * Originally written for a not-quite-i2c serial eeprom, which is - * still used on some supported boards. Later boards have added a - * variety of other uses, most board-specific, so the bit-boffing - * part has been split off to this file, while the other parts - * have been moved to chip-specific files. - * - * We have also dropped all pretense of fully generic (e.g. pretend - * we don't know whether '1' is the higher voltage) interface, as - * the restrictions of the generic i2c interface (e.g. no access from - * driver itself) make it unsuitable for this use. - */ - -#define READ_CMD 1 -#define WRITE_CMD 0 - -/** - * i2c_wait_for_writes - wait for a write - * @dd: the hfi1_ib device - * - * We use this instead of udelay directly, so we can make sure - * that previous register writes have been flushed all the way - * to the chip. Since we are delaying anyway, the cost doesn't - * hurt, and makes the bit twiddling more regular - */ -static void i2c_wait_for_writes(struct hfi1_devdata *dd, u32 target) -{ - /* - * implicit read of EXTStatus is as good as explicit - * read of scratch, if all we want to do is flush - * writes. - */ - hfi1_gpio_mod(dd, target, 0, 0, 0); - rmb(); /* inlined, so prevent compiler reordering */ -} - -/* - * QSFP modules are allowed to hold SCL low for 500uSec. 
Allow twice that - * for "almost compliant" modules - */ -#define SCL_WAIT_USEC 1000 - -/* BUF_WAIT is time bus must be free between STOP or ACK and to next START. - * Should be 20, but some chips need more. - */ -#define TWSI_BUF_WAIT_USEC 60 - -static void scl_out(struct hfi1_devdata *dd, u32 target, u8 bit) -{ - u32 mask; - - udelay(1); - - mask = QSFP_HFI0_I2CCLK; - - /* SCL is meant to be bare-drain, so never set "OUT", just DIR */ - hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask); - - /* - * Allow for slow slaves by simple - * delay for falling edge, sampling on rise. - */ - if (!bit) { - udelay(2); - } else { - int rise_usec; - - for (rise_usec = SCL_WAIT_USEC; rise_usec > 0; rise_usec -= 2) { - if (mask & hfi1_gpio_mod(dd, target, 0, 0, 0)) - break; - udelay(2); - } - if (rise_usec <= 0) - dd_dev_err(dd, "SCL interface stuck low > %d uSec\n", - SCL_WAIT_USEC); - } - i2c_wait_for_writes(dd, target); -} - -static u8 scl_in(struct hfi1_devdata *dd, u32 target, int wait) -{ - u32 read_val, mask; - - mask = QSFP_HFI0_I2CCLK; - /* SCL is meant to be bare-drain, so never set "OUT", just DIR */ - hfi1_gpio_mod(dd, target, 0, 0, mask); - read_val = hfi1_gpio_mod(dd, target, 0, 0, 0); - if (wait) - i2c_wait_for_writes(dd, target); - return (read_val & mask) >> GPIO_SCL_NUM; -} - -static void sda_out(struct hfi1_devdata *dd, u32 target, u8 bit) -{ - u32 mask; - - mask = QSFP_HFI0_I2CDAT; - - /* SDA is meant to be bare-drain, so never set "OUT", just DIR */ - hfi1_gpio_mod(dd, target, 0, bit ? 
0 : mask, mask); - - i2c_wait_for_writes(dd, target); - udelay(2); -} - -static u8 sda_in(struct hfi1_devdata *dd, u32 target, int wait) -{ - u32 read_val, mask; - - mask = QSFP_HFI0_I2CDAT; - /* SDA is meant to be bare-drain, so never set "OUT", just DIR */ - hfi1_gpio_mod(dd, target, 0, 0, mask); - read_val = hfi1_gpio_mod(dd, target, 0, 0, 0); - if (wait) - i2c_wait_for_writes(dd, target); - return (read_val & mask) >> GPIO_SDA_NUM; -} - -/** - * i2c_ackrcv - see if ack following write is true - * @dd: the hfi1_ib device - */ -static int i2c_ackrcv(struct hfi1_devdata *dd, u32 target) -{ - u8 ack_received; - - /* AT ENTRY SCL = LOW */ - /* change direction, ignore data */ - ack_received = sda_in(dd, target, 1); - scl_out(dd, target, 1); - ack_received = sda_in(dd, target, 1) == 0; - scl_out(dd, target, 0); - return ack_received; -} - -static void stop_cmd(struct hfi1_devdata *dd, u32 target); - -/** - * rd_byte - read a byte, sending STOP on last, else ACK - * @dd: the hfi1_ib device - * - * Returns byte shifted out of device - */ -static int rd_byte(struct hfi1_devdata *dd, u32 target, int last) -{ - int bit_cntr, data; - - data = 0; - - for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) { - data <<= 1; - scl_out(dd, target, 1); - data |= sda_in(dd, target, 0); - scl_out(dd, target, 0); - } - if (last) { - scl_out(dd, target, 1); - stop_cmd(dd, target); - } else { - sda_out(dd, target, 0); - scl_out(dd, target, 1); - scl_out(dd, target, 0); - sda_out(dd, target, 1); - } - return data; -} - -/** - * wr_byte - write a byte, one bit at a time - * @dd: the hfi1_ib device - * @data: the byte to write - * - * Returns 0 if we got the following ack, otherwise 1 - */ -static int wr_byte(struct hfi1_devdata *dd, u32 target, u8 data) -{ - int bit_cntr; - u8 bit; - - for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) { - bit = (data >> bit_cntr) & 1; - sda_out(dd, target, bit); - scl_out(dd, target, 1); - scl_out(dd, target, 0); - } - return (!i2c_ackrcv(dd, target)) ? 
1 : 0; -} - -/* - * issue TWSI start sequence: - * (both clock/data high, clock high, data low while clock is high) - */ -static void start_seq(struct hfi1_devdata *dd, u32 target) -{ - sda_out(dd, target, 1); - scl_out(dd, target, 1); - sda_out(dd, target, 0); - udelay(1); - scl_out(dd, target, 0); -} - -/** - * stop_seq - transmit the stop sequence - * @dd: the hfi1_ib device - * - * (both clock/data low, clock high, data high while clock is high) - */ -static void stop_seq(struct hfi1_devdata *dd, u32 target) -{ - scl_out(dd, target, 0); - sda_out(dd, target, 0); - scl_out(dd, target, 1); - sda_out(dd, target, 1); -} - -/** - * stop_cmd - transmit the stop condition - * @dd: the hfi1_ib device - * - * (both clock/data low, clock high, data high while clock is high) - */ -static void stop_cmd(struct hfi1_devdata *dd, u32 target) -{ - stop_seq(dd, target); - udelay(TWSI_BUF_WAIT_USEC); -} - -/** - * hfi1_twsi_reset - reset I2C communication - * @dd: the hfi1_ib device - * returns 0 if ok, -EIO on error - */ -int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target) -{ - int clock_cycles_left = 9; - u32 mask; - - /* Both SCL and SDA should be high. If not, there - * is something wrong. - */ - mask = QSFP_HFI0_I2CCLK | QSFP_HFI0_I2CDAT; - - /* - * Force pins to desired innocuous state. - * This is the default power-on state with out=0 and dir=0, - * So tri-stated and should be floating high (barring HW problems) - */ - hfi1_gpio_mod(dd, target, 0, 0, mask); - - /* Check if SCL is low, if it is low then we have a slave device - * misbehaving and there is not much we can do. 
- */ - if (!scl_in(dd, target, 0)) - return -EIO; - - /* Check if SDA is low, if it is low then we have to clock SDA - * up to 9 times for the device to release the bus - */ - while (clock_cycles_left--) { - if (sda_in(dd, target, 0)) - return 0; - scl_out(dd, target, 0); - scl_out(dd, target, 1); - } - - return -EIO; -} - -#define HFI1_TWSI_START 0x100 -#define HFI1_TWSI_STOP 0x200 - -/* Write byte to TWSI, optionally prefixed with START or suffixed with - * STOP. - * returns 0 if OK (ACK received), else != 0 - */ -static int twsi_wr(struct hfi1_devdata *dd, u32 target, int data, int flags) -{ - int ret = 1; - - if (flags & HFI1_TWSI_START) - start_seq(dd, target); - - /* Leaves SCL low (from i2c_ackrcv()) */ - ret = wr_byte(dd, target, data); - - if (flags & HFI1_TWSI_STOP) - stop_cmd(dd, target); - return ret; -} - -/* Added functionality for IBA7220-based cards */ -#define HFI1_TEMP_DEV 0x98 - -/* - * hfi1_twsi_blk_rd - * General interface for data transfer from twsi devices. - * One vestige of its former role is that it recognizes a device - * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part, - * which responded to all TWSI device codes, interpreting them as - * address within device. On all other devices found on board handled by - * this driver, the device is followed by a N-byte "address" which selects - * the "register" or "offset" within the device from which data should - * be read. 
- */ -int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr, - void *buffer, int len) -{ - u8 *bp = buffer; - int ret = 1; - int i; - int offset_size; - - /* obtain the offset size, strip it from the device address */ - offset_size = (dev >> 8) & 0xff; - dev &= 0xff; - - /* allow at most a 2 byte offset */ - if (offset_size > 2) - goto bail; - - if (dev == HFI1_TWSI_NO_DEV) { - /* legacy not-really-I2C */ - addr = (addr << 1) | READ_CMD; - ret = twsi_wr(dd, target, addr, HFI1_TWSI_START); - } else { - /* Actual I2C */ - if (offset_size) { - ret = twsi_wr(dd, target, - dev | WRITE_CMD, HFI1_TWSI_START); - if (ret) { - stop_cmd(dd, target); - goto bail; - } - - for (i = 0; i < offset_size; i++) { - ret = twsi_wr(dd, target, - (addr >> (i * 8)) & 0xff, 0); - udelay(TWSI_BUF_WAIT_USEC); - if (ret) { - dd_dev_err(dd, "Failed to write byte %d of offset 0x%04X\n", - i, addr); - goto bail; - } - } - } - ret = twsi_wr(dd, target, dev | READ_CMD, HFI1_TWSI_START); - } - if (ret) { - stop_cmd(dd, target); - goto bail; - } - - /* - * block devices keeps clocking data out as long as we ack, - * automatically incrementing the address. Some have "pages" - * whose boundaries will not be crossed, but the handling - * of these is left to the caller, who is in a better - * position to know. - */ - while (len-- > 0) { - /* - * Get and store data, sending ACK if length remaining, - * else STOP - */ - *bp++ = rd_byte(dd, target, !len); - } - - ret = 0; - -bail: - return ret; -} - -/* - * hfi1_twsi_blk_wr - * General interface for data transfer to twsi devices. - * One vestige of its former role is that it recognizes a device - * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part, - * which responded to all TWSI device codes, interpreting them as - * address within device. 
On all other devices found on board handled by - * this driver, the device is followed by a N-byte "address" which selects - * the "register" or "offset" within the device to which data should - * be written. - */ -int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr, - const void *buffer, int len) -{ - const u8 *bp = buffer; - int ret = 1; - int i; - int offset_size; - - /* obtain the offset size, strip it from the device address */ - offset_size = (dev >> 8) & 0xff; - dev &= 0xff; - - /* allow at most a 2 byte offset */ - if (offset_size > 2) - goto bail; - - if (dev == HFI1_TWSI_NO_DEV) { - if (twsi_wr(dd, target, (addr << 1) | WRITE_CMD, - HFI1_TWSI_START)) { - goto failed_write; - } - } else { - /* Real I2C */ - if (twsi_wr(dd, target, dev | WRITE_CMD, HFI1_TWSI_START)) - goto failed_write; - } - - for (i = 0; i < offset_size; i++) { - ret = twsi_wr(dd, target, (addr >> (i * 8)) & 0xff, 0); - udelay(TWSI_BUF_WAIT_USEC); - if (ret) { - dd_dev_err(dd, "Failed to write byte %d of offset 0x%04X\n", - i, addr); - goto bail; - } - } - - for (i = 0; i < len; i++) - if (twsi_wr(dd, target, *bp++, 0)) - goto failed_write; - - ret = 0; - -failed_write: - stop_cmd(dd, target); - -bail: - return ret; -} diff --git a/drivers/infiniband/hw/hfi1/twsi.h b/drivers/infiniband/hw/hfi1/twsi.h deleted file mode 100644 index 5b8a5b5e7eae..000000000000 --- a/drivers/infiniband/hw/hfi1/twsi.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef _TWSI_H -#define _TWSI_H -/* - * Copyright(c) 2015, 2016 Intel Corporation. - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * BSD LICENSE - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - */ - -#define HFI1_TWSI_NO_DEV 0xFF - -struct hfi1_devdata; - -/* Bit position of SDA/SCL pins in ASIC_QSFP* registers */ -#define GPIO_SDA_NUM 1 -#define GPIO_SCL_NUM 0 - -/* these functions must be called with qsfp_lock held */ -int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target); -int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr, - void *buffer, int len); -int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr, - const void *buffer, int len); - -#endif /* _TWSI_H */ From 14833b8c52424eafb962f9ce7d8f1c01c14ee41f Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Fri, 1 Jul 2016 16:01:56 -0700 Subject: [PATCH 18/84] IB/hfi1: Improve SDMA engine assignment for user SDMA Currently each user context is assigned a single SDMA engine based on the VL, context id, and subcontext id. That means for MPI applications, each rank can only use one SDMA engine for all messages. This may create unwanted backup for independent messages going to different destinations upon congestion at one destination. This patch adds the packet "dlid" to the formula of SDMA engine selection for user SDMA requests. A simple hash table is used to maintain even distribution among the available SDMA engines regardless how the "dlid" values are distributed. 
Reviewed-by: Dean Luick Reviewed-by: Tadeusz Struk Signed-off-by: Jianxin Xiong Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/user_sdma.c | 29 +++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index 47ffd273ecbd..d16ed52a2cb1 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -496,6 +496,27 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd) return 0; } +static u8 dlid_to_selector(u16 dlid) +{ + static u8 mapping[256]; + static int initialized; + static u8 next; + int hash; + + if (!initialized) { + memset(mapping, 0xFF, 256); + initialized = 1; + } + + hash = ((dlid >> 8) ^ dlid) & 0xFF; + if (mapping[hash] == 0xFF) { + mapping[hash] = next; + next = (next + 1) & 0x7F; + } + + return mapping[hash]; +} + int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, unsigned long dim, unsigned long *count) { @@ -511,6 +532,8 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, struct user_sdma_request *req; u8 opcode, sc, vl; int req_queued = 0; + u16 dlid; + u8 selector; if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) { hfi1_cdbg( @@ -686,9 +709,13 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, idx++; } + dlid = be16_to_cpu(req->hdr.lrh[1]); + selector = dlid_to_selector(dlid); + /* Have to select the engine */ req->sde = sdma_select_engine_vl(dd, - (u32)(uctxt->ctxt + fd->subctxt), + (u32)(uctxt->ctxt + fd->subctxt + + selector), vl); if (!req->sde || !sdma_running(req->sde)) { ret = -ECOMM; From 71e68e3db8008d89d4eb25483ea68d752015e9d8 Mon Sep 17 00:00:00 2001 From: Jakub Pawlak Date: Fri, 1 Jul 2016 16:02:02 -0700 Subject: [PATCH 19/84] IB/hfi1: Correct receive packet handler assignment Prevent processing receive packet in case when opcode is accepted by QP but handler for 
this type of packet is not defined. Reviewed-by: Mike Marciniszyn Signed-off-by: Jakub Pawlak Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/verbs.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 849c4b9399d4..6ad3f9de587c 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -540,19 +540,15 @@ void hfi1_skip_sge(struct rvt_sge_state *ss, u32 length, int release) /* * Make sure the QP is ready and able to accept the given opcode. */ -static inline int qp_ok(int opcode, struct hfi1_packet *packet) +static inline opcode_handler qp_ok(int opcode, struct hfi1_packet *packet) { - struct hfi1_ibport *ibp; - if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK)) - goto dropit; + return NULL; if (((opcode & RVT_OPCODE_QP_MASK) == packet->qp->allowed_ops) || (opcode == IB_OPCODE_CNP)) - return 1; -dropit: - ibp = &packet->rcd->ppd->ibport_data; - ibp->rvp.n_pkt_drops++; - return 0; + return opcode_handler_tbl[opcode]; + + return NULL; } /** @@ -571,6 +567,7 @@ void hfi1_ib_rcv(struct hfi1_packet *packet) struct hfi1_pportdata *ppd = rcd->ppd; struct hfi1_ibport *ibp = &ppd->ibport_data; struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi; + opcode_handler packet_handler; unsigned long flags; u32 qp_num; int lnh; @@ -616,8 +613,11 @@ void hfi1_ib_rcv(struct hfi1_packet *packet) list_for_each_entry_rcu(p, &mcast->qp_list, list) { packet->qp = p->qp; spin_lock_irqsave(&packet->qp->r_lock, flags); - if (likely((qp_ok(opcode, packet)))) - opcode_handler_tbl[opcode](packet); + packet_handler = qp_ok(opcode, packet); + if (likely(packet_handler)) + packet_handler(packet); + else + ibp->rvp.n_pkt_drops++; spin_unlock_irqrestore(&packet->qp->r_lock, flags); } /* @@ -634,8 +634,11 @@ void hfi1_ib_rcv(struct hfi1_packet *packet) goto drop; } 
spin_lock_irqsave(&packet->qp->r_lock, flags); - if (likely((qp_ok(opcode, packet)))) - opcode_handler_tbl[opcode](packet); + packet_handler = qp_ok(opcode, packet); + if (likely(packet_handler)) + packet_handler(packet); + else + ibp->rvp.n_pkt_drops++; spin_unlock_irqrestore(&packet->qp->r_lock, flags); rcu_read_unlock(); } From afcf8f7647780aa147ad68be48d223cd50311b9a Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 1 Jul 2016 16:02:07 -0700 Subject: [PATCH 20/84] IB/rdmavt: Add data structures and routines for table driven post send Add flexibility for driver dependent operations in post send because different drivers will have differing post send operation support. This includes data structure definitions to support a table driven scheme along with the necessary validation routine using the new table. Reviewed-by: Ashutosh Dixit Reviewed-by: Jianxin Xiong Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/qp.c | 67 ++++++++++++++++++++++++++++--- include/rdma/rdma_vt.h | 3 ++ include/rdma/rdmavt_qp.h | 28 +++++++++++-- 3 files changed, 89 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 41ba7e9cadaa..d2b5b547c5a0 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -613,6 +613,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device); void *priv = NULL; gfp_t gfp; + size_t sqsize; if (!rdi) return ERR_PTR(-EINVAL); @@ -643,7 +644,8 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, init_attr->cap.max_recv_wr == 0) return ERR_PTR(-EINVAL); } - + sqsize = + init_attr->cap.max_send_wr + 1; switch (init_attr->qp_type) { case IB_QPT_SMI: case IB_QPT_GSI: @@ -658,11 +660,11 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, sizeof(struct rvt_swqe); if (gfp == GFP_NOIO) swq = __vmalloc( - 
(init_attr->cap.max_send_wr + 1) * sz, + sqsize * sz, gfp | __GFP_ZERO, PAGE_KERNEL); else swq = vzalloc_node( - (init_attr->cap.max_send_wr + 1) * sz, + sqsize * sz, rdi->dparms.node); if (!swq) return ERR_PTR(-ENOMEM); @@ -747,7 +749,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, INIT_LIST_HEAD(&qp->rspwait); qp->state = IB_QPS_RESET; qp->s_wq = swq; - qp->s_size = init_attr->cap.max_send_wr + 1; + qp->s_size = sqsize; qp->s_avail = init_attr->cap.max_send_wr; qp->s_max_sge = init_attr->cap.max_send_sge; if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR) @@ -1440,12 +1442,65 @@ int rvt_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, } /** - * qp_get_savail - return number of avail send entries + * rvt_qp_valid_operation - validate post send wr request + * @qp - the qp + * @post_parms - the post send table for the driver + * @wr - the work request * + * The routine validates the operation based on the + * validation table and returns the length of the operation + * which can extend beyond the ib_send_wr. Operation + * dependent flags key atomic operation validation. + * + * There is an exception for UD qps that validates the pd and + * overrides the length to include the additional UD specific + * length. + * + * Returns a negative error or the length of the work request + * for building the swqe.
+ */ +static inline int rvt_qp_valid_operation( + struct rvt_qp *qp, + const struct rvt_operation_params *post_parms, + struct ib_send_wr *wr) +{ + int len; + + if (wr->opcode >= RVT_OPERATION_MAX || !post_parms[wr->opcode].length) + return -EINVAL; + if (!(post_parms[wr->opcode].qpt_support & BIT(qp->ibqp.qp_type))) + return -EINVAL; + if ((post_parms[wr->opcode].flags & RVT_OPERATION_PRIV) && + ibpd_to_rvtpd(qp->ibqp.pd)->user) + return -EINVAL; + if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC_SGE && + (wr->num_sge == 0 || + wr->sg_list[0].length < sizeof(u64) || + wr->sg_list[0].addr & (sizeof(u64) - 1))) + return -EINVAL; + if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC && + !qp->s_max_rd_atomic) + return -EINVAL; + len = post_parms[wr->opcode].length; + /* UD specific */ + if (qp->ibqp.qp_type != IB_QPT_UC && + qp->ibqp.qp_type != IB_QPT_RC) { + if (qp->ibqp.pd != ud_wr(wr)->ah->pd) + return -EINVAL; + len = sizeof(struct ib_ud_wr); + } + return len; +} + +/** + * qp_get_savail - return number of avail send entries * @qp - the qp * * This assumes the s_hlock is held but the s_last * qp variable is uncontrolled. + * + * The return is adjusted to not count device specific + * reserved operations. */ static inline u32 qp_get_savail(struct rvt_qp *qp) { @@ -1481,6 +1536,8 @@ static int rvt_post_one_wr(struct rvt_qp *qp, u8 log_pmtu; int ret; + BUILD_BUG_ON(IB_QPT_MAX >= (sizeof(u32) * BITS_PER_BYTE)); + /* IB spec says that num_sge == 0 is OK. 
*/ if (unlikely(wr->num_sge > qp->s_max_sge)) return -EINVAL; diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 9c9a27d42aaa..3a70dc047314 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -351,6 +351,9 @@ struct rvt_dev_info { /* Driver specific properties */ struct rvt_driver_params dparms; + /* post send table */ + const struct rvt_operation_params *post_parms; + struct rvt_mregion __rcu *dma_mr; struct rvt_lkey_table lkey_table; diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 6d23b879416a..a90d1e941504 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -228,11 +228,31 @@ struct rvt_ack_entry { #define RC_QP_SCALING_INTERVAL 5 -/* - * Variables prefixed with s_ are for the requester (sender). - * Variables prefixed with r_ are for the responder (receiver). - * Variables prefixed with ack_ are for responder replies. +#define RVT_OPERATION_PRIV 0x00000001 +#define RVT_OPERATION_ATOMIC 0x00000002 +#define RVT_OPERATION_ATOMIC_SGE 0x00000004 + +#define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1) + +/** + * rvt_operation_params - op table entry + * @length - the length to copy into the swqe entry + * @qpt_support - a bit mask indicating QP type support + * @flags - RVT_OPERATION flags (see above) * + * This supports table driven post send so that + * the driver can have differing and potentially + * different sets of operations. + * + **/ + +struct rvt_operation_params { + size_t length; + u32 qpt_support; + u32 flags; +}; + +/* * Common variables are protected by both r_rq.lock and s_lock in that order * which only happens in modify_qp() or changing the QP 'state'. */ From 1ac57c50e96286a221f0598dfc1f7dce70f6c2d8 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 1 Jul 2016 16:02:13 -0700 Subject: [PATCH 21/84] IB/hfi1: Add hfi1 post send tables Add initial table for table driven post_send support.
Reviewed-by: Jianxin Xiong Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/qp.c | 44 ++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/qp.h | 2 ++ drivers/infiniband/hw/hfi1/verbs.c | 3 ++ 3 files changed, 49 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 1a942ffba4cb..a8b3fc9c91c8 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -52,6 +52,7 @@ #include #include #include +#include #include "hfi.h" #include "qp.h" @@ -115,6 +116,49 @@ static const u16 credit_table[31] = { 32768 /* 1E */ }; +const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = { +[IB_WR_RDMA_WRITE] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_RDMA_READ] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_ATOMIC, +}, + +[IB_WR_ATOMIC_CMP_AND_SWP] = { + .length = sizeof(struct ib_atomic_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE, +}, + +[IB_WR_ATOMIC_FETCH_AND_ADD] = { + .length = sizeof(struct ib_atomic_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE, +}, + +[IB_WR_RDMA_WRITE_WITH_IMM] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_SEND] = { + .length = sizeof(struct ib_send_wr), + .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) | + BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_SEND_WITH_IMM] = { + .length = sizeof(struct ib_send_wr), + .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) | + BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +}; + static void flush_tx_list(struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; diff --git a/drivers/infiniband/hw/hfi1/qp.h 
b/drivers/infiniband/hw/hfi1/qp.h index e7bc8d6cf681..ddf82988b02f 100644 --- a/drivers/infiniband/hw/hfi1/qp.h +++ b/drivers/infiniband/hw/hfi1/qp.h @@ -54,6 +54,8 @@ extern unsigned int hfi1_qp_table_size; +extern const struct rvt_operation_params hfi1_post_parms[]; + /* * free_ahg - clear ahg from QP */ diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 6ad3f9de587c..a89055fa72d8 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1683,6 +1683,9 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.dparms.nports = dd->num_pports; dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd); + /* post send table */ + dd->verbs_dev.rdi.post_parms = hfi1_post_parms; + ppd = dd->pport; for (i = 0; i < dd->num_pports; i++, ppd++) rvt_init_port(&dd->verbs_dev.rdi, From 9ec4faa391470def4f70ec3dc3963c76eebcd5f3 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 1 Jul 2016 16:02:18 -0700 Subject: [PATCH 22/84] IB/qib: Add qib post send table Add initial table for table driven post_send support. 
Reviewed-by: Jianxin Xiong Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/qib/qib_qp.c | 43 +++++++++++++++++++++++++++ drivers/infiniband/hw/qib/qib_verbs.c | 2 ++ drivers/infiniband/hw/qib/qib_verbs.h | 2 ++ 3 files changed, 47 insertions(+) diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index 575b737d9ef3..9cc0aae1d781 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -106,6 +106,49 @@ static u32 credit_table[31] = { 32768 /* 1E */ }; +const struct rvt_operation_params qib_post_parms[RVT_OPERATION_MAX] = { +[IB_WR_RDMA_WRITE] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_RDMA_READ] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_ATOMIC, +}, + +[IB_WR_ATOMIC_CMP_AND_SWP] = { + .length = sizeof(struct ib_atomic_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE, +}, + +[IB_WR_ATOMIC_FETCH_AND_ADD] = { + .length = sizeof(struct ib_atomic_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE, +}, + +[IB_WR_RDMA_WRITE_WITH_IMM] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_SEND] = { + .length = sizeof(struct ib_send_wr), + .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) | + BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_SEND_WITH_IMM] = { + .length = sizeof(struct ib_send_wr), + .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) | + BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +}; + static void get_map_page(struct rvt_qpn_table *qpt, struct rvt_qpn_map *map, gfp_t gfp) { diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index cbf6200e6afc..fd1dfbce5539 100644 
--- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1582,6 +1582,8 @@ static void qib_fill_device_attr(struct qib_devdata *dd) rdi->dparms.props.max_total_mcast_qp_attach = rdi->dparms.props.max_mcast_qp_attach * rdi->dparms.props.max_mcast_grp; + /* post send table */ + dd->verbs_dev.rdi.post_parms = qib_post_parms; } /** diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 4f878151f81f..736ced684842 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -497,4 +497,6 @@ extern unsigned int ib_qib_max_srq_wrs; extern const u32 ib_qib_rnr_table[]; +extern const struct rvt_operation_params qib_post_parms[]; + #endif /* QIB_VERBS_H */ From 2821c509fcc963e2661ec1ef3aa8b3d2a64399fa Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 1 Jul 2016 16:02:24 -0700 Subject: [PATCH 23/84] IB/rdmavt: Use new driver specific post send table Change rvt_post_one_wr to use the new table mechanism for post send. Validate that each low level driver specifies the table. 
Reviewed-by: Jianxin Xiong Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/qp.c | 46 ++++++------------------------- drivers/infiniband/sw/rdmavt/vt.c | 3 +- 2 files changed, 10 insertions(+), 39 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index d2b5b547c5a0..ebc37f55ac55 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1535,6 +1535,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp, struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); u8 log_pmtu; int ret; + size_t cplen; BUILD_BUG_ON(IB_QPT_MAX >= (sizeof(u32) * BITS_PER_BYTE)); @@ -1542,32 +1543,11 @@ static int rvt_post_one_wr(struct rvt_qp *qp, if (unlikely(wr->num_sge > qp->s_max_sge)) return -EINVAL; - /* - * Don't allow RDMA reads or atomic operations on UC or - * undefined operations. - * Make sure buffer is large enough to hold the result for atomics. 
- */ - if (qp->ibqp.qp_type == IB_QPT_UC) { - if ((unsigned)wr->opcode >= IB_WR_RDMA_READ) - return -EINVAL; - } else if (qp->ibqp.qp_type != IB_QPT_RC) { - /* Check IB_QPT_SMI, IB_QPT_GSI, IB_QPT_UD opcode */ - if (wr->opcode != IB_WR_SEND && - wr->opcode != IB_WR_SEND_WITH_IMM) - return -EINVAL; - /* Check UD destination address PD */ - if (qp->ibqp.pd != ud_wr(wr)->ah->pd) - return -EINVAL; - } else if ((unsigned)wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) { - return -EINVAL; - } else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP && - (wr->num_sge == 0 || - wr->sg_list[0].length < sizeof(u64) || - wr->sg_list[0].addr & (sizeof(u64) - 1))) { - return -EINVAL; - } else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) { - return -EINVAL; - } + ret = rvt_qp_valid_operation(qp, rdi->post_parms, wr); + if (ret < 0) + return ret; + cplen = ret; + /* check for avail */ if (unlikely(!qp->s_avail)) { qp->s_avail = qp_get_savail(qp); @@ -1588,18 +1568,8 @@ static int rvt_post_one_wr(struct rvt_qp *qp, pd = ibpd_to_rvtpd(qp->ibqp.pd); wqe = rvt_get_swqe_ptr(qp, qp->s_head); - if (qp->ibqp.qp_type != IB_QPT_UC && - qp->ibqp.qp_type != IB_QPT_RC) - memcpy(&wqe->ud_wr, ud_wr(wr), sizeof(wqe->ud_wr)); - else if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM || - wr->opcode == IB_WR_RDMA_WRITE || - wr->opcode == IB_WR_RDMA_READ) - memcpy(&wqe->rdma_wr, rdma_wr(wr), sizeof(wqe->rdma_wr)); - else if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP || - wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) - memcpy(&wqe->atomic_wr, atomic_wr(wr), sizeof(wqe->atomic_wr)); - else - memcpy(&wqe->wr, wr, sizeof(wqe->wr)); + /* cplen has length from above */ + memcpy(&wqe->wr, wr, cplen); wqe->length = 0; j = 0; diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 30c4fda7a05a..89fe9675c550 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -528,7 +528,8 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb) 
post_send), rvt_post_send)) if (!rdi->driver_f.schedule_send || - !rdi->driver_f.do_send) + !rdi->driver_f.do_send || + !rdi->post_parms) return -EINVAL; break; From 3210314ad305d554e7bfce5281b31ce80afa5f15 Mon Sep 17 00:00:00 2001 From: Jakub Pawlak Date: Mon, 25 Jul 2016 13:37:54 -0700 Subject: [PATCH 24/84] IB/hfi1: Fix integrity errors counter value calculation PMA should not sum TX and RX replay counts when reporting local link integrity errors. Fixed by removing C_DC_TX_REPLAY counter from calculation of the link integrity errors counter value. Reviewed-by: Ira Weiny Reviewed-by: Dennis Dalessandro Reviewed-by: Mike Marciniszyn Signed-off-by: Jakub Pawlak Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/mad.c | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 962bb11074d9..5590a4c84a4e 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -2418,14 +2418,9 @@ static int pma_get_opa_portstatus(struct opa_pma_mad *pmp, rsp->port_rcv_remote_physical_errors = cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL)); - tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL); - tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL); - if (tmp2 < tmp) { - /* overflow/wrapped */ - rsp->local_link_integrity_errors = cpu_to_be64(~0); - } else { - rsp->local_link_integrity_errors = cpu_to_be64(tmp2); - } + rsp->local_link_integrity_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RX_REPLAY, + CNTR_INVALID_VL)); tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL); tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL); @@ -2520,9 +2515,8 @@ static u64 get_error_counter_summary(struct ib_device *ibdev, u8 port, error_counter_summary += read_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL); /* local link integrity must be right-shifted 
by the lli resolution */ - tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL); - tmp += read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL); - error_counter_summary += (tmp >> res_lli); + error_counter_summary += (read_dev_cntr(dd, C_DC_RX_REPLAY, + CNTR_INVALID_VL) >> res_lli); /* link error recovery must b right-shifted by the ler resolution */ tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL); tmp += read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL); @@ -2791,14 +2785,9 @@ static void pma_get_opa_port_ectrs(struct ib_device *ibdev, rsp->port_rcv_constraint_errors = cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR, CNTR_INVALID_VL)); - tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL); - tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL); - if (tmp2 < tmp) { - /* overflow/wrapped */ - rsp->local_link_integrity_errors = cpu_to_be64(~0); - } else { - rsp->local_link_integrity_errors = cpu_to_be64(tmp2); - } + rsp->local_link_integrity_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RX_REPLAY, + CNTR_INVALID_VL)); rsp->excessive_buffer_overruns = cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL)); } @@ -3156,10 +3145,8 @@ static int pma_set_opa_portstatus(struct opa_pma_mad *pmp, if (counter_select & CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS) write_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL, 0); - if (counter_select & CS_LOCAL_LINK_INTEGRITY_ERRORS) { - write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0); + if (counter_select & CS_LOCAL_LINK_INTEGRITY_ERRORS) write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0); - } if (counter_select & CS_LINK_ERROR_RECOVERY) { write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0); @@ -3956,7 +3943,6 @@ void clear_linkup_counters(struct hfi1_devdata *dd) write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0); write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL, 0); /* LocalLinkIntegrityErrors */ - write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0); 
write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0); /* ExcessiveBufferOverruns */ write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0); From 1b23f02cf4bbe644028077539b1045f742d61fa2 Mon Sep 17 00:00:00 2001 From: Tymoteusz Kielan Date: Mon, 25 Jul 2016 13:38:01 -0700 Subject: [PATCH 25/84] IB/hfi1: Fix to fully initialize send context area While handling buffer control MAD, partially initialized dd->kernel_send_context area may cause potential dereference of uninitialized pointers. Fix by using kzalloc_node() instead of kmalloc_node(). Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Reviewed-by: Andrzej Kacprowski Signed-off-by: Tymoteusz Kielan Signed-off-by: Andrzej Kacprowski Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/pio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index d4022450b73f..a99fccadf624 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -1956,7 +1956,7 @@ int init_pervl_scs(struct hfi1_devdata *dd) hfi1_init_ctxt(dd->vld[15].sc); dd->vld[15].mtu = enum_to_mtu(OPA_MTU_2048); - dd->kernel_send_context = kmalloc_node(dd->num_send_contexts * + dd->kernel_send_context = kzalloc_node(dd->num_send_contexts * sizeof(struct send_context *), GFP_KERNEL, dd->node); dd->kernel_send_context[0] = dd->vld[15].sc; From 5fd2b562edca6cfc710f97f0b691a589cf14c9e7 Mon Sep 17 00:00:00 2001 From: Mitko Haralanov Date: Mon, 25 Jul 2016 13:38:07 -0700 Subject: [PATCH 26/84] IB/hfi1: Pull FECN/BECN processing to a common place There were multiple places where FECN/BECN processing was being done for the different types of QPs. All of that code was very similar, which meant that it could be pulled into a single function used by the different QP types. 
To retain the performance in the fastpath, the common code starts with an inline function, which only calls the slow path if the packet has any of the [FB]ECN bits set. Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Mitko Haralanov Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/driver.c | 47 ++++++++++++++++------------- drivers/infiniband/hw/hfi1/hfi.h | 16 ++++++++++ drivers/infiniband/hw/hfi1/rc.c | 19 +----------- drivers/infiniband/hw/hfi1/uc.c | 36 +--------------------- drivers/infiniband/hw/hfi1/ud.c | 24 +-------------- 5 files changed, 45 insertions(+), 97 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index 6c81d155665d..4dbadf77f01d 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -450,14 +450,20 @@ static inline void init_packet(struct hfi1_ctxtdata *rcd, packet->rcv_flags = 0; } -static void process_ecn(struct rvt_qp *qp, struct hfi1_ib_header *hdr, - struct hfi1_other_headers *ohdr, - u64 rhf, u32 bth1, struct ib_grh *grh) +void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt, + bool do_cnp) { struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); - u32 rqpn = 0; - u16 rlid; - u8 sc5, svc_type; + struct hfi1_ib_header *hdr = pkt->hdr; + struct hfi1_other_headers *ohdr = pkt->ohdr; + struct ib_grh *grh = NULL; + u32 rqpn = 0, bth1; + u16 rlid, dlid = be16_to_cpu(hdr->lrh[1]); + u8 sc, svc_type; + bool is_mcast = false; + + if (pkt->rcv_flags & HFI1_HAS_GRH) + grh = &hdr->u.l.grh; switch (qp->ibqp.qp_type) { case IB_QPT_SMI: @@ -466,6 +472,8 @@ static void process_ecn(struct rvt_qp *qp, struct hfi1_ib_header *hdr, rlid = be16_to_cpu(hdr->lrh[3]); rqpn = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK; svc_type = IB_CC_SVCTYPE_UD; + is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (dlid != be16_to_cpu(IB_LID_PERMISSIVE)); break; case 
IB_QPT_UC: rlid = qp->remote_ah_attr.dlid; @@ -481,24 +489,23 @@ static void process_ecn(struct rvt_qp *qp, struct hfi1_ib_header *hdr, return; } - sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf; - if (rhf_dc_info(rhf)) - sc5 |= 0x10; + sc = hdr2sc((struct hfi1_message_header *)hdr, pkt->rhf); - if (bth1 & HFI1_FECN_SMASK) { + bth1 = be32_to_cpu(ohdr->bth[1]); + if (do_cnp && (bth1 & HFI1_FECN_SMASK)) { u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]); - u16 dlid = be16_to_cpu(hdr->lrh[1]); - return_cnp(ibp, qp, rqpn, pkey, dlid, rlid, sc5, grh); + return_cnp(ibp, qp, rqpn, pkey, dlid, rlid, sc, grh); } - if (bth1 & HFI1_BECN_SMASK) { + if (!is_mcast && (bth1 & HFI1_BECN_SMASK)) { struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); u32 lqpn = bth1 & RVT_QPN_MASK; - u8 sl = ibp->sc_to_sl[sc5]; + u8 sl = ibp->sc_to_sl[sc]; process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type); } + } struct ps_mdata { @@ -596,7 +603,6 @@ static void __prescan_rxq(struct hfi1_packet *packet) struct rvt_qp *qp; struct hfi1_ib_header *hdr; struct hfi1_other_headers *ohdr; - struct ib_grh *grh = NULL; struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; u64 rhf = rhf_to_cpu(rhf_addr); u32 etype = rhf_rcv_type(rhf), qpn, bth1; @@ -616,14 +622,13 @@ static void __prescan_rxq(struct hfi1_packet *packet) hfi1_get_msgheader(dd, rhf_addr); lnh = be16_to_cpu(hdr->lrh[0]) & 3; - if (lnh == HFI1_LRH_BTH) { + if (lnh == HFI1_LRH_BTH) ohdr = &hdr->u.oth; - } else if (lnh == HFI1_LRH_GRH) { + else if (lnh == HFI1_LRH_GRH) ohdr = &hdr->u.l.oth; - grh = &hdr->u.l.grh; - } else { + else goto next; /* just in case */ - } + bth1 = be32_to_cpu(ohdr->bth[1]); is_ecn = !!(bth1 & (HFI1_FECN_SMASK | HFI1_BECN_SMASK)); @@ -639,7 +644,7 @@ static void __prescan_rxq(struct hfi1_packet *packet) goto next; } - process_ecn(qp, hdr, ohdr, rhf, bth1, grh); + process_ecn(qp, packet, true); rcu_read_unlock(); /* turn off BECN, FECN */ diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 
c433eb8d5729..7df4cb2ed7da 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1583,6 +1583,22 @@ static inline struct hfi1_ibport *to_iport(struct ib_device *ibdev, u8 port) return &dd->pport[pidx].ibport_data; } +void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt, + bool do_cnp); +static inline bool process_ecn(struct rvt_qp *qp, struct hfi1_packet *pkt, + bool do_cnp) +{ + struct hfi1_other_headers *ohdr = pkt->ohdr; + u32 bth1; + + bth1 = be32_to_cpu(ohdr->bth[1]); + if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) { + hfi1_process_ecn_slowpath(qp, pkt, do_cnp); + return bth1 & HFI1_FECN_SMASK; + } + return false; +} + /* * Return the indexed PKEY from the port PKEY table. */ diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 3aeb83297408..bd4baa444f7f 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -2086,7 +2086,6 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) u32 tlen = packet->tlen; struct rvt_qp *qp = packet->qp; struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); - struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); struct hfi1_other_headers *ohdr = packet->ohdr; u32 bth0, opcode; u32 hdrsize = packet->hlen; @@ -2097,7 +2096,6 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) int diff; struct ib_reth *reth; unsigned long flags; - u32 bth1; int ret, is_fecn = 0; int copy_last = 0; @@ -2105,22 +2103,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) if (hfi1_ruc_check_hdr(ibp, hdr, rcv_flags & HFI1_HAS_GRH, qp, bth0)) return; - bth1 = be32_to_cpu(ohdr->bth[1]); - if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) { - if (bth1 & HFI1_BECN_SMASK) { - u16 rlid = qp->remote_ah_attr.dlid; - u32 lqpn, rqpn; - - lqpn = qp->ibqp.qp_num; - rqpn = qp->remote_qpn; - process_becn( - ppd, - qp->remote_ah_attr.sl, - rlid, lqpn, rqpn, - IB_CC_SVCTYPE_RC); - } - is_fecn = bth1 & HFI1_FECN_SMASK; - } + is_fecn = 
process_ecn(qp, packet, false); psn = be32_to_cpu(ohdr->bth[2]); opcode = (bth0 >> 24) & 0xff; diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index df773d433297..b7a25311bd7b 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -294,46 +294,12 @@ void hfi1_uc_rcv(struct hfi1_packet *packet) struct ib_reth *reth; int has_grh = rcv_flags & HFI1_HAS_GRH; int ret; - u32 bth1; bth0 = be32_to_cpu(ohdr->bth[0]); if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, bth0)) return; - bth1 = be32_to_cpu(ohdr->bth[1]); - if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) { - if (bth1 & HFI1_BECN_SMASK) { - struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - u32 rqpn, lqpn; - u16 rlid = be16_to_cpu(hdr->lrh[3]); - u8 sl, sc5; - - lqpn = bth1 & RVT_QPN_MASK; - rqpn = qp->remote_qpn; - - sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl]; - sl = ibp->sc_to_sl[sc5]; - - process_becn(ppd, sl, rlid, lqpn, rqpn, - IB_CC_SVCTYPE_UC); - } - - if (bth1 & HFI1_FECN_SMASK) { - struct ib_grh *grh = NULL; - u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]); - u16 slid = be16_to_cpu(hdr->lrh[3]); - u16 dlid = be16_to_cpu(hdr->lrh[1]); - u32 src_qp = qp->remote_qpn; - u8 sc5; - - sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl]; - if (has_grh) - grh = &hdr->u.l.grh; - - return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5, - grh); - } - } + process_ecn(qp, packet, true); psn = be32_to_cpu(ohdr->bth[2]); opcode = (bth0 >> 24) & 0xff; diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index be91f6fa1c87..02488c6fab28 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -679,29 +679,10 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) struct rvt_qp *qp = packet->qp; bool has_grh = rcv_flags & HFI1_HAS_GRH; u8 sc5 = hdr2sc((struct hfi1_message_header *)hdr, packet->rhf); - u32 bth1; - int is_mcast; - struct ib_grh *grh = NULL; qkey = be32_to_cpu(ohdr->u.ud.deth[0]); src_qp = 
be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK; dlid = be16_to_cpu(hdr->lrh[1]); - is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) && - (dlid != be16_to_cpu(IB_LID_PERMISSIVE)); - bth1 = be32_to_cpu(ohdr->bth[1]); - if (unlikely(bth1 & HFI1_BECN_SMASK)) { - /* - * In pre-B0 h/w the CNP_OPCODE is handled via an - * error path. - */ - struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - u32 lqpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; - u8 sl; - - sl = ibp->sc_to_sl[sc5]; - - process_becn(ppd, sl, 0, lqpn, 0, IB_CC_SVCTYPE_UD); - } /* * The opcode is in the low byte when its in network order @@ -712,11 +693,8 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) pkey = (u16)be32_to_cpu(ohdr->bth[0]); - if (!is_mcast && (opcode != IB_OPCODE_CNP) && bth1 & HFI1_FECN_SMASK) { - u16 slid = be16_to_cpu(hdr->lrh[3]); + process_ecn(qp, packet, (opcode != IB_OPCODE_CNP)); - return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5, grh); - } /* * Get the number of bytes the message was padded by * and drop incomplete packets. From a41081aa59363760ae8bc20929617f57bcf025b8 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Mon, 25 Jul 2016 13:38:13 -0700 Subject: [PATCH 27/84] IB/rdmavt: Add support for ib_map_mr_sg This implements the device specific function needed by the verbs API function ib_map_mr_sg(). 
Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Jianxin Xiong Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/mr.c | 51 +++++++++++++++++++++++++++++++ drivers/infiniband/sw/rdmavt/mr.h | 2 ++ drivers/infiniband/sw/rdmavt/vt.c | 7 +++++ 3 files changed, 60 insertions(+) diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 0f4d4500f45e..75f158aa853d 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -479,6 +479,57 @@ struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, return &mr->ibmr; } +/** + * rvt_set_page - page assignment function called by ib_sg_to_pages + * @ibmr: memory region + * @addr: dma address of mapped page + * + * Return: 0 on success + */ +static int rvt_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct rvt_mr *mr = to_imr(ibmr); + u32 ps = 1 << mr->mr.page_shift; + u32 mapped_segs = mr->mr.length >> mr->mr.page_shift; + int m, n; + + if (unlikely(mapped_segs == mr->mr.max_segs)) + return -ENOMEM; + + if (mr->mr.length == 0) { + mr->mr.user_base = addr; + mr->mr.iova = addr; + } + + m = mapped_segs / RVT_SEGSZ; + n = mapped_segs % RVT_SEGSZ; + mr->mr.map[m]->segs[n].vaddr = (void *)addr; + mr->mr.map[m]->segs[n].length = ps; + mr->mr.length += ps; + + return 0; +} + +/** + * rvt_map_mr_sg - map sg list and set it the memory region + * @ibmr: memory region + * @sg: dma mapped scatterlist + * @sg_nents: number of entries in sg + * @sg_offset: offset in bytes into sg + * + * Return: number of sg elements mapped to the memory region + */ +int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset) +{ + struct rvt_mr *mr = to_imr(ibmr); + + mr->mr.length = 0; + mr->mr.page_shift = PAGE_SHIFT; + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, + rvt_set_page); +} + /** * rvt_alloc_fmr - allocate a fast memory region * @pd: the protection domain for this memory region 
diff --git a/drivers/infiniband/sw/rdmavt/mr.h b/drivers/infiniband/sw/rdmavt/mr.h index 69380512c6d1..132800ee0205 100644 --- a/drivers/infiniband/sw/rdmavt/mr.h +++ b/drivers/infiniband/sw/rdmavt/mr.h @@ -82,6 +82,8 @@ int rvt_dereg_mr(struct ib_mr *ibmr); struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); +int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset); struct ib_fmr *rvt_alloc_fmr(struct ib_pd *pd, int mr_access_flags, struct ib_fmr_attr *fmr_attr); int rvt_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 89fe9675c550..d430c2f7cec4 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -370,6 +370,7 @@ enum { REG_USER_MR, DEREG_MR, ALLOC_MR, + MAP_MR_SG, ALLOC_FMR, MAP_PHYS_FMR, UNMAP_FMR, @@ -634,6 +635,12 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb) rvt_alloc_mr); break; + case MAP_MR_SG: + check_driver_override(rdi, offsetof(struct ib_device, + map_mr_sg), + rvt_map_mr_sg); + break; + case MAP_PHYS_FMR: check_driver_override(rdi, offsetof(struct ib_device, map_phys_fmr), From e8f8b098a44a66d3da81e460aed465f26693e120 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Mon, 25 Jul 2016 13:38:19 -0700 Subject: [PATCH 28/84] IB/rdmavt: Add mechanism to invalidate MR keys In order to support extended memory management, add the mechanism to invalidate MR keys. This includes a flag "lkey_invalid" in the MR data structure that is to be checked when validating access to the MR via the associated key, and two utility functions to perform fast memory registration and memory key invalidate operations. 
Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Jianxin Xiong Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/mr.c | 73 ++++++++++++++++++++++++++++++- include/rdma/rdma_vt.h | 3 ++ include/rdma/rdmavt_mr.h | 1 + 3 files changed, 75 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 75f158aa853d..80c4b6b401b8 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -140,6 +140,7 @@ static int rvt_init_mregion(struct rvt_mregion *mr, struct ib_pd *pd, init_completion(&mr->comp); /* count returning the ptr to user */ atomic_set(&mr->refcount, 1); + atomic_set(&mr->lkey_invalid, 0); mr->pd = pd; mr->max_segs = count; return 0; @@ -530,6 +531,72 @@ int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, rvt_set_page); } +/** + * rvt_fast_reg_mr - fast register physical MR + * @qp: the queue pair where the work request comes from + * @ibmr: the memory region to be registered + * @key: updated key for this memory region + * @access: access flags for this memory region + * + * Returns 0 on success. + */ +int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key, + int access) +{ + struct rvt_mr *mr = to_imr(ibmr); + + if (qp->ibqp.pd != mr->mr.pd) + return -EACCES; + + /* not applicable to dma MR or user MR */ + if (!mr->mr.lkey || mr->umem) + return -EINVAL; + + if ((key & 0xFFFFFF00) != (mr->mr.lkey & 0xFFFFFF00)) + return -EINVAL; + + ibmr->lkey = key; + ibmr->rkey = key; + mr->mr.lkey = key; + mr->mr.access_flags = access; + atomic_set(&mr->mr.lkey_invalid, 0); + + return 0; +} +EXPORT_SYMBOL(rvt_fast_reg_mr); + +/** + * rvt_invalidate_rkey - invalidate an MR rkey + * @qp: queue pair associated with the invalidate op + * @rkey: rkey to invalidate + * + * Returns 0 on success. 
+ */ +int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey) +{ + struct rvt_dev_info *dev = ib_to_rvt(qp->ibqp.device); + struct rvt_lkey_table *rkt = &dev->lkey_table; + struct rvt_mregion *mr; + + if (rkey == 0) + return -EINVAL; + + rcu_read_lock(); + mr = rcu_dereference( + rkt->table[(rkey >> (32 - dev->dparms.lkey_table_size))]); + if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd)) + goto bail; + + atomic_set(&mr->lkey_invalid, 1); + rcu_read_unlock(); + return 0; + +bail: + rcu_read_unlock(); + return -EINVAL; +} +EXPORT_SYMBOL(rvt_invalidate_rkey); + /** * rvt_alloc_fmr - allocate a fast memory region * @pd: the protection domain for this memory region @@ -733,7 +800,8 @@ int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, } mr = rcu_dereference( rkt->table[(sge->lkey >> (32 - dev->dparms.lkey_table_size))]); - if (unlikely(!mr || mr->lkey != sge->lkey || mr->pd != &pd->ibpd)) + if (unlikely(!mr || atomic_read(&mr->lkey_invalid) || + mr->lkey != sge->lkey || mr->pd != &pd->ibpd)) goto bail; off = sge->addr - mr->user_base; @@ -833,7 +901,8 @@ int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge, mr = rcu_dereference( rkt->table[(rkey >> (32 - dev->dparms.lkey_table_size))]); - if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd)) + if (unlikely(!mr || atomic_read(&mr->lkey_invalid) || + mr->lkey != rkey || qp->ibqp.pd != mr->pd)) goto bail; off = vaddr - mr->iova; diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 3a70dc047314..7fdba92d4c05 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -487,6 +487,9 @@ void rvt_unregister_device(struct rvt_dev_info *rvd); int rvt_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr); int rvt_init_port(struct rvt_dev_info *rdi, struct rvt_ibport *port, int port_index, u16 *pkey_table); +int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key, + int access); +int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey); int rvt_rkey_ok(struct 
rvt_qp *qp, struct rvt_sge *sge, u32 len, u64 vaddr, u32 rkey, int acc); int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, diff --git a/include/rdma/rdmavt_mr.h b/include/rdma/rdmavt_mr.h index 5edffdca8c53..6b3c6c8b6b77 100644 --- a/include/rdma/rdmavt_mr.h +++ b/include/rdma/rdmavt_mr.h @@ -81,6 +81,7 @@ struct rvt_mregion { u32 mapsz; /* size of the map array */ u8 page_shift; /* 0 - non unform/non powerof2 sizes */ u8 lkey_published; /* in global table */ + atomic_t lkey_invalid; /* true if current lkey is invalid */ struct completion comp; /* complete when refcount goes to zero */ atomic_t refcount; struct rvt_segarray *map[0]; /* the segments */ From d9f8723924d5955979d05cb7f4f10d9ebac39b7d Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Mon, 25 Jul 2016 13:38:25 -0700 Subject: [PATCH 29/84] IB/rdmavt: Handle local operations in post send Some work requests are local operations, such as IB_WR_REG_MR and IB_WR_LOCAL_INV. They differ from non-local operations in that: (1) Local operations can be processed immediately without being posted to the send queue if neither fencing nor completion generation is needed. However, to ensure correct ordering, once a local operation is posted to the work queue due to fencing or completion requirement, all subsequent local operations must also be posted to the work queue until all the local operations on the work queue have completed. (2) Local operations don't send packets over the wire and thus don't need (and shouldn't update) the packet sequence numbers. Define a new flag bit for the post send table to identify local operations. Add a new field to the QP structure to track the number of local operations on the send queue to determine if direct processing of new local operations should be enabled/disabled.
Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Jianxin Xiong Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/qp.c | 45 +++++++++++++++++++++++++++---- include/rdma/rdmavt_qp.h | 3 +++ 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index ebc37f55ac55..f79b809241e0 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -743,6 +743,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, spin_lock_init(&qp->s_lock); spin_lock_init(&qp->r_rq.lock); atomic_set(&qp->refcount, 0); + atomic_set(&qp->local_ops_pending, 0); init_waitqueue_head(&qp->wait); init_timer(&qp->s_timer); qp->s_timer.data = (unsigned long)qp; @@ -1548,6 +1549,31 @@ static int rvt_post_one_wr(struct rvt_qp *qp, return ret; cplen = ret; + /* + * Local operations including fast register and local invalidate + * can be processed immediately w/o being posted to the send queue + * if neither fencing nor completion generation is needed. However, + * once fencing or completion is requested, direct processing of + * following local operations must be disabled until all the local + * operations posted to the send queue have completed. This is + * necessary to ensure the correct ordering. 
+ */ + if ((rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) && + !(wr->send_flags & (IB_SEND_FENCE | IB_SEND_SIGNALED)) && + !atomic_read(&qp->local_ops_pending)) { + struct ib_reg_wr *reg = reg_wr(wr); + + switch (wr->opcode) { + case IB_WR_REG_MR: + return rvt_fast_reg_mr(qp, reg->mr, reg->key, + reg->access); + case IB_WR_LOCAL_INV: + return rvt_invalidate_rkey(qp, wr->ex.invalidate_rkey); + default: + return -EINVAL; + } + } + /* check for avail */ if (unlikely(!qp->s_avail)) { qp->s_avail = qp_get_savail(qp); @@ -1612,11 +1638,20 @@ static int rvt_post_one_wr(struct rvt_qp *qp, atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount); } - wqe->ssn = qp->s_ssn++; - wqe->psn = qp->s_next_psn; - wqe->lpsn = wqe->psn + - (wqe->length ? ((wqe->length - 1) >> log_pmtu) : 0); - qp->s_next_psn = wqe->lpsn + 1; + if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) { + atomic_inc(&qp->local_ops_pending); + wqe->ssn = 0; + wqe->psn = 0; + wqe->lpsn = 0; + } else { + wqe->ssn = qp->s_ssn++; + wqe->psn = qp->s_next_psn; + wqe->lpsn = wqe->psn + + (wqe->length ? + ((wqe->length - 1) >> log_pmtu) : + 0); + qp->s_next_psn = wqe->lpsn + 1; + } trace_rvt_post_one_wr(qp, wqe); smp_wmb(); /* see request builders */ qp->s_avail--; diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index a90d1e941504..b0ab12b30f1e 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -231,6 +231,7 @@ struct rvt_ack_entry { #define RVT_OPERATION_PRIV 0x00000001 #define RVT_OPERATION_ATOMIC 0x00000002 #define RVT_OPERATION_ATOMIC_SGE 0x00000004 +#define RVT_OPERATION_LOCAL 0x00000008 #define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1) @@ -363,6 +364,8 @@ struct rvt_qp { struct rvt_sge_state s_ack_rdma_sge; struct timer_list s_timer; + atomic_t local_ops_pending; /* number of fast_reg/local_inv reqs */ + /* * This sge list MUST be last. Do not add anything below here. 
*/ From a2df0c833209a22d020163913e451f94be5114cd Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Mon, 25 Jul 2016 13:38:31 -0700 Subject: [PATCH 30/84] IB/hfi1: Handle send with invalidate opcode in the RC recv path As part of enabling extended memory management support, add the processing of the RC send with invalidate. Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Jianxin Xiong Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 16 +++++++++++++++- drivers/infiniband/hw/hfi1/verbs.c | 2 ++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index bd4baa444f7f..cb474a703b33 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -2098,6 +2098,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) unsigned long flags; int ret, is_fecn = 0; int copy_last = 0; + u32 rkey; bth0 = be32_to_cpu(ohdr->bth[0]); if (hfi1_ruc_check_hdr(ibp, hdr, rcv_flags & HFI1_HAS_GRH, qp, bth0)) @@ -2137,7 +2138,8 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) case OP(SEND_MIDDLE): if (opcode == OP(SEND_MIDDLE) || opcode == OP(SEND_LAST) || - opcode == OP(SEND_LAST_WITH_IMMEDIATE)) + opcode == OP(SEND_LAST_WITH_IMMEDIATE) || + opcode == OP(SEND_LAST_WITH_INVALIDATE)) break; goto nack_inv; @@ -2153,6 +2155,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) if (opcode == OP(SEND_MIDDLE) || opcode == OP(SEND_LAST) || opcode == OP(SEND_LAST_WITH_IMMEDIATE) || + opcode == OP(SEND_LAST_WITH_INVALIDATE) || opcode == OP(RDMA_WRITE_MIDDLE) || opcode == OP(RDMA_WRITE_LAST) || opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) @@ -2201,6 +2204,7 @@ send_middle: case OP(SEND_ONLY): case OP(SEND_ONLY_WITH_IMMEDIATE): + case OP(SEND_ONLY_WITH_INVALIDATE): ret = hfi1_rvt_get_rwqe(qp, 0); if (ret < 0) goto nack_op_err; @@ -2209,12 +2213,22 @@ send_middle: qp->r_rcv_len = 0; if (opcode == OP(SEND_ONLY)) goto no_immediate_data; + 
if (opcode == OP(SEND_ONLY_WITH_INVALIDATE)) + goto send_last_inv; /* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */ case OP(SEND_LAST_WITH_IMMEDIATE): send_last_imm: wc.ex.imm_data = ohdr->u.imm_data; wc.wc_flags = IB_WC_WITH_IMM; goto send_last; + case OP(SEND_LAST_WITH_INVALIDATE): +send_last_inv: + rkey = be32_to_cpu(ohdr->u.ieth); + if (rvt_invalidate_rkey(qp, rkey)) + goto no_immediate_data; + wc.ex.invalidate_rkey = rkey; + wc.wc_flags = IB_WC_WITH_INVALIDATE; + goto send_last; case OP(RDMA_WRITE_LAST): copy_last = ibpd_to_rvtpd(qp->ibqp.pd)->user; /* fall through */ diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index a89055fa72d8..84cc09a5eb67 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -378,6 +378,8 @@ static const opcode_handler opcode_handler_tbl[256] = { [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = &hfi1_rc_rcv, [IB_OPCODE_RC_COMPARE_SWAP] = &hfi1_rc_rcv, [IB_OPCODE_RC_FETCH_ADD] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = &hfi1_rc_rcv, /* UC */ [IB_OPCODE_UC_SEND_FIRST] = &hfi1_uc_rcv, [IB_OPCODE_UC_SEND_MIDDLE] = &hfi1_uc_rcv, From 0db3dfa03c0881fc98d3ff2f88dcca2bc69c0003 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Mon, 25 Jul 2016 13:38:37 -0700 Subject: [PATCH 31/84] IB/hfi1: Work request processing for fast register mr and invalidate In order to support extended memory management, add send side processing of work requests of type IB_WR_REG_MR, IB_WR_LOCAL_INV, and IB_WR_SEND_WITH_INV. The first two are local operations and are supported for both RC and UC. Send with invalidate is only supported for RC because the corresponding IB opcodes are not defined for UC.
Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Jianxin Xiong Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 46 ++++++++++++++++++++++++++++-- drivers/infiniband/hw/hfi1/ruc.c | 26 +++++++++++++++++ drivers/infiniband/hw/hfi1/uc.c | 24 ++++++++++++++++ drivers/infiniband/hw/hfi1/verbs.c | 5 +++- 4 files changed, 98 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index cb474a703b33..0bc43b67d0b8 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -402,6 +402,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) char newreq; int middle = 0; int delta; + int err; ps->s_txreq = get_txreq(ps->dev, qp); if (IS_ERR(ps->s_txreq)) @@ -477,6 +478,35 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) qp->s_flags |= RVT_S_WAIT_FENCE; goto bail; } + /* + * Local operations are processed immediately + * after all prior requests have completed + */ + if (wqe->wr.opcode == IB_WR_REG_MR || + wqe->wr.opcode == IB_WR_LOCAL_INV) { + if (qp->s_last != qp->s_cur) + goto bail; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + if (++qp->s_tail == qp->s_size) + qp->s_tail = 0; + if (wqe->wr.opcode == IB_WR_REG_MR) + err = rvt_fast_reg_mr( + qp, wqe->reg_wr.mr, + wqe->reg_wr.key, + wqe->reg_wr.access); + else + err = rvt_invalidate_rkey( + qp, + wqe->wr.ex.invalidate_rkey); + hfi1_send_complete(qp, wqe, + err ? IB_WC_LOC_PROT_ERR + : IB_WC_SUCCESS); + atomic_dec(&qp->local_ops_pending); + qp->s_hdrwords = 0; + goto done_free_tx; + } + newreq = 1; qp->s_psn = wqe->psn; } @@ -491,6 +521,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) switch (wqe->wr.opcode) { case IB_WR_SEND: case IB_WR_SEND_WITH_IMM: + case IB_WR_SEND_WITH_INV: /* If no credit, return. 
*/ if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) && cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) { @@ -504,11 +535,17 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) } if (wqe->wr.opcode == IB_WR_SEND) { qp->s_state = OP(SEND_ONLY); - } else { + } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE); /* Immediate data comes after the BTH */ ohdr->u.imm_data = wqe->wr.ex.imm_data; hwords += 1; + } else { + qp->s_state = OP(SEND_ONLY_WITH_INVALIDATE); + /* Invalidate rkey comes after the BTH */ + ohdr->u.ieth = cpu_to_be32( + wqe->wr.ex.invalidate_rkey); + hwords += 1; } if (wqe->wr.send_flags & IB_SEND_SOLICITED) bth0 |= IB_BTH_SOLICITED; @@ -671,11 +708,16 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) } if (wqe->wr.opcode == IB_WR_SEND) { qp->s_state = OP(SEND_LAST); - } else { + } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); /* Immediate data comes after the BTH */ ohdr->u.imm_data = wqe->wr.ex.imm_data; hwords += 1; + } else { + qp->s_state = OP(SEND_LAST_WITH_INVALIDATE); + /* invalidate data comes after the BTH */ + ohdr->u.ieth = cpu_to_be32(wqe->wr.ex.invalidate_rkey); + hwords += 1; } if (wqe->wr.send_flags & IB_SEND_SOLICITED) bth0 |= IB_BTH_SOLICITED; diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index a659aec3c3c6..76b9c9e42d86 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -372,6 +372,7 @@ static void ruc_loopback(struct rvt_qp *sqp) int ret; int copy_last = 0; u32 to; + int local_ops = 0; rcu_read_lock(); @@ -440,11 +441,32 @@ again: sqp->s_sge.num_sge = wqe->wr.num_sge; sqp->s_len = wqe->length; switch (wqe->wr.opcode) { + case IB_WR_REG_MR: + if (rvt_fast_reg_mr(sqp, wqe->reg_wr.mr, wqe->reg_wr.key, + wqe->reg_wr.access)) + send_status = IB_WC_LOC_PROT_ERR; + local_ops = 1; + goto send_comp; + + case IB_WR_LOCAL_INV: + if (rvt_invalidate_rkey(sqp, 
wqe->wr.ex.invalidate_rkey)) + send_status = IB_WC_LOC_PROT_ERR; + local_ops = 1; + goto send_comp; + + case IB_WR_SEND_WITH_INV: + if (!rvt_invalidate_rkey(qp, wqe->wr.ex.invalidate_rkey)) { + wc.wc_flags = IB_WC_WITH_INVALIDATE; + wc.ex.invalidate_rkey = wqe->wr.ex.invalidate_rkey; + } + goto send; + case IB_WR_SEND_WITH_IMM: wc.wc_flags = IB_WC_WITH_IMM; wc.ex.imm_data = wqe->wr.ex.imm_data; /* FALLTHROUGH */ case IB_WR_SEND: +send: ret = hfi1_rvt_get_rwqe(qp, 0); if (ret < 0) goto op_err; @@ -583,6 +605,10 @@ send_comp: flush_send: sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; hfi1_send_complete(sqp, wqe, send_status); + if (local_ops) { + atomic_dec(&sqp->local_ops_pending); + local_ops = 0; + } goto again; rnr_nak: diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index b7a25311bd7b..ef6c96cd3d68 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -77,6 +77,7 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) u32 len; u32 pmtu = qp->pmtu; int middle = 0; + int err; ps->s_txreq = get_txreq(ps->dev, qp); if (IS_ERR(ps->s_txreq)) @@ -118,6 +119,29 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) clear_ahg(qp); goto bail; } + /* + * Local operations are processed immediately + * after all prior requests have completed. + */ + if (wqe->wr.opcode == IB_WR_REG_MR || + wqe->wr.opcode == IB_WR_LOCAL_INV) { + if (qp->s_last != qp->s_cur) + goto bail; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + if (wqe->wr.opcode == IB_WR_REG_MR) + err = rvt_fast_reg_mr(qp, wqe->reg_wr.mr, + wqe->reg_wr.key, + wqe->reg_wr.access); + else + err = rvt_invalidate_rkey( + qp, wqe->wr.ex.invalidate_rkey); + hfi1_send_complete(qp, wqe, err ? IB_WC_LOC_PROT_ERR + : IB_WC_SUCCESS); + atomic_dec(&qp->local_ops_pending); + qp->s_hdrwords = 0; + goto done_free_tx; + } /* * Start a new request. 
*/ diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 84cc09a5eb67..57e0c0df073c 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -306,7 +306,10 @@ const enum ib_wc_opcode ib_hfi1_wc_opcode[] = { [IB_WR_SEND_WITH_IMM] = IB_WC_SEND, [IB_WR_RDMA_READ] = IB_WC_RDMA_READ, [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP, - [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD + [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD, + [IB_WR_SEND_WITH_INV] = IB_WC_SEND, + [IB_WR_LOCAL_INV] = IB_WC_LOCAL_INV, + [IB_WR_REG_MR] = IB_WC_REG_MR }; /* From c72cfe3e389e5d13f82d7d7837a783ca995e968f Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Mon, 25 Jul 2016 13:38:43 -0700 Subject: [PATCH 32/84] IB/hfi1: Add support for extended memory management Advertise and add the capability of handling all aspects of IBTA extended memory management support in post send. Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Jianxin Xiong Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/qp.c | 17 +++++++++++++++++ drivers/infiniband/hw/hfi1/verbs.c | 6 ++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index a8b3fc9c91c8..ad8ad33aaa73 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -157,6 +157,23 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = { BIT(IB_QPT_UC) | BIT(IB_QPT_RC), }, +[IB_WR_REG_MR] = { + .length = sizeof(struct ib_reg_wr), + .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC), + .flags = RVT_OPERATION_LOCAL, +}, + +[IB_WR_LOCAL_INV] = { + .length = sizeof(struct ib_send_wr), + .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC), + .flags = RVT_OPERATION_LOCAL, +}, + +[IB_WR_SEND_WITH_INV] = { + .length = sizeof(struct ib_send_wr), + .qpt_support = BIT(IB_QPT_RC), +}, + }; static void flush_tx_list(struct
rvt_qp *qp) diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 57e0c0df073c..95785651fb77 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1305,13 +1305,15 @@ static void hfi1_fill_device_attr(struct hfi1_devdata *dd) rdi->dparms.props.device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR | IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT | IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | - IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE; + IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE | + IB_DEVICE_MEM_MGT_EXTENSIONS; rdi->dparms.props.page_size_cap = PAGE_SIZE; rdi->dparms.props.vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3; rdi->dparms.props.vendor_part_id = dd->pcidev->device; rdi->dparms.props.hw_ver = dd->minrev; rdi->dparms.props.sys_image_guid = ib_hfi1_sys_image_guid; - rdi->dparms.props.max_mr_size = ~0ULL; + rdi->dparms.props.max_mr_size = U64_MAX; + rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX; rdi->dparms.props.max_qp = hfi1_max_qps; rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs; rdi->dparms.props.max_sge = hfi1_max_sges; From 8784ac02437fbe545a52a1be14684b2663208253 Mon Sep 17 00:00:00 2001 From: Harish Chegondi Date: Mon, 25 Jul 2016 13:38:50 -0700 Subject: [PATCH 33/84] IB/hfi1: Modify the default number of kernel receive contexts Currently, the default number of kernel receive contexts is set to the number of NUMA nodes on the system plus one for control context. However, the systems that have a single socket and/or have NUMA disabled in the BIOS will have only one receive context by default. This patch would ensure that by default there will be at least two kernel receive contexts plus one for control context regardless of the number of NUMA nodes on the system. The user can override the default number of kernel receive contexts with the krcvqs module parameter.
Reviewed-by: Dean Luick Reviewed-by: Mike Marciniszyn Signed-off-by: Harish Chegondi Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 40d485b9cb75..e5f49eff47ea 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -122,6 +122,7 @@ struct flag_table { #define SEC_SC_HALTED 0x4 /* per-context only */ #define SEC_SPC_FREEZE 0x8 /* per-HFI only */ +#define DEFAULT_KRCVQS 2 #define MIN_KERNEL_KCTXTS 2 #define FIRST_KERNEL_KCTXT 1 /* sizes for both the QP and RSM map tables */ @@ -12773,7 +12774,6 @@ static int set_up_context_variables(struct hfi1_devdata *dd) /* * Kernel receive contexts: - * - min of 2 or 1 context/numa (excluding control context) * - Context 0 - control context (VL15/multicast/error) * - Context 1 - first kernel context * - Context 2 - second kernel context @@ -12787,9 +12787,7 @@ static int set_up_context_variables(struct hfi1_devdata *dd) */ num_kernel_contexts = n_krcvqs + 1; else - num_kernel_contexts = num_online_nodes() + 1; - num_kernel_contexts = - max_t(int, MIN_KERNEL_KCTXTS, num_kernel_contexts); + num_kernel_contexts = DEFAULT_KRCVQS + 1; /* * Every kernel receive context needs an ACK send context. * one send context is allocated for each VL{0-7} and VL15 From 6854c6925d4913d1f8d848c565ab665501212888 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Mon, 25 Jul 2016 13:38:56 -0700 Subject: [PATCH 34/84] IB/hfi1: Explain state complete frame details When link up fails in LNI, the local and peer state complete frames are reported as numbers. Explain what the values mean so the operator can better diagnose the problem. 
Reviewed-by: Easwar Hariharan Signed-off-by: Dean Luick Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 134 ++++++++++++++++++++++++++++-- 1 file changed, 126 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index e5f49eff47ea..f3782b37ef78 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -9918,6 +9918,131 @@ static int wait_phy_linkstate(struct hfi1_devdata *dd, u32 state, u32 msecs) return 0; } +static const char *state_completed_string(u32 completed) +{ + static const char * const state_completed[] = { + "EstablishComm", + "OptimizeEQ", + "VerifyCap" + }; + + if (completed < ARRAY_SIZE(state_completed)) + return state_completed[completed]; + + return "unknown"; +} + +static const char all_lanes_dead_timeout_expired[] = + "All lanes were inactive – was the interconnect media removed?"; +static const char tx_out_of_policy[] = + "Passing lanes on local port do not meet the local link width policy"; +static const char no_state_complete[] = + "State timeout occurred before link partner completed the state"; +static const char * const state_complete_reasons[] = { + [0x00] = "Reason unknown", + [0x01] = "Link was halted by driver, refer to LinkDownReason", + [0x02] = "Link partner reported failure", + [0x10] = "Unable to achieve frame sync on any lane", + [0x11] = + "Unable to find a common bit rate with the link partner", + [0x12] = + "Unable to achieve frame sync on sufficient lanes to meet the local link width policy", + [0x13] = + "Unable to identify preset equalization on sufficient lanes to meet the local link width policy", + [0x14] = no_state_complete, + [0x15] = + "State timeout occurred before link partner identified equalization presets", + [0x16] = + "Link partner completed the EstablishComm state, but the passing lanes do not meet the local link width policy", + [0x17] = tx_out_of_policy, + [0x20] = 
all_lanes_dead_timeout_expired, + [0x21] = + "Unable to achieve acceptable BER on sufficient lanes to meet the local link width policy", + [0x22] = no_state_complete, + [0x23] = + "Link partner completed the OptimizeEq state, but the passing lanes do not meet the local link width policy", + [0x24] = tx_out_of_policy, + [0x30] = all_lanes_dead_timeout_expired, + [0x31] = + "State timeout occurred waiting for host to process received frames", + [0x32] = no_state_complete, + [0x33] = + "Link partner completed the VerifyCap state, but the passing lanes do not meet the local link width policy", + [0x34] = tx_out_of_policy, +}; + +static const char *state_complete_reason_code_string(struct hfi1_pportdata *ppd, + u32 code) +{ + const char *str = NULL; + + if (code < ARRAY_SIZE(state_complete_reasons)) + str = state_complete_reasons[code]; + + if (str) + return str; + return "Reserved"; +} + +/* describe the given last state complete frame */ +static void decode_state_complete(struct hfi1_pportdata *ppd, u32 frame, + const char *prefix) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 success; + u32 state; + u32 reason; + u32 lanes; + + /* + * Decode frame: + * [ 0: 0] - success + * [ 3: 1] - state + * [ 7: 4] - next state timeout + * [15: 8] - reason code + * [31:16] - lanes + */ + success = frame & 0x1; + state = (frame >> 1) & 0x7; + reason = (frame >> 8) & 0xff; + lanes = (frame >> 16) & 0xffff; + + dd_dev_err(dd, "Last %s LNI state complete frame 0x%08x:\n", + prefix, frame); + dd_dev_err(dd, " last reported state state: %s (0x%x)\n", + state_completed_string(state), state); + dd_dev_err(dd, " state successfully completed: %s\n", + success ? "yes" : "no"); + dd_dev_err(dd, " fail reason 0x%x: %s\n", + reason, state_complete_reason_code_string(ppd, reason)); + dd_dev_err(dd, " passing lane mask: 0x%x", lanes); +} + +/* + * Read the last state complete frames and explain them. 
This routine + * expects to be called if the link went down during link negotiation + * and initialization (LNI). That is, anywhere between polling and link up. + */ +static void check_lni_states(struct hfi1_pportdata *ppd) +{ + u32 last_local_state; + u32 last_remote_state; + + read_last_local_state(ppd->dd, &last_local_state); + read_last_remote_state(ppd->dd, &last_remote_state); + + /* + * Don't report anything if there is nothing to report. A value of + * 0 means the link was taken down while polling and there was no + * training in-process. + */ + if (last_local_state == 0 && last_remote_state == 0) + return; + + decode_state_complete(ppd, last_local_state, "transmitted"); + decode_state_complete(ppd, last_remote_state, "received"); +} + /* * Helper for set_link_state(). Do not call except from that routine. * Expects ppd->hls_mutex to be held. @@ -9930,8 +10055,6 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) { struct hfi1_devdata *dd = ppd->dd; u32 pstate, previous_state; - u32 last_local_state; - u32 last_remote_state; int ret; int do_transition; int do_wait; @@ -10031,12 +10154,7 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) } else if (previous_state & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) { /* went down while attempting link up */ - /* byte 1 of last_*_state is the failure reason */ - read_last_local_state(dd, &last_local_state); - read_last_remote_state(dd, &last_remote_state); - dd_dev_err(dd, - "LNI failure last states: local 0x%08x, remote 0x%08x\n", - last_local_state, last_remote_state); + check_lni_states(ppd); } /* the active link width (downgrade) is 0 on link down */ From b3bf270bed5b7cd334c08293bbd27bc63b5bb9d7 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Mon, 25 Jul 2016 13:39:02 -0700 Subject: [PATCH 35/84] IB/hfi1: Read all firmware versions Read the version of the SBus, PCIe SerDes, and Fabric Serdes firmwares at driver load time. 
Reviewed-by: Dennis Dalessandro Signed-off-by: Dean Luick Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 25 ---- drivers/infiniband/hw/hfi1/chip.h | 1 + drivers/infiniband/hw/hfi1/chip_registers.h | 4 + drivers/infiniband/hw/hfi1/firmware.c | 125 ++++++++++++++++++++ 4 files changed, 130 insertions(+), 25 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index f3782b37ef78..faeed29afc8f 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -8826,30 +8826,6 @@ static int write_tx_settings(struct hfi1_devdata *dd, return load_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, frame); } -static void check_fabric_firmware_versions(struct hfi1_devdata *dd) -{ - u32 frame, version, prod_id; - int ret, lane; - - /* 4 lanes */ - for (lane = 0; lane < 4; lane++) { - ret = read_8051_config(dd, SPICO_FW_VERSION, lane, &frame); - if (ret) { - dd_dev_err(dd, - "Unable to read lane %d firmware details\n", - lane); - continue; - } - version = (frame >> SPICO_ROM_VERSION_SHIFT) - & SPICO_ROM_VERSION_MASK; - prod_id = (frame >> SPICO_ROM_PROD_ID_SHIFT) - & SPICO_ROM_PROD_ID_MASK; - dd_dev_info(dd, - "Lane %d firmware: version 0x%04x, prod_id 0x%04x\n", - lane, version, prod_id); - } -} - /* * Read an idle LCB message. 
* @@ -14621,7 +14597,6 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, ret = load_firmware(dd); /* asymmetric with dispose_firmware() */ if (ret) goto bail_clear_intr; - check_fabric_firmware_versions(dd); thermal_init(dd); diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index d0a4ddb421f7..f07bc4ccc468 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -640,6 +640,7 @@ extern uint platform_config_load; /* SBus commands */ #define RESET_SBUS_RECEIVER 0x20 #define WRITE_SBUS_RECEIVER 0x21 +#define READ_SBUS_RECEIVER 0x22 void sbus_request(struct hfi1_devdata *dd, u8 receiver_addr, u8 data_addr, u8 command, u32 data_in); int sbus_request_slow(struct hfi1_devdata *dd, diff --git a/drivers/infiniband/hw/hfi1/chip_registers.h b/drivers/infiniband/hw/hfi1/chip_registers.h index 8744de6667c2..5b9993899789 100644 --- a/drivers/infiniband/hw/hfi1/chip_registers.h +++ b/drivers/infiniband/hw/hfi1/chip_registers.h @@ -471,6 +471,10 @@ #define ASIC_STS_SBUS_RESULT (ASIC + 0x000000000010) #define ASIC_STS_SBUS_RESULT_DONE_SMASK 0x1ull #define ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK 0x2ull +#define ASIC_STS_SBUS_RESULT_RESULT_CODE_SHIFT 2 +#define ASIC_STS_SBUS_RESULT_RESULT_CODE_MASK 0x7ull +#define ASIC_STS_SBUS_RESULT_DATA_OUT_SHIFT 32 +#define ASIC_STS_SBUS_RESULT_DATA_OUT_MASK 0xFFFFFFFFull #define ASIC_STS_THERM (ASIC + 0x000000000058) #define ASIC_STS_THERM_CRIT_TEMP_MASK 0x7FFull #define ASIC_STS_THERM_CRIT_TEMP_SHIFT 18 diff --git a/drivers/infiniband/hw/hfi1/firmware.c b/drivers/infiniband/hw/hfi1/firmware.c index ed680fda611d..13db8eb4f4ec 100644 --- a/drivers/infiniband/hw/hfi1/firmware.c +++ b/drivers/infiniband/hw/hfi1/firmware.c @@ -206,6 +206,9 @@ static const struct firmware *platform_config; /* the number of fabric SerDes on the SBus */ #define NUM_FABRIC_SERDES 4 +/* ASIC_STS_SBUS_RESULT.RESULT_CODE value */ +#define SBUS_READ_COMPLETE 0x4 + /* SBus fabric SerDes addresses, 
one set per HFI */ static const u8 fabric_serdes_addrs[2][NUM_FABRIC_SERDES] = { { 0x01, 0x02, 0x03, 0x04 }, @@ -240,6 +243,7 @@ static const u8 all_pcie_serdes_broadcast = 0xe0; static void dispose_one_firmware(struct firmware_details *fdet); static int load_fabric_serdes_firmware(struct hfi1_devdata *dd, struct firmware_details *fdet); +static void dump_fw_version(struct hfi1_devdata *dd); /* * Read a single 64-bit value from 8051 data memory. @@ -1078,6 +1082,44 @@ void sbus_request(struct hfi1_devdata *dd, ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT)); } +/* + * Read a value from the SBus. + * + * Requires the caller to be in fast mode + */ +static u32 sbus_read(struct hfi1_devdata *dd, u8 receiver_addr, u8 data_addr, + u32 data_in) +{ + u64 reg; + int retries; + int success = 0; + u32 result = 0; + u32 result_code = 0; + + sbus_request(dd, receiver_addr, data_addr, READ_SBUS_RECEIVER, data_in); + + for (retries = 0; retries < 100; retries++) { + usleep_range(1000, 1200); /* arbitrary */ + reg = read_csr(dd, ASIC_STS_SBUS_RESULT); + result_code = (reg >> ASIC_STS_SBUS_RESULT_RESULT_CODE_SHIFT) + & ASIC_STS_SBUS_RESULT_RESULT_CODE_MASK; + if (result_code != SBUS_READ_COMPLETE) + continue; + + success = 1; + result = (reg >> ASIC_STS_SBUS_RESULT_DATA_OUT_SHIFT) + & ASIC_STS_SBUS_RESULT_DATA_OUT_MASK; + break; + } + + if (!success) { + dd_dev_err(dd, "%s: read failed, result code 0x%x\n", __func__, + result_code); + } + + return result; +} + /* * Turn off the SBus and fabric serdes spicos. 
* @@ -1636,6 +1678,7 @@ int load_firmware(struct hfi1_devdata *dd) return ret; } + dump_fw_version(dd); return 0; } @@ -2054,3 +2097,85 @@ void read_guid(struct hfi1_devdata *dd) dd_dev_info(dd, "GUID %llx", (unsigned long long)dd->base_guid); } + +/* read and display firmware version info */ +static void dump_fw_version(struct hfi1_devdata *dd) +{ + u32 pcie_vers[NUM_PCIE_SERDES]; + u32 fabric_vers[NUM_FABRIC_SERDES]; + u32 sbus_vers; + int i; + int all_same; + int ret; + u8 rcv_addr; + + ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT); + if (ret) { + dd_dev_err(dd, "Unable to acquire SBus to read firmware versions\n"); + return; + } + + /* set fast mode */ + set_sbus_fast_mode(dd); + + /* read version for SBus Master */ + sbus_request(dd, SBUS_MASTER_BROADCAST, 0x02, WRITE_SBUS_RECEIVER, 0); + sbus_request(dd, SBUS_MASTER_BROADCAST, 0x07, WRITE_SBUS_RECEIVER, 0x1); + /* wait for interrupt to be processed */ + usleep_range(10000, 11000); + sbus_vers = sbus_read(dd, SBUS_MASTER_BROADCAST, 0x08, 0x1); + dd_dev_info(dd, "SBus Master firmware version 0x%08x\n", sbus_vers); + + /* read version for PCIe SerDes */ + all_same = 1; + pcie_vers[0] = 0; + for (i = 0; i < NUM_PCIE_SERDES; i++) { + rcv_addr = pcie_serdes_addrs[dd->hfi1_id][i]; + sbus_request(dd, rcv_addr, 0x03, WRITE_SBUS_RECEIVER, 0); + /* wait for interrupt to be processed */ + usleep_range(10000, 11000); + pcie_vers[i] = sbus_read(dd, rcv_addr, 0x04, 0x0); + if (i > 0 && pcie_vers[0] != pcie_vers[i]) + all_same = 0; + } + + if (all_same) { + dd_dev_info(dd, "PCIe SerDes firmware version 0x%x\n", + pcie_vers[0]); + } else { + dd_dev_warn(dd, "PCIe SerDes do not have the same firmware version\n"); + for (i = 0; i < NUM_PCIE_SERDES; i++) { + dd_dev_info(dd, + "PCIe SerDes lane %d firmware version 0x%x\n", + i, pcie_vers[i]); + } + } + + /* read version for fabric SerDes */ + all_same = 1; + fabric_vers[0] = 0; + for (i = 0; i < NUM_FABRIC_SERDES; i++) { + rcv_addr = fabric_serdes_addrs[dd->hfi1_id][i]; 
+ sbus_request(dd, rcv_addr, 0x03, WRITE_SBUS_RECEIVER, 0); + /* wait for interrupt to be processed */ + usleep_range(10000, 11000); + fabric_vers[i] = sbus_read(dd, rcv_addr, 0x04, 0x0); + if (i > 0 && fabric_vers[0] != fabric_vers[i]) + all_same = 0; + } + + if (all_same) { + dd_dev_info(dd, "Fabric SerDes firmware version 0x%x\n", + fabric_vers[0]); + } else { + dd_dev_warn(dd, "Fabric SerDes do not have the same firmware version\n"); + for (i = 0; i < NUM_FABRIC_SERDES; i++) { + dd_dev_info(dd, + "Fabric SerDes lane %d firmware version 0x%x\n", + i, fabric_vers[i]); + } + } + + clear_sbus_fast_mode(dd); + release_chip_resource(dd, CR_SBUS); +} From a6580f4310ded039fc9f682cbf027fbba217652b Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Mon, 25 Jul 2016 13:39:08 -0700 Subject: [PATCH 36/84] IB/rdmavt: Add missing spin_lock_init call for rdi->n_cqs_lock This fixes the following warning with PROVE_LOCKING enabled kernel: INFO: trying to register non-static key. the code is fine but needs lockdep annotation. turning off the locking correctness validator. CPU: 15 PID: 12286 Comm: modprobe Not tainted 4.7.0-rc5.prove_rcu+ #1 Hardware name: Intel Corporation S2600WT2R/S2600WT2R, ...... Call Trace: [] dump_stack+0x85/0xc8 [] register_lock_class+0x415/0x4b0 [] ? __lock_acquire+0x40c/0x1960 [] __lock_acquire+0x99/0x1960 [] ? find_vmap_area+0x42/0x60 [] ? find_vmap_area+0x19/0x60 [] lock_acquire+0xd3/0x200 [] ? rvt_create_cq+0xc8/0x250 [rdmavt] [] _raw_spin_lock+0x31/0x40 [] ? rvt_create_cq+0xc8/0x250 [rdmavt] [] rvt_create_cq+0xc8/0x250 [rdmavt] [] ? static_obj+0x36/0x50 [] ib_alloc_cq+0x49/0x180 [ib_core] [] ib_mad_init_device+0x204/0x6d0 [ib_core] [] ? up_write+0x1f/0x40 [] ib_register_device+0x3d0/0x510 [ib_core] [] ? read_cc_setting_bin+0x200/0x200 [hfi1] [] ? static_obj+0x36/0x50 [] ? lockdep_init_map+0x88/0x200 [] rvt_register_device+0x17f/0x320 [rdmavt] [] hfi1_register_ib_device+0x6ca/0x7c0 [hfi1] [] init_one+0x2b4/0x430 [hfi1] [] local_pci_probe+0x45/0xa0 [] ?
pci_match_device+0xe0/0x110 [] pci_device_probe+0xfc/0x140 [] driver_probe_device+0x239/0x460 [] __driver_attach+0xcd/0xf0 [] ? driver_probe_device+0x460/0x460 [] bus_for_each_dev+0x73/0xc0 [] driver_attach+0x1e/0x20 [] bus_add_driver+0x1d3/0x290 [] ? dev_init+0x114/0x114 [hfi1] [] driver_register+0x60/0xe0 [] ? dev_init+0x114/0x114 [hfi1] [] __pci_register_driver+0x60/0x70 [] hfi1_mod_init+0x196/0x1fe [hfi1] [] do_one_initcall+0x50/0x190 [] ? rcu_read_lock_sched_held+0x62/0x70 [] ? kmem_cache_alloc_trace+0x23a/0x2a0 [] ? do_init_module+0x27/0x1dc [] do_init_module+0x60/0x1dc [] load_module+0x132c/0x1ac0 [] ? __symbol_put+0x60/0x60 [] ? ima_post_read_file+0x3d/0x80 Cc: Stable # 4.6+ Reviewed-by: Mike Marciniszyn Signed-off-by: Jianxin Xiong Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/cq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 6ca6fa80dd6e..f2f229efbe64 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -510,6 +510,7 @@ int rvt_driver_cq_init(struct rvt_dev_info *rdi) if (rdi->worker) return 0; + spin_lock_init(&rdi->n_cqs_lock); rdi->worker = kzalloc(sizeof(*rdi->worker), GFP_KERNEL); if (!rdi->worker) return -ENOMEM; From 8adf71fa145e5e309a07c1cca843da8cd83c1d75 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Mon, 25 Jul 2016 13:39:14 -0700 Subject: [PATCH 37/84] IB/hfi1: Fix "suspicious rcu_dereference_check() usage" warnings This fixes the following warnings with PROVE_LOCKING and PROVE_RCU enabled in the kernel: case (1): [ INFO: suspicious RCU usage. ] drivers/infiniband/hw/hfi1/init.c:532 suspicious rcu_dereference_check() usage! case (2): [ INFO: suspicious RCU usage. ] drivers/infiniband/hw/hfi1/hfi.h:1624 suspicious rcu_dereference_check() usage! 
Reviewed-by: Mike Marciniszyn Signed-off-by: Jianxin Xiong Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/hfi.h | 13 +++++++++++-- drivers/infiniband/hw/hfi1/init.c | 11 ++++++----- drivers/infiniband/hw/hfi1/mad.c | 2 +- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 7df4cb2ed7da..d22876d0fdd3 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1616,14 +1616,23 @@ static inline u16 hfi1_get_pkey(struct hfi1_ibport *ibp, unsigned index) } /* - * Readers of cc_state must call get_cc_state() under rcu_read_lock(). - * Writers of cc_state must call get_cc_state() under cc_state_lock. + * Called by readers of cc_state only, must call under rcu_read_lock(). */ static inline struct cc_state *get_cc_state(struct hfi1_pportdata *ppd) { return rcu_dereference(ppd->cc_state); } +/* + * Called by writers of cc_state only, must call under cc_state_lock. 
+ */ +static inline +struct cc_state *get_cc_state_protected(struct hfi1_pportdata *ppd) +{ + return rcu_dereference_protected(ppd->cc_state, + lockdep_is_held(&ppd->cc_state_lock)); +} + /* * values for dd->flags (_device_ related flags) */ diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index ec77c7edb025..44ec3a838b1b 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -475,8 +475,9 @@ static enum hrtimer_restart cca_timer_fn(struct hrtimer *t) void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, struct hfi1_devdata *dd, u8 hw_pidx, u8 port) { - int i, size; + int i; uint default_pkey_idx; + struct cc_state *cc_state; ppd->dd = dd; ppd->hw_pidx = hw_pidx; @@ -527,9 +528,9 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, spin_lock_init(&ppd->cc_state_lock); spin_lock_init(&ppd->cc_log_lock); - size = sizeof(struct cc_state); - RCU_INIT_POINTER(ppd->cc_state, kzalloc(size, GFP_KERNEL)); - if (!rcu_dereference(ppd->cc_state)) + cc_state = kzalloc(sizeof(*cc_state), GFP_KERNEL); + RCU_INIT_POINTER(ppd->cc_state, cc_state); + if (!cc_state) goto bail; return; @@ -1327,7 +1328,7 @@ static void cleanup_device_data(struct hfi1_devdata *dd) hrtimer_cancel(&ppd->cca_timer[i].hrtimer); spin_lock(&ppd->cc_state_lock); - cc_state = get_cc_state(ppd); + cc_state = get_cc_state_protected(ppd); RCU_INIT_POINTER(ppd->cc_state, NULL); spin_unlock(&ppd->cc_state_lock); diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 5590a4c84a4e..1263abe01999 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -3375,7 +3375,7 @@ static void apply_cc_state(struct hfi1_pportdata *ppd) */ spin_lock(&ppd->cc_state_lock); - old_cc_state = get_cc_state(ppd); + old_cc_state = get_cc_state_protected(ppd); if (!old_cc_state) { /* never active, or shutting down */ spin_unlock(&ppd->cc_state_lock); From 
c3f8de0b334c96cdbd7aa6c1c9d5578300ad68b0 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Mon, 25 Jul 2016 13:39:21 -0700 Subject: [PATCH 38/84] IB/hfi1: Add static PCIe Gen3 CTLE tuning Enhance the PCIe Gen3 recipe to support static CTLE tuning, and add a switch to choose between static and dynamic approaches. Make discrete chips default to static CTLE tuning. Reviewed-by: Tadeusz Struk Signed-off-by: Dean Luick Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/pcie.c | 68 +++++++++++++++++++++++++++++-- 1 file changed, 64 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index 0bac21e6a658..89c68da1c273 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -679,6 +679,10 @@ static uint pcie_pset = UNSET_PSET; module_param(pcie_pset, uint, S_IRUGO); MODULE_PARM_DESC(pcie_pset, "PCIe Eq Pset value to use, range is 0-10"); +static uint pcie_ctle = 1; /* discrete on, integrated off */ +module_param(pcie_ctle, uint, S_IRUGO); +MODULE_PARM_DESC(pcie_ctle, "PCIe static CTLE mode, bit 0 - discrete on/off, bit 1 - integrated on/off"); + /* equalization columns */ #define PREC 0 #define ATTN 1 @@ -716,6 +720,36 @@ static const u8 integrated_preliminary_eq[11][3] = { { 0x00, 0x1e, 0x0a }, /* p10 */ }; +static const u8 discrete_ctle_tunings[11][4] = { + /* DC LF HF BW */ + { 0x48, 0x0b, 0x04, 0x04 }, /* p0 */ + { 0x60, 0x05, 0x0f, 0x0a }, /* p1 */ + { 0x50, 0x09, 0x06, 0x06 }, /* p2 */ + { 0x68, 0x05, 0x0f, 0x0a }, /* p3 */ + { 0x80, 0x05, 0x0f, 0x0a }, /* p4 */ + { 0x70, 0x05, 0x0f, 0x0a }, /* p5 */ + { 0x68, 0x05, 0x0f, 0x0a }, /* p6 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p7 */ + { 0x48, 0x09, 0x06, 0x06 }, /* p8 */ + { 0x60, 0x05, 0x0f, 0x0a }, /* p9 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p10 */ +}; + +static const u8 integrated_ctle_tunings[11][4] = { + /* DC LF HF BW */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p0 */ + { 0x38, 0x0f, 0x00, 0x00 }, 
/* p1 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p2 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p3 */ + { 0x58, 0x0a, 0x05, 0x05 }, /* p4 */ + { 0x48, 0x0a, 0x05, 0x05 }, /* p5 */ + { 0x40, 0x0a, 0x05, 0x05 }, /* p6 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p7 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p8 */ + { 0x38, 0x09, 0x06, 0x06 }, /* p9 */ + { 0x38, 0x0e, 0x01, 0x01 }, /* p10 */ +}; + /* helper to format the value to write to hardware */ #define eq_value(pre, curr, post) \ ((((u32)(pre)) << \ @@ -951,11 +985,14 @@ int do_pcie_gen3_transition(struct hfi1_devdata *dd) u32 status, err; int ret; int do_retry, retry_count = 0; + int intnum = 0; uint default_pset; u16 target_vector, target_speed; u16 lnkctl2, vendor; u8 div; const u8 (*eq)[3]; + const u8 (*ctle_tunings)[4]; + uint static_ctle_mode; int return_error = 0; /* PCIe Gen3 is for the ASIC only */ @@ -1089,6 +1126,9 @@ retry: div = 3; eq = discrete_preliminary_eq; default_pset = DEFAULT_DISCRETE_PSET; + ctle_tunings = discrete_ctle_tunings; + /* bit 0 - discrete on/off */ + static_ctle_mode = pcie_ctle & 0x1; } else { /* 400mV, FS=29, LF = 9 */ fs = 29; @@ -1096,6 +1136,9 @@ retry: div = 1; eq = integrated_preliminary_eq; default_pset = DEFAULT_MCP_PSET; + ctle_tunings = integrated_ctle_tunings; + /* bit 1 - integrated on/off */ + static_ctle_mode = (pcie_ctle >> 1) & 0x1; } pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL101, (fs << @@ -1135,16 +1178,33 @@ retry: * step 5c: Program gasket interrupts */ /* set the Rx Bit Rate to REFCLK ratio */ - write_gasket_interrupt(dd, 0, 0x0006, 0x0050); + write_gasket_interrupt(dd, intnum++, 0x0006, 0x0050); /* disable pCal for PCIe Gen3 RX equalization */ - write_gasket_interrupt(dd, 1, 0x0026, 0x5b01); + /* select adaptive or static CTLE */ + write_gasket_interrupt(dd, intnum++, 0x0026, + 0x5b01 | (static_ctle_mode << 3)); /* * Enable iCal for PCIe Gen3 RX equalization, and set which * evaluation of RX_EQ_EVAL will launch the iCal procedure. 
*/ - write_gasket_interrupt(dd, 2, 0x0026, 0x5202); + write_gasket_interrupt(dd, intnum++, 0x0026, 0x5202); + + if (static_ctle_mode) { + /* apply static CTLE tunings */ + u8 pcie_dc, pcie_lf, pcie_hf, pcie_bw; + + pcie_dc = ctle_tunings[pcie_pset][0]; + pcie_lf = ctle_tunings[pcie_pset][1]; + pcie_hf = ctle_tunings[pcie_pset][2]; + pcie_bw = ctle_tunings[pcie_pset][3]; + write_gasket_interrupt(dd, intnum++, 0x0026, 0x0200 | pcie_dc); + write_gasket_interrupt(dd, intnum++, 0x0026, 0x0100 | pcie_lf); + write_gasket_interrupt(dd, intnum++, 0x0026, 0x0000 | pcie_hf); + write_gasket_interrupt(dd, intnum++, 0x0026, 0x5500 | pcie_bw); + } + /* terminate list */ - write_gasket_interrupt(dd, 3, 0x0000, 0x0000); + write_gasket_interrupt(dd, intnum++, 0x0000, 0x0000); /* * step 5d: program XMT margin From b14db1f0aa4fccd0e5ebcbe588d1136f3a23dbc6 Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Mon, 25 Jul 2016 13:39:27 -0700 Subject: [PATCH 39/84] IB/hfi1: Add sysfs entry to override SDMA interrupt affinity Add sysfs entry to allow user to override affinity for SDMA engine interrupts. 
Reviewed-by: Dean Luick Signed-off-by: Tadeusz Struk Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/affinity.c | 68 +++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/affinity.h | 4 ++ drivers/infiniband/hw/hfi1/sysfs.c | 25 ++++++++++ 3 files changed, 97 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index c9dcbd55883a..79575ee873f2 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -47,6 +47,7 @@ #include #include #include +#include #include "hfi.h" #include "affinity.h" @@ -673,3 +674,70 @@ void hfi1_put_proc_affinity(int cpu) } spin_unlock(&affinity->lock); } + +/* Prevents concurrent reads and writes of the sdma_affinity attrib */ +static DEFINE_MUTEX(sdma_affinity_mutex); + +int hfi1_set_sdma_affinity(struct hfi1_devdata *dd, const char *buf, + size_t count) +{ + struct hfi1_affinity_node *entry; + struct cpumask mask; + int ret, i; + + spin_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + spin_unlock(&node_affinity.lock); + + if (!entry) + return -EINVAL; + + ret = cpulist_parse(buf, &mask); + if (ret) + return ret; + + if (!cpumask_subset(&mask, cpu_online_mask) || cpumask_empty(&mask)) { + dd_dev_warn(dd, "Invalid CPU mask\n"); + return -EINVAL; + } + + mutex_lock(&sdma_affinity_mutex); + /* reset the SDMA interrupt affinity details */ + init_cpu_mask_set(&entry->def_intr); + cpumask_copy(&entry->def_intr.mask, &mask); + /* + * Reassign the affinity for each SDMA interrupt. + */ + for (i = 0; i < dd->num_msix_entries; i++) { + struct hfi1_msix_entry *msix; + + msix = &dd->msix_entries[i]; + if (msix->type != IRQ_SDMA) + continue; + + ret = hfi1_get_irq_affinity(dd, msix); + + if (ret) + break; + } + + mutex_unlock(&sdma_affinity_mutex); + return ret ? 
ret : strnlen(buf, PAGE_SIZE); +} + +int hfi1_get_sdma_affinity(struct hfi1_devdata *dd, char *buf) +{ + struct hfi1_affinity_node *entry; + + spin_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + spin_unlock(&node_affinity.lock); + + if (!entry) + return -EINVAL; + + mutex_lock(&sdma_affinity_mutex); + cpumap_print_to_pagebuf(true, buf, &entry->def_intr.mask); + mutex_unlock(&sdma_affinity_mutex); + return strnlen(buf, PAGE_SIZE); +} diff --git a/drivers/infiniband/hw/hfi1/affinity.h b/drivers/infiniband/hw/hfi1/affinity.h index f784de52e881..8879cf7a8cac 100644 --- a/drivers/infiniband/hw/hfi1/affinity.h +++ b/drivers/infiniband/hw/hfi1/affinity.h @@ -102,6 +102,10 @@ int hfi1_get_proc_affinity(int); /* Release a CPU used by a user process. */ void hfi1_put_proc_affinity(int); +int hfi1_get_sdma_affinity(struct hfi1_devdata *dd, char *buf); +int hfi1_set_sdma_affinity(struct hfi1_devdata *dd, const char *buf, + size_t count); + struct hfi1_affinity_node { int node; struct cpu_mask_set def_intr; diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c index 91fc2aed6aed..74c84c655f7e 100644 --- a/drivers/infiniband/hw/hfi1/sysfs.c +++ b/drivers/infiniband/hw/hfi1/sysfs.c @@ -49,6 +49,7 @@ #include "hfi.h" #include "mad.h" #include "trace.h" +#include "affinity.h" /* * Start of per-port congestion control structures and support code @@ -622,6 +623,27 @@ static ssize_t show_tempsense(struct device *device, return ret; } +static ssize_t show_sdma_affinity(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + + return hfi1_get_sdma_affinity(dd, buf); +} + +static ssize_t store_sdma_affinity(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + struct 
hfi1_devdata *dd = dd_from_dev(dev); + + return hfi1_set_sdma_affinity(dd, buf, count); +} + /* * end of per-unit (or driver, in some cases, but replicated * per unit) functions @@ -636,6 +658,8 @@ static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL); static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL); static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL); static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset); +static DEVICE_ATTR(sdma_affinity, S_IWUSR | S_IRUGO, show_sdma_affinity, + store_sdma_affinity); static struct device_attribute *hfi1_attributes[] = { &dev_attr_hw_rev, @@ -646,6 +670,7 @@ static struct device_attribute *hfi1_attributes[] = { &dev_attr_boardversion, &dev_attr_tempsense, &dev_attr_chip_reset, + &dev_attr_sdma_affinity, }; int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num, From 23002d5b08ccbbf0902bbc3430293629a1fa12c6 Mon Sep 17 00:00:00 2001 From: Grzegorz Heldt Date: Mon, 25 Jul 2016 13:39:33 -0700 Subject: [PATCH 40/84] IB/hfi1: Fix trace message units Trace shows incorrect amount of allocated memory. Fix trace to display memory in KB. 
Reviewed-by: Mike Marciniszyn Signed-off-by: Grzegorz Heldt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 44ec3a838b1b..a358d23ecd54 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1777,8 +1777,8 @@ int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd) hfi1_cdbg(PROC, "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n", - rcd->ctxt, rcd->egrbufs.alloced, rcd->egrbufs.rcvtid_size, - rcd->egrbufs.size); + rcd->ctxt, rcd->egrbufs.alloced, + rcd->egrbufs.rcvtid_size / 1024, rcd->egrbufs.size / 1024); /* * Set the contexts rcv array head update threshold to the closest From 856cc4c237add46510c8ae91764f4eda31a9e1cf Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 25 Jul 2016 13:39:39 -0700 Subject: [PATCH 41/84] IB/hfi1: Add the capability for reserved operations This fix allows for support of in-kernel reserved operations without impacting the ULP user. The low level driver can register a non-zero value which will be transparently added to the send queue size and hidden from the ULP in every respect. ULP post sends will never see a full queue due to a reserved post send and reserved operations will never exceed that registered value. The s_avail will continue to track the ULP swqe availability and the difference between the reserved value and the reserved in use will track reserved availability.
Reviewed-by: Ashutosh Dixit Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/qp.c | 83 +++++++++++++++++++++++-------- include/rdma/rdma_vt.h | 1 + include/rdma/rdmavt_qp.h | 50 +++++++++++++++++++ 3 files changed, 112 insertions(+), 22 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index f79b809241e0..218494c6afe2 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -584,6 +584,7 @@ static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, qp->r_rq.wq->tail = 0; } qp->r_sge.num_sge = 0; + atomic_set(&qp->s_reserved_used, 0); } /** @@ -645,7 +646,8 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, return ERR_PTR(-EINVAL); } sqsize = - init_attr->cap.max_send_wr + 1; + init_attr->cap.max_send_wr + 1 + + rdi->dparms.reserved_operations; switch (init_attr->qp_type) { case IB_QPT_SMI: case IB_QPT_GSI: @@ -1335,7 +1337,8 @@ int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, attr->sq_psn = qp->s_next_psn & rdi->dparms.psn_mask; attr->dest_qp_num = qp->remote_qpn; attr->qp_access_flags = qp->qp_access_flags; - attr->cap.max_send_wr = qp->s_size - 1; + attr->cap.max_send_wr = qp->s_size - 1 - + rdi->dparms.reserved_operations; attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1; attr->cap.max_send_sge = qp->s_max_sge; attr->cap.max_recv_sge = qp->r_rq.max_sge; @@ -1494,27 +1497,65 @@ static inline int rvt_qp_valid_operation( } /** - * qp_get_savail - return number of avail send entries + * rvt_qp_is_avail - determine queue capacity * @qp - the qp + * @rdi - the rdmavt device + * @reserved_op - is reserved operation * * This assumes the s_hlock is held but the s_last * qp variable is uncontrolled. * - * The return is adjusted to not count device specific - * reserved operations. + * For non reserved operations, the qp->s_avail + * may be changed. 
+ * + * The return value is zero or a -ENOMEM. */ -static inline u32 qp_get_savail(struct rvt_qp *qp) +static inline int rvt_qp_is_avail( + struct rvt_qp *qp, + struct rvt_dev_info *rdi, + bool reserved_op) { u32 slast; - u32 ret; + u32 avail; + u32 reserved_used; + /* see rvt_qp_wqe_unreserve() */ + smp_mb__before_atomic(); + reserved_used = atomic_read(&qp->s_reserved_used); + if (unlikely(reserved_op)) { + /* see rvt_qp_wqe_unreserve() */ + smp_mb__before_atomic(); + if (reserved_used >= rdi->dparms.reserved_operations) + return -ENOMEM; + return 0; + } + /* non-reserved operations */ + if (likely(qp->s_avail)) + return 0; smp_read_barrier_depends(); /* see rc.c */ slast = ACCESS_ONCE(qp->s_last); if (qp->s_head >= slast) - ret = qp->s_size - (qp->s_head - slast); + avail = qp->s_size - (qp->s_head - slast); else - ret = slast - qp->s_head; - return ret - 1; + avail = slast - qp->s_head; + + /* see rvt_qp_wqe_unreserve() */ + smp_mb__before_atomic(); + reserved_used = atomic_read(&qp->s_reserved_used); + avail = avail - 1 - + (rdi->dparms.reserved_operations - reserved_used); + /* insure we don't assign a negative s_avail */ + if ((s32)avail <= 0) + return -ENOMEM; + qp->s_avail = avail; + if (WARN_ON(qp->s_avail > + (qp->s_size - 1 - rdi->dparms.reserved_operations))) + rvt_pr_err(rdi, + "More avail entries than QP RB size.\nQP: %u, size: %u, avail: %u\nhead: %u, tail: %u, cur: %u, acked: %u, last: %u", + qp->ibqp.qp_num, qp->s_size, qp->s_avail, + qp->s_head, qp->s_tail, qp->s_cur, + qp->s_acked, qp->s_last); + return 0; } /** @@ -1537,6 +1578,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp, u8 log_pmtu; int ret; size_t cplen; + bool reserved_op; BUILD_BUG_ON(IB_QPT_MAX >= (sizeof(u32) * BITS_PER_BYTE)); @@ -1574,18 +1616,12 @@ static int rvt_post_one_wr(struct rvt_qp *qp, } } + reserved_op = rdi->post_parms[wr->opcode].flags & + RVT_OPERATION_USE_RESERVE; /* check for avail */ - if (unlikely(!qp->s_avail)) { - qp->s_avail = qp_get_savail(qp); - if 
(WARN_ON(qp->s_avail > (qp->s_size - 1))) - rvt_pr_err(rdi, - "More avail entries than QP RB size.\nQP: %u, size: %u, avail: %u\nhead: %u, tail: %u, cur: %u, acked: %u, last: %u", - qp->ibqp.qp_num, qp->s_size, qp->s_avail, - qp->s_head, qp->s_tail, qp->s_cur, - qp->s_acked, qp->s_last); - if (!qp->s_avail) - return -ENOMEM; - } + ret = rvt_qp_is_avail(qp, rdi, reserved_op); + if (ret) + return ret; next = qp->s_head + 1; if (next >= qp->s_size) next = 0; @@ -1653,8 +1689,11 @@ static int rvt_post_one_wr(struct rvt_qp *qp, qp->s_next_psn = wqe->lpsn + 1; } trace_rvt_post_one_wr(qp, wqe); + if (unlikely(reserved_op)) + rvt_qp_wqe_reserve(qp, wqe); + else + qp->s_avail--; smp_wmb(); /* see request builders */ - qp->s_avail--; qp->s_head = next; return 0; diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 7fdba92d4c05..e31502107a58 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -158,6 +158,7 @@ struct rvt_driver_params { u32 max_mad_size; u8 qos_shift; u8 max_rdma_atomic; + u8 reserved_operations; }; /* Protection domain */ diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index b0ab12b30f1e..56adcfcabe0b 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -144,6 +144,11 @@ #define RVT_PROCESS_OR_FLUSH_SEND \ (RVT_PROCESS_SEND_OK | RVT_FLUSH_SEND) +/* + * Internal send flags + */ +#define RVT_SEND_RESERVE_USED IB_SEND_RESERVED_START + /* * Send work request queue entry. 
* The size of the sg_list is determined when the QP is created and stored @@ -232,6 +237,7 @@ struct rvt_ack_entry { #define RVT_OPERATION_ATOMIC 0x00000002 #define RVT_OPERATION_ATOMIC_SGE 0x00000004 #define RVT_OPERATION_LOCAL 0x00000008 +#define RVT_OPERATION_USE_RESERVE 0x00000010 #define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1) @@ -328,6 +334,7 @@ struct rvt_qp { u32 s_next_psn; /* PSN for next request */ u32 s_avail; /* number of entries avail */ u32 s_ssn; /* SSN of tail entry */ + atomic_t s_reserved_used; /* reserved entries in use */ spinlock_t s_lock ____cacheline_aligned_in_smp; u32 s_flags; @@ -459,6 +466,49 @@ static inline struct rvt_rwqe *rvt_get_rwqe_ptr(struct rvt_rq *rq, unsigned n) rq->max_sge * sizeof(struct ib_sge)) * n); } +/** + * rvt_qp_wqe_reserve - reserve operation + * @qp - the rvt qp + * @wqe - the send wqe + * + * This routine used in post send to record + * a wqe relative reserved operation use. + */ +static inline void rvt_qp_wqe_reserve( + struct rvt_qp *qp, + struct rvt_swqe *wqe) +{ + wqe->wr.send_flags |= RVT_SEND_RESERVE_USED; + atomic_inc(&qp->s_reserved_used); +} + +/** + * rvt_qp_wqe_unreserve - clean reserved operation + * @qp - the rvt qp + * @wqe - the send wqe + * + * This decrements the reserve use count. + * + * This call MUST precede the change to + * s_last to insure that post send sees a stable + * s_avail. + * + * An smp_mp__after_atomic() is used to insure + * the compiler does not juggle the order of the s_last + * ring index and the decrementing of s_reserved_used. 
+ */ +static inline void rvt_qp_wqe_unreserve( + struct rvt_qp *qp, + struct rvt_swqe *wqe) +{ + if (unlikely(wqe->wr.send_flags & RVT_SEND_RESERVE_USED)) { + wqe->wr.send_flags &= ~RVT_SEND_RESERVE_USED; + atomic_dec(&qp->s_reserved_used); + /* insure no compiler re-order up to s_last change */ + smp_mb__after_atomic(); + } +} + extern const int ib_rvt_state_ops[]; struct rvt_dev_info; From d9b13c203003cfb78c1f216049a204d385ccaeff Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Mon, 25 Jul 2016 13:39:45 -0700 Subject: [PATCH 42/84] IB/rdmavt, hfi1: Fix NFSoRDMA failure with FRMR enabled Hanging has been observed while writing a file over NFSoRDMA. Dmesg on the server contains messages like these: [ 931.992501] svcrdma: Error -22 posting RDMA_READ [ 952.076879] svcrdma: Error -22 posting RDMA_READ [ 982.154127] svcrdma: Error -22 posting RDMA_READ [ 1012.235884] svcrdma: Error -22 posting RDMA_READ [ 1042.319194] svcrdma: Error -22 posting RDMA_READ Here is why: With the base memory management extension enabled, FRMR is used instead of FMR. The xprtrdma server issues each RDMA read request as the following bundle: (1)IB_WR_REG_MR, signaled; (2)IB_WR_RDMA_READ, signaled; (3)IB_WR_LOCAL_INV, signaled & fencing. These requests are signaled. In order to generate completion, the fast register work request is processed by the hfi1 send engine after being posted to the work queue, and the corresponding lkey is not valid until the request is processed. However, the rdmavt driver validates lkey when the RDMA read request is posted and thus it fails immediately with error -EINVAL (-22). This patch changes the work flow of local operations (fast register and local invalidate) so that fast register work requests are always processed immediately to ensure that the corresponding lkey is valid when subsequent work requests are posted. Local invalidate requests are processed immediately if fencing is not required and no previous local invalidate request is pending. 
To allow completion generation for signaled local operations that have been processed before posting to the work queue, an internal send flag RVT_SEND_COMPLETION_ONLY is added. The hfi1 send engine checks this flag and only generates completion for such requests. Reviewed-by: Mike Marciniszyn Signed-off-by: Jianxin Xiong Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 17 +++++------ drivers/infiniband/hw/hfi1/ruc.c | 13 ++++----- drivers/infiniband/hw/hfi1/uc.c | 15 +++++----- drivers/infiniband/sw/rdmavt/qp.c | 48 ++++++++++++++++++++----------- include/rdma/rdmavt_qp.h | 1 + 5 files changed, 56 insertions(+), 38 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 0bc43b67d0b8..5da190e6011b 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -402,7 +402,6 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) char newreq; int middle = 0; int delta; - int err; ps->s_txreq = get_txreq(ps->dev, qp); if (IS_ERR(ps->s_txreq)) @@ -484,25 +483,27 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) */ if (wqe->wr.opcode == IB_WR_REG_MR || wqe->wr.opcode == IB_WR_LOCAL_INV) { + int local_ops = 0; + int err = 0; + if (qp->s_last != qp->s_cur) goto bail; if (++qp->s_cur == qp->s_size) qp->s_cur = 0; if (++qp->s_tail == qp->s_size) qp->s_tail = 0; - if (wqe->wr.opcode == IB_WR_REG_MR) - err = rvt_fast_reg_mr( - qp, wqe->reg_wr.mr, - wqe->reg_wr.key, - wqe->reg_wr.access); - else + if (!(wqe->wr.send_flags & + RVT_SEND_COMPLETION_ONLY)) { err = rvt_invalidate_rkey( qp, wqe->wr.ex.invalidate_rkey); + local_ops = 1; + } hfi1_send_complete(qp, wqe, err ? 
IB_WC_LOC_PROT_ERR : IB_WC_SUCCESS); - atomic_dec(&qp->local_ops_pending); + if (local_ops) + atomic_dec(&qp->local_ops_pending); qp->s_hdrwords = 0; goto done_free_tx; } diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index 76b9c9e42d86..7e76d33a5774 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -442,16 +442,15 @@ again: sqp->s_len = wqe->length; switch (wqe->wr.opcode) { case IB_WR_REG_MR: - if (rvt_fast_reg_mr(sqp, wqe->reg_wr.mr, wqe->reg_wr.key, - wqe->reg_wr.access)) - send_status = IB_WC_LOC_PROT_ERR; - local_ops = 1; goto send_comp; case IB_WR_LOCAL_INV: - if (rvt_invalidate_rkey(sqp, wqe->wr.ex.invalidate_rkey)) - send_status = IB_WC_LOC_PROT_ERR; - local_ops = 1; + if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) { + if (rvt_invalidate_rkey(sqp, + wqe->wr.ex.invalidate_rkey)) + send_status = IB_WC_LOC_PROT_ERR; + local_ops = 1; + } goto send_comp; case IB_WR_SEND_WITH_INV: diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index ef6c96cd3d68..a726d96d185f 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -77,7 +77,6 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) u32 len; u32 pmtu = qp->pmtu; int middle = 0; - int err; ps->s_txreq = get_txreq(ps->dev, qp); if (IS_ERR(ps->s_txreq)) @@ -125,20 +124,22 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) */ if (wqe->wr.opcode == IB_WR_REG_MR || wqe->wr.opcode == IB_WR_LOCAL_INV) { + int local_ops = 0; + int err = 0; + if (qp->s_last != qp->s_cur) goto bail; if (++qp->s_cur == qp->s_size) qp->s_cur = 0; - if (wqe->wr.opcode == IB_WR_REG_MR) - err = rvt_fast_reg_mr(qp, wqe->reg_wr.mr, - wqe->reg_wr.key, - wqe->reg_wr.access); - else + if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) { err = rvt_invalidate_rkey( qp, wqe->wr.ex.invalidate_rkey); + local_ops = 1; + } hfi1_send_complete(qp, wqe, err ? 
IB_WC_LOC_PROT_ERR : IB_WC_SUCCESS); - atomic_dec(&qp->local_ops_pending); + if (local_ops) + atomic_dec(&qp->local_ops_pending); qp->s_hdrwords = 0; goto done_free_tx; } diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 218494c6afe2..8ccf1b970b2c 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1579,6 +1579,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp, int ret; size_t cplen; bool reserved_op; + int local_ops_delayed = 0; BUILD_BUG_ON(IB_QPT_MAX >= (sizeof(u32) * BITS_PER_BYTE)); @@ -1592,25 +1593,37 @@ static int rvt_post_one_wr(struct rvt_qp *qp, cplen = ret; /* - * Local operations including fast register and local invalidate - * can be processed immediately w/o being posted to the send queue - * if neither fencing nor completion generation is needed. However, - * once fencing or completion is requested, direct processing of - * following local operations must be disabled until all the local - * operations posted to the send queue have completed. This is - * necessary to ensure the correct ordering. + * Local operations include fast register and local invalidate. + * Fast register needs to be processed immediately because the + * registered lkey may be used by following work requests and the + * lkey needs to be valid at the time those requests are posted. + * Local invalidate can be processed immediately if fencing is + * not required and no previous local invalidate ops are pending. + * Signaled local operations that have been processed immediately + * need to have requests with "completion only" flags set posted + * to the send queue in order to generate completions. 
*/ - if ((rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) && - !(wr->send_flags & (IB_SEND_FENCE | IB_SEND_SIGNALED)) && - !atomic_read(&qp->local_ops_pending)) { - struct ib_reg_wr *reg = reg_wr(wr); - + if ((rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL)) { switch (wr->opcode) { case IB_WR_REG_MR: - return rvt_fast_reg_mr(qp, reg->mr, reg->key, - reg->access); + ret = rvt_fast_reg_mr(qp, + reg_wr(wr)->mr, + reg_wr(wr)->key, + reg_wr(wr)->access); + if (ret || !(wr->send_flags & IB_SEND_SIGNALED)) + return ret; + break; case IB_WR_LOCAL_INV: - return rvt_invalidate_rkey(qp, wr->ex.invalidate_rkey); + if ((wr->send_flags & IB_SEND_FENCE) || + atomic_read(&qp->local_ops_pending)) { + local_ops_delayed = 1; + } else { + ret = rvt_invalidate_rkey( + qp, wr->ex.invalidate_rkey); + if (ret || !(wr->send_flags & IB_SEND_SIGNALED)) + return ret; + } + break; default: return -EINVAL; } @@ -1675,7 +1688,10 @@ static int rvt_post_one_wr(struct rvt_qp *qp, } if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) { - atomic_inc(&qp->local_ops_pending); + if (local_ops_delayed) + atomic_inc(&qp->local_ops_pending); + else + wqe->wr.send_flags |= RVT_SEND_COMPLETION_ONLY; wqe->ssn = 0; wqe->psn = 0; wqe->lpsn = 0; diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 56adcfcabe0b..13902dd319a9 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -148,6 +148,7 @@ * Internal send flags */ #define RVT_SEND_RESERVE_USED IB_SEND_RESERVED_START +#define RVT_SEND_COMPLETION_ONLY (IB_SEND_RESERVED_START << 1) /* * Send work request queue entry. From 3ca5f4c06892fdd1c7a4587af4d92550a170ef37 Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Mon, 25 Jul 2016 13:39:51 -0700 Subject: [PATCH 43/84] IB/hfi1: Disable external device configuration requests QSFP CDR enablement is now controlled by determining power class and the configuration file. 
We disable the DC 8051 from requesting enablement or disabling of TX and RX CDRs by removing the code that allowed the DC 8051 to request changes. Reviewed-by: Dean Luick Signed-off-by: Easwar Hariharan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/platform.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/platform.c b/drivers/infiniband/hw/hfi1/platform.c index 03df9322f862..41af6297133c 100644 --- a/drivers/infiniband/hw/hfi1/platform.c +++ b/drivers/infiniband/hw/hfi1/platform.c @@ -537,20 +537,6 @@ static void apply_tunings( u8 precur = 0, attn = 0, postcur = 0, external_device_config = 0; u8 *cache = ppd->qsfp_info.cache; - /* Enable external device config if channel is limiting active */ - read_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS, - GENERAL_CONFIG, &config_data); - config_data &= ~(0xff << ENABLE_EXT_DEV_CONFIG_SHIFT); - config_data |= ((u32)limiting_active << ENABLE_EXT_DEV_CONFIG_SHIFT); - ret = load_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS, - GENERAL_CONFIG, config_data); - if (ret != HCMD_SUCCESS) - dd_dev_err( - ppd->dd, - "%s: Failed to set enable external device config\n", - __func__); - - config_data = 0; /* re-init */ /* Pass tuning method to 8051 */ read_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG, &config_data); From 5fbd98dd20b9e9829868ebb874bc4d97f3ed3c9e Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Mon, 25 Jul 2016 13:39:57 -0700 Subject: [PATCH 44/84] IB/hfi1: Ignore QSFP interrupts until power stabilizes Some QSFP cables assert the interrupt line as a side effect of module plug-in and power up. This causes the SerDes and QSFP tuning algorithm to begin cable initialization by reading the QSFP memory map over I2C, which fails. This patch ignores any interrupt line assertion until the module has completed power up and voltage rails have stabilized, which can take a maximum of 500 ms per the SFF-8679 specification. 
Reviewed-by: Dean Luick Signed-off-by: Easwar Hariharan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index faeed29afc8f..becc7b11e31e 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -9191,17 +9191,24 @@ static void wait_for_qsfp_init(struct hfi1_pportdata *ppd) unsigned long timeout; /* - * Check for QSFP interrupt for t_init (SFF 8679) + * Some QSFP cables have a quirk that asserts the IntN line as a side + * effect of power up on plug-in. We ignore this false positive + * interrupt until the module has finished powering up by waiting for + * a minimum timeout of the module inrush initialization time of + * 500 ms (SFF 8679 Table 5-6) to ensure the voltage rails in the + * module have stabilized. + */ + msleep(500); + + /* + * Check for QSFP interrupt for t_init (SFF 8679 Table 8-1) */ timeout = jiffies + msecs_to_jiffies(2000); while (1) { mask = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_IN : ASIC_QSFP1_IN); - if (!(mask & QSFP_HFI0_INT_N)) { - write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR : - ASIC_QSFP1_CLEAR, QSFP_HFI0_INT_N); + if (!(mask & QSFP_HFI0_INT_N)) break; - } if (time_after(jiffies, timeout)) { dd_dev_info(dd, "%s: No IntN detected, reset complete\n", __func__); @@ -9217,10 +9224,17 @@ static void set_qsfp_int_n(struct hfi1_pportdata *ppd, u8 enable) u64 mask; mask = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK); - if (enable) + if (enable) { + /* + * Clear the status register to avoid an immediate interrupt + * when we re-enable the IntN pin + */ + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR : ASIC_QSFP1_CLEAR, + QSFP_HFI0_INT_N); mask |= (u64)QSFP_HFI0_INT_N; - else + } else { mask &= ~(u64)QSFP_HFI0_INT_N; + } write_csr(dd, dd->hfi1_id ? 
ASIC_QSFP2_MASK : ASIC_QSFP1_MASK, mask); } From b5e710195492f682d93097cddac13e594d39a946 Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Mon, 25 Jul 2016 13:40:03 -0700 Subject: [PATCH 45/84] IB/hfi1: Reset QSFP on every run through channel tuning Active QSFP cables were reset only every alternate iteration of the channel tuning algorithm instead of every iteration due to incorrect reset of the flag that controlled QSFP reset, resulting in using stale QSFP status in the channel tuning algorithm. Fixes: 8ebd4cf1852a ("Add active and optical cable support") Reviewed-by: Dean Luick Signed-off-by: Easwar Hariharan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/platform.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/platform.c b/drivers/infiniband/hw/hfi1/platform.c index 41af6297133c..965c8aef0c60 100644 --- a/drivers/infiniband/hw/hfi1/platform.c +++ b/drivers/infiniband/hw/hfi1/platform.c @@ -624,9 +624,13 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, if (ret) return ret; + /* + * We'll change the QSFP memory contents from here on out, thus we set a + * flag here to remind ourselves to reset the QSFP module. This prevents + * reuse of stale settings established in our previous pass through. + */ if (ppd->qsfp_info.reset_needed) { reset_qsfp(ppd); - ppd->qsfp_info.reset_needed = 0; refresh_qsfp_cache(ppd, &ppd->qsfp_info); } else { ppd->qsfp_info.reset_needed = 1; From bd24ef5eca75b00c9d98533f5644750de2f29a65 Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Mon, 25 Jul 2016 13:40:10 -0700 Subject: [PATCH 46/84] IB/hfi1: Remove unused elements from struct ahg_ib_header sde and hfi1_ib_header are not used anymore. 
Reviewed-by: Dennis Dalessandro Reviewed-by: Dean Luick Reviewed-by: Ira Weiny Signed-off-by: Dasaratharaman Chandramouli Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/ruc.c | 2 -- drivers/infiniband/hw/hfi1/ud.c | 1 - drivers/infiniband/hw/hfi1/verbs.h | 2 -- 3 files changed, 5 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index 7e76d33a5774..c7c2f48ad8a8 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -735,7 +735,6 @@ static inline void build_ahg(struct rvt_qp *qp, u32 npsn) qp->s_ahgpsn = npsn; priv->s_hdr->tx_flags |= SDMA_TXREQ_F_AHG_COPY; /* save to protect a change in another thread */ - priv->s_hdr->sde = priv->s_sde; priv->s_hdr->ahgidx = qp->s_ahgidx; qp->s_flags |= RVT_S_AHG_VALID; } @@ -804,7 +803,6 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct hfi1_other_headers *ohdr, priv->s_hdr->tx_flags = 0; priv->s_hdr->ahgcount = 0; priv->s_hdr->ahgidx = 0; - priv->s_hdr->sde = NULL; if (qp->s_mig_state == IB_MIG_MIGRATED) bth0 |= IB_BTH_MIG_REQ; else diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index 02488c6fab28..d4afb1dfda27 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -433,7 +433,6 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) priv->s_hdr->ahgcount = 0; priv->s_hdr->ahgidx = 0; priv->s_hdr->tx_flags = 0; - priv->s_hdr->sde = NULL; /* pbc */ ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2; diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 488356775627..4bd1cd42e981 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -179,12 +179,10 @@ struct hfi1_ib_header { } __packed; struct ahg_ib_header { - struct sdma_engine *sde; u32 ahgdesc[2]; u16 tx_flags; u8 ahgcount; u8 ahgidx; - struct hfi1_ib_header ibh; }; struct 
hfi1_pio_header { From a9b6b3bc295d2360480d32049c32661e809c7c5c Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Mon, 25 Jul 2016 13:40:16 -0700 Subject: [PATCH 47/84] IB/hfi1: Rename struct ahg_ib_header to struct hfi1_ahg_info struct ahg_ib_header has no header specific information. Rename it to struct hfi1_ahg_info Reviewed-by: Dennis Dalessandro Reviewed-by: Dean Luick Signed-off-by: Dasaratharaman Chandramouli Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/qp.c | 7 ++++--- drivers/infiniband/hw/hfi1/qp.h | 2 +- drivers/infiniband/hw/hfi1/ruc.c | 28 ++++++++++++++-------------- drivers/infiniband/hw/hfi1/ud.c | 6 +++--- drivers/infiniband/hw/hfi1/verbs.c | 20 ++++++++++---------- drivers/infiniband/hw/hfi1/verbs.h | 4 ++-- 6 files changed, 34 insertions(+), 33 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index ad8ad33aaa73..a5aa3517e7d5 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -806,8 +806,9 @@ void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp, priv->owner = qp; - priv->s_hdr = kzalloc_node(sizeof(*priv->s_hdr), gfp, rdi->dparms.node); - if (!priv->s_hdr) { + priv->s_ahg = kzalloc_node(sizeof(*priv->s_ahg), gfp, + rdi->dparms.node); + if (!priv->s_ahg) { kfree(priv); return ERR_PTR(-ENOMEM); } @@ -820,7 +821,7 @@ void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; - kfree(priv->s_hdr); + kfree(priv->s_ahg); kfree(priv); } diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h index ddf82988b02f..587d84d65bb8 100644 --- a/drivers/infiniband/hw/hfi1/qp.h +++ b/drivers/infiniband/hw/hfi1/qp.h @@ -63,7 +63,7 @@ static inline void clear_ahg(struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; - priv->s_hdr->ahgcount = 0; + priv->s_ahg->ahgcount = 0; qp->s_flags &= ~(RVT_S_AHG_VALID | 
RVT_S_AHG_CLEAR); if (priv->s_sde && qp->s_ahgidx >= 0) sdma_ahg_free(priv->s_sde, qp->s_ahgidx); diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index c7c2f48ad8a8..ea7977dc646e 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -711,7 +711,7 @@ u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr, #define BTH2_OFFSET (offsetof(struct hfi1_pio_header, hdr.u.oth.bth[2]) / 4) /** - * build_ahg - create ahg in s_hdr + * build_ahg - create ahg in s_ahg * @qp: a pointer to QP * @npsn: the next PSN for the request/response * @@ -733,18 +733,18 @@ static inline void build_ahg(struct rvt_qp *qp, u32 npsn) qp->s_ahgidx = sdma_ahg_alloc(priv->s_sde); if (qp->s_ahgidx >= 0) { qp->s_ahgpsn = npsn; - priv->s_hdr->tx_flags |= SDMA_TXREQ_F_AHG_COPY; + priv->s_ahg->tx_flags |= SDMA_TXREQ_F_AHG_COPY; /* save to protect a change in another thread */ - priv->s_hdr->ahgidx = qp->s_ahgidx; + priv->s_ahg->ahgidx = qp->s_ahgidx; qp->s_flags |= RVT_S_AHG_VALID; } } else { /* subsequent middle after valid */ if (qp->s_ahgidx >= 0) { - priv->s_hdr->tx_flags |= SDMA_TXREQ_F_USE_AHG; - priv->s_hdr->ahgidx = qp->s_ahgidx; - priv->s_hdr->ahgcount++; - priv->s_hdr->ahgdesc[0] = + priv->s_ahg->tx_flags |= SDMA_TXREQ_F_USE_AHG; + priv->s_ahg->ahgidx = qp->s_ahgidx; + priv->s_ahg->ahgcount++; + priv->s_ahg->ahgdesc[0] = sdma_build_ahg_descriptor( (__force u16)cpu_to_be16((u16)npsn), BTH2_OFFSET, @@ -752,8 +752,8 @@ static inline void build_ahg(struct rvt_qp *qp, u32 npsn) 16); if ((npsn & 0xffff0000) != (qp->s_ahgpsn & 0xffff0000)) { - priv->s_hdr->ahgcount++; - priv->s_hdr->ahgdesc[1] = + priv->s_ahg->ahgcount++; + priv->s_ahg->ahgdesc[1] = sdma_build_ahg_descriptor( (__force u16)cpu_to_be16( (u16)(npsn >> 16)), @@ -790,7 +790,7 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct hfi1_other_headers *ohdr, } lrh0 |= (priv->s_sc & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4; /* - * reset s_hdr/AHG fields + * reset 
s_ahg/AHG fields * * This insures that the ahgentry/ahgcount * are at a non-AHG default to protect @@ -800,9 +800,9 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct hfi1_other_headers *ohdr, * build_ahg() will modify as appropriate * to use the AHG feature. */ - priv->s_hdr->tx_flags = 0; - priv->s_hdr->ahgcount = 0; - priv->s_hdr->ahgidx = 0; + priv->s_ahg->tx_flags = 0; + priv->s_ahg->ahgcount = 0; + priv->s_ahg->ahgidx = 0; if (qp->s_mig_state == IB_MIG_MIGRATED) bth0 |= IB_BTH_MIG_REQ; else @@ -913,7 +913,7 @@ void hfi1_do_send(struct rvt_qp *qp) */ if (hfi1_verbs_send(qp, &ps)) return; - /* Record that s_hdr is empty. */ + /* Record that s_ahg is empty. */ qp->s_hdrwords = 0; /* allow other tasks to run */ if (unlikely(time_after(jiffies, timeout))) { diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index d4afb1dfda27..08a9c1219b29 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -430,9 +430,9 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) qp->qkey : wqe->ud_wr.remote_qkey); ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); /* disarm any ahg */ - priv->s_hdr->ahgcount = 0; - priv->s_hdr->ahgidx = 0; - priv->s_hdr->tx_flags = 0; + priv->s_ahg->ahgcount = 0; + priv->s_ahg->ahgidx = 0; + priv->s_ahg->tx_flags = 0; /* pbc */ ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 95785651fb77..2b3fa400c593 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -816,19 +816,19 @@ static int build_verbs_tx_desc( struct rvt_sge_state *ss, u32 length, struct verbs_txreq *tx, - struct ahg_ib_header *ahdr, + struct hfi1_ahg_info *ahg_info, u64 pbc) { int ret = 0; struct hfi1_pio_header *phdr = &tx->phdr; u16 hdrbytes = tx->hdr_dwords << 2; - if (!ahdr->ahgcount) { + if (!ahg_info->ahgcount) { ret = sdma_txinit_ahg( &tx->txreq, - ahdr->tx_flags, + 
ahg_info->tx_flags, hdrbytes + length, - ahdr->ahgidx, + ahg_info->ahgidx, 0, NULL, 0, @@ -846,11 +846,11 @@ static int build_verbs_tx_desc( } else { ret = sdma_txinit_ahg( &tx->txreq, - ahdr->tx_flags, + ahg_info->tx_flags, length, - ahdr->ahgidx, - ahdr->ahgcount, - ahdr->ahgdesc, + ahg_info->ahgidx, + ahg_info->ahgcount, + ahg_info->ahgdesc, hdrbytes, verbs_sdma_complete); if (ret) @@ -868,7 +868,7 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, u64 pbc) { struct hfi1_qp_priv *priv = qp->priv; - struct ahg_ib_header *ahdr = priv->s_hdr; + struct hfi1_ahg_info *ahg_info = priv->s_ahg; u32 hdrwords = qp->s_hdrwords; struct rvt_sge_state *ss = qp->s_cur_sge; u32 len = qp->s_cur_size; @@ -896,7 +896,7 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, plen); } tx->wqe = qp->s_wqe; - ret = build_verbs_tx_desc(tx->sde, ss, len, tx, ahdr, pbc); + ret = build_verbs_tx_desc(tx->sde, ss, len, tx, ahg_info, pbc); if (unlikely(ret)) goto bail_build; } diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 4bd1cd42e981..d44550fdd39f 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -178,7 +178,7 @@ struct hfi1_ib_header { } u; } __packed; -struct ahg_ib_header { +struct hfi1_ahg_info { u32 ahgdesc[2]; u16 tx_flags; u8 ahgcount; @@ -195,7 +195,7 @@ struct hfi1_pio_header { * pair is made common */ struct hfi1_qp_priv { - struct ahg_ib_header *s_hdr; /* next header to send */ + struct hfi1_ahg_info *s_ahg; /* ahg info for next header */ struct sdma_engine *s_sde; /* current sde */ struct send_context *s_sendcontext; /* current sendcontext */ u8 s_sc; /* SC[0..4] for next packet */ From d4d602e9a3035d039befdd37df5213b430948f28 Mon Sep 17 00:00:00 2001 From: Don Hiatt Date: Mon, 25 Jul 2016 13:40:22 -0700 Subject: [PATCH 48/84] IB/hfi1: Rename hfi1_pio_header to hfi1_sdma_header. 
hfi1_pio_header should really be called hfi1_sdma_header as it is only used for sdma transmits. Reviewed-by: Dennis Dalessandro Reviewed-by: Dean Luick Reviewed-by: Ira Weiny Signed-off-by: Don Hiatt Signed-off-by: Dasaratharaman Chandramouli Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/ruc.c | 2 +- drivers/infiniband/hw/hfi1/verbs.c | 2 +- drivers/infiniband/hw/hfi1/verbs.h | 2 +- drivers/infiniband/hw/hfi1/verbs_txreq.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index ea7977dc646e..48d5094f98e2 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -708,7 +708,7 @@ u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr, return sizeof(struct ib_grh) / sizeof(u32); } -#define BTH2_OFFSET (offsetof(struct hfi1_pio_header, hdr.u.oth.bth[2]) / 4) +#define BTH2_OFFSET (offsetof(struct hfi1_sdma_header, hdr.u.oth.bth[2]) / 4) /** * build_ahg - create ahg in s_ahg diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 2b3fa400c593..67810e33f754 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -820,7 +820,7 @@ static int build_verbs_tx_desc( u64 pbc) { int ret = 0; - struct hfi1_pio_header *phdr = &tx->phdr; + struct hfi1_sdma_header *phdr = &tx->phdr; u16 hdrbytes = tx->hdr_dwords << 2; if (!ahg_info->ahgcount) { diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index d44550fdd39f..d1b101c54828 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -185,7 +185,7 @@ struct hfi1_ahg_info { u8 ahgidx; }; -struct hfi1_pio_header { +struct hfi1_sdma_header { __le64 pbc; struct hfi1_ib_header hdr; } __packed; diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h index a1d6e0807f97..5660897593ba 100644 --- 
a/drivers/infiniband/hw/hfi1/verbs_txreq.h +++ b/drivers/infiniband/hw/hfi1/verbs_txreq.h @@ -56,7 +56,7 @@ #include "iowait.h" struct verbs_txreq { - struct hfi1_pio_header phdr; + struct hfi1_sdma_header phdr; struct sdma_txreq txreq; struct rvt_qp *qp; struct rvt_swqe *wqe; From 89c057cae4deaeab94c652c06b855af2bb754f50 Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Mon, 25 Jul 2016 13:40:28 -0700 Subject: [PATCH 49/84] IB/hfi1: Cleanup UD packet handler. Cleanup hfi1_ud_rcv to not have to look at the packet header fields multiple times. The fields are looked up once and used throughout the function. Also fix sc computation when validating MAD packets. Reviewed-by: Dennis Dalessandro Reviewed-by: Dean Luick Reviewed-by: Ira Weiny Signed-off-by: Dasaratharaman Chandramouli Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/ud.c | 51 ++++++++++++++------------------- 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index 08a9c1219b29..852c20f5498c 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -664,13 +664,13 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) struct hfi1_other_headers *ohdr = packet->ohdr; int opcode; u32 hdrsize = packet->hlen; - u32 pad; struct ib_wc wc; u32 qkey; u32 src_qp; u16 dlid, pkey; int mgmt_pkey_idx = -1; struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data; + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); struct hfi1_ib_header *hdr = packet->hdr; u32 rcv_flags = packet->rcv_flags; void *data = packet->ebuf; @@ -678,31 +678,34 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) struct rvt_qp *qp = packet->qp; bool has_grh = rcv_flags & HFI1_HAS_GRH; u8 sc5 = hdr2sc((struct hfi1_message_header *)hdr, packet->rhf); + u32 bth1; + u8 sl_from_sc, sl; + u16 slid; + u8 extra_bytes; qkey = be32_to_cpu(ohdr->u.ud.deth[0]); src_qp = 
be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK; dlid = be16_to_cpu(hdr->lrh[1]); + bth1 = be32_to_cpu(ohdr->bth[1]); + slid = be16_to_cpu(hdr->lrh[3]); + pkey = (u16)be32_to_cpu(ohdr->bth[0]); + sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf; + extra_bytes = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + extra_bytes += (SIZE_OF_CRC << 2); + sl_from_sc = ibp->sc_to_sl[sc5]; - /* - * The opcode is in the low byte when its in network order - * (top byte when in host order). - */ opcode = be32_to_cpu(ohdr->bth[0]) >> 24; opcode &= 0xff; - pkey = (u16)be32_to_cpu(ohdr->bth[0]); - process_ecn(qp, packet, (opcode != IB_OPCODE_CNP)); - /* * Get the number of bytes the message was padded by * and drop incomplete packets. */ - pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; - if (unlikely(tlen < (hdrsize + pad + 4))) + if (unlikely(tlen < (hdrsize + extra_bytes))) goto drop; - tlen -= hdrsize + pad + 4; + tlen -= hdrsize + extra_bytes; /* * Check that the permissive LID is only used on QP0 @@ -713,10 +716,6 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) hdr->lrh[3] == IB_LID_PERMISSIVE)) goto drop; if (qp->ibqp.qp_num > 1) { - struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - u16 slid; - - slid = be16_to_cpu(hdr->lrh[3]); if (unlikely(rcv_pkey_check(ppd, pkey, sc5, slid))) { /* * Traps will not be sent for packets dropped @@ -725,12 +724,9 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) * IB spec (release 1.3, section 10.9.4) */ hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY, - pkey, - (be16_to_cpu(hdr->lrh[0]) >> 4) & - 0xF, + pkey, sl, src_qp, qp->ibqp.qp_num, - be16_to_cpu(hdr->lrh[3]), - be16_to_cpu(hdr->lrh[1])); + slid, dlid); return; } } else { @@ -740,22 +736,18 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) goto drop; } if (unlikely(qkey != qp->qkey)) { - hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey, - (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF, + hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey, sl, src_qp, qp->ibqp.qp_num, - be16_to_cpu(hdr->lrh[3]), - be16_to_cpu(hdr->lrh[1])); + slid, 
dlid); return; } /* Drop invalid MAD packets (see 13.5.3.1). */ if (unlikely(qp->ibqp.qp_num == 1 && - (tlen > 2048 || - (be16_to_cpu(hdr->lrh[0]) >> 12) == 15))) + (tlen > 2048 || (sc5 == 0xF)))) goto drop; } else { /* Received on QP0, and so by definition, this is an SMP */ struct opa_smp *smp = (struct opa_smp *)data; - u16 slid = be16_to_cpu(hdr->lrh[3]); if (opa_smp_check(ibp, pkey, sc5, qp, slid, smp)) goto drop; @@ -838,7 +830,6 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) qp->ibqp.qp_type == IB_QPT_SMI) { if (mgmt_pkey_idx < 0) { if (net_ratelimit()) { - struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); struct hfi1_devdata *dd = ppd->dd; dd_dev_err(dd, "QP type %d mgmt_pkey_idx < 0 and packet not dropped???\n", @@ -851,8 +842,8 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) wc.pkey_index = 0; } - wc.slid = be16_to_cpu(hdr->lrh[3]); - wc.sl = ibp->sc_to_sl[sc5]; + wc.slid = slid; + wc.sl = sl_from_sc; /* * Save the LMC lower bits if the destination LID is a unicast LID. From b736a469f96a28805296f0e8f0c6aa5206f694d0 Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Mon, 25 Jul 2016 13:40:34 -0700 Subject: [PATCH 50/84] IB/hfi1: Use hdr2sc function to calculate 5-bit SC The interface is used to compute the 5-bit SC field from the LRH and the RHF bits. Modify code to use the interface instead. 
Reviewed-by: Dennis Dalessandro Signed-off-by: Dasaratharaman Chandramouli Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/driver.c | 4 +--- drivers/infiniband/hw/hfi1/hfi.h | 7 +------ drivers/infiniband/hw/hfi1/verbs.c | 7 ++----- 3 files changed, 4 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index 4dbadf77f01d..8246dc7d0573 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -392,9 +392,7 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, u16 rlid; u8 svc_type, sl, sc5; - sc5 = (be16_to_cpu(rhdr->lrh[0]) >> 12) & 0xf; - if (rhf_dc_info(packet->rhf)) - sc5 |= 0x10; + sc5 = hdr2sc(rhdr, packet->rhf); sl = ibp->sc_to_sl[sc5]; lqpn = be32_to_cpu(bth[1]) & RVT_QPN_MASK; diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index d22876d0fdd3..df43732621ee 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -368,11 +368,6 @@ struct hfi1_packet { u8 etype; }; -static inline bool has_sc4_bit(struct hfi1_packet *p) -{ - return !!rhf_dc_info(p->rhf); -} - /* * Private data for snoop/capture support. 
*/ @@ -1273,7 +1268,7 @@ void receive_interrupt_work(struct work_struct *work); static inline int hdr2sc(struct hfi1_message_header *hdr, u64 rhf) { return ((be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf) | - ((!!(rhf & RHF_DC_INFO_SMASK)) << 4); + ((!!(rhf_dc_info(rhf))) << 4); } static inline u16 generate_jkey(kuid_t uid) diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 67810e33f754..5265d160fa63 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1743,8 +1743,7 @@ void hfi1_cnp_rcv(struct hfi1_packet *packet) struct rvt_qp *qp = packet->qp; u32 lqpn, rqpn = 0; u16 rlid = 0; - u8 sl, sc5, sc4_bit, svc_type; - bool sc4_set = has_sc4_bit(packet); + u8 sl, sc5, svc_type; switch (packet->qp->ibqp.qp_type) { case IB_QPT_UC: @@ -1767,9 +1766,7 @@ void hfi1_cnp_rcv(struct hfi1_packet *packet) return; } - sc4_bit = sc4_set << 4; - sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf; - sc5 |= sc4_bit; + sc5 = hdr2sc((struct hfi1_message_header *)hdr, packet->rhf); sl = ibp->sc_to_sl[sc5]; lqpn = qp->ibqp.qp_num; From 527dbf12e08175e510ab445528c1a2964c1129ca Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Mon, 25 Jul 2016 13:40:40 -0700 Subject: [PATCH 51/84] IB/qib, IB/hfi1: Fix grh creation in ud loopback Instead of copying the actual GRH of type struct ib_grh, existing code copies the struct ib_global_route into the sge. This patch fixes that and constructs the actual GRH from ib_global_route and copies the GRH into the sge. 
Reviewed-by: Dennis Dalessandro Reviewed-by: Dean Luick Reviewed-by: Ira Weiny Signed-off-by: Dasaratharaman Chandramouli Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/ud.c | 8 ++++++-- drivers/infiniband/hw/qib/qib_ud.c | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index 852c20f5498c..f01e8e1d62d3 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -184,8 +184,12 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) } if (ah_attr->ah_flags & IB_AH_GRH) { - hfi1_copy_sge(&qp->r_sge, &ah_attr->grh, - sizeof(struct ib_grh), 1, 0); + struct ib_grh grh; + struct ib_global_route grd = ah_attr->grh; + + hfi1_make_grh(ibp, &grh, &grd, 0, 0); + hfi1_copy_sge(&qp->r_sge, &grh, + sizeof(grh), 1, 0); wc.wc_flags |= IB_WC_GRH; } else { hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1); diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index 846e6c726df7..10d062561bd9 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -169,8 +169,12 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) } if (ah_attr->ah_flags & IB_AH_GRH) { - qib_copy_sge(&qp->r_sge, &ah_attr->grh, - sizeof(struct ib_grh), 1); + struct ib_grh grh; + struct ib_global_route grd = ah_attr->grh; + + qib_make_grh(ibp, &grh, &grd, 0, 0); + qib_copy_sge(&qp->r_sge, &grh, + sizeof(grh), 1); wc.wc_flags |= IB_WC_GRH; } else qib_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1); From 042b0159aa6c230093c4318b689ef9a5b89f29e2 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 27 Jul 2016 21:06:15 -0400 Subject: [PATCH 52/84] IB/hfi1: Handle kzalloc failure in init_pervl_scs Checking the return value of the memory allocation call in init_pervl_scs() was missed. 
Recently the kmalloc() was changed to kzalloc() which identified the problem. While fixing this issue 2 other bugs were noticed. First, the array being allocated is accessed in the nomem path which can be reached before it is allocated. Second, kernel_send_context was not released on error. Fix both of these by creating a more common memory unwind label structure. Fixes: 35f6befc8441 ("staging/rdma/hfi1: Add qp to send context mapping for PIO") Reported-by: Leon Romanovsky Reviewed-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/pio.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index a99fccadf624..ac1bf4a73571 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -1952,13 +1952,17 @@ int init_pervl_scs(struct hfi1_devdata *dd) dd->vld[15].sc = sc_alloc(dd, SC_VL15, dd->rcd[0]->rcvhdrqentsize, dd->node); if (!dd->vld[15].sc) - goto nomem; + return -ENOMEM; + hfi1_init_ctxt(dd->vld[15].sc); dd->vld[15].mtu = enum_to_mtu(OPA_MTU_2048); dd->kernel_send_context = kzalloc_node(dd->num_send_contexts * sizeof(struct send_context *), GFP_KERNEL, dd->node); + if (!dd->kernel_send_context) + goto freesc15; + dd->kernel_send_context[0] = dd->vld[15].sc; for (i = 0; i < num_vls; i++) { @@ -2010,12 +2014,21 @@ int init_pervl_scs(struct hfi1_devdata *dd) if (pio_map_init(dd, ppd->port - 1, num_vls, NULL)) goto nomem; return 0; + nomem: - sc_free(dd->vld[15].sc); - for (i = 0; i < num_vls; i++) + for (i = 0; i < num_vls; i++) { sc_free(dd->vld[i].sc); + dd->vld[i].sc = NULL; + } + for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) sc_free(dd->kernel_send_context[i + 1]); + + kfree(dd->kernel_send_context); + dd->kernel_send_context = NULL; + +freesc15: + sc_free(dd->vld[15].sc); return -ENOMEM; } From fe508272c963d62de4183c32b6883c3d54c557ef Mon 
Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 27 Jul 2016 21:07:36 -0400 Subject: [PATCH 53/84] IB/rdmavt: Eliminate redundant opcode test in mr ref clear The use of the specific opcode test is redundant since all ack entry users correctly manipulate the mr pointer to selectively trigger the reference clearing. The overly specific test hinders the use of implementation specific operations. The change needs to get rid of the union to ensure that an atomic value is not seen as an MR pointer. Reviewed-by: Ashutosh Dixit Signed-off-by: Mike Marciniszyn Signed-off-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/qp.c | 3 +-- include/rdma/rdmavt_qp.h | 10 ++++------ 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 8ccf1b970b2c..bdb540f25a88 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -435,8 +435,7 @@ static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends) for (n = 0; n < rvt_max_atomic(rdi); n++) { struct rvt_ack_entry *e = &qp->s_ack_queue[n]; - if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST && - e->rdma_sge.mr) { + if (e->rdma_sge.mr) { rvt_put_mr(e->rdma_sge.mr); e->rdma_sge.mr = NULL; } diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 13902dd319a9..bd34d0b56bf7 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -222,14 +222,12 @@ struct rvt_mmap_info { * to send a RDMA read response or atomic operation.
*/ struct rvt_ack_entry { - u8 opcode; - u8 sent; + struct rvt_sge rdma_sge; + u64 atomic_data; u32 psn; u32 lpsn; - union { - struct rvt_sge rdma_sge; - u64 atomic_data; - }; + u8 opcode; + u8 sent; }; #define RC_QP_SCALING_INTERVAL 5 From c49298026908a8ce9dcf01ed68734ad171cef98b Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 27 Jul 2016 21:08:42 -0400 Subject: [PATCH 54/84] IB/hfi1: Allow for non-double word multiple message sizes for user SDMA The driver pads non-double word multiple message sizes but it doesn't account for this padding when the packet length is calculated. Also, the data length is miscalculated for message sizes less than 4 bytes due to the bit representation in LRH. And there's a check for non-double word multiple message sizes that prevents these messages from being sent. This patch fixes length miscalculations and enables the functionality to send non-double word multiple message sizes. Reviewed-by: Harish Chegondi Signed-off-by: Sebastian Sanchez Signed-off-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/user_sdma.c | 31 ++++++++++++++++++-------- include/uapi/rdma/hfi/hfi1_user.h | 2 +- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index d16ed52a2cb1..1e266c95056a 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -793,14 +793,21 @@ static inline u32 compute_data_length(struct user_sdma_request *req, * The size of the data of the first packet is in the header * template. However, it includes the header and ICRC, which need * to be subtracted. + * The minimum representable packet data length in a header is 4 bytes, + * therefore, when the data length request is less than 4 bytes, there's + * only one packet, and the packet data length is equal to that of the + * request data length. 
* The size of the remaining packets is the minimum of the frag * size (MTU) or remaining data in the request. */ u32 len; if (!req->seqnum) { - len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) - - (sizeof(tx->hdr) - 4)); + if (req->data_len < sizeof(u32)) + len = req->data_len; + else + len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) - + (sizeof(tx->hdr) - 4)); } else if (req_opcode(req->info.ctrl) == EXPECTED) { u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) * PAGE_SIZE; @@ -830,6 +837,13 @@ static inline u32 compute_data_length(struct user_sdma_request *req, return len; } +static inline u32 pad_len(u32 len) +{ + if (len & (sizeof(u32) - 1)) + len += sizeof(u32) - (len & (sizeof(u32) - 1)); + return len; +} + static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len) { /* (Size of complete header - size of PBC) + 4B ICRC + data length */ @@ -921,7 +935,8 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) { if (!req->seqnum) { u16 pbclen = le16_to_cpu(req->hdr.pbc[0]); - u32 lrhlen = get_lrh_len(req->hdr, datalen); + u32 lrhlen = get_lrh_len(req->hdr, + pad_len(datalen)); /* * Copy the request header into the tx header * because the HW needs a cacheline-aligned @@ -1219,16 +1234,14 @@ static int check_header_template(struct user_sdma_request *req, /* * Perform safety checks for any type of packet: * - transfer size is multiple of 64bytes - * - packet length is multiple of 4bytes - * - entire request length is multiple of 4bytes + * - packet length is multiple of 4 bytes * - packet length is not larger than MTU size * * These checks are only done for the first packet of the * transfer since the header is "given" to us by user space. * For the remainder of the packets we compute the values. 
*/ - if (req->info.fragsize % PIO_BLOCK_SIZE || - lrhlen & 0x3 || req->data_len & 0x3 || + if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 || lrhlen > get_lrh_len(*hdr, req->info.fragsize)) return -EINVAL; @@ -1290,7 +1303,7 @@ static int set_txreq_header(struct user_sdma_request *req, struct hfi1_pkt_header *hdr = &tx->hdr; u16 pbclen; int ret; - u32 tidval = 0, lrhlen = get_lrh_len(*hdr, datalen); + u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen)); /* Copy the header template to the request before modification */ memcpy(hdr, &req->hdr, sizeof(*hdr)); @@ -1401,7 +1414,7 @@ static int set_txreq_header_ahg(struct user_sdma_request *req, struct hfi1_user_sdma_pkt_q *pq = req->pq; struct hfi1_pkt_header *hdr = &req->hdr; u16 pbclen = le16_to_cpu(hdr->pbc[0]); - u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, len); + u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(len)); if (PBC2LRH(pbclen) != lrhlen) { /* PBC.PbcLengthDWs */ diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h index 98bebf8bef55..d15e7289d835 100644 --- a/include/uapi/rdma/hfi/hfi1_user.h +++ b/include/uapi/rdma/hfi/hfi1_user.h @@ -75,7 +75,7 @@ * may not be implemented; the user code must deal with this if it * cares, or it must abort after initialization reports the difference. */ -#define HFI1_USER_SWMINOR 1 +#define HFI1_USER_SWMINOR 2 /* * We will encode the major/minor inside a single 32bit version number. From fc0b76c0168dc7577792c371c16bad200009a62d Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 27 Jul 2016 21:09:40 -0400 Subject: [PATCH 55/84] IB/hfi1: Expand reported serial number Expand the serial number space by using more bits from the GUID. 
Reviewed-by: Jubin John Signed-off-by: Dean Luick Signed-off-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index becc7b11e31e..c93683496614 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -14601,8 +14601,14 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, /* set up LCB access - must be after set_up_interrupts() */ init_lcb_access(dd); + /* + * Serial number is created from the base guid: + * [27:24] = base guid [38:35] + * [23: 0] = base guid [23: 0] + */ snprintf(dd->serial, SERIAL_MAX, "0x%08llx\n", - dd->base_guid & 0xFFFFFF); + (dd->base_guid & 0xFFFFFF) | + ((dd->base_guid >> 11) & 0xF000000)); dd->oui1 = dd->base_guid >> 56 & 0xFF; dd->oui2 = dd->base_guid >> 48 & 0xFF; From 72720ddfc6d2256d62c4d8a644cf2ac54a27af90 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 Jul 2016 12:27:25 -0400 Subject: [PATCH 56/84] IB/hfi1: Fix minor format error Brackets should be on the next line of a function Reviewed-by: Dean Luick Signed-off-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/user_sdma.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index 1e266c95056a..86c28851491c 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -1115,7 +1115,8 @@ static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages) } static int pin_vector_pages(struct user_sdma_request *req, - struct user_sdma_iovec *iovec) { + struct user_sdma_iovec *iovec) +{ int ret = 0, pinned, npages, cleared; struct page **pages; struct hfi1_user_sdma_pkt_q *pq = req->pq; From 8e1f52df978ec17475e1184ed9f72078babcbbfa Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 28 Jul 2016 12:27:26 -0400 
Subject: [PATCH 57/84] IB/hfi1: Remove unused uctxt->subpid and uctxt->pid These are no longer needed. Reviewed-by: Ira Weiny Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/file_ops.c | 4 ---- drivers/infiniband/hw/hfi1/hfi.h | 3 --- 2 files changed, 7 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index d7c07bc7bd14..b80c8d2ac52b 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -727,7 +727,6 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) if (--uctxt->cnt) { uctxt->active_slaves &= ~(1 << fdata->subctxt); - uctxt->subpid[fdata->subctxt] = 0; mutex_unlock(&hfi1_mutex); goto done; } @@ -753,7 +752,6 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) write_kctxt_csr(dd, uctxt->sc->hw_context, SEND_CTXT_CHECK_ENABLE, hfi1_pkt_default_send_ctxt_mask(dd, uctxt->sc->type)); sc_disable(uctxt->sc); - uctxt->pid = 0; spin_unlock_irqrestore(&dd->uctxt_lock, flags); dd->rcd[uctxt->ctxt] = NULL; @@ -893,7 +891,6 @@ static int find_shared_ctxt(struct file *fp, } fd->uctxt = uctxt; fd->subctxt = uctxt->cnt++; - uctxt->subpid[fd->subctxt] = current->pid; uctxt->active_slaves |= 1 << fd->subctxt; ret = 1; goto done; @@ -978,7 +975,6 @@ static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd, return ret; } uctxt->userversion = uinfo->userversion; - uctxt->pid = current->pid; uctxt->flags = HFI1_CAP_UGET(MASK); init_waitqueue_head(&uctxt->wait); strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm)); diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index df43732621ee..63ce587d6f94 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -270,9 +270,6 @@ struct hfi1_ctxtdata { u32 urgent; /* saved total number of polled urgent packets for poll edge trigger */ u32 urgent_poll; - /* pid of process using this ctxt */ - pid_t pid; - 
pid_t subpid[HFI1_MAX_SHARED_CTXTS]; /* same size as task_struct .comm[], command that opened context */ char comm[TASK_COMM_LEN]; /* so file ops can get at unit */ From fc87879ae237785704a6b6a54c1c5a47c395662c Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 28 Jul 2016 12:27:27 -0400 Subject: [PATCH 58/84] IB/hfi1: Remove unused function hfi1_mmu_rb_search Reviewed-by: Ira Weiny Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/mmu_rb.c | 17 ----------------- drivers/infiniband/hw/hfi1/mmu_rb.h | 2 -- 2 files changed, 19 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index b7a80aa1ae30..9289bfaa4911 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -223,23 +223,6 @@ static void __mmu_rb_remove(struct mmu_rb_handler *handler, handler->ops->remove(handler->root, node, mm); } -struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *root, unsigned long addr, - unsigned long len) -{ - struct mmu_rb_handler *handler = find_mmu_handler(root); - struct mmu_rb_node *node; - unsigned long flags; - - if (!handler) - return ERR_PTR(-EINVAL); - - spin_lock_irqsave(&handler->lock, flags); - node = __mmu_rb_search(handler, addr, len); - spin_unlock_irqrestore(&handler->lock, flags); - - return node; -} - struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *root, unsigned long addr, unsigned long len) { diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h index 7a57b9c49d27..215c728e52cd 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.h +++ b/drivers/infiniband/hw/hfi1/mmu_rb.h @@ -68,8 +68,6 @@ int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops); void hfi1_mmu_rb_unregister(struct rb_root *); int hfi1_mmu_rb_insert(struct rb_root *, struct mmu_rb_node *); void hfi1_mmu_rb_remove(struct rb_root *, struct mmu_rb_node *); -struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *, unsigned long, 
- unsigned long); struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *, unsigned long, unsigned long); From ac335e7e8079d08441aba46ce2c07398b603719f Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 Jul 2016 12:27:28 -0400 Subject: [PATCH 59/84] IB/hfi1: Add parameter names to function declarations Parameter names to function declarations make it more clear what those parameters do. Reviewed-by: Dean Luick Signed-off-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/hfi.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 63ce587d6f94..6fb86fee0701 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1700,9 +1700,11 @@ void shutdown_led_override(struct hfi1_pportdata *ppd); */ #define DEFAULT_RCVHDR_ENTSIZE 32 -bool hfi1_can_pin_pages(struct hfi1_devdata *, u32, u32); -int hfi1_acquire_user_pages(unsigned long, size_t, bool, struct page **); -void hfi1_release_user_pages(struct mm_struct *, struct page **, size_t, bool); +bool hfi1_can_pin_pages(struct hfi1_devdata *dd, u32 nlocked, u32 npages); +int hfi1_acquire_user_pages(unsigned long vaddr, size_t npages, bool writable, + struct page **pages); +void hfi1_release_user_pages(struct mm_struct *mm, struct page **p, + size_t npages, bool dirty); static inline void clear_rcvhdrtail(const struct hfi1_ctxtdata *rcd) { From 862548dace34690b6a477f32e8ce68b50f7cbdf6 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 Jul 2016 12:27:29 -0400 Subject: [PATCH 60/84] IB/hfi1: Add parameter names to callback declarations This makes it more clear what these functions are operating on. 
Reviewed-by: Dean Luick Signed-off-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/mmu_rb.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h index 215c728e52cd..45e7245d813b 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.h +++ b/drivers/infiniband/hw/hfi1/mmu_rb.h @@ -57,11 +57,12 @@ struct mmu_rb_node { }; struct mmu_rb_ops { - bool (*filter)(struct mmu_rb_node *, unsigned long, unsigned long); - int (*insert)(struct rb_root *, struct mmu_rb_node *); - void (*remove)(struct rb_root *, struct mmu_rb_node *, - struct mm_struct *); - int (*invalidate)(struct rb_root *, struct mmu_rb_node *); + bool (*filter)(struct mmu_rb_node *node, unsigned long addr, + unsigned long len); + int (*insert)(struct rb_root *root, struct mmu_rb_node *mnode); + void (*remove)(struct rb_root *root, struct mmu_rb_node *mnode, + struct mm_struct *mm); + int (*invalidate)(struct rb_root *root, struct mmu_rb_node *node); }; int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops); From c0946642e574181e00cf8561579e495d08d18988 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 28 Jul 2016 12:27:30 -0400 Subject: [PATCH 61/84] IB/hfi1: Always expect ops functions Remove, insert, and invalidate are always provided. No need to test. 
Reviewed-by: Ira Weiny Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/mmu_rb.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index 9289bfaa4911..525d58afb354 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -99,9 +99,6 @@ int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops) { struct mmu_rb_handler *handlr; - if (!ops->invalidate) - return -EINVAL; - handlr = kmalloc(sizeof(*handlr), GFP_KERNEL); if (!handlr) return -ENOMEM; @@ -143,8 +140,7 @@ void hfi1_mmu_rb_unregister(struct rb_root *root) while ((node = rb_first(root))) { rbnode = rb_entry(node, struct mmu_rb_node, node); rb_erase(node, root); - if (handler->ops->remove) - handler->ops->remove(root, rbnode, NULL); + handler->ops->remove(root, rbnode, NULL); } } spin_unlock_irqrestore(&handler->lock, flags); @@ -172,11 +168,9 @@ int hfi1_mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode) } __mmu_int_rb_insert(mnode, root); - if (handler->ops->insert) { - ret = handler->ops->insert(root, mnode); - if (ret) - __mmu_int_rb_remove(mnode, root); - } + ret = handler->ops->insert(root, mnode); + if (ret) + __mmu_int_rb_remove(mnode, root); unlock: spin_unlock_irqrestore(&handler->lock, flags); return ret; @@ -219,8 +213,7 @@ static void __mmu_rb_remove(struct mmu_rb_handler *handler, __mmu_int_rb_remove(node, handler->root); spin_unlock_irqrestore(&handler->lock, flags); - if (handler->ops->remove) - handler->ops->remove(handler->root, node, mm); + handler->ops->remove(handler->root, node, mm); } struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *root, @@ -300,8 +293,7 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn, node->addr, node->len); if (handler->ops->invalidate(root, node)) { __mmu_int_rb_remove(node, root); - if (handler->ops->remove) - 
handler->ops->remove(root, node, mm); + handler->ops->remove(root, node, mm); } } spin_unlock_irqrestore(&handler->lock, flags); From 3c1091aa94d6c973cfb3eab17afdb4565f8a4ae6 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 Jul 2016 12:27:31 -0400 Subject: [PATCH 62/84] IB/hfi1: Consolidate __mmu_rb_remove and hfi1_mmu_rb_remove __mmu_rb_remove was called in only 1 place which was a very simple call site. Combine this function into its caller. Reviewed-by: Dean Luick Signed-off-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/mmu_rb.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index 525d58afb354..b845adf9fc0e 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -200,22 +200,6 @@ static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler, return node; } -/* Caller must *not* hold handler lock. */ -static void __mmu_rb_remove(struct mmu_rb_handler *handler, - struct mmu_rb_node *node, struct mm_struct *mm) -{ - unsigned long flags; - - /* Validity of handler and node pointers has been checked by caller. */ - hfi1_cdbg(MMU, "Removing node addr 0x%llx, len %u", node->addr, - node->len); - spin_lock_irqsave(&handler->lock, flags); - __mmu_int_rb_remove(node, handler->root); - spin_unlock_irqrestore(&handler->lock, flags); - - handler->ops->remove(handler->root, node, mm); -} - struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *root, unsigned long addr, unsigned long len) { @@ -237,12 +221,20 @@ struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *root, void hfi1_mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node) { + unsigned long flags; struct mmu_rb_handler *handler = find_mmu_handler(root); if (!handler || !node) return; - __mmu_rb_remove(handler, node, NULL); + /* Validity of handler and node pointers has been checked by caller. 
*/ + hfi1_cdbg(MMU, "Removing node addr 0x%llx, len %u", node->addr, + node->len); + spin_lock_irqsave(&handler->lock, flags); + __mmu_int_rb_remove(node, handler->root); + spin_unlock_irqrestore(&handler->lock, flags); + + handler->ops->remove(handler->root, node, NULL); } static struct mmu_rb_handler *find_mmu_handler(struct rb_root *root) From 5ed3b15b05449a2e2cc2e4d4698d420a37b092ea Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 Jul 2016 12:27:32 -0400 Subject: [PATCH 63/84] IB/hfi1: Remove unused sub-context parameter subctxt is not used, just remove it. Reviewed-by: Dean Luick Signed-off-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/user_exp_rcv.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index 1b640a35b3fe..f16eb25bfcd2 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -94,7 +94,7 @@ static int program_rcvarray(struct file *, unsigned long, struct tid_group *, struct tid_pageset *, unsigned, u16, struct page **, u32 *, unsigned *, unsigned *); static int unprogram_rcvarray(struct file *, u32, struct tid_group **); -static void clear_tid_node(struct hfi1_filedata *, u16, struct tid_rb_node *); +static void clear_tid_node(struct hfi1_filedata *, struct tid_rb_node *); static struct mmu_rb_ops tid_rb_ops = { .insert = mmu_rb_insert, @@ -911,12 +911,11 @@ static int unprogram_rcvarray(struct file *fp, u32 tidinfo, if (grp) *grp = node->grp; - clear_tid_node(fd, fd->subctxt, node); + clear_tid_node(fd, node); return 0; } -static void clear_tid_node(struct hfi1_filedata *fd, u16 subctxt, - struct tid_rb_node *node) +static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node) { struct hfi1_ctxtdata *uctxt = fd->uctxt; struct hfi1_devdata *dd = uctxt->dd; @@ -975,7 +974,7 @@ static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt, else 
hfi1_mmu_rb_remove(&fd->tid_rb_root, &node->mmu); - clear_tid_node(fd, -1, node); + clear_tid_node(fd, node); } } } From 639297b4f0ea3cf14290ec4e188ec6cd0cb50db9 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 Jul 2016 12:27:33 -0400 Subject: [PATCH 64/84] IB/hfi1: Use "false" not 0 For bool parameters "false" should be used Reviewed-by: Dean Luick Signed-off-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/user_sdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index 86c28851491c..54640c31b6bb 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -1224,7 +1224,7 @@ bail: static void unpin_vector_pages(struct mm_struct *mm, struct page **pages, unsigned start, unsigned npages) { - hfi1_release_user_pages(mm, pages + start, npages, 0); + hfi1_release_user_pages(mm, pages + start, npages, false); kfree(pages); } From ff4ce9bde9b2a88984c3ee359b952e35fe49c474 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 28 Jul 2016 12:27:34 -0400 Subject: [PATCH 65/84] IB/hfi1: Make iovec loop index easy to understand Reviewed-by: Ira Weiny Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/user_sdma.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index 54640c31b6bb..586f07807b27 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -520,7 +520,7 @@ static u8 dlid_to_selector(u16 dlid) int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, unsigned long dim, unsigned long *count) { - int ret = 0, i = 0; + int ret = 0, i; struct hfi1_filedata *fd = fp->private_data; struct hfi1_ctxtdata *uctxt = fd->uctxt; struct hfi1_user_sdma_pkt_q *pq = fd->pq; @@ -657,7 +657,7 @@ int 
hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, idx++; /* Save all the IO vector structures */ - while (i < req->data_iovs) { + for (i = 0; i < req->data_iovs; i++) { INIT_LIST_HEAD(&req->iovs[i].list); memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec)); ret = pin_vector_pages(req, &req->iovs[i]); @@ -665,7 +665,7 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, req->status = ret; goto free_req; } - req->data_len += req->iovs[i++].iov.iov_len; + req->data_len += req->iovs[i].iov.iov_len; } SDMA_DBG(req, "total data length %u", req->data_len); From ea3a0ee52db0c2ec8d1d0ecdd21e650e6e183085 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 Jul 2016 12:27:35 -0400 Subject: [PATCH 66/84] IB/hfi1: Restructure hfi1_file_open Rearrange the file open call in prep for new changes. Reviewed-by: Dean Luick Signed-off-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/file_ops.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index b80c8d2ac52b..0522bafb190b 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -168,6 +168,7 @@ static inline int is_valid_mmap(u64 token) static int hfi1_file_open(struct inode *inode, struct file *fp) { + struct hfi1_filedata *fd; struct hfi1_devdata *dd = container_of(inode->i_cdev, struct hfi1_devdata, user_cdev); @@ -176,10 +177,15 @@ static int hfi1_file_open(struct inode *inode, struct file *fp) kobject_get(&dd->kobj); /* The real work is performed later in assign_ctxt() */ - fp->private_data = kzalloc(sizeof(struct hfi1_filedata), GFP_KERNEL); - if (fp->private_data) /* no cpu affinity by default */ - ((struct hfi1_filedata *)fp->private_data)->rec_cpu_num = -1; - return fp->private_data ? 
0 : -ENOMEM; + + fd = kzalloc(sizeof(*fd), GFP_KERNEL); + + if (fd) /* no cpu affinity by default */ + fd->rec_cpu_num = -1; + + fp->private_data = fd; + + return fd ? 0 : -ENOMEM; } static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, From 20a42d08331c888207d81d8e4713e18250ac49cf Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 28 Jul 2016 12:27:36 -0400 Subject: [PATCH 67/84] IB/hfi1: Remove unneeded empty check in hfi1_mmu_rb_unregister() Checking if the rb tree is empty is redundant with the while loop which is emptying the rb tree. Reviewed-by: Ira Weiny Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/mmu_rb.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index b845adf9fc0e..1c7e25b90a2c 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -118,6 +118,8 @@ int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops) void hfi1_mmu_rb_unregister(struct rb_root *root) { struct mmu_rb_handler *handler = find_mmu_handler(root); + struct mmu_rb_node *rbnode; + struct rb_node *node; unsigned long flags; if (!handler) @@ -133,15 +135,10 @@ void hfi1_mmu_rb_unregister(struct rb_root *root) synchronize_rcu(); spin_lock_irqsave(&handler->lock, flags); - if (!RB_EMPTY_ROOT(root)) { - struct rb_node *node; - struct mmu_rb_node *rbnode; - - while ((node = rb_first(root))) { - rbnode = rb_entry(node, struct mmu_rb_node, node); - rb_erase(node, root); - handler->ops->remove(root, rbnode, NULL); - } + while ((node = rb_first(root))) { + rbnode = rb_entry(node, struct mmu_rb_node, node); + rb_erase(node, root); + handler->ops->remove(root, rbnode, NULL); } spin_unlock_irqrestore(&handler->lock, flags); From a7cd2dc5d494e92871e7b6734b72ef5451ff026d Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 28 Jul 2016 12:27:37 -0400 Subject: [PATCH 68/84] IB/hfi1: Rename TID 
mmu_rb_* functions Clarify the names of the TID mmu functions. Reviewed-by: Ira Weiny Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/user_exp_rcv.c | 24 +++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index f16eb25bfcd2..8283a6a2bb15 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -86,10 +86,10 @@ static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *, static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *); static int set_rcvarray_entry(struct file *, unsigned long, u32, struct tid_group *, struct page **, unsigned); -static int mmu_rb_insert(struct rb_root *, struct mmu_rb_node *); -static void mmu_rb_remove(struct rb_root *, struct mmu_rb_node *, +static int tid_rb_insert(struct rb_root *, struct mmu_rb_node *); +static void tid_rb_remove(struct rb_root *, struct mmu_rb_node *, struct mm_struct *); -static int mmu_rb_invalidate(struct rb_root *, struct mmu_rb_node *); +static int tid_rb_invalidate(struct rb_root *, struct mmu_rb_node *); static int program_rcvarray(struct file *, unsigned long, struct tid_group *, struct tid_pageset *, unsigned, u16, struct page **, u32 *, unsigned *, unsigned *); @@ -97,9 +97,9 @@ static int unprogram_rcvarray(struct file *, u32, struct tid_group **); static void clear_tid_node(struct hfi1_filedata *, struct tid_rb_node *); static struct mmu_rb_ops tid_rb_ops = { - .insert = mmu_rb_insert, - .remove = mmu_rb_remove, - .invalidate = mmu_rb_invalidate + .insert = tid_rb_insert, + .remove = tid_rb_remove, + .invalidate = tid_rb_invalidate }; static inline u32 rcventry2tidinfo(u32 rcventry) @@ -862,7 +862,7 @@ static int set_rcvarray_entry(struct file *fp, unsigned long vaddr, memcpy(node->pages, pages, sizeof(struct page *) * npages); if (HFI1_CAP_IS_USET(TID_UNMAP)) - ret = 
mmu_rb_insert(root, &node->mmu); + ret = tid_rb_insert(root, &node->mmu); else ret = hfi1_mmu_rb_insert(root, &node->mmu); @@ -905,7 +905,7 @@ static int unprogram_rcvarray(struct file *fp, u32 tidinfo, if (!node || node->rcventry != (uctxt->expected_base + rcventry)) return -EBADF; if (HFI1_CAP_IS_USET(TID_UNMAP)) - mmu_rb_remove(&fd->tid_rb_root, &node->mmu, NULL); + tid_rb_remove(&fd->tid_rb_root, &node->mmu, NULL); else hfi1_mmu_rb_remove(&fd->tid_rb_root, &node->mmu); @@ -969,7 +969,7 @@ static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt, if (!node || node->rcventry != rcventry) continue; if (HFI1_CAP_IS_USET(TID_UNMAP)) - mmu_rb_remove(&fd->tid_rb_root, + tid_rb_remove(&fd->tid_rb_root, &node->mmu, NULL); else hfi1_mmu_rb_remove(&fd->tid_rb_root, @@ -980,7 +980,7 @@ static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt, } } -static int mmu_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode) +static int tid_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode) { struct hfi1_filedata *fdata = container_of(root, struct hfi1_filedata, tid_rb_root); @@ -1024,7 +1024,7 @@ static int mmu_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode) return 0; } -static int mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *node) +static int tid_rb_insert(struct rb_root *root, struct mmu_rb_node *node) { struct hfi1_filedata *fdata = container_of(root, struct hfi1_filedata, tid_rb_root); @@ -1036,7 +1036,7 @@ static int mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *node) return 0; } -static void mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node, +static void tid_rb_remove(struct rb_root *root, struct mmu_rb_node *node, struct mm_struct *mm) { struct hfi1_filedata *fdata = From 53445bb32d244b1b32e8d88346a551130fb35544 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 Jul 2016 15:21:12 -0400 Subject: [PATCH 69/84] IB/hfi1: Prevent null pointer dereference If a context has not been assigned or assignment failed, 
pq may be NULL. Move the unregister within the protection of the null check. Reviewed-by: Dean Luick Signed-off-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/user_sdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index 586f07807b27..6b8d1e8b6286 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -472,8 +472,8 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd) hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit, uctxt->ctxt, fd->subctxt); pq = fd->pq; - hfi1_mmu_rb_unregister(&pq->sdma_rb_root); if (pq) { + hfi1_mmu_rb_unregister(&pq->sdma_rb_root); spin_lock_irqsave(&uctxt->sdma_qlock, flags); if (!list_empty(&pq->list)) list_del_init(&pq->list); From bdf7752e072f91fbeb1739da3938d4392ea8a51f Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 28 Jul 2016 15:21:13 -0400 Subject: [PATCH 70/84] IB/hfi1: Use the same capability state for all shared contexts Save the current capability state at user context creation time. Report this saved value for all shared contexts. Also get rid of unnecessary hfi1_get_base_kinfo function. 
Reviewed-by: Ira Weiny Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 8 -------- drivers/infiniband/hw/hfi1/chip.h | 2 -- drivers/infiniband/hw/hfi1/file_ops.c | 21 +++++++++++---------- drivers/infiniband/hw/hfi1/hfi.h | 2 +- 4 files changed, 12 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index c93683496614..b32638d58ae8 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -9648,14 +9648,6 @@ void hfi1_clear_tids(struct hfi1_ctxtdata *rcd) hfi1_put_tid(dd, i, PT_INVALID, 0, 0); } -int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd, - struct hfi1_ctxt_info *kinfo) -{ - kinfo->runtime_flags = (HFI1_MISC_GET() << HFI1_CAP_USER_SHIFT) | - HFI1_CAP_UGET(MASK) | HFI1_CAP_KGET(K2U); - return 0; -} - struct hfi1_message_header *hfi1_get_msgheader( struct hfi1_devdata *dd, __le32 *rhf_addr) { diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index f07bc4ccc468..ed11107c50fe 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -1337,8 +1337,6 @@ void hfi1_start_cleanup(struct hfi1_devdata *dd); void hfi1_clear_tids(struct hfi1_ctxtdata *rcd); struct hfi1_message_header *hfi1_get_msgheader( struct hfi1_devdata *dd, __le32 *rhf_addr); -int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd, - struct hfi1_ctxt_info *kinfo); int hfi1_init_ctxt(struct send_context *sc); void hfi1_put_tid(struct hfi1_devdata *dd, u32 index, u32 type, unsigned long pa, u16 order); diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 0522bafb190b..1f4cd5aa2071 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -981,7 +981,7 @@ static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd, return ret; } uctxt->userversion = uinfo->userversion; - uctxt->flags = HFI1_CAP_UGET(MASK); + 
uctxt->flags = hfi1_cap_mask; /* save current flag state */ init_waitqueue_head(&uctxt->wait); strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm)); memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid)); @@ -1084,18 +1084,18 @@ static int user_init(struct file *fp) hfi1_set_ctxt_jkey(uctxt->dd, uctxt->ctxt, uctxt->jkey); rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB; - if (HFI1_CAP_KGET_MASK(uctxt->flags, HDRSUPP)) + if (HFI1_CAP_UGET_MASK(uctxt->flags, HDRSUPP)) rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB; /* * Ignore the bit in the flags for now until proper * support for multiple packet per rcv array entry is * added. */ - if (!HFI1_CAP_KGET_MASK(uctxt->flags, MULTI_PKT_EGR)) + if (!HFI1_CAP_UGET_MASK(uctxt->flags, MULTI_PKT_EGR)) rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB; - if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_EGR_FULL)) + if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_EGR_FULL)) rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; - if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL)) + if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_RHQ_FULL)) rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; /* * The RcvCtxtCtrl.TailUpd bit has to be explicitly written. @@ -1103,7 +1103,7 @@ static int user_init(struct file *fp) * uses of the chip or ctxt. Therefore, add the rcvctrl op * for both cases. 
*/ - if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL)) + if (HFI1_CAP_UGET_MASK(uctxt->flags, DMA_RTAIL)) rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB; else rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_DIS; @@ -1126,9 +1126,10 @@ static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len) int ret = 0; memset(&cinfo, 0, sizeof(cinfo)); - ret = hfi1_get_base_kinfo(uctxt, &cinfo); - if (ret < 0) - goto done; + cinfo.runtime_flags = (((uctxt->flags >> HFI1_CAP_MISC_SHIFT) & + HFI1_CAP_MISC_MASK) << HFI1_CAP_USER_SHIFT) | + HFI1_CAP_UGET_MASK(uctxt->flags, MASK) | + HFI1_CAP_KGET_MASK(uctxt->flags, K2U); cinfo.num_active = hfi1_count_active_units(); cinfo.unit = uctxt->dd->unit; cinfo.ctxt = uctxt->ctxt; @@ -1150,7 +1151,7 @@ static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len) trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, cinfo); if (copy_to_user(ubase, &cinfo, sizeof(cinfo))) ret = -EFAULT; -done: + return ret; } diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 6fb86fee0701..36e6b8e0c735 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -255,7 +255,7 @@ struct hfi1_ctxtdata { /* chip offset of PIO buffers for this ctxt */ u32 piobufs; /* per-context configuration flags */ - u32 flags; + unsigned long flags; /* per-context event flags for fileops/intr communication */ unsigned long event_flags; /* WAIT_RCV that timed out, no interrupt */ From 4fa0d22c9a08f32df4d2f7683b1efe21d1a905ac Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 28 Jul 2016 15:21:14 -0400 Subject: [PATCH 71/84] IB/hfi1: Validate SDMA user request index Reviewed-by: Ira Weiny Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/user_sdma.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index 6b8d1e8b6286..0a0281ae35f1 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ 
b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -552,6 +552,14 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt, (u16 *)&info); + + if (info.comp_idx >= hfi1_sdma_comp_ring_size) { + hfi1_cdbg(SDMA, + "[%u:%u:%u:%u] Invalid comp index", + dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); + return -EINVAL; + } + if (cq->comps[info.comp_idx].status == QUEUED || test_bit(SDMA_REQ_IN_USE, &pq->reqs[info.comp_idx].flags)) { hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in QUEUED state", From 9ff73c8715dbd02d18b4d99c815f388413314229 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 28 Jul 2016 15:21:15 -0400 Subject: [PATCH 72/84] IB/hfi1: Validate SDMA user iovector count Reviewed-by: Ira Weiny Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/user_sdma.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index 0a0281ae35f1..42cc371cdf95 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -560,6 +560,18 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, return -EINVAL; } + /* + * Sanity check the header io vector count. Need at least 1 vector + * (header) and cannot be larger than the actual io vector count. 
+ */ + if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) { + hfi1_cdbg(SDMA, + "[%u:%u:%u:%u] Invalid iov count %d, dim %ld", + dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx, + req_iovcnt(info.ctrl), dim); + return -EINVAL; + } + if (cq->comps[info.comp_idx].status == QUEUED || test_bit(SDMA_REQ_IN_USE, &pq->reqs[info.comp_idx].flags)) { hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in QUEUED state", @@ -583,7 +595,7 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, memset(req, 0, sizeof(*req)); /* Mark the request as IN_USE before we start filling it in. */ set_bit(SDMA_REQ_IN_USE, &req->flags); - req->data_iovs = req_iovcnt(info.ctrl) - 1; + req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */ req->pq = pq; req->cq = cq; req->status = -1; @@ -591,8 +603,16 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, memcpy(&req->info, &info, sizeof(info)); - if (req_opcode(info.ctrl) == EXPECTED) + if (req_opcode(info.ctrl) == EXPECTED) { + /* expected must have a TID info and at least one data vector */ + if (req->data_iovs < 2) { + SDMA_DBG(req, + "Not enough vectors for expected request"); + ret = -EINVAL; + goto free_req; + } req->data_iovs--; + } if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) { SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs, From a383f8ec552c9af5066eb488cc7a2d8b3994151d Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 28 Jul 2016 15:21:16 -0400 Subject: [PATCH 73/84] IB/hfi1: Release node on insert failure If unable to insert node into the RB tree cache, node will be freed before returning from the function. Null out iovec's pointer to node so iovec does not try to free it later. 
Reviewed-by: Ira Weiny Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/user_sdma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index 42cc371cdf95..ff03e1dad5b9 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -1239,6 +1239,7 @@ retry: list_del(&node->list); pq->n_locked -= node->npages; spin_unlock(&pq->evict_lock); + iovec->node = NULL; goto bail; } return 0; From 9da7e9a711e6fbecd111b9873682480c77c64192 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 28 Jul 2016 15:21:17 -0400 Subject: [PATCH 74/84] IB/hfi1: Fix error condition that needs to clean up If input validation fails, properly free the request before returning. Reviewed-by: Ira Weiny Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/user_sdma.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index ff03e1dad5b9..5c1322428065 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -617,7 +617,8 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) { SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs, MAX_VECTORS_PER_REQ); - return -EINVAL; + ret = -EINVAL; + goto free_req; } /* Copy the header from the user buffer */ ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info), From 7b3256e331270c7de43ccf3879b7c289cdc3ff28 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 28 Jul 2016 15:21:18 -0400 Subject: [PATCH 75/84] IB/hfi1: Fix user SDMA racy user request claim The user SDMA in-use claim bit is in the structure that gets zeroed out once the claim is made. Move the request in-use flag into its own bit array and use that for atomic claims. 
This cleans up the claim code and removes any race possibility. Reviewed-by: Ira Weiny Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/user_sdma.c | 32 +++++++++++++++----------- drivers/infiniband/hw/hfi1/user_sdma.h | 1 + 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index 5c1322428065..e88d555389f4 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -145,7 +145,7 @@ MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 12 /* Last packet in the request */ #define TXREQ_FLAGS_REQ_LAST_PKT BIT(0) -#define SDMA_REQ_IN_USE 0 +/* SDMA request flag bits */ #define SDMA_REQ_FOR_THREAD 1 #define SDMA_REQ_SEND_DONE 2 #define SDMA_REQ_HAVE_AHG 3 @@ -397,6 +397,11 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp) if (!pq->reqs) goto pq_reqs_nomem; + memsize = BITS_TO_LONGS(hfi1_sdma_comp_ring_size) * sizeof(long); + pq->req_in_use = kzalloc(memsize, GFP_KERNEL); + if (!pq->req_in_use) + goto pq_reqs_no_in_use; + INIT_LIST_HEAD(&pq->list); pq->dd = dd; pq->ctxt = uctxt->ctxt; @@ -453,6 +458,8 @@ cq_comps_nomem: cq_nomem: kmem_cache_destroy(pq->txreq_cache); pq_txreq_nomem: + kfree(pq->req_in_use); +pq_reqs_no_in_use: kfree(pq->reqs); pq_reqs_nomem: kfree(pq); @@ -484,6 +491,7 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd) pq->wait, (ACCESS_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE)); kfree(pq->reqs); + kfree(pq->req_in_use); kmem_cache_destroy(pq->txreq_cache); kfree(pq); fd->pq = NULL; @@ -572,29 +580,27 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, return -EINVAL; } - if (cq->comps[info.comp_idx].status == QUEUED || - test_bit(SDMA_REQ_IN_USE, &pq->reqs[info.comp_idx].flags)) { - hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in QUEUED state", - dd->unit, uctxt->ctxt, fd->subctxt, - info.comp_idx); - return 
-EBADSLT; - } if (!info.fragsize) { hfi1_cdbg(SDMA, "[%u:%u:%u:%u] Request does not specify fragsize", dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); return -EINVAL; } + + /* Try to claim the request. */ + if (test_and_set_bit(info.comp_idx, pq->req_in_use)) { + hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use", + dd->unit, uctxt->ctxt, fd->subctxt, + info.comp_idx); + return -EBADSLT; + } /* - * We've done all the safety checks that we can up to this point, - * "allocate" the request entry. + * All safety checks have been done and this request has been claimed. */ hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); req = pq->reqs + info.comp_idx; memset(req, 0, sizeof(*req)); - /* Mark the request as IN_USE before we start filling it in. */ - set_bit(SDMA_REQ_IN_USE, &req->flags); req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */ req->pq = pq; req->cq = cq; @@ -1612,7 +1618,7 @@ static void user_sdma_free_request(struct user_sdma_request *req, bool unpin) } } kfree(req->tids); - clear_bit(SDMA_REQ_IN_USE, &req->flags); + clear_bit(req->info.comp_idx, req->pq->req_in_use); } static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq, diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h index b9240e351161..20ff846f318b 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.h +++ b/drivers/infiniband/hw/hfi1/user_sdma.h @@ -63,6 +63,7 @@ struct hfi1_user_sdma_pkt_q { struct hfi1_devdata *dd; struct kmem_cache *txreq_cache; struct user_sdma_request *reqs; + unsigned long *req_in_use; struct iowait busy; unsigned state; wait_queue_head_t wait; From 3faa3d9a308e539cc48355b1f419a5ed9f8274a2 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 Jul 2016 15:21:19 -0400 Subject: [PATCH 76/84] IB/hfi1: Make use of mm consistent The hfi1 driver registers a mmu_notifier callback when /dev/hfi1_* is opened, and unregisters it when the device is closed. 
The driver incorrectly assumes that the close will always happen from the same context as the open. In particular, closes due to SIGKILL or OOM killer activity may happen from a different context. In these cases, the wrong mm is passed to mmu_notifier_unregister(), which causes improper reference counting for the victim mm, and eventual memory corruption. Preserve the mm for all open file descriptors and use this mm rather than current->mm for memory operations for the lifetime of that fd. Note: this patch leaves 1 use of current->mm in place. This use is removed in a follow on patch because other functional changes were required prior to that use being removed. If registration fails, there is no reason to keep the handler object around. Free the handler object rather than add it to the list to prevent any mmu_notifier operations, including unregister, when registration fails. Suggested-by: Jim Foraker Reviewed-by: Dean Luick Signed-off-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/file_ops.c | 6 ++++-- drivers/infiniband/hw/hfi1/hfi.h | 8 +++++--- drivers/infiniband/hw/hfi1/mmu_rb.c | 18 ++++++++++++++---- drivers/infiniband/hw/hfi1/mmu_rb.h | 3 ++- drivers/infiniband/hw/hfi1/user_exp_rcv.c | 15 ++++++++------- drivers/infiniband/hw/hfi1/user_pages.c | 19 ++++++++++--------- drivers/infiniband/hw/hfi1/user_sdma.c | 11 ++++++----- drivers/infiniband/hw/hfi1/user_sdma.h | 1 + 8 files changed, 50 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 1f4cd5aa2071..302f0cdd8119 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -180,8 +180,10 @@ static int hfi1_file_open(struct inode *inode, struct file *fp) fd = kzalloc(sizeof(*fd), GFP_KERNEL); - if (fd) /* no cpu affinity by default */ - fd->rec_cpu_num = -1; + if (fd) { + fd->rec_cpu_num = -1; /* no cpu affinity by default */ + fd->mm = current->mm; + } fp->private_data 
= fd; diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 36e6b8e0c735..67f37c9ea960 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1205,6 +1205,7 @@ struct hfi1_filedata { u32 invalid_tid_idx; /* protect invalid_tids array and invalid_tid_idx */ spinlock_t invalid_lock; + struct mm_struct *mm; }; extern struct list_head hfi1_dev_list; @@ -1700,9 +1701,10 @@ void shutdown_led_override(struct hfi1_pportdata *ppd); */ #define DEFAULT_RCVHDR_ENTSIZE 32 -bool hfi1_can_pin_pages(struct hfi1_devdata *dd, u32 nlocked, u32 npages); -int hfi1_acquire_user_pages(unsigned long vaddr, size_t npages, bool writable, - struct page **pages); +bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm, + u32 nlocked, u32 npages); +int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, + size_t npages, bool writable, struct page **pages); void hfi1_release_user_pages(struct mm_struct *mm, struct page **p, size_t npages, bool dirty); diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index 1c7e25b90a2c..e5c5ef4cf06c 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -58,6 +58,7 @@ struct mmu_rb_handler { struct rb_root *root; spinlock_t lock; /* protect the RB tree */ struct mmu_rb_ops *ops; + struct mm_struct *mm; }; static LIST_HEAD(mmu_rb_handlers); @@ -95,9 +96,11 @@ static unsigned long mmu_node_last(struct mmu_rb_node *node) return PAGE_ALIGN(node->addr + node->len) - 1; } -int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops) +int hfi1_mmu_rb_register(struct mm_struct *mm, struct rb_root *root, + struct mmu_rb_ops *ops) { struct mmu_rb_handler *handlr; + int ret; handlr = kmalloc(sizeof(*handlr), GFP_KERNEL); if (!handlr) @@ -108,11 +111,19 @@ int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops) INIT_HLIST_NODE(&handlr->mn.hlist); spin_lock_init(&handlr->lock); 
handlr->mn.ops = &mn_opts; + handlr->mm = mm; + + ret = mmu_notifier_register(&handlr->mn, handlr->mm); + if (ret) { + kfree(handlr); + return ret; + } + spin_lock(&mmu_rb_lock); list_add_tail_rcu(&handlr->list, &mmu_rb_handlers); spin_unlock(&mmu_rb_lock); - return mmu_notifier_register(&handlr->mn, current->mm); + return ret; } void hfi1_mmu_rb_unregister(struct rb_root *root) @@ -126,8 +137,7 @@ void hfi1_mmu_rb_unregister(struct rb_root *root) return; /* Unregister first so we don't get any more notifications. */ - if (current->mm) - mmu_notifier_unregister(&handler->mn, current->mm); + mmu_notifier_unregister(&handler->mn, handler->mm); spin_lock(&mmu_rb_lock); list_del_rcu(&handler->list); diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h index 45e7245d813b..489a691856e5 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.h +++ b/drivers/infiniband/hw/hfi1/mmu_rb.h @@ -65,7 +65,8 @@ struct mmu_rb_ops { int (*invalidate)(struct rb_root *root, struct mmu_rb_node *node); }; -int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops); +int hfi1_mmu_rb_register(struct mm_struct *mm, struct rb_root *root, + struct mmu_rb_ops *ops); void hfi1_mmu_rb_unregister(struct rb_root *); int hfi1_mmu_rb_insert(struct rb_root *, struct mmu_rb_node *); void hfi1_mmu_rb_remove(struct rb_root *, struct mmu_rb_node *); diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index 8283a6a2bb15..a2f7e719dc4d 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -211,7 +211,8 @@ int hfi1_user_exp_rcv_init(struct file *fp) * fails, continue but turn off the TID caching for * all user contexts. 
*/ - ret = hfi1_mmu_rb_register(&fd->tid_rb_root, &tid_rb_ops); + ret = hfi1_mmu_rb_register(fd->mm, &fd->tid_rb_root, + &tid_rb_ops); if (ret) { dd_dev_info(dd, "Failed MMU notifier registration %d\n", @@ -399,12 +400,12 @@ int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo) * pages, accept the amount pinned so far and program only that. * User space knows how to deal with partially programmed buffers. */ - if (!hfi1_can_pin_pages(dd, fd->tid_n_pinned, npages)) { + if (!hfi1_can_pin_pages(dd, fd->mm, fd->tid_n_pinned, npages)) { ret = -ENOMEM; goto bail; } - pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages); + pinned = hfi1_acquire_user_pages(fd->mm, vaddr, npages, true, pages); if (pinned <= 0) { ret = pinned; goto bail; @@ -559,7 +560,7 @@ nomem: * for example), unpin all unmapped pages so we can pin them nex time. */ if (mapped_pages != pinned) { - hfi1_release_user_pages(current->mm, &pages[mapped_pages], + hfi1_release_user_pages(fd->mm, &pages[mapped_pages], pinned - mapped_pages, false); fd->tid_n_pinned -= pinned - mapped_pages; @@ -905,7 +906,7 @@ static int unprogram_rcvarray(struct file *fp, u32 tidinfo, if (!node || node->rcventry != (uctxt->expected_base + rcventry)) return -EBADF; if (HFI1_CAP_IS_USET(TID_UNMAP)) - tid_rb_remove(&fd->tid_rb_root, &node->mmu, NULL); + tid_rb_remove(&fd->tid_rb_root, &node->mmu, fd->mm); else hfi1_mmu_rb_remove(&fd->tid_rb_root, &node->mmu); @@ -933,7 +934,7 @@ static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node) pci_unmap_single(dd->pcidev, node->dma_addr, node->mmu.len, PCI_DMA_FROMDEVICE); - hfi1_release_user_pages(current->mm, node->pages, node->npages, true); + hfi1_release_user_pages(fd->mm, node->pages, node->npages, true); fd->tid_n_pinned -= node->npages; node->grp->used--; @@ -970,7 +971,7 @@ static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt, continue; if (HFI1_CAP_IS_USET(TID_UNMAP)) tid_rb_remove(&fd->tid_rb_root, - &node->mmu, NULL); + 
&node->mmu, fd->mm); else hfi1_mmu_rb_remove(&fd->tid_rb_root, &node->mmu); diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c index 88e10b5f55f1..20f4ddcac3b0 100644 --- a/drivers/infiniband/hw/hfi1/user_pages.c +++ b/drivers/infiniband/hw/hfi1/user_pages.c @@ -68,7 +68,8 @@ MODULE_PARM_DESC(cache_size, "Send and receive side cache size limit (in MB)"); * could keeping caching buffers. * */ -bool hfi1_can_pin_pages(struct hfi1_devdata *dd, u32 nlocked, u32 npages) +bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm, + u32 nlocked, u32 npages) { unsigned long ulimit = rlimit(RLIMIT_MEMLOCK), pinned, cache_limit, size = (cache_size * (1UL << 20)); /* convert to bytes */ @@ -89,9 +90,9 @@ bool hfi1_can_pin_pages(struct hfi1_devdata *dd, u32 nlocked, u32 npages) /* Convert to number of pages */ size = DIV_ROUND_UP(size, PAGE_SIZE); - down_read(&current->mm->mmap_sem); - pinned = current->mm->pinned_vm; - up_read(&current->mm->mmap_sem); + down_read(&mm->mmap_sem); + pinned = mm->pinned_vm; + up_read(&mm->mmap_sem); /* First, check the absolute limit against all pinned pages.
*/ if (pinned + npages >= ulimit && !can_lock) @@ -100,8 +101,8 @@ bool hfi1_can_pin_pages(struct hfi1_devdata *dd, u32 nlocked, u32 npages) return ((nlocked + npages) <= size) || can_lock; } -int hfi1_acquire_user_pages(unsigned long vaddr, size_t npages, bool writable, - struct page **pages) +int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t npages, + bool writable, struct page **pages) { int ret; @@ -109,9 +110,9 @@ int hfi1_acquire_user_pages(unsigned long vaddr, size_t npages, bool writable, if (ret < 0) return ret; - down_write(&current->mm->mmap_sem); - current->mm->pinned_vm += ret; - up_write(&current->mm->mmap_sem); + down_write(&mm->mmap_sem); + mm->pinned_vm += ret; + up_write(&mm->mmap_sem); return ret; } diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index e88d555389f4..640c244b665b 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -413,6 +413,7 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp) pq->sdma_rb_root = RB_ROOT; INIT_LIST_HEAD(&pq->evict); spin_lock_init(&pq->evict_lock); + pq->mm = fd->mm; iowait_init(&pq->busy, 0, NULL, defer_packet_queue, activate_packet_queue, NULL); @@ -442,7 +443,7 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp) cq->nentries = hfi1_sdma_comp_ring_size; fd->cq = cq; - ret = hfi1_mmu_rb_register(&pq->sdma_rb_root, &sdma_rb_ops); + ret = hfi1_mmu_rb_register(pq->mm, &pq->sdma_rb_root, &sdma_rb_ops); if (ret) { dd_dev_err(dd, "Failed to register with MMU %d", ret); goto done; @@ -1205,12 +1206,12 @@ static int pin_vector_pages(struct user_sdma_request *req, spin_unlock(&pq->evict_lock); } retry: - if (!hfi1_can_pin_pages(pq->dd, pq->n_locked, npages)) { + if (!hfi1_can_pin_pages(pq->dd, pq->mm, pq->n_locked, npages)) { cleared = sdma_cache_evict(pq, npages); if (cleared >= npages) goto retry; } - pinned = 
hfi1_acquire_user_pages(pq->mm, ((unsigned long)iovec->iov.iov_base + (node->npages * PAGE_SIZE)), npages, 0, pages + node->npages); @@ -1220,7 +1221,7 @@ retry: goto bail; } if (pinned != npages) { - unpin_vector_pages(current->mm, pages, node->npages, + unpin_vector_pages(pq->mm, pages, node->npages, pinned); ret = -EFAULT; goto bail; @@ -1252,7 +1253,7 @@ retry: return 0; bail: if (rb_node) - unpin_vector_pages(current->mm, node->pages, 0, node->npages); + unpin_vector_pages(pq->mm, node->pages, 0, node->npages); kfree(node); return ret; } diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h index 20ff846f318b..ff49f74f43f4 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.h +++ b/drivers/infiniband/hw/hfi1/user_sdma.h @@ -72,6 +72,7 @@ struct hfi1_user_sdma_pkt_q { u32 n_locked; struct list_head evict; spinlock_t evict_lock; /* protect evict and n_locked */ + struct mm_struct *mm; }; struct hfi1_user_sdma_comp_q { From e0b09ac55d51bb9bf6a4a320bf4029e40bdabd6c Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 28 Jul 2016 15:21:20 -0400 Subject: [PATCH 77/84] IB/hfi1: Make the cache handler own its rb tree root The objects which use cache handling should reference their own handler object not the internal data structure it uses to track the nodes. Have the "users" of the mmu notifier code pass opaque objects which can then be properly used in the mmu callbacks depending on the owners needs. This patch has the additional benefit that operations no longer require a look up in a list to find the handlers. 
Reviewed-by: Ira Weiny Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/hfi.h | 3 +- drivers/infiniband/hw/hfi1/mmu_rb.c | 98 +++++++---------------- drivers/infiniband/hw/hfi1/mmu_rb.h | 23 +++--- drivers/infiniband/hw/hfi1/user_exp_rcv.c | 54 +++++-------- drivers/infiniband/hw/hfi1/user_sdma.c | 26 +++--- drivers/infiniband/hw/hfi1/user_sdma.h | 2 +- 6 files changed, 81 insertions(+), 125 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 67f37c9ea960..ba9083602cbd 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1186,6 +1186,7 @@ struct hfi1_devdata { struct tid_rb_node; struct mmu_rb_node; +struct mmu_rb_handler; /* Private data for file operations */ struct hfi1_filedata { @@ -1196,7 +1197,7 @@ struct hfi1_filedata { /* for cpu affinity; -1 if none */ int rec_cpu_num; u32 tid_n_pinned; - struct rb_root tid_rb_root; + struct mmu_rb_handler *handler; struct tid_rb_node **entry_to_rb; spinlock_t tid_lock; /* protect tid_[limit,used] counters */ u32 tid_limit; diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index e5c5ef4cf06c..9fbcfed4d34c 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -53,20 +53,16 @@ #include "trace.h" struct mmu_rb_handler { - struct list_head list; struct mmu_notifier mn; - struct rb_root *root; + struct rb_root root; + void *ops_arg; spinlock_t lock; /* protect the RB tree */ struct mmu_rb_ops *ops; struct mm_struct *mm; }; -static LIST_HEAD(mmu_rb_handlers); -static DEFINE_SPINLOCK(mmu_rb_lock); /* protect mmu_rb_handlers list */ - static unsigned long mmu_node_start(struct mmu_rb_node *); static unsigned long mmu_node_last(struct mmu_rb_node *); -static struct mmu_rb_handler *find_mmu_handler(struct rb_root *); static inline void mmu_notifier_page(struct mmu_notifier *, struct mm_struct *, unsigned long); static inline void 
mmu_notifier_range_start(struct mmu_notifier *, @@ -96,8 +92,9 @@ static unsigned long mmu_node_last(struct mmu_rb_node *node) return PAGE_ALIGN(node->addr + node->len) - 1; } -int hfi1_mmu_rb_register(struct mm_struct *mm, struct rb_root *root, - struct mmu_rb_ops *ops) +int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm, + struct mmu_rb_ops *ops, + struct mmu_rb_handler **handler) { struct mmu_rb_handler *handlr; int ret; @@ -106,8 +103,9 @@ int hfi1_mmu_rb_register(struct mm_struct *mm, struct rb_root *root, if (!handlr) return -ENOMEM; - handlr->root = root; + handlr->root = RB_ROOT; handlr->ops = ops; + handlr->ops_arg = ops_arg; INIT_HLIST_NODE(&handlr->mn.hlist); spin_lock_init(&handlr->lock); handlr->mn.ops = &mn_opts; @@ -119,52 +117,38 @@ int hfi1_mmu_rb_register(struct mm_struct *mm, struct rb_root *root, return ret; } - spin_lock(&mmu_rb_lock); - list_add_tail_rcu(&handlr->list, &mmu_rb_handlers); - spin_unlock(&mmu_rb_lock); - - return ret; + *handler = handlr; + return 0; } -void hfi1_mmu_rb_unregister(struct rb_root *root) +void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler) { - struct mmu_rb_handler *handler = find_mmu_handler(root); struct mmu_rb_node *rbnode; struct rb_node *node; unsigned long flags; - if (!handler) - return; - /* Unregister first so we don't get any more notifications. 
*/ mmu_notifier_unregister(&handler->mn, handler->mm); - spin_lock(&mmu_rb_lock); - list_del_rcu(&handler->list); - spin_unlock(&mmu_rb_lock); - synchronize_rcu(); - spin_lock_irqsave(&handler->lock, flags); - while ((node = rb_first(root))) { + while ((node = rb_first(&handler->root))) { rbnode = rb_entry(node, struct mmu_rb_node, node); - rb_erase(node, root); - handler->ops->remove(root, rbnode, NULL); + rb_erase(node, &handler->root); + handler->ops->remove(handler->ops_arg, rbnode, + NULL); } spin_unlock_irqrestore(&handler->lock, flags); kfree(handler); } -int hfi1_mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode) +int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler, + struct mmu_rb_node *mnode) { - struct mmu_rb_handler *handler = find_mmu_handler(root); struct mmu_rb_node *node; unsigned long flags; int ret = 0; - if (!handler) - return -EINVAL; - spin_lock_irqsave(&handler->lock, flags); hfi1_cdbg(MMU, "Inserting node addr 0x%llx, len %u", mnode->addr, mnode->len); @@ -173,11 +157,11 @@ int hfi1_mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode) ret = -EINVAL; goto unlock; } - __mmu_int_rb_insert(mnode, root); + __mmu_int_rb_insert(mnode, &handler->root); - ret = handler->ops->insert(root, mnode); + ret = handler->ops->insert(handler->ops_arg, mnode); if (ret) - __mmu_int_rb_remove(mnode, root); + __mmu_int_rb_remove(mnode, &handler->root); unlock: spin_unlock_irqrestore(&handler->lock, flags); return ret; @@ -192,10 +176,10 @@ static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler, hfi1_cdbg(MMU, "Searching for addr 0x%llx, len %u", addr, len); if (!handler->ops->filter) { - node = __mmu_int_rb_iter_first(handler->root, addr, + node = __mmu_int_rb_iter_first(&handler->root, addr, (addr + len) - 1); } else { - for (node = __mmu_int_rb_iter_first(handler->root, addr, + for (node = __mmu_int_rb_iter_first(&handler->root, addr, (addr + len) - 1); node; node = __mmu_int_rb_iter_next(node, addr, @@ -207,56 +191,34 
@@ static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler, return node; } -struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *root, +struct mmu_rb_node *hfi1_mmu_rb_extract(struct mmu_rb_handler *handler, unsigned long addr, unsigned long len) { - struct mmu_rb_handler *handler = find_mmu_handler(root); struct mmu_rb_node *node; unsigned long flags; - if (!handler) - return ERR_PTR(-EINVAL); - spin_lock_irqsave(&handler->lock, flags); node = __mmu_rb_search(handler, addr, len); if (node) - __mmu_int_rb_remove(node, handler->root); + __mmu_int_rb_remove(node, &handler->root); spin_unlock_irqrestore(&handler->lock, flags); return node; } -void hfi1_mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node) +void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler, + struct mmu_rb_node *node) { unsigned long flags; - struct mmu_rb_handler *handler = find_mmu_handler(root); - - if (!handler || !node) - return; /* Validity of handler and node pointers has been checked by caller. 
*/ hfi1_cdbg(MMU, "Removing node addr 0x%llx, len %u", node->addr, node->len); spin_lock_irqsave(&handler->lock, flags); - __mmu_int_rb_remove(node, handler->root); + __mmu_int_rb_remove(node, &handler->root); spin_unlock_irqrestore(&handler->lock, flags); - handler->ops->remove(handler->root, node, NULL); -} - -static struct mmu_rb_handler *find_mmu_handler(struct rb_root *root) -{ - struct mmu_rb_handler *handler; - - rcu_read_lock(); - list_for_each_entry_rcu(handler, &mmu_rb_handlers, list) { - if (handler->root == root) - goto unlock; - } - handler = NULL; -unlock: - rcu_read_unlock(); - return handler; + handler->ops->remove(handler->ops_arg, node, NULL); } static inline void mmu_notifier_page(struct mmu_notifier *mn, @@ -279,7 +241,7 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn, { struct mmu_rb_handler *handler = container_of(mn, struct mmu_rb_handler, mn); - struct rb_root *root = handler->root; + struct rb_root *root = &handler->root; struct mmu_rb_node *node, *ptr = NULL; unsigned long flags; @@ -290,9 +252,9 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn, ptr = __mmu_int_rb_iter_next(node, start, end - 1); hfi1_cdbg(MMU, "Invalidating node addr 0x%llx, len %u", node->addr, node->len); - if (handler->ops->invalidate(root, node)) { + if (handler->ops->invalidate(handler->ops_arg, node)) { __mmu_int_rb_remove(node, root); - handler->ops->remove(root, node, mm); + handler->ops->remove(handler->ops_arg, node, mm); } } spin_unlock_irqrestore(&handler->lock, flags); diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h index 489a691856e5..2cedfbe2189e 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.h +++ b/drivers/infiniband/hw/hfi1/mmu_rb.h @@ -59,18 +59,21 @@ struct mmu_rb_node { struct mmu_rb_ops { bool (*filter)(struct mmu_rb_node *node, unsigned long addr, unsigned long len); - int (*insert)(struct rb_root *root, struct mmu_rb_node *mnode); - void (*remove)(struct rb_root *root, 
struct mmu_rb_node *mnode, + int (*insert)(void *ops_arg, struct mmu_rb_node *mnode); + void (*remove)(void *ops_arg, struct mmu_rb_node *mnode, struct mm_struct *mm); - int (*invalidate)(struct rb_root *root, struct mmu_rb_node *node); + int (*invalidate)(void *ops_arg, struct mmu_rb_node *node); }; -int hfi1_mmu_rb_register(struct mm_struct *mm, struct rb_root *root, - struct mmu_rb_ops *ops); -void hfi1_mmu_rb_unregister(struct rb_root *); -int hfi1_mmu_rb_insert(struct rb_root *, struct mmu_rb_node *); -void hfi1_mmu_rb_remove(struct rb_root *, struct mmu_rb_node *); -struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *, unsigned long, - unsigned long); +int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm, + struct mmu_rb_ops *ops, + struct mmu_rb_handler **handler); +void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler); +int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler, + struct mmu_rb_node *mnode); +void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler, + struct mmu_rb_node *mnode); +struct mmu_rb_node *hfi1_mmu_rb_extract(struct mmu_rb_handler *handler, + unsigned long addr, unsigned long len); #endif /* _HFI1_MMU_RB_H */ diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index a2f7e719dc4d..269a948189e0 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -82,14 +82,14 @@ struct tid_pageset { ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT)) static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *, - struct rb_root *); + struct hfi1_filedata *); static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *); static int set_rcvarray_entry(struct file *, unsigned long, u32, struct tid_group *, struct page **, unsigned); -static int tid_rb_insert(struct rb_root *, struct mmu_rb_node *); -static void tid_rb_remove(struct rb_root *, struct mmu_rb_node *, +static int tid_rb_insert(void *, struct mmu_rb_node 
*); +static void tid_rb_remove(void *, struct mmu_rb_node *, struct mm_struct *); -static int tid_rb_invalidate(struct rb_root *, struct mmu_rb_node *); +static int tid_rb_invalidate(void *, struct mmu_rb_node *); static int program_rcvarray(struct file *, unsigned long, struct tid_group *, struct tid_pageset *, unsigned, u16, struct page **, u32 *, unsigned *, unsigned *); @@ -162,7 +162,6 @@ int hfi1_user_exp_rcv_init(struct file *fp) spin_lock_init(&fd->tid_lock); spin_lock_init(&fd->invalid_lock); - fd->tid_rb_root = RB_ROOT; if (!uctxt->subctxt_cnt || !fd->subctxt) { exp_tid_group_init(&uctxt->tid_group_list); @@ -211,8 +210,7 @@ int hfi1_user_exp_rcv_init(struct file *fp) * fails, continue but turn off the TID caching for * all user contexts. */ - ret = hfi1_mmu_rb_register(fd->mm, &fd->tid_rb_root, - &tid_rb_ops); + ret = hfi1_mmu_rb_register(fd, fd->mm, &tid_rb_ops, &fd->handler); if (ret) { dd_dev_info(dd, "Failed MMU notifier registration %d\n", @@ -263,17 +261,15 @@ int hfi1_user_exp_rcv_free(struct hfi1_filedata *fd) * was freed. 
*/ if (!HFI1_CAP_IS_USET(TID_UNMAP)) - hfi1_mmu_rb_unregister(&fd->tid_rb_root); + hfi1_mmu_rb_unregister(fd->handler); kfree(fd->invalid_tids); if (!uctxt->cnt) { if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list)) - unlock_exp_tids(uctxt, &uctxt->tid_full_list, - &fd->tid_rb_root); + unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd); if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list)) - unlock_exp_tids(uctxt, &uctxt->tid_used_list, - &fd->tid_rb_root); + unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd); list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list, list) { list_del_init(&grp->list); @@ -830,7 +826,6 @@ static int set_rcvarray_entry(struct file *fp, unsigned long vaddr, struct hfi1_ctxtdata *uctxt = fd->uctxt; struct tid_rb_node *node; struct hfi1_devdata *dd = uctxt->dd; - struct rb_root *root = &fd->tid_rb_root; dma_addr_t phys; /* @@ -863,9 +858,9 @@ static int set_rcvarray_entry(struct file *fp, unsigned long vaddr, memcpy(node->pages, pages, sizeof(struct page *) * npages); if (HFI1_CAP_IS_USET(TID_UNMAP)) - ret = tid_rb_insert(root, &node->mmu); + ret = tid_rb_insert(fd, &node->mmu); else - ret = hfi1_mmu_rb_insert(root, &node->mmu); + ret = hfi1_mmu_rb_insert(fd->handler, &node->mmu); if (ret) { hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d", @@ -906,9 +901,9 @@ static int unprogram_rcvarray(struct file *fp, u32 tidinfo, if (!node || node->rcventry != (uctxt->expected_base + rcventry)) return -EBADF; if (HFI1_CAP_IS_USET(TID_UNMAP)) - tid_rb_remove(&fd->tid_rb_root, &node->mmu, fd->mm); + tid_rb_remove(fd, &node->mmu, fd->mm); else - hfi1_mmu_rb_remove(&fd->tid_rb_root, &node->mmu); + hfi1_mmu_rb_remove(fd->handler, &node->mmu); if (grp) *grp = node->grp; @@ -950,11 +945,10 @@ static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node) } static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt, - struct exp_tid_set *set, struct rb_root *root) + struct exp_tid_set *set, + struct hfi1_filedata *fd) { struct tid_group 
*grp, *ptr; - struct hfi1_filedata *fd = container_of(root, struct hfi1_filedata, - tid_rb_root); int i; list_for_each_entry_safe(grp, ptr, &set->list, list) { @@ -970,10 +964,9 @@ static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt, if (!node || node->rcventry != rcventry) continue; if (HFI1_CAP_IS_USET(TID_UNMAP)) - tid_rb_remove(&fd->tid_rb_root, - &node->mmu, fd->mm); + tid_rb_remove(fd, &node->mmu, fd->mm); else - hfi1_mmu_rb_remove(&fd->tid_rb_root, + hfi1_mmu_rb_remove(fd->handler, &node->mmu); clear_tid_node(fd, node); } @@ -981,10 +974,9 @@ static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt, } } -static int tid_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode) +static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode) { - struct hfi1_filedata *fdata = - container_of(root, struct hfi1_filedata, tid_rb_root); + struct hfi1_filedata *fdata = arg; struct hfi1_ctxtdata *uctxt = fdata->uctxt; struct tid_rb_node *node = container_of(mnode, struct tid_rb_node, mmu); @@ -1025,10 +1017,9 @@ static int tid_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode) return 0; } -static int tid_rb_insert(struct rb_root *root, struct mmu_rb_node *node) +static int tid_rb_insert(void *arg, struct mmu_rb_node *node) { - struct hfi1_filedata *fdata = - container_of(root, struct hfi1_filedata, tid_rb_root); + struct hfi1_filedata *fdata = arg; struct tid_rb_node *tnode = container_of(node, struct tid_rb_node, mmu); u32 base = fdata->uctxt->expected_base; @@ -1037,11 +1028,10 @@ static int tid_rb_insert(struct rb_root *root, struct mmu_rb_node *node) return 0; } -static void tid_rb_remove(struct rb_root *root, struct mmu_rb_node *node, +static void tid_rb_remove(void *arg, struct mmu_rb_node *node, struct mm_struct *mm) { - struct hfi1_filedata *fdata = - container_of(root, struct hfi1_filedata, tid_rb_root); + struct hfi1_filedata *fdata = arg; struct tid_rb_node *tnode = container_of(node, struct tid_rb_node, mmu); u32 base = 
fdata->uctxt->expected_base; diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index 640c244b665b..8be095e1a538 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -305,10 +305,10 @@ static int defer_packet_queue( unsigned seq); static void activate_packet_queue(struct iowait *, int); static bool sdma_rb_filter(struct mmu_rb_node *, unsigned long, unsigned long); -static int sdma_rb_insert(struct rb_root *, struct mmu_rb_node *); -static void sdma_rb_remove(struct rb_root *, struct mmu_rb_node *, +static int sdma_rb_insert(void *, struct mmu_rb_node *); +static void sdma_rb_remove(void *, struct mmu_rb_node *, struct mm_struct *); -static int sdma_rb_invalidate(struct rb_root *, struct mmu_rb_node *); +static int sdma_rb_invalidate(void *, struct mmu_rb_node *); static struct mmu_rb_ops sdma_rb_ops = { .filter = sdma_rb_filter, @@ -410,7 +410,6 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp) pq->state = SDMA_PKT_Q_INACTIVE; atomic_set(&pq->n_reqs, 0); init_waitqueue_head(&pq->wait); - pq->sdma_rb_root = RB_ROOT; INIT_LIST_HEAD(&pq->evict); spin_lock_init(&pq->evict_lock); pq->mm = fd->mm; @@ -443,7 +442,7 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp) cq->nentries = hfi1_sdma_comp_ring_size; fd->cq = cq; - ret = hfi1_mmu_rb_register(pq->mm, &pq->sdma_rb_root, &sdma_rb_ops); + ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, &pq->handler); if (ret) { dd_dev_err(dd, "Failed to register with MMU %d", ret); goto done; @@ -481,7 +480,8 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd) uctxt->ctxt, fd->subctxt); pq = fd->pq; if (pq) { - hfi1_mmu_rb_unregister(&pq->sdma_rb_root); + if (pq->handler) + hfi1_mmu_rb_unregister(pq->handler); spin_lock_irqsave(&uctxt->sdma_qlock, flags); if (!list_empty(&pq->list)) list_del_init(&pq->list); @@ -1145,7 +1145,7 @@ static u32 sdma_cache_evict(struct 
hfi1_user_sdma_pkt_q *pq, u32 npages) spin_unlock(&pq->evict_lock); list_for_each_entry_safe(node, ptr, &to_evict, list) - hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb); + hfi1_mmu_rb_remove(pq->handler, &node->rb); return cleared; } @@ -1159,7 +1159,7 @@ static int pin_vector_pages(struct user_sdma_request *req, struct sdma_mmu_node *node = NULL; struct mmu_rb_node *rb_node; - rb_node = hfi1_mmu_rb_extract(&pq->sdma_rb_root, + rb_node = hfi1_mmu_rb_extract(pq->handler, (unsigned long)iovec->iov.iov_base, iovec->iov.iov_len); if (rb_node && !IS_ERR(rb_node)) @@ -1240,7 +1240,7 @@ retry: iovec->npages = npages; iovec->node = node; - ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb); + ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb); if (ret) { spin_lock(&pq->evict_lock); if (!list_empty(&node->list)) @@ -1612,7 +1612,7 @@ static void user_sdma_free_request(struct user_sdma_request *req, bool unpin) continue; if (unpin) - hfi1_mmu_rb_remove(&req->pq->sdma_rb_root, + hfi1_mmu_rb_remove(req->pq->handler, &node->rb); else atomic_dec(&node->refcount); @@ -1642,7 +1642,7 @@ static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, return (bool)(node->addr == addr); } -static int sdma_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode) +static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode) { struct sdma_mmu_node *node = container_of(mnode, struct sdma_mmu_node, rb); @@ -1651,7 +1651,7 @@ static int sdma_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode) return 0; } -static void sdma_rb_remove(struct rb_root *root, struct mmu_rb_node *mnode, +static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode, struct mm_struct *mm) { struct sdma_mmu_node *node = @@ -1692,7 +1692,7 @@ static void sdma_rb_remove(struct rb_root *root, struct mmu_rb_node *mnode, kfree(node); } -static int sdma_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode) +static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode) 
{ struct sdma_mmu_node *node = container_of(mnode, struct sdma_mmu_node, rb); diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h index ff49f74f43f4..bcdc9e8ae1f0 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.h +++ b/drivers/infiniband/hw/hfi1/user_sdma.h @@ -68,7 +68,7 @@ struct hfi1_user_sdma_pkt_q { unsigned state; wait_queue_head_t wait; unsigned long unpinned; - struct rb_root sdma_rb_root; + struct mmu_rb_handler *handler; u32 n_locked; struct list_head evict; spinlock_t evict_lock; /* protect evict and n_locked */ From 622c202c4a4697636334761d7ca295ebd35074e4 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 28 Jul 2016 15:21:21 -0400 Subject: [PATCH 78/84] IB/hfi1: Fix TID caching actions Per file descriptor TID caching actions depend on a global that can change midway through the lifetime of that file descriptor. Make the use of caching consistent for the life of the file descriptor by using the presence of the cache handler to decide when to use the cache functions. 
Reviewed-by: Ira Weiny Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/file_ops.c | 4 ++++ drivers/infiniband/hw/hfi1/user_exp_rcv.c | 16 +++++++--------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 302f0cdd8119..4f39bffad74a 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -1132,6 +1132,10 @@ static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len) HFI1_CAP_MISC_MASK) << HFI1_CAP_USER_SHIFT) | HFI1_CAP_UGET_MASK(uctxt->flags, MASK) | HFI1_CAP_KGET_MASK(uctxt->flags, K2U); + /* adjust flag if this fd is not able to cache */ + if (!fd->handler) + cinfo.runtime_flags |= HFI1_CAP_TID_UNMAP; /* no caching */ + cinfo.num_active = hfi1_count_active_units(); cinfo.unit = uctxt->dd->unit; cinfo.ctxt = uctxt->ctxt; diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index 269a948189e0..9b740db34963 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -196,7 +196,7 @@ int hfi1_user_exp_rcv_init(struct file *fp) if (!fd->entry_to_rb) return -ENOMEM; - if (!HFI1_CAP_IS_USET(TID_UNMAP)) { + if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) { fd->invalid_tid_idx = 0; fd->invalid_tids = kzalloc(uctxt->expected_count * sizeof(u32), GFP_KERNEL); @@ -207,15 +207,13 @@ int hfi1_user_exp_rcv_init(struct file *fp) /* * Register MMU notifier callbacks. If the registration - * fails, continue but turn off the TID caching for - * all user contexts. + * fails, continue without TID caching for this context. */ ret = hfi1_mmu_rb_register(fd, fd->mm, &tid_rb_ops, &fd->handler); if (ret) { dd_dev_info(dd, "Failed MMU notifier registration %d\n", ret); - HFI1_CAP_USET(TID_UNMAP); ret = 0; } } @@ -234,7 +232,7 @@ int hfi1_user_exp_rcv_init(struct file *fp) * init. 
*/ spin_lock(&fd->tid_lock); - if (uctxt->subctxt_cnt && !HFI1_CAP_IS_USET(TID_UNMAP)) { + if (uctxt->subctxt_cnt && fd->handler) { u16 remainder; fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt; @@ -260,7 +258,7 @@ int hfi1_user_exp_rcv_free(struct hfi1_filedata *fd) * The notifier would have been removed when the process'es mm * was freed. */ - if (!HFI1_CAP_IS_USET(TID_UNMAP)) + if (fd->handler) hfi1_mmu_rb_unregister(fd->handler); kfree(fd->invalid_tids); @@ -857,7 +855,7 @@ static int set_rcvarray_entry(struct file *fp, unsigned long vaddr, node->freed = false; memcpy(node->pages, pages, sizeof(struct page *) * npages); - if (HFI1_CAP_IS_USET(TID_UNMAP)) + if (!fd->handler) ret = tid_rb_insert(fd, &node->mmu); else ret = hfi1_mmu_rb_insert(fd->handler, &node->mmu); @@ -900,7 +898,7 @@ static int unprogram_rcvarray(struct file *fp, u32 tidinfo, node = fd->entry_to_rb[rcventry]; if (!node || node->rcventry != (uctxt->expected_base + rcventry)) return -EBADF; - if (HFI1_CAP_IS_USET(TID_UNMAP)) + if (!fd->handler) tid_rb_remove(fd, &node->mmu, fd->mm); else hfi1_mmu_rb_remove(fd->handler, &node->mmu); @@ -963,7 +961,7 @@ static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt, uctxt->expected_base]; if (!node || node->rcventry != rcventry) continue; - if (HFI1_CAP_IS_USET(TID_UNMAP)) + if (!fd->handler) tid_rb_remove(fd, &node->mmu, fd->mm); else hfi1_mmu_rb_remove(fd->handler, From 1034599805009394cc42e6c538575d12d8dc57fa Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 28 Jul 2016 15:21:22 -0400 Subject: [PATCH 79/84] IB/hfi1: Add evict operation to the mmu rb handler Allow users to clear nodes from the rb tree based on their evict callback. 
Reviewed-by: Ira Weiny Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/mmu_rb.c | 34 +++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/mmu_rb.h | 4 ++++ 2 files changed, 38 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index 9fbcfed4d34c..97f2d3680751 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -206,6 +206,40 @@ struct mmu_rb_node *hfi1_mmu_rb_extract(struct mmu_rb_handler *handler, return node; } +void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg) +{ + struct mmu_rb_node *rbnode; + struct rb_node *node, *next; + struct list_head del_list; + unsigned long flags; + bool stop = false; + + INIT_LIST_HEAD(&del_list); + + spin_lock_irqsave(&handler->lock, flags); + for (node = rb_first(&handler->root); node; node = next) { + next = rb_next(node); + rbnode = rb_entry(node, struct mmu_rb_node, node); + if (handler->ops->evict(handler->ops_arg, rbnode, evict_arg, + &stop)) { + __mmu_int_rb_remove(rbnode, &handler->root); + list_add(&rbnode->list, &del_list); + } + if (stop) + break; + } + spin_unlock_irqrestore(&handler->lock, flags); + + down_write(&handler->mm->mmap_sem); + while (!list_empty(&del_list)) { + rbnode = list_first_entry(&del_list, struct mmu_rb_node, list); + list_del(&rbnode->list); + handler->ops->remove(handler->ops_arg, rbnode, + handler->mm); + } + up_write(&handler->mm->mmap_sem); +} + void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler, struct mmu_rb_node *node) { diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h index 2cedfbe2189e..09e5888c0818 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.h +++ b/drivers/infiniband/hw/hfi1/mmu_rb.h @@ -54,6 +54,7 @@ struct mmu_rb_node { unsigned long len; unsigned long __last; struct rb_node node; + struct list_head list; }; struct mmu_rb_ops { @@ -63,6 +64,8 @@ struct mmu_rb_ops { void (*remove)(void 
*ops_arg, struct mmu_rb_node *mnode, struct mm_struct *mm); int (*invalidate)(void *ops_arg, struct mmu_rb_node *node); + int (*evict)(void *ops_arg, struct mmu_rb_node *mnode, + void *evict_arg, bool *stop); }; int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm, @@ -71,6 +74,7 @@ int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm, void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler); int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler, struct mmu_rb_node *mnode); +void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg); void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler, struct mmu_rb_node *mnode); struct mmu_rb_node *hfi1_mmu_rb_extract(struct mmu_rb_handler *handler, From b7df192f74a8cde22f6dc0680a2daa40540ed72f Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 28 Jul 2016 15:21:23 -0400 Subject: [PATCH 80/84] IB/hfi1: Use evict mmu rb operation Use the new cache evict operation in the SDMA code. This allows the cache to properly coordinate evicts and removes, preventing any race. With this change, the separate list, lock, and race flag are not needed. 
Reviewed-by: Ira Weiny Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/user_sdma.c | 116 ++++++++++--------------- drivers/infiniband/hw/hfi1/user_sdma.h | 4 +- 2 files changed, 47 insertions(+), 73 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index 8be095e1a538..3d76222d1aac 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -183,16 +183,18 @@ struct user_sdma_iovec { struct sdma_mmu_node *node; }; -#define SDMA_CACHE_NODE_EVICT 0 - struct sdma_mmu_node { struct mmu_rb_node rb; - struct list_head list; struct hfi1_user_sdma_pkt_q *pq; atomic_t refcount; struct page **pages; unsigned npages; - unsigned long flags; +}; + +/* evict operation argument */ +struct evict_data { + u32 cleared; /* count evicted so far */ + u32 target; /* target count to evict */ }; struct user_sdma_request { @@ -306,6 +308,8 @@ static int defer_packet_queue( static void activate_packet_queue(struct iowait *, int); static bool sdma_rb_filter(struct mmu_rb_node *, unsigned long, unsigned long); static int sdma_rb_insert(void *, struct mmu_rb_node *); +static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, + void *arg2, bool *stop); static void sdma_rb_remove(void *, struct mmu_rb_node *, struct mm_struct *); static int sdma_rb_invalidate(void *, struct mmu_rb_node *); @@ -313,6 +317,7 @@ static int sdma_rb_invalidate(void *, struct mmu_rb_node *); static struct mmu_rb_ops sdma_rb_ops = { .filter = sdma_rb_filter, .insert = sdma_rb_insert, + .evict = sdma_rb_evict, .remove = sdma_rb_remove, .invalidate = sdma_rb_invalidate }; @@ -410,8 +415,7 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp) pq->state = SDMA_PKT_Q_INACTIVE; atomic_set(&pq->n_reqs, 0); init_waitqueue_head(&pq->wait); - INIT_LIST_HEAD(&pq->evict); - spin_lock_init(&pq->evict_lock); + atomic_set(&pq->n_locked, 0); pq->mm = fd->mm; 
iowait_init(&pq->busy, 0, NULL, defer_packet_queue, @@ -1126,28 +1130,12 @@ static inline int num_user_pages(const struct iovec *iov) static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages) { - u32 cleared = 0; - struct sdma_mmu_node *node, *ptr; - struct list_head to_evict = LIST_HEAD_INIT(to_evict); + struct evict_data evict_data; - spin_lock(&pq->evict_lock); - list_for_each_entry_safe_reverse(node, ptr, &pq->evict, list) { - /* Make sure that no one is still using the node. */ - if (!atomic_read(&node->refcount)) { - set_bit(SDMA_CACHE_NODE_EVICT, &node->flags); - list_del_init(&node->list); - list_add(&node->list, &to_evict); - cleared += node->npages; - if (cleared >= npages) - break; - } - } - spin_unlock(&pq->evict_lock); - - list_for_each_entry_safe(node, ptr, &to_evict, list) - hfi1_mmu_rb_remove(pq->handler, &node->rb); - - return cleared; + evict_data.cleared = 0; + evict_data.target = npages; + hfi1_mmu_rb_evict(pq->handler, &evict_data); + return evict_data.cleared; } static int pin_vector_pages(struct user_sdma_request *req, @@ -1175,7 +1163,6 @@ static int pin_vector_pages(struct user_sdma_request *req, node->rb.addr = (unsigned long)iovec->iov.iov_base; node->pq = pq; atomic_set(&node->refcount, 0); - INIT_LIST_HEAD(&node->list); } npages = num_user_pages(&iovec->iov); @@ -1190,23 +1177,9 @@ static int pin_vector_pages(struct user_sdma_request *req, npages -= node->npages; - /* - * If rb_node is NULL, it means that this is brand new node - * and, therefore not on the eviction list. - * If, however, the rb_node is non-NULL, it means that the - * node is already in RB tree and, therefore on the eviction - * list (nodes are unconditionally inserted in the eviction - * list). In that case, we have to remove the node prior to - * calling the eviction function in order to prevent it from - * freeing this node. 
- */ - if (rb_node) { - spin_lock(&pq->evict_lock); - list_del_init(&node->list); - spin_unlock(&pq->evict_lock); - } retry: - if (!hfi1_can_pin_pages(pq->dd, pq->mm, pq->n_locked, npages)) { + if (!hfi1_can_pin_pages(pq->dd, pq->mm, + atomic_read(&pq->n_locked), npages)) { cleared = sdma_cache_evict(pq, npages); if (cleared >= npages) goto retry; @@ -1231,10 +1204,7 @@ retry: node->pages = pages; node->npages += pinned; npages = node->npages; - spin_lock(&pq->evict_lock); - list_add(&node->list, &pq->evict); - pq->n_locked += pinned; - spin_unlock(&pq->evict_lock); + atomic_add(pinned, &pq->n_locked); } iovec->pages = node->pages; iovec->npages = npages; @@ -1242,11 +1212,7 @@ retry: ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb); if (ret) { - spin_lock(&pq->evict_lock); - if (!list_empty(&node->list)) - list_del(&node->list); - pq->n_locked -= node->npages; - spin_unlock(&pq->evict_lock); + atomic_sub(node->npages, &pq->n_locked); iovec->node = NULL; goto bail; } @@ -1651,29 +1617,39 @@ static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode) return 0; } +/* + * Return 1 to remove the node from the rb tree and call the remove op. + * + * Called with the rb tree lock held. + */ +static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, + void *evict_arg, bool *stop) +{ + struct sdma_mmu_node *node = + container_of(mnode, struct sdma_mmu_node, rb); + struct evict_data *evict_data = evict_arg; + + /* is this node still being used? */ + if (atomic_read(&node->refcount)) + return 0; /* keep this node */ + + /* this node will be evicted, add its pages to our count */ + evict_data->cleared += node->npages; + + /* have enough pages been cleared? 
*/ + if (evict_data->cleared >= evict_data->target) + *stop = true; + + return 1; /* remove this node */ +} + static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode, struct mm_struct *mm) { struct sdma_mmu_node *node = container_of(mnode, struct sdma_mmu_node, rb); - spin_lock(&node->pq->evict_lock); - /* - * We've been called by the MMU notifier but this node has been - * scheduled for eviction. The eviction function will take care - * of freeing this node. - * We have to take the above lock first because we are racing - * against the setting of the bit in the eviction function. - */ - if (mm && test_bit(SDMA_CACHE_NODE_EVICT, &node->flags)) { - spin_unlock(&node->pq->evict_lock); - return; - } - - if (!list_empty(&node->list)) - list_del(&node->list); - node->pq->n_locked -= node->npages; - spin_unlock(&node->pq->evict_lock); + atomic_sub(node->npages, &node->pq->n_locked); /* * If mm is set, we are being called by the MMU notifier and we diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h index bcdc9e8ae1f0..39001714f551 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.h +++ b/drivers/infiniband/hw/hfi1/user_sdma.h @@ -69,9 +69,7 @@ struct hfi1_user_sdma_pkt_q { wait_queue_head_t wait; unsigned long unpinned; struct mmu_rb_handler *handler; - u32 n_locked; - struct list_head evict; - spinlock_t evict_lock; /* protect evict and n_locked */ + atomic_t n_locked; struct mm_struct *mm; }; From b85ced91511f6c3add9a74ae13e12ba568bfa1af Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 28 Jul 2016 15:21:24 -0400 Subject: [PATCH 81/84] IB/hfi1: Consistently call ops->remove outside spinlock The ops->remove() callback was called by hfi1_mmu_rb_unregister() with a NULL mm argument while holding a spinlock. In the case of sdma_rb_remove() this caused it to pass current->mm to hfi1_release_user_pages(). This had 2 problems. First this would attempt to acquire the mmap_sem under a spin lock.
Second the use of current->mm is not always guaranteed to be the proper mm when the fd is being closed. Rather than depend on this implicit behavior we move all calls to ops->remove outside of the spinlock. This also allows the correct mm to be used in the remove callback without fear of deadlock. Because the MMU notifier is not guaranteed to hold mm->mmap_sem, but usually does, we must delay all remove callbacks until out of the notifier, when the callbacks can take the mmap_sem if they need to. Code comments were added to clarify what the expectations are for the users of the mmu rb tree. Suggested-by: Jim Foraker Reviewed-by: Ira Weiny Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/mmu_rb.c | 76 +++++++++++++++++++++-- drivers/infiniband/hw/hfi1/mmu_rb.h | 5 ++ drivers/infiniband/hw/hfi1/user_exp_rcv.c | 4 +- drivers/infiniband/hw/hfi1/user_sdma.c | 22 ++----- 4 files changed, 84 insertions(+), 23 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index 97f2d3680751..76f7b0403207 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -59,6 +59,9 @@ struct mmu_rb_handler { spinlock_t lock; /* protect the RB tree */ struct mmu_rb_ops *ops; struct mm_struct *mm; + struct work_struct del_work; + struct list_head del_list; + struct workqueue_struct *wq; }; static unsigned long mmu_node_start(struct mmu_rb_node *); @@ -73,6 +76,9 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *, unsigned long, unsigned long); static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *, unsigned long, unsigned long); +static void do_remove(struct mmu_rb_handler *handler, + struct list_head *del_list); +static void handle_remove(struct work_struct *work); static struct mmu_notifier_ops mn_opts = { .invalidate_page = mmu_notifier_page, @@ -94,6 +100,7 @@ static unsigned long mmu_node_last(struct mmu_rb_node *node) int 
hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm, struct mmu_rb_ops *ops, + struct workqueue_struct *wq, struct mmu_rb_handler **handler) { struct mmu_rb_handler *handlr; @@ -110,6 +117,9 @@ int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm, spin_lock_init(&handlr->lock); handlr->mn.ops = &mn_opts; handlr->mm = mm; + INIT_WORK(&handlr->del_work, handle_remove); + INIT_LIST_HEAD(&handlr->del_list); + handlr->wq = wq; ret = mmu_notifier_register(&handlr->mn, handlr->mm); if (ret) { @@ -126,19 +136,29 @@ void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler) struct mmu_rb_node *rbnode; struct rb_node *node; unsigned long flags; + struct list_head del_list; /* Unregister first so we don't get any more notifications. */ mmu_notifier_unregister(&handler->mn, handler->mm); + /* + * Make sure the wq delete handler is finished running. It will not + * be triggered once the mmu notifiers are unregistered above. + */ + flush_work(&handler->del_work); + + INIT_LIST_HEAD(&del_list); + spin_lock_irqsave(&handler->lock, flags); while ((node = rb_first(&handler->root))) { rbnode = rb_entry(node, struct mmu_rb_node, node); rb_erase(node, &handler->root); - handler->ops->remove(handler->ops_arg, rbnode, - NULL); + list_add(&rbnode->list, &del_list); } spin_unlock_irqrestore(&handler->lock, flags); + do_remove(handler, &del_list); + kfree(handler); } @@ -230,16 +250,19 @@ void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg) } spin_unlock_irqrestore(&handler->lock, flags); - down_write(&handler->mm->mmap_sem); while (!list_empty(&del_list)) { rbnode = list_first_entry(&del_list, struct mmu_rb_node, list); list_del(&rbnode->list); handler->ops->remove(handler->ops_arg, rbnode, handler->mm); } - up_write(&handler->mm->mmap_sem); } +/* + * It is up to the caller to ensure that this function does not race with the + * mmu invalidate notifier which may be calling the users remove callback on + * 'node'. 
+ */ void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler, struct mmu_rb_node *node) { @@ -278,6 +301,7 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn, struct rb_root *root = &handler->root; struct mmu_rb_node *node, *ptr = NULL; unsigned long flags; + bool added = false; spin_lock_irqsave(&handler->lock, flags); for (node = __mmu_int_rb_iter_first(root, start, end - 1); @@ -288,8 +312,50 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn, node->addr, node->len); if (handler->ops->invalidate(handler->ops_arg, node)) { __mmu_int_rb_remove(node, root); - handler->ops->remove(handler->ops_arg, node, mm); + list_add(&node->list, &handler->del_list); + added = true; } } spin_unlock_irqrestore(&handler->lock, flags); + + if (added) + queue_work(handler->wq, &handler->del_work); +} + +/* + * Call the remove function for the given handler and the list. This + * is expected to be called with a delete list extracted from handler. + * The caller should not be holding the handler lock. + */ +static void do_remove(struct mmu_rb_handler *handler, + struct list_head *del_list) +{ + struct mmu_rb_node *node; + + while (!list_empty(del_list)) { + node = list_first_entry(del_list, struct mmu_rb_node, list); + list_del(&node->list); + handler->ops->remove(handler->ops_arg, node, handler->mm); + } +} + +/* + * Work queue function to remove all nodes that have been queued up to + * be removed. The key feature is that mm->mmap_sem is not being held + * and the remove callback can sleep while taking it, if needed. 
+ */ +static void handle_remove(struct work_struct *work) +{ + struct mmu_rb_handler *handler = container_of(work, + struct mmu_rb_handler, + del_work); + struct list_head del_list; + unsigned long flags; + + /* remove anything that is queued to get removed */ + spin_lock_irqsave(&handler->lock, flags); + list_replace_init(&handler->del_list, &del_list); + spin_unlock_irqrestore(&handler->lock, flags); + + do_remove(handler, &del_list); } diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h index 09e5888c0818..e4f853fa91e6 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.h +++ b/drivers/infiniband/hw/hfi1/mmu_rb.h @@ -57,6 +57,10 @@ struct mmu_rb_node { struct list_head list; }; +/* + * NOTE: filter, insert, invalidate, and evict must not sleep. Only remove is + * allowed to sleep. + */ struct mmu_rb_ops { bool (*filter)(struct mmu_rb_node *node, unsigned long addr, unsigned long len); @@ -70,6 +74,7 @@ struct mmu_rb_ops { int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm, struct mmu_rb_ops *ops, + struct workqueue_struct *wq, struct mmu_rb_handler **handler); void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler); int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler, diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index 9b740db34963..3bcda22e7b87 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -209,7 +209,9 @@ int hfi1_user_exp_rcv_init(struct file *fp) * Register MMU notifier callbacks. If the registration * fails, continue without TID caching for this context. 
*/ - ret = hfi1_mmu_rb_register(fd, fd->mm, &tid_rb_ops, &fd->handler); + ret = hfi1_mmu_rb_register(fd, fd->mm, &tid_rb_ops, + dd->pport->hfi1_wq, + &fd->handler); if (ret) { dd_dev_info(dd, "Failed MMU notifier registration %d\n", diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index 3d76222d1aac..751aa2260c1c 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -310,8 +310,7 @@ static bool sdma_rb_filter(struct mmu_rb_node *, unsigned long, unsigned long); static int sdma_rb_insert(void *, struct mmu_rb_node *); static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, void *arg2, bool *stop); -static void sdma_rb_remove(void *, struct mmu_rb_node *, - struct mm_struct *); +static void sdma_rb_remove(void *, struct mmu_rb_node *, struct mm_struct *); static int sdma_rb_invalidate(void *, struct mmu_rb_node *); static struct mmu_rb_ops sdma_rb_ops = { @@ -446,7 +445,8 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp) cq->nentries = hfi1_sdma_comp_ring_size; fd->cq = cq; - ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, &pq->handler); + ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq, + &pq->handler); if (ret) { dd_dev_err(dd, "Failed to register with MMU %d", ret); goto done; @@ -1651,20 +1651,8 @@ static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode, atomic_sub(node->npages, &node->pq->n_locked); - /* - * If mm is set, we are being called by the MMU notifier and we - * should not pass a mm_struct to unpin_vector_page(). This is to - * prevent a deadlock when hfi1_release_user_pages() attempts to - * take the mmap_sem, which the MMU notifier has already taken. - */ - unpin_vector_pages(mm ? NULL : current->mm, node->pages, 0, - node->npages); - /* - * If called by the MMU notifier, we have to adjust the pinned - * page count ourselves. 
- */ - if (mm) - mm->pinned_vm -= node->npages; + unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages); + kfree(node); } From 082b3532915395ea6620ba691138baf151a543b0 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 28 Jul 2016 15:21:25 -0400 Subject: [PATCH 82/84] IB/hfi1: Remove unneeded mm argument in remove function The reworked mmu_rb interface allows the unused mm argument to be removed. Reviewed-by: Ira Weiny Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/mmu_rb.c | 7 +++---- drivers/infiniband/hw/hfi1/mmu_rb.h | 3 +-- drivers/infiniband/hw/hfi1/user_exp_rcv.c | 10 ++++------ drivers/infiniband/hw/hfi1/user_sdma.c | 5 ++--- 4 files changed, 10 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index 76f7b0403207..a02344a9d746 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -253,8 +253,7 @@ void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg) while (!list_empty(&del_list)) { rbnode = list_first_entry(&del_list, struct mmu_rb_node, list); list_del(&rbnode->list); - handler->ops->remove(handler->ops_arg, rbnode, - handler->mm); + handler->ops->remove(handler->ops_arg, rbnode); } } @@ -275,7 +274,7 @@ void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler, __mmu_int_rb_remove(node, &handler->root); spin_unlock_irqrestore(&handler->lock, flags); - handler->ops->remove(handler->ops_arg, node, NULL); + handler->ops->remove(handler->ops_arg, node); } static inline void mmu_notifier_page(struct mmu_notifier *mn, @@ -335,7 +334,7 @@ static void do_remove(struct mmu_rb_handler *handler, while (!list_empty(del_list)) { node = list_first_entry(del_list, struct mmu_rb_node, list); list_del(&node->list); - handler->ops->remove(handler->ops_arg, node, handler->mm); + handler->ops->remove(handler->ops_arg, node); } } diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h 
b/drivers/infiniband/hw/hfi1/mmu_rb.h index e4f853fa91e6..754f6ebf13fb 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.h +++ b/drivers/infiniband/hw/hfi1/mmu_rb.h @@ -65,8 +65,7 @@ struct mmu_rb_ops { bool (*filter)(struct mmu_rb_node *node, unsigned long addr, unsigned long len); int (*insert)(void *ops_arg, struct mmu_rb_node *mnode); - void (*remove)(void *ops_arg, struct mmu_rb_node *mnode, - struct mm_struct *mm); + void (*remove)(void *ops_arg, struct mmu_rb_node *mnode); int (*invalidate)(void *ops_arg, struct mmu_rb_node *node); int (*evict)(void *ops_arg, struct mmu_rb_node *mnode, void *evict_arg, bool *stop); diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index 3bcda22e7b87..8717e11fe3f5 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -87,8 +87,7 @@ static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *); static int set_rcvarray_entry(struct file *, unsigned long, u32, struct tid_group *, struct page **, unsigned); static int tid_rb_insert(void *, struct mmu_rb_node *); -static void tid_rb_remove(void *, struct mmu_rb_node *, - struct mm_struct *); +static void tid_rb_remove(void *, struct mmu_rb_node *); static int tid_rb_invalidate(void *, struct mmu_rb_node *); static int program_rcvarray(struct file *, unsigned long, struct tid_group *, struct tid_pageset *, unsigned, u16, struct page **, @@ -901,7 +900,7 @@ static int unprogram_rcvarray(struct file *fp, u32 tidinfo, if (!node || node->rcventry != (uctxt->expected_base + rcventry)) return -EBADF; if (!fd->handler) - tid_rb_remove(fd, &node->mmu, fd->mm); + tid_rb_remove(fd, &node->mmu); else hfi1_mmu_rb_remove(fd->handler, &node->mmu); @@ -964,7 +963,7 @@ static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt, if (!node || node->rcventry != rcventry) continue; if (!fd->handler) - tid_rb_remove(fd, &node->mmu, fd->mm); + tid_rb_remove(fd, &node->mmu); else 
hfi1_mmu_rb_remove(fd->handler, &node->mmu); @@ -1028,8 +1027,7 @@ static int tid_rb_insert(void *arg, struct mmu_rb_node *node) return 0; } -static void tid_rb_remove(void *arg, struct mmu_rb_node *node, - struct mm_struct *mm) +static void tid_rb_remove(void *arg, struct mmu_rb_node *node) { struct hfi1_filedata *fdata = arg; struct tid_rb_node *tnode = diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index 751aa2260c1c..0ecf27903dc2 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -310,7 +310,7 @@ static bool sdma_rb_filter(struct mmu_rb_node *, unsigned long, unsigned long); static int sdma_rb_insert(void *, struct mmu_rb_node *); static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, void *arg2, bool *stop); -static void sdma_rb_remove(void *, struct mmu_rb_node *, struct mm_struct *); +static void sdma_rb_remove(void *, struct mmu_rb_node *); static int sdma_rb_invalidate(void *, struct mmu_rb_node *); static struct mmu_rb_ops sdma_rb_ops = { @@ -1643,8 +1643,7 @@ static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, return 1; /* remove this node */ } -static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode, - struct mm_struct *mm) +static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode) { struct sdma_mmu_node *node = container_of(mnode, struct sdma_mmu_node, rb); From 2677a7680e773195a4fdabd163d756cac1b9abd7 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 28 Jul 2016 15:21:26 -0400 Subject: [PATCH 83/84] IB/hfi1: Fix memory leak during unexpected shutdown During an unexpected shutdown, references to tid_rb_node were NULL'ed out without properly being released. Fix this by calling clear_tid_node in the mmu notifier remove callback rather than after these callbacks are called. 
Reviewed-by: Dean Luick Signed-off-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/user_exp_rcv.c | 46 ++++++++++++++++------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index 8717e11fe3f5..64d26525435a 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -87,13 +87,15 @@ static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *); static int set_rcvarray_entry(struct file *, unsigned long, u32, struct tid_group *, struct page **, unsigned); static int tid_rb_insert(void *, struct mmu_rb_node *); +static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata, + struct tid_rb_node *tnode); static void tid_rb_remove(void *, struct mmu_rb_node *); static int tid_rb_invalidate(void *, struct mmu_rb_node *); static int program_rcvarray(struct file *, unsigned long, struct tid_group *, struct tid_pageset *, unsigned, u16, struct page **, u32 *, unsigned *, unsigned *); static int unprogram_rcvarray(struct file *, u32, struct tid_group **); -static void clear_tid_node(struct hfi1_filedata *, struct tid_rb_node *); +static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node); static struct mmu_rb_ops tid_rb_ops = { .insert = tid_rb_insert, @@ -899,14 +901,15 @@ static int unprogram_rcvarray(struct file *fp, u32 tidinfo, node = fd->entry_to_rb[rcventry]; if (!node || node->rcventry != (uctxt->expected_base + rcventry)) return -EBADF; - if (!fd->handler) - tid_rb_remove(fd, &node->mmu); - else - hfi1_mmu_rb_remove(fd->handler, &node->mmu); if (grp) *grp = node->grp; - clear_tid_node(fd, node); + + if (!fd->handler) + cacheless_tid_rb_remove(fd, node); + else + hfi1_mmu_rb_remove(fd->handler, &node->mmu); + return 0; } @@ -943,6 +946,10 @@ static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node) kfree(node); } +/* + * As a simple helper 
for hfi1_user_exp_rcv_free, this function deals with + * clearing nodes in the non-cached case. + */ static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt, struct exp_tid_set *set, struct hfi1_filedata *fd) @@ -962,17 +969,20 @@ static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt, uctxt->expected_base]; if (!node || node->rcventry != rcventry) continue; - if (!fd->handler) - tid_rb_remove(fd, &node->mmu); - else - hfi1_mmu_rb_remove(fd->handler, - &node->mmu); - clear_tid_node(fd, node); + + cacheless_tid_rb_remove(fd, node); } } } } +/* + * Always return 0 from this function. A non-zero return indicates that the + * remove operation will be called and that memory should be unpinned. + * However, the driver cannot unpin out from under PSM. Instead, retain the + * memory (by returning 0) and inform PSM that the memory is going away. PSM + * will call back later when it has removed the memory from its list. + */ static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode) { struct hfi1_filedata *fdata = arg; @@ -1027,12 +1037,20 @@ static int tid_rb_insert(void *arg, struct mmu_rb_node *node) return 0; } +static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata, + struct tid_rb_node *tnode) +{ + u32 base = fdata->uctxt->expected_base; + + fdata->entry_to_rb[tnode->rcventry - base] = NULL; + clear_tid_node(fdata, tnode); +} + static void tid_rb_remove(void *arg, struct mmu_rb_node *node) { struct hfi1_filedata *fdata = arg; struct tid_rb_node *tnode = container_of(node, struct tid_rb_node, mmu); - u32 base = fdata->uctxt->expected_base; - fdata->entry_to_rb[tnode->rcventry - base] = NULL; + cacheless_tid_rb_remove(fdata, tnode); } From 0636e9ab8355c82ff7e9d6bb8aa2ded834b1f88d Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Thu, 28 Jul 2016 15:21:27 -0400 Subject: [PATCH 84/84] IB/hfi1: Add cache evict LRU list The original code used a LRU list to evict nodes which were least recently used. 
For correctness the evict code was moved under the handler->lock, now add back the LRU list. Reviewed-by: Ira Weiny Signed-off-by: Dean Luick Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/mmu_rb.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index a02344a9d746..7ad30898fc19 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -59,6 +59,7 @@ struct mmu_rb_handler { spinlock_t lock; /* protect the RB tree */ struct mmu_rb_ops *ops; struct mm_struct *mm; + struct list_head lru_list; struct work_struct del_work; struct list_head del_list; struct workqueue_struct *wq; @@ -119,6 +120,7 @@ int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm, handlr->mm = mm; INIT_WORK(&handlr->del_work, handle_remove); INIT_LIST_HEAD(&handlr->del_list); + INIT_LIST_HEAD(&handlr->lru_list); handlr->wq = wq; ret = mmu_notifier_register(&handlr->mn, handlr->mm); @@ -153,7 +155,8 @@ void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler) while ((node = rb_first(&handler->root))) { rbnode = rb_entry(node, struct mmu_rb_node, node); rb_erase(node, &handler->root); - list_add(&rbnode->list, &del_list); + /* move from LRU list to delete list */ + list_move(&rbnode->list, &del_list); } spin_unlock_irqrestore(&handler->lock, flags); @@ -178,10 +181,13 @@ int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler, goto unlock; } __mmu_int_rb_insert(mnode, &handler->root); + list_add(&mnode->list, &handler->lru_list); ret = handler->ops->insert(handler->ops_arg, mnode); - if (ret) + if (ret) { __mmu_int_rb_remove(mnode, &handler->root); + list_del(&mnode->list); /* remove from LRU list */ + } unlock: spin_unlock_irqrestore(&handler->lock, flags); return ret; @@ -219,8 +225,10 @@ struct mmu_rb_node *hfi1_mmu_rb_extract(struct mmu_rb_handler *handler, spin_lock_irqsave(&handler->lock, flags); node = 
__mmu_rb_search(handler, addr, len); - if (node) + if (node) { __mmu_int_rb_remove(node, &handler->root); + list_del(&node->list); /* remove from LRU list */ + } spin_unlock_irqrestore(&handler->lock, flags); return node; @@ -228,8 +236,7 @@ struct mmu_rb_node *hfi1_mmu_rb_extract(struct mmu_rb_handler *handler, void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg) { - struct mmu_rb_node *rbnode; - struct rb_node *node, *next; + struct mmu_rb_node *rbnode, *ptr; struct list_head del_list; unsigned long flags; bool stop = false; @@ -237,13 +244,13 @@ void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg) INIT_LIST_HEAD(&del_list); spin_lock_irqsave(&handler->lock, flags); - for (node = rb_first(&handler->root); node; node = next) { - next = rb_next(node); - rbnode = rb_entry(node, struct mmu_rb_node, node); + list_for_each_entry_safe_reverse(rbnode, ptr, &handler->lru_list, + list) { if (handler->ops->evict(handler->ops_arg, rbnode, evict_arg, &stop)) { __mmu_int_rb_remove(rbnode, &handler->root); - list_add(&rbnode->list, &del_list); + /* move from LRU list to delete list */ + list_move(&rbnode->list, &del_list); } if (stop) break; @@ -272,6 +279,7 @@ void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler, node->len); spin_lock_irqsave(&handler->lock, flags); __mmu_int_rb_remove(node, &handler->root); + list_del(&node->list); /* remove from LRU list */ spin_unlock_irqrestore(&handler->lock, flags); handler->ops->remove(handler->ops_arg, node); @@ -311,7 +319,8 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn, node->addr, node->len); if (handler->ops->invalidate(handler->ops_arg, node)) { __mmu_int_rb_remove(node, root); - list_add(&node->list, &handler->del_list); + /* move from LRU list to delete list */ + list_move(&node->list, &handler->del_list); added = true; } }