From a7b287bf7845fc335d53550a228ad4fa9aa659e5 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Wed, 15 May 2019 13:49:25 +0300 Subject: [PATCH 001/194] IB/iser: Refactor iscsi_iser_check_protection function Reduce lines of code by using local variable. Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/iser/iscsi_iser.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 9c185a8dabd3..dbad8275b3bc 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -406,13 +406,10 @@ static u8 iscsi_iser_check_protection(struct iscsi_task *task, sector_t *sector) { struct iscsi_iser_task *iser_task = task->dd_data; + enum iser_data_dir dir = iser_task->dir[ISER_DIR_IN] ? + ISER_DIR_IN : ISER_DIR_OUT; - if (iser_task->dir[ISER_DIR_IN]) - return iser_check_task_pi_status(iser_task, ISER_DIR_IN, - sector); - else - return iser_check_task_pi_status(iser_task, ISER_DIR_OUT, - sector); + return iser_check_task_pi_status(iser_task, dir, sector); } /** From 0cc2896f02536272fc58a7d7d2fb2e84f6e717b4 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Wed, 15 May 2019 13:49:26 +0300 Subject: [PATCH 002/194] IB/iser: Remove unused sig_attrs argument Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/iser/iser_memory.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 2ba70729d7b0..f431c9b4065c 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -302,8 +302,7 @@ void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, } static void -iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs, - struct ib_sig_domain *domain) +iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_domain *domain) { domain->sig_type = IB_SIG_TYPE_T10_DIF; domain->sig.dif.pi_interval = scsi_prot_interval(sc); @@ -326,21 +325,21 @@ iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs) case SCSI_PROT_WRITE_INSERT: case SCSI_PROT_READ_STRIP: sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE; - iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire); + iser_set_dif_domain(sc, &sig_attrs->wire); sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; break; case SCSI_PROT_READ_INSERT: case SCSI_PROT_WRITE_STRIP: sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE; - iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem); + iser_set_dif_domain(sc, &sig_attrs->mem); sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ? IB_T10DIF_CSUM : IB_T10DIF_CRC; break; case SCSI_PROT_READ_PASS: case SCSI_PROT_WRITE_PASS: - iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire); + iser_set_dif_domain(sc, &sig_attrs->wire); sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; - iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem); + iser_set_dif_domain(sc, &sig_attrs->mem); sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ? 
IB_T10DIF_CSUM : IB_T10DIF_CRC; break; From dae6345aa7a992fda77b5d0de286c9b238931fbd Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Wed, 15 May 2019 13:49:27 +0300 Subject: [PATCH 003/194] IB/isert: Remove unused sig_attrs argument Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/isert/ib_isert.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 989f1ac4245c..ffef4ac152ca 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -2067,8 +2067,7 @@ isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) } static inline void -isert_set_dif_domain(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs, - struct ib_sig_domain *domain) +isert_set_dif_domain(struct se_cmd *se_cmd, struct ib_sig_domain *domain) { domain->sig_type = IB_SIG_TYPE_T10_DIF; domain->sig.dif.bg_type = IB_T10DIF_CRC; @@ -2096,17 +2095,17 @@ isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs) case TARGET_PROT_DIN_INSERT: case TARGET_PROT_DOUT_STRIP: sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE; - isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->wire); + isert_set_dif_domain(se_cmd, &sig_attrs->wire); break; case TARGET_PROT_DOUT_INSERT: case TARGET_PROT_DIN_STRIP: sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE; - isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->mem); + isert_set_dif_domain(se_cmd, &sig_attrs->mem); break; case TARGET_PROT_DIN_PASS: case TARGET_PROT_DOUT_PASS: - isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->wire); - isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->mem); + isert_set_dif_domain(se_cmd, &sig_attrs->wire); + isert_set_dif_domain(se_cmd, &sig_attrs->mem); break; default: isert_err("Unsupported PI operation %d\n", se_cmd->prot_op); From 2d465a165aed9d15c4511de154009164d8933141 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Wed, 15 May 2019 13:49:28 +0300 Subject: [PATCH 004/194] RDMA/rw: Fix doc typo Use the correct function name. Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/rw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index 89a5be3a2f97..e763e42dce05 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -613,7 +613,7 @@ EXPORT_SYMBOL(rdma_rw_ctx_destroy); /** * rdma_rw_ctx_destroy_signature - release all resources allocated by - * rdma_rw_ctx_init_signature + * rdma_rw_ctx_signature_init * @ctx: context to release * @qp: queue pair to operate on * @port_num: port num to which the connection is bound From f73e4076426b3494269e46fd5cf9d89ff63a8598 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Wed, 15 May 2019 13:49:29 +0300 Subject: [PATCH 005/194] RDMA/rw: Print the correct number of sig MRs A wrong value was printed in case of sig MR pool initialization failure. 
Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/rw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index e763e42dce05..deeaf2b4b273 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -731,7 +731,7 @@ int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr) IB_MR_TYPE_SIGNATURE, 2); if (ret) { pr_err("%s: failed to allocated %d SIG MRs\n", - __func__, nr_mrs); + __func__, nr_sig_mrs); goto out_free_rdma_mrs; } } From 53bfbf9be86e95997d6dfe9dbfd0ffce49339efc Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Wed, 15 May 2019 13:49:30 +0300 Subject: [PATCH 006/194] RDMA/rw: Add info regarding SG count failure Print the supported and wanted values for SG count during signature operation. Signed-off-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/rw.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index deeaf2b4b273..acf9ea625811 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -357,7 +357,8 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, int count = 0, ret; if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) { - pr_err("SG count too large\n"); + pr_err("SG count too large: sg_cnt=%d, prot_sg_cnt=%d, pages_per_mr=%d\n", + sg_cnt, prot_sg_cnt, pages_per_mr); return -EINVAL; } From 91f571293e26af3e17e209eed89e2d5777192819 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Wed, 15 May 2019 13:49:31 +0300 Subject: [PATCH 007/194] RDMA/core: Fix doc typo Use the correct function names. Fixes: c4367a26357b ("IB: Pass uverbs_attr_bundle down ib_x destroy path") Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cq.c | 4 ++-- drivers/infiniband/core/verbs.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index a4c81992267c..cb72aa4985a4 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -121,7 +121,7 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) } /** - * __ib_alloc_cq - allocate a completion queue + * __ib_alloc_cq_user - allocate a completion queue * @dev: device to allocate the CQ for * @private: driver private data, accessible from cq->cq_context * @nr_cqe: number of CQEs to allocate @@ -201,7 +201,7 @@ out_destroy_cq: EXPORT_SYMBOL(__ib_alloc_cq_user); /** - * ib_free_cq - free a completion queue + * ib_free_cq_user - free a completion queue * @cq: completion queue to free. * @udata: User data or NULL for kernel object */ diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index e666a1f7608d..4fd5aad890d2 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -316,7 +316,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, EXPORT_SYMBOL(__ib_alloc_pd); /** - * ib_dealloc_pd - Deallocates a protection domain. + * ib_dealloc_pd_user - Deallocates a protection domain. * @pd: The protection domain to deallocate. 
* @udata: Valid user data or NULL for kernel object * @@ -1981,7 +1981,7 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata) EXPORT_SYMBOL(ib_dereg_mr_user); /** - * ib_alloc_mr() - Allocates a memory region + * ib_alloc_mr_user() - Allocates a memory region * @pd: protection domain associated with the region * @mr_type: memory region type * @max_num_sg: maximum sg entries available for registration. From 69054666df0a9b4e8331319f98b6b9a88bc3fcc4 Mon Sep 17 00:00:00 2001 From: Sagiv Ozeri Date: Mon, 20 May 2019 12:33:20 +0300 Subject: [PATCH 008/194] RDMA/qedr: Fix incorrect device rate. Use the correct enum value introduced in commit 12113a35ada6 ("IB/core: Add HDR speed enum") Prior to this change a 50Gbps port would show 40Gbps. This patch also cleaned up the redundant redefiniton of ib speeds for qedr. Fixes: 12113a35ada6 ("IB/core: Add HDR speed enum") Signed-off-by: Sagiv Ozeri Signed-off-by: Michal Kalderon Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qedr/verbs.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 3d7bde19838e..3c0dba072071 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -159,54 +159,47 @@ int qedr_query_device(struct ib_device *ibdev, return 0; } -#define QEDR_SPEED_SDR (1) -#define QEDR_SPEED_DDR (2) -#define QEDR_SPEED_QDR (4) -#define QEDR_SPEED_FDR10 (8) -#define QEDR_SPEED_FDR (16) -#define QEDR_SPEED_EDR (32) - static inline void get_link_speed_and_width(int speed, u8 *ib_speed, u8 *ib_width) { switch (speed) { case 1000: - *ib_speed = QEDR_SPEED_SDR; + *ib_speed = IB_SPEED_SDR; *ib_width = IB_WIDTH_1X; break; case 10000: - *ib_speed = QEDR_SPEED_QDR; + *ib_speed = IB_SPEED_QDR; *ib_width = IB_WIDTH_1X; break; case 20000: - *ib_speed = QEDR_SPEED_DDR; + *ib_speed = IB_SPEED_DDR; *ib_width = IB_WIDTH_4X; break; case 25000: - *ib_speed = QEDR_SPEED_EDR; + *ib_speed = IB_SPEED_EDR; *ib_width = IB_WIDTH_1X; break; case 40000: - *ib_speed = QEDR_SPEED_QDR; + *ib_speed = IB_SPEED_QDR; *ib_width = IB_WIDTH_4X; break; case 50000: - *ib_speed = QEDR_SPEED_QDR; - *ib_width = IB_WIDTH_4X; + *ib_speed = IB_SPEED_HDR; + *ib_width = IB_WIDTH_1X; break; case 100000: - *ib_speed = QEDR_SPEED_EDR; + *ib_speed = IB_SPEED_EDR; *ib_width = IB_WIDTH_4X; break; default: /* Unsupported */ - *ib_speed = QEDR_SPEED_SDR; + *ib_speed = IB_SPEED_SDR; *ib_width = IB_WIDTH_1X; } } From d2183c6f1958e6b6dfdde279f4cee04280710e34 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 20 May 2019 09:05:25 +0300 Subject: [PATCH 009/194] RDMA/umem: Move page_shift from ib_umem to ib_odp_umem This value has always been set to PAGE_SHIFT in the core code, the only thing that does differently was the ODP path. Move the value into the ODP struct and still use it for ODP, but change all the non-ODP things to just use PAGE_SHIFT/PAGE_SIZE/PAGE_MASK directly. 
Reviewed-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/umem.c | 3 +- drivers/infiniband/core/umem_odp.c | 79 +++++++++++------------- drivers/infiniband/hw/hns/hns_roce_cq.c | 3 +- drivers/infiniband/hw/hns/hns_roce_srq.c | 10 ++- drivers/infiniband/hw/mlx4/mr.c | 8 +-- drivers/infiniband/hw/mlx4/srq.c | 2 +- drivers/infiniband/hw/mlx5/mem.c | 20 +++--- drivers/infiniband/hw/mlx5/mr.c | 5 +- drivers/infiniband/hw/mlx5/odp.c | 23 +++---- drivers/infiniband/hw/nes/nes_verbs.c | 9 +-- include/rdma/ib_umem.h | 19 ++---- include/rdma/ib_umem_odp.h | 20 ++++++ 12 files changed, 99 insertions(+), 102 deletions(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index e7ea819fcb11..7edc5839606b 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -244,7 +244,6 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, umem->context = context; umem->length = size; umem->address = addr; - umem->page_shift = PAGE_SHIFT; umem->writable = ib_access_writable(access); umem->owning_mm = mm = current->mm; mmgrab(mm); @@ -385,7 +384,7 @@ int ib_umem_page_count(struct ib_umem *umem) n = 0; for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) - n += sg_dma_len(sg) >> umem->page_shift; + n += sg_dma_len(sg) >> PAGE_SHIFT; return n; } diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index f962b5bbfa40..c3b3c523401f 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -59,7 +59,7 @@ static u64 node_start(struct umem_odp_node *n) struct ib_umem_odp *umem_odp = container_of(n, struct ib_umem_odp, interval_tree); - return ib_umem_start(&umem_odp->umem); + return ib_umem_start(umem_odp); } /* Note that the representation of the intervals in the interval tree @@ -72,7 +72,7 @@ static u64 node_last(struct umem_odp_node *n) struct ib_umem_odp *umem_odp = container_of(n, struct ib_umem_odp, interval_tree); - return ib_umem_end(&umem_odp->umem) - 1; + return ib_umem_end(umem_odp) - 1; } INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, @@ -107,8 +107,6 @@ static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, u64 start, u64 end, void *cookie) { - struct ib_umem *umem = &umem_odp->umem; - /* * Increase the number of notifiers running, to * prevent any further fault handling on this MR. @@ -119,8 +117,8 @@ static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, * all pending page faults. 
*/ smp_wmb(); complete_all(&umem_odp->notifier_completion); - umem->context->invalidate_range(umem_odp, ib_umem_start(umem), - ib_umem_end(umem)); + umem_odp->umem.context->invalidate_range( + umem_odp, ib_umem_start(umem_odp), ib_umem_end(umem_odp)); return 0; } @@ -205,10 +203,9 @@ static const struct mmu_notifier_ops ib_umem_notifiers = { static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp) { struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; - struct ib_umem *umem = &umem_odp->umem; down_write(&per_mm->umem_rwsem); - if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp))) rbt_ib_umem_insert(&umem_odp->interval_tree, &per_mm->umem_tree); up_write(&per_mm->umem_rwsem); @@ -217,10 +214,9 @@ static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp) static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp) { struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; - struct ib_umem *umem = &umem_odp->umem; down_write(&per_mm->umem_rwsem); - if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp))) rbt_ib_umem_remove(&umem_odp->interval_tree, &per_mm->umem_tree); complete_all(&umem_odp->notifier_completion); @@ -351,7 +347,7 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root, umem->context = ctx; umem->length = size; umem->address = addr; - umem->page_shift = PAGE_SHIFT; + odp_data->page_shift = PAGE_SHIFT; umem->writable = root->umem.writable; umem->is_odp = 1; odp_data->per_mm = per_mm; @@ -405,18 +401,19 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) struct mm_struct *mm = umem->owning_mm; int ret_val; + umem_odp->page_shift = PAGE_SHIFT; if (access & IB_ACCESS_HUGETLB) { struct vm_area_struct *vma; struct hstate *h; down_read(&mm->mmap_sem); - vma = find_vma(mm, ib_umem_start(umem)); + vma = find_vma(mm, ib_umem_start(umem_odp)); if (!vma || !is_vm_hugetlb_page(vma)) { up_read(&mm->mmap_sem); return -EINVAL; } h = hstate_vma(vma); - umem->page_shift = huge_page_shift(h); + umem_odp->page_shift = huge_page_shift(h); up_read(&mm->mmap_sem); } @@ -424,16 +421,16 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) init_completion(&umem_odp->notifier_completion); - if (ib_umem_num_pages(umem)) { + if (ib_umem_odp_num_pages(umem_odp)) { umem_odp->page_list = vzalloc(array_size(sizeof(*umem_odp->page_list), - ib_umem_num_pages(umem))); + ib_umem_odp_num_pages(umem_odp))); if (!umem_odp->page_list) return -ENOMEM; umem_odp->dma_list = vzalloc(array_size(sizeof(*umem_odp->dma_list), - ib_umem_num_pages(umem))); + ib_umem_odp_num_pages(umem_odp))); if (!umem_odp->dma_list) { ret_val = -ENOMEM; goto out_page_list; @@ -456,16 +453,14 @@ out_page_list: void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { - struct ib_umem *umem = &umem_odp->umem; - /* * Ensure that no more pages are mapped in the umem. * * It is the driver's responsibility to ensure, before calling us, * that the hardware will not attempt to access the MR any more. 
*/ - ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem), - ib_umem_end(umem)); + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), + ib_umem_end(umem_odp)); remove_umem_from_per_mm(umem_odp); put_per_mm(umem_odp); @@ -498,8 +493,8 @@ static int ib_umem_odp_map_dma_single_page( u64 access_mask, unsigned long current_seq) { - struct ib_umem *umem = &umem_odp->umem; - struct ib_device *dev = umem->context->device; + struct ib_ucontext *context = umem_odp->umem.context; + struct ib_device *dev = context->device; dma_addr_t dma_addr; int remove_existing_mapping = 0; int ret = 0; @@ -514,10 +509,9 @@ static int ib_umem_odp_map_dma_single_page( goto out; } if (!(umem_odp->dma_list[page_index])) { - dma_addr = ib_dma_map_page(dev, - page, - 0, BIT(umem->page_shift), - DMA_BIDIRECTIONAL); + dma_addr = + ib_dma_map_page(dev, page, 0, BIT(umem_odp->page_shift), + DMA_BIDIRECTIONAL); if (ib_dma_mapping_error(dev, dma_addr)) { ret = -EFAULT; goto out; @@ -540,11 +534,12 @@ out: if (remove_existing_mapping) { ib_umem_notifier_start_account(umem_odp); - umem->context->invalidate_range( + context->invalidate_range( umem_odp, - ib_umem_start(umem) + (page_index << umem->page_shift), - ib_umem_start(umem) + - ((page_index + 1) << umem->page_shift)); + ib_umem_start(umem_odp) + + (page_index << umem_odp->page_shift), + ib_umem_start(umem_odp) + + ((page_index + 1) << umem_odp->page_shift)); ib_umem_notifier_end_account(umem_odp); ret = -EAGAIN; } @@ -581,27 +576,26 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, u64 bcnt, u64 access_mask, unsigned long current_seq) { - struct ib_umem *umem = &umem_odp->umem; struct task_struct *owning_process = NULL; struct mm_struct *owning_mm = umem_odp->umem.owning_mm; struct page **local_page_list = NULL; u64 page_mask, off; - int j, k, ret = 0, start_idx, npages = 0, page_shift; - unsigned int flags = 0; + int j, k, ret = 0, start_idx, npages = 0; + unsigned int flags = 0, page_shift; phys_addr_t p = 0; if (access_mask == 0) return -EINVAL; - if (user_virt < ib_umem_start(umem) || - user_virt + bcnt > ib_umem_end(umem)) + if (user_virt < ib_umem_start(umem_odp) || + user_virt + bcnt > ib_umem_end(umem_odp)) return -EFAULT; local_page_list = (struct page **)__get_free_page(GFP_KERNEL); if (!local_page_list) return -ENOMEM; - page_shift = umem->page_shift; + page_shift = umem_odp->page_shift; page_mask = ~(BIT(page_shift) - 1); off = user_virt & (~page_mask); user_virt = user_virt & page_mask; @@ -621,7 +615,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, if (access_mask & ODP_WRITE_ALLOWED_BIT) flags |= FOLL_WRITE; - start_idx = (user_virt - ib_umem_start(umem)) >> page_shift; + start_idx = (user_virt - ib_umem_start(umem_odp)) >> page_shift; k = start_idx; while (bcnt > 0) { @@ -711,21 +705,20 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_pages); void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, u64 bound) { - struct ib_umem *umem = &umem_odp->umem; int idx; u64 addr; - struct ib_device *dev = umem->context->device; + struct ib_device *dev = umem_odp->umem.context->device; - virt = max_t(u64, virt, ib_umem_start(umem)); - bound = min_t(u64, bound, ib_umem_end(umem)); + virt = max_t(u64, virt, ib_umem_start(umem_odp)); + bound = min_t(u64, bound, ib_umem_end(umem_odp)); /* Note that during the run of this function, the * notifiers_count of the MR is > 0, preventing any racing * faults from completion. 
We might be racing with other * invalidations, so we must make sure we free each page only * once. */ mutex_lock(&umem_odp->umem_mutex); - for (addr = virt; addr < bound; addr += BIT(umem->page_shift)) { - idx = (addr - ib_umem_start(umem)) >> umem->page_shift; + for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) { + idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; if (umem_odp->page_list[idx]) { struct page *page = umem_odp->page_list[idx]; dma_addr_t dma = umem_odp->dma_list[idx]; diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 9caf35061721..6e81ff3f1813 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -235,8 +235,7 @@ static int hns_roce_ib_get_cq_umem(struct hns_roce_dev *hr_dev, &buf->hr_mtt); } else { ret = hns_roce_mtt_init(hr_dev, ib_umem_page_count(*umem), - (*umem)->page_shift, - &buf->hr_mtt); + PAGE_SHIFT, &buf->hr_mtt); } if (ret) goto err_buf; diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index b3421b1f21e0..ad15b41da30a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -264,8 +264,7 @@ int hns_roce_create_srq(struct ib_srq *ib_srq, } else ret = hns_roce_mtt_init(hr_dev, ib_umem_page_count(srq->umem), - srq->umem->page_shift, - &srq->mtt); + PAGE_SHIFT, &srq->mtt); if (ret) goto err_buf; @@ -291,10 +290,9 @@ int hns_roce_create_srq(struct ib_srq *ib_srq, ret = hns_roce_mtt_init(hr_dev, npages, page_shift, &srq->idx_que.mtt); } else { - ret = hns_roce_mtt_init(hr_dev, - ib_umem_page_count(srq->idx_que.umem), - srq->idx_que.umem->page_shift, - &srq->idx_que.mtt); + ret = hns_roce_mtt_init( + hr_dev, ib_umem_page_count(srq->idx_que.umem), + PAGE_SHIFT, &srq->idx_que.mtt); } if (ret) { diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 355205a28544..b0b94dedb848 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -258,7 +258,7 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va, int *num_of_mtts) { u64 block_shift = MLX4_MAX_MTT_SHIFT; - u64 min_shift = umem->page_shift; + u64 min_shift = PAGE_SHIFT; u64 last_block_aligned_end = 0; u64 current_block_start = 0; u64 first_block_start = 0; @@ -295,8 +295,8 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va, * in access to the wrong data. 
*/ misalignment_bits = - (start_va & (~(((u64)(BIT(umem->page_shift))) - 1ULL))) - ^ current_block_start; + (start_va & (~(((u64)(PAGE_SIZE)) - 1ULL))) ^ + current_block_start; block_shift = min(alignment_of(misalignment_bits), block_shift); } @@ -514,7 +514,7 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, goto release_mpt_entry; } n = ib_umem_page_count(mmr->umem); - shift = mmr->umem->page_shift; + shift = PAGE_SHIFT; err = mlx4_mr_rereg_mem_write(dev->dev, &mmr->mmr, virt_addr, length, n, shift, diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 4bf2946b9759..c9f555e04c9f 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -115,7 +115,7 @@ int mlx4_ib_create_srq(struct ib_srq *ib_srq, return PTR_ERR(srq->umem); err = mlx4_mtt_init(dev->dev, ib_umem_page_count(srq->umem), - srq->umem->page_shift, &srq->mtt); + PAGE_SHIFT, &srq->mtt); if (err) goto err_buf; diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index 9f90be296ee0..fe1a76d8531c 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -55,9 +55,10 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int i = 0; struct scatterlist *sg; int entry; - unsigned long page_shift = umem->page_shift; if (umem->is_odp) { + unsigned int page_shift = to_ib_umem_odp(umem)->page_shift; + *ncont = ib_umem_page_count(umem); *count = *ncont << (page_shift - PAGE_SHIFT); *shift = page_shift; @@ -67,15 +68,15 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, return; } - addr = addr >> page_shift; + addr = addr >> PAGE_SHIFT; tmp = (unsigned long)addr; m = find_first_bit(&tmp, BITS_PER_LONG); if (max_page_shift) - m = min_t(unsigned long, max_page_shift - page_shift, m); + m = min_t(unsigned long, max_page_shift - PAGE_SHIFT, m); for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { - len = sg_dma_len(sg) >> page_shift; - pfn = sg_dma_address(sg) >> page_shift; + len = sg_dma_len(sg) >> PAGE_SHIFT; + pfn = sg_dma_address(sg) >> PAGE_SHIFT; if (base + p != pfn) { /* If either the offset or the new * base are unaligned update m @@ -107,7 +108,7 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, *ncont = 0; } - *shift = page_shift + m; + *shift = PAGE_SHIFT + m; *count = i; } @@ -140,8 +141,7 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, int page_shift, size_t offset, size_t num_pages, __be64 *pas, int access_flags) { - unsigned long umem_page_shift = umem->page_shift; - int shift = page_shift - umem_page_shift; + int shift = page_shift - PAGE_SHIFT; int mask = (1 << shift) - 1; int i, k, idx; u64 cur = 0; @@ -165,7 +165,7 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, i = 0; for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { - len = sg_dma_len(sg) >> umem_page_shift; + len = sg_dma_len(sg) >> PAGE_SHIFT; base = sg_dma_address(sg); /* Skip elements below offset */ @@ -184,7 +184,7 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, for (; k < len; k++) { if (!(i & mask)) { - cur = base + (k << umem_page_shift); + cur = base + (k << PAGE_SHIFT); cur |= access_flags; idx = (i >> shift) - offset; diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 5f09699fab98..4d033796dcfc 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1606,8 +1606,9 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) 
synchronize_srcu(&dev->mr_srcu); /* Destroy all page mappings */ if (umem_odp->page_list) - mlx5_ib_invalidate_range(umem_odp, ib_umem_start(umem), - ib_umem_end(umem)); + mlx5_ib_invalidate_range(umem_odp, + ib_umem_start(umem_odp), + ib_umem_end(umem_odp)); else mlx5_ib_free_implicit_mr(mr); /* diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 91507a2e9290..d0c6f9cc97ef 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -150,7 +150,7 @@ static struct ib_umem_odp *odp_lookup(u64 start, u64 length, if (!rb) goto not_found; odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); - if (ib_umem_start(&odp->umem) > start + length) + if (ib_umem_start(odp) > start + length) goto not_found; } not_found: @@ -200,7 +200,7 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, static void mr_leaf_free_action(struct work_struct *work) { struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work); - int idx = ib_umem_start(&odp->umem) >> MLX5_IMR_MTT_SHIFT; + int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent; mr->parent = NULL; @@ -224,7 +224,6 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(struct mlx5_mtt)) - 1; u64 idx = 0, blk_start_idx = 0; - struct ib_umem *umem; int in_block = 0; u64 addr; @@ -232,15 +231,14 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, pr_err("invalidation called on NULL umem or non-ODP umem\n"); return; } - umem = &umem_odp->umem; mr = umem_odp->private; if (!mr || !mr->ibmr.pd) return; - start = max_t(u64, ib_umem_start(umem), start); - end = min_t(u64, ib_umem_end(umem), end); + start = max_t(u64, ib_umem_start(umem_odp), start); + end = min_t(u64, ib_umem_end(umem_odp), end); /* * Iteration one - zap the HW's MTTs. The notifiers_count ensures that @@ -249,8 +247,8 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, * but they will write 0s as well, so no difference in the end result. */ - for (addr = start; addr < end; addr += BIT(umem->page_shift)) { - idx = (addr - ib_umem_start(umem)) >> umem->page_shift; + for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) { + idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; /* * Strive to write the MTTs in chunks, but avoid overwriting * non-existing MTTs. 
The huristic here can be improved to @@ -544,13 +542,12 @@ static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end, void *cookie) { struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie; - struct ib_umem *umem = &umem_odp->umem; if (mr->parent != imr) return 0; - ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem), - ib_umem_end(umem)); + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), + ib_umem_end(umem_odp)); if (umem_odp->dying) return 0; @@ -602,9 +599,9 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, } next_mr: - size = min_t(size_t, bcnt, ib_umem_end(&odp->umem) - io_virt); + size = min_t(size_t, bcnt, ib_umem_end(odp) - io_virt); - page_shift = mr->umem->page_shift; + page_shift = odp->page_shift; page_mask = ~(BIT(page_shift) - 1); start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift; access_mask = ODP_READ_ALLOWED_BIT; diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 49024326a518..ad2b8322cc3f 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -2112,10 +2112,11 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return (struct ib_mr *)region; } - nes_debug(NES_DBG_MR, "User base = 0x%lX, Virt base = 0x%lX, length = %u," - " offset = %u, page size = %lu.\n", - (unsigned long int)start, (unsigned long int)virt, (u32)length, - ib_umem_offset(region), BIT(region->page_shift)); + nes_debug( + NES_DBG_MR, + "User base = 0x%lX, Virt base = 0x%lX, length = %u, offset = %u, page size = %lu.\n", + (unsigned long)start, (unsigned long)virt, (u32)length, + ib_umem_offset(region), PAGE_SIZE); skip_pages = ((u32)ib_umem_offset(region)) >> 12; diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 040d853077c6..1052d0d62be7 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -46,7 +46,6 @@ struct ib_umem { struct mm_struct *owning_mm; size_t length; unsigned long address; - int page_shift; u32 writable : 1; u32 is_odp : 1; struct work_struct work; @@ -58,24 +57,14 @@ struct ib_umem { /* Returns the offset of the umem start relative to the first page. */ static inline int ib_umem_offset(struct ib_umem *umem) { - return umem->address & (BIT(umem->page_shift) - 1); -} - -/* Returns the first page of an ODP umem. */ -static inline unsigned long ib_umem_start(struct ib_umem *umem) -{ - return umem->address - ib_umem_offset(umem); -} - -/* Returns the address of the page after the last one of an ODP umem. */ -static inline unsigned long ib_umem_end(struct ib_umem *umem) -{ - return ALIGN(umem->address + umem->length, BIT(umem->page_shift)); + return umem->address & ~PAGE_MASK; } static inline size_t ib_umem_num_pages(struct ib_umem *umem) { - return (ib_umem_end(umem) - ib_umem_start(umem)) >> umem->page_shift; + return (ALIGN(umem->address + umem->length, PAGE_SIZE) - + ALIGN_DOWN(umem->address, PAGE_SIZE)) >> + PAGE_SHIFT; } #ifdef CONFIG_INFINIBAND_USER_MEM diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index eeec4e53c448..479db5c98ff6 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -76,6 +76,7 @@ struct ib_umem_odp { struct completion notifier_completion; int dying; + unsigned int page_shift; struct work_struct work; }; @@ -84,6 +85,25 @@ static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem) return container_of(umem, struct ib_umem_odp, umem); } +/* Returns the first page of an ODP umem. 
*/ +static inline unsigned long ib_umem_start(struct ib_umem_odp *umem_odp) +{ + return ALIGN_DOWN(umem_odp->umem.address, 1UL << umem_odp->page_shift); +} + +/* Returns the address of the page after the last one of an ODP umem. */ +static inline unsigned long ib_umem_end(struct ib_umem_odp *umem_odp) +{ + return ALIGN(umem_odp->umem.address + umem_odp->umem.length, + 1UL << umem_odp->page_shift); +} + +static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp) +{ + return (ib_umem_end(umem_odp) - ib_umem_start(umem_odp)) >> + umem_odp->page_shift; +} + /* * The lower 2 bits of the DMA address signal the R/W permissions for * the entry. To upgrade the permissions, provide the appropriate From 8ce0048f76542a1f4d68b647c5d4a4d35e7547cf Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Sun, 19 May 2019 18:31:27 +0300 Subject: [PATCH 010/194] IB/mlx4: Delete unused func arg The function argument virt_addr is not in use - delete it. Signed-off-by: Yuval Shaia Reviewed-by: Majd Dibbiny Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/mr.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index b0b94dedb848..753479285ce9 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -368,8 +368,7 @@ end: } static struct ib_umem *mlx4_get_umem_mr(struct ib_udata *udata, u64 start, - u64 length, u64 virt_addr, - int access_flags) + u64 length, int access_flags) { /* * Force registering the memory as writable if the underlying pages @@ -415,8 +414,7 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (!mr) return ERR_PTR(-ENOMEM); - mr->umem = - mlx4_get_umem_mr(udata, start, length, virt_addr, access_flags); + mr->umem = mlx4_get_umem_mr(udata, start, length, access_flags); if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); goto err_free; @@ -505,7 +503,7 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); ib_umem_release(mmr->umem); - mmr->umem = mlx4_get_umem_mr(udata, start, length, virt_addr, + mmr->umem = mlx4_get_umem_mr(udata, start, length, mr_access_flags); if (IS_ERR(mmr->umem)) { err = PTR_ERR(mmr->umem); From deee3c7e499108b0575ee0d71b786da627a7cdee Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Tue, 21 May 2019 10:05:07 +0300 Subject: [PATCH 011/194] RDMA/core: Return void from ib_device_check_mandatory() The return value from ib_device_check_mandatory() is always 0 - change it to be void. 
Signed-off-by: Kamal Heib Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 78dc07c6ac4b..afb3f5946796 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -270,7 +270,7 @@ struct ib_port_data_rcu { struct ib_port_data pdata[]; }; -static int ib_device_check_mandatory(struct ib_device *device) +static void ib_device_check_mandatory(struct ib_device *device) { #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } static const struct { @@ -305,8 +305,6 @@ static int ib_device_check_mandatory(struct ib_device *device) break; } } - - return 0; } /* @@ -1175,10 +1173,7 @@ static int setup_device(struct ib_device *device) int ret; setup_dma_device(device); - - ret = ib_device_check_mandatory(device); - if (ret) - return ret; + ib_device_check_mandatory(device); ret = setup_port_data(device); if (ret) { From eaa1ca9cf99224a57852f339bfbf058fbeadcb79 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 20 May 2019 09:54:19 +0300 Subject: [PATCH 012/194] rds: Don't check return value from destroy CQ There is no value in checking ib_destroy_cq() result and skipping to clear struct ic fields. This connection needs to be reinitialized anyway. Signed-off-by: Leon Romanovsky Acked-by: Santosh Shilimkar Signed-off-by: Jason Gunthorpe --- net/rds/ib_cm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 66c6eb56072b..5a42ebb892cd 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -611,11 +611,11 @@ send_hdrs_dma_out: qp_out: rdma_destroy_qp(ic->i_cm_id); recv_cq_out: - if (!ib_destroy_cq(ic->i_recv_cq)) - ic->i_recv_cq = NULL; + ib_destroy_cq(ic->i_recv_cq); + ic->i_recv_cq = NULL; send_cq_out: - if (!ib_destroy_cq(ic->i_send_cq)) - ic->i_send_cq = NULL; + ib_destroy_cq(ic->i_send_cq); + ic->i_send_cq = NULL; rds_ibdev_out: rds_ib_remove_conn(rds_ibdev, conn); out: From dfdb08990432b19579703ed5d1efb216b9d2142e Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 20 May 2019 09:54:20 +0300 Subject: [PATCH 013/194] RDMA/ipoib: Remove check of destroy CQ There are nothing to do from user side with knowledge that destroy CQ fails. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index ba09068f6200..b69304d28f06 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -260,11 +260,8 @@ void ipoib_transport_dev_cleanup(struct net_device *dev) priv->qp = NULL; } - if (ib_destroy_cq(priv->send_cq)) - ipoib_warn(priv, "ib_cq_destroy (send) failed\n"); - - if (ib_destroy_cq(priv->recv_cq)) - ipoib_warn(priv, "ib_cq_destroy (recv) failed\n"); + ib_destroy_cq(priv->send_cq); + ib_destroy_cq(priv->recv_cq); } void ipoib_event(struct ib_event_handler *handler, From 890ac8d97e6722a9e4a66a0bd836d1b028d075fe Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 20 May 2019 09:54:21 +0300 Subject: [PATCH 014/194] RDMA/core: Make ib_destroy_cq() void Kernel destroy CQ flows can't fail and the returned value of ib_destroy_cq() is not interested in those flows. 
Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 0742095355f2..ec6446864b08 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -3858,9 +3858,9 @@ int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata); * * NOTE: for user cq use ib_destroy_cq_user with valid udata! */ -static inline int ib_destroy_cq(struct ib_cq *cq) +static inline void ib_destroy_cq(struct ib_cq *cq) { - return ib_destroy_cq_user(cq, NULL); + ib_destroy_cq_user(cq, NULL); } /** From 269c97fd485439702048676326286588c33fd3ba Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 20 May 2019 09:54:23 +0300 Subject: [PATCH 015/194] RDMA/nes: Remove useless NULL checks The destroy functions are always called with relevant structs, there is no need to check their existence. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/nes/nes_verbs.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index ad2b8322cc3f..fb2d0762c7c8 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -1646,9 +1646,6 @@ static int nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) u32 opcode = 0; int ret; - if (ib_cq == NULL) - return 0; - nescq = to_nescq(ib_cq); nesvnic = to_nesvnic(ib_cq->device); nesdev = nesvnic->nesdev; @@ -3708,9 +3705,6 @@ void nes_port_ibevent(struct nes_vnic *nesvnic) */ void nes_destroy_ofa_device(struct nes_ib_device *nesibdev) { - if (nesibdev == NULL) - return; - nes_unregister_ofa_device(nesibdev); ib_dealloc_device(&nesibdev->ibdev); From 3bb58cfe07e273903dde62592ccfb21f9b0761e2 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 20 May 2019 09:54:24 +0300 Subject: [PATCH 016/194] RDMA/i40iw: Remove useless NULL checks There is no need to check existence of structures to be destroyed. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 5689d742bafb..a10a30d44b32 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -1070,11 +1070,6 @@ static int i40iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) struct i40iw_device *iwdev; struct i40iw_sc_cq *cq; - if (!ib_cq) { - i40iw_pr_err("ib_cq == NULL\n"); - return 0; - } - iwcq = to_iwcq(ib_cq); iwdev = to_iwdev(ib_cq->device); cq = &iwcq->sc_cq; @@ -2771,9 +2766,6 @@ void i40iw_port_ibevent(struct i40iw_device *iwdev) */ void i40iw_destroy_rdma_device(struct i40iw_ib_device *iwibdev) { - if (!iwibdev) - return; - ib_unregister_device(&iwibdev->ibdev); wait_event_timeout(iwibdev->iwdev->close_wq, !atomic64_read(&iwibdev->iwdev->use_count), From dab99af99c7ca4c9dbea7d4d218ef7eb6b183aaf Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 20 May 2019 09:54:25 +0300 Subject: [PATCH 017/194] RDMA/nes: Remove second wait queue initialization call The same wait queue is initialized a couple of lines above. 
Fixes: 3c2d774cad5b ("RDMA/nes: Add a driver for NetEffect RNICs") Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/nes/nes_utils.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c index 21b4a8373acf..90f28890246d 100644 --- a/drivers/infiniband/hw/nes/nes_utils.c +++ b/drivers/infiniband/hw/nes/nes_utils.c @@ -586,7 +586,6 @@ struct nes_cqp_request *nes_get_cqp_request(struct nes_device *nesdev) cqp_request->waiting = 0; cqp_request->request_done = 0; cqp_request->callback = 0; - init_waitqueue_head(&cqp_request->waitq); nes_debug(NES_DBG_CQP, "Got cqp request %p from the available list \n", cqp_request); } else From 62a38e704d5720e2b73d0e1d6dfc54441ee75985 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 20 May 2019 09:54:22 +0300 Subject: [PATCH 018/194] RDMA/efa: Remove check that prevents destroy of resources in error flows Drivers cannot check the udata for validity when doing destroy as there will be no way to report this error back to the uverbs. Since udata is new for destroy no driver should start to use it - instead drivers should opt for the ioctl interface and define it in a way where it cannot fail due to incorrect data. Remove the checks on udata construction so EFA is consistent with everything else. Signed-off-by: Leon Romanovsky Acked-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_verbs.c | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 6d6886c9009f..4999a74cee24 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -436,12 +436,6 @@ void efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) struct efa_dev *dev = to_edev(ibpd->device); struct efa_pd *pd = to_epd(ibpd); - if (udata->inlen && - !ib_is_udata_cleared(udata, 0, udata->inlen)) { - ibdev_dbg(&dev->ibdev, "Incompatible ABI params\n"); - return; - } - ibdev_dbg(&dev->ibdev, "Dealloc pd[%d]\n", pd->pdn); efa_pd_dealloc(dev, pd->pdn); } @@ -459,12 +453,6 @@ int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) struct efa_qp *qp = to_eqp(ibqp); int err; - if (udata->inlen && - !ib_is_udata_cleared(udata, 0, udata->inlen)) { - ibdev_dbg(&dev->ibdev, "Incompatible ABI params\n"); - return -EINVAL; - } - ibdev_dbg(&dev->ibdev, "Destroy qp[%u]\n", ibqp->qp_num); err = efa_destroy_qp_handle(dev, qp->qp_handle); if (err) @@ -865,12 +853,6 @@ int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) struct efa_cq *cq = to_ecq(ibcq); int err; - if (udata->inlen && - !ib_is_udata_cleared(udata, 0, udata->inlen)) { - ibdev_dbg(&dev->ibdev, "Incompatible ABI params\n"); - return -EINVAL; - } - ibdev_dbg(&dev->ibdev, "Destroy cq[%d] virt[0x%p] freed: size[%lu], dma[%pad]\n", cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr); @@ -1556,12 +1538,6 @@ int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) struct efa_mr *mr = to_emr(ibmr); int err; - if (udata->inlen && - !ib_is_udata_cleared(udata, 0, udata->inlen)) { - ibdev_dbg(&dev->ibdev, "Incompatible ABI params\n"); - return -EINVAL; - } - ibdev_dbg(&dev->ibdev, "Deregister mr[%d]\n", ibmr->lkey); if (mr->umem) { From d34d37d5a102a016c63f904d1ff0284637759cc2 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 20 May 2019 09:54:28 +0300 Subject: [PATCH 019/194] RDMA/cxgb3: Use sizeof() notation instead of plain sizeof sizeof(a), sizeof a 
and sizeof (a) are all valid notations, but first is more readable format recommended by checkpatch.pl. Let's canonize it in cxgb3 drivers, so latter patches won't emit checkpatch warnings. As part of this change, a redundant memset() was removed. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb3/cxio_hal.c | 4 ++-- drivers/infiniband/hw/cxgb3/iwch_cm.c | 2 +- drivers/infiniband/hw/cxgb3/iwch_provider.c | 24 ++++++++++----------- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c index 8ac72ac7cbac..e9a5f45dfa14 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.c +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -219,7 +219,7 @@ static u32 get_qpid(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) if (!qpid) goto out; for (i = qpid+1; i & rdev_p->qpmask; i++) { - entry = kmalloc(sizeof *entry, GFP_KERNEL); + entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) break; entry->qpid = i; @@ -237,7 +237,7 @@ static void put_qpid(struct cxio_rdev *rdev_p, u32 qpid, { struct cxio_qpid_list *entry; - entry = kmalloc(sizeof *entry, GFP_KERNEL); + entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return; pr_debug("%s qpid 0x%x\n", __func__, qpid); diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c index 1c90c86fc8b8..0bca72cb4d9a 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -170,7 +170,7 @@ static void release_tid(struct t3cdev *tdev, u32 hwtid, struct sk_buff *skb) { struct cpl_tid_release *req; - skb = get_skb(skb, sizeof *req, GFP_KERNEL); + skb = get_skb(skb, sizeof(*req), GFP_KERNEL); if (!skb) return; req = skb_put(skb, sizeof(*req)); diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 3a481dfb1607..82669c254b3f 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -127,7 +127,7 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, if (udata) { if (!t3a_device(rhp)) { - if (ib_copy_from_udata(&ureq, udata, sizeof (ureq))) { + if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) { kfree(chp); return ERR_PTR(-EFAULT); } @@ -172,7 +172,7 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, struct iwch_ucontext *ucontext = rdma_udata_to_drv_context( udata, struct iwch_ucontext, ibucontext); - mm = kmalloc(sizeof *mm, GFP_KERNEL); + mm = kmalloc(sizeof(*mm), GFP_KERNEL); if (!mm) { iwch_destroy_cq(&chp->ibcq, udata); return ERR_PTR(-ENOMEM); @@ -185,7 +185,7 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, spin_unlock(&ucontext->mmap_lock); mm->key = uresp.key; mm->addr = virt_to_phys(chp->cq.queue); - if (udata->outlen < sizeof uresp) { + if (udata->outlen < sizeof(uresp)) { if (!warned++) pr_warn("Warning - downlevel libcxgb3 (non-fatal)\n"); mm->len = PAGE_ALIGN((1UL << uresp.size_log2) * @@ -196,7 +196,7 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, sizeof(struct t3_cqe)); uresp.memsize = mm->len; uresp.reserved = 0; - resplen = sizeof uresp; + resplen = sizeof(uresp); } if (ib_copy_to_udata(udata, &uresp, resplen)) { kfree(mm); @@ -553,7 +553,7 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, for_each_sg_dma_page(mhp->umem->sg_head.sgl, &sg_iter, mhp->umem->nmap, 0) { pages[i++] = cpu_to_be64(sg_page_iter_dma_address(&sg_iter)); - if (i == PAGE_SIZE / sizeof 
*pages) { + if (i == PAGE_SIZE / sizeof(*pages)) { err = iwch_write_pbl(mhp, pages, i, n); if (err) goto pbl_done; @@ -587,7 +587,7 @@ pbl_done: pr_debug("%s user resp pbl_addr 0x%x\n", __func__, uresp.pbl_addr); - if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { + if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) { iwch_dereg_mr(&mhp->ibmr, udata); err = -EFAULT; goto err; @@ -880,13 +880,13 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd, struct iwch_mm_entry *mm1, *mm2; - mm1 = kmalloc(sizeof *mm1, GFP_KERNEL); + mm1 = kmalloc(sizeof(*mm1), GFP_KERNEL); if (!mm1) { iwch_destroy_qp(&qhp->ibqp, udata); return ERR_PTR(-ENOMEM); } - mm2 = kmalloc(sizeof *mm2, GFP_KERNEL); + mm2 = kmalloc(sizeof(*mm2), GFP_KERNEL); if (!mm2) { kfree(mm1); iwch_destroy_qp(&qhp->ibqp, udata); @@ -903,7 +903,7 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd, uresp.db_key = ucontext->key; ucontext->key += PAGE_SIZE; spin_unlock(&ucontext->mmap_lock); - if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { + if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) { kfree(mm1); kfree(mm2); iwch_destroy_qp(&qhp->ibqp, udata); @@ -911,7 +911,7 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd, } mm1->key = uresp.key; mm1->addr = virt_to_phys(qhp->wq.queue); - mm1->len = PAGE_ALIGN(wqsize * sizeof (union t3_wr)); + mm1->len = PAGE_ALIGN(wqsize * sizeof(union t3_wr)); insert_mmap(ucontext, mm1); mm2->key = uresp.db_key; mm2->addr = qhp->wq.udb & PAGE_MASK; @@ -932,7 +932,7 @@ static int iwch_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, struct iwch_dev *rhp; struct iwch_qp *qhp; enum iwch_qp_attr_mask mask = 0; - struct iwch_qp_attributes attrs; + struct iwch_qp_attributes attrs = {}; pr_debug("%s ib_qp %p\n", __func__, ibqp); @@ -944,7 +944,6 @@ static int iwch_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (!attr_mask) return 0; - memset(&attrs, 0, sizeof attrs); qhp = to_iwch_qp(ibqp); rhp = qhp->rhp; @@ -1040,7 +1039,6 @@ static int iwch_query_device(struct ib_device *ibdev, struct ib_device_attr *pro return -EINVAL; dev = to_iwch_dev(ibdev); - memset(props, 0, sizeof *props); memcpy(&props->sys_image_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6); props->hw_ver = dev->rdev.t3cdev_p->type; props->fw_ver = fw_vers_string_to_u64(dev); From 0ddf8f6267ecefc07dace11be8a9d258061c36e1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 20 May 2019 09:54:29 +0300 Subject: [PATCH 020/194] RDMA/cxgb3: Don't expose DMA addresses DMA addresses like all other kernel addresses should be printed with special %p* formatter. It is needed to allow control of exposure of such information through a dedicated knob. 
Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb3/cxio_hal.c | 6 +++--- drivers/infiniband/hw/cxgb3/iwch_provider.c | 13 +++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c index e9a5f45dfa14..73bc6b8f2a0c 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.c +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -565,9 +565,9 @@ static int cxio_hal_init_ctrl_qp(struct cxio_rdev *rdev_p) wqe->sge_cmd = cpu_to_be64(sge_cmd); wqe->ctx1 = cpu_to_be64(ctx1); wqe->ctx0 = cpu_to_be64(ctx0); - pr_debug("CtrlQP dma_addr 0x%llx workq %p size %d\n", - (unsigned long long)rdev_p->ctrl_qp.dma_addr, - rdev_p->ctrl_qp.workq, 1 << T3_CTRL_QP_SIZE_LOG2); + pr_debug("CtrlQP dma_addr %pad workq %p size %d\n", + &rdev_p->ctrl_qp.dma_addr, rdev_p->ctrl_qp.workq, + 1 << T3_CTRL_QP_SIZE_LOG2); skb->priority = CPL_PRIORITY_CONTROL; return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb); err: diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 82669c254b3f..7e6adf01d1d6 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -205,9 +205,9 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, } insert_mmap(ucontext, mm); } - pr_debug("created cqid 0x%0x chp %p size 0x%0x, dma_addr 0x%0llx\n", + pr_debug("created cqid 0x%0x chp %p size 0x%0x, dma_addr %pad\n", chp->cq.cqid, chp, (1 << chp->cq.size_log2), - (unsigned long long)chp->cq.dma_addr); + &chp->cq.dma_addr); return &chp->ibcq; } @@ -919,10 +919,11 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd, insert_mmap(ucontext, mm2); } qhp->ibqp.qp_num = qhp->wq.qpid; - pr_debug("%s sq_num_entries %d, rq_num_entries %d qpid 0x%0x qhp %p dma_addr 0x%llx size %d rq_addr 0x%x\n", - __func__, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries, - qhp->wq.qpid, qhp, (unsigned long long)qhp->wq.dma_addr, - 1 << qhp->wq.size_log2, qhp->wq.rq_addr); + pr_debug( + "%s sq_num_entries %d, rq_num_entries %d qpid 0x%0x qhp %p dma_addr %pad size %d rq_addr 0x%x\n", + __func__, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries, + qhp->wq.qpid, qhp, &qhp->wq.dma_addr, 1 << qhp->wq.size_log2, + qhp->wq.rq_addr); return &qhp->ibqp; } From a80287c8130552a837eb61e1ba14844bf2d03156 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 20 May 2019 09:54:30 +0300 Subject: [PATCH 021/194] RDMA/cxgb3: Delete and properly mark unimplemented resize CQ function Resize CQ implementation was guarded by undeclared "notyet" define while cxgb3 was added to the kernel. Twelve years later, this call is still unimplemented, so safely delete it and fix improper return error code when .resize_cq() is not implemented. 
Fixes: b038ced7b370 ("RDMA/cxgb3: Add driver for Chelsio T3 RNIC") Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb3/cxio_hal.c | 14 ----- drivers/infiniband/hw/cxgb3/cxio_hal.h | 1 - drivers/infiniband/hw/cxgb3/iwch_provider.c | 68 --------------------- 3 files changed, 83 deletions(-) diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c index 73bc6b8f2a0c..d9c741fea0e9 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.c +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -187,20 +187,6 @@ int cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq, int kernel) return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); } -#ifdef notyet -int cxio_resize_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) -{ - struct rdma_cq_setup setup; - setup.id = cq->cqid; - setup.base_addr = (u64) (cq->dma_addr); - setup.size = 1UL << cq->size_log2; - setup.credits = setup.size; - setup.credit_thres = setup.size; /* TBD: overflow recovery */ - setup.ovfl_mode = 1; - return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); -} -#endif - static u32 get_qpid(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) { struct cxio_qpid_list *entry; diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h index c64e50b5a548..5fc26a4648d3 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.h +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h @@ -159,7 +159,6 @@ int cxio_hal_cq_op(struct cxio_rdev *rdev, struct t3_cq *cq, enum t3_cq_opcode op, u32 credit); int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq, int kernel); int cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq); -int cxio_resize_cq(struct cxio_rdev *rdev, struct t3_cq *cq); void cxio_release_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx); void cxio_init_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx); int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq, diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 7e6adf01d1d6..4bfab739ec0d 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -211,73 +211,6 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, return &chp->ibcq; } -static int iwch_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata) -{ -#ifdef notyet - struct iwch_cq *chp = to_iwch_cq(cq); - struct t3_cq oldcq, newcq; - int ret; - - pr_debug("%s ib_cq %p cqe %d\n", __func__, cq, cqe); - - /* We don't downsize... 
*/ - if (cqe <= cq->cqe) - return 0; - - /* create new t3_cq with new size */ - cqe = roundup_pow_of_two(cqe+1); - newcq.size_log2 = ilog2(cqe); - - /* Dont allow resize to less than the current wce count */ - if (cqe < Q_COUNT(chp->cq.rptr, chp->cq.wptr)) { - return -ENOMEM; - } - - /* Quiesce all QPs using this CQ */ - ret = iwch_quiesce_qps(chp); - if (ret) { - return ret; - } - - ret = cxio_create_cq(&chp->rhp->rdev, &newcq); - if (ret) { - return ret; - } - - /* copy CQEs */ - memcpy(newcq.queue, chp->cq.queue, (1 << chp->cq.size_log2) * - sizeof(struct t3_cqe)); - - /* old iwch_qp gets new t3_cq but keeps old cqid */ - oldcq = chp->cq; - chp->cq = newcq; - chp->cq.cqid = oldcq.cqid; - - /* resize new t3_cq to update the HW context */ - ret = cxio_resize_cq(&chp->rhp->rdev, &chp->cq); - if (ret) { - chp->cq = oldcq; - return ret; - } - chp->ibcq.cqe = (1<cq.size_log2) - 1; - - /* destroy old t3_cq */ - oldcq.cqid = newcq.cqid; - ret = cxio_destroy_cq(&chp->rhp->rdev, &oldcq); - if (ret) { - pr_err("%s - cxio_destroy_cq failed %d\n", __func__, ret); - } - - /* add user hooks here */ - - /* resume qps */ - ret = iwch_resume_qps(chp); - return ret; -#else - return -ENOSYS; -#endif -} - static int iwch_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) { struct iwch_dev *rhp; @@ -1340,7 +1273,6 @@ static const struct ib_device_ops iwch_dev_ops = { .query_port = iwch_query_port, .reg_user_mr = iwch_reg_user_mr, .req_notify_cq = iwch_arm_cq, - .resize_cq = iwch_resize_cq, INIT_RDMA_OBJ_SIZE(ib_pd, iwch_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_ucontext, iwch_ucontext, ibucontext), }; From 34d568930b87e0dd32c5dbe83a835959f2fc6107 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 20 May 2019 09:54:31 +0300 Subject: [PATCH 022/194] RDMA/cxgb4: Use sizeof() notation Convert various sizeof call sites to be written in standard format sizeof(). 
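For illustration, the convention being applied, taken from the kind of call sites changed below:

	/* before: operator form, easy to misparse */
	wr_len = roundup(mpalen + sizeof *req, 16);

	/* after: function-like form preferred by the kernel coding style */
	wr_len = roundup(mpalen + sizeof(*req), 16);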
Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/cm.c | 21 +++++++----- drivers/infiniband/hw/cxgb4/cq.c | 8 ++--- drivers/infiniband/hw/cxgb4/device.c | 9 +++-- drivers/infiniband/hw/cxgb4/mem.c | 5 +-- drivers/infiniband/hw/cxgb4/provider.c | 1 - drivers/infiniband/hw/cxgb4/qp.c | 46 ++++++++++++-------------- drivers/infiniband/hw/cxgb4/resource.c | 16 ++++----- 7 files changed, 53 insertions(+), 53 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 0f3b1193d5f8..0147c407ac6c 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -953,7 +953,7 @@ static int send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb, mpalen = sizeof(*mpa) + ep->plen; if (mpa_rev_to_use == 2) mpalen += sizeof(struct mpa_v2_conn_params); - wrlen = roundup(mpalen + sizeof *req, 16); + wrlen = roundup(mpalen + sizeof(*req), 16); skb = get_skb(skb, wrlen, GFP_KERNEL); if (!skb) { connect_reply_upcall(ep, -ENOMEM); @@ -997,8 +997,9 @@ static int send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb, } if (mpa_rev_to_use == 2) { - mpa->private_data_size = htons(ntohs(mpa->private_data_size) + - sizeof (struct mpa_v2_conn_params)); + mpa->private_data_size = + htons(ntohs(mpa->private_data_size) + + sizeof(struct mpa_v2_conn_params)); pr_debug("initiator ird %u ord %u\n", ep->ird, ep->ord); mpa_v2_params.ird = htons((u16)ep->ird); @@ -1057,7 +1058,7 @@ static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen) mpalen = sizeof(*mpa) + plen; if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) mpalen += sizeof(struct mpa_v2_conn_params); - wrlen = roundup(mpalen + sizeof *req, 16); + wrlen = roundup(mpalen + sizeof(*req), 16); skb = get_skb(NULL, wrlen, GFP_KERNEL); if (!skb) { @@ -1088,8 +1089,9 @@ static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen) if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { mpa->flags |= MPA_ENHANCED_RDMA_CONN; - mpa->private_data_size = htons(ntohs(mpa->private_data_size) + - sizeof (struct mpa_v2_conn_params)); + mpa->private_data_size = + htons(ntohs(mpa->private_data_size) + + sizeof(struct mpa_v2_conn_params)); mpa_v2_params.ird = htons(((u16)ep->ird) | (peer2peer ? 
MPA_V2_PEER2PEER_MODEL : 0)); @@ -1136,7 +1138,7 @@ static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen) mpalen = sizeof(*mpa) + plen; if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) mpalen += sizeof(struct mpa_v2_conn_params); - wrlen = roundup(mpalen + sizeof *req, 16); + wrlen = roundup(mpalen + sizeof(*req), 16); skb = get_skb(NULL, wrlen, GFP_KERNEL); if (!skb) { @@ -1171,8 +1173,9 @@ static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen) if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { mpa->flags |= MPA_ENHANCED_RDMA_CONN; - mpa->private_data_size = htons(ntohs(mpa->private_data_size) + - sizeof (struct mpa_v2_conn_params)); + mpa->private_data_size = + htons(ntohs(mpa->private_data_size) + + sizeof(struct mpa_v2_conn_params)); mpa_v2_params.ird = htons((u16)ep->ird); mpa_v2_params.ord = htons((u16)ep->ord); if (peer2peer && (ep->mpa_attr.p2p_type != diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index 52ce586621c6..2c0587d8630f 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -43,7 +43,7 @@ static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, int wr_len; int ret; - wr_len = sizeof *res_wr + sizeof *res; + wr_len = sizeof(*res_wr) + sizeof(*res); set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); res_wr = __skb_put_zero(skb, wr_len); @@ -117,7 +117,7 @@ static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, } /* build fw_ri_res_wr */ - wr_len = sizeof *res_wr + sizeof *res; + wr_len = sizeof(*res_wr) + sizeof(*res); skb = alloc_skb(wr_len, GFP_KERNEL); if (!skb) { @@ -1095,10 +1095,10 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, if (ucontext) { ret = -ENOMEM; - mm = kmalloc(sizeof *mm, GFP_KERNEL); + mm = kmalloc(sizeof(*mm), GFP_KERNEL); if (!mm) goto err_remove_handle; - mm2 = kmalloc(sizeof *mm2, GFP_KERNEL); + mm2 = kmalloc(sizeof(*mm2), GFP_KERNEL); if (!mm2) goto err_free_mm; diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 4c0d925c5ff5..a8b9548bd1a2 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -327,7 +327,7 @@ static int qp_open(struct inode *inode, struct file *file) unsigned long index; int count = 1; - qpd = kmalloc(sizeof *qpd, GFP_KERNEL); + qpd = kmalloc(sizeof(*qpd), GFP_KERNEL); if (!qpd) return -ENOMEM; @@ -421,7 +421,7 @@ static int stag_open(struct inode *inode, struct file *file) int ret = 0; int count = 1; - stagd = kmalloc(sizeof *stagd, GFP_KERNEL); + stagd = kmalloc(sizeof(*stagd), GFP_KERNEL); if (!stagd) { ret = -ENOMEM; goto out; @@ -1075,7 +1075,7 @@ static void *c4iw_uld_add(const struct cxgb4_lld_info *infop) pr_info("Chelsio T4/T5 RDMA Driver - version %s\n", DRV_VERSION); - ctx = kzalloc(sizeof *ctx, GFP_KERNEL); + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) { ctx = ERR_PTR(-ENOMEM); goto out; @@ -1243,10 +1243,9 @@ static int c4iw_uld_state_change(void *handle, enum cxgb4_state new_state) case CXGB4_STATE_START_RECOVERY: pr_info("%s: Fatal Error\n", pci_name(ctx->lldi.pdev)); if (ctx->dev) { - struct ib_event event; + struct ib_event event = {}; ctx->dev->rdev.flags |= T4_FATAL_ERROR; - memset(&event, 0, sizeof event); event.event = IB_EVENT_DEVICE_FATAL; event.device = &ctx->dev->ibdev; ib_dispatch_event(&event); diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 811c0c8c5b16..fe3733c4026d 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ 
b/drivers/infiniband/hw/cxgb4/mem.c @@ -130,8 +130,9 @@ static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len, copy_len = len > C4IW_MAX_INLINE_SIZE ? C4IW_MAX_INLINE_SIZE : len; - wr_len = roundup(sizeof *req + sizeof *sc + - roundup(copy_len, T4_ULPTX_MIN_IO), 16); + wr_len = roundup(sizeof(*req) + sizeof(*sc) + + roundup(copy_len, T4_ULPTX_MIN_IO), + 16); if (!skb) { skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL); diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 74b795642fca..8ed75b141521 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -271,7 +271,6 @@ static int c4iw_query_device(struct ib_device *ibdev, struct ib_device_attr *pro return -EINVAL; dev = to_c4iw_dev(ibdev); - memset(props, 0, sizeof *props); memcpy(&props->sys_image_guid, dev->rdev.lldi.ports[0]->dev_addr, 6); props->hw_ver = CHELSIO_CHIP_RELEASE(dev->rdev.lldi.adapter_type); props->fw_ver = dev->rdev.lldi.fw_vers; diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index e92b9544357a..41b2e70cd691 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -303,7 +303,7 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, wq->rq.msn = 1; /* build fw_ri_res_wr */ - wr_len = sizeof *res_wr + 2 * sizeof *res; + wr_len = sizeof(*res_wr) + 2 * sizeof(*res); if (need_rq) wr_len += sizeof(*res); skb = alloc_skb(wr_len, GFP_KERNEL); @@ -439,7 +439,7 @@ static int build_immd(struct t4_sq *sq, struct fw_ri_immd *immdp, rem -= len; } } - len = roundup(plen + sizeof *immdp, 16) - (plen + sizeof *immdp); + len = roundup(plen + sizeof(*immdp), 16) - (plen + sizeof(*immdp)); if (len) memset(dstp, 0, len); immdp->op = FW_RI_DATA_IMMD; @@ -528,7 +528,7 @@ static int build_rdma_send(struct t4_sq *sq, union t4_wr *wqe, T4_MAX_SEND_INLINE, &plen); if (ret) return ret; - size = sizeof wqe->send + sizeof(struct fw_ri_immd) + + size = sizeof(wqe->send) + sizeof(struct fw_ri_immd) + plen; } else { ret = build_isgl((__be64 *)sq->queue, @@ -537,7 +537,7 @@ static int build_rdma_send(struct t4_sq *sq, union t4_wr *wqe, wr->sg_list, wr->num_sge, &plen); if (ret) return ret; - size = sizeof wqe->send + sizeof(struct fw_ri_isgl) + + size = sizeof(wqe->send) + sizeof(struct fw_ri_isgl) + wr->num_sge * sizeof(struct fw_ri_sge); } } else { @@ -545,7 +545,7 @@ static int build_rdma_send(struct t4_sq *sq, union t4_wr *wqe, wqe->send.u.immd_src[0].r1 = 0; wqe->send.u.immd_src[0].r2 = 0; wqe->send.u.immd_src[0].immdlen = 0; - size = sizeof wqe->send + sizeof(struct fw_ri_immd); + size = sizeof(wqe->send) + sizeof(struct fw_ri_immd); plen = 0; } *len16 = DIV_ROUND_UP(size, 16); @@ -579,7 +579,7 @@ static int build_rdma_write(struct t4_sq *sq, union t4_wr *wqe, T4_MAX_WRITE_INLINE, &plen); if (ret) return ret; - size = sizeof wqe->write + sizeof(struct fw_ri_immd) + + size = sizeof(wqe->write) + sizeof(struct fw_ri_immd) + plen; } else { ret = build_isgl((__be64 *)sq->queue, @@ -588,7 +588,7 @@ static int build_rdma_write(struct t4_sq *sq, union t4_wr *wqe, wr->sg_list, wr->num_sge, &plen); if (ret) return ret; - size = sizeof wqe->write + sizeof(struct fw_ri_isgl) + + size = sizeof(wqe->write) + sizeof(struct fw_ri_isgl) + wr->num_sge * sizeof(struct fw_ri_sge); } } else { @@ -596,7 +596,7 @@ static int build_rdma_write(struct t4_sq *sq, union t4_wr *wqe, wqe->write.u.immd_src[0].r1 = 0; wqe->write.u.immd_src[0].r2 = 0; wqe->write.u.immd_src[0].immdlen = 0; - 
size = sizeof wqe->write + sizeof(struct fw_ri_immd); + size = sizeof(wqe->write) + sizeof(struct fw_ri_immd); plen = 0; } *len16 = DIV_ROUND_UP(size, 16); @@ -683,7 +683,7 @@ static int build_rdma_read(union t4_wr *wqe, const struct ib_send_wr *wr, } wqe->read.r2 = 0; wqe->read.r5 = 0; - *len16 = DIV_ROUND_UP(sizeof wqe->read, 16); + *len16 = DIV_ROUND_UP(sizeof(wqe->read), 16); return 0; } @@ -766,8 +766,8 @@ static int build_rdma_recv(struct c4iw_qp *qhp, union t4_recv_wr *wqe, &wqe->recv.isgl, wr->sg_list, wr->num_sge, NULL); if (ret) return ret; - *len16 = DIV_ROUND_UP(sizeof wqe->recv + - wr->num_sge * sizeof(struct fw_ri_sge), 16); + *len16 = DIV_ROUND_UP( + sizeof(wqe->recv) + wr->num_sge * sizeof(struct fw_ri_sge), 16); return 0; } @@ -886,7 +886,7 @@ static int build_inv_stag(union t4_wr *wqe, const struct ib_send_wr *wr, { wqe->inv.stag_inv = cpu_to_be32(wr->ex.invalidate_rkey); wqe->inv.r2 = 0; - *len16 = DIV_ROUND_UP(sizeof wqe->inv, 16); + *len16 = DIV_ROUND_UP(sizeof(wqe->inv), 16); return 0; } @@ -1606,7 +1606,7 @@ static void post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe, FW_WR_LEN16_V(DIV_ROUND_UP(sizeof(*wqe), 16))); wqe->u.terminate.type = FW_RI_TYPE_TERMINATE; - wqe->u.terminate.immdlen = cpu_to_be32(sizeof *term); + wqe->u.terminate.immdlen = cpu_to_be32(sizeof(*term)); term = (struct terminate_message *)wqe->u.terminate.termmsg; if (qhp->attr.layer_etype == (LAYER_MPA|DDP_LLP)) { term->layer_etype = qhp->attr.layer_etype; @@ -1751,16 +1751,15 @@ static int rdma_fini(struct c4iw_dev *rhp, struct c4iw_qp *qhp, static void build_rtr_msg(u8 p2p_type, struct fw_ri_init *init) { pr_debug("p2p_type = %d\n", p2p_type); - memset(&init->u, 0, sizeof init->u); + memset(&init->u, 0, sizeof(init->u)); switch (p2p_type) { case FW_RI_INIT_P2PTYPE_RDMA_WRITE: init->u.write.opcode = FW_RI_RDMA_WRITE_WR; init->u.write.stag_sink = cpu_to_be32(1); init->u.write.to_sink = cpu_to_be64(1); init->u.write.u.immd_src[0].op = FW_RI_DATA_IMMD; - init->u.write.len16 = DIV_ROUND_UP(sizeof init->u.write + - sizeof(struct fw_ri_immd), - 16); + init->u.write.len16 = DIV_ROUND_UP( + sizeof(init->u.write) + sizeof(struct fw_ri_immd), 16); break; case FW_RI_INIT_P2PTYPE_READ_REQ: init->u.write.opcode = FW_RI_RDMA_READ_WR; @@ -1768,7 +1767,7 @@ static void build_rtr_msg(u8 p2p_type, struct fw_ri_init *init) init->u.read.to_src_lo = cpu_to_be32(1); init->u.read.stag_sink = cpu_to_be32(1); init->u.read.to_sink_lo = cpu_to_be32(1); - init->u.read.len16 = DIV_ROUND_UP(sizeof init->u.read, 16); + init->u.read.len16 = DIV_ROUND_UP(sizeof(init->u.read), 16); break; } } @@ -1782,7 +1781,7 @@ static int rdma_init(struct c4iw_dev *rhp, struct c4iw_qp *qhp) pr_debug("qhp %p qid 0x%x tid %u ird %u ord %u\n", qhp, qhp->wq.sq.qid, qhp->ep->hwtid, qhp->ep->ird, qhp->ep->ord); - skb = alloc_skb(sizeof *wqe, GFP_KERNEL); + skb = alloc_skb(sizeof(*wqe), GFP_KERNEL); if (!skb) { ret = -ENOMEM; goto out; @@ -2302,7 +2301,7 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, ucontext->key += PAGE_SIZE; } spin_unlock(&ucontext->mmap_lock); - ret = ib_copy_to_udata(udata, &uresp, sizeof uresp); + ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); if (ret) goto err_free_ma_sync_key; sq_key_mm->key = uresp.sq_key; @@ -2386,7 +2385,7 @@ int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, struct c4iw_dev *rhp; struct c4iw_qp *qhp; enum c4iw_qp_attr_mask mask = 0; - struct c4iw_qp_attributes attrs; + struct c4iw_qp_attributes attrs = {}; pr_debug("ib_qp %p\n", ibqp); @@ 
-2398,7 +2397,6 @@ int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (!attr_mask) return 0; - memset(&attrs, 0, sizeof attrs); qhp = to_c4iw_qp(ibqp); rhp = qhp->rhp; @@ -2482,8 +2480,8 @@ int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, { struct c4iw_qp *qhp = to_c4iw_qp(ibqp); - memset(attr, 0, sizeof *attr); - memset(init_attr, 0, sizeof *init_attr); + memset(attr, 0, sizeof(*attr)); + memset(init_attr, 0, sizeof(*init_attr)); attr->qp_state = to_ib_qp_state(qhp->attr.state); init_attr->cap.max_send_wr = qhp->attr.sq_num_entries; init_attr->cap.max_recv_wr = qhp->attr.rq_num_entries; diff --git a/drivers/infiniband/hw/cxgb4/resource.c b/drivers/infiniband/hw/cxgb4/resource.c index 57ed26b3cc21..5c95c789f302 100644 --- a/drivers/infiniband/hw/cxgb4/resource.c +++ b/drivers/infiniband/hw/cxgb4/resource.c @@ -126,7 +126,7 @@ u32 c4iw_get_cqid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx) rdev->stats.qid.cur += rdev->qpmask + 1; mutex_unlock(&rdev->stats.lock); for (i = qid+1; i & rdev->qpmask; i++) { - entry = kmalloc(sizeof *entry, GFP_KERNEL); + entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) goto out; entry->qid = i; @@ -137,13 +137,13 @@ u32 c4iw_get_cqid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx) * now put the same ids on the qp list since they all * map to the same db/gts page. */ - entry = kmalloc(sizeof *entry, GFP_KERNEL); + entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) goto out; entry->qid = qid; list_add_tail(&entry->entry, &uctx->qpids); for (i = qid+1; i & rdev->qpmask; i++) { - entry = kmalloc(sizeof *entry, GFP_KERNEL); + entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) goto out; entry->qid = i; @@ -165,7 +165,7 @@ void c4iw_put_cqid(struct c4iw_rdev *rdev, u32 qid, { struct c4iw_qid_list *entry; - entry = kmalloc(sizeof *entry, GFP_KERNEL); + entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return; pr_debug("qid 0x%x\n", qid); @@ -200,7 +200,7 @@ u32 c4iw_get_qpid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx) rdev->stats.qid.cur += rdev->qpmask + 1; mutex_unlock(&rdev->stats.lock); for (i = qid+1; i & rdev->qpmask; i++) { - entry = kmalloc(sizeof *entry, GFP_KERNEL); + entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) goto out; entry->qid = i; @@ -211,13 +211,13 @@ u32 c4iw_get_qpid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx) * now put the same ids on the cq list since they all * map to the same db/gts page. */ - entry = kmalloc(sizeof *entry, GFP_KERNEL); + entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) goto out; entry->qid = qid; list_add_tail(&entry->entry, &uctx->cqids); for (i = qid; i & rdev->qpmask; i++) { - entry = kmalloc(sizeof *entry, GFP_KERNEL); + entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) goto out; entry->qid = i; @@ -239,7 +239,7 @@ void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qid, { struct c4iw_qid_list *entry; - entry = kmalloc(sizeof *entry, GFP_KERNEL); + entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return; pr_debug("qid 0x%x\n", qid); From cae626b97851afc2219e7607183a9a23cbba3bef Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 20 May 2019 09:54:32 +0300 Subject: [PATCH 023/194] RDMA/cxgb4: Don't expose DMA addresses Change unconditional print of DMA address to be printed with special printk format type specifier. 
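A minimal sketch of the pattern; the local variable is illustrative, and %pad is the printk specifier that takes a dma_addr_t by reference:

	dma_addr_t dma_addr = chp->cq.dma_addr;	/* illustrative */

	/* before: explicit cast to print a dma_addr_t */
	pr_debug("dma_addr 0x%llx\n", (unsigned long long)dma_addr);

	/* after: %pad is the dedicated specifier and is passed &dma_addr */
	pr_debug("dma_addr %pad\n", &dma_addr);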
Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/cq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index 2c0587d8630f..6557e7c5af66 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -1135,9 +1135,9 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, mm2->len = PAGE_SIZE; insert_mmap(ucontext, mm2); } - pr_debug("cqid 0x%0x chp %p size %u memsize %zu, dma_addr 0x%0llx\n", - chp->cq.cqid, chp, chp->cq.size, - chp->cq.memsize, (unsigned long long)chp->cq.dma_addr); + pr_debug("cqid 0x%0x chp %p size %u memsize %zu, dma_addr %pad\n", + chp->cq.cqid, chp, chp->cq.size, chp->cq.memsize, + &chp->cq.dma_addr); return &chp->ibcq; err_free_mm2: kfree(mm2); From f70baa7ee3d1b5a9e66ac7549e31641a656f23c1 Mon Sep 17 00:00:00 2001 From: Nirranjan Kirubaharan Date: Thu, 23 May 2019 00:05:39 -0700 Subject: [PATCH 024/194] iw_cxgb4: Fix qpid leak Add a wait in destroy_qp() so that all references to the qp are released and the qp is freed in destroy_qp() itself. This ensures freeing of all QPs before invocation of dealloc_ucontext(), which prevents loss of in-use qpids stored in the ucontext. Signed-off-by: Nirranjan Kirubaharan Reviewed-by: Potnuri Bharat Teja Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 4 +-- drivers/infiniband/hw/cxgb4/qp.c | 48 +++++++++----------------- 2 files changed, 19 insertions(+), 33 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 916ef982172e..cf7512b2c4c0 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -490,13 +490,13 @@ struct c4iw_qp { struct t4_wq wq; spinlock_t lock; struct mutex mutex; - struct kref kref; wait_queue_head_t wait; int sq_sig_all; struct c4iw_srq *srq; - struct work_struct free_work; struct c4iw_ucontext *ucontext; struct c4iw_wr_wait *wr_waitp; + struct completion qp_rel_comp; + refcount_t qp_refcnt; }; static inline struct c4iw_qp *to_c4iw_qp(struct ib_qp *ibqp) diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 41b2e70cd691..9523986d62b1 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -890,43 +890,17 @@ static int build_inv_stag(union t4_wr *wqe, const struct ib_send_wr *wr, return 0; } -static void free_qp_work(struct work_struct *work) -{ - struct c4iw_ucontext *ucontext; - struct c4iw_qp *qhp; - struct c4iw_dev *rhp; - - qhp = container_of(work, struct c4iw_qp, free_work); - ucontext = qhp->ucontext; - rhp = qhp->rhp; - - pr_debug("qhp %p ucontext %p\n", qhp, ucontext); - destroy_qp(&rhp->rdev, &qhp->wq, - ucontext ?
&ucontext->uctx : &rhp->rdev.uctx, !qhp->srq); - - c4iw_put_wr_wait(qhp->wr_waitp); - kfree(qhp); -} - -static void queue_qp_free(struct kref *kref) -{ - struct c4iw_qp *qhp; - - qhp = container_of(kref, struct c4iw_qp, kref); - pr_debug("qhp %p\n", qhp); - queue_work(qhp->rhp->rdev.free_workq, &qhp->free_work); -} - void c4iw_qp_add_ref(struct ib_qp *qp) { pr_debug("ib_qp %p\n", qp); - kref_get(&to_c4iw_qp(qp)->kref); + refcount_inc(&to_c4iw_qp(qp)->qp_refcnt); } void c4iw_qp_rem_ref(struct ib_qp *qp) { pr_debug("ib_qp %p\n", qp); - kref_put(&to_c4iw_qp(qp)->kref, queue_qp_free); + if (refcount_dec_and_test(&to_c4iw_qp(qp)->qp_refcnt)) + complete(&to_c4iw_qp(qp)->qp_rel_comp); } static void add_to_fc_list(struct list_head *head, struct list_head *entry) @@ -2098,10 +2072,12 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata) { struct c4iw_dev *rhp; struct c4iw_qp *qhp; + struct c4iw_ucontext *ucontext; struct c4iw_qp_attributes attrs; qhp = to_c4iw_qp(ib_qp); rhp = qhp->rhp; + ucontext = qhp->ucontext; attrs.next_state = C4IW_QP_STATE_ERROR; if (qhp->attr.state == C4IW_QP_STATE_TERMINATE) @@ -2119,7 +2095,17 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata) c4iw_qp_rem_ref(ib_qp); + wait_for_completion(&qhp->qp_rel_comp); + pr_debug("ib_qp %p qpid 0x%0x\n", ib_qp, qhp->wq.sq.qid); + pr_debug("qhp %p ucontext %p\n", qhp, ucontext); + + destroy_qp(&rhp->rdev, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx, !qhp->srq); + + c4iw_put_wr_wait(qhp->wr_waitp); + + kfree(qhp); return 0; } @@ -2229,8 +2215,8 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, spin_lock_init(&qhp->lock); mutex_init(&qhp->mutex); init_waitqueue_head(&qhp->wait); - kref_init(&qhp->kref); - INIT_WORK(&qhp->free_work, free_qp_work); + init_completion(&qhp->qp_rel_comp); + refcount_set(&qhp->qp_refcnt, 1); ret = xa_insert_irq(&rhp->qps, qhp->wq.sq.qid, qhp, GFP_KERNEL); if (ret) From 8ffb813255c422f4d05be227ab4443cb6054a078 Mon Sep 17 00:00:00 2001 From: Yixian Liu Date: Fri, 24 May 2019 15:31:20 +0800 Subject: [PATCH 025/194] RDMA/hns: Remove unnecessary print message in aeq There is no need to print when communication is established, especially while lots of qp used by application. Signed-off-by: Yixian Liu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index b5392cb5b20f..1c7e6e6f6972 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -4682,7 +4682,6 @@ static void hns_roce_irq_work_handle(struct work_struct *work) dev_warn(dev, "Path migration failed.\n"); break; case HNS_ROCE_EVENT_TYPE_COMM_EST: - dev_info(dev, "Communication established.\n"); break; case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: dev_warn(dev, "Send queue drained.\n"); From 0502849d0bb133b492eed24fd270441e652c84cc Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Fri, 24 May 2019 15:31:21 +0800 Subject: [PATCH 026/194] RDMA/hns: Update CQE specifications According to hip08 UM, the maximum number of CQEs supported by each CQ is 4M. 
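For reference, the new limit 0x400000 is 2^22 = 4,194,304 CQEs (the 4M quoted above), up from the previous 0x10000 = 65,536 (64K) per CQ.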
Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index edfdbe2ce0db..0ac177ad6b79 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -54,7 +54,7 @@ #define HNS_ROCE_V2_MAX_CQ_NUM 0x100000 #define HNS_ROCE_V2_MAX_CQC_TIMER_NUM 0x100 #define HNS_ROCE_V2_MAX_SRQ_NUM 0x100000 -#define HNS_ROCE_V2_MAX_CQE_NUM 0x10000 +#define HNS_ROCE_V2_MAX_CQE_NUM 0x400000 #define HNS_ROCE_V2_MAX_SRQWQE_NUM 0x8000 #define HNS_ROCE_V2_MAX_RQ_SGE_NUM 0x100 #define HNS_ROCE_V2_MAX_SQ_SGE_NUM 0xff From 780f33962ef27d7f27c6b47a55593c6ffd357922 Mon Sep 17 00:00:00 2001 From: Lang Cheng Date: Fri, 24 May 2019 15:31:22 +0800 Subject: [PATCH 027/194] RDMA/hns: Move spin_lock_irqsave to the correct place When hip08 set gid, it will call spin_unlock_bh when send cmq. if main.ko call spin_lock_irqsave firstly, and the kernel is before commit f71b74bca637 ("irq/softirqs: Use lockdep to assert IRQs are disabled/enabled"), it will cause WARN_ON_ONCE because of calling spin_unlock_bh in disable context. In fact, the spin_lock_irqsave in main.ko is only used for hip06, and should be placed in hns_roce_hw_v1.c. hns_roce_hw_v2.c uses its own spin_unlock_bh and does not need main.ko manage spin_lock. Signed-off-by: Lang Cheng Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 5 +++++ drivers/infiniband/hw/hns/hns_roce_main.c | 10 ---------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index 4c5d0f160c10..c35a4de6d4e6 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -1741,11 +1741,14 @@ static int hns_roce_v1_set_gid(struct hns_roce_dev *hr_dev, u8 port, int gid_index, const union ib_gid *gid, const struct ib_gid_attr *attr) { + unsigned long flags; u32 *p = NULL; u8 gid_idx = 0; gid_idx = hns_get_gid_index(hr_dev, port, gid_index); + spin_lock_irqsave(&hr_dev->iboe.lock, flags); + p = (u32 *)&gid->raw[0]; roce_raw_write(*p, hr_dev->reg_base + ROCEE_PORT_GID_L_0_REG + (HNS_ROCE_V1_GID_NUM * gid_idx)); @@ -1762,6 +1765,8 @@ static int hns_roce_v1_set_gid(struct hns_roce_dev *hr_dev, u8 port, roce_raw_write(*p, hr_dev->reg_base + ROCEE_PORT_GID_H_0_REG + (HNS_ROCE_V1_GID_NUM * gid_idx)); + spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); + return 0; } diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 8da5f18bf820..05a0f7d50ce3 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -78,18 +78,13 @@ static int hns_roce_add_gid(const struct ib_gid_attr *attr, void **context) { struct hns_roce_dev *hr_dev = to_hr_dev(attr->device); u8 port = attr->port_num - 1; - unsigned long flags; int ret; if (port >= hr_dev->caps.num_ports) return -EINVAL; - spin_lock_irqsave(&hr_dev->iboe.lock, flags); - ret = hr_dev->hw->set_gid(hr_dev, port, attr->index, &attr->gid, attr); - spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); - return ret; } @@ -98,18 +93,13 @@ static int hns_roce_del_gid(const struct ib_gid_attr *attr, void **context) struct hns_roce_dev *hr_dev = to_hr_dev(attr->device); struct ib_gid_attr zattr = { }; u8 port = attr->port_num - 1; - unsigned long flags; int ret; if (port 
>= hr_dev->caps.num_ports) return -EINVAL; - spin_lock_irqsave(&hr_dev->iboe.lock, flags); - ret = hr_dev->hw->set_gid(hr_dev, port, attr->index, &zgid, &zattr); - spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); - return ret; } From 669cefb654cb69b280e31380f5fc7e3b5755b0cd Mon Sep 17 00:00:00 2001 From: Lang Cheng Date: Fri, 24 May 2019 15:31:23 +0800 Subject: [PATCH 028/194] RDMA/hns: Remove jiffies operation in disable interrupt context In some functions, the jiffies operation is unnecessary, and we can control delay using mdelay and udelay functions only. Especially, in hns_roce_v1_clear_hem, the function calls spin_lock_irqsave, the context disables interrupt, so we can not use jiffies and msleep functions. Signed-off-by: Lang Cheng Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hem.c | 21 +++++++++++---------- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 19 ++++++++++--------- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index 8e29dbb5b5fb..d0eacd8c2575 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -376,18 +376,19 @@ static int hns_roce_set_hem(struct hns_roce_dev *hr_dev, bt_cmd = hr_dev->reg_base + ROCEE_BT_CMD_H_REG; - end = msecs_to_jiffies(HW_SYNC_TIMEOUT_MSECS) + jiffies; - while (1) { - if (readl(bt_cmd) >> BT_CMD_SYNC_SHIFT) { - if (!(time_before(jiffies, end))) { - dev_err(dev, "Write bt_cmd err,hw_sync is not zero.\n"); - spin_unlock_irqrestore(lock, flags); - return -EBUSY; - } - } else { + end = HW_SYNC_TIMEOUT_MSECS; + while (end) { + if (!readl(bt_cmd) >> BT_CMD_SYNC_SHIFT) break; - } + mdelay(HW_SYNC_SLEEP_TIME_INTERVAL); + end -= HW_SYNC_SLEEP_TIME_INTERVAL; + } + + if (end <= 0) { + dev_err(dev, "Write bt_cmd err,hw_sync is not zero.\n"); + spin_unlock_irqrestore(lock, flags); + return -EBUSY; } bt_cmd_l = (u32)bt_ba; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index c35a4de6d4e6..f13c9c3e56d4 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -965,8 +965,7 @@ static int hns_roce_v1_recreate_lp_qp(struct hns_roce_dev *hr_dev) struct hns_roce_free_mr *free_mr; struct hns_roce_v1_priv *priv; struct completion comp; - unsigned long end = - msecs_to_jiffies(HNS_ROCE_V1_RECREATE_LP_QP_TIMEOUT_MSECS) + jiffies; + unsigned long end = HNS_ROCE_V1_RECREATE_LP_QP_TIMEOUT_MSECS; priv = (struct hns_roce_v1_priv *)hr_dev->priv; free_mr = &priv->free_mr; @@ -986,10 +985,11 @@ static int hns_roce_v1_recreate_lp_qp(struct hns_roce_dev *hr_dev) queue_work(free_mr->free_mr_wq, &(lp_qp_work->work)); - while (time_before_eq(jiffies, end)) { + while (end) { if (try_wait_for_completion(&comp)) return 0; msleep(HNS_ROCE_V1_RECREATE_LP_QP_WAIT_VALUE); + end -= HNS_ROCE_V1_RECREATE_LP_QP_WAIT_VALUE; } lp_qp_work->comp_flag = 0; @@ -1103,8 +1103,7 @@ static int hns_roce_v1_dereg_mr(struct hns_roce_dev *hr_dev, struct hns_roce_free_mr *free_mr; struct hns_roce_v1_priv *priv; struct completion comp; - unsigned long end = - msecs_to_jiffies(HNS_ROCE_V1_FREE_MR_TIMEOUT_MSECS) + jiffies; + unsigned long end = HNS_ROCE_V1_FREE_MR_TIMEOUT_MSECS; unsigned long start = jiffies; int npages; int ret = 0; @@ -1134,10 +1133,11 @@ static int hns_roce_v1_dereg_mr(struct hns_roce_dev *hr_dev, queue_work(free_mr->free_mr_wq, &(mr_work->work)); - while (time_before_eq(jiffies, end)) { + while (end) { if 
(try_wait_for_completion(&comp)) goto free_mr; msleep(HNS_ROCE_V1_FREE_MR_WAIT_VALUE); + end -= HNS_ROCE_V1_FREE_MR_WAIT_VALUE; } mr_work->comp_flag = 0; @@ -2462,10 +2462,10 @@ static int hns_roce_v1_clear_hem(struct hns_roce_dev *hr_dev, bt_cmd = hr_dev->reg_base + ROCEE_BT_CMD_H_REG; - end = msecs_to_jiffies(HW_SYNC_TIMEOUT_MSECS) + jiffies; + end = HW_SYNC_TIMEOUT_MSECS; while (1) { if (readl(bt_cmd) >> BT_CMD_SYNC_SHIFT) { - if (!(time_before(jiffies, end))) { + if (end < 0) { dev_err(dev, "Write bt_cmd err,hw_sync is not zero.\n"); spin_unlock_irqrestore(&hr_dev->bt_cmd_lock, flags); @@ -2474,7 +2474,8 @@ static int hns_roce_v1_clear_hem(struct hns_roce_dev *hr_dev, } else { break; } - msleep(HW_SYNC_SLEEP_TIME_INTERVAL); + mdelay(HW_SYNC_SLEEP_TIME_INTERVAL); + end -= HW_SYNC_SLEEP_TIME_INTERVAL; } bt_cmd_val[0] = (__le32)bt_ba; From 2a3d923f87303b2d49c6cfb7bf6a25144a1ce265 Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Fri, 24 May 2019 23:29:36 +0800 Subject: [PATCH 029/194] RDMA/hns: Replace magic numbers with #defines This patch makes the code more readable by removing magic numbers. Signed-off-by: Xi Wang Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_db.c | 8 ++- drivers/infiniband/hw/hns/hns_roce_device.h | 37 ++++++++++-- drivers/infiniband/hw/hns/hns_roce_hem.c | 18 +++--- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 2 +- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 65 ++++++++++++--------- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 4 ++ drivers/infiniband/hw/hns/hns_roce_main.c | 7 ++- drivers/infiniband/hw/hns/hns_roce_mr.c | 43 ++++++++------ 8 files changed, 115 insertions(+), 69 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_db.c b/drivers/infiniband/hw/hns/hns_roce_db.c index 0c6c1fe87705..3a040a9390d8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_db.c +++ b/drivers/infiniband/hw/hns/hns_roce_db.c @@ -78,7 +78,8 @@ static struct hns_roce_db_pgdir *hns_roce_alloc_db_pgdir( if (!pgdir) return NULL; - bitmap_fill(pgdir->order1, HNS_ROCE_DB_PER_PAGE / 2); + bitmap_fill(pgdir->order1, + HNS_ROCE_DB_PER_PAGE / HNS_ROCE_DB_TYPE_COUNT); pgdir->bits[0] = pgdir->order0; pgdir->bits[1] = pgdir->order1; pgdir->page = dma_alloc_coherent(dma_device, PAGE_SIZE, @@ -116,7 +117,7 @@ found: db->u.pgdir = pgdir; db->index = i; db->db_record = pgdir->page + db->index; - db->dma = pgdir->db_dma + db->index * 4; + db->dma = pgdir->db_dma + db->index * HNS_ROCE_DB_UNIT_SIZE; db->order = order; return 0; @@ -170,7 +171,8 @@ void hns_roce_free_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db) i >>= o; set_bit(i, db->u.pgdir->bits[o]); - if (bitmap_full(db->u.pgdir->order1, HNS_ROCE_DB_PER_PAGE / 2)) { + if (bitmap_full(db->u.pgdir->order1, + HNS_ROCE_DB_PER_PAGE / HNS_ROCE_DB_TYPE_COUNT)) { dma_free_coherent(hr_dev->dev, PAGE_SIZE, db->u.pgdir->page, db->u.pgdir->db_dma); list_del(&db->u.pgdir->list); diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 563cf39df6d5..d6e8b446fc57 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -37,9 +37,12 @@ #define DRV_NAME "hns_roce" +/* hip08 is a pci device, it includes two version according pci version id */ +#define PCI_REVISION_ID_HIP08_A 0x20 +#define PCI_REVISION_ID_HIP08_B 0x21 + #define HNS_ROCE_HW_VER1 ('h' << 24 | 'i' << 16 | '0' << 8 | '6') -#define MAC_ADDR_OCTET_NUM 6 #define HNS_ROCE_MAX_MSG_LEN 0x80000000 #define HNS_ROCE_ALOGN_UP(a, b) ((((a) + (b) - 1) / 
(b)) * (b)) @@ -48,6 +51,10 @@ #define HNS_ROCE_BA_SIZE (32 * 4096) +#define BA_BYTE_LEN 8 + +#define BITS_PER_BYTE 8 + /* Hardware specification only for v1 engine */ #define HNS_ROCE_MIN_CQE_NUM 0x40 #define HNS_ROCE_MIN_WQE_NUM 0x20 @@ -55,6 +62,7 @@ /* Hardware specification only for v1 engine */ #define HNS_ROCE_MAX_INNER_MTPT_NUM 0x7 #define HNS_ROCE_MAX_MTPT_PBL_NUM 0x100000 +#define HNS_ROCE_MAX_SGE_NUM 2 #define HNS_ROCE_EACH_FREE_CQ_WAIT_MSECS 20 #define HNS_ROCE_MAX_FREE_CQ_WAIT_CNT \ @@ -64,6 +72,9 @@ #define HNS_ROCE_MAX_IRQ_NUM 128 +#define HNS_ROCE_SGE_IN_WQE 2 +#define HNS_ROCE_SGE_SHIFT 4 + #define EQ_ENABLE 1 #define EQ_DISABLE 0 @@ -81,6 +92,7 @@ #define HNS_ROCE_MAX_PORTS 6 #define HNS_ROCE_MAX_GID_NUM 16 #define HNS_ROCE_GID_SIZE 16 +#define HNS_ROCE_SGE_SIZE 16 #define HNS_ROCE_HOP_NUM_0 0xff @@ -111,6 +123,8 @@ #define PAGES_SHIFT_24 24 #define PAGES_SHIFT_32 32 +#define HNS_ROCE_PCI_BAR_NUM 2 + #define HNS_ROCE_IDX_QUE_ENTRY_SZ 4 #define SRQ_DB_REG 0x230 @@ -213,6 +227,9 @@ enum hns_roce_mtt_type { MTT_TYPE_IDX }; +#define HNS_ROCE_DB_TYPE_COUNT 2 +#define HNS_ROCE_DB_UNIT_SIZE 4 + enum { HNS_ROCE_DB_PER_PAGE = PAGE_SIZE / 4 }; @@ -413,8 +430,8 @@ struct hns_roce_buf { struct hns_roce_db_pgdir { struct list_head list; DECLARE_BITMAP(order0, HNS_ROCE_DB_PER_PAGE); - DECLARE_BITMAP(order1, HNS_ROCE_DB_PER_PAGE / 2); - unsigned long *bits[2]; + DECLARE_BITMAP(order1, HNS_ROCE_DB_PER_PAGE / HNS_ROCE_DB_TYPE_COUNT); + unsigned long *bits[HNS_ROCE_DB_TYPE_COUNT]; u32 *page; dma_addr_t db_dma; }; @@ -535,7 +552,7 @@ struct hns_roce_av { u8 hop_limit; __le32 sl_tclass_flowlabel; u8 dgid[HNS_ROCE_GID_SIZE]; - u8 mac[6]; + u8 mac[ETH_ALEN]; __le16 vlan; bool vlan_en; }; @@ -940,6 +957,16 @@ struct hns_roce_hw { const struct ib_device_ops *hns_roce_dev_srq_ops; }; +enum hns_phy_state { + HNS_ROCE_PHY_SLEEP = 1, + HNS_ROCE_PHY_POLLING = 2, + HNS_ROCE_PHY_DISABLED = 3, + HNS_ROCE_PHY_TRAINING = 4, + HNS_ROCE_PHY_LINKUP = 5, + HNS_ROCE_PHY_LINKERR = 6, + HNS_ROCE_PHY_TEST = 7 +}; + struct hns_roce_dev { struct ib_device ib_dev; struct platform_device *pdev; @@ -962,7 +989,7 @@ struct hns_roce_dev { struct hns_roce_caps caps; struct xarray qp_table_xa; - unsigned char dev_addr[HNS_ROCE_MAX_PORTS][MAC_ADDR_OCTET_NUM]; + unsigned char dev_addr[HNS_ROCE_MAX_PORTS][ETH_ALEN]; u64 sys_image_guid; u32 vendor_id; u32 vendor_part_id; diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index d0eacd8c2575..157c84a1f55f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -165,7 +165,7 @@ int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev, + PAGE_SHIFT); mhop->bt_chunk_size = 1 << (hr_dev->caps.mtt_ba_pg_sz + PAGE_SHIFT); - mhop->ba_l0_num = mhop->bt_chunk_size / 8; + mhop->ba_l0_num = mhop->bt_chunk_size / BA_BYTE_LEN; mhop->hop_num = hr_dev->caps.mtt_hop_num; break; case HEM_TYPE_CQE: @@ -173,7 +173,7 @@ int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev, + PAGE_SHIFT); mhop->bt_chunk_size = 1 << (hr_dev->caps.cqe_ba_pg_sz + PAGE_SHIFT); - mhop->ba_l0_num = mhop->bt_chunk_size / 8; + mhop->ba_l0_num = mhop->bt_chunk_size / BA_BYTE_LEN; mhop->hop_num = hr_dev->caps.cqe_hop_num; break; case HEM_TYPE_SRQWQE: @@ -181,7 +181,7 @@ int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev, + PAGE_SHIFT); mhop->bt_chunk_size = 1 << (hr_dev->caps.srqwqe_ba_pg_sz + PAGE_SHIFT); - mhop->ba_l0_num = mhop->bt_chunk_size / 8; + mhop->ba_l0_num = mhop->bt_chunk_size / BA_BYTE_LEN; mhop->hop_num = 
hr_dev->caps.srqwqe_hop_num; break; case HEM_TYPE_IDX: @@ -189,7 +189,7 @@ int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev, + PAGE_SHIFT); mhop->bt_chunk_size = 1 << (hr_dev->caps.idx_ba_pg_sz + PAGE_SHIFT); - mhop->ba_l0_num = mhop->bt_chunk_size / 8; + mhop->ba_l0_num = mhop->bt_chunk_size / BA_BYTE_LEN; mhop->hop_num = hr_dev->caps.idx_hop_num; break; default: @@ -206,7 +206,7 @@ int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev, * MTT/CQE alloc hem for bt pages. */ bt_num = hns_roce_get_bt_num(table->type, mhop->hop_num); - chunk_ba_num = mhop->bt_chunk_size / 8; + chunk_ba_num = mhop->bt_chunk_size / BA_BYTE_LEN; chunk_size = table->type < HEM_TYPE_MTT ? mhop->buf_chunk_size : mhop->bt_chunk_size; table_idx = (*obj & (table->num_obj - 1)) / @@ -436,7 +436,7 @@ static int hns_roce_table_mhop_get(struct hns_roce_dev *hr_dev, buf_chunk_size = mhop.buf_chunk_size; bt_chunk_size = mhop.bt_chunk_size; hop_num = mhop.hop_num; - chunk_ba_num = bt_chunk_size / 8; + chunk_ba_num = bt_chunk_size / BA_BYTE_LEN; bt_num = hns_roce_get_bt_num(table->type, hop_num); switch (bt_num) { @@ -646,7 +646,7 @@ static void hns_roce_table_mhop_put(struct hns_roce_dev *hr_dev, bt_chunk_size = mhop.bt_chunk_size; hop_num = mhop.hop_num; - chunk_ba_num = bt_chunk_size / 8; + chunk_ba_num = bt_chunk_size / BA_BYTE_LEN; bt_num = hns_roce_get_bt_num(table->type, hop_num); switch (bt_num) { @@ -800,7 +800,7 @@ void *hns_roce_table_find(struct hns_roce_dev *hr_dev, i = mhop.l0_idx; j = mhop.l1_idx; if (mhop.hop_num == 2) - hem_idx = i * (mhop.bt_chunk_size / 8) + j; + hem_idx = i * (mhop.bt_chunk_size / BA_BYTE_LEN) + j; else if (mhop.hop_num == 1 || mhop.hop_num == HNS_ROCE_HOP_NUM_0) hem_idx = i; @@ -1000,7 +1000,7 @@ int hns_roce_init_hem_table(struct hns_roce_dev *hr_dev, } obj_per_chunk = buf_chunk_size / obj_size; num_hem = (nobj + obj_per_chunk - 1) / obj_per_chunk; - bt_chunk_num = bt_chunk_size / 8; + bt_chunk_num = bt_chunk_size / BA_BYTE_LEN; if (type >= HEM_TYPE_MTT) num_bt_l0 = bt_chunk_num; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index f13c9c3e56d4..878c8ae35630 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -818,7 +818,7 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev) attr.dest_qp_num = hr_qp->qpn; memcpy(rdma_ah_retrieve_dmac(&attr.ah_attr), hr_dev->dev_addr[port], - MAC_ADDR_OCTET_NUM); + ETH_ALEN); memcpy(&dgid.raw, &subnet_prefix, sizeof(u64)); memcpy(&dgid.raw[8], hr_dev->dev_addr[port], 3); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 1c7e6e6f6972..7fcec998618a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -3426,7 +3426,9 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp, else roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M, - V2_QPC_BYTE_4_SGE_SHIFT_S, hr_qp->sq.max_gs > 2 ? + V2_QPC_BYTE_4_SGE_SHIFT_S, + hr_qp->sq.max_gs > + HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE ? 
ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0); roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M, @@ -3708,13 +3710,14 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, roce_set_field(qpc_mask->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_SGID_IDX_M, V2_QPC_BYTE_20_SGID_IDX_S, 0); - memcpy(&(context->dmac), dmac, 4); + memcpy(&(context->dmac), dmac, sizeof(u32)); roce_set_field(context->byte_52_udpspn_dmac, V2_QPC_BYTE_52_DMAC_M, V2_QPC_BYTE_52_DMAC_S, *((u16 *)(&dmac[4]))); qpc_mask->dmac = 0; roce_set_field(qpc_mask->byte_52_udpspn_dmac, V2_QPC_BYTE_52_DMAC_M, V2_QPC_BYTE_52_DMAC_S, 0); + /* mtu*(2^LP_PKTN_INI) should not bigger than 1 message length 64kb */ roce_set_field(context->byte_56_dqpn_err, V2_QPC_BYTE_56_LP_PKTN_INI_M, V2_QPC_BYTE_56_LP_PKTN_INI_S, 4); roce_set_field(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_LP_PKTN_INI_M, @@ -3756,6 +3759,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, roce_set_field(qpc_mask->byte_132_trrl, V2_QPC_BYTE_132_TRRL_TAIL_MAX_M, V2_QPC_BYTE_132_TRRL_TAIL_MAX_S, 0); + /* rocee send 2^lp_sgen_ini segs every time */ roce_set_field(context->byte_168_irrl_idx, V2_QPC_BYTE_168_LP_SGEN_INI_M, V2_QPC_BYTE_168_LP_SGEN_INI_S, 3); @@ -3810,14 +3814,15 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S, 0); page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT); - context->sq_cur_sge_blk_addr = - ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > 2) ? - ((u32)(mtts[hr_qp->sge.offset / page_size] - >> PAGE_ADDR_SHIFT)) : 0; + context->sq_cur_sge_blk_addr = ((ibqp->qp_type == IB_QPT_GSI) || + hr_qp->sq.max_gs > HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) ? + ((u32)(mtts[hr_qp->sge.offset / page_size] >> + PAGE_ADDR_SHIFT)) : 0; roce_set_field(context->byte_184_irrl_idx, V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M, V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S, - ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > 2) ? + ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > + HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) ? 
(mtts[hr_qp->sge.offset / page_size] >> (32 + PAGE_ADDR_SHIFT)) : 0); qpc_mask->sq_cur_sge_blk_addr = 0; @@ -4144,7 +4149,7 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, roce_set_field(context->byte_224_retry_msg, V2_QPC_BYTE_224_RETRY_MSG_PSN_M, V2_QPC_BYTE_224_RETRY_MSG_PSN_S, - attr->sq_psn >> 16); + attr->sq_psn >> V2_QPC_BYTE_220_RETRY_MSG_PSN_S); roce_set_field(qpc_mask->byte_224_retry_msg, V2_QPC_BYTE_224_RETRY_MSG_PSN_M, V2_QPC_BYTE_224_RETRY_MSG_PSN_S, 0); @@ -4374,11 +4379,12 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, V2_QPC_BYTE_56_DQPN_M, V2_QPC_BYTE_56_DQPN_S); qp_attr->qp_access_flags = ((roce_get_bit(context->byte_76_srqn_op_en, - V2_QPC_BYTE_76_RRE_S)) << 2) | - ((roce_get_bit(context->byte_76_srqn_op_en, - V2_QPC_BYTE_76_RWE_S)) << 1) | - ((roce_get_bit(context->byte_76_srqn_op_en, - V2_QPC_BYTE_76_ATE_S)) << 3); + V2_QPC_BYTE_76_RRE_S)) << V2_QP_RWE_S) | + ((roce_get_bit(context->byte_76_srqn_op_en, + V2_QPC_BYTE_76_RWE_S)) << V2_QP_RRE_S) | + ((roce_get_bit(context->byte_76_srqn_op_en, + V2_QPC_BYTE_76_ATE_S)) << V2_QP_ATE_S); + if (hr_qp->ibqp.qp_type == IB_QPT_RC || hr_qp->ibqp.qp_type == IB_QPT_UC) { struct ib_global_route *grh = @@ -5150,8 +5156,8 @@ static void hns_roce_mhop_free_eq(struct hns_roce_dev *hr_dev, dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i], eq->l1_dma[i]); - for (j = 0; j < bt_chk_sz / 8; j++) { - idx = i * (bt_chk_sz / 8) + j; + for (j = 0; j < bt_chk_sz / BA_BYTE_LEN; j++) { + idx = i * (bt_chk_sz / BA_BYTE_LEN) + j; if ((i == eq->l0_last_num - 1) && j == eq->l1_last_num - 1) { eqe_alloc = (buf_chk_sz / eq->eqe_size) @@ -5367,9 +5373,9 @@ static int hns_roce_mhop_alloc_eq(struct hns_roce_dev *hr_dev, buf_chk_sz = 1 << (hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT); bt_chk_sz = 1 << (hr_dev->caps.eqe_ba_pg_sz + PAGE_SHIFT); - ba_num = (PAGE_ALIGN(eq->entries * eq->eqe_size) + buf_chk_sz - 1) - / buf_chk_sz; - bt_num = (ba_num + bt_chk_sz / 8 - 1) / (bt_chk_sz / 8); + ba_num = DIV_ROUND_UP(PAGE_ALIGN(eq->entries * eq->eqe_size), + buf_chk_sz); + bt_num = DIV_ROUND_UP(ba_num, bt_chk_sz / BA_BYTE_LEN); /* hop_num = 0 */ if (mhop_num == HNS_ROCE_HOP_NUM_0) { @@ -5414,12 +5420,12 @@ static int hns_roce_mhop_alloc_eq(struct hns_roce_dev *hr_dev, goto err_dma_alloc_l0; if (mhop_num == 1) { - if (ba_num > (bt_chk_sz / 8)) + if (ba_num > (bt_chk_sz / BA_BYTE_LEN)) dev_err(dev, "ba_num %d is too large for 1 hop\n", ba_num); /* alloc buf */ - for (i = 0; i < bt_chk_sz / 8; i++) { + for (i = 0; i < bt_chk_sz / BA_BYTE_LEN; i++) { if (eq_buf_cnt + 1 < ba_num) { size = buf_chk_sz; } else { @@ -5443,7 +5449,7 @@ static int hns_roce_mhop_alloc_eq(struct hns_roce_dev *hr_dev, } else if (mhop_num == 2) { /* alloc L1 BT and buf */ - for (i = 0; i < bt_chk_sz / 8; i++) { + for (i = 0; i < bt_chk_sz / BA_BYTE_LEN; i++) { eq->bt_l1[i] = dma_alloc_coherent(dev, bt_chk_sz, &(eq->l1_dma[i]), GFP_KERNEL); @@ -5451,8 +5457,8 @@ static int hns_roce_mhop_alloc_eq(struct hns_roce_dev *hr_dev, goto err_dma_alloc_l1; *(eq->bt_l0 + i) = eq->l1_dma[i]; - for (j = 0; j < bt_chk_sz / 8; j++) { - idx = i * bt_chk_sz / 8 + j; + for (j = 0; j < bt_chk_sz / BA_BYTE_LEN; j++) { + idx = i * bt_chk_sz / BA_BYTE_LEN + j; if (eq_buf_cnt + 1 < ba_num) { size = buf_chk_sz; } else { @@ -5497,8 +5503,8 @@ err_dma_alloc_l1: dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i], eq->l1_dma[i]); - for (j = 0; j < bt_chk_sz / 8; j++) { - idx = i * bt_chk_sz / 8 + j; + for (j = 0; j < bt_chk_sz / BA_BYTE_LEN; j++) { + idx = i * bt_chk_sz / BA_BYTE_LEN + j; 
dma_free_coherent(dev, buf_chk_sz, eq->buf[idx], eq->buf_dma[idx]); } @@ -5521,11 +5527,11 @@ err_dma_alloc_buf: dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i], eq->l1_dma[i]); - for (j = 0; j < bt_chk_sz / 8; j++) { + for (j = 0; j < bt_chk_sz / BA_BYTE_LEN; j++) { if (i == record_i && j >= record_j) break; - idx = i * bt_chk_sz / 8 + j; + idx = i * bt_chk_sz / BA_BYTE_LEN + j; dma_free_coherent(dev, buf_chk_sz, eq->buf[idx], eq->buf_dma[idx]); @@ -5982,7 +5988,7 @@ static int find_empty_entry(struct hns_roce_idx_que *idx_que) bit_num = ffs(idx_que->bitmap[i]); idx_que->bitmap[i] &= ~(1ULL << (bit_num - 1)); - return i * sizeof(u64) * 8 + (bit_num - 1); + return i * BITS_PER_LONG_LONG + (bit_num - 1); } static void fill_idx_queue(struct hns_roce_idx_que *idx_que, @@ -6058,7 +6064,8 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq, */ wmb(); - srq_db.byte_4 = HNS_ROCE_V2_SRQ_DB << 24 | srq->srqn; + srq_db.byte_4 = HNS_ROCE_V2_SRQ_DB << V2_DB_BYTE_4_CMD_S | + (srq->srqn & V2_DB_BYTE_4_TAG_M); srq_db.parameter = srq->head; hns_roce_write64(hr_dev, (__le32 *)&srq_db, srq->db_reg_l); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 0ac177ad6b79..bce21fd2ebb6 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -886,6 +886,10 @@ struct hns_roce_v2_qp_context { #define V2_QPC_BYTE_256_SQ_FLUSH_IDX_S 16 #define V2_QPC_BYTE_256_SQ_FLUSH_IDX_M GENMASK(31, 16) +#define V2_QP_RWE_S 1 /* rdma write enable */ +#define V2_QP_RRE_S 2 /* rdma read enable */ +#define V2_QP_ATE_S 3 /* rdma atomic enable */ + struct hns_roce_v2_cqe { __le32 byte_4; union { diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 05a0f7d50ce3..a6c5c67d0b87 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -64,10 +64,10 @@ static int hns_roce_set_mac(struct hns_roce_dev *hr_dev, u8 port, u8 *addr) u8 phy_port; u32 i = 0; - if (!memcmp(hr_dev->dev_addr[port], addr, MAC_ADDR_OCTET_NUM)) + if (!memcmp(hr_dev->dev_addr[port], addr, ETH_ALEN)) return 0; - for (i = 0; i < MAC_ADDR_OCTET_NUM; i++) + for (i = 0; i < ETH_ALEN; i++) hr_dev->dev_addr[port][i] = addr[i]; phy_port = hr_dev->iboe.phy_port[port]; @@ -262,7 +262,8 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u8 port_num, props->active_mtu = mtu ? min(props->max_mtu, mtu) : IB_MTU_256; props->state = (netif_running(net_dev) && netif_carrier_ok(net_dev)) ? IB_PORT_ACTIVE : IB_PORT_DOWN; - props->phys_state = (props->state == IB_PORT_ACTIVE) ? 5 : 3; + props->phys_state = (props->state == IB_PORT_ACTIVE) ? 
+ HNS_ROCE_PHY_LINKUP : HNS_ROCE_PHY_DISABLED; spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 6110ec408626..38ed4ac741b5 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -314,11 +314,11 @@ static void hns_roce_loop_free(struct hns_roce_dev *hr_dev, dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i], mr->pbl_l1_dma_addr[i]); - for (j = 0; j < pbl_bt_sz / 8; j++) { + for (j = 0; j < pbl_bt_sz / BA_BYTE_LEN; j++) { if (i == loop_i && j >= loop_j) break; - bt_idx = i * pbl_bt_sz / 8 + j; + bt_idx = i * pbl_bt_sz / BA_BYTE_LEN + j; dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l2[bt_idx], mr->pbl_l2_dma_addr[bt_idx]); @@ -329,8 +329,8 @@ static void hns_roce_loop_free(struct hns_roce_dev *hr_dev, dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i], mr->pbl_l1_dma_addr[i]); - for (j = 0; j < pbl_bt_sz / 8; j++) { - bt_idx = i * pbl_bt_sz / 8 + j; + for (j = 0; j < pbl_bt_sz / BA_BYTE_LEN; j++) { + bt_idx = i * pbl_bt_sz / BA_BYTE_LEN + j; dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l2[bt_idx], mr->pbl_l2_dma_addr[bt_idx]); @@ -533,7 +533,7 @@ static int hns_roce_mr_alloc(struct hns_roce_dev *hr_dev, u32 pd, u64 iova, { struct device *dev = hr_dev->dev; unsigned long index = 0; - int ret = 0; + int ret; /* Allocate a key for mr from mr_table */ ret = hns_roce_bitmap_alloc(&hr_dev->mr_table.mtpt_bitmap, &index); @@ -559,7 +559,8 @@ static int hns_roce_mr_alloc(struct hns_roce_dev *hr_dev, u32 pd, u64 iova, mr->pbl_l0_dma_addr = 0; } else { if (!hr_dev->caps.pbl_hop_num) { - mr->pbl_buf = dma_alloc_coherent(dev, npages * 8, + mr->pbl_buf = dma_alloc_coherent(dev, + npages * BA_BYTE_LEN, &(mr->pbl_dma_addr), GFP_KERNEL); if (!mr->pbl_buf) @@ -590,9 +591,8 @@ static void hns_roce_mhop_free(struct hns_roce_dev *hr_dev, if (mhop_num == HNS_ROCE_HOP_NUM_0) return; - /* hop_num = 1 */ if (mhop_num == 1) { - dma_free_coherent(dev, (unsigned int)(npages * 8), + dma_free_coherent(dev, (unsigned int)(npages * BA_BYTE_LEN), mr->pbl_buf, mr->pbl_dma_addr); return; } @@ -603,12 +603,13 @@ static void hns_roce_mhop_free(struct hns_roce_dev *hr_dev, if (mhop_num == 2) { for (i = 0; i < mr->l0_chunk_last_num; i++) { if (i == mr->l0_chunk_last_num - 1) { - npages_allocated = i * (pbl_bt_sz / 8); + npages_allocated = + i * (pbl_bt_sz / BA_BYTE_LEN); dma_free_coherent(dev, - (npages - npages_allocated) * 8, - mr->pbl_bt_l1[i], - mr->pbl_l1_dma_addr[i]); + (npages - npages_allocated) * BA_BYTE_LEN, + mr->pbl_bt_l1[i], + mr->pbl_l1_dma_addr[i]); break; } @@ -621,16 +622,17 @@ static void hns_roce_mhop_free(struct hns_roce_dev *hr_dev, dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i], mr->pbl_l1_dma_addr[i]); - for (j = 0; j < pbl_bt_sz / 8; j++) { - bt_idx = i * (pbl_bt_sz / 8) + j; + for (j = 0; j < pbl_bt_sz / BA_BYTE_LEN; j++) { + bt_idx = i * (pbl_bt_sz / BA_BYTE_LEN) + j; if ((i == mr->l0_chunk_last_num - 1) && j == mr->l1_chunk_last_num - 1) { npages_allocated = bt_idx * - (pbl_bt_sz / 8); + (pbl_bt_sz / BA_BYTE_LEN); dma_free_coherent(dev, - (npages - npages_allocated) * 8, + (npages - npages_allocated) * + BA_BYTE_LEN, mr->pbl_bt_l2[bt_idx], mr->pbl_l2_dma_addr[bt_idx]); @@ -675,7 +677,8 @@ static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, npages = ib_umem_page_count(mr->umem); if (!hr_dev->caps.pbl_hop_num) - dma_free_coherent(dev, (unsigned int)(npages * 8), + dma_free_coherent(dev, + (unsigned int)(npages * BA_BYTE_LEN), mr->pbl_buf, 
mr->pbl_dma_addr); else hns_roce_mhop_free(hr_dev, mr); @@ -1059,6 +1062,7 @@ static int hns_roce_ib_umem_write_mr(struct hns_roce_dev *hr_dev, for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) { page_addr = sg_page_iter_dma_address(&sg_iter); if (!hr_dev->caps.pbl_hop_num) { + /* for hip06, page addr is aligned to 4K */ mr->pbl_buf[i++] = page_addr >> 12; } else if (hr_dev->caps.pbl_hop_num == 1) { mr->pbl_buf[i++] = page_addr; @@ -1069,7 +1073,7 @@ static int hns_roce_ib_umem_write_mr(struct hns_roce_dev *hr_dev, mr->pbl_bt_l2[i][j] = page_addr; j++; - if (j >= (pbl_bt_sz / 8)) { + if (j >= (pbl_bt_sz / BA_BYTE_LEN)) { i++; j = 0; } @@ -1117,7 +1121,8 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } else { u64 pbl_size = 1; - bt_size = (1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT)) / 8; + bt_size = (1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT)) / + BA_BYTE_LEN; for (i = 0; i < hr_dev->caps.pbl_hop_num; i++) pbl_size *= bt_size; if (n > pbl_size) { From cfcc048ca76e4927f163f178f59d557588ba32f7 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 25 May 2019 20:57:37 +0800 Subject: [PATCH 030/194] IB/hfi1: Remove set but not used variables 'offset' and 'fspsn' Fixes gcc '-Wunused-but-set-variable' warning: drivers/infiniband/hw/hfi1/tid_rdma.c: In function tid_rdma_rcv_error: drivers/infiniband/hw/hfi1/tid_rdma.c:2029:7: warning: variable offset set but not used [-Wunused-but-set-variable] drivers/infiniband/hw/hfi1/tid_rdma.c: In function hfi1_rc_rcv_tid_rdma_ack: drivers/infiniband/hw/hfi1/tid_rdma.c:4555:35: warning: variable fspsn set but not used [-Wunused-but-set-variable] 'offset' is never used since introduction in commit d0d564a1caac ("IB/hfi1: Add functions to receive TID RDMA READ request") 'fspsn' is never used since introduction in commit 9e93e967f7b4 ("IB/hfi1: Add a function to receive TID RDMA ACK packet") Signed-off-by: YueHaibing Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/tid_rdma.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 6fb93032fbef..bdf1c313e13f 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2026,7 +2026,6 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, e->lpsn, req); if (e->opcode == TID_OP(READ_REQ)) { struct ib_reth *reth; - u32 offset; u32 len; u32 rkey; u64 vaddr; @@ -2038,7 +2037,6 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, * The requester always restarts from the start of the original * request.
*/ - offset = delta_psn(psn, e->psn) * qp->pmtu; len = be32_to_cpu(reth->length); if (psn != e->psn || len != req->total_len) goto unlock; @@ -4552,7 +4550,7 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) struct rvt_swqe *wqe; struct tid_rdma_request *req; struct tid_rdma_flow *flow; - u32 aeth, psn, req_psn, ack_psn, fspsn, resync_psn, ack_kpsn; + u32 aeth, psn, req_psn, ack_psn, resync_psn, ack_kpsn; unsigned long flags; u16 fidx; @@ -4756,7 +4754,6 @@ done: IB_AETH_CREDIT_MASK) { case 0: /* PSN sequence error */ flow = &req->flows[req->acked_tail]; - fspsn = full_flow_psn(flow, flow->flow_state.spsn); trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow); req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2])); From ea996974589e7987eb463d8a7c404358244755ea Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Fri, 24 May 2019 18:45:22 -0700 Subject: [PATCH 031/194] RDMA: Convert put_page() to put_user_page*() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For infiniband code that retains pages via get_user_pages*(), release those pages via the new put_user_page(), or put_user_pages*(), instead of put_page() This is a tiny part of the second step of fixing the problem described in [1]. The steps are: 1) Provide put_user_page*() routines, intended to be used for releasing pages that were pinned via get_user_pages*(). 2) Convert all of the call sites for get_user_pages*(), to invoke put_user_page*(), instead of put_page(). This involves dozens of call sites, and will take some time. 3) After (2) is complete, use get_user_pages*() and put_user_page*() to implement tracking of these pages. This tracking will be separate from the existing struct page refcounting. 4) Use the tracking and identification of these pages, to implement special handling (especially in writeback paths) when the pages are backed by a filesystem. Again, [1] provides details as to why that is desirable. 
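Concretely, every call-site conversion has the same shape; a minimal before/after sketch based on the ib_umem release hunk further below:

	/* before: pages pinned via get_user_pages*() released as ordinary pages */
	if (!PageDirty(page) && umem->writable && dirty)
		set_page_dirty_lock(page);
	put_page(page);

	/* after: released through the dedicated put_user_page*() helpers */
	if (umem->writable && dirty)
		put_user_pages_dirty_lock(&page, 1);
	else
		put_user_page(page);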
[1] https://lwn.net/Articles/753027/ : "The Trouble with get_user_pages()" Reviewed-by: Jan Kara Reviewed-by: Dennis Dalessandro Reviewed-by: Ira Weiny Reviewed-by: Jérôme Glisse Acked-by: Jason Gunthorpe Tested-by: Ira Weiny Signed-off-by: John Hubbard Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem.c | 7 ++++--- drivers/infiniband/core/umem_odp.c | 10 +++++----- drivers/infiniband/hw/hfi1/user_pages.c | 11 ++++------- drivers/infiniband/hw/mthca/mthca_memfree.c | 6 +++--- drivers/infiniband/hw/qib/qib_user_pages.c | 11 ++++------- drivers/infiniband/hw/qib/qib_user_sdma.c | 6 +++--- drivers/infiniband/hw/usnic/usnic_uiom.c | 7 ++++--- 7 files changed, 27 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 7edc5839606b..54628ef879f0 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -54,9 +54,10 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) { page = sg_page_iter_page(&sg_iter); - if (!PageDirty(page) && umem->writable && dirty) - set_page_dirty_lock(page); - put_page(page); + if (umem->writable && dirty) + put_user_pages_dirty_lock(&page, 1); + else + put_user_page(page); } sg_free_table(&umem->sg_head); diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index c3b3c523401f..9001cc10770a 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -482,7 +482,7 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) * The function returns -EFAULT if the DMA mapping operation fails. It returns * -EAGAIN if a concurrent invalidation prevents us from updating the page. * - * The page is released via put_page even if the operation failed. For + * The page is released via put_user_page even if the operation failed. For * on-demand pinning, the page is released whenever it isn't stored in the * umem. */ @@ -530,7 +530,7 @@ static int ib_umem_odp_map_dma_single_page( } out: - put_page(page); + put_user_page(page); if (remove_existing_mapping) { ib_umem_notifier_start_account(umem_odp); @@ -653,7 +653,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, ret = -EFAULT; break; } - put_page(local_page_list[j]); + put_user_page(local_page_list[j]); continue; } @@ -680,8 +680,8 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, * ib_umem_odp_map_dma_single_page(). 
*/ if (npages - (j + 1) > 0) - release_pages(&local_page_list[j+1], - npages - (j + 1)); + put_user_pages(&local_page_list[j+1], + npages - (j + 1)); break; } } diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c index 02eee8eff1db..b89a9b9aef7a 100644 --- a/drivers/infiniband/hw/hfi1/user_pages.c +++ b/drivers/infiniband/hw/hfi1/user_pages.c @@ -118,13 +118,10 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np void hfi1_release_user_pages(struct mm_struct *mm, struct page **p, size_t npages, bool dirty) { - size_t i; - - for (i = 0; i < npages; i++) { - if (dirty) - set_page_dirty_lock(p[i]); - put_page(p[i]); - } + if (dirty) + put_user_pages_dirty_lock(p, npages); + else + put_user_pages(p, npages); if (mm) { /* during close after signal, mm can be NULL */ atomic64_sub(npages, &mm->pinned_vm); diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c index 8ff0e90d7564..edccfd6e178f 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.c +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -482,7 +482,7 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar, ret = pci_map_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE); if (ret < 0) { - put_page(pages[0]); + put_user_page(pages[0]); goto out; } @@ -490,7 +490,7 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar, mthca_uarc_virt(dev, uar, i)); if (ret) { pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE); - put_page(sg_page(&db_tab->page[i].mem)); + put_user_page(sg_page(&db_tab->page[i].mem)); goto out; } @@ -556,7 +556,7 @@ void mthca_cleanup_user_db_tab(struct mthca_dev *dev, struct mthca_uar *uar, if (db_tab->page[i].uvirt) { mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, uar, i), 1); pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE); - put_page(sg_page(&db_tab->page[i].mem)); + put_user_page(sg_page(&db_tab->page[i].mem)); } } diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index f712fb7fa82f..bfbfbb7e0ff4 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -40,13 +40,10 @@ static void __qib_release_user_pages(struct page **p, size_t num_pages, int dirty) { - size_t i; - - for (i = 0; i < num_pages; i++) { - if (dirty) - set_page_dirty_lock(p[i]); - put_page(p[i]); - } + if (dirty) + put_user_pages_dirty_lock(p, num_pages); + else + put_user_pages(p, num_pages); } /** diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c index 0c204776263f..ac5bdb02144f 100644 --- a/drivers/infiniband/hw/qib/qib_user_sdma.c +++ b/drivers/infiniband/hw/qib/qib_user_sdma.c @@ -317,7 +317,7 @@ static int qib_user_sdma_page_to_frags(const struct qib_devdata *dd, * the caller can ignore this page. 
*/ if (put) { - put_page(page); + put_user_page(page); } else { /* coalesce case */ kunmap(page); @@ -631,7 +631,7 @@ static void qib_user_sdma_free_pkt_frag(struct device *dev, kunmap(pkt->addr[i].page); if (pkt->addr[i].put_page) - put_page(pkt->addr[i].page); + put_user_page(pkt->addr[i].page); else __free_page(pkt->addr[i].page); } else if (pkt->addr[i].kvaddr) { @@ -706,7 +706,7 @@ static int qib_user_sdma_pin_pages(const struct qib_devdata *dd, /* if error, return all pages not managed by pkt */ free_pages: while (i < j) - put_page(pages[i++]); + put_user_page(pages[i++]); done: return ret; diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index e312f522a66d..0b0237d41613 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -75,9 +75,10 @@ static void usnic_uiom_put_pages(struct list_head *chunk_list, int dirty) for_each_sg(chunk->page_list, sg, chunk->nents, i) { page = sg_page(sg); pa = sg_phys(sg); - if (!PageDirty(page) && dirty) - set_page_dirty_lock(page); - put_page(page); + if (dirty) + put_user_pages_dirty_lock(&page, 1); + else + put_user_page(page); usnic_dbg("pa: %pa\n", &pa); } kfree(chunk); From 5f5e4eb4fb2d6c09db18dc431a0c4a3b11ff5bae Mon Sep 17 00:00:00 2001 From: Dennis Dalessandro Date: Fri, 24 May 2019 11:44:58 -0400 Subject: [PATCH 032/194] IB/hfi1: Remove extra brackets from an if A recent patch to hfi1 left behind a checkpatch error. Fixes: fb24ea52f78e ("drivers: Remove explicit invocations of mmiowb()") Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/pio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index 16ba9d52e1b9..919008620dd3 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -1577,9 +1577,8 @@ void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint) else sc_del_credit_return_intr(sc); trace_hfi1_wantpiointr(sc, needint, sc->credit_ctrl); - if (needint) { + if (needint) sc_return_credits(sc); - } } /** From 255efcaeb623dd9777523b98d29aa5a0792d9245 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 28 May 2019 15:46:14 +0300 Subject: [PATCH 033/194] RDMA/efa: Use kvzalloc instead of kzalloc with fallback Use kvzalloc which attempts to allocate a physically continuous buffer and fallbacks to virtually continuous on failure instead of open coding it in the driver. The is_vmalloc_addr function is used to determine whether the buffer is physically continuous or not (which determines direct vs indirect MR registration mode). 
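A minimal sketch of the resulting allocation pattern is shown below; the buffer and variable names are illustrative rather than the driver's, while kvzalloc(), is_vmalloc_addr() and kvfree() are the kernel helpers the patch relies on.

#include <linux/mm.h>
#include <linux/slab.h>

static void *alloc_buf_example(size_t size, bool *physically_contiguous)
{
	/* kmalloc first, transparent fallback to vmalloc on failure */
	void *buf = kvzalloc(size, GFP_KERNEL);

	if (!buf)
		return NULL;

	/* vmalloc memory is only virtually contiguous */
	*physically_contiguous = !is_vmalloc_addr(buf);
	return buf;
}

/* the buffer is later released with kvfree(buf), which handles both cases */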
Suggested-by: Jason Gunthorpe Reviewed-by: Firas JahJah Reviewed-by: Yossi Leybovich Signed-off-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_verbs.c | 52 +++++++++++++-------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 4999a74cee24..b791cc90d106 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1285,30 +1285,30 @@ static int pbl_create(struct efa_dev *dev, int err; pbl->pbl_buf_size_in_bytes = hp_cnt * EFA_CHUNK_PAYLOAD_PTR_SIZE; - pbl->pbl_buf = kzalloc(pbl->pbl_buf_size_in_bytes, - GFP_KERNEL | __GFP_NOWARN); - if (pbl->pbl_buf) { + pbl->pbl_buf = kvzalloc(pbl->pbl_buf_size_in_bytes, GFP_KERNEL); + if (!pbl->pbl_buf) + return -ENOMEM; + + if (is_vmalloc_addr(pbl->pbl_buf)) { + pbl->physically_continuous = 0; + err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt, + hp_shift); + if (err) + goto err_free; + + err = pbl_indirect_initialize(dev, pbl); + if (err) + goto err_free; + } else { pbl->physically_continuous = 1; err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt, hp_shift); if (err) - goto err_continuous; + goto err_free; + err = pbl_continuous_initialize(dev, pbl); if (err) - goto err_continuous; - } else { - pbl->physically_continuous = 0; - pbl->pbl_buf = vzalloc(pbl->pbl_buf_size_in_bytes); - if (!pbl->pbl_buf) - return -ENOMEM; - - err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt, - hp_shift); - if (err) - goto err_indirect; - err = pbl_indirect_initialize(dev, pbl); - if (err) - goto err_indirect; + goto err_free; } ibdev_dbg(&dev->ibdev, @@ -1317,24 +1317,20 @@ static int pbl_create(struct efa_dev *dev, return 0; -err_continuous: - kfree(pbl->pbl_buf); - return err; -err_indirect: - vfree(pbl->pbl_buf); +err_free: + kvfree(pbl->pbl_buf); return err; } static void pbl_destroy(struct efa_dev *dev, struct pbl_context *pbl) { - if (pbl->physically_continuous) { + if (pbl->physically_continuous) dma_unmap_single(&dev->pdev->dev, pbl->phys.continuous.dma_addr, pbl->pbl_buf_size_in_bytes, DMA_TO_DEVICE); - kfree(pbl->pbl_buf); - } else { + else pbl_indirect_terminate(dev, pbl); - vfree(pbl->pbl_buf); - } + + kvfree(pbl->pbl_buf); } static int efa_create_inline_pbl(struct efa_dev *dev, struct efa_mr *mr, From e0e3f39759151fb1b445a0dbc5d6a14f3e1732aa Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 28 May 2019 15:46:15 +0300 Subject: [PATCH 034/194] RDMA/efa: Remove unneeded admin commands abort flow The admin commands abort flow is buggy (use-after-free) and not really necessary as it is guaranteed that after ib_unregister_device() is called there are no user verbs threads running in parallel, delete it. 
Suggested-by: Jason Gunthorpe Reviewed-by: Firas JahJah Reviewed-by: Yossi Leybovich Signed-off-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_com.c | 74 +---------------------------- drivers/infiniband/hw/efa/efa_com.h | 1 - 2 files changed, 1 insertion(+), 74 deletions(-) diff --git a/drivers/infiniband/hw/efa/efa_com.c b/drivers/infiniband/hw/efa/efa_com.c index a5c788741a04..ec04ced9fd2b 100644 --- a/drivers/infiniband/hw/efa/efa_com.c +++ b/drivers/infiniband/hw/efa/efa_com.c @@ -39,8 +39,6 @@ enum efa_cmd_status { EFA_CMD_SUBMITTED, EFA_CMD_COMPLETED, - /* Abort - canceled by the driver */ - EFA_CMD_ABORTED, }; struct efa_comp_ctx { @@ -532,16 +530,6 @@ static int efa_com_wait_and_process_admin_cq_polling(struct efa_comp_ctx *comp_c msleep(aq->poll_interval); } - if (comp_ctx->status == EFA_CMD_ABORTED) { - ibdev_err(aq->efa_dev, "Command was aborted\n"); - atomic64_inc(&aq->stats.aborted_cmd); - err = -ENODEV; - goto out; - } - - WARN_ONCE(comp_ctx->status != EFA_CMD_COMPLETED, - "Invalid completion status %d\n", comp_ctx->status); - err = efa_com_comp_status_to_errno(comp_ctx->comp_status); out: efa_com_put_comp_ctx(aq, comp_ctx); @@ -665,66 +653,6 @@ int efa_com_cmd_exec(struct efa_com_admin_queue *aq, return err; } -/** - * efa_com_abort_admin_commands - Abort all the outstanding admin commands. - * @edev: EFA communication layer struct - * - * This method aborts all the outstanding admin commands. - * The caller should then call efa_com_wait_for_abort_completion to make sure - * all the commands were completed. - */ -static void efa_com_abort_admin_commands(struct efa_com_dev *edev) -{ - struct efa_com_admin_queue *aq = &edev->aq; - struct efa_comp_ctx *comp_ctx; - unsigned long flags; - u16 i; - - spin_lock(&aq->sq.lock); - spin_lock_irqsave(&aq->cq.lock, flags); - for (i = 0; i < aq->depth; i++) { - comp_ctx = efa_com_get_comp_ctx(aq, i, false); - if (!comp_ctx) - break; - - comp_ctx->status = EFA_CMD_ABORTED; - - complete(&comp_ctx->wait_event); - } - spin_unlock_irqrestore(&aq->cq.lock, flags); - spin_unlock(&aq->sq.lock); -} - -/** - * efa_com_wait_for_abort_completion - Wait for admin commands abort. - * @edev: EFA communication layer struct - * - * This method wait until all the outstanding admin commands will be completed. - */ -static void efa_com_wait_for_abort_completion(struct efa_com_dev *edev) -{ - struct efa_com_admin_queue *aq = &edev->aq; - int i; - - /* all mine */ - for (i = 0; i < aq->depth; i++) - down(&aq->avail_cmds); - - /* let it go */ - for (i = 0; i < aq->depth; i++) - up(&aq->avail_cmds); -} - -static void efa_com_admin_flush(struct efa_com_dev *edev) -{ - struct efa_com_admin_queue *aq = &edev->aq; - - clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); - - efa_com_abort_admin_commands(edev); - efa_com_wait_for_abort_completion(edev); -} - /** * efa_com_admin_destroy - Destroy the admin and the async events queues. 
* @edev: EFA communication layer struct @@ -737,7 +665,7 @@ void efa_com_admin_destroy(struct efa_com_dev *edev) struct efa_com_admin_sq *sq = &aq->sq; u16 size; - efa_com_admin_flush(edev); + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); devm_kfree(edev->dmadev, aq->comp_ctx_pool); devm_kfree(edev->dmadev, aq->comp_ctx); diff --git a/drivers/infiniband/hw/efa/efa_com.h b/drivers/infiniband/hw/efa/efa_com.h index 84d96724a74b..c67dd8109d1c 100644 --- a/drivers/infiniband/hw/efa/efa_com.h +++ b/drivers/infiniband/hw/efa/efa_com.h @@ -45,7 +45,6 @@ struct efa_com_admin_sq { /* Don't use anything other than atomic64 */ struct efa_com_stats_admin { - atomic64_t aborted_cmd; atomic64_t submitted_cmd; atomic64_t completed_cmd; atomic64_t no_completion; From 4d50e084c560fa952e235d092cf53c86d8bf4b7b Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 28 May 2019 15:46:17 +0300 Subject: [PATCH 035/194] RDMA/efa: Use rdma block iterator in chunk list creation When creating the chunks list the rdma_for_each_block() iterator is used in order to iterate over the payload in EFA_CHUNK_PAYLOAD_SIZE (device defined) strides. Reviewed-by: Firas JahJah Reviewed-by: Yossi Leybovich Reviewed-by: Shiraz Saleem Signed-off-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_verbs.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index b791cc90d106..607aff869200 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1085,14 +1085,14 @@ err: */ static int pbl_chunk_list_create(struct efa_dev *dev, struct pbl_context *pbl) { - unsigned int entry, payloads_in_sg, chunk_list_size, chunk_idx, payload_idx; struct pbl_chunk_list *chunk_list = &pbl->phys.indirect.chunk_list; int page_cnt = pbl->phys.indirect.pbl_buf_size_in_pages; struct scatterlist *pages_sgl = pbl->phys.indirect.sgl; + unsigned int chunk_list_size, chunk_idx, payload_idx; int sg_dma_cnt = pbl->phys.indirect.sg_dma_cnt; struct efa_com_ctrl_buff_info *ctrl_buf; u64 *cur_chunk_buf, *prev_chunk_buf; - struct scatterlist *sg; + struct ib_block_iter biter; dma_addr_t dma_addr; int i; @@ -1126,18 +1126,15 @@ static int pbl_chunk_list_create(struct efa_dev *dev, struct pbl_context *pbl) chunk_idx = 0; payload_idx = 0; cur_chunk_buf = chunk_list->chunks[0].buf; - for_each_sg(pages_sgl, sg, sg_dma_cnt, entry) { - payloads_in_sg = sg_dma_len(sg) >> EFA_CHUNK_PAYLOAD_SHIFT; - for (i = 0; i < payloads_in_sg; i++) { - cur_chunk_buf[payload_idx++] = - (sg_dma_address(sg) & ~(EFA_CHUNK_PAYLOAD_SIZE - 1)) + - (EFA_CHUNK_PAYLOAD_SIZE * i); + rdma_for_each_block(pages_sgl, &biter, sg_dma_cnt, + EFA_CHUNK_PAYLOAD_SIZE) { + cur_chunk_buf[payload_idx++] = + rdma_block_iter_dma_address(&biter); - if (payload_idx == EFA_PTRS_PER_CHUNK) { - chunk_idx++; - cur_chunk_buf = chunk_list->chunks[chunk_idx].buf; - payload_idx = 0; - } + if (payload_idx == EFA_PTRS_PER_CHUNK) { + chunk_idx++; + cur_chunk_buf = chunk_list->chunks[chunk_idx].buf; + payload_idx = 0; } } From 2367d00e2ca3dfda1b30f89d854cb1669b941e7f Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 28 May 2019 15:46:18 +0300 Subject: [PATCH 036/194] RDMA/efa: Remove unused includes Remove leftover includes that are no longer used from the driver. 
Reviewed-by: Firas JahJah Reviewed-by: Yossi Leybovich Signed-off-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa.h | 2 -- drivers/infiniband/hw/efa/efa_com_cmd.c | 1 - 2 files changed, 3 deletions(-) diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h index 9e3cc3239c13..14a36546985b 100644 --- a/drivers/infiniband/hw/efa/efa.h +++ b/drivers/infiniband/hw/efa/efa.h @@ -7,10 +7,8 @@ #define _EFA_H_ #include -#include #include #include -#include #include #include diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c index 14227725521c..91e7f2195802 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.c +++ b/drivers/infiniband/hw/efa/efa_com_cmd.c @@ -3,7 +3,6 @@ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. */ -#include "efa.h" #include "efa_com.h" #include "efa_com_cmd.h" From 34755f596110fb85f4c5b5fbe56aeb86042074bc Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 29 May 2019 10:12:48 -0500 Subject: [PATCH 037/194] IB/rdmavt: Use struct_size() helper Make use of the struct_size() helper instead of an open-coded version in order to avoid any potential type mistakes, in particular in the context in which this code is being used. So, replace the following form: sizeof(struct rvt_sge) * init_attr->cap.max_send_sge + sizeof(struct rvt_swqe) with: struct_size(swq, sg_list, init_attr->cap.max_send_sge) and so on... Also, notice that variable size is unnecessary, hence it is removed. This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Reviewed-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/qp.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 31a2e65e4906..a60f5faea198 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -988,9 +988,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, case IB_QPT_UC: case IB_QPT_RC: case IB_QPT_UD: - sz = sizeof(struct rvt_sge) * - init_attr->cap.max_send_sge + - sizeof(struct rvt_swqe); + sz = struct_size(swq, sg_list, init_attr->cap.max_send_sge); swq = vzalloc_node(array_size(sz, sqsize), rdi->dparms.node); if (!swq) return ERR_PTR(-ENOMEM); From 829ca44ecf60e9b6f83d0161a6ef10c1304c5060 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 29 May 2019 10:13:26 -0500 Subject: [PATCH 038/194] IB/qib: Use struct_size() helper Make use of the struct_size() helper instead of an open-coded version in order to avoid any potential type mistakes, in particular in the context in which this code is being used. So, replace the following form: sizeof(*pkt) + sizeof(pkt->addr[0])*n with: struct_size(pkt, addr, n) Also, notice that variable size is unnecessary, hence it is removed. This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. 
Silva Reviewed-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qib/qib_user_sdma.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c index ac5bdb02144f..05190edc2611 100644 --- a/drivers/infiniband/hw/qib/qib_user_sdma.c +++ b/drivers/infiniband/hw/qib/qib_user_sdma.c @@ -904,10 +904,11 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd, } if (frag_size) { - int pktsize, tidsmsize, n; + int tidsmsize, n; + size_t pktsize; n = npages*((2*PAGE_SIZE/frag_size)+1); - pktsize = sizeof(*pkt) + sizeof(pkt->addr[0])*n; + pktsize = struct_size(pkt, addr, n); /* * Determine if this is tid-sdma or just sdma. From 6fe1a9b9b6542d460099395492d029bb6c6c2f5e Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 29 May 2019 10:15:28 -0500 Subject: [PATCH 039/194] IB/hfi1: Use struct_size() helper Make use of the struct_size() helper instead of an open-coded version in order to avoid any potential type mistakes, in particular in the context in which this code is being used. So, replace the following form: sizeof(struct opa_port_status_rsp) + num_vls * sizeof(struct _vls_pctrs) with: struct_size(rsp, vls, num_vls) and so on... Also, notice that variable size is unnecessary, hence it is removed. This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Reviewed-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/mad.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 4228393e6c4c..184dba3c2828 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -2744,8 +2744,7 @@ static int pma_get_opa_portstatus(struct opa_pma_mad *pmp, u16 link_width; u16 link_speed; - response_data_size = sizeof(struct opa_port_status_rsp) + - num_vls * sizeof(struct _vls_pctrs); + response_data_size = struct_size(rsp, vls, num_vls); if (response_data_size > sizeof(pmp->data)) { pmp->mad_hdr.status |= OPA_PM_STATUS_REQUEST_TOO_LARGE; return reply((struct ib_mad_hdr *)pmp); @@ -3014,8 +3013,7 @@ static int pma_get_opa_datacounters(struct opa_pma_mad *pmp, } /* Sanity check */ - response_data_size = sizeof(struct opa_port_data_counters_msg) + - num_vls * sizeof(struct _vls_dctrs); + response_data_size = struct_size(req, port[0].vls, num_vls); if (response_data_size > sizeof(pmp->data)) { pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; @@ -3232,8 +3230,7 @@ static int pma_get_opa_porterrors(struct opa_pma_mad *pmp, return reply((struct ib_mad_hdr *)pmp); } - response_data_size = sizeof(struct opa_port_error_counters64_msg) + - num_vls * sizeof(struct _vls_ectrs); + response_data_size = struct_size(req, port[0].vls, num_vls); if (response_data_size > sizeof(pmp->data)) { pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; From cac2a301c02a9b178842e22df34217da7854e588 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 30 May 2019 11:20:24 +0300 Subject: [PATCH 040/194] RDMA/uverbs: check for allocation failure in uapi_add_elm() If the kzalloc() fails then we should return ERR_PTR(-ENOMEM). In the current code it's possible that the kzalloc() fails and the radix_tree_insert() inserts the NULL pointer successfully and we return the NULL "elm" pointer to the caller. That results in a NULL pointer dereference. 
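The reason the missing check leads to a crash is the ERR_PTR()/IS_ERR() convention used by the callers: a bare NULL stored in the tree and returned from the function passes the IS_ERR() test and is dereferenced later. A minimal sketch of that convention follows, with generic names that are not taken from the uverbs code.

#include <linux/err.h>
#include <linux/slab.h>

struct elm_example { int payload; };

static struct elm_example *add_elm_example(size_t size)
{
	struct elm_example *elm = kzalloc(size, GFP_KERNEL);

	if (!elm)
		return ERR_PTR(-ENOMEM);	/* never return NULL here */
	return elm;
}

static int caller_example(void)
{
	struct elm_example *elm = add_elm_example(sizeof(*elm));

	if (IS_ERR(elm))	/* note: IS_ERR(NULL) is false */
		return PTR_ERR(elm);
	elm->payload = 1;	/* would oops if elm were NULL */
	return 0;
}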
Fixes: 9ed3e5f44772 ("IB/uverbs: Build the specs into a radix tree at runtime") Signed-off-by: Dan Carpenter Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_uapi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index 7a987acf0c0b..ccc4be0a6566 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -22,6 +22,8 @@ static void *uapi_add_elm(struct uverbs_api *uapi, u32 key, size_t alloc_size) return ERR_PTR(-EOVERFLOW); elm = kzalloc(alloc_size, GFP_KERNEL); + if (!elm) + return ERR_PTR(-ENOMEM); rc = radix_tree_insert(&uapi->radix, key, elm); if (rc) { kfree(elm); From 97545b10221ad14b046dba135a37f4e98a560697 Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Thu, 30 May 2019 23:55:53 +0800 Subject: [PATCH 041/194] RDMA/hns: Bugfix for posting multiple srq work request When the user submits more than 32 work request to a srq queue at a time, it needs to find the corresponding number of entries in the bitmap in the idx queue. However, the original lookup function named ffs only processes 32 bits of the array element, When the number of srq wqe issued exceeds 32, the ffs will only process the lower 32 bits of the elements, it will not be able to get the correct wqe index for srq wqe. Signed-off-by: Xi Wang Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 2 +- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 34 +++++++++++---------- drivers/infiniband/hw/hns/hns_roce_srq.c | 15 ++------- 3 files changed, 22 insertions(+), 29 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index d6e8b446fc57..ce23338831eb 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -489,7 +489,7 @@ struct hns_roce_idx_que { u32 buf_size; struct ib_umem *umem; struct hns_roce_mtt mtt; - u64 *bitmap; + unsigned long *bitmap; }; struct hns_roce_srq { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 7fcec998618a..ac017c24b200 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2340,15 +2340,10 @@ static void *get_srq_wqe(struct hns_roce_srq *srq, int n) static void hns_roce_free_srq_wqe(struct hns_roce_srq *srq, int wqe_index) { - u32 bitmap_num; - int bit_num; - /* always called with interrupts disabled. 
*/ spin_lock(&srq->lock); - bitmap_num = wqe_index / (sizeof(u64) * 8); - bit_num = wqe_index % (sizeof(u64) * 8); - srq->idx_que.bitmap[bitmap_num] |= (1ULL << bit_num); + bitmap_clear(srq->idx_que.bitmap, wqe_index, 1); srq->tail++; spin_unlock(&srq->lock); @@ -5977,18 +5972,19 @@ out: return ret; } -static int find_empty_entry(struct hns_roce_idx_que *idx_que) +static int find_empty_entry(struct hns_roce_idx_que *idx_que, + unsigned long size) { - int bit_num; - int i; + int wqe_idx; - /* bitmap[i] is set zero if all bits are allocated */ - for (i = 0; idx_que->bitmap[i] == 0; ++i) - ; - bit_num = ffs(idx_que->bitmap[i]); - idx_que->bitmap[i] &= ~(1ULL << (bit_num - 1)); + if (unlikely(bitmap_full(idx_que->bitmap, size))) + return -ENOSPC; - return i * BITS_PER_LONG_LONG + (bit_num - 1); + wqe_idx = find_first_zero_bit(idx_que->bitmap, size); + + bitmap_set(idx_que->bitmap, wqe_idx, 1); + + return wqe_idx; } static void fill_idx_queue(struct hns_roce_idx_que *idx_que, @@ -6034,7 +6030,13 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq, break; } - wqe_idx = find_empty_entry(&srq->idx_que); + wqe_idx = find_empty_entry(&srq->idx_que, srq->max); + if (wqe_idx < 0) { + ret = -ENOMEM; + *bad_wr = wr; + break; + } + fill_idx_queue(&srq->idx_que, ind, wqe_idx); wqe = get_srq_wqe(srq, wqe_idx); dseg = (struct hns_roce_v2_wqe_data_seg *)wqe; diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index ad15b41da30a..c222f243953a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -181,28 +181,19 @@ static int hns_roce_create_idx_que(struct ib_pd *pd, struct hns_roce_srq *srq, { struct hns_roce_dev *hr_dev = to_hr_dev(pd->device); struct hns_roce_idx_que *idx_que = &srq->idx_que; - u32 bitmap_num; - int i; - bitmap_num = HNS_ROCE_ALOGN_UP(srq->max, 8 * sizeof(u64)); - - idx_que->bitmap = kcalloc(1, bitmap_num / 8, GFP_KERNEL); + idx_que->bitmap = bitmap_zalloc(srq->max, GFP_KERNEL); if (!idx_que->bitmap) return -ENOMEM; - bitmap_num = bitmap_num / (8 * sizeof(u64)); - idx_que->buf_size = srq->idx_que.buf_size; if (hns_roce_buf_alloc(hr_dev, idx_que->buf_size, (1 << page_shift) * 2, &idx_que->idx_buf, page_shift)) { - kfree(idx_que->bitmap); + bitmap_free(idx_que->bitmap); return -ENOMEM; } - for (i = 0; i < bitmap_num; i++) - idx_que->bitmap[i] = ~(0UL); - return 0; } @@ -395,7 +386,7 @@ err_idx_mtt: err_create_idx: hns_roce_buf_free(hr_dev, srq->idx_que.buf_size, &srq->idx_que.idx_buf); - kfree(srq->idx_que.bitmap); + bitmap_free(srq->idx_que.bitmap); err_srq_mtt: hns_roce_mtt_cleanup(hr_dev, &srq->mtt); From 633fbb06cbb34cb37117a298c4316ad9d54a7a63 Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Wed, 29 May 2019 16:55:45 +0300 Subject: [PATCH 042/194] RDMA/ipoib: implement ethtool .get_link() callback Add support for reporting link state for ipoib net devices. 
$ ip l set dev mlx4_ib0 up $ ethtool mlx4_ib0 | grep Link Link detected: yes $ ip l set dev mlx4_ib0 down $ ethtool mlx4_ib0 | grep Link Link detected: no Signed-off-by: Kamal Heib Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/ipoib/ipoib_ethtool.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index 83429925dfc6..58016532bf86 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -222,6 +222,7 @@ static const struct ethtool_ops ipoib_ethtool_ops = { .get_strings = ipoib_get_strings, .get_ethtool_stats = ipoib_get_ethtool_stats, .get_sset_count = ipoib_get_sset_count, + .get_link = ethtool_op_get_link, }; void ipoib_set_ethtool_ops(struct net_device *dev) From bcef5b7215681250c4bf8961dfe15e9e4fef97d0 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 29 May 2019 09:38:31 -0700 Subject: [PATCH 043/194] RDMA/srp: Accept again source addresses that do not have a port number The function srp_parse_in() is used both for parsing source address specifications and for target address specifications. Target addresses must have a port number. Having to specify a port number for source addresses is inconvenient. Make sure that srp_parse_in() supports again parsing addresses with no port number. Cc: Fixes: c62adb7def71 ("IB/srp: Fix IPv6 address parsing") Signed-off-by: Bart Van Assche Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/srp/ib_srp.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index be9ddcad8f28..87848faa7502 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -3481,13 +3481,14 @@ static const match_table_t srp_opt_tokens = { * @net: [in] Network namespace. * @sa: [out] Address family, IP address and port number. * @addr_port_str: [in] IP address and port number. + * @has_port: [out] Whether or not @addr_port_str includes a port number. * * Parse the following address formats: * - IPv4: :, e.g. 1.2.3.4:5. * - IPv6: \[\]:, e.g. [1::2:3%4]:5. 
*/ static int srp_parse_in(struct net *net, struct sockaddr_storage *sa, - const char *addr_port_str) + const char *addr_port_str, bool *has_port) { char *addr_end, *addr = kstrdup(addr_port_str, GFP_KERNEL); char *port_str; @@ -3496,9 +3497,12 @@ static int srp_parse_in(struct net *net, struct sockaddr_storage *sa, if (!addr) return -ENOMEM; port_str = strrchr(addr, ':'); - if (!port_str) - return -EINVAL; - *port_str++ = '\0'; + if (port_str && strchr(port_str, ']')) + port_str = NULL; + if (port_str) + *port_str++ = '\0'; + if (has_port) + *has_port = port_str != NULL; ret = inet_pton_with_scope(net, AF_INET, addr, port_str, sa); if (ret && addr[0]) { addr_end = addr + strlen(addr) - 1; @@ -3520,6 +3524,7 @@ static int srp_parse_options(struct net *net, const char *buf, char *p; substring_t args[MAX_OPT_ARGS]; unsigned long long ull; + bool has_port; int opt_mask = 0; int token; int ret = -EINVAL; @@ -3618,7 +3623,8 @@ static int srp_parse_options(struct net *net, const char *buf, ret = -ENOMEM; goto out; } - ret = srp_parse_in(net, &target->rdma_cm.src.ss, p); + ret = srp_parse_in(net, &target->rdma_cm.src.ss, p, + NULL); if (ret < 0) { pr_warn("bad source parameter '%s'\n", p); kfree(p); @@ -3634,7 +3640,10 @@ static int srp_parse_options(struct net *net, const char *buf, ret = -ENOMEM; goto out; } - ret = srp_parse_in(net, &target->rdma_cm.dst.ss, p); + ret = srp_parse_in(net, &target->rdma_cm.dst.ss, p, + &has_port); + if (!has_port) + ret = -EINVAL; if (ret < 0) { pr_warn("bad dest parameter '%s'\n", p); kfree(p); From fa027328a1c93d72031e6cc6b4a7eee967fd7406 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 31 May 2019 10:21:01 +0100 Subject: [PATCH 044/194] RDMA/hns: fix inverted logic of readl read and shift A previous change incorrectly changed the inverted logic and logically negated the readl rather than the shifted readl result. Fix this by adding in missing parentheses around the expression that needs to be logically negated. Addresses-Coverity: ("Logically dead code") Fixes: 669cefb654cb ("RDMA/hns: Remove jiffies operation in disable interrupt context") Signed-off-by: Colin Ian King Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index 157c84a1f55f..8490a86c3ef0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -378,7 +378,7 @@ static int hns_roce_set_hem(struct hns_roce_dev *hr_dev, end = HW_SYNC_TIMEOUT_MSECS; while (end) { - if (!readl(bt_cmd) >> BT_CMD_SYNC_SHIFT) + if (!(readl(bt_cmd) >> BT_CMD_SYNC_SHIFT)) break; mdelay(HW_SYNC_SLEEP_TIME_INTERVAL); From 4f18904c78495ec16e271ee507709c626e61a62a Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Fri, 31 May 2019 18:28:03 +0800 Subject: [PATCH 045/194] RDMA/hns: Bugfix for filling the sge of srq When user post recv a srq with multiple sges, the hardware will get the last correct sge and count the sge numbers according to the specific identifier with lkey. For example, when the driver fills the sges with every wr less than the max sge that the user configured when creating srq, the hardware will stop getting the sge according to the specific lkey in the sge. However, it will always end with the first sge in the current post srq recv interface implementation. 
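In other words, the slot after the last posted sge must carry the special lkey so the hardware knows where the scatter list ends, whereas the old code always wrote that terminator into the first entry. The following simplified sketch of the intended fill logic uses stand-in structures and a helper name invented for the example; only the terminator values and the 0x100 lkey come from the patch below.

#include <linux/types.h>
#include <asm/byteorder.h>

struct example_sge  { u32 length; u32 lkey; u64 addr; };	/* stand-in types */
struct example_dseg { __le32 len; __le32 lkey; __le64 addr; };

static void fill_srq_sges_example(struct example_dseg *dseg,
				  const struct example_sge *sg_list,
				  int num_sge, int max_gs)
{
	int i;

	for (i = 0; i < num_sge; i++) {
		dseg[i].len  = cpu_to_le32(sg_list[i].length);
		dseg[i].lkey = cpu_to_le32(sg_list[i].lkey);
		dseg[i].addr = cpu_to_le64(sg_list[i].addr);
	}

	/* terminator entry: hardware stops reading sges at this lkey */
	if (i < max_gs) {
		dseg[i].len  = 0;
		dseg[i].lkey = cpu_to_le32(0x100);
		dseg[i].addr = 0;
	}
}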
Fixes: c7bcb13442e1 ("RDMA/hns: Add SRQ support for hip08 kernel mode") Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index ac017c24b200..2d27dc91a823 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -6048,9 +6048,9 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq, } if (i < srq->max_gs) { - dseg->len = 0; - dseg->lkey = cpu_to_le32(0x100); - dseg->addr = 0; + dseg[i].len = 0; + dseg[i].lkey = cpu_to_le32(0x100); + dseg[i].addr = 0; } srq->wrid[wqe_idx] = wr->wr_id; From 9bcb8940f4c0c58d2bec66e72e019ed58e602b95 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 4 Jun 2019 10:42:22 -0500 Subject: [PATCH 046/194] RDMA/ucma: Use struct_size() helper Make use of the struct_size() helper instead of an open-coded version in order to avoid any potential type mistakes. This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/ucma.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 140a338a135f..cbe460076611 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -951,8 +951,7 @@ static ssize_t ucma_query_path(struct ucma_context *ctx, } } - if (copy_to_user(response, resp, - sizeof(*resp) + (i * sizeof(struct ib_path_rec_data)))) + if (copy_to_user(response, resp, struct_size(resp, path_data, i))) ret = -EFAULT; kfree(resp); From 4dfd5321cf0a22bc43dd215d80d362895e0deb36 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 20 Feb 2019 16:21:04 -0800 Subject: [PATCH 047/194] ucma: Convert multicast_idr to XArray Signed-off-by: Matthew Wilcox Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/ucma.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index cbe460076611..c8a2918546fb 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -104,7 +104,7 @@ struct ucma_context { struct ucma_multicast { struct ucma_context *ctx; - int id; + u32 id; int events_reported; u64 uid; @@ -124,7 +124,7 @@ struct ucma_event { static DEFINE_MUTEX(mut); static DEFINE_IDR(ctx_idr); -static DEFINE_IDR(multicast_idr); +static DEFINE_XARRAY_ALLOC(multicast_table); static const struct file_operations ucma_fops; @@ -238,13 +238,10 @@ static struct ucma_multicast* ucma_alloc_multicast(struct ucma_context *ctx) if (!mc) return NULL; - mutex_lock(&mut); - mc->id = idr_alloc(&multicast_idr, NULL, 0, 0, GFP_KERNEL); - mutex_unlock(&mut); - if (mc->id < 0) + mc->ctx = ctx; + if (xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b, GFP_KERNEL)) goto error; - mc->ctx = ctx; list_add_tail(&mc->list, &ctx->mc_list); return mc; @@ -540,7 +537,7 @@ static void ucma_cleanup_multicast(struct ucma_context *ctx) mutex_lock(&mut); list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) { list_del(&mc->list); - idr_remove(&multicast_idr, mc->id); + xa_erase(&multicast_table, mc->id); kfree(mc); } mutex_unlock(&mut); @@ -1431,9 +1428,7 @@ static ssize_t ucma_process_join(struct ucma_file *file, goto err3; } - mutex_lock(&mut); - idr_replace(&multicast_idr, mc, mc->id); - mutex_unlock(&mut); + 
xa_store(&multicast_table, mc->id, mc, 0); mutex_unlock(&file->mut); ucma_put_ctx(ctx); @@ -1443,9 +1438,7 @@ err3: rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr); ucma_cleanup_mc_events(mc); err2: - mutex_lock(&mut); - idr_remove(&multicast_idr, mc->id); - mutex_unlock(&mut); + xa_erase(&multicast_table, mc->id); list_del(&mc->list); kfree(mc); err1: @@ -1507,8 +1500,8 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file, if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; - mutex_lock(&mut); - mc = idr_find(&multicast_idr, cmd.id); + xa_lock(&multicast_table); + mc = xa_load(&multicast_table, cmd.id); if (!mc) mc = ERR_PTR(-ENOENT); else if (mc->ctx->file != file) @@ -1516,8 +1509,8 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file, else if (!atomic_inc_not_zero(&mc->ctx->ref)) mc = ERR_PTR(-ENXIO); else - idr_remove(&multicast_idr, mc->id); - mutex_unlock(&mut); + __xa_erase(&multicast_table, mc->id); + xa_unlock(&multicast_table); if (IS_ERR(mc)) { ret = PTR_ERR(mc); @@ -1846,7 +1839,6 @@ static void __exit ucma_cleanup(void) device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); misc_deregister(&ucma_misc); idr_destroy(&ctx_idr); - idr_destroy(&multicast_idr); } module_init(ucma_init); From afcafe07af0e0aeddbf40e163663fdf319c34739 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 20 Feb 2019 16:21:05 -0800 Subject: [PATCH 048/194] ucma: Convert ctx_idr to XArray Signed-off-by: Matthew Wilcox Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/ucma.c | 58 ++++++++++++++-------------------- 1 file changed, 24 insertions(+), 34 deletions(-) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index c8a2918546fb..39823c842202 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -81,7 +81,7 @@ struct ucma_file { }; struct ucma_context { - int id; + u32 id; struct completion comp; atomic_t ref; int events_reported; @@ -94,7 +94,7 @@ struct ucma_context { struct list_head list; struct list_head mc_list; /* mark that device is in process of destroying the internal HW - * resources, protected by the global mut + * resources, protected by the ctx_table lock */ int closing; /* sync between removal event and id destroy, protected by file mut */ @@ -122,8 +122,7 @@ struct ucma_event { struct work_struct close_work; }; -static DEFINE_MUTEX(mut); -static DEFINE_IDR(ctx_idr); +static DEFINE_XARRAY_ALLOC(ctx_table); static DEFINE_XARRAY_ALLOC(multicast_table); static const struct file_operations ucma_fops; @@ -133,7 +132,7 @@ static inline struct ucma_context *_ucma_find_context(int id, { struct ucma_context *ctx; - ctx = idr_find(&ctx_idr, id); + ctx = xa_load(&ctx_table, id); if (!ctx) ctx = ERR_PTR(-ENOENT); else if (ctx->file != file || !ctx->cm_id) @@ -145,7 +144,7 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id) { struct ucma_context *ctx; - mutex_lock(&mut); + xa_lock(&ctx_table); ctx = _ucma_find_context(id, file); if (!IS_ERR(ctx)) { if (ctx->closing) @@ -153,7 +152,7 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id) else atomic_inc(&ctx->ref); } - mutex_unlock(&mut); + xa_unlock(&ctx_table); return ctx; } @@ -216,10 +215,7 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) INIT_LIST_HEAD(&ctx->mc_list); ctx->file = file; - mutex_lock(&mut); - ctx->id = idr_alloc(&ctx_idr, ctx, 0, 0, GFP_KERNEL); - mutex_unlock(&mut); - if (ctx->id < 0) + if (xa_alloc(&ctx_table, &ctx->id, ctx, xa_limit_32b, 
GFP_KERNEL)) goto error; list_add_tail(&ctx->list, &file->ctx_list); @@ -316,9 +312,9 @@ static void ucma_removal_event_handler(struct rdma_cm_id *cm_id) * handled separately below. */ if (ctx->cm_id == cm_id) { - mutex_lock(&mut); + xa_lock(&ctx_table); ctx->closing = 1; - mutex_unlock(&mut); + xa_unlock(&ctx_table); queue_work(ctx->file->close_wq, &ctx->close_work); return; } @@ -520,9 +516,7 @@ static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, err2: rdma_destroy_id(cm_id); err1: - mutex_lock(&mut); - idr_remove(&ctx_idr, ctx->id); - mutex_unlock(&mut); + xa_erase(&ctx_table, ctx->id); mutex_lock(&file->mut); list_del(&ctx->list); mutex_unlock(&file->mut); @@ -534,13 +528,13 @@ static void ucma_cleanup_multicast(struct ucma_context *ctx) { struct ucma_multicast *mc, *tmp; - mutex_lock(&mut); + mutex_lock(&ctx->file->mut); list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) { list_del(&mc->list); xa_erase(&multicast_table, mc->id); kfree(mc); } - mutex_unlock(&mut); + mutex_unlock(&ctx->file->mut); } static void ucma_cleanup_mc_events(struct ucma_multicast *mc) @@ -611,11 +605,11 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf, if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; - mutex_lock(&mut); + xa_lock(&ctx_table); ctx = _ucma_find_context(cmd.id, file); if (!IS_ERR(ctx)) - idr_remove(&ctx_idr, ctx->id); - mutex_unlock(&mut); + __xa_erase(&ctx_table, ctx->id); + xa_unlock(&ctx_table); if (IS_ERR(ctx)) return PTR_ERR(ctx); @@ -627,14 +621,14 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf, flush_workqueue(ctx->file->close_wq); /* At this point it's guaranteed that there is no inflight * closing task */ - mutex_lock(&mut); + xa_lock(&ctx_table); if (!ctx->closing) { - mutex_unlock(&mut); + xa_unlock(&ctx_table); ucma_put_ctx(ctx); wait_for_completion(&ctx->comp); rdma_destroy_id(ctx->cm_id); } else { - mutex_unlock(&mut); + xa_unlock(&ctx_table); } resp.events_reported = ucma_free_ctx(ctx); @@ -1607,14 +1601,14 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file, * events being added before existing events. */ ucma_lock_files(cur_file, new_file); - mutex_lock(&mut); + xa_lock(&ctx_table); list_move_tail(&ctx->list, &new_file->ctx_list); ucma_move_events(ctx, new_file); ctx->file = new_file; resp.events_reported = ctx->events_reported; - mutex_unlock(&mut); + xa_unlock(&ctx_table); ucma_unlock_files(cur_file, new_file); response: @@ -1749,18 +1743,15 @@ static int ucma_close(struct inode *inode, struct file *filp) ctx->destroying = 1; mutex_unlock(&file->mut); - mutex_lock(&mut); - idr_remove(&ctx_idr, ctx->id); - mutex_unlock(&mut); - + xa_erase(&ctx_table, ctx->id); flush_workqueue(file->close_wq); /* At that step once ctx was marked as destroying and workqueue * was flushed we are safe from any inflights handlers that * might put other closing task. 
*/ - mutex_lock(&mut); + xa_lock(&ctx_table); if (!ctx->closing) { - mutex_unlock(&mut); + xa_unlock(&ctx_table); ucma_put_ctx(ctx); wait_for_completion(&ctx->comp); /* rdma_destroy_id ensures that no event handlers are @@ -1768,7 +1759,7 @@ static int ucma_close(struct inode *inode, struct file *filp) */ rdma_destroy_id(ctx->cm_id); } else { - mutex_unlock(&mut); + xa_unlock(&ctx_table); } ucma_free_ctx(ctx); @@ -1838,7 +1829,6 @@ static void __exit ucma_cleanup(void) unregister_net_sysctl_table(ucma_ctl_table_hdr); device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); misc_deregister(&ucma_misc); - idr_destroy(&ctx_idr); } module_init(ucma_init); From a1a8e4a85cf7daff8b26c7b8698442ef677b4f97 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 10 Jun 2019 15:02:01 -0300 Subject: [PATCH 049/194] rdma: Delete the ib_ucm module This has been marked CONFIG_BROKEN for over a year now with no complaints. Delete the whole thing for good. The module provided the /dev/infiniband/ucmX interface. Signed-off-by: Jason Gunthorpe --- drivers/infiniband/Kconfig | 11 - drivers/infiniband/core/Makefile | 3 - drivers/infiniband/core/ucm.c | 1350 ------------------------------ include/uapi/rdma/ib_user_cm.h | 326 -------- 4 files changed, 1690 deletions(-) delete mode 100644 drivers/infiniband/core/ucm.c delete mode 100644 include/uapi/rdma/ib_user_cm.h diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index cbfbea49f126..cbaafa4e0302 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -35,17 +35,6 @@ config INFINIBAND_USER_ACCESS libibverbs, libibcm and a hardware driver library from rdma-core . -config INFINIBAND_USER_ACCESS_UCM - tristate "Userspace CM (UCM, DEPRECATED)" - depends on BROKEN || COMPILE_TEST - depends on INFINIBAND_USER_ACCESS - help - The UCM module has known security flaws, which no one is - interested to fix. The user-space part of this code was - dropped from the upstream a long time ago. - - This option is DEPRECATED and planned to be removed. - config INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI bool "Allow experimental legacy verbs in new ioctl uAPI (EXPERIMENTAL)" depends on INFINIBAND_USER_ACCESS diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 313f2349b518..42f1b2a4f746 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -6,7 +6,6 @@ obj-$(CONFIG_INFINIBAND) += ib_core.o ib_cm.o iw_cm.o \ $(infiniband-y) obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o $(user_access-y) -obj-$(CONFIG_INFINIBAND_USER_ACCESS_UCM) += ib_ucm.o $(user_access-y) ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ device.o fmr_pool.o cache.o netlink.o \ @@ -29,8 +28,6 @@ rdma_ucm-y := ucma.o ib_umad-y := user_mad.o -ib_ucm-y := ucm.o - ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ rdma_core.o uverbs_std_types.o uverbs_ioctl.o \ uverbs_std_types_cq.o \ diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c deleted file mode 100644 index 8e7da2d41fd8..000000000000 --- a/drivers/infiniband/core/ucm.c +++ /dev/null @@ -1,1350 +0,0 @@ -/* - * Copyright (c) 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2005 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -#include -#include -#include -#include - -#include "core_priv.h" - -MODULE_AUTHOR("Libor Michalek"); -MODULE_DESCRIPTION("InfiniBand userspace Connection Manager access"); -MODULE_LICENSE("Dual BSD/GPL"); - -struct ib_ucm_device { - int devnum; - struct cdev cdev; - struct device dev; - struct ib_device *ib_dev; -}; - -struct ib_ucm_file { - struct mutex file_mutex; - struct file *filp; - struct ib_ucm_device *device; - - struct list_head ctxs; - struct list_head events; - wait_queue_head_t poll_wait; -}; - -struct ib_ucm_context { - int id; - struct completion comp; - atomic_t ref; - int events_reported; - - struct ib_ucm_file *file; - struct ib_cm_id *cm_id; - __u64 uid; - - struct list_head events; /* list of pending events. 
*/ - struct list_head file_list; /* member in file ctx list */ -}; - -struct ib_ucm_event { - struct ib_ucm_context *ctx; - struct list_head file_list; /* member in file event list */ - struct list_head ctx_list; /* member in ctx event list */ - - struct ib_cm_id *cm_id; - struct ib_ucm_event_resp resp; - void *data; - void *info; - int data_len; - int info_len; -}; - -enum { - IB_UCM_MAJOR = 231, - IB_UCM_BASE_MINOR = 224, - IB_UCM_MAX_DEVICES = RDMA_MAX_PORTS, - IB_UCM_NUM_FIXED_MINOR = 32, - IB_UCM_NUM_DYNAMIC_MINOR = IB_UCM_MAX_DEVICES - IB_UCM_NUM_FIXED_MINOR, -}; - -#define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR) -static dev_t dynamic_ucm_dev; - -static void ib_ucm_add_one(struct ib_device *device); -static void ib_ucm_remove_one(struct ib_device *device, void *client_data); - -static struct ib_client ucm_client = { - .name = "ucm", - .add = ib_ucm_add_one, - .remove = ib_ucm_remove_one -}; - -static DEFINE_XARRAY_ALLOC(ctx_id_table); -static DECLARE_BITMAP(dev_map, IB_UCM_MAX_DEVICES); - -static struct ib_ucm_context *ib_ucm_ctx_get(struct ib_ucm_file *file, int id) -{ - struct ib_ucm_context *ctx; - - xa_lock(&ctx_id_table); - ctx = xa_load(&ctx_id_table, id); - if (!ctx) - ctx = ERR_PTR(-ENOENT); - else if (ctx->file != file) - ctx = ERR_PTR(-EINVAL); - else - atomic_inc(&ctx->ref); - xa_unlock(&ctx_id_table); - - return ctx; -} - -static void ib_ucm_ctx_put(struct ib_ucm_context *ctx) -{ - if (atomic_dec_and_test(&ctx->ref)) - complete(&ctx->comp); -} - -static inline int ib_ucm_new_cm_id(int event) -{ - return event == IB_CM_REQ_RECEIVED || event == IB_CM_SIDR_REQ_RECEIVED; -} - -static void ib_ucm_cleanup_events(struct ib_ucm_context *ctx) -{ - struct ib_ucm_event *uevent; - - mutex_lock(&ctx->file->file_mutex); - list_del(&ctx->file_list); - while (!list_empty(&ctx->events)) { - - uevent = list_entry(ctx->events.next, - struct ib_ucm_event, ctx_list); - list_del(&uevent->file_list); - list_del(&uevent->ctx_list); - mutex_unlock(&ctx->file->file_mutex); - - /* clear incoming connections. 
*/ - if (ib_ucm_new_cm_id(uevent->resp.event)) - ib_destroy_cm_id(uevent->cm_id); - - kfree(uevent); - mutex_lock(&ctx->file->file_mutex); - } - mutex_unlock(&ctx->file->file_mutex); -} - -static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file) -{ - struct ib_ucm_context *ctx; - - ctx = kzalloc(sizeof *ctx, GFP_KERNEL); - if (!ctx) - return NULL; - - atomic_set(&ctx->ref, 1); - init_completion(&ctx->comp); - ctx->file = file; - INIT_LIST_HEAD(&ctx->events); - - if (xa_alloc(&ctx_id_table, &ctx->id, ctx, xa_limit_32b, GFP_KERNEL)) - goto error; - - list_add_tail(&ctx->file_list, &file->ctxs); - return ctx; - -error: - kfree(ctx); - return NULL; -} - -static void ib_ucm_event_req_get(struct ib_ucm_req_event_resp *ureq, - const struct ib_cm_req_event_param *kreq) -{ - ureq->remote_ca_guid = kreq->remote_ca_guid; - ureq->remote_qkey = kreq->remote_qkey; - ureq->remote_qpn = kreq->remote_qpn; - ureq->qp_type = kreq->qp_type; - ureq->starting_psn = kreq->starting_psn; - ureq->responder_resources = kreq->responder_resources; - ureq->initiator_depth = kreq->initiator_depth; - ureq->local_cm_response_timeout = kreq->local_cm_response_timeout; - ureq->flow_control = kreq->flow_control; - ureq->remote_cm_response_timeout = kreq->remote_cm_response_timeout; - ureq->retry_count = kreq->retry_count; - ureq->rnr_retry_count = kreq->rnr_retry_count; - ureq->srq = kreq->srq; - ureq->port = kreq->port; - - ib_copy_path_rec_to_user(&ureq->primary_path, kreq->primary_path); - if (kreq->alternate_path) - ib_copy_path_rec_to_user(&ureq->alternate_path, - kreq->alternate_path); -} - -static void ib_ucm_event_rep_get(struct ib_ucm_rep_event_resp *urep, - const struct ib_cm_rep_event_param *krep) -{ - urep->remote_ca_guid = krep->remote_ca_guid; - urep->remote_qkey = krep->remote_qkey; - urep->remote_qpn = krep->remote_qpn; - urep->starting_psn = krep->starting_psn; - urep->responder_resources = krep->responder_resources; - urep->initiator_depth = krep->initiator_depth; - urep->target_ack_delay = krep->target_ack_delay; - urep->failover_accepted = krep->failover_accepted; - urep->flow_control = krep->flow_control; - urep->rnr_retry_count = krep->rnr_retry_count; - urep->srq = krep->srq; -} - -static void ib_ucm_event_sidr_rep_get(struct ib_ucm_sidr_rep_event_resp *urep, - const struct ib_cm_sidr_rep_event_param *krep) -{ - urep->status = krep->status; - urep->qkey = krep->qkey; - urep->qpn = krep->qpn; -}; - -static int ib_ucm_event_process(const struct ib_cm_event *evt, - struct ib_ucm_event *uvt) -{ - void *info = NULL; - - switch (evt->event) { - case IB_CM_REQ_RECEIVED: - ib_ucm_event_req_get(&uvt->resp.u.req_resp, - &evt->param.req_rcvd); - uvt->data_len = IB_CM_REQ_PRIVATE_DATA_SIZE; - uvt->resp.present = IB_UCM_PRES_PRIMARY; - uvt->resp.present |= (evt->param.req_rcvd.alternate_path ? 
- IB_UCM_PRES_ALTERNATE : 0); - break; - case IB_CM_REP_RECEIVED: - ib_ucm_event_rep_get(&uvt->resp.u.rep_resp, - &evt->param.rep_rcvd); - uvt->data_len = IB_CM_REP_PRIVATE_DATA_SIZE; - break; - case IB_CM_RTU_RECEIVED: - uvt->data_len = IB_CM_RTU_PRIVATE_DATA_SIZE; - uvt->resp.u.send_status = evt->param.send_status; - break; - case IB_CM_DREQ_RECEIVED: - uvt->data_len = IB_CM_DREQ_PRIVATE_DATA_SIZE; - uvt->resp.u.send_status = evt->param.send_status; - break; - case IB_CM_DREP_RECEIVED: - uvt->data_len = IB_CM_DREP_PRIVATE_DATA_SIZE; - uvt->resp.u.send_status = evt->param.send_status; - break; - case IB_CM_MRA_RECEIVED: - uvt->resp.u.mra_resp.timeout = - evt->param.mra_rcvd.service_timeout; - uvt->data_len = IB_CM_MRA_PRIVATE_DATA_SIZE; - break; - case IB_CM_REJ_RECEIVED: - uvt->resp.u.rej_resp.reason = evt->param.rej_rcvd.reason; - uvt->data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; - uvt->info_len = evt->param.rej_rcvd.ari_length; - info = evt->param.rej_rcvd.ari; - break; - case IB_CM_LAP_RECEIVED: - ib_copy_path_rec_to_user(&uvt->resp.u.lap_resp.path, - evt->param.lap_rcvd.alternate_path); - uvt->data_len = IB_CM_LAP_PRIVATE_DATA_SIZE; - uvt->resp.present = IB_UCM_PRES_ALTERNATE; - break; - case IB_CM_APR_RECEIVED: - uvt->resp.u.apr_resp.status = evt->param.apr_rcvd.ap_status; - uvt->data_len = IB_CM_APR_PRIVATE_DATA_SIZE; - uvt->info_len = evt->param.apr_rcvd.info_len; - info = evt->param.apr_rcvd.apr_info; - break; - case IB_CM_SIDR_REQ_RECEIVED: - uvt->resp.u.sidr_req_resp.pkey = - evt->param.sidr_req_rcvd.pkey; - uvt->resp.u.sidr_req_resp.port = - evt->param.sidr_req_rcvd.port; - uvt->data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE; - break; - case IB_CM_SIDR_REP_RECEIVED: - ib_ucm_event_sidr_rep_get(&uvt->resp.u.sidr_rep_resp, - &evt->param.sidr_rep_rcvd); - uvt->data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE; - uvt->info_len = evt->param.sidr_rep_rcvd.info_len; - info = evt->param.sidr_rep_rcvd.info; - break; - default: - uvt->resp.u.send_status = evt->param.send_status; - break; - } - - if (uvt->data_len) { - uvt->data = kmemdup(evt->private_data, uvt->data_len, GFP_KERNEL); - if (!uvt->data) - goto err1; - - uvt->resp.present |= IB_UCM_PRES_DATA; - } - - if (uvt->info_len) { - uvt->info = kmemdup(info, uvt->info_len, GFP_KERNEL); - if (!uvt->info) - goto err2; - - uvt->resp.present |= IB_UCM_PRES_INFO; - } - return 0; - -err2: - kfree(uvt->data); -err1: - return -ENOMEM; -} - -static int ib_ucm_event_handler(struct ib_cm_id *cm_id, - const struct ib_cm_event *event) -{ - struct ib_ucm_event *uevent; - struct ib_ucm_context *ctx; - int result = 0; - - ctx = cm_id->context; - - uevent = kzalloc(sizeof *uevent, GFP_KERNEL); - if (!uevent) - goto err1; - - uevent->ctx = ctx; - uevent->cm_id = cm_id; - uevent->resp.uid = ctx->uid; - uevent->resp.id = ctx->id; - uevent->resp.event = event->event; - - result = ib_ucm_event_process(event, uevent); - if (result) - goto err2; - - mutex_lock(&ctx->file->file_mutex); - list_add_tail(&uevent->file_list, &ctx->file->events); - list_add_tail(&uevent->ctx_list, &ctx->events); - wake_up_interruptible(&ctx->file->poll_wait); - mutex_unlock(&ctx->file->file_mutex); - return 0; - -err2: - kfree(uevent); -err1: - /* Destroy new cm_id's */ - return ib_ucm_new_cm_id(event->event); -} - -static ssize_t ib_ucm_event(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_context *ctx; - struct ib_ucm_event_get cmd; - struct ib_ucm_event *uevent; - int result = 0; - - if (out_len < sizeof(struct ib_ucm_event_resp)) - 
return -ENOSPC; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - mutex_lock(&file->file_mutex); - while (list_empty(&file->events)) { - mutex_unlock(&file->file_mutex); - - if (file->filp->f_flags & O_NONBLOCK) - return -EAGAIN; - - if (wait_event_interruptible(file->poll_wait, - !list_empty(&file->events))) - return -ERESTARTSYS; - - mutex_lock(&file->file_mutex); - } - - uevent = list_entry(file->events.next, struct ib_ucm_event, file_list); - - if (ib_ucm_new_cm_id(uevent->resp.event)) { - ctx = ib_ucm_ctx_alloc(file); - if (!ctx) { - result = -ENOMEM; - goto done; - } - - ctx->cm_id = uevent->cm_id; - ctx->cm_id->context = ctx; - uevent->resp.id = ctx->id; - } - - if (copy_to_user(u64_to_user_ptr(cmd.response), - &uevent->resp, sizeof(uevent->resp))) { - result = -EFAULT; - goto done; - } - - if (uevent->data) { - if (cmd.data_len < uevent->data_len) { - result = -ENOMEM; - goto done; - } - if (copy_to_user(u64_to_user_ptr(cmd.data), - uevent->data, uevent->data_len)) { - result = -EFAULT; - goto done; - } - } - - if (uevent->info) { - if (cmd.info_len < uevent->info_len) { - result = -ENOMEM; - goto done; - } - if (copy_to_user(u64_to_user_ptr(cmd.info), - uevent->info, uevent->info_len)) { - result = -EFAULT; - goto done; - } - } - - list_del(&uevent->file_list); - list_del(&uevent->ctx_list); - uevent->ctx->events_reported++; - - kfree(uevent->data); - kfree(uevent->info); - kfree(uevent); -done: - mutex_unlock(&file->file_mutex); - return result; -} - -static ssize_t ib_ucm_create_id(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_create_id cmd; - struct ib_ucm_create_id_resp resp; - struct ib_ucm_context *ctx; - int result; - - if (out_len < sizeof(resp)) - return -ENOSPC; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - mutex_lock(&file->file_mutex); - ctx = ib_ucm_ctx_alloc(file); - mutex_unlock(&file->file_mutex); - if (!ctx) - return -ENOMEM; - - ctx->uid = cmd.uid; - ctx->cm_id = ib_create_cm_id(file->device->ib_dev, - ib_ucm_event_handler, ctx); - if (IS_ERR(ctx->cm_id)) { - result = PTR_ERR(ctx->cm_id); - goto err1; - } - - resp.id = ctx->id; - if (copy_to_user(u64_to_user_ptr(cmd.response), - &resp, sizeof(resp))) { - result = -EFAULT; - goto err2; - } - return 0; - -err2: - ib_destroy_cm_id(ctx->cm_id); -err1: - xa_erase(&ctx_id_table, ctx->id); - kfree(ctx); - return result; -} - -static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_destroy_id cmd; - struct ib_ucm_destroy_id_resp resp; - struct ib_ucm_context *ctx; - int result = 0; - - if (out_len < sizeof(resp)) - return -ENOSPC; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - xa_lock(&ctx_id_table); - ctx = xa_load(&ctx_id_table, cmd.id); - if (!ctx) - ctx = ERR_PTR(-ENOENT); - else if (ctx->file != file) - ctx = ERR_PTR(-EINVAL); - else - __xa_erase(&ctx_id_table, ctx->id); - xa_unlock(&ctx_id_table); - - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - ib_ucm_ctx_put(ctx); - wait_for_completion(&ctx->comp); - - /* No new events will be generated after destroying the cm_id. */ - ib_destroy_cm_id(ctx->cm_id); - /* Cleanup events not yet reported to the user. 
*/ - ib_ucm_cleanup_events(ctx); - - resp.events_reported = ctx->events_reported; - if (copy_to_user(u64_to_user_ptr(cmd.response), - &resp, sizeof(resp))) - result = -EFAULT; - - kfree(ctx); - return result; -} - -static ssize_t ib_ucm_attr_id(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_attr_id_resp resp; - struct ib_ucm_attr_id cmd; - struct ib_ucm_context *ctx; - int result = 0; - - if (out_len < sizeof(resp)) - return -ENOSPC; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - resp.service_id = ctx->cm_id->service_id; - resp.service_mask = ctx->cm_id->service_mask; - resp.local_id = ctx->cm_id->local_id; - resp.remote_id = ctx->cm_id->remote_id; - - if (copy_to_user(u64_to_user_ptr(cmd.response), - &resp, sizeof(resp))) - result = -EFAULT; - - ib_ucm_ctx_put(ctx); - return result; -} - -static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_uverbs_qp_attr resp; - struct ib_ucm_init_qp_attr cmd; - struct ib_ucm_context *ctx; - struct ib_qp_attr qp_attr; - int result = 0; - - if (out_len < sizeof(resp)) - return -ENOSPC; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - resp.qp_attr_mask = 0; - memset(&qp_attr, 0, sizeof qp_attr); - qp_attr.qp_state = cmd.qp_state; - result = ib_cm_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask); - if (result) - goto out; - - ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr); - - if (copy_to_user(u64_to_user_ptr(cmd.response), - &resp, sizeof(resp))) - result = -EFAULT; - -out: - ib_ucm_ctx_put(ctx); - return result; -} - -static int ucm_validate_listen(__be64 service_id, __be64 service_mask) -{ - service_id &= service_mask; - - if (((service_id & IB_CMA_SERVICE_ID_MASK) == IB_CMA_SERVICE_ID) || - ((service_id & IB_SDP_SERVICE_ID_MASK) == IB_SDP_SERVICE_ID)) - return -EINVAL; - - return 0; -} - -static ssize_t ib_ucm_listen(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_listen cmd; - struct ib_ucm_context *ctx; - int result; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - result = ucm_validate_listen(cmd.service_id, cmd.service_mask); - if (result) - goto out; - - result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask); -out: - ib_ucm_ctx_put(ctx); - return result; -} - -static ssize_t ib_ucm_notify(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_notify cmd; - struct ib_ucm_context *ctx; - int result; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - result = ib_cm_notify(ctx->cm_id, (enum ib_event_type) cmd.event); - ib_ucm_ctx_put(ctx); - return result; -} - -static int ib_ucm_alloc_data(const void **dest, u64 src, u32 len) -{ - void *data; - - *dest = NULL; - - if (!len) - return 0; - - data = memdup_user(u64_to_user_ptr(src), len); - if (IS_ERR(data)) - return PTR_ERR(data); - - *dest = data; - return 0; -} - -static int ib_ucm_path_get(struct sa_path_rec **path, u64 src) -{ - struct ib_user_path_rec upath; - struct sa_path_rec *sa_path; - - *path = NULL; - - if (!src) - return 
0; - - sa_path = kmalloc(sizeof(*sa_path), GFP_KERNEL); - if (!sa_path) - return -ENOMEM; - - if (copy_from_user(&upath, u64_to_user_ptr(src), - sizeof(upath))) { - - kfree(sa_path); - return -EFAULT; - } - - ib_copy_path_rec_from_user(sa_path, &upath); - *path = sa_path; - return 0; -} - -static ssize_t ib_ucm_send_req(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_cm_req_param param; - struct ib_ucm_context *ctx; - struct ib_ucm_req cmd; - int result; - - param.private_data = NULL; - param.primary_path = NULL; - param.alternate_path = NULL; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(¶m.private_data, cmd.data, cmd.len); - if (result) - goto done; - - result = ib_ucm_path_get(¶m.primary_path, cmd.primary_path); - if (result) - goto done; - - result = ib_ucm_path_get(¶m.alternate_path, cmd.alternate_path); - if (result) - goto done; - - param.private_data_len = cmd.len; - param.service_id = cmd.sid; - param.qp_num = cmd.qpn; - param.qp_type = cmd.qp_type; - param.starting_psn = cmd.psn; - param.peer_to_peer = cmd.peer_to_peer; - param.responder_resources = cmd.responder_resources; - param.initiator_depth = cmd.initiator_depth; - param.remote_cm_response_timeout = cmd.remote_cm_response_timeout; - param.flow_control = cmd.flow_control; - param.local_cm_response_timeout = cmd.local_cm_response_timeout; - param.retry_count = cmd.retry_count; - param.rnr_retry_count = cmd.rnr_retry_count; - param.max_cm_retries = cmd.max_cm_retries; - param.srq = cmd.srq; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - result = ib_send_cm_req(ctx->cm_id, ¶m); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - -done: - kfree(param.private_data); - kfree(param.primary_path); - kfree(param.alternate_path); - return result; -} - -static ssize_t ib_ucm_send_rep(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_cm_rep_param param; - struct ib_ucm_context *ctx; - struct ib_ucm_rep cmd; - int result; - - param.private_data = NULL; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(¶m.private_data, cmd.data, cmd.len); - if (result) - return result; - - param.qp_num = cmd.qpn; - param.starting_psn = cmd.psn; - param.private_data_len = cmd.len; - param.responder_resources = cmd.responder_resources; - param.initiator_depth = cmd.initiator_depth; - param.failover_accepted = cmd.failover_accepted; - param.flow_control = cmd.flow_control; - param.rnr_retry_count = cmd.rnr_retry_count; - param.srq = cmd.srq; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - ctx->uid = cmd.uid; - result = ib_send_cm_rep(ctx->cm_id, ¶m); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - - kfree(param.private_data); - return result; -} - -static ssize_t ib_ucm_send_private_data(struct ib_ucm_file *file, - const char __user *inbuf, int in_len, - int (*func)(struct ib_cm_id *cm_id, - const void *private_data, - u8 private_data_len)) -{ - struct ib_ucm_private_data cmd; - struct ib_ucm_context *ctx; - const void *private_data = NULL; - int result; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(&private_data, cmd.data, cmd.len); - if (result) - return result; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - result = func(ctx->cm_id, private_data, cmd.len); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - - kfree(private_data); - 
return result; -} - -static ssize_t ib_ucm_send_rtu(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_rtu); -} - -static ssize_t ib_ucm_send_dreq(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_dreq); -} - -static ssize_t ib_ucm_send_drep(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_drep); -} - -static ssize_t ib_ucm_send_info(struct ib_ucm_file *file, - const char __user *inbuf, int in_len, - int (*func)(struct ib_cm_id *cm_id, - int status, - const void *info, - u8 info_len, - const void *data, - u8 data_len)) -{ - struct ib_ucm_context *ctx; - struct ib_ucm_info cmd; - const void *data = NULL; - const void *info = NULL; - int result; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(&data, cmd.data, cmd.data_len); - if (result) - goto done; - - result = ib_ucm_alloc_data(&info, cmd.info, cmd.info_len); - if (result) - goto done; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - result = func(ctx->cm_id, cmd.status, info, cmd.info_len, - data, cmd.data_len); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - -done: - kfree(data); - kfree(info); - return result; -} - -static ssize_t ib_ucm_send_rej(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_rej); -} - -static ssize_t ib_ucm_send_apr(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_apr); -} - -static ssize_t ib_ucm_send_mra(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_context *ctx; - struct ib_ucm_mra cmd; - const void *data = NULL; - int result; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(&data, cmd.data, cmd.len); - if (result) - return result; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - result = ib_send_cm_mra(ctx->cm_id, cmd.timeout, data, cmd.len); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - - kfree(data); - return result; -} - -static ssize_t ib_ucm_send_lap(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_context *ctx; - struct sa_path_rec *path = NULL; - struct ib_ucm_lap cmd; - const void *data = NULL; - int result; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(&data, cmd.data, cmd.len); - if (result) - goto done; - - result = ib_ucm_path_get(&path, cmd.path); - if (result) - goto done; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - result = ib_send_cm_lap(ctx->cm_id, path, data, cmd.len); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - -done: - kfree(data); - kfree(path); - return result; -} - -static ssize_t ib_ucm_send_sidr_req(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_cm_sidr_req_param param = {}; - struct ib_ucm_context *ctx; - struct ib_ucm_sidr_req cmd; - int result; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(¶m.private_data, cmd.data, cmd.len); - if (result) - goto 
done; - - result = ib_ucm_path_get(¶m.path, cmd.path); - if (result) - goto done; - - param.private_data_len = cmd.len; - param.service_id = cmd.sid; - param.timeout_ms = cmd.timeout; - param.max_cm_retries = cmd.max_cm_retries; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - result = ib_send_cm_sidr_req(ctx->cm_id, ¶m); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - -done: - kfree(param.private_data); - kfree(param.path); - return result; -} - -static ssize_t ib_ucm_send_sidr_rep(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_cm_sidr_rep_param param; - struct ib_ucm_sidr_rep cmd; - struct ib_ucm_context *ctx; - int result; - - param.info = NULL; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(¶m.private_data, - cmd.data, cmd.data_len); - if (result) - goto done; - - result = ib_ucm_alloc_data(¶m.info, cmd.info, cmd.info_len); - if (result) - goto done; - - param.qp_num = cmd.qpn; - param.qkey = cmd.qkey; - param.status = cmd.status; - param.info_length = cmd.info_len; - param.private_data_len = cmd.data_len; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - result = ib_send_cm_sidr_rep(ctx->cm_id, ¶m); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - -done: - kfree(param.private_data); - kfree(param.info); - return result; -} - -static ssize_t (*ucm_cmd_table[])(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) = { - [IB_USER_CM_CMD_CREATE_ID] = ib_ucm_create_id, - [IB_USER_CM_CMD_DESTROY_ID] = ib_ucm_destroy_id, - [IB_USER_CM_CMD_ATTR_ID] = ib_ucm_attr_id, - [IB_USER_CM_CMD_LISTEN] = ib_ucm_listen, - [IB_USER_CM_CMD_NOTIFY] = ib_ucm_notify, - [IB_USER_CM_CMD_SEND_REQ] = ib_ucm_send_req, - [IB_USER_CM_CMD_SEND_REP] = ib_ucm_send_rep, - [IB_USER_CM_CMD_SEND_RTU] = ib_ucm_send_rtu, - [IB_USER_CM_CMD_SEND_DREQ] = ib_ucm_send_dreq, - [IB_USER_CM_CMD_SEND_DREP] = ib_ucm_send_drep, - [IB_USER_CM_CMD_SEND_REJ] = ib_ucm_send_rej, - [IB_USER_CM_CMD_SEND_MRA] = ib_ucm_send_mra, - [IB_USER_CM_CMD_SEND_LAP] = ib_ucm_send_lap, - [IB_USER_CM_CMD_SEND_APR] = ib_ucm_send_apr, - [IB_USER_CM_CMD_SEND_SIDR_REQ] = ib_ucm_send_sidr_req, - [IB_USER_CM_CMD_SEND_SIDR_REP] = ib_ucm_send_sidr_rep, - [IB_USER_CM_CMD_EVENT] = ib_ucm_event, - [IB_USER_CM_CMD_INIT_QP_ATTR] = ib_ucm_init_qp_attr, -}; - -static ssize_t ib_ucm_write(struct file *filp, const char __user *buf, - size_t len, loff_t *pos) -{ - struct ib_ucm_file *file = filp->private_data; - struct ib_ucm_cmd_hdr hdr; - ssize_t result; - - if (!ib_safe_file_access(filp)) { - pr_err_once("ucm_write: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n", - task_tgid_vnr(current), current->comm); - return -EACCES; - } - - if (len < sizeof(hdr)) - return -EINVAL; - - if (copy_from_user(&hdr, buf, sizeof(hdr))) - return -EFAULT; - - if (hdr.cmd >= ARRAY_SIZE(ucm_cmd_table)) - return -EINVAL; - hdr.cmd = array_index_nospec(hdr.cmd, ARRAY_SIZE(ucm_cmd_table)); - - if (hdr.in + sizeof(hdr) > len) - return -EINVAL; - - result = ucm_cmd_table[hdr.cmd](file, buf + sizeof(hdr), - hdr.in, hdr.out); - if (!result) - result = len; - - return result; -} - -static __poll_t ib_ucm_poll(struct file *filp, - struct poll_table_struct *wait) -{ - struct ib_ucm_file *file = filp->private_data; - __poll_t mask = 0; - - poll_wait(filp, &file->poll_wait, wait); - - if (!list_empty(&file->events)) - mask = EPOLLIN | EPOLLRDNORM; - - return mask; -} - -/* - * 
ib_ucm_open() does not need the BKL: - * - * - no global state is referred to; - * - there is no ioctl method to race against; - * - no further module initialization is required for open to work - * after the device is registered. - */ -static int ib_ucm_open(struct inode *inode, struct file *filp) -{ - struct ib_ucm_file *file; - - file = kmalloc(sizeof(*file), GFP_KERNEL); - if (!file) - return -ENOMEM; - - INIT_LIST_HEAD(&file->events); - INIT_LIST_HEAD(&file->ctxs); - init_waitqueue_head(&file->poll_wait); - - mutex_init(&file->file_mutex); - - filp->private_data = file; - file->filp = filp; - file->device = container_of(inode->i_cdev, struct ib_ucm_device, cdev); - - return stream_open(inode, filp); -} - -static int ib_ucm_close(struct inode *inode, struct file *filp) -{ - struct ib_ucm_file *file = filp->private_data; - struct ib_ucm_context *ctx; - - mutex_lock(&file->file_mutex); - while (!list_empty(&file->ctxs)) { - ctx = list_entry(file->ctxs.next, - struct ib_ucm_context, file_list); - mutex_unlock(&file->file_mutex); - - xa_erase(&ctx_id_table, ctx->id); - ib_destroy_cm_id(ctx->cm_id); - ib_ucm_cleanup_events(ctx); - kfree(ctx); - - mutex_lock(&file->file_mutex); - } - mutex_unlock(&file->file_mutex); - kfree(file); - return 0; -} - -static void ib_ucm_release_dev(struct device *dev) -{ - struct ib_ucm_device *ucm_dev; - - ucm_dev = container_of(dev, struct ib_ucm_device, dev); - kfree(ucm_dev); -} - -static void ib_ucm_free_dev(struct ib_ucm_device *ucm_dev) -{ - clear_bit(ucm_dev->devnum, dev_map); -} - -static const struct file_operations ucm_fops = { - .owner = THIS_MODULE, - .open = ib_ucm_open, - .release = ib_ucm_close, - .write = ib_ucm_write, - .poll = ib_ucm_poll, - .llseek = no_llseek, -}; - -static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct ib_ucm_device *ucm_dev; - - ucm_dev = container_of(dev, struct ib_ucm_device, dev); - return sprintf(buf, "%s\n", ucm_dev->ib_dev->name); -} -static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); - -static void ib_ucm_add_one(struct ib_device *device) -{ - int devnum; - dev_t base; - struct ib_ucm_device *ucm_dev; - - if (!device->ops.alloc_ucontext || !rdma_cap_ib_cm(device, 1)) - return; - - ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL); - if (!ucm_dev) - return; - - device_initialize(&ucm_dev->dev); - ucm_dev->ib_dev = device; - ucm_dev->dev.release = ib_ucm_release_dev; - - devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES); - if (devnum >= IB_UCM_MAX_DEVICES) - goto err; - ucm_dev->devnum = devnum; - set_bit(devnum, dev_map); - if (devnum >= IB_UCM_NUM_FIXED_MINOR) - base = dynamic_ucm_dev + devnum - IB_UCM_NUM_FIXED_MINOR; - else - base = IB_UCM_BASE_DEV + devnum; - - cdev_init(&ucm_dev->cdev, &ucm_fops); - ucm_dev->cdev.owner = THIS_MODULE; - kobject_set_name(&ucm_dev->cdev.kobj, "ucm%d", ucm_dev->devnum); - - ucm_dev->dev.class = &cm_class; - ucm_dev->dev.parent = device->dev.parent; - ucm_dev->dev.devt = base; - - dev_set_name(&ucm_dev->dev, "ucm%d", ucm_dev->devnum); - if (cdev_device_add(&ucm_dev->cdev, &ucm_dev->dev)) - goto err_devnum; - - if (device_create_file(&ucm_dev->dev, &dev_attr_ibdev)) - goto err_dev; - - ib_set_client_data(device, &ucm_client, ucm_dev); - return; - -err_dev: - cdev_device_del(&ucm_dev->cdev, &ucm_dev->dev); -err_devnum: - ib_ucm_free_dev(ucm_dev); -err: - put_device(&ucm_dev->dev); - return; -} - -static void ib_ucm_remove_one(struct ib_device *device, void *client_data) -{ - struct ib_ucm_device *ucm_dev = client_data; - - if 
(!ucm_dev) - return; - - cdev_device_del(&ucm_dev->cdev, &ucm_dev->dev); - ib_ucm_free_dev(ucm_dev); - put_device(&ucm_dev->dev); -} - -static CLASS_ATTR_STRING(abi_version, S_IRUGO, - __stringify(IB_USER_CM_ABI_VERSION)); - -static int __init ib_ucm_init(void) -{ - int ret; - - ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR, - "infiniband_cm"); - if (ret) { - pr_err("ucm: couldn't register device number\n"); - goto error1; - } - - ret = alloc_chrdev_region(&dynamic_ucm_dev, 0, IB_UCM_NUM_DYNAMIC_MINOR, - "infiniband_cm"); - if (ret) { - pr_err("ucm: couldn't register dynamic device number\n"); - goto err_alloc; - } - - ret = class_create_file(&cm_class, &class_attr_abi_version.attr); - if (ret) { - pr_err("ucm: couldn't create abi_version attribute\n"); - goto error2; - } - - ret = ib_register_client(&ucm_client); - if (ret) { - pr_err("ucm: couldn't register client\n"); - goto error3; - } - return 0; - -error3: - class_remove_file(&cm_class, &class_attr_abi_version.attr); -error2: - unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR); -err_alloc: - unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR); -error1: - return ret; -} - -static void __exit ib_ucm_cleanup(void) -{ - ib_unregister_client(&ucm_client); - class_remove_file(&cm_class, &class_attr_abi_version.attr); - unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR); - unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR); - WARN_ON(!xa_empty(&ctx_id_table)); -} - -module_init(ib_ucm_init); -module_exit(ib_ucm_cleanup); diff --git a/include/uapi/rdma/ib_user_cm.h b/include/uapi/rdma/ib_user_cm.h deleted file mode 100644 index e2709bb8cb18..000000000000 --- a/include/uapi/rdma/ib_user_cm.h +++ /dev/null @@ -1,326 +0,0 @@ -/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ -/* - * Copyright (c) 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2005 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef IB_USER_CM_H -#define IB_USER_CM_H - -#include -#include - -#define IB_USER_CM_ABI_VERSION 5 - -enum { - IB_USER_CM_CMD_CREATE_ID, - IB_USER_CM_CMD_DESTROY_ID, - IB_USER_CM_CMD_ATTR_ID, - - IB_USER_CM_CMD_LISTEN, - IB_USER_CM_CMD_NOTIFY, - - IB_USER_CM_CMD_SEND_REQ, - IB_USER_CM_CMD_SEND_REP, - IB_USER_CM_CMD_SEND_RTU, - IB_USER_CM_CMD_SEND_DREQ, - IB_USER_CM_CMD_SEND_DREP, - IB_USER_CM_CMD_SEND_REJ, - IB_USER_CM_CMD_SEND_MRA, - IB_USER_CM_CMD_SEND_LAP, - IB_USER_CM_CMD_SEND_APR, - IB_USER_CM_CMD_SEND_SIDR_REQ, - IB_USER_CM_CMD_SEND_SIDR_REP, - - IB_USER_CM_CMD_EVENT, - IB_USER_CM_CMD_INIT_QP_ATTR, -}; -/* - * command ABI structures. - */ -struct ib_ucm_cmd_hdr { - __u32 cmd; - __u16 in; - __u16 out; -}; - -struct ib_ucm_create_id { - __aligned_u64 uid; - __aligned_u64 response; -}; - -struct ib_ucm_create_id_resp { - __u32 id; -}; - -struct ib_ucm_destroy_id { - __aligned_u64 response; - __u32 id; - __u32 reserved; -}; - -struct ib_ucm_destroy_id_resp { - __u32 events_reported; -}; - -struct ib_ucm_attr_id { - __aligned_u64 response; - __u32 id; - __u32 reserved; -}; - -struct ib_ucm_attr_id_resp { - __be64 service_id; - __be64 service_mask; - __be32 local_id; - __be32 remote_id; -}; - -struct ib_ucm_init_qp_attr { - __aligned_u64 response; - __u32 id; - __u32 qp_state; -}; - -struct ib_ucm_listen { - __be64 service_id; - __be64 service_mask; - __u32 id; - __u32 reserved; -}; - -struct ib_ucm_notify { - __u32 id; - __u32 event; -}; - -struct ib_ucm_private_data { - __aligned_u64 data; - __u32 id; - __u8 len; - __u8 reserved[3]; -}; - -struct ib_ucm_req { - __u32 id; - __u32 qpn; - __u32 qp_type; - __u32 psn; - __be64 sid; - __aligned_u64 data; - __aligned_u64 primary_path; - __aligned_u64 alternate_path; - __u8 len; - __u8 peer_to_peer; - __u8 responder_resources; - __u8 initiator_depth; - __u8 remote_cm_response_timeout; - __u8 flow_control; - __u8 local_cm_response_timeout; - __u8 retry_count; - __u8 rnr_retry_count; - __u8 max_cm_retries; - __u8 srq; - __u8 reserved[5]; -}; - -struct ib_ucm_rep { - __aligned_u64 uid; - __aligned_u64 data; - __u32 id; - __u32 qpn; - __u32 psn; - __u8 len; - __u8 responder_resources; - __u8 initiator_depth; - __u8 target_ack_delay; - __u8 failover_accepted; - __u8 flow_control; - __u8 rnr_retry_count; - __u8 srq; - __u8 reserved[4]; -}; - -struct ib_ucm_info { - __u32 id; - __u32 status; - __aligned_u64 info; - __aligned_u64 data; - __u8 info_len; - __u8 data_len; - __u8 reserved[6]; -}; - -struct ib_ucm_mra { - __aligned_u64 data; - __u32 id; - __u8 len; - __u8 timeout; - __u8 reserved[2]; -}; - -struct ib_ucm_lap { - __aligned_u64 path; - __aligned_u64 data; - __u32 id; - __u8 len; - __u8 reserved[3]; -}; - -struct ib_ucm_sidr_req { - __u32 id; - __u32 timeout; - __be64 sid; - __aligned_u64 data; - __aligned_u64 path; - __u16 reserved_pkey; - __u8 len; - __u8 max_cm_retries; - __u8 reserved[4]; -}; - -struct ib_ucm_sidr_rep { - __u32 id; - __u32 qpn; - __u32 qkey; - __u32 status; - __aligned_u64 info; - __aligned_u64 data; - __u8 info_len; - __u8 data_len; - __u8 reserved[6]; -}; -/* - * event notification ABI structures. 
- */ -struct ib_ucm_event_get { - __aligned_u64 response; - __aligned_u64 data; - __aligned_u64 info; - __u8 data_len; - __u8 info_len; - __u8 reserved[6]; -}; - -struct ib_ucm_req_event_resp { - struct ib_user_path_rec primary_path; - struct ib_user_path_rec alternate_path; - __be64 remote_ca_guid; - __u32 remote_qkey; - __u32 remote_qpn; - __u32 qp_type; - __u32 starting_psn; - __u8 responder_resources; - __u8 initiator_depth; - __u8 local_cm_response_timeout; - __u8 flow_control; - __u8 remote_cm_response_timeout; - __u8 retry_count; - __u8 rnr_retry_count; - __u8 srq; - __u8 port; - __u8 reserved[7]; -}; - -struct ib_ucm_rep_event_resp { - __be64 remote_ca_guid; - __u32 remote_qkey; - __u32 remote_qpn; - __u32 starting_psn; - __u8 responder_resources; - __u8 initiator_depth; - __u8 target_ack_delay; - __u8 failover_accepted; - __u8 flow_control; - __u8 rnr_retry_count; - __u8 srq; - __u8 reserved[5]; -}; - -struct ib_ucm_rej_event_resp { - __u32 reason; - /* ari in ib_ucm_event_get info field. */ -}; - -struct ib_ucm_mra_event_resp { - __u8 timeout; - __u8 reserved[3]; -}; - -struct ib_ucm_lap_event_resp { - struct ib_user_path_rec path; -}; - -struct ib_ucm_apr_event_resp { - __u32 status; - /* apr info in ib_ucm_event_get info field. */ -}; - -struct ib_ucm_sidr_req_event_resp { - __u16 pkey; - __u8 port; - __u8 reserved; -}; - -struct ib_ucm_sidr_rep_event_resp { - __u32 status; - __u32 qkey; - __u32 qpn; - /* info in ib_ucm_event_get info field. */ -}; - -#define IB_UCM_PRES_DATA 0x01 -#define IB_UCM_PRES_INFO 0x02 -#define IB_UCM_PRES_PRIMARY 0x04 -#define IB_UCM_PRES_ALTERNATE 0x08 - -struct ib_ucm_event_resp { - __aligned_u64 uid; - __u32 id; - __u32 event; - __u32 present; - __u32 reserved; - union { - struct ib_ucm_req_event_resp req_resp; - struct ib_ucm_rep_event_resp rep_resp; - struct ib_ucm_rej_event_resp rej_resp; - struct ib_ucm_mra_event_resp mra_resp; - struct ib_ucm_lap_event_resp lap_resp; - struct ib_ucm_apr_event_resp apr_resp; - - struct ib_ucm_sidr_req_event_resp sidr_req_resp; - struct ib_ucm_sidr_rep_event_resp sidr_rep_resp; - - __u32 send_status; - } u; -}; - -#endif /* IB_USER_CM_H */ From b9560a419bfd498279333387817adcf5faef2825 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 5 Jun 2019 14:39:24 -0300 Subject: [PATCH 050/194] RDMA: Move driver_id into struct ib_device_ops No reason for every driver to emit code to set this, just make it part of the driver's existing static const ops structure. 
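As an illustration of the pattern (a minimal sketch for a hypothetical "foo" driver; RDMA_DRIVER_FOO and the foo_* names are made up, the series itself only touches drivers already in the tree):

	/* Before: each driver wrote the id onto the ib_device by hand,
	 * typically just before calling ib_register_device():
	 */
	ibdev->driver_id = RDMA_DRIVER_FOO;

	/* After: the id is a member of the driver's static const ops table
	 * and ib_set_device_ops() copies it into dev->ops.driver_id:
	 */
	static const struct ib_device_ops foo_dev_ops = {
		.driver_id = RDMA_DRIVER_FOO,
		/* ... the driver's verb callbacks, unchanged ... */
	};

	ib_set_device_ops(ibdev, &foo_dev_ops);

Core code that previously read ibdev->driver_id (ib_device_get_by_name(), ib_unregister_driver(), ib_device_get_by_netdev(), uverbs_alloc_api()) now reads ibdev->ops.driver_id, as the hunks below show.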
Signed-off-by: Jason Gunthorpe Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 12 +++++++++--- drivers/infiniband/core/uverbs_uapi.c | 2 +- drivers/infiniband/hw/bnxt_re/main.c | 3 ++- drivers/infiniband/hw/cxgb3/iwch_provider.c | 3 ++- drivers/infiniband/hw/cxgb4/provider.c | 3 ++- drivers/infiniband/hw/efa/efa_main.c | 3 ++- drivers/infiniband/hw/hfi1/verbs.c | 4 +++- drivers/infiniband/hw/hns/hns_roce_main.c | 3 ++- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 3 ++- drivers/infiniband/hw/mlx4/main.c | 3 ++- drivers/infiniband/hw/mlx5/main.c | 3 ++- drivers/infiniband/hw/mthca/mthca_provider.c | 3 ++- drivers/infiniband/hw/nes/nes_verbs.c | 3 ++- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 3 ++- drivers/infiniband/hw/qedr/main.c | 3 ++- drivers/infiniband/hw/qib/qib_verbs.c | 4 +++- drivers/infiniband/hw/usnic/usnic_ib_main.c | 3 ++- drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 3 ++- drivers/infiniband/sw/rdmavt/vt.c | 3 +-- drivers/infiniband/sw/rxe/rxe_verbs.c | 3 ++- include/rdma/ib_verbs.h | 3 ++- include/rdma/rdma_vt.h | 2 +- 22 files changed, 50 insertions(+), 25 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index afb3f5946796..538d01f27bf8 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -373,7 +373,7 @@ struct ib_device *ib_device_get_by_name(const char *name, down_read(&devices_rwsem); device = __ib_device_get_by_name(name); if (device && driver_id != RDMA_DRIVER_UNKNOWN && - device->driver_id != driver_id) + device->ops.driver_id != driver_id) device = NULL; if (device) { @@ -1456,7 +1456,7 @@ void ib_unregister_driver(enum rdma_driver_id driver_id) down_read(&devices_rwsem); xa_for_each (&devices, index, ib_dev) { - if (ib_dev->driver_id != driver_id) + if (ib_dev->ops.driver_id != driver_id) continue; get_device(&ib_dev->dev); @@ -2013,7 +2013,7 @@ struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, (uintptr_t)ndev) { if (rcu_access_pointer(cur->netdev) == ndev && (driver_id == RDMA_DRIVER_UNKNOWN || - cur->ib_dev->driver_id == driver_id) && + cur->ib_dev->ops.driver_id == driver_id) && ib_device_try_get(cur->ib_dev)) { res = cur->ib_dev; break; @@ -2318,6 +2318,12 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) + if (ops->driver_id != RDMA_DRIVER_UNKNOWN) { + WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN && + dev_ops->driver_id != ops->driver_id); + dev_ops->driver_id = ops->driver_id; + } + SET_DEVICE_OP(dev_ops, add_gid); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index ccc4be0a6566..00c547887132 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -647,7 +647,7 @@ struct uverbs_api *uverbs_alloc_api(struct ib_device *ibdev) return ERR_PTR(-ENOMEM); INIT_RADIX_TREE(&uapi->radix, GFP_KERNEL); - uapi->driver_id = ibdev->driver_id; + uapi->driver_id = ibdev->ops.driver_id; rc = uapi_merge_def(uapi, ibdev, uverbs_core_api, false); if (rc) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 814f959c7db9..1ef5f83ec914 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -596,6 +596,8 @@ static void bnxt_re_unregister_ib(struct bnxt_re_dev *rdev) } static const struct ib_device_ops 
bnxt_re_dev_ops = { + .driver_id = RDMA_DRIVER_BNXT_RE, + .add_gid = bnxt_re_add_gid, .alloc_hw_stats = bnxt_re_ib_alloc_hw_stats, .alloc_mr = bnxt_re_alloc_mr, @@ -691,7 +693,6 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) rdma_set_device_sysfs_group(ibdev, &bnxt_re_dev_attr_group); - ibdev->driver_id = RDMA_DRIVER_BNXT_RE; ib_set_device_ops(ibdev, &bnxt_re_dev_ops); ret = ib_device_set_netdev(&rdev->ibdev, rdev->netdev, 1); if (ret) diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 4bfab739ec0d..9a9527fdcb9a 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1236,6 +1236,8 @@ static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str) } static const struct ib_device_ops iwch_dev_ops = { + .driver_id = RDMA_DRIVER_CXGB3, + .alloc_hw_stats = iwch_alloc_stats, .alloc_mr = iwch_alloc_mr, .alloc_mw = iwch_alloc_mw, @@ -1319,7 +1321,6 @@ int iwch_register_device(struct iwch_dev *dev) memcpy(dev->ibdev.iw_ifname, dev->rdev.t3cdev_p->lldev->name, sizeof(dev->ibdev.iw_ifname)); - dev->ibdev.driver_id = RDMA_DRIVER_CXGB3; rdma_set_device_sysfs_group(&dev->ibdev, &iwch_attr_group); ib_set_device_ops(&dev->ibdev, &iwch_dev_ops); return ib_register_device(&dev->ibdev, "cxgb3_%d"); diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 8ed75b141521..74644afe25ab 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -489,6 +489,8 @@ static int fill_res_entry(struct sk_buff *msg, struct rdma_restrack_entry *res) } static const struct ib_device_ops c4iw_dev_ops = { + .driver_id = RDMA_DRIVER_CXGB4, + .alloc_hw_stats = c4iw_alloc_stats, .alloc_mr = c4iw_alloc_mr, .alloc_mw = c4iw_alloc_mw, @@ -599,7 +601,6 @@ void c4iw_register_device(struct work_struct *work) sizeof(dev->ibdev.iw_ifname)); rdma_set_device_sysfs_group(&dev->ibdev, &c4iw_attr_group); - dev->ibdev.driver_id = RDMA_DRIVER_CXGB4; ib_set_device_ops(&dev->ibdev, &c4iw_dev_ops); ret = set_netdevs(&dev->ibdev, &dev->rdev); if (ret) diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c index db974caf1eb1..3803dd4526b5 100644 --- a/drivers/infiniband/hw/efa/efa_main.c +++ b/drivers/infiniband/hw/efa/efa_main.c @@ -197,6 +197,8 @@ static void efa_stats_init(struct efa_dev *dev) } static const struct ib_device_ops efa_dev_ops = { + .driver_id = RDMA_DRIVER_EFA, + .alloc_pd = efa_alloc_pd, .alloc_ucontext = efa_alloc_ucontext, .create_ah = efa_create_ah, @@ -287,7 +289,6 @@ static int efa_ib_device_add(struct efa_dev *dev) dev->ibdev.uverbs_ex_cmd_mask = (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE); - dev->ibdev.driver_id = RDMA_DRIVER_EFA; ib_set_device_ops(&dev->ibdev, &efa_dev_ops); err = ib_register_device(&dev->ibdev, "efa_%d"); diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 1eb4105b2d22..a97f4f9e5c6a 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1779,6 +1779,8 @@ static int get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, } static const struct ib_device_ops hfi1_dev_ops = { + .driver_id = RDMA_DRIVER_HFI1, + .alloc_hw_stats = alloc_hw_stats, .alloc_rdma_netdev = hfi1_vnic_alloc_rn, .get_dev_fw_str = hfi1_get_dev_fw_str, @@ -1923,7 +1925,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) rdma_set_device_sysfs_group(&dd->verbs_dev.rdi.ibdev, &ib_hfi1_attr_group); - ret = 
rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_HFI1); + ret = rvt_register_device(&dd->verbs_dev.rdi); if (ret) goto err_verbs_txreq; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index a6c5c67d0b87..dd408f8afe72 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -414,6 +414,8 @@ static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev) } static const struct ib_device_ops hns_roce_dev_ops = { + .driver_id = RDMA_DRIVER_HNS, + .add_gid = hns_roce_add_gid, .alloc_pd = hns_roce_alloc_pd, .alloc_ucontext = hns_roce_alloc_ucontext, @@ -536,7 +538,6 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_srq_ops); } - ib_dev->driver_id = RDMA_DRIVER_HNS; ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_ops); ib_set_device_ops(ib_dev, &hns_roce_dev_ops); for (i = 0; i < hr_dev->caps.num_ports; i++) { diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index a10a30d44b32..1979cefdf90c 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2650,6 +2650,8 @@ static int i40iw_query_pkey(struct ib_device *ibdev, } static const struct ib_device_ops i40iw_dev_ops = { + .driver_id = RDMA_DRIVER_I40IW, + .alloc_hw_stats = i40iw_alloc_hw_stats, .alloc_mr = i40iw_alloc_mr, .alloc_pd = i40iw_alloc_pd, @@ -2787,7 +2789,6 @@ int i40iw_register_rdma_device(struct i40iw_device *iwdev) return -ENOMEM; iwibdev = iwdev->iwibdev; rdma_set_device_sysfs_group(&iwibdev->ibdev, &i40iw_attr_group); - iwibdev->ibdev.driver_id = RDMA_DRIVER_I40IW; ret = ib_register_device(&iwibdev->ibdev, "i40iw%d"); if (ret) goto error; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 25d09d53b51c..03847e2f7835 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2509,6 +2509,8 @@ static void get_fw_ver_str(struct ib_device *device, char *str) } static const struct ib_device_ops mlx4_ib_dev_ops = { + .driver_id = RDMA_DRIVER_MLX4, + .add_gid = mlx4_ib_add_gid, .alloc_mr = mlx4_ib_alloc_mr, .alloc_pd = mlx4_ib_alloc_pd, @@ -2839,7 +2841,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) goto err_steer_free_bitmap; rdma_set_device_sysfs_group(&ibdev->ib_dev, &mlx4_attr_group); - ibdev->ib_dev.driver_id = RDMA_DRIVER_MLX4; if (ib_register_device(&ibdev->ib_dev, "mlx4_%d")) goto err_diag_counters; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index abac70ad5c7c..abd416a31e71 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -6124,6 +6124,8 @@ static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev) } static const struct ib_device_ops mlx5_ib_dev_ops = { + .driver_id = RDMA_DRIVER_MLX5, + .add_gid = mlx5_ib_add_gid, .alloc_mr = mlx5_ib_alloc_mr, .alloc_pd = mlx5_ib_alloc_pd, @@ -6290,7 +6292,6 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_DEVICE) ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_flow_ipsec_ops); - dev->ib_dev.driver_id = RDMA_DRIVER_MLX5; ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_ops); if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)) diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 4f40dfedf920..d6467da39aab 100644 --- 
a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1153,6 +1153,8 @@ static void get_dev_fw_str(struct ib_device *device, char *str) } static const struct ib_device_ops mthca_dev_ops = { + .driver_id = RDMA_DRIVER_MTHCA, + .alloc_pd = mthca_alloc_pd, .alloc_ucontext = mthca_alloc_ucontext, .attach_mcast = mthca_multicast_attach, @@ -1303,7 +1305,6 @@ int mthca_register_device(struct mthca_dev *dev) mutex_init(&dev->cap_mask_mutex); rdma_set_device_sysfs_group(&dev->ib_dev, &mthca_attr_group); - dev->ib_dev.driver_id = RDMA_DRIVER_MTHCA; ret = ib_register_device(&dev->ib_dev, "mthca%d"); if (ret) return ret; diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index fb2d0762c7c8..3c85e2ef5a08 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3558,6 +3558,8 @@ static void get_dev_fw_str(struct ib_device *dev, char *str) } static const struct ib_device_ops nes_dev_ops = { + .driver_id = RDMA_DRIVER_NES, + .alloc_mr = nes_alloc_mr, .alloc_mw = nes_alloc_mw, .alloc_pd = nes_alloc_pd, @@ -3722,7 +3724,6 @@ int nes_register_ofa_device(struct nes_ib_device *nesibdev) int ret; rdma_set_device_sysfs_group(&nesvnic->nesibdev->ibdev, &nes_attr_group); - nesvnic->nesibdev->ibdev.driver_id = RDMA_DRIVER_NES; ret = ib_register_device(&nesvnic->nesibdev->ibdev, "nes%d"); if (ret) { return ret; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index fc6c0962dea9..a9da4b857566 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -144,6 +144,8 @@ static const struct attribute_group ocrdma_attr_group = { }; static const struct ib_device_ops ocrdma_dev_ops = { + .driver_id = RDMA_DRIVER_OCRDMA, + .alloc_mr = ocrdma_alloc_mr, .alloc_pd = ocrdma_alloc_pd, .alloc_ucontext = ocrdma_alloc_ucontext, @@ -249,7 +251,6 @@ static int ocrdma_register_device(struct ocrdma_dev *dev) ib_set_device_ops(&dev->ibdev, &ocrdma_dev_srq_ops); } rdma_set_device_sysfs_group(&dev->ibdev, &ocrdma_attr_group); - dev->ibdev.driver_id = RDMA_DRIVER_OCRDMA; ret = ib_device_set_netdev(&dev->ibdev, dev->nic_info.netdev, 1); if (ret) return ret; diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 083c2c00a8e9..737745231f8f 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -183,6 +183,8 @@ static void qedr_roce_register_device(struct qedr_dev *dev) } static const struct ib_device_ops qedr_dev_ops = { + .driver_id = RDMA_DRIVER_QEDR, + .alloc_mr = qedr_alloc_mr, .alloc_pd = qedr_alloc_pd, .alloc_ucontext = qedr_alloc_ucontext, @@ -274,7 +276,6 @@ static int qedr_register_device(struct qedr_dev *dev) rdma_set_device_sysfs_group(&dev->ibdev, &qedr_attr_group); ib_set_device_ops(&dev->ibdev, &qedr_dev_ops); - dev->ibdev.driver_id = RDMA_DRIVER_QEDR; rc = ib_device_set_netdev(&dev->ibdev, dev->ndev, 1); if (rc) return rc; diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 5ff32d32c61c..bbc331d7f49b 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1482,6 +1482,8 @@ static void qib_fill_device_attr(struct qib_devdata *dd) } static const struct ib_device_ops qib_dev_ops = { + .driver_id = RDMA_DRIVER_QIB, + .init_port = qib_create_port_files, .modify_device = qib_modify_device, .process_mad = qib_process_mad, @@ -1616,7 +1618,7 @@ int 
qib_register_ib_device(struct qib_devdata *dd) rdma_set_device_sysfs_group(&dd->verbs_dev.rdi.ibdev, &qib_attr_group); ib_set_device_ops(ibdev, &qib_dev_ops); - ret = rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_QIB); + ret = rvt_register_device(&dd->verbs_dev.rdi); if (ret) goto err_tx; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index d88d9f8a7f9a..47830334cb55 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -329,6 +329,8 @@ static void usnic_get_dev_fw_str(struct ib_device *device, char *str) } static const struct ib_device_ops usnic_dev_ops = { + .driver_id = RDMA_DRIVER_USNIC, + .alloc_pd = usnic_ib_alloc_pd, .alloc_ucontext = usnic_ib_alloc_ucontext, .create_cq = usnic_ib_create_cq, @@ -412,7 +414,6 @@ static void *usnic_ib_device_add(struct pci_dev *dev) ib_set_device_ops(&us_ibdev->ib_dev, &usnic_dev_ops); - us_ibdev->ib_dev.driver_id = RDMA_DRIVER_USNIC; rdma_set_device_sysfs_group(&us_ibdev->ib_dev, &usnic_attr_group); ret = ib_device_set_netdev(&us_ibdev->ib_dev, us_ibdev->netdev, 1); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index 40182297f87f..54a0b6372629 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -144,6 +144,8 @@ static int pvrdma_port_immutable(struct ib_device *ibdev, u8 port_num, } static const struct ib_device_ops pvrdma_dev_ops = { + .driver_id = RDMA_DRIVER_VMW_PVRDMA, + .add_gid = pvrdma_add_gid, .alloc_mr = pvrdma_alloc_mr, .alloc_pd = pvrdma_alloc_pd, @@ -261,7 +263,6 @@ static int pvrdma_register_device(struct pvrdma_dev *dev) if (!dev->srq_tbl) goto err_qp_free; } - dev->ib_dev.driver_id = RDMA_DRIVER_VMW_PVRDMA; ret = ib_device_set_netdev(&dev->ib_dev, dev->netdev, 1); if (ret) return ret; diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 9546a837a8ac..60700e197e6c 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -530,7 +530,7 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb) * * Return: 0 on success otherwise an errno. 
*/ -int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) +int rvt_register_device(struct rvt_dev_info *rdi) { int ret = 0, i; @@ -636,7 +636,6 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) if (!rdi->ibdev.num_comp_vectors) rdi->ibdev.num_comp_vectors = 1; - rdi->ibdev.driver_id = driver_id; /* We are now good to announce we exist */ ret = ib_register_device(&rdi->ibdev, dev_name(&rdi->ibdev.dev)); if (ret) { diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 8c3e2a18cfe4..3d3130dc6380 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1111,6 +1111,8 @@ static int rxe_enable_driver(struct ib_device *ib_dev) } static const struct ib_device_ops rxe_dev_ops = { + .driver_id = RDMA_DRIVER_RXE, + .alloc_hw_stats = rxe_ib_alloc_hw_stats, .alloc_mr = rxe_alloc_mr, .alloc_pd = rxe_alloc_pd, @@ -1230,7 +1232,6 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) rxe->tfm = tfm; rdma_set_device_sysfs_group(dev, &rxe_attr_group); - dev->driver_id = RDMA_DRIVER_RXE; err = ib_register_device(dev, ibdev_name); if (err) pr_warn("%s failed with error %d\n", __func__, err); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index ec6446864b08..dacf2b5ad862 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2329,6 +2329,8 @@ struct iw_cm_conn_param; * need to define the supported operations, otherwise they will be set to null. */ struct ib_device_ops { + enum rdma_driver_id driver_id; + int (*post_send)(struct ib_qp *qp, const struct ib_send_wr *send_wr, const struct ib_send_wr **bad_send_wr); int (*post_recv)(struct ib_qp *qp, const struct ib_recv_wr *recv_wr, @@ -2672,7 +2674,6 @@ struct ib_device { struct rdma_restrack_root *res; const struct uapi_definition *driver_def; - enum rdma_driver_id driver_id; /* * Positive refcount indicates that the device is currently diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index b9cd06db1a71..997f42678806 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -555,7 +555,7 @@ static inline u16 rvt_get_pkey(struct rvt_dev_info *rdi, struct rvt_dev_info *rvt_alloc_device(size_t size, int nports); void rvt_dealloc_device(struct rvt_dev_info *rdi); -int rvt_register_device(struct rvt_dev_info *rvd, u32 driver_id); +int rvt_register_device(struct rvt_dev_info *rvd); void rvt_unregister_device(struct rvt_dev_info *rvd); int rvt_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr); int rvt_init_port(struct rvt_dev_info *rdi, struct rvt_ibport *port, From 72c6ec18eb6161c8fc672ae96ec5c77df4d07405 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 5 Jun 2019 14:39:25 -0300 Subject: [PATCH 051/194] RDMA: Move uverbs_abi_ver into struct ib_device_ops No reason for every driver to emit code to set this, just make it part of the driver's existing static const ops structure. 
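The same sketch as for driver_id applies here (again a hypothetical "foo" driver; FOO_UVERBS_ABI_VERSION stands in for whatever ABI constant the driver already defines):

	/* Before: assigned on the ib_device at registration time */
	ibdev->uverbs_abi_ver = FOO_UVERBS_ABI_VERSION;

	/* After: carried in the static const ops and copied by
	 * ib_set_device_ops(); readers use ibdev->ops.uverbs_abi_ver.
	 */
	static const struct ib_device_ops foo_dev_ops = {
		.driver_id      = RDMA_DRIVER_FOO,
		.uverbs_abi_ver = FOO_UVERBS_ABI_VERSION,
	};

The one driver that selects its ABI version at runtime (mlx4, depending on dev->caps.userspace_caps) keeps doing so by overwriting ib_dev.ops.uverbs_abi_ver after ib_set_device_ops(), as the mlx4 hunk below shows.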
Signed-off-by: Jason Gunthorpe Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 2 ++ drivers/infiniband/core/uverbs_main.c | 2 +- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 6 +++--- drivers/infiniband/hw/bnxt_re/main.c | 2 +- drivers/infiniband/hw/cxgb3/iwch_provider.c | 2 +- drivers/infiniband/hw/cxgb4/provider.c | 2 +- drivers/infiniband/hw/efa/efa_main.c | 2 +- drivers/infiniband/hw/hns/hns_roce_main.c | 2 +- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 2 ++ drivers/infiniband/hw/mlx4/main.c | 15 ++++++++------- drivers/infiniband/hw/mlx5/main.c | 2 +- drivers/infiniband/hw/mthca/mthca_provider.c | 2 +- drivers/infiniband/hw/nes/nes_verbs.c | 2 ++ drivers/infiniband/hw/ocrdma/ocrdma_main.c | 2 +- drivers/infiniband/hw/qedr/main.c | 2 +- drivers/infiniband/hw/usnic/usnic_ib_main.c | 2 +- drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 2 +- drivers/infiniband/sw/rdmavt/vt.c | 3 ++- drivers/infiniband/sw/rxe/rxe_verbs.c | 2 +- include/rdma/ib_verbs.h | 2 +- 20 files changed, 33 insertions(+), 25 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 538d01f27bf8..a00b7fc360bf 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2323,6 +2323,8 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) dev_ops->driver_id != ops->driver_id); dev_ops->driver_id = ops->driver_id; } + if (ops->uverbs_abi_ver) + dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; SET_DEVICE_OP(dev_ops, add_gid); SET_DEVICE_OP(dev_ops, advise_mr); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 84a5e9a6d483..0f8a286a92d3 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -1186,7 +1186,7 @@ static ssize_t abi_version_show(struct device *device, srcu_key = srcu_read_lock(&dev->disassociate_srcu); ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); if (ib_dev) - ret = sprintf(buf, "%d\n", ib_dev->uverbs_abi_ver); + ret = sprintf(buf, "%u\n", ib_dev->ops.uverbs_abi_ver); srcu_read_unlock(&dev->disassociate_srcu, srcu_key); return ret; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 2c3685faa57a..8af8e1472101 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -3630,10 +3630,10 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) u32 chip_met_rev_num = 0; int rc; - dev_dbg(rdev_to_dev(rdev), "ABI version requested %d", - ibdev->uverbs_abi_ver); + dev_dbg(rdev_to_dev(rdev), "ABI version requested %u", + ibdev->ops.uverbs_abi_ver); - if (ibdev->uverbs_abi_ver != BNXT_RE_ABI_VERSION) { + if (ibdev->ops.uverbs_abi_ver != BNXT_RE_ABI_VERSION) { dev_dbg(rdev_to_dev(rdev), " is different from the device %d ", BNXT_RE_ABI_VERSION); return -EPERM; diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 1ef5f83ec914..a45cb9ee51ab 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -597,6 +597,7 @@ static void bnxt_re_unregister_ib(struct bnxt_re_dev *rdev) static const struct ib_device_ops bnxt_re_dev_ops = { .driver_id = RDMA_DRIVER_BNXT_RE, + .uverbs_abi_ver = BNXT_RE_ABI_VERSION, .add_gid = bnxt_re_add_gid, .alloc_hw_stats = bnxt_re_ib_alloc_hw_stats, @@ -663,7 +664,6 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) ibdev->local_dma_lkey = 
BNXT_QPLIB_RSVD_LKEY; /* User space */ - ibdev->uverbs_abi_ver = BNXT_RE_ABI_VERSION; ibdev->uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 9a9527fdcb9a..5b410605a3a1 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1237,6 +1237,7 @@ static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str) static const struct ib_device_ops iwch_dev_ops = { .driver_id = RDMA_DRIVER_CXGB3, + .uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION, .alloc_hw_stats = iwch_alloc_stats, .alloc_mr = iwch_alloc_mr, @@ -1316,7 +1317,6 @@ int iwch_register_device(struct iwch_dev *dev) dev->ibdev.phys_port_cnt = dev->rdev.port_info.nports; dev->ibdev.num_comp_vectors = 1; dev->ibdev.dev.parent = &dev->rdev.rnic_info.pdev->dev; - dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION; memcpy(dev->ibdev.iw_ifname, dev->rdev.t3cdev_p->lldev->name, sizeof(dev->ibdev.iw_ifname)); diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 74644afe25ab..c56cdfbd8a88 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -490,6 +490,7 @@ static int fill_res_entry(struct sk_buff *msg, struct rdma_restrack_entry *res) static const struct ib_device_ops c4iw_dev_ops = { .driver_id = RDMA_DRIVER_CXGB4, + .uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION, .alloc_hw_stats = c4iw_alloc_stats, .alloc_mr = c4iw_alloc_mr, @@ -595,7 +596,6 @@ void c4iw_register_device(struct work_struct *work) dev->ibdev.phys_port_cnt = dev->rdev.lldi.nports; dev->ibdev.num_comp_vectors = dev->rdev.lldi.nciq; dev->ibdev.dev.parent = &dev->rdev.lldi.pdev->dev; - dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION; memcpy(dev->ibdev.iw_ifname, dev->rdev.lldi.ports[0]->name, sizeof(dev->ibdev.iw_ifname)); diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c index 3803dd4526b5..b05c5a0b9bc0 100644 --- a/drivers/infiniband/hw/efa/efa_main.c +++ b/drivers/infiniband/hw/efa/efa_main.c @@ -198,6 +198,7 @@ static void efa_stats_init(struct efa_dev *dev) static const struct ib_device_ops efa_dev_ops = { .driver_id = RDMA_DRIVER_EFA, + .uverbs_abi_ver = EFA_UVERBS_ABI_VERSION, .alloc_pd = efa_alloc_pd, .alloc_ucontext = efa_alloc_ucontext, @@ -266,7 +267,6 @@ static int efa_ib_device_add(struct efa_dev *dev) dev->ibdev.phys_port_cnt = 1; dev->ibdev.num_comp_vectors = 1; dev->ibdev.dev.parent = &pdev->dev; - dev->ibdev.uverbs_abi_ver = EFA_UVERBS_ABI_VERSION; dev->ibdev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index dd408f8afe72..e496b0628e25 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -415,6 +415,7 @@ static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev) static const struct ib_device_ops hns_roce_dev_ops = { .driver_id = RDMA_DRIVER_HNS, + .uverbs_abi_ver = 1, .add_gid = hns_roce_add_gid, .alloc_pd = hns_roce_alloc_pd, @@ -489,7 +490,6 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) ib_dev->phys_port_cnt = hr_dev->caps.num_ports; ib_dev->local_dma_lkey = hr_dev->caps.reserved_lkey; ib_dev->num_comp_vectors = hr_dev->caps.num_comp_vectors; - ib_dev->uverbs_abi_ver = 1; ib_dev->uverbs_cmd_mask = (1ULL << 
IB_USER_VERBS_CMD_GET_CONTEXT) | (1ULL << IB_USER_VERBS_CMD_QUERY_DEVICE) | diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 1979cefdf90c..4dc647c8556b 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2651,6 +2651,8 @@ static int i40iw_query_pkey(struct ib_device *ibdev, static const struct ib_device_ops i40iw_dev_ops = { .driver_id = RDMA_DRIVER_I40IW, + /* NOTE: Older kernels wrongly use 0 for the uverbs_abi_ver */ + .uverbs_abi_ver = I40IW_ABI_VER, .alloc_hw_stats = i40iw_alloc_hw_stats, .alloc_mr = i40iw_alloc_mr, diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 03847e2f7835..1f87221acbb0 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1089,7 +1089,8 @@ static int mlx4_ib_alloc_ucontext(struct ib_ucontext *uctx, if (!dev->ib_active) return -EAGAIN; - if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) { + if (ibdev->ops.uverbs_abi_ver == + MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) { resp_v3.qp_tab_size = dev->dev->caps.num_qps; resp_v3.bf_reg_size = dev->dev->caps.bf_reg_size; resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; @@ -1111,7 +1112,7 @@ static int mlx4_ib_alloc_ucontext(struct ib_ucontext *uctx, INIT_LIST_HEAD(&context->wqn_ranges_list); mutex_init(&context->wqn_ranges_mutex); - if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) + if (ibdev->ops.uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3)); else err = ib_copy_to_udata(udata, &resp, sizeof(resp)); @@ -2510,6 +2511,7 @@ static void get_fw_ver_str(struct ib_device *device, char *str) static const struct ib_device_ops mlx4_ib_dev_ops = { .driver_id = RDMA_DRIVER_MLX4, + .uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION, .add_gid = mlx4_ib_add_gid, .alloc_mr = mlx4_ib_alloc_mr, @@ -2653,11 +2655,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; ibdev->ib_dev.dev.parent = &dev->persist->pdev->dev; - if (dev->caps.userspace_caps) - ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; - else - ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION; - ibdev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | @@ -2731,6 +2728,10 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_fs_ops); } + if (!dev->caps.userspace_caps) + ibdev->ib_dev.ops.uverbs_abi_ver = + MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION; + mlx4_ib_alloc_eqs(dev, ibdev); spin_lock_init(&iboe->lock); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index abd416a31e71..0c23eccb8855 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -6125,6 +6125,7 @@ static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev) static const struct ib_device_ops mlx5_ib_dev_ops = { .driver_id = RDMA_DRIVER_MLX5, + .uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION, .add_gid = mlx5_ib_add_gid, .alloc_mr = mlx5_ib_alloc_mr, @@ -6223,7 +6224,6 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) struct mlx5_core_dev *mdev = dev->mdev; int err; - dev->ib_dev.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION; dev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | diff --git 
a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index d6467da39aab..690c65accea4 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1154,6 +1154,7 @@ static void get_dev_fw_str(struct ib_device *device, char *str) static const struct ib_device_ops mthca_dev_ops = { .driver_id = RDMA_DRIVER_MTHCA, + .uverbs_abi_ver = MTHCA_UVERBS_ABI_VERSION, .alloc_pd = mthca_alloc_pd, .alloc_ucontext = mthca_alloc_ucontext, @@ -1247,7 +1248,6 @@ int mthca_register_device(struct mthca_dev *dev) dev->ib_dev.owner = THIS_MODULE; - dev->ib_dev.uverbs_abi_ver = MTHCA_UVERBS_ABI_VERSION; dev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 3c85e2ef5a08..f1fdd6829a40 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3559,6 +3559,8 @@ static void get_dev_fw_str(struct ib_device *dev, char *str) static const struct ib_device_ops nes_dev_ops = { .driver_id = RDMA_DRIVER_NES, + /* NOTE: Older kernels wrongly use 0 for the uverbs_abi_ver */ + .uverbs_abi_ver = NES_ABI_USERSPACE_VER, .alloc_mr = nes_alloc_mr, .alloc_mw = nes_alloc_mw, diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index a9da4b857566..ef823f1144b5 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -145,6 +145,7 @@ static const struct attribute_group ocrdma_attr_group = { static const struct ib_device_ops ocrdma_dev_ops = { .driver_id = RDMA_DRIVER_OCRDMA, + .uverbs_abi_ver = OCRDMA_ABI_VERSION, .alloc_mr = ocrdma_alloc_mr, .alloc_pd = ocrdma_alloc_pd, @@ -203,7 +204,6 @@ static int ocrdma_register_device(struct ocrdma_dev *dev) memcpy(dev->ibdev.node_desc, OCRDMA_NODE_DESC, sizeof(OCRDMA_NODE_DESC)); dev->ibdev.owner = THIS_MODULE; - dev->ibdev.uverbs_abi_ver = OCRDMA_ABI_VERSION; dev->ibdev.uverbs_cmd_mask = OCRDMA_UVERBS(GET_CONTEXT) | OCRDMA_UVERBS(QUERY_DEVICE) | diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 737745231f8f..793e25776a7e 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -184,6 +184,7 @@ static void qedr_roce_register_device(struct qedr_dev *dev) static const struct ib_device_ops qedr_dev_ops = { .driver_id = RDMA_DRIVER_QEDR, + .uverbs_abi_ver = QEDR_ABI_VERSION, .alloc_mr = qedr_alloc_mr, .alloc_pd = qedr_alloc_pd, @@ -234,7 +235,6 @@ static int qedr_register_device(struct qedr_dev *dev) dev->ibdev.node_guid = dev->attr.node_guid; memcpy(dev->ibdev.node_desc, QEDR_NODE_DESC, sizeof(QEDR_NODE_DESC)); dev->ibdev.owner = THIS_MODULE; - dev->ibdev.uverbs_abi_ver = QEDR_ABI_VERSION; dev->ibdev.uverbs_cmd_mask = QEDR_UVERBS(GET_CONTEXT) | QEDR_UVERBS(QUERY_DEVICE) | diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index 47830334cb55..f61690816095 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -330,6 +330,7 @@ static void usnic_get_dev_fw_str(struct ib_device *device, char *str) static const struct ib_device_ops usnic_dev_ops = { .driver_id = RDMA_DRIVER_USNIC, + .uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION, .alloc_pd = usnic_ib_alloc_pd, .alloc_ucontext = usnic_ib_alloc_ucontext, @@ -391,7 +392,6 @@ static void 
*usnic_ib_device_add(struct pci_dev *dev) us_ibdev->ib_dev.phys_port_cnt = USNIC_IB_PORT_CNT; us_ibdev->ib_dev.num_comp_vectors = USNIC_IB_NUM_COMP_VECTORS; us_ibdev->ib_dev.dev.parent = &dev->dev; - us_ibdev->ib_dev.uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION; us_ibdev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index 54a0b6372629..2a7eb2838453 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -145,6 +145,7 @@ static int pvrdma_port_immutable(struct ib_device *ibdev, u8 port_num, static const struct ib_device_ops pvrdma_dev_ops = { .driver_id = RDMA_DRIVER_VMW_PVRDMA, + .uverbs_abi_ver = PVRDMA_UVERBS_ABI_VERSION, .add_gid = pvrdma_add_gid, .alloc_mr = pvrdma_alloc_mr, @@ -203,7 +204,6 @@ static int pvrdma_register_device(struct pvrdma_dev *dev) dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.num_comp_vectors = 1; dev->ib_dev.dev.parent = &dev->pdev->dev; - dev->ib_dev.uverbs_abi_ver = PVRDMA_UVERBS_ABI_VERSION; dev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 60700e197e6c..639ef8ac5400 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -382,6 +382,8 @@ enum { }; static const struct ib_device_ops rvt_dev_ops = { + .uverbs_abi_ver = RVT_UVERBS_ABI_VERSION, + .alloc_fmr = rvt_alloc_fmr, .alloc_mr = rvt_alloc_mr, .alloc_pd = rvt_alloc_pd, @@ -600,7 +602,6 @@ int rvt_register_device(struct rvt_dev_info *rdi) * exactly which functions rdmavt supports, nor do they know the ABI * version, so we do all of this sort of stuff here. 
*/ - rdi->ibdev.uverbs_abi_ver = RVT_UVERBS_ABI_VERSION; rdi->ibdev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 3d3130dc6380..9e87cdb82bec 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1112,6 +1112,7 @@ static int rxe_enable_driver(struct ib_device *ib_dev) static const struct ib_device_ops rxe_dev_ops = { .driver_id = RDMA_DRIVER_RXE, + .uverbs_abi_ver = RXE_UVERBS_ABI_VERSION, .alloc_hw_stats = rxe_ib_alloc_hw_stats, .alloc_mr = rxe_alloc_mr, @@ -1184,7 +1185,6 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) dma_coerce_mask_and_coherent(&dev->dev, dma_get_required_mask(&dev->dev)); - dev->uverbs_abi_ver = RXE_UVERBS_ABI_VERSION; dev->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index dacf2b5ad862..16405b9bca13 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2330,6 +2330,7 @@ struct iw_cm_conn_param; */ struct ib_device_ops { enum rdma_driver_id driver_id; + u32 uverbs_abi_ver; int (*post_send)(struct ib_qp *qp, const struct ib_send_wr *send_wr, const struct ib_send_wr **bad_send_wr); @@ -2650,7 +2651,6 @@ struct ib_device { */ const struct attribute_group *groups[3]; - int uverbs_abi_ver; u64 uverbs_cmd_mask; u64 uverbs_ex_cmd_mask; From 7a15414252ae4f1d450462d83f883b2d9d8036ee Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 5 Jun 2019 14:39:26 -0300 Subject: [PATCH 052/194] RDMA: Move owner into struct ib_device_ops This more closely follows how other subsystems work, with owner being a member of the structure containing the function pointers.
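As a rough sketch of the resulting pattern (the foo_* names are placeholders, not part of this series; the fields and the core-side call are the ones touched here), a driver now declares the module owner next to the other per-driver constants, ib_set_device_ops() copies it into the device, and uverbs pins the module through the ops structure:

	static const struct ib_device_ops foo_dev_ops = {
		.owner		= THIS_MODULE,		/* was ib_dev->owner */
		.driver_id	= RDMA_DRIVER_UNKNOWN,	/* placeholder id */
		.uverbs_abi_ver	= 1,			/* placeholder ABI version */
		/* ... verbs function pointers ... */
	};

	static void foo_init_device(struct ib_device *ibdev)
	{
		/* copies .owner (like .driver_id and .uverbs_abi_ver) into ibdev->ops */
		ib_set_device_ops(ibdev, &foo_dev_ops);
	}

	/* core side: uverbs takes its module reference via the ops struct */
	static int foo_pin_driver_module(struct ib_device *ib_dev)
	{
		if (!ib_dev->ops.disassociate_ucontext &&
		    !try_module_get(ib_dev->ops.owner))
			return -ENODEV;
		return 0;
	}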
Signed-off-by: Jason Gunthorpe Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 4 ++++ drivers/infiniband/core/uverbs_main.c | 6 +++--- drivers/infiniband/hw/bnxt_re/main.c | 2 +- drivers/infiniband/hw/cxgb3/iwch_provider.c | 2 +- drivers/infiniband/hw/cxgb4/provider.c | 2 +- drivers/infiniband/hw/efa/efa_main.c | 2 +- drivers/infiniband/hw/hfi1/verbs.c | 2 +- drivers/infiniband/hw/hns/hns_roce_main.c | 2 +- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 2 +- drivers/infiniband/hw/mlx4/main.c | 2 +- drivers/infiniband/hw/mlx5/main.c | 2 +- drivers/infiniband/hw/mthca/mthca_provider.c | 3 +-- drivers/infiniband/hw/nes/nes_verbs.c | 2 +- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 2 +- drivers/infiniband/hw/qedr/main.c | 2 +- drivers/infiniband/hw/qib/qib_verbs.c | 2 +- drivers/infiniband/hw/usnic/usnic_ib_main.c | 2 +- drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 2 +- drivers/infiniband/sw/rxe/rxe_verbs.c | 2 +- include/rdma/ib_verbs.h | 2 +- 20 files changed, 25 insertions(+), 22 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index a00b7fc360bf..357d74c8df2b 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2323,6 +2323,10 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) dev_ops->driver_id != ops->driver_id); dev_ops->driver_id = ops->driver_id; } + if (ops->owner) { + WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner); + dev_ops->owner = ops->owner; + } if (ops->uverbs_abi_ver) dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 0f8a286a92d3..870b3dd35aac 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -198,7 +198,7 @@ void ib_uverbs_release_file(struct kref *ref) ib_dev = srcu_dereference(file->device->ib_dev, &file->device->disassociate_srcu); if (ib_dev && !ib_dev->ops.disassociate_ucontext) - module_put(ib_dev->owner); + module_put(ib_dev->ops.owner); srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); if (atomic_dec_and_test(&file->device->refcount)) @@ -1065,7 +1065,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) module_dependent = !(ib_dev->ops.disassociate_ucontext); if (module_dependent) { - if (!try_module_get(ib_dev->owner)) { + if (!try_module_get(ib_dev->ops.owner)) { ret = -ENODEV; goto err; } @@ -1100,7 +1100,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) return stream_open(inode, filp); err_module: - module_put(ib_dev->owner); + module_put(ib_dev->ops.owner); err: mutex_unlock(&dev->lists_mutex); diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index a45cb9ee51ab..351c420248a0 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -596,6 +596,7 @@ static void bnxt_re_unregister_ib(struct bnxt_re_dev *rdev) } static const struct ib_device_ops bnxt_re_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_BNXT_RE, .uverbs_abi_ver = BNXT_RE_ABI_VERSION, @@ -651,7 +652,6 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) int ret; /* ib device init */ - ibdev->owner = THIS_MODULE; ibdev->node_type = RDMA_NODE_IB_CA; strlcpy(ibdev->node_desc, BNXT_RE_DESC " HCA", strlen(BNXT_RE_DESC) + 5); diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 
5b410605a3a1..1b35941eee74 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1236,6 +1236,7 @@ static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str) } static const struct ib_device_ops iwch_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_CXGB3, .uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION, @@ -1285,7 +1286,6 @@ int iwch_register_device(struct iwch_dev *dev) pr_debug("%s iwch_dev %p\n", __func__, dev); memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6); - dev->ibdev.owner = THIS_MODULE; dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW | IB_DEVICE_MEM_MGT_EXTENSIONS; diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index c56cdfbd8a88..2b1f2443b7da 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -489,6 +489,7 @@ static int fill_res_entry(struct sk_buff *msg, struct rdma_restrack_entry *res) } static const struct ib_device_ops c4iw_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_CXGB4, .uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION, @@ -563,7 +564,6 @@ void c4iw_register_device(struct work_struct *work) pr_debug("c4iw_dev %p\n", dev); memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); memcpy(&dev->ibdev.node_guid, dev->rdev.lldi.ports[0]->dev_addr, 6); - dev->ibdev.owner = THIS_MODULE; dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW; if (fastreg_support) dev->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c index b05c5a0b9bc0..b891ee239a67 100644 --- a/drivers/infiniband/hw/efa/efa_main.c +++ b/drivers/infiniband/hw/efa/efa_main.c @@ -197,6 +197,7 @@ static void efa_stats_init(struct efa_dev *dev) } static const struct ib_device_ops efa_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_EFA, .uverbs_abi_ver = EFA_UVERBS_ABI_VERSION, @@ -262,7 +263,6 @@ static int efa_ib_device_add(struct efa_dev *dev) if (err) goto err_release_doorbell_bar; - dev->ibdev.owner = THIS_MODULE; dev->ibdev.node_type = RDMA_NODE_UNSPECIFIED; dev->ibdev.phys_port_cnt = 1; dev->ibdev.num_comp_vectors = 1; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index a97f4f9e5c6a..1f36db98240f 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1779,6 +1779,7 @@ static int get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, } static const struct ib_device_ops hfi1_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_HFI1, .alloc_hw_stats = alloc_hw_stats, @@ -1831,7 +1832,6 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) */ if (!ib_hfi1_sys_image_guid) ib_hfi1_sys_image_guid = ibdev->node_guid; - ibdev->owner = THIS_MODULE; ibdev->phys_port_cnt = dd->num_pports; ibdev->dev.parent = &dd->pcidev->dev; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index e496b0628e25..f07b2ec86ec2 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -414,6 +414,7 @@ static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev) } static const struct ib_device_ops hns_roce_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_HNS, .uverbs_abi_ver = 1, @@ -483,7 +484,6 @@ static int 
hns_roce_register_device(struct hns_roce_dev *hr_dev) ib_dev = &hr_dev->ib_dev; - ib_dev->owner = THIS_MODULE; ib_dev->node_type = RDMA_NODE_IB_CA; ib_dev->dev.parent = dev; diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 4dc647c8556b..bfe16e6f04f4 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2650,6 +2650,7 @@ static int i40iw_query_pkey(struct ib_device *ibdev, } static const struct ib_device_ops i40iw_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_I40IW, /* NOTE: Older kernels wrongly use 0 for the uverbs_abi_ver */ .uverbs_abi_ver = I40IW_ABI_VER, @@ -2711,7 +2712,6 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev i40iw_pr_err("iwdev == NULL\n"); return NULL; } - iwibdev->ibdev.owner = THIS_MODULE; iwdev->iwibdev = iwibdev; iwibdev->iwdev = iwdev; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 1f87221acbb0..5d7a87842291 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2510,6 +2510,7 @@ static void get_fw_ver_str(struct ib_device *device, char *str) } static const struct ib_device_ops mlx4_ib_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_MLX4, .uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION, @@ -2646,7 +2647,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->dev = dev; ibdev->bond_next_port = 0; - ibdev->ib_dev.owner = THIS_MODULE; ibdev->ib_dev.node_type = RDMA_NODE_IB_CA; ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey; ibdev->num_ports = num_ports; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 0c23eccb8855..1e3d936ed809 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -6044,7 +6044,6 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) if (mlx5_use_mad_ifc(dev)) get_ext_port_caps(dev); - dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.node_type = RDMA_NODE_IB_CA; dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; dev->ib_dev.phys_port_cnt = dev->num_ports; @@ -6124,6 +6123,7 @@ static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev) } static const struct ib_device_ops mlx5_ib_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_MLX5, .uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION, diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 690c65accea4..b128ff76f709 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1153,6 +1153,7 @@ static void get_dev_fw_str(struct ib_device *device, char *str) } static const struct ib_device_ops mthca_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_MTHCA, .uverbs_abi_ver = MTHCA_UVERBS_ABI_VERSION, @@ -1246,8 +1247,6 @@ int mthca_register_device(struct mthca_dev *dev) if (ret) return ret; - dev->ib_dev.owner = THIS_MODULE; - dev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index f1fdd6829a40..db044b2eaead 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3558,6 +3558,7 @@ static void get_dev_fw_str(struct ib_device *dev, char *str) } static const struct ib_device_ops nes_dev_ops = { + .owner = THIS_MODULE, .driver_id = 
RDMA_DRIVER_NES, /* NOTE: Older kernels wrongly use 0 for the uverbs_abi_ver */ .uverbs_abi_ver = NES_ABI_USERSPACE_VER, @@ -3617,7 +3618,6 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev) if (nesibdev == NULL) { return NULL; } - nesibdev->ibdev.owner = THIS_MODULE; nesibdev->ibdev.node_type = RDMA_NODE_RNIC; memset(&nesibdev->ibdev.node_guid, 0, sizeof(nesibdev->ibdev.node_guid)); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index ef823f1144b5..b326313d413f 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -144,6 +144,7 @@ static const struct attribute_group ocrdma_attr_group = { }; static const struct ib_device_ops ocrdma_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_OCRDMA, .uverbs_abi_ver = OCRDMA_ABI_VERSION, @@ -203,7 +204,6 @@ static int ocrdma_register_device(struct ocrdma_dev *dev) BUILD_BUG_ON(sizeof(OCRDMA_NODE_DESC) > IB_DEVICE_NODE_DESC_MAX); memcpy(dev->ibdev.node_desc, OCRDMA_NODE_DESC, sizeof(OCRDMA_NODE_DESC)); - dev->ibdev.owner = THIS_MODULE; dev->ibdev.uverbs_cmd_mask = OCRDMA_UVERBS(GET_CONTEXT) | OCRDMA_UVERBS(QUERY_DEVICE) | diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 793e25776a7e..a0bb07ba0f3c 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -183,6 +183,7 @@ static void qedr_roce_register_device(struct qedr_dev *dev) } static const struct ib_device_ops qedr_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_QEDR, .uverbs_abi_ver = QEDR_ABI_VERSION, @@ -234,7 +235,6 @@ static int qedr_register_device(struct qedr_dev *dev) dev->ibdev.node_guid = dev->attr.node_guid; memcpy(dev->ibdev.node_desc, QEDR_NODE_DESC, sizeof(QEDR_NODE_DESC)); - dev->ibdev.owner = THIS_MODULE; dev->ibdev.uverbs_cmd_mask = QEDR_UVERBS(GET_CONTEXT) | QEDR_UVERBS(QUERY_DEVICE) | diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index bbc331d7f49b..54310fd6c7b6 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1482,6 +1482,7 @@ static void qib_fill_device_attr(struct qib_devdata *dd) } static const struct ib_device_ops qib_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_QIB, .init_port = qib_create_port_files, @@ -1547,7 +1548,6 @@ int qib_register_ib_device(struct qib_devdata *dd) if (!ib_qib_sys_image_guid) ib_qib_sys_image_guid = ppd->guid; - ibdev->owner = THIS_MODULE; ibdev->node_guid = ppd->guid; ibdev->phys_port_cnt = dd->num_pports; ibdev->dev.parent = &dd->pcidev->dev; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index f61690816095..e701322dc740 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -329,6 +329,7 @@ static void usnic_get_dev_fw_str(struct ib_device *device, char *str) } static const struct ib_device_ops usnic_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_USNIC, .uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION, @@ -387,7 +388,6 @@ static void *usnic_ib_device_add(struct pci_dev *dev) us_ibdev->pdev = dev; us_ibdev->netdev = pci_get_drvdata(dev); - us_ibdev->ib_dev.owner = THIS_MODULE; us_ibdev->ib_dev.node_type = RDMA_NODE_USNIC_UDP; us_ibdev->ib_dev.phys_port_cnt = USNIC_IB_PORT_CNT; us_ibdev->ib_dev.num_comp_vectors = USNIC_IB_NUM_COMP_VECTORS; diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c 
b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index 2a7eb2838453..0c48464ffff1 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -144,6 +144,7 @@ static int pvrdma_port_immutable(struct ib_device *ibdev, u8 port_num, } static const struct ib_device_ops pvrdma_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_VMW_PVRDMA, .uverbs_abi_ver = PVRDMA_UVERBS_ABI_VERSION, @@ -201,7 +202,6 @@ static int pvrdma_register_device(struct pvrdma_dev *dev) dev->ib_dev.node_guid = dev->dsr->caps.node_guid; dev->sys_image_guid = dev->dsr->caps.sys_image_guid; dev->flags = 0; - dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.num_comp_vectors = 1; dev->ib_dev.dev.parent = &dev->pdev->dev; dev->ib_dev.uverbs_cmd_mask = diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 9e87cdb82bec..046129393215 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1111,6 +1111,7 @@ static int rxe_enable_driver(struct ib_device *ib_dev) } static const struct ib_device_ops rxe_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_RXE, .uverbs_abi_ver = RXE_UVERBS_ABI_VERSION, @@ -1173,7 +1174,6 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) strlcpy(dev->node_desc, "rxe", sizeof(dev->node_desc)); - dev->owner = THIS_MODULE; dev->node_type = RDMA_NODE_IB_CA; dev->phys_port_cnt = 1; dev->num_comp_vectors = num_possible_cpus(); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 16405b9bca13..d1f16a6c4810 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2329,6 +2329,7 @@ struct iw_cm_conn_param; * need to define the supported operations, otherwise they will be set to null. */ struct ib_device_ops { + struct module *owner; enum rdma_driver_id driver_id; u32 uverbs_abi_ver; @@ -2639,7 +2640,6 @@ struct ib_device { int num_comp_vectors; - struct module *owner; union { struct device dev; struct ib_core_device coredev; From 147b308e6a63a0572b750b1dc9b8b6fc33997e4b Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 28 May 2019 14:37:27 +0300 Subject: [PATCH 053/194] RDMA/nes: Avoid memory allocation during CQ destroy The memory allocation call can fail and cause an early return from the nes_destroy_cq() function. This situation will cause a memory leak of struct nes_cq. Rewrite the function to avoid the memory allocation.
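Condensed, the rewritten destroy path has roughly the following shape (field and helper names are taken from the diff below; WQE setup and debug output are omitted, and the _outline name is illustrative):

	static void nes_destroy_cq_outline(struct nes_device *nesdev)
	{
		/* On-stack request: no allocation that can fail, so no early
		 * return that would leak the struct nes_cq. */
		struct nes_cqp_request cqp_request = {};

		INIT_LIST_HEAD(&cqp_request.list);
		init_waitqueue_head(&cqp_request.waitq);
		cqp_request.waiting = 1;

		/* ... build the destroy-CQ WQE in cqp_request.cqp_wqe ... */

		nes_post_cqp_request(nesdev, &cqp_request);

		/* No refcounting and no nes_put_cqp_request(): the request was
		 * never taken from the device pool, it simply goes out of scope. */
		wait_event_timeout(cqp_request.waitq, cqp_request.request_done,
				   NES_EVENT_TIMEOUT);
	}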
Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/nes/nes_verbs.c | 28 +++++++++++---------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index db044b2eaead..7677f1f734bb 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -1641,7 +1641,7 @@ static int nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) struct nes_vnic *nesvnic; struct nes_adapter *nesadapter; struct nes_hw_cqp_wqe *cqp_wqe; - struct nes_cqp_request *cqp_request; + struct nes_cqp_request cqp_request = {}; unsigned long flags; u32 opcode = 0; int ret; @@ -1654,13 +1654,10 @@ static int nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) nes_debug(NES_DBG_CQ, "Destroy CQ%u\n", nescq->hw_cq.cq_number); /* Send DestroyCQ request to CQP */ - cqp_request = nes_get_cqp_request(nesdev); - if (cqp_request == NULL) { - nes_debug(NES_DBG_CQ, "Failed to get a cqp_request.\n"); - return -ENOMEM; - } - cqp_request->waiting = 1; - cqp_wqe = &cqp_request->cqp_wqe; + INIT_LIST_HEAD(&cqp_request.list); + init_waitqueue_head(&cqp_request.waitq); + cqp_request.waiting = 1; + cqp_wqe = &cqp_request.cqp_wqe; opcode = NES_CQP_DESTROY_CQ | (nescq->hw_cq.cq_size << 16); spin_lock_irqsave(&nesadapter->pbl_lock, flags); if (nescq->virtual_cq == 1) { @@ -1687,30 +1684,28 @@ static int nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) if (!nescq->mcrqf) nes_free_resource(nesadapter, nesadapter->allocated_cqs, nescq->hw_cq.cq_number); - atomic_set(&cqp_request->refcount, 2); - nes_post_cqp_request(nesdev, cqp_request); + nes_post_cqp_request(nesdev, &cqp_request); /* Wait for CQP */ nes_debug(NES_DBG_CQ, "Waiting for destroy iWARP CQ%u to complete.\n", nescq->hw_cq.cq_number); - ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done), - NES_EVENT_TIMEOUT); + ret = wait_event_timeout(cqp_request.waitq, cqp_request.request_done, + NES_EVENT_TIMEOUT); nes_debug(NES_DBG_CQ, "Destroy iWARP CQ%u completed, wait_event_timeout ret = %u," " CQP Major:Minor codes = 0x%04X:0x%04X.\n", - nescq->hw_cq.cq_number, ret, cqp_request->major_code, - cqp_request->minor_code); + nescq->hw_cq.cq_number, ret, cqp_request.major_code, + cqp_request.minor_code); if (!ret) { nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy timeout expired\n", nescq->hw_cq.cq_number); ret = -ETIME; - } else if (cqp_request->major_code) { + } else if (cqp_request.major_code) { nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy failed\n", nescq->hw_cq.cq_number); ret = -EIO; } else { ret = 0; } - nes_put_cqp_request(nesdev, cqp_request); if (nescq->cq_mem_size) pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, @@ -1899,7 +1894,6 @@ static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd, } barrier(); - atomic_set(&cqp_request->refcount, 2); nes_post_cqp_request(nesdev, cqp_request); /* Wait for CQP */ From a52c8e2469c30cf7ac453d624aed9c168b23d1af Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 28 May 2019 14:37:28 +0300 Subject: [PATCH 054/194] RDMA: Clean destroy CQ in drivers do not return errors Like all other destroy commands, .destroy_cq() call is not supposed to fail. In all flows, the attempt to return earlier caused to memory leaks. This patch converts .destroy_cq() to do not return any errors. 
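The interface change itself is small; sketched from the include/rdma/ib_verbs.h and core/verbs.c hunks below (the _outline name is illustrative), the callback loses its return value and the core stops propagating an error that could only leak the CQ:

	/* ops member: destroy can no longer report failure */
	void (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata);

	/* core call site */
	int ib_destroy_cq_user_outline(struct ib_cq *cq, struct ib_udata *udata)
	{
		if (atomic_read(&cq->usecnt))
			return -EBUSY;

		rdma_restrack_del(&cq->res);
		cq->device->ops.destroy_cq(cq, udata);
		return 0;
	}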
Signed-off-by: Leon Romanovsky Acked-by: Gal Pressman Acked-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/core/cq.c | 5 +- drivers/infiniband/core/verbs.c | 3 +- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 13 ++--- drivers/infiniband/hw/bnxt_re/ib_verbs.h | 2 +- drivers/infiniband/hw/cxgb3/cxio_hal.c | 6 +-- drivers/infiniband/hw/cxgb3/cxio_hal.h | 2 +- drivers/infiniband/hw/cxgb3/iwch_provider.c | 3 +- drivers/infiniband/hw/cxgb4/cq.c | 13 ++--- drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 2 +- drivers/infiniband/hw/efa/efa.h | 2 +- drivers/infiniband/hw/efa/efa_verbs.c | 9 +--- drivers/infiniband/hw/hns/hns_roce_cq.c | 50 +++++++++---------- drivers/infiniband/hw/hns/hns_roce_device.h | 4 +- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 14 ++---- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 3 +- drivers/infiniband/hw/mlx4/cq.c | 4 +- drivers/infiniband/hw/mlx4/mlx4_ib.h | 2 +- drivers/infiniband/hw/mlx5/cq.c | 4 +- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +- drivers/infiniband/hw/mthca/mthca_provider.c | 4 +- drivers/infiniband/hw/nes/nes_utils.c | 4 +- drivers/infiniband/hw/nes/nes_verbs.c | 30 ++++------- drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 8 ++- drivers/infiniband/hw/ocrdma/ocrdma_hw.h | 2 +- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 6 +-- drivers/infiniband/hw/ocrdma/ocrdma_verbs.h | 2 +- drivers/infiniband/hw/qedr/verbs.c | 20 +------- drivers/infiniband/hw/qedr/verbs.h | 2 +- drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 4 +- drivers/infiniband/hw/usnic/usnic_ib_verbs.h | 2 +- drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c | 6 +-- .../infiniband/hw/vmw_pvrdma/pvrdma_verbs.h | 2 +- drivers/infiniband/sw/rdmavt/cq.c | 6 +-- drivers/infiniband/sw/rdmavt/cq.h | 2 +- drivers/infiniband/sw/rxe/rxe_verbs.c | 3 +- include/rdma/ib_verbs.h | 2 +- 36 files changed, 82 insertions(+), 166 deletions(-) diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index cb72aa4985a4..6ee62600a812 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -207,8 +207,6 @@ EXPORT_SYMBOL(__ib_alloc_cq_user); */ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata) { - int ret; - if (WARN_ON_ONCE(atomic_read(&cq->usecnt))) return; @@ -228,7 +226,6 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata) kfree(cq->wc); rdma_restrack_del(&cq->res); - ret = cq->device->ops.destroy_cq(cq, udata); - WARN_ON_ONCE(ret); + cq->device->ops.destroy_cq(cq, udata); } EXPORT_SYMBOL(ib_free_cq_user); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 4fd5aad890d2..933bc35701ad 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1949,7 +1949,8 @@ int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata) return -EBUSY; rdma_restrack_del(&cq->res); - return cq->device->ops.destroy_cq(cq, udata); + cq->device->ops.destroy_cq(cq, udata); + return 0; } EXPORT_SYMBOL(ib_destroy_cq_user); diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 8af8e1472101..0127af45dcd1 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -2517,9 +2517,8 @@ int bnxt_re_post_recv(struct ib_qp *ib_qp, const struct ib_recv_wr *wr, } /* Completion Queues */ -int bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) +void bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) { - int rc; struct bnxt_re_cq *cq; struct bnxt_qplib_nq *nq; struct bnxt_re_dev *rdev; @@ -2528,20 
+2527,14 @@ int bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) rdev = cq->rdev; nq = cq->qplib_cq.nq; - rc = bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq); - if (rc) { - dev_err(rdev_to_dev(rdev), "Failed to destroy HW CQ"); - return rc; - } - if (!IS_ERR_OR_NULL(cq->umem)) + bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq); + if (!cq->umem) ib_umem_release(cq->umem); atomic_dec(&rdev->cq_count); nq->budget--; kfree(cq->cql); kfree(cq); - - return 0; } struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev, diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index 09a33049e42f..828403ee0104 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -193,7 +193,7 @@ int bnxt_re_post_recv(struct ib_qp *qp, const struct ib_recv_wr *recv_wr, struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_udata *udata); -int bnxt_re_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); +void bnxt_re_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int bnxt_re_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc); int bnxt_re_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *pd, int mr_access_flags); diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c index d9c741fea0e9..37ee93824349 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.c +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -303,17 +303,15 @@ err1: return -ENOMEM; } -int cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) +void cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) { - int err; - err = cxio_hal_clear_cq_ctx(rdev_p, cq->cqid); + cxio_hal_clear_cq_ctx(rdev_p, cq->cqid); kfree(cq->sw_queue); dma_free_coherent(&(rdev_p->rnic_info.pdev->dev), (1UL << (cq->size_log2)) * sizeof(struct t3_cqe) + 1, cq->queue, dma_unmap_addr(cq, mapping)); cxio_hal_put_cqid(rdev_p->rscp, cq->cqid); - return err; } int cxio_destroy_qp(struct cxio_rdev *rdev_p, struct t3_wq *wq, diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h index 5fc26a4648d3..40c029ffa425 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.h +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h @@ -158,7 +158,7 @@ void cxio_rdev_close(struct cxio_rdev *rdev); int cxio_hal_cq_op(struct cxio_rdev *rdev, struct t3_cq *cq, enum t3_cq_opcode op, u32 credit); int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq, int kernel); -int cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq); +void cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq); void cxio_release_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx); void cxio_init_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx); int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq, diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 1b35941eee74..5bde4ae93681 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -88,7 +88,7 @@ static int iwch_alloc_ucontext(struct ib_ucontext *ucontext, return 0; } -static int iwch_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) +static void iwch_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) { struct iwch_cq *chp; @@ -101,7 +101,6 @@ static int iwch_destroy_cq(struct ib_cq *ib_cq, struct ib_udata 
*udata) cxio_destroy_cq(&chp->rhp->rdev, &chp->cq); kfree(chp); - return 0; } static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index 6557e7c5af66..f49e6d271c42 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -34,14 +34,13 @@ #include "iw_cxgb4.h" -static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, - struct c4iw_dev_ucontext *uctx, struct sk_buff *skb, - struct c4iw_wr_wait *wr_waitp) +static void destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, + struct c4iw_dev_ucontext *uctx, struct sk_buff *skb, + struct c4iw_wr_wait *wr_waitp) { struct fw_ri_res_wr *res_wr; struct fw_ri_res *res; int wr_len; - int ret; wr_len = sizeof(*res_wr) + sizeof(*res); set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); @@ -59,14 +58,13 @@ static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, res->u.cq.iqid = cpu_to_be32(cq->cqid); c4iw_init_wr_wait(wr_waitp); - ret = c4iw_ref_send_wait(rdev, skb, wr_waitp, 0, 0, __func__); + c4iw_ref_send_wait(rdev, skb, wr_waitp, 0, 0, __func__); kfree(cq->sw_queue); dma_free_coherent(&(rdev->lldi.pdev->dev), cq->memsize, cq->queue, dma_unmap_addr(cq, mapping)); c4iw_put_cqid(rdev, cq->cqid, uctx); - return ret; } static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, @@ -970,7 +968,7 @@ int c4iw_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) return !err || err == -ENODATA ? npolled : err; } -int c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) +void c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) { struct c4iw_cq *chp; struct c4iw_ucontext *ucontext; @@ -989,7 +987,6 @@ int c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) chp->destroy_skb, chp->wr_waitp); c4iw_put_wr_wait(chp->wr_waitp); kfree(chp); - return 0; } struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index cf7512b2c4c0..45e720288f0f 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -992,7 +992,7 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, struct ib_udata *udata); struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc); int c4iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata); -int c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata); +void c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata); struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h index 14a36546985b..52d894f0ad3e 100644 --- a/drivers/infiniband/hw/efa/efa.h +++ b/drivers/infiniband/hw/efa/efa.h @@ -134,7 +134,7 @@ int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); struct ib_qp *efa_create_qp(struct ib_pd *ibpd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata); -int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); struct ib_cq *efa_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 607aff869200..42865cf4f149 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -847,25 +847,20 @@ static int efa_destroy_cq_idx(struct efa_dev *dev, int 
cq_idx) return efa_com_destroy_cq(&dev->edev, ¶ms); } -int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) { struct efa_dev *dev = to_edev(ibcq->device); struct efa_cq *cq = to_ecq(ibcq); - int err; ibdev_dbg(&dev->ibdev, "Destroy cq[%d] virt[0x%p] freed: size[%lu], dma[%pad]\n", cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr); - err = efa_destroy_cq_idx(dev, cq->cq_idx); - if (err) - return err; - + efa_destroy_cq_idx(dev, cq->cq_idx); dma_unmap_single(&dev->pdev->dev, cq->dma_addr, cq->size, DMA_FROM_DEVICE); kfree(cq); - return 0; } static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq, diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 6e81ff3f1813..0eb7c16c007b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -443,40 +443,36 @@ err_cq: } EXPORT_SYMBOL_GPL(hns_roce_ib_create_cq); -int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) +void hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device); struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq); - int ret = 0; if (hr_dev->hw->destroy_cq) { - ret = hr_dev->hw->destroy_cq(ib_cq, udata); - } else { - hns_roce_free_cq(hr_dev, hr_cq); - hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt); - - if (udata) { - ib_umem_release(hr_cq->umem); - - if (hr_cq->db_en == 1) - hns_roce_db_unmap_user( - rdma_udata_to_drv_context( - udata, - struct hns_roce_ucontext, - ibucontext), - &hr_cq->db); - } else { - /* Free the buff of stored cq */ - hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf, - ib_cq->cqe); - if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) - hns_roce_free_db(hr_dev, &hr_cq->db); - } - - kfree(hr_cq); + hr_dev->hw->destroy_cq(ib_cq, udata); + return; } - return ret; + hns_roce_free_cq(hr_dev, hr_cq); + hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt); + + if (udata) { + ib_umem_release(hr_cq->umem); + + if (hr_cq->db_en == 1) + hns_roce_db_unmap_user(rdma_udata_to_drv_context( + udata, + struct hns_roce_ucontext, + ibucontext), + &hr_cq->db); + } else { + /* Free the buff of stored cq */ + hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf, ib_cq->cqe); + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) + hns_roce_free_db(hr_dev, &hr_cq->db); + } + + kfree(hr_cq); } EXPORT_SYMBOL_GPL(hns_roce_ib_destroy_cq); diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index ce23338831eb..2f7d9644fd24 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -938,7 +938,7 @@ struct hns_roce_hw { int (*poll_cq)(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int (*dereg_mr)(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr, struct ib_udata *udata); - int (*destroy_cq)(struct ib_cq *ibcq, struct ib_udata *udata); + void (*destroy_cq)(struct ib_cq *ibcq, struct ib_udata *udata); int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period); int (*init_eq)(struct hns_roce_dev *hr_dev); void (*cleanup_eq)(struct hns_roce_dev *hr_dev); @@ -1209,7 +1209,7 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, const struct ib_cq_init_attr *attr, struct ib_udata *udata); -int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata); +void hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata); void hns_roce_free_cq(struct 
hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq); int hns_roce_db_map_user(struct hns_roce_ucontext *context, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index 878c8ae35630..aa7b67d283af 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -865,8 +865,7 @@ alloc_pd_failed: kfree(pd); alloc_mem_failed: - if (hns_roce_ib_destroy_cq(cq, NULL)) - dev_err(dev, "Destroy cq for create_lp_qp failed!\n"); + hns_roce_ib_destroy_cq(cq, NULL); return ret; } @@ -894,10 +893,7 @@ static void hns_roce_v1_release_lp_qp(struct hns_roce_dev *hr_dev) i, ret); } - ret = hns_roce_ib_destroy_cq(&free_mr->mr_free_cq->ib_cq, NULL); - if (ret) - dev_err(dev, "Destroy cq for mr_free failed(%d)!\n", ret); - + hns_roce_ib_destroy_cq(&free_mr->mr_free_cq->ib_cq, NULL); hns_roce_dealloc_pd(&free_mr->mr_free_pd->ibpd, NULL); } @@ -3654,7 +3650,7 @@ int hns_roce_v1_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) return 0; } -static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +static void hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(ibcq->device); struct hns_roce_cq *hr_cq = to_hr_cq(ibcq); @@ -3663,7 +3659,6 @@ static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) u32 cqe_cnt_cur; u32 cq_buf_size; int wait_time = 0; - int ret = 0; hns_roce_free_cq(hr_dev, hr_cq); @@ -3685,7 +3680,6 @@ static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) if (wait_time > HNS_ROCE_MAX_FREE_CQ_WAIT_CNT) { dev_warn(dev, "Destroy cq 0x%lx timeout!\n", hr_cq->cqn); - ret = -ETIMEDOUT; break; } wait_time++; @@ -3702,8 +3696,6 @@ static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) } kfree(hr_cq); - - return ret; } static void set_eq_cons_index_v1(struct hns_roce_eq *eq, int req_not) diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index bfe16e6f04f4..205053cb5f97 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -1064,7 +1064,7 @@ void i40iw_cq_wq_destroy(struct i40iw_device *iwdev, struct i40iw_sc_cq *cq) * @ib_cq: cq pointer * @udata: user data or NULL for kernel object */ -static int i40iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) +static void i40iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) { struct i40iw_cq *iwcq; struct i40iw_device *iwdev; @@ -1077,7 +1077,6 @@ static int i40iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) cq_free_resources(iwdev, iwcq); kfree(iwcq); i40iw_rem_devusecount(iwdev); - return 0; } /** diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 022a0b4ea452..8eb7490dabb8 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -486,7 +486,7 @@ out: return err; } -int mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) +void mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(cq->device); struct mlx4_ib_cq *mcq = to_mcq(cq); @@ -508,8 +508,6 @@ int mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) } kfree(mcq); - - return 0; } static void dump_cqe(void *cqe) diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 26897102057d..af5ee45a9f19 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ 
-746,7 +746,7 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_udata *udata); -int mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); +void mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq); diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 2e2e65f00257..ebd01bd7f8f6 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -998,7 +998,7 @@ err_create: return ERR_PTR(err); } -int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) +void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(cq->device); struct mlx5_ib_cq *mcq = to_mcq(cq); @@ -1010,8 +1010,6 @@ int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) destroy_cq_kernel(dev, mcq); kfree(mcq); - - return 0; } static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 40eb8be482e4..bf697c5b3e79 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1118,7 +1118,7 @@ int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_udata *udata); -int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); +void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index b128ff76f709..81fc04e1c142 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -804,7 +804,7 @@ out: return ret; } -static int mthca_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) +static void mthca_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) { if (udata) { struct mthca_ucontext *context = @@ -824,8 +824,6 @@ static int mthca_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) } mthca_free_cq(to_mdev(cq->device), to_mcq(cq)); kfree(cq); - - return 0; } static inline u32 convert_access(int acc) diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c index 90f28890246d..e976292fc6c0 100644 --- a/drivers/infiniband/hw/nes/nes_utils.c +++ b/drivers/infiniband/hw/nes/nes_utils.c @@ -588,9 +588,7 @@ struct nes_cqp_request *nes_get_cqp_request(struct nes_device *nesdev) cqp_request->callback = 0; nes_debug(NES_DBG_CQP, "Got cqp request %p from the available list \n", cqp_request); - } else - printk(KERN_ERR PFX "%s: Could not allocated a CQP request.\n", - __func__); + } return cqp_request; } diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 7677f1f734bb..cac3fa624c4d 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -1634,7 +1634,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, /** 
* nes_destroy_cq */ -static int nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) +static void nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) { struct nes_cq *nescq; struct nes_device *nesdev; @@ -1644,7 +1644,6 @@ static int nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) struct nes_cqp_request cqp_request = {}; unsigned long flags; u32 opcode = 0; - int ret; nescq = to_nescq(ib_cq); nesvnic = to_nesvnic(ib_cq->device); @@ -1656,6 +1655,7 @@ static int nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) /* Send DestroyCQ request to CQP */ INIT_LIST_HEAD(&cqp_request.list); init_waitqueue_head(&cqp_request.waitq); + cqp_request.waiting = 1; cqp_wqe = &cqp_request.cqp_wqe; opcode = NES_CQP_DESTROY_CQ | (nescq->hw_cq.cq_size << 16); @@ -1689,30 +1689,18 @@ static int nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) /* Wait for CQP */ nes_debug(NES_DBG_CQ, "Waiting for destroy iWARP CQ%u to complete.\n", nescq->hw_cq.cq_number); - ret = wait_event_timeout(cqp_request.waitq, cqp_request.request_done, - NES_EVENT_TIMEOUT); - nes_debug(NES_DBG_CQ, "Destroy iWARP CQ%u completed, wait_event_timeout ret = %u," - " CQP Major:Minor codes = 0x%04X:0x%04X.\n", - nescq->hw_cq.cq_number, ret, cqp_request.major_code, - cqp_request.minor_code); - if (!ret) { - nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy timeout expired\n", - nescq->hw_cq.cq_number); - ret = -ETIME; - } else if (cqp_request.major_code) { - nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy failed\n", - nescq->hw_cq.cq_number); - ret = -EIO; - } else { - ret = 0; - } + wait_event_timeout(cqp_request.waitq, cqp_request.request_done, + NES_EVENT_TIMEOUT); + nes_debug( + NES_DBG_CQ, + "Destroy iWARP CQ%u completed CQP Major:Minor codes = 0x%04X:0x%04X.\n", + nescq->hw_cq.cq_number, cqp_request.major_code, + cqp_request.minor_code); if (nescq->cq_mem_size) pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, nescq->hw_cq.cq_vbase, nescq->hw_cq.cq_pbase); kfree(nescq); - - return ret; } /** diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 5127e2ea4bdd..b2dd4e0a4be2 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -1888,14 +1888,13 @@ mem_err: return status; } -int ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq) +void ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq) { - int status = -ENOMEM; struct ocrdma_destroy_cq *cmd; cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_CQ, sizeof(*cmd)); if (!cmd) - return status; + return; ocrdma_init_mch(&cmd->req, OCRDMA_CMD_DELETE_CQ, OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); @@ -1903,11 +1902,10 @@ int ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq) (cq->id << OCRDMA_DESTROY_CQ_QID_SHIFT) & OCRDMA_DESTROY_CQ_QID_MASK; - status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); ocrdma_unbind_eq(dev, cq->eqn); dma_free_coherent(&dev->nic_info.pdev->dev, cq->len, cq->va, cq->pa); kfree(cmd); - return status; } int ocrdma_mbx_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_hw_mr *hwmr, diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h index 06ec59326a90..12c23a7652b9 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h @@ -122,7 +122,7 @@ int ocrdma_reg_mr(struct ocrdma_dev *, struct ocrdma_hw_mr *hwmr, u32 pd_id, int acc); int ocrdma_mbx_create_cq(struct ocrdma_dev *, 
struct ocrdma_cq *, int entries, int dpp_cq, u16 pd_id); -int ocrdma_mbx_destroy_cq(struct ocrdma_dev *, struct ocrdma_cq *); +void ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq); int ocrdma_mbx_create_qp(struct ocrdma_qp *, struct ib_qp_init_attr *attrs, u8 enable_dpp_cq, u16 dpp_cq_id, u16 *dpp_offset, diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 35ec87015792..94e4f7f9b1f7 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -1070,7 +1070,7 @@ static void ocrdma_flush_cq(struct ocrdma_cq *cq) spin_unlock_irqrestore(&cq->cq_lock, flags); } -int ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +void ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) { struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); struct ocrdma_eq *eq = NULL; @@ -1080,14 +1080,13 @@ int ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) dev->cq_tbl[cq->id] = NULL; indx = ocrdma_get_eq_table_index(dev, cq->eqn); - BUG_ON(indx == -EINVAL); eq = &dev->eq_tbl[indx]; irq = ocrdma_get_irq(dev, eq); synchronize_irq(irq); ocrdma_flush_cq(cq); - (void)ocrdma_mbx_destroy_cq(dev, cq); + ocrdma_mbx_destroy_cq(dev, cq); if (cq->ucontext) { pdid = cq->ucontext->cntxt_pd->id; ocrdma_del_mmap(cq->ucontext, (u64) cq->pa, @@ -1098,7 +1097,6 @@ int ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) } kfree(cq); - return 0; } static int ocrdma_add_qpn_map(struct ocrdma_dev *dev, struct ocrdma_qp *qp) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h index d76aae7ed0d3..89cebe05669e 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -75,7 +75,7 @@ struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_udata *udata); int ocrdma_resize_cq(struct ib_cq *, int cqe, struct ib_udata *); -int ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +void ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); struct ib_qp *ocrdma_create_qp(struct ib_pd *, struct ib_qp_init_attr *attrs, diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 3c0dba072071..be29bbbc4b14 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -955,14 +955,13 @@ int qedr_resize_cq(struct ib_cq *ibcq, int new_cnt, struct ib_udata *udata) #define QEDR_DESTROY_CQ_MAX_ITERATIONS (10) #define QEDR_DESTROY_CQ_ITER_DURATION (10) -int qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) { struct qedr_dev *dev = get_qedr_dev(ibcq->device); struct qed_rdma_destroy_cq_out_params oparams; struct qed_rdma_destroy_cq_in_params iparams; struct qedr_cq *cq = get_qedr_cq(ibcq); int iter; - int rc; DP_DEBUG(dev, QEDR_MSG_CQ, "destroy cq %p (icid=%d)\n", cq, cq->icid); @@ -973,10 +972,7 @@ int qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) goto done; iparams.icid = cq->icid; - rc = dev->ops->rdma_destroy_cq(dev->rdma_ctx, &iparams, &oparams); - if (rc) - return rc; - + dev->ops->rdma_destroy_cq(dev->rdma_ctx, &iparams, &oparams); dev->ops->common->chain_free(dev->cdev, &cq->pbl); if (udata) { @@ -1007,9 +1003,6 @@ int qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) iter--; } - if (oparams.num_cq_notif != cq->cnq_notif) - goto err; - /* Note that we don't need to have 
explicit code to wait for the * completion of the event handler because it is invoked from the EQ. * Since the destroy CQ ramrod has also been received on the EQ we can @@ -1019,15 +1012,6 @@ done: cq->sig = ~cq->sig; kfree(cq); - - return 0; - -err: - DP_ERR(dev, - "CQ %p (icid=%d) not freed, expecting %d ints but got %d ints\n", - cq, cq->icid, oparams.num_cq_notif, cq->cnq_notif); - - return -EINVAL; } static inline int get_gid_info_from_table(struct ib_qp *ibqp, diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h index 9328c80375ef..32d7ce77e339 100644 --- a/drivers/infiniband/hw/qedr/verbs.h +++ b/drivers/infiniband/hw/qedr/verbs.h @@ -54,7 +54,7 @@ struct ib_cq *qedr_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_udata *udata); int qedr_resize_cq(struct ib_cq *, int cqe, struct ib_udata *); -int qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); struct ib_qp *qedr_create_qp(struct ib_pd *, struct ib_qp_init_attr *attrs, struct ib_udata *); diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index e9352750e029..5686d14b86fe 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -604,11 +604,9 @@ struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, return cq; } -int usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) +void usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) { - usnic_dbg("\n"); kfree(cq); - return 0; } struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h index 028f322f8e9b..0b9d993433a7 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h @@ -61,7 +61,7 @@ int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_udata *udata); -int usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); +void usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c index d7deb19a2800..0682781f6555 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c @@ -246,10 +246,8 @@ static void pvrdma_free_cq(struct pvrdma_dev *dev, struct pvrdma_cq *cq) * pvrdma_destroy_cq - destroy completion queue * @cq: the completion queue to destroy. * @udata: user data or null for kernel object - * - * @return: 0 for success. 
*/ -int pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) +void pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) { struct pvrdma_cq *vcq = to_vcq(cq); union pvrdma_cmd_req req; @@ -275,8 +273,6 @@ int pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) pvrdma_free_cq(dev, vcq); atomic_dec(&dev->num_cqs); - - return ret; } static inline struct pvrdma_cqe *get_cqe(struct pvrdma_cq *cq, int i) diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h index 9d7b021e1c59..f0dd6e4d058b 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h @@ -412,7 +412,7 @@ int pvrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_udata *udata); -int pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); +void pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int pvrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int pvrdma_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); int pvrdma_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags, diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index a06e6da7a026..8e76036fad4a 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -300,10 +300,8 @@ done: * @udata: user data or NULL for kernel object * * Called by ib_destroy_cq() in the generic verbs code. - * - * Return: always 0 */ -int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +void rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) { struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); struct rvt_dev_info *rdi = cq->rdi; @@ -317,8 +315,6 @@ int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) else vfree(cq->queue); kfree(cq); - - return 0; } /** diff --git a/drivers/infiniband/sw/rdmavt/cq.h b/drivers/infiniband/sw/rdmavt/cq.h index 3ad6faf18ecb..495d8c3e6580 100644 --- a/drivers/infiniband/sw/rdmavt/cq.h +++ b/drivers/infiniband/sw/rdmavt/cq.h @@ -54,7 +54,7 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_udata *udata); -int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +void rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags); int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 046129393215..b14881decbee 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -819,14 +819,13 @@ err1: return ERR_PTR(err); } -static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +static void rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) { struct rxe_cq *cq = to_rcq(ibcq); rxe_cq_disable(cq); rxe_drop_ref(cq); - return 0; } static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index d1f16a6c4810..bc1d94c9c9ba 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2462,7 +2462,7 @@ struct ib_device_ops { const struct ib_cq_init_attr *attr, struct ib_udata *udata); int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 
cq_period); - int (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata); + void (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata); int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata); struct ib_mr *(*get_dma_mr)(struct ib_pd *pd, int mr_access_flags); struct ib_mr *(*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length, From e39afe3d6dbd908d8fd189571a3c1561088a86c2 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 28 May 2019 14:37:29 +0300 Subject: [PATCH 055/194] RDMA: Convert CQ allocations to be under core responsibility Ensure that CQ is allocated and freed by IB/core and not by drivers. Signed-off-by: Leon Romanovsky Acked-by: Gal Pressman Reviewed-by: Dennis Dalessandro Tested-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/core/cq.c | 28 +++++---- drivers/infiniband/core/device.c | 1 + drivers/infiniband/core/uverbs_cmd.c | 15 +++-- drivers/infiniband/core/uverbs_std_types_cq.c | 19 ++++-- drivers/infiniband/core/verbs.c | 30 ++++++---- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 20 +++---- drivers/infiniband/hw/bnxt_re/ib_verbs.h | 7 +-- drivers/infiniband/hw/bnxt_re/main.c | 1 + drivers/infiniband/hw/cxgb3/iwch_provider.c | 43 ++++++------- drivers/infiniband/hw/cxgb4/cq.c | 27 ++++----- drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 5 +- drivers/infiniband/hw/cxgb4/provider.c | 1 + drivers/infiniband/hw/efa/efa.h | 5 +- drivers/infiniband/hw/efa/efa_main.c | 1 + drivers/infiniband/hw/efa/efa_verbs.c | 48 +++++---------- drivers/infiniband/hw/hns/hns_roce_cq.c | 23 +++---- drivers/infiniband/hw/hns/hns_roce_device.h | 6 +- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 21 ++++--- drivers/infiniband/hw/hns/hns_roce_main.c | 1 + drivers/infiniband/hw/i40iw/i40iw_verbs.c | 33 ++++------ drivers/infiniband/hw/mlx4/cq.c | 25 +++----- drivers/infiniband/hw/mlx4/main.c | 1 + drivers/infiniband/hw/mlx4/mlx4_ib.h | 5 +- drivers/infiniband/hw/mlx5/cq.c | 32 ++++------ drivers/infiniband/hw/mlx5/main.c | 21 ++++--- drivers/infiniband/hw/mlx5/mlx5_ib.h | 5 +- drivers/infiniband/hw/mthca/mthca_provider.c | 36 +++++------ drivers/infiniband/hw/nes/nes_verbs.c | 60 +++++++------------ drivers/infiniband/hw/ocrdma/ocrdma_main.c | 1 + drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 29 ++++----- drivers/infiniband/hw/ocrdma/ocrdma_verbs.h | 5 +- drivers/infiniband/hw/qedr/main.c | 1 + drivers/infiniband/hw/qedr/verbs.c | 28 +++------ drivers/infiniband/hw/qedr/verbs.h | 5 +- drivers/infiniband/hw/usnic/usnic_ib.h | 4 ++ drivers/infiniband/hw/usnic/usnic_ib_main.c | 1 + drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 18 ++---- drivers/infiniband/hw/usnic/usnic_ib_verbs.h | 5 +- drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c | 34 ++++------- .../infiniband/hw/vmw_pvrdma/pvrdma_main.c | 1 + .../infiniband/hw/vmw_pvrdma/pvrdma_verbs.h | 5 +- drivers/infiniband/sw/rdmavt/cq.c | 51 +++++----------- drivers/infiniband/sw/rdmavt/cq.h | 5 +- drivers/infiniband/sw/rdmavt/vt.c | 1 + drivers/infiniband/sw/rxe/rxe_pool.c | 1 + drivers/infiniband/sw/rxe/rxe_verbs.c | 30 ++++------ drivers/infiniband/sw/rxe/rxe_verbs.h | 2 +- include/rdma/ib_verbs.h | 6 +- 48 files changed, 317 insertions(+), 436 deletions(-) diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index 6ee62600a812..3b9412c69565 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -147,23 +147,26 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private, struct ib_cq *cq; int ret = -ENOMEM; - cq = dev->ops.create_cq(dev, &cq_attr, NULL); - if 
(IS_ERR(cq)) - return cq; + cq = rdma_zalloc_drv_obj(dev, ib_cq); + if (!cq) + return ERR_PTR(ret); cq->device = dev; - cq->uobject = NULL; - cq->event_handler = NULL; cq->cq_context = private; cq->poll_ctx = poll_ctx; atomic_set(&cq->usecnt, 0); cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL); if (!cq->wc) - goto out_destroy_cq; + goto out_free_cq; cq->res.type = RDMA_RESTRACK_CQ; rdma_restrack_set_task(&cq->res, caller); + + ret = dev->ops.create_cq(cq, &cq_attr, NULL); + if (ret) + goto out_free_wc; + rdma_restrack_kadd(&cq->res); switch (cq->poll_ctx) { @@ -186,16 +189,18 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private, break; default: ret = -EINVAL; - goto out_free_wc; + goto out_destroy_cq; } return cq; +out_destroy_cq: + rdma_restrack_del(&cq->res); + cq->device->ops.destroy_cq(cq, udata); out_free_wc: kfree(cq->wc); - rdma_restrack_del(&cq->res); -out_destroy_cq: - cq->device->ops.destroy_cq(cq, udata); +out_free_cq: + kfree(cq); return ERR_PTR(ret); } EXPORT_SYMBOL(__ib_alloc_cq_user); @@ -224,8 +229,9 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata) WARN_ON_ONCE(1); } - kfree(cq->wc); rdma_restrack_del(&cq->res); cq->device->ops.destroy_cq(cq, udata); + kfree(cq->wc); + kfree(cq); } EXPORT_SYMBOL(ib_free_cq_user); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 357d74c8df2b..abb169f31d0f 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2431,6 +2431,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, unmap_fmr); SET_OBJ_SIZE(dev_ops, ib_ah); + SET_OBJ_SIZE(dev_ops, ib_cq); SET_OBJ_SIZE(dev_ops, ib_pd); SET_OBJ_SIZE(dev_ops, ib_srq); SET_OBJ_SIZE(dev_ops, ib_ucontext); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 5a3a1780ceea..5c00d9a5698a 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1010,12 +1010,11 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs, attr.comp_vector = cmd->comp_vector; attr.flags = cmd->flags; - cq = ib_dev->ops.create_cq(ib_dev, &attr, &attrs->driver_udata); - if (IS_ERR(cq)) { - ret = PTR_ERR(cq); + cq = rdma_zalloc_drv_obj(ib_dev, ib_cq); + if (!cq) { + ret = -ENOMEM; goto err_file; } - cq->device = ib_dev; cq->uobject = &obj->uobject; cq->comp_handler = ib_uverbs_comp_handler; @@ -1023,6 +1022,10 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs, cq->cq_context = ev_file ? 
&ev_file->ev_queue : NULL; atomic_set(&cq->usecnt, 0); + ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata); + if (ret) + goto err_free; + obj->uobject.object = cq; memset(&resp, 0, sizeof resp); resp.base.cq_handle = obj->uobject.id; @@ -1043,7 +1046,9 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs, err_cb: ib_destroy_cq(cq); - + cq = NULL; +err_free: + kfree(cq); err_file: if (ev_file) ib_uverbs_release_ucq(attrs->ufile, ev_file, obj); diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c index db5c46a1bb2d..06b8c7d017b7 100644 --- a/drivers/infiniband/core/uverbs_std_types_cq.c +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -111,9 +111,9 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( INIT_LIST_HEAD(&obj->comp_list); INIT_LIST_HEAD(&obj->async_list); - cq = ib_dev->ops.create_cq(ib_dev, &attr, &attrs->driver_udata); - if (IS_ERR(cq)) { - ret = PTR_ERR(cq); + cq = rdma_zalloc_drv_obj(ib_dev, ib_cq); + if (!cq) { + ret = -ENOMEM; goto err_event_file; } @@ -122,10 +122,15 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( cq->comp_handler = ib_uverbs_comp_handler; cq->event_handler = ib_uverbs_cq_event_handler; cq->cq_context = ev_file ? &ev_file->ev_queue : NULL; - obj->uobject.object = cq; - obj->uobject.user_handle = user_handle; atomic_set(&cq->usecnt, 0); cq->res.type = RDMA_RESTRACK_CQ; + + ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata); + if (ret) + goto err_free; + + obj->uobject.object = cq; + obj->uobject.user_handle = user_handle; rdma_restrack_uadd(&cq->res); ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &cq->cqe, @@ -136,7 +141,9 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( return 0; err_cq: ib_destroy_cq(cq); - + cq = NULL; +err_free: + kfree(cq); err_event_file: if (ev_file) uverbs_uobject_put(ev_file_uobj); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 933bc35701ad..585e100706aa 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1916,21 +1916,28 @@ struct ib_cq *__ib_create_cq(struct ib_device *device, const char *caller) { struct ib_cq *cq; + int ret; - cq = device->ops.create_cq(device, cq_attr, NULL); + cq = rdma_zalloc_drv_obj(device, ib_cq); + if (!cq) + return ERR_PTR(-ENOMEM); - if (!IS_ERR(cq)) { - cq->device = device; - cq->uobject = NULL; - cq->comp_handler = comp_handler; - cq->event_handler = event_handler; - cq->cq_context = cq_context; - atomic_set(&cq->usecnt, 0); - cq->res.type = RDMA_RESTRACK_CQ; - rdma_restrack_set_task(&cq->res, caller); - rdma_restrack_kadd(&cq->res); + cq->device = device; + cq->uobject = NULL; + cq->comp_handler = comp_handler; + cq->event_handler = event_handler; + cq->cq_context = cq_context; + atomic_set(&cq->usecnt, 0); + cq->res.type = RDMA_RESTRACK_CQ; + rdma_restrack_set_task(&cq->res, caller); + + ret = device->ops.create_cq(cq, cq_attr, NULL); + if (ret) { + kfree(cq); + return ERR_PTR(ret); } + rdma_restrack_kadd(&cq->res); return cq; } EXPORT_SYMBOL(__ib_create_cq); @@ -1950,6 +1957,7 @@ int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata) rdma_restrack_del(&cq->res); cq->device->ops.destroy_cq(cq, udata); + kfree(cq); return 0; } EXPORT_SYMBOL(ib_destroy_cq_user); diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 0127af45dcd1..44cc5f19df3b 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ 
-2534,16 +2534,14 @@ void bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) atomic_dec(&rdev->cq_count); nq->budget--; kfree(cq->cql); - kfree(cq); } -struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { - struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibcq->device, ibdev); struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; - struct bnxt_re_cq *cq = NULL; + struct bnxt_re_cq *cq = container_of(ibcq, struct bnxt_re_cq, ib_cq); int rc, entries; int cqe = attr->cqe; struct bnxt_qplib_nq *nq = NULL; @@ -2552,11 +2550,8 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev, /* Validate CQ fields */ if (cqe < 1 || cqe > dev_attr->max_cq_wqes) { dev_err(rdev_to_dev(rdev), "Failed to create CQ -max exceeded"); - return ERR_PTR(-EINVAL); + return -EINVAL; } - cq = kzalloc(sizeof(*cq), GFP_KERNEL); - if (!cq) - return ERR_PTR(-ENOMEM); cq->rdev = rdev; cq->qplib_cq.cq_handle = (u64)(unsigned long)(&cq->qplib_cq); @@ -2634,15 +2629,14 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev, } } - return &cq->ib_cq; + return 0; c2fail: if (udata) ib_umem_release(cq->umem); fail: kfree(cq->cql); - kfree(cq); - return ERR_PTR(rc); + return rc; } static u8 __req_to_ib_wc_status(u8 qstatus) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index 828403ee0104..31662b1ee35a 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -94,11 +94,11 @@ struct bnxt_re_qp { }; struct bnxt_re_cq { + struct ib_cq ib_cq; struct bnxt_re_dev *rdev; spinlock_t cq_lock; /* protect cq */ u16 cq_count; u16 cq_period; - struct ib_cq ib_cq; struct bnxt_qplib_cq qplib_cq; struct bnxt_qplib_cqe *cql; #define MAX_CQL_PER_POLL 1024 @@ -190,9 +190,8 @@ int bnxt_re_post_send(struct ib_qp *qp, const struct ib_send_wr *send_wr, const struct ib_send_wr **bad_send_wr); int bnxt_re_post_recv(struct ib_qp *qp, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr); -struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); +int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); void bnxt_re_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int bnxt_re_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc); int bnxt_re_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 351c420248a0..029babe713f3 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -641,6 +641,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = { .reg_user_mr = bnxt_re_reg_user_mr, .req_notify_cq = bnxt_re_req_notify_cq, INIT_RDMA_OBJ_SIZE(ib_ah, bnxt_re_ah, ib_ah), + INIT_RDMA_OBJ_SIZE(ib_cq, bnxt_re_cq, ib_cq), INIT_RDMA_OBJ_SIZE(ib_pd, bnxt_re_pd, ib_pd), INIT_RDMA_OBJ_SIZE(ib_srq, bnxt_re_srq, ib_srq), INIT_RDMA_OBJ_SIZE(ib_ucontext, bnxt_re_ucontext, ib_uctx), diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 5bde4ae93681..acba96f289cc 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -100,16 +100,16 @@ static void 
iwch_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) wait_event(chp->wait, !atomic_read(&chp->refcnt)); cxio_destroy_cq(&chp->rhp->rdev, &chp->cq); - kfree(chp); } -static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +static int iwch_create_cq(struct ib_cq *ibcq, + const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; - struct iwch_dev *rhp; - struct iwch_cq *chp; + struct iwch_dev *rhp = to_iwch_dev(ibcq->device); + struct iwch_cq *chp = to_iwch_cq(ibcq); struct iwch_create_cq_resp uresp; struct iwch_create_cq_req ureq; static int warned; @@ -117,19 +117,13 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, pr_debug("%s ib_dev %p entries %d\n", __func__, ibdev, entries); if (attr->flags) - return ERR_PTR(-EINVAL); - - rhp = to_iwch_dev(ibdev); - chp = kzalloc(sizeof(*chp), GFP_KERNEL); - if (!chp) - return ERR_PTR(-ENOMEM); + return -EINVAL; if (udata) { if (!t3a_device(rhp)) { - if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) { - kfree(chp); - return ERR_PTR(-EFAULT); - } + if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) + return -EFAULT; + chp->user_rptr_addr = (u32 __user *)(unsigned long)ureq.user_rptr_addr; } } @@ -150,10 +144,9 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, entries = roundup_pow_of_two(entries); chp->cq.size_log2 = ilog2(entries); - if (cxio_create_cq(&rhp->rdev, &chp->cq, !udata)) { - kfree(chp); - return ERR_PTR(-ENOMEM); - } + if (cxio_create_cq(&rhp->rdev, &chp->cq, !udata)) + return -ENOMEM; + chp->rhp = rhp; chp->ibcq.cqe = 1 << chp->cq.size_log2; spin_lock_init(&chp->lock); @@ -162,8 +155,7 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, init_waitqueue_head(&chp->wait); if (xa_store_irq(&rhp->cqs, chp->cq.cqid, chp, GFP_KERNEL)) { cxio_destroy_cq(&chp->rhp->rdev, &chp->cq); - kfree(chp); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } if (udata) { @@ -174,7 +166,7 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, mm = kmalloc(sizeof(*mm), GFP_KERNEL); if (!mm) { iwch_destroy_cq(&chp->ibcq, udata); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } uresp.cqid = chp->cq.cqid; uresp.size_log2 = chp->cq.size_log2; @@ -200,14 +192,14 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, if (ib_copy_to_udata(udata, &uresp, resplen)) { kfree(mm); iwch_destroy_cq(&chp->ibcq, udata); - return ERR_PTR(-EFAULT); + return -EFAULT; } insert_mmap(ucontext, mm); } pr_debug("created cqid 0x%0x chp %p size 0x%0x, dma_addr %pad\n", chp->cq.cqid, chp, (1 << chp->cq.size_log2), &chp->cq.dma_addr); - return &chp->ibcq; + return 0; } static int iwch_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) @@ -1277,6 +1269,7 @@ static const struct ib_device_ops iwch_dev_ops = { .reg_user_mr = iwch_reg_user_mr, .req_notify_cq = iwch_arm_cq, INIT_RDMA_OBJ_SIZE(ib_pd, iwch_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_cq, iwch_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_ucontext, iwch_ucontext, ibucontext), }; diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index f49e6d271c42..3cc4d3331a3f 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -986,17 +986,16 @@ void c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) ucontext ? 
&ucontext->uctx : &chp->cq.rdev->uctx, chp->destroy_skb, chp->wr_waitp); c4iw_put_wr_wait(chp->wr_waitp); - kfree(chp); } -struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +int c4iw_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; int vector = attr->comp_vector; - struct c4iw_dev *rhp; - struct c4iw_cq *chp; + struct c4iw_dev *rhp = to_c4iw_dev(ibcq->device); + struct c4iw_cq *chp = to_c4iw_cq(ibcq); struct c4iw_create_cq ucmd; struct c4iw_create_cq_resp uresp; int ret, wr_len; @@ -1007,22 +1006,16 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, pr_debug("ib_dev %p entries %d\n", ibdev, entries); if (attr->flags) - return ERR_PTR(-EINVAL); - - rhp = to_c4iw_dev(ibdev); + return -EINVAL; if (vector >= rhp->rdev.lldi.nciq) - return ERR_PTR(-EINVAL); + return -EINVAL; if (udata) { if (udata->inlen < sizeof(ucmd)) ucontext->is_32b_cqe = 1; } - chp = kzalloc(sizeof(*chp), GFP_KERNEL); - if (!chp) - return ERR_PTR(-ENOMEM); - chp->wr_waitp = c4iw_alloc_wr_wait(GFP_KERNEL); if (!chp->wr_waitp) { ret = -ENOMEM; @@ -1132,10 +1125,11 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, mm2->len = PAGE_SIZE; insert_mmap(ucontext, mm2); } + pr_debug("cqid 0x%0x chp %p size %u memsize %zu, dma_addr %pad\n", chp->cq.cqid, chp, chp->cq.size, chp->cq.memsize, &chp->cq.dma_addr); - return &chp->ibcq; + return 0; err_free_mm2: kfree(mm2); err_free_mm: @@ -1151,8 +1145,7 @@ err_free_skb: err_free_wr_wait: c4iw_put_wr_wait(chp->wr_waitp); err_free_chp: - kfree(chp); - return ERR_PTR(ret); + return ret; } int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 45e720288f0f..7d06b0f8d49a 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -993,9 +993,8 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc); int c4iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata); void c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata); -struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); +int c4iw_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); int c4iw_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *attr, enum ib_srq_attr_mask srq_attr_mask, diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 2b1f2443b7da..5e59c5708729 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -537,6 +537,7 @@ static const struct ib_device_ops c4iw_dev_ops = { .reg_user_mr = c4iw_reg_user_mr, .req_notify_cq = c4iw_arm_cq, INIT_RDMA_OBJ_SIZE(ib_pd, c4iw_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_cq, c4iw_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_srq, c4iw_srq, ibsrq), INIT_RDMA_OBJ_SIZE(ib_ucontext, c4iw_ucontext, ibucontext), }; diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h index 52d894f0ad3e..119f8efec564 100644 --- a/drivers/infiniband/hw/efa/efa.h +++ b/drivers/infiniband/hw/efa/efa.h @@ -135,9 +135,8 @@ struct ib_qp *efa_create_qp(struct ib_pd *ibpd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata); void 
efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); -struct ib_cq *efa_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); +int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c index b891ee239a67..46861461dd2d 100644 --- a/drivers/infiniband/hw/efa/efa_main.c +++ b/drivers/infiniband/hw/efa/efa_main.c @@ -224,6 +224,7 @@ static const struct ib_device_ops efa_dev_ops = { .reg_user_mr = efa_reg_mr, INIT_RDMA_OBJ_SIZE(ib_ah, efa_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, efa_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, efa_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_ucontext, efa_ucontext, ibucontext), }; diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 42865cf4f149..a9372c9e4b30 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -859,8 +859,6 @@ void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) efa_destroy_cq_idx(dev, cq->cq_idx); dma_unmap_single(&dev->pdev->dev, cq->dma_addr, cq->size, DMA_FROM_DEVICE); - - kfree(cq); } static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq, @@ -876,17 +874,20 @@ static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq, return 0; } -static struct ib_cq *do_create_cq(struct ib_device *ibdev, int entries, - int vector, struct ib_ucontext *ibucontext, - struct ib_udata *udata) +int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct efa_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct efa_ucontext, ibucontext); struct efa_ibv_create_cq_resp resp = {}; struct efa_com_create_cq_params params; struct efa_com_create_cq_result result; + struct ib_device *ibdev = ibcq->device; struct efa_dev *dev = to_edev(ibdev); struct efa_ibv_create_cq cmd = {}; + struct efa_cq *cq = to_ecq(ibcq); bool cq_entry_inserted = false; - struct efa_cq *cq; + int entries = attr->cqe; int err; ibdev_dbg(ibdev, "create_cq entries %d\n", entries); @@ -944,19 +945,13 @@ static struct ib_cq *do_create_cq(struct ib_device *ibdev, int entries, goto err_out; } - cq = kzalloc(sizeof(*cq), GFP_KERNEL); - if (!cq) { - err = -ENOMEM; - goto err_out; - } - - cq->ucontext = to_eucontext(ibucontext); + cq->ucontext = ucontext; cq->size = PAGE_ALIGN(cmd.cq_entry_size * entries * cmd.num_sub_cqs); cq->cpu_addr = efa_zalloc_mapped(dev, &cq->dma_addr, cq->size, DMA_FROM_DEVICE); if (!cq->cpu_addr) { err = -ENOMEM; - goto err_free_cq; + goto err_out; } params.uarn = cq->ucontext->uarn; @@ -975,8 +970,8 @@ static struct ib_cq *do_create_cq(struct ib_device *ibdev, int entries, err = cq_mmap_entries_setup(dev, cq, &resp); if (err) { - ibdev_dbg(ibdev, - "Could not setup cq[%u] mmap entries\n", cq->cq_idx); + ibdev_dbg(ibdev, "Could not setup cq[%u] mmap entries\n", + cq->cq_idx); goto err_destroy_cq; } @@ -992,11 +987,10 @@ static struct ib_cq *do_create_cq(struct ib_device *ibdev, int entries, } } - ibdev_dbg(ibdev, - "Created cq[%d], cq depth[%u]. dma[%pad] virt[0x%p]\n", + ibdev_dbg(ibdev, "Created cq[%d], cq depth[%u]. 
dma[%pad] virt[0x%p]\n", cq->cq_idx, result.actual_depth, &cq->dma_addr, cq->cpu_addr); - return &cq->ibcq; + return 0; err_destroy_cq: efa_destroy_cq_idx(dev, cq->cq_idx); @@ -1005,23 +999,9 @@ err_free_mapped: DMA_FROM_DEVICE); if (!cq_entry_inserted) free_pages_exact(cq->cpu_addr, cq->size); -err_free_cq: - kfree(cq); err_out: atomic64_inc(&dev->stats.sw_stats.create_cq_err); - return ERR_PTR(err); -} - -struct ib_cq *efa_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) -{ - struct efa_ucontext *ucontext = rdma_udata_to_drv_context(udata, - struct efa_ucontext, - ibucontext); - - return do_create_cq(ibdev, attr->cqe, attr->comp_vector, - &ucontext->ibucontext, udata); + return err; } static int umem_to_page_list(struct efa_dev *dev, diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 0eb7c16c007b..7e198c9ffbfe 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -299,15 +299,15 @@ static void hns_roce_ib_free_cq_buf(struct hns_roce_dev *hr_dev, &buf->hr_buf); } -struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +int hns_roce_ib_create_cq(struct ib_cq *ib_cq, + const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { - struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev); + struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device); struct device *dev = hr_dev->dev; struct hns_roce_ib_create_cq ucmd; struct hns_roce_ib_create_cq_resp resp = {}; - struct hns_roce_cq *hr_cq = NULL; + struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq); struct hns_roce_uar *uar = NULL; int vector = attr->comp_vector; int cq_entries = attr->cqe; @@ -318,13 +318,9 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, if (cq_entries < 1 || cq_entries > hr_dev->caps.max_cqes) { dev_err(dev, "Creat CQ failed. 
entries=%d, max=%d\n", cq_entries, hr_dev->caps.max_cqes); - return ERR_PTR(-EINVAL); + return -EINVAL; } - hr_cq = kzalloc(sizeof(*hr_cq), GFP_KERNEL); - if (!hr_cq) - return ERR_PTR(-ENOMEM); - if (hr_dev->caps.min_cqes) cq_entries = max(cq_entries, hr_dev->caps.min_cqes); @@ -415,7 +411,7 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, goto err_cqc; } - return &hr_cq->ib_cq; + return 0; err_cqc: hns_roce_free_cq(hr_dev, hr_cq); @@ -438,8 +434,7 @@ err_db: hns_roce_free_db(hr_dev, &hr_cq->db); err_cq: - kfree(hr_cq); - return ERR_PTR(ret); + return ret; } EXPORT_SYMBOL_GPL(hns_roce_ib_create_cq); @@ -471,8 +466,6 @@ void hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) hns_roce_free_db(hr_dev, &hr_cq->db); } - - kfree(hr_cq); } EXPORT_SYMBOL_GPL(hns_roce_ib_destroy_cq); diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 2f7d9644fd24..303ea7c614a8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -1205,9 +1205,9 @@ void hns_roce_release_range_qp(struct hns_roce_dev *hr_dev, int base_qpn, __be32 send_ieth(const struct ib_send_wr *wr); int to_hr_qp_type(int qp_type); -struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); +int hns_roce_ib_create_cq(struct ib_cq *ib_cq, + const struct ib_cq_init_attr *attr, + struct ib_udata *udata); void hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata); void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index aa7b67d283af..c899879da222 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -717,7 +717,7 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev) union ib_gid dgid; u64 subnet_prefix; int attr_mask = 0; - int ret = -ENOMEM; + int ret; int i, j; u8 queue_en[HNS_ROCE_V1_RESV_QP] = { 0 }; u8 phy_port; @@ -730,10 +730,16 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev) /* Reserved cq for loop qp */ cq_init_attr.cqe = HNS_ROCE_MIN_WQE_NUM * 2; cq_init_attr.comp_vector = 0; - cq = hns_roce_ib_create_cq(&hr_dev->ib_dev, &cq_init_attr, NULL); - if (IS_ERR(cq)) { - dev_err(dev, "Create cq for reserved loop qp failed!"); + + ibdev = &hr_dev->ib_dev; + cq = rdma_zalloc_drv_obj(ibdev, ib_cq); + if (!cq) return -ENOMEM; + + ret = hns_roce_ib_create_cq(cq, &cq_init_attr, NULL); + if (ret) { + dev_err(dev, "Create cq for reserved loop qp failed!"); + goto alloc_cq_failed; } free_mr->mr_free_cq = to_hr_cq(cq); free_mr->mr_free_cq->ib_cq.device = &hr_dev->ib_dev; @@ -743,7 +749,6 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev) free_mr->mr_free_cq->ib_cq.cq_context = NULL; atomic_set(&free_mr->mr_free_cq->ib_cq.usecnt, 0); - ibdev = &hr_dev->ib_dev; pd = rdma_zalloc_drv_obj(ibdev, ib_pd); if (!pd) goto alloc_mem_failed; @@ -866,7 +871,8 @@ alloc_pd_failed: alloc_mem_failed: hns_roce_ib_destroy_cq(cq, NULL); - +alloc_cq_failed: + kfree(cq); return ret; } @@ -894,6 +900,7 @@ static void hns_roce_v1_release_lp_qp(struct hns_roce_dev *hr_dev) } hns_roce_ib_destroy_cq(&free_mr->mr_free_cq->ib_cq, NULL); + kfree(&free_mr->mr_free_cq->ib_cq); hns_roce_dealloc_pd(&free_mr->mr_free_pd->ibpd, NULL); } @@ -3694,8 +3701,6 @@ static void 
hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) cq_buf_size = (ibcq->cqe + 1) * hr_dev->caps.cq_entry_sz; hns_roce_buf_free(hr_dev, cq_buf_size, &hr_cq->hr_buf.hr_buf); } - - kfree(hr_cq); } static void set_eq_cons_index_v1(struct hns_roce_eq *eq, int req_not) diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index f07b2ec86ec2..3e45b119b0eb 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -446,6 +446,7 @@ static const struct ib_device_ops hns_roce_dev_ops = { .reg_user_mr = hns_roce_reg_user_mr, INIT_RDMA_OBJ_SIZE(ib_ah, hns_roce_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, hns_roce_cq, ib_cq), INIT_RDMA_OBJ_SIZE(ib_pd, hns_roce_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_ucontext, hns_roce_ucontext, ibucontext), }; diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 205053cb5f97..3100b0c31b0a 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -1075,27 +1075,27 @@ static void i40iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) cq = &iwcq->sc_cq; i40iw_cq_wq_destroy(iwdev, cq); cq_free_resources(iwdev, iwcq); - kfree(iwcq); i40iw_rem_devusecount(iwdev); } /** * i40iw_create_cq - create cq - * @ibdev: device pointer from stack + * @ibcq: CQ allocated * @attr: attributes for cq * @udata: user data */ -static struct ib_cq *i40iw_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +static int i40iw_create_cq(struct ib_cq *ibcq, + const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; struct i40iw_device *iwdev = to_iwdev(ibdev); - struct i40iw_cq *iwcq; + struct i40iw_cq *iwcq = to_iwcq(ibcq); struct i40iw_pbl *iwpbl; u32 cq_num = 0; struct i40iw_sc_cq *cq; struct i40iw_sc_dev *dev = &iwdev->sc_dev; - struct i40iw_cq_init_info info; + struct i40iw_cq_init_info info = {}; enum i40iw_status_code status; struct i40iw_cqp_request *cqp_request; struct cqp_commands_info *cqp_info; @@ -1105,22 +1105,16 @@ static struct ib_cq *i40iw_create_cq(struct ib_device *ibdev, int entries = attr->cqe; if (iwdev->closing) - return ERR_PTR(-ENODEV); + return -ENODEV; if (entries > iwdev->max_cqe) - return ERR_PTR(-EINVAL); - - iwcq = kzalloc(sizeof(*iwcq), GFP_KERNEL); - if (!iwcq) - return ERR_PTR(-ENOMEM); - - memset(&info, 0, sizeof(info)); + return -EINVAL; err_code = i40iw_alloc_resource(iwdev, iwdev->allocated_cqs, iwdev->max_cq, &cq_num, &iwdev->next_cq); if (err_code) - goto error; + return err_code; cq = &iwcq->sc_cq; cq->back_cq = (void *)iwcq; @@ -1227,15 +1221,13 @@ static struct ib_cq *i40iw_create_cq(struct ib_device *ibdev, } i40iw_add_devusecount(iwdev); - return (struct ib_cq *)iwcq; + return 0; cq_destroy: i40iw_cq_wq_destroy(iwdev, cq); cq_free_resources: cq_free_resources(iwdev, iwcq); -error: - kfree(iwcq); - return ERR_PTR(err_code); + return err_code; } /** @@ -2693,6 +2685,7 @@ static const struct ib_device_ops i40iw_dev_ops = { .reg_user_mr = i40iw_reg_user_mr, .req_notify_cq = i40iw_req_notify_cq, INIT_RDMA_OBJ_SIZE(ib_pd, i40iw_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_cq, i40iw_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_ucontext, i40iw_ucontext, ibucontext), }; diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 8eb7490dabb8..72f238ddafb5 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -172,14 +172,14 @@ err_buf: } 
#define CQ_CREATE_FLAGS_SUPPORTED IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION -struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; int vector = attr->comp_vector; struct mlx4_ib_dev *dev = to_mdev(ibdev); - struct mlx4_ib_cq *cq; + struct mlx4_ib_cq *cq = to_mcq(ibcq); struct mlx4_uar *uar; void *buf_addr; int err; @@ -187,14 +187,10 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, udata, struct mlx4_ib_ucontext, ibucontext); if (entries < 1 || entries > dev->dev->caps.max_cqes) - return ERR_PTR(-EINVAL); + return -EINVAL; if (attr->flags & ~CQ_CREATE_FLAGS_SUPPORTED) - return ERR_PTR(-EINVAL); - - cq = kzalloc(sizeof(*cq), GFP_KERNEL); - if (!cq) - return ERR_PTR(-ENOMEM); + return -EINVAL; entries = roundup_pow_of_two(entries + 1); cq->ibcq.cqe = entries - 1; @@ -269,7 +265,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, goto err_cq_free; } - return &cq->ibcq; + return 0; err_cq_free: mlx4_cq_free(dev->dev, &cq->mcq); @@ -289,11 +285,8 @@ err_mtt: err_db: if (!udata) mlx4_db_free(dev->dev, &cq->db); - err_cq: - kfree(cq); - - return ERR_PTR(err); + return err; } static int mlx4_alloc_resize_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq, @@ -506,8 +499,6 @@ void mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe); mlx4_db_free(dev->dev, &mcq->db); } - - kfree(mcq); } static void dump_cqe(void *cqe) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 5d7a87842291..8790101facb7 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2565,6 +2565,7 @@ static const struct ib_device_ops mlx4_ib_dev_ops = { .resize_cq = mlx4_ib_resize_cq, INIT_RDMA_OBJ_SIZE(ib_ah, mlx4_ib_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, mlx4_ib_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, mlx4_ib_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_srq, mlx4_ib_srq, ibsrq), INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx4_ib_ucontext, ibucontext), diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index af5ee45a9f19..81b3d85e5167 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -743,9 +743,8 @@ int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); -struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); +int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); void mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index ebd01bd7f8f6..07b73df0e1a3 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -884,14 +884,14 @@ static void notify_soft_wc_handler(struct work_struct *work) cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); } -struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, - const struct 
ib_cq_init_attr *attr, - struct ib_udata *udata) +int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; int vector = attr->comp_vector; struct mlx5_ib_dev *dev = to_mdev(ibdev); - struct mlx5_ib_cq *cq; + struct mlx5_ib_cq *cq = to_mcq(ibcq); int uninitialized_var(index); int uninitialized_var(inlen); u32 *cqb = NULL; @@ -903,18 +903,14 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, if (entries < 0 || (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)))) - return ERR_PTR(-EINVAL); + return -EINVAL; if (check_cq_create_flags(attr->flags)) - return ERR_PTR(-EOPNOTSUPP); + return -EOPNOTSUPP; entries = roundup_pow_of_two(entries + 1); if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) - return ERR_PTR(-EINVAL); - - cq = kzalloc(sizeof(*cq), GFP_KERNEL); - if (!cq) - return ERR_PTR(-ENOMEM); + return -EINVAL; cq->ibcq.cqe = entries - 1; mutex_init(&cq->resize_mutex); @@ -929,13 +925,13 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, err = create_cq_user(dev, udata, cq, entries, &cqb, &cqe_size, &index, &inlen); if (err) - goto err_create; + return err; } else { cqe_size = cache_line_size() == 128 ? 128 : 64; err = create_cq_kernel(dev, cq, entries, cqe_size, &cqb, &index, &inlen); if (err) - goto err_create; + return err; INIT_WORK(&cq->notify_work, notify_soft_wc_handler); } @@ -980,7 +976,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, kvfree(cqb); - return &cq->ibcq; + return 0; err_cmd: mlx5_core_destroy_cq(dev->mdev, &cq->mcq); @@ -991,11 +987,7 @@ err_cqb: destroy_cq_user(cq, udata); else destroy_cq_kernel(dev, cq); - -err_create: - kfree(cq); - - return ERR_PTR(err); + return err; } void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) @@ -1008,8 +1000,6 @@ void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) destroy_cq_user(mcq, udata); else destroy_cq_kernel(dev, mcq); - - kfree(mcq); } static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 1e3d936ed809..99eb4a8b0b0d 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4891,18 +4891,19 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) if (ret) goto error0; - devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL); - if (IS_ERR(devr->c0)) { - ret = PTR_ERR(devr->c0); + devr->c0 = rdma_zalloc_drv_obj(ibdev, ib_cq); + if (!devr->c0) { + ret = -ENOMEM; goto error1; } - devr->c0->device = &dev->ib_dev; - devr->c0->uobject = NULL; - devr->c0->comp_handler = NULL; - devr->c0->event_handler = NULL; - devr->c0->cq_context = NULL; + + devr->c0->device = &dev->ib_dev; atomic_set(&devr->c0->usecnt, 0); + ret = mlx5_ib_create_cq(devr->c0, &cq_attr, NULL); + if (ret) + goto err_create_cq; + devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL); if (IS_ERR(devr->x0)) { ret = PTR_ERR(devr->x0); @@ -4994,6 +4995,8 @@ error3: mlx5_ib_dealloc_xrcd(devr->x0, NULL); error2: mlx5_ib_destroy_cq(devr->c0, NULL); +err_create_cq: + kfree(devr->c0); error1: mlx5_ib_dealloc_pd(devr->p0, NULL); error0: @@ -5012,6 +5015,7 @@ static void destroy_dev_resources(struct mlx5_ib_resources *devr) mlx5_ib_dealloc_xrcd(devr->x0, NULL); mlx5_ib_dealloc_xrcd(devr->x1, NULL); mlx5_ib_destroy_cq(devr->c0, NULL); + kfree(devr->c0); mlx5_ib_dealloc_pd(devr->p0, NULL); kfree(devr->p0); @@ -6182,6 +6186,7 @@ static const struct ib_device_ops 
mlx5_ib_dev_ops = { .resize_cq = mlx5_ib_resize_cq, INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, mlx5_ib_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_srq, mlx5_ib_srq, ibsrq), INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx5_ib_ucontext, ibucontext), diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index bf697c5b3e79..f2ad0372d38d 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1115,9 +1115,8 @@ int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, int buflen, size_t *bc); int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer, int buflen, size_t *bc); -struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); +int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 81fc04e1c142..efd4e3d13ae2 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -601,10 +601,11 @@ static int mthca_destroy_qp(struct ib_qp *qp, struct ib_udata *udata) return 0; } -static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +static int mthca_create_cq(struct ib_cq *ibcq, + const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; struct mthca_create_cq ucmd; struct mthca_cq *cq; @@ -614,20 +615,20 @@ static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, udata, struct mthca_ucontext, ibucontext); if (attr->flags) - return ERR_PTR(-EINVAL); + return -EINVAL; if (entries < 1 || entries > to_mdev(ibdev)->limits.max_cqes) - return ERR_PTR(-EINVAL); + return -EINVAL; if (udata) { - if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) - return ERR_PTR(-EFAULT); + if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) + return -EFAULT; err = mthca_map_user_db(to_mdev(ibdev), &context->uar, context->db_tab, ucmd.set_db_index, ucmd.set_db_page); if (err) - return ERR_PTR(err); + return err; err = mthca_map_user_db(to_mdev(ibdev), &context->uar, context->db_tab, ucmd.arm_db_index, @@ -636,11 +637,7 @@ static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, goto err_unmap_set; } - cq = kzalloc(sizeof(*cq), GFP_KERNEL); - if (!cq) { - err = -ENOMEM; - goto err_unmap_arm; - } + cq = to_mcq(ibcq); if (udata) { cq->buf.mr.ibmr.lkey = ucmd.lkey; @@ -655,20 +652,17 @@ static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, udata ? 
ucmd.pdn : to_mdev(ibdev)->driver_pd.pd_num, cq); if (err) - goto err_free; + goto err_unmap_arm; if (udata && ib_copy_to_udata(udata, &cq->cqn, sizeof(__u32))) { mthca_free_cq(to_mdev(ibdev), cq); err = -EFAULT; - goto err_free; + goto err_unmap_arm; } cq->resize_buf = NULL; - return &cq->ibcq; - -err_free: - kfree(cq); + return 0; err_unmap_arm: if (udata) @@ -680,7 +674,7 @@ err_unmap_set: mthca_unmap_user_db(to_mdev(ibdev), &context->uar, context->db_tab, ucmd.set_db_index); - return ERR_PTR(err); + return err; } static int mthca_alloc_resize_buf(struct mthca_dev *dev, struct mthca_cq *cq, @@ -823,7 +817,6 @@ static void mthca_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) to_mcq(cq)->set_ci_db_index); } mthca_free_cq(to_mdev(cq->device), to_mcq(cq)); - kfree(cq); } static inline u32 convert_access(int acc) @@ -1187,6 +1180,7 @@ static const struct ib_device_ops mthca_dev_ops = { .resize_cq = mthca_resize_cq, INIT_RDMA_OBJ_SIZE(ib_ah, mthca_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, mthca_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, mthca_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_ucontext, mthca_ucontext, ibucontext), }; diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index cac3fa624c4d..0420203820f6 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -1374,16 +1374,17 @@ static int nes_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) /** * nes_create_cq */ -static struct ib_cq *nes_create_cq(struct ib_device *ibdev, +static int nes_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; u64 u64temp; struct nes_vnic *nesvnic = to_nesvnic(ibdev); struct nes_device *nesdev = nesvnic->nesdev; struct nes_adapter *nesadapter = nesdev->nesadapter; - struct nes_cq *nescq; + struct nes_cq *nescq = to_nescq(ibcq); struct nes_ucontext *nes_ucontext = NULL; struct nes_cqp_request *cqp_request; void *mem = NULL; @@ -1399,22 +1400,15 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, int ret; if (attr->flags) - return ERR_PTR(-EINVAL); + return -EINVAL; if (entries > nesadapter->max_cqe) - return ERR_PTR(-EINVAL); + return -EINVAL; err = nes_alloc_resource(nesadapter, nesadapter->allocated_cqs, nesadapter->max_cq, &cq_num, &nesadapter->next_cq, NES_RESOURCE_CQ); - if (err) { - return ERR_PTR(err); - } - - nescq = kzalloc(sizeof(struct nes_cq), GFP_KERNEL); - if (!nescq) { - nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - return ERR_PTR(-ENOMEM); - } + if (err) + return err; nescq->hw_cq.cq_size = max(entries + 1, 5); nescq->hw_cq.cq_number = cq_num; @@ -1424,10 +1418,10 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, struct nes_ucontext *nes_ucontext = rdma_udata_to_drv_context( udata, struct nes_ucontext, ibucontext); - if (ib_copy_from_udata(&req, udata, sizeof (struct nes_create_cq_req))) { + if (ib_copy_from_udata(&req, udata, + sizeof(struct nes_create_cq_req))) { nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - kfree(nescq); - return ERR_PTR(-EFAULT); + return -EFAULT; } nesvnic->mcrq_ucontext = nes_ucontext; nes_ucontext->mcrqf = req.mcrqf; @@ -1441,8 +1435,6 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, nescq->mcrqf = nes_ucontext->mcrqf; nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); } - nes_debug(NES_DBG_CQ, "CQ Virtual Address = %08lX, size = %u.\n", - (unsigned long)req.user_cq_buffer, entries); err = 1; 
list_for_each_entry(nespbl, &nes_ucontext->cq_reg_mem_list, list) { if (nespbl->user_base == (unsigned long )req.user_cq_buffer) { @@ -1455,8 +1447,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, } if (err) { nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - kfree(nescq); - return ERR_PTR(-EFAULT); + return -EFAULT; } pbl_entries = nespbl->pbl_size >> 3; @@ -1472,15 +1463,11 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, if (!mem) { printk(KERN_ERR PFX "Unable to allocate pci memory for cq\n"); nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - kfree(nescq); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } nescq->hw_cq.cq_vbase = mem; nescq->hw_cq.cq_head = 0; - nes_debug(NES_DBG_CQ, "CQ%u virtual address @ %p, phys = 0x%08X\n", - nescq->hw_cq.cq_number, nescq->hw_cq.cq_vbase, - (u32)nescq->hw_cq.cq_pbase); } nescq->hw_cq.ce_handler = nes_iwarp_ce_handler; @@ -1500,8 +1487,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, } nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - kfree(nescq); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } cqp_request->waiting = 1; cqp_wqe = &cqp_request->cqp_wqe; @@ -1528,8 +1514,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, kfree(nespbl); } nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - kfree(nescq); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } else { opcode |= (NES_CQP_CQ_VIRT | NES_CQP_CQ_4KB_CHUNK); nescq->virtual_cq = 2; @@ -1550,8 +1535,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, kfree(nespbl); } nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - kfree(nescq); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } else { opcode |= NES_CQP_CQ_VIRT; nescq->virtual_cq = 1; @@ -1607,8 +1591,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, kfree(nespbl); } nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - kfree(nescq); - return ERR_PTR(-EIO); + return -EIO; } nes_put_cqp_request(nesdev, cqp_request); @@ -1620,17 +1603,16 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, resp.cq_id = nescq->hw_cq.cq_number; resp.cq_size = nescq->hw_cq.cq_size; resp.mmap_db_index = 0; - if (ib_copy_to_udata(udata, &resp, sizeof resp - sizeof resp.reserved)) { + if (ib_copy_to_udata(udata, &resp, + sizeof(resp) - sizeof(resp.reserved))) { nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - kfree(nescq); - return ERR_PTR(-EFAULT); + return -EFAULT; } } - return &nescq->ibcq; + return 0; } - /** * nes_destroy_cq */ @@ -1700,7 +1682,6 @@ static void nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) if (nescq->cq_mem_size) pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, nescq->hw_cq.cq_vbase, nescq->hw_cq.cq_pbase); - kfree(nescq); } /** @@ -3584,6 +3565,7 @@ static const struct ib_device_ops nes_dev_ops = { .reg_user_mr = nes_reg_user_mr, .req_notify_cq = nes_req_notify_cq, INIT_RDMA_OBJ_SIZE(ib_pd, nes_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_cq, nes_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_ucontext, nes_ucontext, ibucontext), }; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index b326313d413f..c15cfc6cef81 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -182,6 +182,7 @@ static const struct ib_device_ops ocrdma_dev_ops = { .resize_cq = ocrdma_resize_cq, INIT_RDMA_OBJ_SIZE(ib_ah, ocrdma_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, ocrdma_cq, 
ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, ocrdma_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_ucontext, ocrdma_ucontext, ibucontext), }; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 94e4f7f9b1f7..10b35edb286b 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -977,12 +977,12 @@ err: return status; } -struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +int ocrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; - struct ocrdma_cq *cq; + struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); struct ocrdma_dev *dev = get_ocrdma_dev(ibdev); struct ocrdma_ucontext *uctx = rdma_udata_to_drv_context( udata, struct ocrdma_ucontext, ibucontext); @@ -991,16 +991,13 @@ struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, struct ocrdma_create_cq_ureq ureq; if (attr->flags) - return ERR_PTR(-EINVAL); + return -EINVAL; if (udata) { if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) - return ERR_PTR(-EFAULT); + return -EFAULT; } else ureq.dpp_cq = 0; - cq = kzalloc(sizeof(*cq), GFP_KERNEL); - if (!cq) - return ERR_PTR(-ENOMEM); spin_lock_init(&cq->cq_lock); spin_lock_init(&cq->comp_handler_lock); @@ -1011,10 +1008,9 @@ struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, pd_id = uctx->cntxt_pd->id; status = ocrdma_mbx_create_cq(dev, cq, entries, ureq.dpp_cq, pd_id); - if (status) { - kfree(cq); - return ERR_PTR(status); - } + if (status) + return status; + if (udata) { status = ocrdma_copy_cq_uresp(dev, cq, udata); if (status) @@ -1022,12 +1018,11 @@ struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, } cq->phase = OCRDMA_CQE_VALID; dev->cq_tbl[cq->id] = cq; - return &cq->ibcq; + return 0; ctx_err: ocrdma_mbx_destroy_cq(dev, cq); - kfree(cq); - return ERR_PTR(status); + return status; } int ocrdma_resize_cq(struct ib_cq *ibcq, int new_cnt, @@ -1095,8 +1090,6 @@ void ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) ocrdma_get_db_addr(dev, pdid), dev->nic_info.db_page_size); } - - kfree(cq); } static int ocrdma_add_qpn_map(struct ocrdma_dev *dev, struct ocrdma_qp *qp) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h index 89cebe05669e..32488da1b752 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -71,9 +71,8 @@ int ocrdma_mmap(struct ib_ucontext *, struct vm_area_struct *vma); int ocrdma_alloc_pd(struct ib_pd *pd, struct ib_udata *udata); void ocrdma_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata); -struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); +int ocrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); int ocrdma_resize_cq(struct ib_cq *, int cqe, struct ib_udata *); void ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index a0bb07ba0f3c..a0a7ba0a5af4 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -224,6 +224,7 @@ static const struct ib_device_ops qedr_dev_ops = { .resize_cq = qedr_resize_cq, INIT_RDMA_OBJ_SIZE(ib_ah, qedr_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, qedr_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, qedr_pd, ibpd), 
INIT_RDMA_OBJ_SIZE(ib_srq, qedr_srq, ibsrq), INIT_RDMA_OBJ_SIZE(ib_ucontext, qedr_ucontext, ibucontext), diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index be29bbbc4b14..3fc7a4e901c3 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -806,20 +806,20 @@ int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) return 0; } -struct ib_cq *qedr_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; struct qedr_ucontext *ctx = rdma_udata_to_drv_context( udata, struct qedr_ucontext, ibucontext); struct qed_rdma_destroy_cq_out_params destroy_oparams; struct qed_rdma_destroy_cq_in_params destroy_iparams; struct qedr_dev *dev = get_qedr_dev(ibdev); struct qed_rdma_create_cq_in_params params; - struct qedr_create_cq_ureq ureq; + struct qedr_create_cq_ureq ureq = {}; int vector = attr->comp_vector; int entries = attr->cqe; - struct qedr_cq *cq; + struct qedr_cq *cq = get_qedr_cq(ibcq); int chain_entries; int page_cnt; u64 pbl_ptr; @@ -834,18 +834,13 @@ struct ib_cq *qedr_create_cq(struct ib_device *ibdev, DP_ERR(dev, "create cq: the number of entries %d is too high. Must be equal or below %d.\n", entries, QEDR_MAX_CQES); - return ERR_PTR(-EINVAL); + return -EINVAL; } chain_entries = qedr_align_cq_entries(entries); chain_entries = min_t(int, chain_entries, QEDR_MAX_CQES); - cq = kzalloc(sizeof(*cq), GFP_KERNEL); - if (!cq) - return ERR_PTR(-ENOMEM); - if (udata) { - memset(&ureq, 0, sizeof(ureq)); if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) { DP_ERR(dev, "create cq: problem copying data from user space\n"); @@ -923,7 +918,7 @@ struct ib_cq *qedr_create_cq(struct ib_device *ibdev, "create cq: icid=0x%0x, addr=%p, size(entries)=0x%0x\n", cq->icid, cq, params.cq_size); - return &cq->ibcq; + return 0; err3: destroy_iparams.icid = cq->icid; @@ -938,8 +933,7 @@ err1: if (udata) ib_umem_release(cq->q.umem); err0: - kfree(cq); - return ERR_PTR(-EINVAL); + return -EINVAL; } int qedr_resize_cq(struct ib_cq *ibcq, int new_cnt, struct ib_udata *udata) @@ -969,7 +963,7 @@ void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) /* GSIs CQs are handled by driver, so they don't exist in the FW */ if (cq->cq_type == QEDR_CQ_TYPE_GSI) - goto done; + return; iparams.icid = cq->icid; dev->ops->rdma_destroy_cq(dev->rdma_ctx, &iparams, &oparams); @@ -1008,10 +1002,6 @@ void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) * Since the destroy CQ ramrod has also been received on the EQ we can * be certain that there's no event handler in process. 
*/ -done: - cq->sig = ~cq->sig; - - kfree(cq); } static inline int get_gid_info_from_table(struct ib_qp *ibqp, diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h index 32d7ce77e339..9aaa90283d6e 100644 --- a/drivers/infiniband/hw/qedr/verbs.h +++ b/drivers/infiniband/hw/qedr/verbs.h @@ -50,9 +50,8 @@ int qedr_mmap(struct ib_ucontext *, struct vm_area_struct *vma); int qedr_alloc_pd(struct ib_pd *pd, struct ib_udata *udata); void qedr_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata); -struct ib_cq *qedr_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); +int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); int qedr_resize_cq(struct ib_cq *, int cqe, struct ib_udata *); void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); diff --git a/drivers/infiniband/hw/usnic/usnic_ib.h b/drivers/infiniband/hw/usnic/usnic_ib.h index 525bf272671e..84dd682d2334 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib.h +++ b/drivers/infiniband/hw/usnic/usnic_ib.h @@ -61,6 +61,10 @@ struct usnic_ib_pd { struct usnic_uiom_pd *umem_pd; }; +struct usnic_ib_cq { + struct ib_cq ibcq; +}; + struct usnic_ib_mr { struct ib_mr ibmr; struct usnic_uiom_reg *umem; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index e701322dc740..6ae5ce007fed 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -354,6 +354,7 @@ static const struct ib_device_ops usnic_dev_ops = { .query_qp = usnic_ib_query_qp, .reg_user_mr = usnic_ib_reg_mr, INIT_RDMA_OBJ_SIZE(ib_pd, usnic_ib_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_cq, usnic_ib_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_ucontext, usnic_ib_ucontext, ibucontext), }; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 5686d14b86fe..eeb07b245ef9 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -587,26 +587,18 @@ out_unlock: return status; } -struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +int usnic_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { - struct ib_cq *cq; - - usnic_dbg("\n"); if (attr->flags) - return ERR_PTR(-EINVAL); + return -EINVAL; - cq = kzalloc(sizeof(*cq), GFP_KERNEL); - if (!cq) - return ERR_PTR(-EBUSY); - - return cq; + return 0; } void usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) { - kfree(cq); + return; } struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h index 0b9d993433a7..2aedf78c13cf 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h @@ -58,9 +58,8 @@ struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd, int usnic_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata); int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); -struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); +int usnic_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); void 
usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c index 0682781f6555..38573fc0a9bf 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c @@ -92,20 +92,19 @@ int pvrdma_req_notify_cq(struct ib_cq *ibcq, /** * pvrdma_create_cq - create completion queue - * @ibdev: the device + * @ibcq: Allocated CQ * @attr: completion queue attributes * @udata: user data * - * @return: ib_cq completion queue pointer on success, - * otherwise returns negative errno. + * @return: 0 on success */ -struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; struct pvrdma_dev *dev = to_vdev(ibdev); - struct pvrdma_cq *cq; + struct pvrdma_cq *cq = to_vcq(ibcq); int ret; int npages; unsigned long flags; @@ -113,7 +112,7 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, union pvrdma_cmd_resp rsp; struct pvrdma_cmd_create_cq *cmd = &req.create_cq; struct pvrdma_cmd_create_cq_resp *resp = &rsp.create_cq_resp; - struct pvrdma_create_cq_resp cq_resp = {0}; + struct pvrdma_create_cq_resp cq_resp = {}; struct pvrdma_create_cq ucmd; struct pvrdma_ucontext *context = rdma_udata_to_drv_context( udata, struct pvrdma_ucontext, ibucontext); @@ -122,16 +121,10 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, entries = roundup_pow_of_two(entries); if (entries < 1 || entries > dev->dsr->caps.max_cqe) - return ERR_PTR(-EINVAL); + return -EINVAL; if (!atomic_add_unless(&dev->num_cqs, 1, dev->dsr->caps.max_cq)) - return ERR_PTR(-ENOMEM); - - cq = kzalloc(sizeof(*cq), GFP_KERNEL); - if (!cq) { - atomic_dec(&dev->num_cqs); - return ERR_PTR(-ENOMEM); - } + return -ENOMEM; cq->ibcq.cqe = entries; cq->is_kernel = !udata; @@ -211,11 +204,11 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, dev_warn(&dev->pdev->dev, "failed to copy back udata\n"); pvrdma_destroy_cq(&cq->ibcq, udata); - return ERR_PTR(-EINVAL); + return -EINVAL; } } - return &cq->ibcq; + return 0; err_page_dir: pvrdma_page_dir_cleanup(dev, &cq->pdir); @@ -224,9 +217,7 @@ err_umem: ib_umem_release(cq->umem); err_cq: atomic_dec(&dev->num_cqs); - kfree(cq); - - return ERR_PTR(ret); + return ret; } static void pvrdma_free_cq(struct pvrdma_dev *dev, struct pvrdma_cq *cq) @@ -239,7 +230,6 @@ static void pvrdma_free_cq(struct pvrdma_dev *dev, struct pvrdma_cq *cq) ib_umem_release(cq->umem); pvrdma_page_dir_cleanup(dev, &cq->pdir); - kfree(cq); } /** diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index 0c48464ffff1..e580ae9cc55a 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -182,6 +182,7 @@ static const struct ib_device_ops pvrdma_dev_ops = { .req_notify_cq = pvrdma_req_notify_cq, INIT_RDMA_OBJ_SIZE(ib_ah, pvrdma_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, pvrdma_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, pvrdma_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_ucontext, pvrdma_ucontext, ibucontext), }; diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h index f0dd6e4d058b..e4a48f5c0c85 100644 
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h @@ -409,9 +409,8 @@ struct ib_mr *pvrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg, struct ib_udata *udata); int pvrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); -struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); +int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); void pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int pvrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int pvrdma_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 8e76036fad4a..b46714a92b7a 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -166,43 +166,37 @@ static void send_complete(struct work_struct *work) /** * rvt_create_cq - create a completion queue - * @ibdev: the device this completion queue is attached to + * @ibcq: Allocated CQ * @attr: creation attributes * @udata: user data for libibverbs.so * * Called by ib_create_cq() in the generic verbs code. * - * Return: pointer to the completion queue or negative errno values - * for failure. + * Return: 0 on success */ -struct ib_cq *rvt_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; struct rvt_dev_info *rdi = ib_to_rvt(ibdev); - struct rvt_cq *cq; + struct rvt_cq *cq = container_of(ibcq, struct rvt_cq, ibcq); struct rvt_cq_wc *wc; - struct ib_cq *ret; u32 sz; unsigned int entries = attr->cqe; int comp_vector = attr->comp_vector; + int err; if (attr->flags) - return ERR_PTR(-EINVAL); + return -EINVAL; if (entries < 1 || entries > rdi->dparms.props.max_cqe) - return ERR_PTR(-EINVAL); + return -EINVAL; if (comp_vector < 0) comp_vector = 0; comp_vector = comp_vector % rdi->ibdev.num_comp_vectors; - /* Allocate the completion queue structure. */ - cq = kzalloc_node(sizeof(*cq), GFP_KERNEL, rdi->dparms.node); - if (!cq) - return ERR_PTR(-ENOMEM); - /* * Allocate the completion queue entries and head/tail pointers. * This is allocated separately so that it can be resized and @@ -218,36 +212,29 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, wc = udata ? vmalloc_user(sz) : vzalloc_node(sz, rdi->dparms.node); - if (!wc) { - ret = ERR_PTR(-ENOMEM); - goto bail_cq; - } - + if (!wc) + return -ENOMEM; /* * Return the address of the WC as the offset to mmap. * See rvt_mmap() for details. 
*/ if (udata && udata->outlen >= sizeof(__u64)) { - int err; - cq->ip = rvt_create_mmap_info(rdi, sz, udata, wc); if (!cq->ip) { - ret = ERR_PTR(-ENOMEM); + err = -ENOMEM; goto bail_wc; } err = ib_copy_to_udata(udata, &cq->ip->offset, sizeof(cq->ip->offset)); - if (err) { - ret = ERR_PTR(err); + if (err) goto bail_ip; - } } spin_lock_irq(&rdi->n_cqs_lock); if (rdi->n_cqs_allocated == rdi->dparms.props.max_cq) { spin_unlock_irq(&rdi->n_cqs_lock); - ret = ERR_PTR(-ENOMEM); + err = -ENOMEM; goto bail_ip; } @@ -279,19 +266,14 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, INIT_WORK(&cq->comptask, send_complete); cq->queue = wc; - ret = &cq->ibcq; - trace_rvt_create_cq(cq, attr); - goto done; + return 0; bail_ip: kfree(cq->ip); bail_wc: vfree(wc); -bail_cq: - kfree(cq); -done: - return ret; + return err; } /** @@ -314,7 +296,6 @@ void rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) kref_put(&cq->ip->ref, rvt_release_mmap_info); else vfree(cq->queue); - kfree(cq); } /** diff --git a/drivers/infiniband/sw/rdmavt/cq.h b/drivers/infiniband/sw/rdmavt/cq.h index 495d8c3e6580..5e26a2eb19a4 100644 --- a/drivers/infiniband/sw/rdmavt/cq.h +++ b/drivers/infiniband/sw/rdmavt/cq.h @@ -51,9 +51,8 @@ #include #include -struct ib_cq *rvt_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); +int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); void rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags); int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 639ef8ac5400..18da1e1ea979 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -429,6 +429,7 @@ static const struct ib_device_ops rvt_dev_ops = { .unmap_fmr = rvt_unmap_fmr, INIT_RDMA_OBJ_SIZE(ib_ah, rvt_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, rvt_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, rvt_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_srq, rvt_srq, ibsrq), INIT_RDMA_OBJ_SIZE(ib_ucontext, rvt_ucontext, ibucontext), diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index 56cf18af016a..fbcbac52290b 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -72,6 +72,7 @@ struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = { [RXE_TYPE_CQ] = { .name = "rxe-cq", .size = sizeof(struct rxe_cq), + .flags = RXE_POOL_NO_ALLOC, .cleanup = rxe_cq_cleanup, }, [RXE_TYPE_MR] = { diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index b14881decbee..4ebdfcf4d33e 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -778,45 +778,34 @@ err1: return err; } -static struct ib_cq *rxe_create_cq(struct ib_device *dev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +static int rxe_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { int err; + struct ib_device *dev = ibcq->device; struct rxe_dev *rxe = to_rdev(dev); - struct rxe_cq *cq; + struct rxe_cq *cq = to_rcq(ibcq); struct rxe_create_cq_resp __user *uresp = NULL; if (udata) { if (udata->outlen < sizeof(*uresp)) - return ERR_PTR(-EINVAL); + return -EINVAL; uresp = udata->outbuf; } if (attr->flags) - return ERR_PTR(-EINVAL); + return -EINVAL; err = rxe_cq_chk_attr(rxe, NULL, attr->cqe, 
attr->comp_vector); if (err) - goto err1; - - cq = rxe_alloc(&rxe->cq_pool); - if (!cq) { - err = -ENOMEM; - goto err1; - } + return err; err = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector, udata, uresp); if (err) - goto err2; + return err; - return &cq->ibcq; - -err2: - rxe_drop_ref(cq); -err1: - return ERR_PTR(err); + return rxe_add_to_pool(&rxe->cq_pool, &cq->pelem); } static void rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) @@ -1160,6 +1149,7 @@ static const struct ib_device_ops rxe_dev_ops = { .resize_cq = rxe_resize_cq, INIT_RDMA_OBJ_SIZE(ib_ah, rxe_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, rxe_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, rxe_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_srq, rxe_srq, ibsrq), INIT_RDMA_OBJ_SIZE(ib_ucontext, rxe_ucontext, ibuc), diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index e8be7f44e3be..6c997d39a418 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -85,8 +85,8 @@ struct rxe_cqe { }; struct rxe_cq { - struct rxe_pool_entry pelem; struct ib_cq ibcq; + struct rxe_pool_entry pelem; struct rxe_queue *queue; spinlock_t cq_lock; u8 notify; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index bc1d94c9c9ba..f357e03a85a6 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2458,9 +2458,8 @@ struct ib_device_ops { int (*query_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); int (*destroy_qp)(struct ib_qp *qp, struct ib_udata *udata); - struct ib_cq *(*create_cq)(struct ib_device *device, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + int (*create_cq)(struct ib_cq *cq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period); void (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata); int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata); @@ -2601,6 +2600,7 @@ struct ib_device_ops { int (*iw_destroy_listen)(struct iw_cm_id *cm_id); DECLARE_RDMA_OBJ_SIZE(ib_ah); + DECLARE_RDMA_OBJ_SIZE(ib_cq); DECLARE_RDMA_OBJ_SIZE(ib_pd); DECLARE_RDMA_OBJ_SIZE(ib_srq); DECLARE_RDMA_OBJ_SIZE(ib_ucontext); From cbdc666f3e842b01d4537933e0a64f1e7cf17017 Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Thu, 30 May 2019 16:18:17 +0300 Subject: [PATCH 056/194] RDMA/ipoib: Remove check for ETH_SS_TEST The default action for unlisted tests is "not-supported", so given that ipoib doesn't support ETH_SS_TEST, there is no need to check for it in the case statements; just let it get caught by the default: case.
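For reference, with the ETH_SS_TEST label dropped, ipoib_get_sset_count() is expected to reduce to roughly the sketch below. This is reconstructed from the hunk that follows and is not a verbatim copy of the post-patch file; in particular, the trailing -EOPNOTSUPP return is an assumption about the unchanged remainder of the function outside this hunk.

static int ipoib_get_sset_count(struct net_device __always_unused *dev,
				int sset)
{
	switch (sset) {
	case ETH_SS_STATS:
		/* statistics remain the only string set ipoib reports */
		return IPOIB_GLOBAL_STATS_LEN;
	default:
		/* anything else, including ETH_SS_TEST, lands here */
		break;
	}

	return -EOPNOTSUPP;	/* assumed unchanged context outside this hunk */
}

ipoib_get_strings() keeps the same shape: only ETH_SS_STATS fills in strings, and every other string set falls through to the default label.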
Fixes: e3614bc9dc44 ("IB/ipoib: Add readout of statistics using ethtool") Signed-off-by: Kamal Heib Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib_ethtool.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index 58016532bf86..63e4f9d15fd9 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -138,7 +138,6 @@ static void ipoib_get_strings(struct net_device __always_unused *dev, p += ETH_GSTRING_LEN; } break; - case ETH_SS_TEST: default: break; } @@ -149,7 +148,6 @@ static int ipoib_get_sset_count(struct net_device __always_unused *dev, switch (sset) { case ETH_SS_STATS: return IPOIB_GLOBAL_STATS_LEN; - case ETH_SS_TEST: default: break; } From 2d3c72ed504196edb2f22a08cb03a0f9fb7e564f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 10 Jun 2019 16:49:11 -0300 Subject: [PATCH 057/194] rdma: Remove nes This driver was first merged over 10 years ago and has not seen major activity by the authors in the last 7 years. However, in that time it has been patched 150 times to adapt it to changing kernel APIs. Further, the hardware has several issues, like not supporting 64 bit DMA, that make it rather uninteresting for use with modern systems and RDMA. Signed-off-by: Jason Gunthorpe Reviewed-by: Leon Romanovsky Reviewed-by: Shiraz Saleem Signed-off-by: Doug Ledford --- .../ABI/stable/sysfs-class-infiniband | 17 - MAINTAINERS | 8 - drivers/infiniband/Kconfig | 1 - drivers/infiniband/hw/Makefile | 1 - drivers/infiniband/hw/nes/Kconfig | 15 - drivers/infiniband/hw/nes/Makefile | 3 - drivers/infiniband/hw/nes/nes.c | 1205 ----- drivers/infiniband/hw/nes/nes.h | 574 --- drivers/infiniband/hw/nes/nes_cm.c | 3992 ----------------- drivers/infiniband/hw/nes/nes_cm.h | 470 -- drivers/infiniband/hw/nes/nes_context.h | 193 - drivers/infiniband/hw/nes/nes_hw.c | 3887 ---------------- drivers/infiniband/hw/nes/nes_hw.h | 1380 ------ drivers/infiniband/hw/nes/nes_mgt.c | 1155 ----- drivers/infiniband/hw/nes/nes_mgt.h | 97 - drivers/infiniband/hw/nes/nes_nic.c | 1870 -------- drivers/infiniband/hw/nes/nes_utils.c | 913 ---- drivers/infiniband/hw/nes/nes_verbs.c | 3721 --------------- drivers/infiniband/hw/nes/nes_verbs.h | 198 - 19 files changed, 19700 deletions(-) delete mode 100644 drivers/infiniband/hw/nes/Kconfig delete mode 100644 drivers/infiniband/hw/nes/Makefile delete mode 100644 drivers/infiniband/hw/nes/nes.c delete mode 100644 drivers/infiniband/hw/nes/nes.h delete mode 100644 drivers/infiniband/hw/nes/nes_cm.c delete mode 100644 drivers/infiniband/hw/nes/nes_cm.h delete mode 100644 drivers/infiniband/hw/nes/nes_context.h delete mode 100644 drivers/infiniband/hw/nes/nes_hw.c delete mode 100644 drivers/infiniband/hw/nes/nes_hw.h delete mode 100644 drivers/infiniband/hw/nes/nes_mgt.c delete mode 100644 drivers/infiniband/hw/nes/nes_mgt.h delete mode 100644 drivers/infiniband/hw/nes/nes_nic.c delete mode 100644 drivers/infiniband/hw/nes/nes_utils.c delete mode 100644 drivers/infiniband/hw/nes/nes_verbs.c delete mode 100644 drivers/infiniband/hw/nes/nes_verbs.h diff --git a/Documentation/ABI/stable/sysfs-class-infiniband b/Documentation/ABI/stable/sysfs-class-infiniband index 17211ceb9bf4..aed21b8916a2 100644 --- a/Documentation/ABI/stable/sysfs-class-infiniband +++ b/Documentation/ABI/stable/sysfs-class-infiniband @@ -423,23 +423,6 @@ Description: (e.g. driver restart on the VM which owns the VF). 
-sysfs interface for NetEffect RNIC Low-Level iWARP driver (nes) ---------------------------------------------------------------- - -What: /sys/class/infiniband/nesX/hw_rev -What: /sys/class/infiniband/nesX/hca_type -What: /sys/class/infiniband/nesX/board_id -Date: Feb, 2008 -KernelVersion: v2.6.25 -Contact: linux-rdma@vger.kernel.org -Description: - hw_rev: (RO) Hardware revision number - - hca_type: (RO) Host Channel Adapter type (NEX020) - - board_id: (RO) Manufacturing board id - - sysfs interface for Chelsio T4/T5 RDMA driver (cxgb4) ----------------------------------------------------- diff --git a/MAINTAINERS b/MAINTAINERS index 5cfbea4ce575..9ac03f3e3bd5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10830,14 +10830,6 @@ F: driver/net/net_failover.c F: include/net/net_failover.h F: Documentation/networking/net_failover.rst -NETEFFECT IWARP RNIC DRIVER (IW_NES) -M: Faisal Latif -L: linux-rdma@vger.kernel.org -W: http://www.intel.com/Products/Server/Adapters/Server-Cluster/Server-Cluster-overview.htm -S: Supported -F: drivers/infiniband/hw/nes/ -F: include/uapi/rdma/nes-abi.h - NETEM NETWORK EMULATOR M: Stephen Hemminger L: netem@lists.linux-foundation.org (moderated for non-subscribers) diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index cbaafa4e0302..0fe6f76e8fdc 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -86,7 +86,6 @@ source "drivers/infiniband/hw/efa/Kconfig" source "drivers/infiniband/hw/i40iw/Kconfig" source "drivers/infiniband/hw/mlx4/Kconfig" source "drivers/infiniband/hw/mlx5/Kconfig" -source "drivers/infiniband/hw/nes/Kconfig" source "drivers/infiniband/hw/ocrdma/Kconfig" source "drivers/infiniband/hw/vmw_pvrdma/Kconfig" source "drivers/infiniband/hw/usnic/Kconfig" diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile index 77094be1b262..433fca59febd 100644 --- a/drivers/infiniband/hw/Makefile +++ b/drivers/infiniband/hw/Makefile @@ -7,7 +7,6 @@ obj-$(CONFIG_INFINIBAND_EFA) += efa/ obj-$(CONFIG_INFINIBAND_I40IW) += i40iw/ obj-$(CONFIG_MLX4_INFINIBAND) += mlx4/ obj-$(CONFIG_MLX5_INFINIBAND) += mlx5/ -obj-$(CONFIG_INFINIBAND_NES) += nes/ obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma/ obj-$(CONFIG_INFINIBAND_VMWARE_PVRDMA) += vmw_pvrdma/ obj-$(CONFIG_INFINIBAND_USNIC) += usnic/ diff --git a/drivers/infiniband/hw/nes/Kconfig b/drivers/infiniband/hw/nes/Kconfig deleted file mode 100644 index 52caae954e4a..000000000000 --- a/drivers/infiniband/hw/nes/Kconfig +++ /dev/null @@ -1,15 +0,0 @@ -config INFINIBAND_NES - tristate "NetEffect RNIC Driver" - depends on PCI && INET - select LIBCRC32C - ---help--- - This is the RDMA Network Interface Card (RNIC) driver for - NetEffect Ethernet Cluster Server Adapters. - -config INFINIBAND_NES_DEBUG - bool "Verbose debugging output" - depends on INFINIBAND_NES - default n - ---help--- - This option enables debug messages from the NetEffect RNIC - driver. Select this if you are diagnosing a problem. 
diff --git a/drivers/infiniband/hw/nes/Makefile b/drivers/infiniband/hw/nes/Makefile deleted file mode 100644 index 97820c23ecef..000000000000 --- a/drivers/infiniband/hw/nes/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -obj-$(CONFIG_INFINIBAND_NES) += iw_nes.o - -iw_nes-objs := nes.o nes_hw.o nes_nic.o nes_utils.o nes_verbs.o nes_cm.o nes_mgt.o diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c deleted file mode 100644 index e00add6d78ec..000000000000 --- a/drivers/infiniband/hw/nes/nes.c +++ /dev/null @@ -1,1205 +0,0 @@ -/* - * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. - * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "nes.h" - -#include -#include -#include -#include - -MODULE_AUTHOR("NetEffect"); -MODULE_DESCRIPTION("NetEffect RNIC Low-level iWARP Driver"); -MODULE_LICENSE("Dual BSD/GPL"); - -int interrupt_mod_interval = 0; - -/* Interoperability */ -int mpa_version = 1; -module_param(mpa_version, int, 0644); -MODULE_PARM_DESC(mpa_version, "MPA version to be used int MPA Req/Resp (0 or 1)"); - -/* Interoperability */ -int disable_mpa_crc = 0; -module_param(disable_mpa_crc, int, 0644); -MODULE_PARM_DESC(disable_mpa_crc, "Disable checking of MPA CRC"); - -unsigned int nes_drv_opt = NES_DRV_OPT_DISABLE_INT_MOD | NES_DRV_OPT_ENABLE_PAU; -module_param(nes_drv_opt, int, 0644); -MODULE_PARM_DESC(nes_drv_opt, "Driver option parameters"); - -unsigned int nes_debug_level = 0; -module_param_named(debug_level, nes_debug_level, uint, 0644); -MODULE_PARM_DESC(debug_level, "Enable debug output level"); - -unsigned int wqm_quanta = 0x10000; -module_param(wqm_quanta, int, 0644); -MODULE_PARM_DESC(wqm_quanta, "WQM quanta"); - -static bool limit_maxrdreqsz; -module_param(limit_maxrdreqsz, bool, 0644); -MODULE_PARM_DESC(limit_maxrdreqsz, "Limit max read request size to 256 Bytes"); - -LIST_HEAD(nes_adapter_list); -static LIST_HEAD(nes_dev_list); - -atomic_t qps_destroyed; - -static unsigned int ee_flsh_adapter; -static unsigned int sysfs_nonidx_addr; -static unsigned int sysfs_idx_addr; - -static const struct pci_device_id nes_pci_table[] = { - { PCI_VDEVICE(NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020), }, - { PCI_VDEVICE(NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020_KR), }, - {0} -}; - -MODULE_DEVICE_TABLE(pci, nes_pci_table); - -static int nes_inetaddr_event(struct notifier_block *, unsigned long, void *); -static int nes_net_event(struct notifier_block *, unsigned long, void *); -static int nes_notifiers_registered; - - -static struct notifier_block nes_inetaddr_notifier = { - .notifier_call = nes_inetaddr_event -}; - -static struct notifier_block nes_net_notifier = { - .notifier_call = nes_net_event -}; - -/** - * nes_inetaddr_event - */ -static int nes_inetaddr_event(struct notifier_block *notifier, - unsigned long event, void *ptr) -{ - struct in_ifaddr *ifa = ptr; - struct net_device *event_netdev = ifa->ifa_dev->dev; - struct nes_device *nesdev; - struct net_device *netdev; - struct net_device *upper_dev; - struct nes_vnic *nesvnic; - unsigned int is_bonded; - - nes_debug(NES_DBG_NETDEV, "nes_inetaddr_event: ip address %pI4, netmask %pI4.\n", - &ifa->ifa_address, &ifa->ifa_mask); - list_for_each_entry(nesdev, &nes_dev_list, list) { - nes_debug(NES_DBG_NETDEV, "Nesdev list entry = 0x%p. 
(%s)\n", - nesdev, nesdev->netdev[0]->name); - netdev = nesdev->netdev[0]; - nesvnic = netdev_priv(netdev); - upper_dev = netdev_master_upper_dev_get(netdev); - is_bonded = netif_is_bond_slave(netdev) && - (upper_dev == event_netdev); - if ((netdev == event_netdev) || is_bonded) { - if (nesvnic->rdma_enabled == 0) { - nes_debug(NES_DBG_NETDEV, "Returning without processing event for %s since" - " RDMA is not enabled.\n", - netdev->name); - return NOTIFY_OK; - } - /* we have ifa->ifa_address/mask here if we need it */ - switch (event) { - case NETDEV_DOWN: - nes_debug(NES_DBG_NETDEV, "event:DOWN\n"); - nes_write_indexed(nesdev, - NES_IDX_DST_IP_ADDR+(0x10*PCI_FUNC(nesdev->pcidev->devfn)), 0); - - nes_manage_arp_cache(netdev, netdev->dev_addr, - ntohl(nesvnic->local_ipaddr), NES_ARP_DELETE); - nesvnic->local_ipaddr = 0; - if (is_bonded) - continue; - else - return NOTIFY_OK; - break; - case NETDEV_UP: - nes_debug(NES_DBG_NETDEV, "event:UP\n"); - - if (nesvnic->local_ipaddr != 0) { - nes_debug(NES_DBG_NETDEV, "Interface already has local_ipaddr\n"); - return NOTIFY_OK; - } - /* fall through */ - case NETDEV_CHANGEADDR: - /* Add the address to the IP table */ - if (upper_dev) { - struct in_device *in; - - rcu_read_lock(); - in = __in_dev_get_rcu(upper_dev); - nesvnic->local_ipaddr = in->ifa_list->ifa_address; - rcu_read_unlock(); - } else { - nesvnic->local_ipaddr = ifa->ifa_address; - } - - nes_write_indexed(nesdev, - NES_IDX_DST_IP_ADDR+(0x10*PCI_FUNC(nesdev->pcidev->devfn)), - ntohl(nesvnic->local_ipaddr)); - nes_manage_arp_cache(netdev, netdev->dev_addr, - ntohl(nesvnic->local_ipaddr), NES_ARP_ADD); - if (is_bonded) - continue; - else - return NOTIFY_OK; - break; - default: - break; - } - } - } - - return NOTIFY_DONE; -} - - -/** - * nes_net_event - */ -static int nes_net_event(struct notifier_block *notifier, - unsigned long event, void *ptr) -{ - struct neighbour *neigh = ptr; - struct nes_device *nesdev; - struct net_device *netdev; - struct nes_vnic *nesvnic; - - switch (event) { - case NETEVENT_NEIGH_UPDATE: - list_for_each_entry(nesdev, &nes_dev_list, list) { - /* nes_debug(NES_DBG_NETDEV, "Nesdev list entry = 0x%p.\n", nesdev); */ - netdev = nesdev->netdev[0]; - nesvnic = netdev_priv(netdev); - if (netdev == neigh->dev) { - if (nesvnic->rdma_enabled == 0) { - nes_debug(NES_DBG_NETDEV, "Skipping device %s since no RDMA\n", - netdev->name); - } else { - if (neigh->nud_state & NUD_VALID) { - nes_manage_arp_cache(neigh->dev, neigh->ha, - ntohl(*(__be32 *)neigh->primary_key), NES_ARP_ADD); - } else { - nes_manage_arp_cache(neigh->dev, neigh->ha, - ntohl(*(__be32 *)neigh->primary_key), NES_ARP_DELETE); - } - } - return NOTIFY_OK; - } - } - break; - default: - nes_debug(NES_DBG_NETDEV, "NETEVENT_ %lu undefined\n", event); - break; - } - - return NOTIFY_DONE; -} - - -/** - * nes_add_ref - */ -void nes_add_ref(struct ib_qp *ibqp) -{ - struct nes_qp *nesqp; - - nesqp = to_nesqp(ibqp); - nes_debug(NES_DBG_QP, "Bumping refcount for QP%u. 
Pre-inc value = %u\n", - ibqp->qp_num, atomic_read(&nesqp->refcount)); - atomic_inc(&nesqp->refcount); -} - -static void nes_cqp_rem_ref_callback(struct nes_device *nesdev, struct nes_cqp_request *cqp_request) -{ - unsigned long flags; - struct nes_qp *nesqp = cqp_request->cqp_callback_pointer; - struct nes_adapter *nesadapter = nesdev->nesadapter; - - atomic_inc(&qps_destroyed); - - /* Free the control structures */ - - if (nesqp->pbl_vbase) { - pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, - nesqp->hwqp.q2_vbase, nesqp->hwqp.q2_pbase); - spin_lock_irqsave(&nesadapter->pbl_lock, flags); - nesadapter->free_256pbl++; - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase); - nesqp->pbl_vbase = NULL; - - } else { - pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, - nesqp->hwqp.sq_vbase, nesqp->hwqp.sq_pbase); - } - nes_free_resource(nesadapter, nesadapter->allocated_qps, nesqp->hwqp.qp_id); - - nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = NULL; - kfree(nesqp->allocated_buffer); - -} - -/** - * nes_rem_ref - */ -void nes_rem_ref(struct ib_qp *ibqp) -{ - u64 u64temp; - struct nes_qp *nesqp; - struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_hw_cqp_wqe *cqp_wqe; - struct nes_cqp_request *cqp_request; - u32 opcode; - - nesqp = to_nesqp(ibqp); - - if (atomic_read(&nesqp->refcount) == 0) { - printk(KERN_INFO PFX "%s: Reference count already 0 for QP%d, last aeq = 0x%04X.\n", - __func__, ibqp->qp_num, nesqp->last_aeq); - BUG(); - } - - if (atomic_dec_and_test(&nesqp->refcount)) { - if (nesqp->pau_mode) - nes_destroy_pau_qp(nesdev, nesqp); - - /* Destroy the QP */ - cqp_request = nes_get_cqp_request(nesdev); - if (cqp_request == NULL) { - nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n"); - return; - } - cqp_request->waiting = 0; - cqp_request->callback = 1; - cqp_request->cqp_callback = nes_cqp_rem_ref_callback; - cqp_request->cqp_callback_pointer = nesqp; - cqp_wqe = &cqp_request->cqp_wqe; - - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - opcode = NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_IWARP; - - if (nesqp->hte_added) { - opcode |= NES_CQP_QP_DEL_HTE; - nesqp->hte_added = 0; - } - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); - u64temp = (u64)nesqp->nesqp_context_pbase; - set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); - nes_post_cqp_request(nesdev, cqp_request); - } -} - - -/** - * nes_get_qp - */ -struct ib_qp *nes_get_qp(struct ib_device *device, int qpn) -{ - struct nes_vnic *nesvnic = to_nesvnic(device); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_adapter *nesadapter = nesdev->nesadapter; - - if ((qpn < NES_FIRST_QPN) || (qpn >= (NES_FIRST_QPN + nesadapter->max_qp))) - return NULL; - - return &nesadapter->qp_table[qpn - NES_FIRST_QPN]->ibqp; -} - - -/** - * nes_print_macaddr - */ -static void nes_print_macaddr(struct net_device *netdev) -{ - nes_debug(NES_DBG_INIT, "%s: %pM, IRQ %u\n", - netdev->name, netdev->dev_addr, netdev->irq); -} - -/** - * nes_interrupt - handle interrupts - */ -static irqreturn_t nes_interrupt(int irq, void *dev_id) -{ - struct nes_device *nesdev = (struct nes_device *)dev_id; - int handled = 0; - u32 int_mask; - u32 int_req; - u32 int_stat; - u32 intf_int_stat; - u32 timer_stat; - - if (nesdev->msi_enabled) { - /* No need to read the interrupt 
pending register if msi is enabled */ - handled = 1; - } else { - if (unlikely(nesdev->nesadapter->hw_rev == NE020_REV)) { - /* Master interrupt enable provides synchronization for kicking off bottom half - when interrupt sharing is going on */ - int_mask = nes_read32(nesdev->regs + NES_INT_MASK); - if (int_mask & 0x80000000) { - /* Check interrupt status to see if this might be ours */ - int_stat = nes_read32(nesdev->regs + NES_INT_STAT); - int_req = nesdev->int_req; - if (int_stat&int_req) { - /* if interesting CEQ or AEQ is pending, claim the interrupt */ - if ((int_stat&int_req) & (~(NES_INT_TIMER|NES_INT_INTF))) { - handled = 1; - } else { - if (((int_stat & int_req) & NES_INT_TIMER) == NES_INT_TIMER) { - /* Timer might be running but might be for another function */ - timer_stat = nes_read32(nesdev->regs + NES_TIMER_STAT); - if ((timer_stat & nesdev->timer_int_req) != 0) { - handled = 1; - } - } - if ((((int_stat & int_req) & NES_INT_INTF) == NES_INT_INTF) && - (handled == 0)) { - intf_int_stat = nes_read32(nesdev->regs+NES_INTF_INT_STAT); - if ((intf_int_stat & nesdev->intf_int_req) != 0) { - handled = 1; - } - } - } - if (handled) { - nes_write32(nesdev->regs+NES_INT_MASK, int_mask & (~0x80000000)); - int_mask = nes_read32(nesdev->regs+NES_INT_MASK); - /* Save off the status to save an additional read */ - nesdev->int_stat = int_stat; - nesdev->napi_isr_ran = 1; - } - } - } - } else { - handled = nes_read32(nesdev->regs+NES_INT_PENDING); - } - } - - if (handled) { - - if (nes_napi_isr(nesdev) == 0) { - tasklet_schedule(&nesdev->dpc_tasklet); - - } - return IRQ_HANDLED; - } else { - return IRQ_NONE; - } -} - - -/** - * nes_probe - Device initialization - */ -static int nes_probe(struct pci_dev *pcidev, const struct pci_device_id *ent) -{ - struct net_device *netdev = NULL; - struct nes_device *nesdev = NULL; - int ret = 0; - void __iomem *mmio_regs = NULL; - u8 hw_rev; - - printk(KERN_INFO PFX "NetEffect RNIC driver v%s loading. (%s)\n", - DRV_VERSION, pci_name(pcidev)); - - ret = pci_enable_device(pcidev); - if (ret) { - printk(KERN_ERR PFX "Unable to enable PCI device. (%s)\n", pci_name(pcidev)); - goto bail0; - } - - nes_debug(NES_DBG_INIT, "BAR0 (@0x%08lX) size = 0x%lX bytes\n", - (long unsigned int)pci_resource_start(pcidev, BAR_0), - (long unsigned int)pci_resource_len(pcidev, BAR_0)); - nes_debug(NES_DBG_INIT, "BAR1 (@0x%08lX) size = 0x%lX bytes\n", - (long unsigned int)pci_resource_start(pcidev, BAR_1), - (long unsigned int)pci_resource_len(pcidev, BAR_1)); - - /* Make sure PCI base addr are MMIO */ - if (!(pci_resource_flags(pcidev, BAR_0) & IORESOURCE_MEM) || - !(pci_resource_flags(pcidev, BAR_1) & IORESOURCE_MEM)) { - printk(KERN_ERR PFX "PCI regions not an MMIO resource\n"); - ret = -ENODEV; - goto bail1; - } - - /* Reserve PCI I/O and memory resources */ - ret = pci_request_regions(pcidev, DRV_NAME); - if (ret) { - printk(KERN_ERR PFX "Unable to request regions. 
(%s)\n", pci_name(pcidev)); - goto bail1; - } - - if ((sizeof(dma_addr_t) > 4)) { - ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(64)); - if (ret < 0) { - printk(KERN_ERR PFX "64b DMA mask configuration failed\n"); - goto bail2; - } - ret = pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(64)); - if (ret) { - printk(KERN_ERR PFX "64b DMA consistent mask configuration failed\n"); - goto bail2; - } - } else { - ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32)); - if (ret < 0) { - printk(KERN_ERR PFX "32b DMA mask configuration failed\n"); - goto bail2; - } - ret = pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(32)); - if (ret) { - printk(KERN_ERR PFX "32b DMA consistent mask configuration failed\n"); - goto bail2; - } - } - - pci_set_master(pcidev); - - /* Allocate hardware structure */ - nesdev = kzalloc(sizeof(struct nes_device), GFP_KERNEL); - if (!nesdev) { - ret = -ENOMEM; - goto bail2; - } - - nes_debug(NES_DBG_INIT, "Allocated nes device at %p\n", nesdev); - nesdev->pcidev = pcidev; - pci_set_drvdata(pcidev, nesdev); - - pci_read_config_byte(pcidev, 0x0008, &hw_rev); - nes_debug(NES_DBG_INIT, "hw_rev=%u\n", hw_rev); - - spin_lock_init(&nesdev->indexed_regs_lock); - - /* Remap the PCI registers in adapter BAR0 to kernel VA space */ - mmio_regs = ioremap_nocache(pci_resource_start(pcidev, BAR_0), - pci_resource_len(pcidev, BAR_0)); - if (mmio_regs == NULL) { - printk(KERN_ERR PFX "Unable to remap BAR0\n"); - ret = -EIO; - goto bail3; - } - nesdev->regs = mmio_regs; - nesdev->index_reg = 0x50 + (PCI_FUNC(pcidev->devfn)*8) + mmio_regs; - - /* Ensure interrupts are disabled */ - nes_write32(nesdev->regs+NES_INT_MASK, 0x7fffffff); - - if (nes_drv_opt & NES_DRV_OPT_ENABLE_MSI) { - if (!pci_enable_msi(nesdev->pcidev)) { - nesdev->msi_enabled = 1; - nes_debug(NES_DBG_INIT, "MSI is enabled for device %s\n", - pci_name(pcidev)); - } else { - nes_debug(NES_DBG_INIT, "MSI is disabled by linux for device %s\n", - pci_name(pcidev)); - } - } else { - nes_debug(NES_DBG_INIT, "MSI not requested due to driver options for device %s\n", - pci_name(pcidev)); - } - - nesdev->csr_start = pci_resource_start(nesdev->pcidev, BAR_0); - nesdev->doorbell_region = pci_resource_start(nesdev->pcidev, BAR_1); - - /* Init the adapter */ - nesdev->nesadapter = nes_init_adapter(nesdev, hw_rev); - if (!nesdev->nesadapter) { - printk(KERN_ERR PFX "Unable to initialize adapter.\n"); - ret = -ENOMEM; - goto bail5; - } - nesdev->nesadapter->et_rx_coalesce_usecs_irq = interrupt_mod_interval; - nesdev->nesadapter->wqm_quanta = wqm_quanta; - - /* nesdev->base_doorbell_index = - nesdev->nesadapter->pd_config_base[PCI_FUNC(nesdev->pcidev->devfn)]; */ - nesdev->base_doorbell_index = 1; - nesdev->doorbell_start = nesdev->nesadapter->doorbell_start; - if (nesdev->nesadapter->phy_type[0] == NES_PHY_TYPE_PUMA_1G) { - switch (PCI_FUNC(nesdev->pcidev->devfn) % - nesdev->nesadapter->port_count) { - case 1: - nesdev->mac_index = 2; - break; - case 2: - nesdev->mac_index = 1; - break; - case 3: - nesdev->mac_index = 3; - break; - case 0: - default: - nesdev->mac_index = 0; - } - } else { - nesdev->mac_index = PCI_FUNC(nesdev->pcidev->devfn) % - nesdev->nesadapter->port_count; - } - - if ((limit_maxrdreqsz || - ((nesdev->nesadapter->phy_type[0] == NES_PHY_TYPE_GLADIUS) && - (hw_rev == NE020_REV1))) && - (pcie_get_readrq(pcidev) > 256)) { - if (pcie_set_readrq(pcidev, 256)) - printk(KERN_ERR PFX "Unable to set max read request" - " to 256 bytes\n"); - else - nes_debug(NES_DBG_INIT, "Max read request size set" - " to 256 bytes\n"); - } - - 
tasklet_init(&nesdev->dpc_tasklet, nes_dpc, (unsigned long)nesdev); - - /* bring up the Control QP */ - if (nes_init_cqp(nesdev)) { - ret = -ENODEV; - goto bail6; - } - - /* Arm the CCQ */ - nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | - PCI_FUNC(nesdev->pcidev->devfn)); - nes_read32(nesdev->regs+NES_CQE_ALLOC); - - /* Enable the interrupts */ - nesdev->int_req = (0x101 << PCI_FUNC(nesdev->pcidev->devfn)) | - (1 << (PCI_FUNC(nesdev->pcidev->devfn)+16)); - if (PCI_FUNC(nesdev->pcidev->devfn) < 4) { - nesdev->int_req |= (1 << (PCI_FUNC(nesdev->mac_index)+24)); - } - - /* TODO: This really should be the first driver to load, not function 0 */ - if (PCI_FUNC(nesdev->pcidev->devfn) == 0) { - /* pick up PCI and critical errors if the first driver to load */ - nesdev->intf_int_req = NES_INTF_INT_PCIERR | NES_INTF_INT_CRITERR; - nesdev->int_req |= NES_INT_INTF; - } else { - nesdev->intf_int_req = 0; - } - nesdev->intf_int_req |= (1 << (PCI_FUNC(nesdev->pcidev->devfn)+16)); - nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS0, 0); - nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS1, 0); - nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS2, 0x00001265); - nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS4, 0x18021804); - - nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS3, 0x17801790); - - /* deal with both periodic and one_shot */ - nesdev->timer_int_req = 0x101 << PCI_FUNC(nesdev->pcidev->devfn); - nesdev->nesadapter->timer_int_req |= nesdev->timer_int_req; - nes_debug(NES_DBG_INIT, "setting int_req for function %u, nesdev = 0x%04X, adapter = 0x%04X\n", - PCI_FUNC(nesdev->pcidev->devfn), - nesdev->timer_int_req, nesdev->nesadapter->timer_int_req); - - nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req)); - - list_add_tail(&nesdev->list, &nes_dev_list); - - /* Request an interrupt line for the driver */ - ret = request_irq(pcidev->irq, nes_interrupt, IRQF_SHARED, DRV_NAME, nesdev); - if (ret) { - printk(KERN_ERR PFX "%s: requested IRQ %u is busy\n", - pci_name(pcidev), pcidev->irq); - goto bail65; - } - - nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req); - - if (nes_notifiers_registered == 0) { - register_inetaddr_notifier(&nes_inetaddr_notifier); - register_netevent_notifier(&nes_net_notifier); - } - nes_notifiers_registered++; - - INIT_DELAYED_WORK(&nesdev->work, nes_recheck_link_status); - - /* Initialize network devices */ - netdev = nes_netdev_init(nesdev, mmio_regs); - if (netdev == NULL) { - ret = -ENOMEM; - goto bail7; - } - - /* Register network device */ - ret = register_netdev(netdev); - if (ret) { - printk(KERN_ERR PFX "Unable to register netdev, ret = %d\n", ret); - nes_netdev_destroy(netdev); - goto bail7; - } - - nes_print_macaddr(netdev); - - nesdev->netdev_count++; - nesdev->nesadapter->netdev_count++; - - printk(KERN_INFO PFX "%s: NetEffect RNIC driver successfully loaded.\n", - pci_name(pcidev)); - return 0; - - bail7: - printk(KERN_ERR PFX "bail7\n"); - while (nesdev->netdev_count > 0) { - nesdev->netdev_count--; - nesdev->nesadapter->netdev_count--; - - unregister_netdev(nesdev->netdev[nesdev->netdev_count]); - nes_netdev_destroy(nesdev->netdev[nesdev->netdev_count]); - } - - nes_debug(NES_DBG_INIT, "netdev_count=%d, nesadapter->netdev_count=%d\n", - nesdev->netdev_count, nesdev->nesadapter->netdev_count); - - nes_notifiers_registered--; - if (nes_notifiers_registered == 0) { - unregister_netevent_notifier(&nes_net_notifier); - unregister_inetaddr_notifier(&nes_inetaddr_notifier); - } - - list_del(&nesdev->list); - 
nes_destroy_cqp(nesdev); - - bail65: - printk(KERN_ERR PFX "bail65\n"); - free_irq(pcidev->irq, nesdev); - if (nesdev->msi_enabled) { - pci_disable_msi(pcidev); - } - bail6: - printk(KERN_ERR PFX "bail6\n"); - tasklet_kill(&nesdev->dpc_tasklet); - /* Deallocate the Adapter Structure */ - nes_destroy_adapter(nesdev->nesadapter); - - bail5: - printk(KERN_ERR PFX "bail5\n"); - iounmap(nesdev->regs); - - bail3: - printk(KERN_ERR PFX "bail3\n"); - kfree(nesdev); - - bail2: - pci_release_regions(pcidev); - - bail1: - pci_disable_device(pcidev); - - bail0: - return ret; -} - - -/** - * nes_remove - unload from kernel - */ -static void nes_remove(struct pci_dev *pcidev) -{ - struct nes_device *nesdev = pci_get_drvdata(pcidev); - struct net_device *netdev; - int netdev_index = 0; - unsigned long flags; - - if (nesdev->netdev_count) { - netdev = nesdev->netdev[netdev_index]; - if (netdev) { - netif_stop_queue(netdev); - unregister_netdev(netdev); - nes_netdev_destroy(netdev); - - nesdev->netdev[netdev_index] = NULL; - nesdev->netdev_count--; - nesdev->nesadapter->netdev_count--; - } - } - - nes_notifiers_registered--; - if (nes_notifiers_registered == 0) { - unregister_netevent_notifier(&nes_net_notifier); - unregister_inetaddr_notifier(&nes_inetaddr_notifier); - } - - list_del(&nesdev->list); - nes_destroy_cqp(nesdev); - - free_irq(pcidev->irq, nesdev); - tasklet_kill(&nesdev->dpc_tasklet); - - spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags); - if (nesdev->link_recheck) { - spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags); - cancel_delayed_work_sync(&nesdev->work); - } else { - spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags); - } - - /* Deallocate the Adapter Structure */ - nes_destroy_adapter(nesdev->nesadapter); - - if (nesdev->msi_enabled) { - pci_disable_msi(pcidev); - } - - iounmap(nesdev->regs); - kfree(nesdev); - - /* nes_debug(NES_DBG_SHUTDOWN, "calling pci_release_regions.\n"); */ - pci_release_regions(pcidev); - pci_disable_device(pcidev); - pci_set_drvdata(pcidev, NULL); -} - - -static ssize_t adapter_show(struct device_driver *ddp, char *buf) -{ - unsigned int devfn = 0xffffffff; - unsigned char bus_number = 0xff; - unsigned int i = 0; - struct nes_device *nesdev; - - list_for_each_entry(nesdev, &nes_dev_list, list) { - if (i == ee_flsh_adapter) { - devfn = nesdev->pcidev->devfn; - bus_number = nesdev->pcidev->bus->number; - break; - } - i++; - } - - return snprintf(buf, PAGE_SIZE, "%x:%x\n", bus_number, devfn); -} - -static ssize_t adapter_store(struct device_driver *ddp, - const char *buf, size_t count) -{ - char *p = (char *)buf; - - ee_flsh_adapter = simple_strtoul(p, &p, 10); - return strnlen(buf, count); -} - -static ssize_t eeprom_cmd_show(struct device_driver *ddp, char *buf) -{ - u32 eeprom_cmd = 0xdead; - u32 i = 0; - struct nes_device *nesdev; - - list_for_each_entry(nesdev, &nes_dev_list, list) { - if (i == ee_flsh_adapter) { - eeprom_cmd = nes_read32(nesdev->regs + NES_EEPROM_COMMAND); - break; - } - i++; - } - return snprintf(buf, PAGE_SIZE, "0x%x\n", eeprom_cmd); -} - -static ssize_t eeprom_cmd_store(struct device_driver *ddp, - const char *buf, size_t count) -{ - char *p = (char *)buf; - u32 val; - u32 i = 0; - struct nes_device *nesdev; - - if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { - val = simple_strtoul(p, &p, 16); - list_for_each_entry(nesdev, &nes_dev_list, list) { - if (i == ee_flsh_adapter) { - nes_write32(nesdev->regs + NES_EEPROM_COMMAND, val); - break; - } - i++; - } - } - return strnlen(buf, count); -} - 
-static ssize_t eeprom_data_show(struct device_driver *ddp, char *buf) -{ - u32 eeprom_data = 0xdead; - u32 i = 0; - struct nes_device *nesdev; - - list_for_each_entry(nesdev, &nes_dev_list, list) { - if (i == ee_flsh_adapter) { - eeprom_data = nes_read32(nesdev->regs + NES_EEPROM_DATA); - break; - } - i++; - } - - return snprintf(buf, PAGE_SIZE, "0x%x\n", eeprom_data); -} - -static ssize_t eeprom_data_store(struct device_driver *ddp, - const char *buf, size_t count) -{ - char *p = (char *)buf; - u32 val; - u32 i = 0; - struct nes_device *nesdev; - - if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { - val = simple_strtoul(p, &p, 16); - list_for_each_entry(nesdev, &nes_dev_list, list) { - if (i == ee_flsh_adapter) { - nes_write32(nesdev->regs + NES_EEPROM_DATA, val); - break; - } - i++; - } - } - return strnlen(buf, count); -} - -static ssize_t flash_cmd_show(struct device_driver *ddp, char *buf) -{ - u32 flash_cmd = 0xdead; - u32 i = 0; - struct nes_device *nesdev; - - list_for_each_entry(nesdev, &nes_dev_list, list) { - if (i == ee_flsh_adapter) { - flash_cmd = nes_read32(nesdev->regs + NES_FLASH_COMMAND); - break; - } - i++; - } - - return snprintf(buf, PAGE_SIZE, "0x%x\n", flash_cmd); -} - -static ssize_t flash_cmd_store(struct device_driver *ddp, - const char *buf, size_t count) -{ - char *p = (char *)buf; - u32 val; - u32 i = 0; - struct nes_device *nesdev; - - if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { - val = simple_strtoul(p, &p, 16); - list_for_each_entry(nesdev, &nes_dev_list, list) { - if (i == ee_flsh_adapter) { - nes_write32(nesdev->regs + NES_FLASH_COMMAND, val); - break; - } - i++; - } - } - return strnlen(buf, count); -} - -static ssize_t flash_data_show(struct device_driver *ddp, char *buf) -{ - u32 flash_data = 0xdead; - u32 i = 0; - struct nes_device *nesdev; - - list_for_each_entry(nesdev, &nes_dev_list, list) { - if (i == ee_flsh_adapter) { - flash_data = nes_read32(nesdev->regs + NES_FLASH_DATA); - break; - } - i++; - } - - return snprintf(buf, PAGE_SIZE, "0x%x\n", flash_data); -} - -static ssize_t flash_data_store(struct device_driver *ddp, - const char *buf, size_t count) -{ - char *p = (char *)buf; - u32 val; - u32 i = 0; - struct nes_device *nesdev; - - if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { - val = simple_strtoul(p, &p, 16); - list_for_each_entry(nesdev, &nes_dev_list, list) { - if (i == ee_flsh_adapter) { - nes_write32(nesdev->regs + NES_FLASH_DATA, val); - break; - } - i++; - } - } - return strnlen(buf, count); -} - -static ssize_t nonidx_addr_show(struct device_driver *ddp, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "0x%x\n", sysfs_nonidx_addr); -} - -static ssize_t nonidx_addr_store(struct device_driver *ddp, - const char *buf, size_t count) -{ - char *p = (char *)buf; - - if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') - sysfs_nonidx_addr = simple_strtoul(p, &p, 16); - - return strnlen(buf, count); -} - -static ssize_t nonidx_data_show(struct device_driver *ddp, char *buf) -{ - u32 nonidx_data = 0xdead; - u32 i = 0; - struct nes_device *nesdev; - - list_for_each_entry(nesdev, &nes_dev_list, list) { - if (i == ee_flsh_adapter) { - nonidx_data = nes_read32(nesdev->regs + sysfs_nonidx_addr); - break; - } - i++; - } - - return snprintf(buf, PAGE_SIZE, "0x%x\n", nonidx_data); -} - -static ssize_t nonidx_data_store(struct device_driver *ddp, - const char *buf, size_t count) -{ - char *p = (char *)buf; - u32 val; - u32 i = 0; - struct nes_device *nesdev; - - if (p[1] == 'x' || p[1] == 
'X' || p[0] == 'x' || p[0] == 'X') { - val = simple_strtoul(p, &p, 16); - list_for_each_entry(nesdev, &nes_dev_list, list) { - if (i == ee_flsh_adapter) { - nes_write32(nesdev->regs + sysfs_nonidx_addr, val); - break; - } - i++; - } - } - return strnlen(buf, count); -} - -static ssize_t idx_addr_show(struct device_driver *ddp, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "0x%x\n", sysfs_idx_addr); -} - -static ssize_t idx_addr_store(struct device_driver *ddp, - const char *buf, size_t count) -{ - char *p = (char *)buf; - - if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') - sysfs_idx_addr = simple_strtoul(p, &p, 16); - - return strnlen(buf, count); -} - -static ssize_t idx_data_show(struct device_driver *ddp, char *buf) -{ - u32 idx_data = 0xdead; - u32 i = 0; - struct nes_device *nesdev; - - list_for_each_entry(nesdev, &nes_dev_list, list) { - if (i == ee_flsh_adapter) { - idx_data = nes_read_indexed(nesdev, sysfs_idx_addr); - break; - } - i++; - } - - return snprintf(buf, PAGE_SIZE, "0x%x\n", idx_data); -} - -static ssize_t idx_data_store(struct device_driver *ddp, - const char *buf, size_t count) -{ - char *p = (char *)buf; - u32 val; - u32 i = 0; - struct nes_device *nesdev; - - if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { - val = simple_strtoul(p, &p, 16); - list_for_each_entry(nesdev, &nes_dev_list, list) { - if (i == ee_flsh_adapter) { - nes_write_indexed(nesdev, sysfs_idx_addr, val); - break; - } - i++; - } - } - return strnlen(buf, count); -} - -static ssize_t wqm_quanta_show(struct device_driver *ddp, char *buf) -{ - u32 wqm_quanta_value = 0xdead; - u32 i = 0; - struct nes_device *nesdev; - - list_for_each_entry(nesdev, &nes_dev_list, list) { - if (i == ee_flsh_adapter) { - wqm_quanta_value = nesdev->nesadapter->wqm_quanta; - break; - } - i++; - } - - return snprintf(buf, PAGE_SIZE, "0x%X\n", wqm_quanta_value); -} - -static ssize_t wqm_quanta_store(struct device_driver *ddp, const char *buf, - size_t count) -{ - unsigned long wqm_quanta_value; - u32 wqm_config1; - u32 i = 0; - struct nes_device *nesdev; - - if (kstrtoul(buf, 0, &wqm_quanta_value) < 0) - return -EINVAL; - - list_for_each_entry(nesdev, &nes_dev_list, list) { - if (i == ee_flsh_adapter) { - nesdev->nesadapter->wqm_quanta = wqm_quanta_value; - wqm_config1 = nes_read_indexed(nesdev, - NES_IDX_WQM_CONFIG1); - nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG1, - ((wqm_quanta_value << 1) | - (wqm_config1 & 0x00000001))); - break; - } - i++; - } - return strnlen(buf, count); -} - -static DRIVER_ATTR_RW(adapter); -static DRIVER_ATTR_RW(eeprom_cmd); -static DRIVER_ATTR_RW(eeprom_data); -static DRIVER_ATTR_RW(flash_cmd); -static DRIVER_ATTR_RW(flash_data); -static DRIVER_ATTR_RW(nonidx_addr); -static DRIVER_ATTR_RW(nonidx_data); -static DRIVER_ATTR_RW(idx_addr); -static DRIVER_ATTR_RW(idx_data); -static DRIVER_ATTR_RW(wqm_quanta); - -static struct attribute *nes_attrs[] = { - &driver_attr_adapter.attr, - &driver_attr_eeprom_cmd.attr, - &driver_attr_eeprom_data.attr, - &driver_attr_flash_cmd.attr, - &driver_attr_flash_data.attr, - &driver_attr_nonidx_addr.attr, - &driver_attr_nonidx_data.attr, - &driver_attr_idx_addr.attr, - &driver_attr_idx_data.attr, - &driver_attr_wqm_quanta.attr, - NULL, -}; -ATTRIBUTE_GROUPS(nes); - -static struct pci_driver nes_pci_driver = { - .name = DRV_NAME, - .id_table = nes_pci_table, - .probe = nes_probe, - .remove = nes_remove, - .groups = nes_groups, -}; - - -/** - * nes_init_module - module initialization entry point - */ -static int __init nes_init_module(void) 
-{ - int retval; - - retval = nes_cm_start(); - if (retval) { - printk(KERN_ERR PFX "Unable to start NetEffect iWARP CM.\n"); - return retval; - } - return pci_register_driver(&nes_pci_driver); -} - - -/** - * nes_exit_module - module unload entry point - */ -static void __exit nes_exit_module(void) -{ - nes_cm_stop(); - - pci_unregister_driver(&nes_pci_driver); -} - - -module_init(nes_init_module); -module_exit(nes_exit_module); diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h deleted file mode 100644 index a895fe980d10..000000000000 --- a/drivers/infiniband/hw/nes/nes.h +++ /dev/null @@ -1,574 +0,0 @@ -/* - * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. - * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __NES_H -#define __NES_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#define NES_SEND_FIRST_WRITE - -#define QUEUE_DISCONNECTS - -#define DRV_NAME "iw_nes" -#define DRV_VERSION "1.5.0.1" -#define PFX DRV_NAME ": " - -/* - * NetEffect PCI vendor id and NE010 PCI device id. 
- */ -#ifndef PCI_VENDOR_ID_NETEFFECT /* not in pci.ids yet */ -#define PCI_VENDOR_ID_NETEFFECT 0x1678 -#define PCI_DEVICE_ID_NETEFFECT_NE020 0x0100 -#define PCI_DEVICE_ID_NETEFFECT_NE020_KR 0x0110 -#endif - -#define NE020_REV 4 -#define NE020_REV1 5 - -#define BAR_0 0 -#define BAR_1 2 - -#define RX_BUF_SIZE (1536 + 8) -#define NES_REG0_SIZE (4 * 1024) -#define NES_TX_TIMEOUT (6*HZ) -#define NES_FIRST_QPN 64 -#define NES_SW_CONTEXT_ALIGN 1024 - -#define NES_MAX_MTU 9000 - -#define NES_NIC_MAX_NICS 16 -#define NES_MAX_ARP_TABLE_SIZE 4096 - -#define NES_NIC_CEQ_SIZE 8 -/* NICs will be on a separate CQ */ -#define NES_CCEQ_SIZE ((nesadapter->max_cq / nesadapter->port_count) - 32) - -#define NES_MAX_PORT_COUNT 4 - -#define MAX_DPC_ITERATIONS 128 - -#define NES_DRV_OPT_ENABLE_MPA_VER_0 0x00000001 -#define NES_DRV_OPT_DISABLE_MPA_CRC 0x00000002 -#define NES_DRV_OPT_DISABLE_FIRST_WRITE 0x00000004 -#define NES_DRV_OPT_DISABLE_INTF 0x00000008 -#define NES_DRV_OPT_ENABLE_MSI 0x00000010 -#define NES_DRV_OPT_DUAL_LOGICAL_PORT 0x00000020 -#define NES_DRV_OPT_SUPRESS_OPTION_BC 0x00000040 -#define NES_DRV_OPT_NO_INLINE_DATA 0x00000080 -#define NES_DRV_OPT_DISABLE_INT_MOD 0x00000100 -#define NES_DRV_OPT_DISABLE_VIRT_WQ 0x00000200 -#define NES_DRV_OPT_ENABLE_PAU 0x00000400 - -#define NES_AEQ_EVENT_TIMEOUT 2500 -#define NES_DISCONNECT_EVENT_TIMEOUT 2000 - -/* debug levels */ -/* must match userspace */ -#define NES_DBG_HW 0x00000001 -#define NES_DBG_INIT 0x00000002 -#define NES_DBG_ISR 0x00000004 -#define NES_DBG_PHY 0x00000008 -#define NES_DBG_NETDEV 0x00000010 -#define NES_DBG_CM 0x00000020 -#define NES_DBG_CM1 0x00000040 -#define NES_DBG_NIC_RX 0x00000080 -#define NES_DBG_NIC_TX 0x00000100 -#define NES_DBG_CQP 0x00000200 -#define NES_DBG_MMAP 0x00000400 -#define NES_DBG_MR 0x00000800 -#define NES_DBG_PD 0x00001000 -#define NES_DBG_CQ 0x00002000 -#define NES_DBG_QP 0x00004000 -#define NES_DBG_MOD_QP 0x00008000 -#define NES_DBG_AEQ 0x00010000 -#define NES_DBG_IW_RX 0x00020000 -#define NES_DBG_IW_TX 0x00040000 -#define NES_DBG_SHUTDOWN 0x00080000 -#define NES_DBG_PAU 0x00100000 -#define NES_DBG_NLMSG 0x00200000 -#define NES_DBG_RSVD1 0x10000000 -#define NES_DBG_RSVD2 0x20000000 -#define NES_DBG_RSVD3 0x40000000 -#define NES_DBG_RSVD4 0x80000000 -#define NES_DBG_ALL 0xffffffff - -#ifdef CONFIG_INFINIBAND_NES_DEBUG -#define nes_debug(level, fmt, args...) \ -do { \ - if (level & nes_debug_level) \ - printk(KERN_ERR PFX "%s[%u]: " fmt, __func__, __LINE__, ##args); \ -} while (0) - -#define NES_EVENT_TIMEOUT 1200000 -#else -#define nes_debug(level, fmt, args...) 
no_printk(fmt, ##args) - -#define NES_EVENT_TIMEOUT 100000 -#endif - -#include "nes_hw.h" -#include "nes_verbs.h" -#include "nes_context.h" -#include -#include "nes_cm.h" -#include "nes_mgt.h" - -extern int interrupt_mod_interval; -extern int nes_if_count; -extern int mpa_version; -extern int disable_mpa_crc; -extern unsigned int nes_drv_opt; -extern unsigned int nes_debug_level; -extern unsigned int wqm_quanta; -extern struct list_head nes_adapter_list; - -extern atomic_t cm_connects; -extern atomic_t cm_accepts; -extern atomic_t cm_disconnects; -extern atomic_t cm_closes; -extern atomic_t cm_connecteds; -extern atomic_t cm_connect_reqs; -extern atomic_t cm_rejects; -extern atomic_t mod_qp_timouts; -extern atomic_t qps_created; -extern atomic_t qps_destroyed; -extern atomic_t sw_qps_destroyed; -extern u32 mh_detected; -extern u32 mh_pauses_sent; -extern u32 cm_packets_sent; -extern u32 cm_packets_bounced; -extern u32 cm_packets_created; -extern u32 cm_packets_received; -extern u32 cm_packets_dropped; -extern u32 cm_packets_retrans; -extern atomic_t cm_listens_created; -extern atomic_t cm_listens_destroyed; -extern u32 cm_backlog_drops; -extern atomic_t cm_loopbacks; -extern atomic_t cm_nodes_created; -extern atomic_t cm_nodes_destroyed; -extern atomic_t cm_accel_dropped_pkts; -extern atomic_t cm_resets_recvd; -extern atomic_t pau_qps_created; -extern atomic_t pau_qps_destroyed; - -extern u32 int_mod_timer_init; -extern u32 int_mod_cq_depth_256; -extern u32 int_mod_cq_depth_128; -extern u32 int_mod_cq_depth_32; -extern u32 int_mod_cq_depth_24; -extern u32 int_mod_cq_depth_16; -extern u32 int_mod_cq_depth_4; -extern u32 int_mod_cq_depth_1; - -struct nes_device { - struct nes_adapter *nesadapter; - void __iomem *regs; - void __iomem *index_reg; - struct pci_dev *pcidev; - struct net_device *netdev[NES_NIC_MAX_NICS]; - u64 link_status_interrupts; - struct tasklet_struct dpc_tasklet; - spinlock_t indexed_regs_lock; - unsigned long csr_start; - unsigned long doorbell_region; - unsigned long doorbell_start; - unsigned long mac_tx_errors; - unsigned long mac_pause_frames_sent; - unsigned long mac_pause_frames_received; - unsigned long mac_rx_errors; - unsigned long mac_rx_crc_errors; - unsigned long mac_rx_symbol_err_frames; - unsigned long mac_rx_jabber_frames; - unsigned long mac_rx_oversized_frames; - unsigned long mac_rx_short_frames; - unsigned long port_rx_discards; - unsigned long port_tx_discards; - unsigned int mac_index; - unsigned int nes_stack_start; - - /* Control Structures */ - void *cqp_vbase; - dma_addr_t cqp_pbase; - u32 cqp_mem_size; - u8 ceq_index; - u8 nic_ceq_index; - struct nes_hw_cqp cqp; - struct nes_hw_cq ccq; - struct list_head cqp_avail_reqs; - struct list_head cqp_pending_reqs; - struct nes_cqp_request *nes_cqp_requests; - - u32 int_req; - u32 int_stat; - u32 timer_int_req; - u32 timer_only_int_count; - u32 intf_int_req; - u32 last_mac_tx_pauses; - u32 last_used_chunks_tx; - struct list_head list; - - u16 base_doorbell_index; - u16 currcq_count; - u16 deepcq_count; - u8 iw_status; - u8 msi_enabled; - u8 netdev_count; - u8 napi_isr_ran; - u8 disable_rx_flow_control; - u8 disable_tx_flow_control; - - struct delayed_work work; - u8 link_recheck; -}; - -/* Receive skb private area - must fit in skb->cb area */ -struct nes_rskb_cb { - u64 busaddr; - u32 maplen; - u32 seqnum; - u8 *data_start; - struct nes_qp *nesqp; -}; - -static inline __le32 get_crc_value(struct nes_v4_quad *nes_quad) -{ - u32 crc_value; - crc_value = crc32c(~0, (void *)nes_quad, sizeof (struct 
nes_v4_quad)); - - /* - * With commit ef19454b ("[LIB] crc32c: Keep intermediate crc - * state in cpu order"), behavior of crc32c changes on - * big-endian platforms. Our algorithm expects the previous - * behavior; otherwise we have RDMA connection establishment - * issue on big-endian. - */ - return cpu_to_le32(crc_value); -} - -static inline void -set_wqe_64bit_value(__le32 *wqe_words, u32 index, u64 value) -{ - wqe_words[index] = cpu_to_le32((u32) value); - wqe_words[index + 1] = cpu_to_le32(upper_32_bits(value)); -} - -static inline void -set_wqe_32bit_value(__le32 *wqe_words, u32 index, u32 value) -{ - wqe_words[index] = cpu_to_le32(value); -} - -static inline void -nes_fill_init_cqp_wqe(struct nes_hw_cqp_wqe *cqp_wqe, struct nes_device *nesdev) -{ - cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_LOW_IDX] = 0; - cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_HIGH_IDX] = 0; - cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_LOW_IDX] = 0; - cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX] = 0; - cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX] = 0; - cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_LEN_IDX] = 0; - cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_LOW_IDX] = 0; - cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PA_LOW_IDX] = 0; - cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PA_HIGH_IDX] = 0; -} - -static inline void -nes_fill_init_qp_wqe(struct nes_hw_qp_wqe *wqe, struct nes_qp *nesqp, u32 head) -{ - u32 value; - value = ((u32)((unsigned long) nesqp)) | head; - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_HIGH_IDX, - (u32)(upper_32_bits((unsigned long)(nesqp)))); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, value); -} - -/* Read from memory-mapped device */ -static inline u32 nes_read_indexed(struct nes_device *nesdev, u32 reg_index) -{ - unsigned long flags; - void __iomem *addr = nesdev->index_reg; - u32 value; - - spin_lock_irqsave(&nesdev->indexed_regs_lock, flags); - - writel(reg_index, addr); - value = readl((void __iomem *)addr + 4); - - spin_unlock_irqrestore(&nesdev->indexed_regs_lock, flags); - return value; -} - -static inline u32 nes_read32(const void __iomem *addr) -{ - return readl(addr); -} - -static inline u16 nes_read16(const void __iomem *addr) -{ - return readw(addr); -} - -static inline u8 nes_read8(const void __iomem *addr) -{ - return readb(addr); -} - -/* Write to memory-mapped device */ -static inline void nes_write_indexed(struct nes_device *nesdev, u32 reg_index, u32 val) -{ - unsigned long flags; - void __iomem *addr = nesdev->index_reg; - - spin_lock_irqsave(&nesdev->indexed_regs_lock, flags); - - writel(reg_index, addr); - writel(val, (void __iomem *)addr + 4); - - spin_unlock_irqrestore(&nesdev->indexed_regs_lock, flags); -} - -static inline void nes_write32(void __iomem *addr, u32 val) -{ - writel(val, addr); -} - -static inline void nes_write16(void __iomem *addr, u16 val) -{ - writew(val, addr); -} - -static inline void nes_write8(void __iomem *addr, u8 val) -{ - writeb(val, addr); -} - -enum nes_resource { - NES_RESOURCE_MW = 1, - NES_RESOURCE_FAST_MR, - NES_RESOURCE_PHYS_MR, - NES_RESOURCE_USER_MR, - NES_RESOURCE_PD, - NES_RESOURCE_QP, - NES_RESOURCE_CQ, - NES_RESOURCE_ARP -}; - -static inline int nes_alloc_resource(struct nes_adapter *nesadapter, - unsigned long *resource_array, u32 max_resources, - u32 *req_resource_num, u32 *next, enum nes_resource resource_type) -{ - unsigned long flags; - u32 resource_num; - - spin_lock_irqsave(&nesadapter->resource_lock, flags); - - resource_num = find_next_zero_bit(resource_array, max_resources, 
*next); - if (resource_num >= max_resources) { - resource_num = find_first_zero_bit(resource_array, max_resources); - if (resource_num >= max_resources) { - printk(KERN_ERR PFX "%s: No available resources [type=%u].\n", __func__, resource_type); - spin_unlock_irqrestore(&nesadapter->resource_lock, flags); - return -EMFILE; - } - } - set_bit(resource_num, resource_array); - *next = resource_num+1; - if (*next == max_resources) { - *next = 0; - } - spin_unlock_irqrestore(&nesadapter->resource_lock, flags); - *req_resource_num = resource_num; - - return 0; -} - -static inline int nes_is_resource_allocated(struct nes_adapter *nesadapter, - unsigned long *resource_array, u32 resource_num) -{ - unsigned long flags; - int bit_is_set; - - spin_lock_irqsave(&nesadapter->resource_lock, flags); - - bit_is_set = test_bit(resource_num, resource_array); - nes_debug(NES_DBG_HW, "resource_num %u is%s allocated.\n", - resource_num, (bit_is_set ? "": " not")); - spin_unlock_irqrestore(&nesadapter->resource_lock, flags); - - return bit_is_set; -} - -static inline void nes_free_resource(struct nes_adapter *nesadapter, - unsigned long *resource_array, u32 resource_num) -{ - unsigned long flags; - - spin_lock_irqsave(&nesadapter->resource_lock, flags); - clear_bit(resource_num, resource_array); - spin_unlock_irqrestore(&nesadapter->resource_lock, flags); -} - -static inline struct nes_vnic *to_nesvnic(struct ib_device *ibdev) -{ - return container_of(ibdev, struct nes_ib_device, ibdev)->nesvnic; -} - -static inline struct nes_pd *to_nespd(struct ib_pd *ibpd) -{ - return container_of(ibpd, struct nes_pd, ibpd); -} - -static inline struct nes_ucontext *to_nesucontext(struct ib_ucontext *ibucontext) -{ - return container_of(ibucontext, struct nes_ucontext, ibucontext); -} - -static inline struct nes_mr *to_nesmr(struct ib_mr *ibmr) -{ - return container_of(ibmr, struct nes_mr, ibmr); -} - -static inline struct nes_mr *to_nesmr_from_ibfmr(struct ib_fmr *ibfmr) -{ - return container_of(ibfmr, struct nes_mr, ibfmr); -} - -static inline struct nes_mr *to_nesmw(struct ib_mw *ibmw) -{ - return container_of(ibmw, struct nes_mr, ibmw); -} - -static inline struct nes_fmr *to_nesfmr(struct nes_mr *nesmr) -{ - return container_of(nesmr, struct nes_fmr, nesmr); -} - -static inline struct nes_cq *to_nescq(struct ib_cq *ibcq) -{ - return container_of(ibcq, struct nes_cq, ibcq); -} - -static inline struct nes_qp *to_nesqp(struct ib_qp *ibqp) -{ - return container_of(ibqp, struct nes_qp, ibqp); -} - - - -/* nes.c */ -void nes_add_ref(struct ib_qp *); -void nes_rem_ref(struct ib_qp *); -struct ib_qp *nes_get_qp(struct ib_device *, int); - - -/* nes_hw.c */ -struct nes_adapter *nes_init_adapter(struct nes_device *, u8); -void nes_nic_init_timer_defaults(struct nes_device *, u8); -void nes_destroy_adapter(struct nes_adapter *); -int nes_init_cqp(struct nes_device *); -int nes_init_phy(struct nes_device *); -int nes_init_nic_qp(struct nes_device *, struct net_device *); -void nes_destroy_nic_qp(struct nes_vnic *); -int nes_napi_isr(struct nes_device *); -void nes_dpc(unsigned long); -void nes_nic_ce_handler(struct nes_device *, struct nes_hw_nic_cq *); -void nes_iwarp_ce_handler(struct nes_device *, struct nes_hw_cq *); -int nes_destroy_cqp(struct nes_device *); -int nes_nic_cm_xmit(struct sk_buff *, struct net_device *); -void nes_recheck_link_status(struct work_struct *work); -void nes_terminate_timeout(struct timer_list *t); - -/* nes_nic.c */ -struct net_device *nes_netdev_init(struct nes_device *, void __iomem *); -void 
nes_netdev_destroy(struct net_device *); -int nes_nic_cm_xmit(struct sk_buff *, struct net_device *); - -/* nes_cm.c */ -void *nes_cm_create(struct net_device *); -int nes_cm_recv(struct sk_buff *, struct net_device *); -void nes_update_arp(unsigned char *, u32, u32, u16, u16); -void nes_manage_arp_cache(struct net_device *, unsigned char *, u32, u32); -void nes_sock_release(struct nes_qp *, unsigned long *); -void flush_wqes(struct nes_device *nesdev, struct nes_qp *, u32, u32); -int nes_manage_apbvt(struct nes_vnic *, u32, u32, u32); -int nes_cm_disconn(struct nes_qp *); -void nes_cm_disconn_worker(void *); - -/* nes_verbs.c */ -int nes_hw_modify_qp(struct nes_device *, struct nes_qp *, u32, u32, u32); -int nes_modify_qp(struct ib_qp *, struct ib_qp_attr *, int, struct ib_udata *); -struct nes_ib_device *nes_init_ofa_device(struct net_device *); -void nes_port_ibevent(struct nes_vnic *nesvnic); -void nes_destroy_ofa_device(struct nes_ib_device *); -int nes_register_ofa_device(struct nes_ib_device *); - -/* nes_util.c */ -int nes_read_eeprom_values(struct nes_device *, struct nes_adapter *); -void nes_write_1G_phy_reg(struct nes_device *, u8, u8, u16); -void nes_read_1G_phy_reg(struct nes_device *, u8, u8, u16 *); -void nes_write_10G_phy_reg(struct nes_device *, u16, u8, u16, u16); -void nes_read_10G_phy_reg(struct nes_device *, u8, u8, u16); -struct nes_cqp_request *nes_get_cqp_request(struct nes_device *); -void nes_free_cqp_request(struct nes_device *nesdev, - struct nes_cqp_request *cqp_request); -void nes_put_cqp_request(struct nes_device *nesdev, - struct nes_cqp_request *cqp_request); -void nes_post_cqp_request(struct nes_device *, struct nes_cqp_request *); -int nes_arp_table(struct nes_device *, u32, u8 *, u32); -void nes_mh_fix(struct timer_list *t); -void nes_clc(struct timer_list *t); -void nes_dump_mem(unsigned int, void *, int); -u32 nes_crc32(u32, u32, u32, u32, u8 *, u32, u32, u32); - -#endif /* __NES_H */ diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c deleted file mode 100644 index 62bf986eba67..000000000000 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ /dev/null @@ -1,3992 +0,0 @@ -/* - * Copyright (c) 2006 - 2014 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - - -#define TCPOPT_TIMESTAMP 8 - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "nes.h" - -u32 cm_packets_sent; -u32 cm_packets_bounced; -u32 cm_packets_dropped; -u32 cm_packets_retrans; -u32 cm_packets_created; -u32 cm_packets_received; -atomic_t cm_listens_created; -atomic_t cm_listens_destroyed; -u32 cm_backlog_drops; -atomic_t cm_loopbacks; -atomic_t cm_nodes_created; -atomic_t cm_nodes_destroyed; -atomic_t cm_accel_dropped_pkts; -atomic_t cm_resets_recvd; - -static inline int mini_cm_accelerated(struct nes_cm_core *, struct nes_cm_node *); -static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *, struct nes_vnic *, struct nes_cm_info *); -static int mini_cm_del_listen(struct nes_cm_core *, struct nes_cm_listener *); -static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *, struct nes_vnic *, u16, void *, struct nes_cm_info *); -static int mini_cm_close(struct nes_cm_core *, struct nes_cm_node *); -static int mini_cm_accept(struct nes_cm_core *, struct nes_cm_node *); -static int mini_cm_reject(struct nes_cm_core *, struct nes_cm_node *); -static int mini_cm_recv_pkt(struct nes_cm_core *, struct nes_vnic *, struct sk_buff *); -static int mini_cm_dealloc_core(struct nes_cm_core *); -static int mini_cm_get(struct nes_cm_core *); -static int mini_cm_set(struct nes_cm_core *, u32, u32); - -static void form_cm_frame(struct sk_buff *, struct nes_cm_node *, void *, u32, void *, u32, u8); -static int add_ref_cm_node(struct nes_cm_node *); -static int rem_ref_cm_node(struct nes_cm_core *, struct nes_cm_node *); - -static int nes_cm_disconn_true(struct nes_qp *); -static int nes_cm_post_event(struct nes_cm_event *event); -static int nes_disconnect(struct nes_qp *nesqp, int abrupt); -static void nes_disconnect_worker(struct work_struct *work); - -static int send_mpa_request(struct nes_cm_node *, struct sk_buff *); -static int send_mpa_reject(struct nes_cm_node *); -static int send_syn(struct nes_cm_node *, u32, struct sk_buff *); -static int send_reset(struct nes_cm_node *, struct sk_buff *); -static int send_ack(struct nes_cm_node *cm_node, struct sk_buff *skb); -static int send_fin(struct nes_cm_node *cm_node, struct sk_buff *skb); -static void process_packet(struct nes_cm_node *, struct sk_buff *, struct nes_cm_core *); - -static void active_open_err(struct nes_cm_node *, struct sk_buff *, int); -static void passive_open_err(struct nes_cm_node *, struct sk_buff *, int); -static void cleanup_retrans_entry(struct nes_cm_node *); -static void handle_rcv_mpa(struct nes_cm_node *, struct sk_buff *); -static void free_retrans_entry(struct nes_cm_node *cm_node); -static int handle_tcp_options(struct nes_cm_node *cm_node, struct tcphdr *tcph, struct sk_buff *skb, int optionsize, int passive); - -/* CM event handler functions */ -static void cm_event_connected(struct nes_cm_event *); -static void cm_event_connect_error(struct nes_cm_event *); -static void cm_event_reset(struct nes_cm_event *); -static void cm_event_mpa_req(struct nes_cm_event *); -static void 
cm_event_mpa_reject(struct nes_cm_event *); -static void handle_recv_entry(struct nes_cm_node *cm_node, u32 rem_node); - -/* MPA build functions */ -static int cm_build_mpa_frame(struct nes_cm_node *, u8 **, u16 *, u8 *, u8); -static void build_mpa_v2(struct nes_cm_node *, void *, u8); -static void build_mpa_v1(struct nes_cm_node *, void *, u8); -static void build_rdma0_msg(struct nes_cm_node *, struct nes_qp **); - -static void print_core(struct nes_cm_core *core); -static void record_ird_ord(struct nes_cm_node *, u16, u16); - -/* External CM API Interface */ -/* instance of function pointers for client API */ -/* set address of this instance to cm_core->cm_ops at cm_core alloc */ -static const struct nes_cm_ops nes_cm_api = { - .accelerated = mini_cm_accelerated, - .listen = mini_cm_listen, - .stop_listener = mini_cm_del_listen, - .connect = mini_cm_connect, - .close = mini_cm_close, - .accept = mini_cm_accept, - .reject = mini_cm_reject, - .recv_pkt = mini_cm_recv_pkt, - .destroy_cm_core = mini_cm_dealloc_core, - .get = mini_cm_get, - .set = mini_cm_set -}; - -static struct nes_cm_core *g_cm_core; - -atomic_t cm_connects; -atomic_t cm_accepts; -atomic_t cm_disconnects; -atomic_t cm_closes; -atomic_t cm_connecteds; -atomic_t cm_connect_reqs; -atomic_t cm_rejects; - -int nes_add_ref_cm_node(struct nes_cm_node *cm_node) -{ - return add_ref_cm_node(cm_node); -} - -int nes_rem_ref_cm_node(struct nes_cm_node *cm_node) -{ - return rem_ref_cm_node(cm_node->cm_core, cm_node); -} -/** - * create_event - */ -static struct nes_cm_event *create_event(struct nes_cm_node * cm_node, - enum nes_cm_event_type type) -{ - struct nes_cm_event *event; - - if (!cm_node->cm_id) - return NULL; - - /* allocate an empty event */ - event = kzalloc(sizeof(*event), GFP_ATOMIC); - - if (!event) - return NULL; - - event->type = type; - event->cm_node = cm_node; - event->cm_info.rem_addr = cm_node->rem_addr; - event->cm_info.loc_addr = cm_node->loc_addr; - event->cm_info.rem_port = cm_node->rem_port; - event->cm_info.loc_port = cm_node->loc_port; - event->cm_info.cm_id = cm_node->cm_id; - - nes_debug(NES_DBG_CM, "cm_node=%p Created event=%p, type=%u, " - "dst_addr=%08x[%x], src_addr=%08x[%x]\n", - cm_node, event, type, event->cm_info.loc_addr, - event->cm_info.loc_port, event->cm_info.rem_addr, - event->cm_info.rem_port); - - nes_cm_post_event(event); - return event; -} - - -/** - * send_mpa_request - */ -static int send_mpa_request(struct nes_cm_node *cm_node, struct sk_buff *skb) -{ - u8 start_addr = 0; - u8 *start_ptr = &start_addr; - u8 **start_buff = &start_ptr; - u16 buff_len = 0; - - if (!skb) { - nes_debug(NES_DBG_CM, "skb set to NULL\n"); - return -1; - } - - /* send an MPA Request frame */ - cm_build_mpa_frame(cm_node, start_buff, &buff_len, NULL, MPA_KEY_REQUEST); - form_cm_frame(skb, cm_node, NULL, 0, *start_buff, buff_len, SET_ACK); - - return schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0); -} - - - -static int send_mpa_reject(struct nes_cm_node *cm_node) -{ - struct sk_buff *skb = NULL; - u8 start_addr = 0; - u8 *start_ptr = &start_addr; - u8 **start_buff = &start_ptr; - u16 buff_len = 0; - struct ietf_mpa_v1 *mpa_frame; - - skb = dev_alloc_skb(MAX_CM_BUFFER); - if (!skb) { - nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); - return -ENOMEM; - } - - /* send an MPA reject frame */ - cm_build_mpa_frame(cm_node, start_buff, &buff_len, NULL, MPA_KEY_REPLY); - mpa_frame = (struct ietf_mpa_v1 *)*start_buff; - mpa_frame->flags |= IETF_MPA_FLAGS_REJECT; - form_cm_frame(skb, cm_node, NULL, 0, 
*start_buff, buff_len, SET_ACK | SET_FIN); - - cm_node->state = NES_CM_STATE_FIN_WAIT1; - return schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0); -} - - -/** - * recv_mpa - process a received TCP pkt, we are expecting an - * IETF MPA frame - */ -static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 *type, - u32 len) -{ - struct ietf_mpa_v1 *mpa_frame; - struct ietf_mpa_v2 *mpa_v2_frame; - struct ietf_rtr_msg *rtr_msg; - int mpa_hdr_len; - int priv_data_len; - - *type = NES_MPA_REQUEST_ACCEPT; - - /* assume req frame is in tcp data payload */ - if (len < sizeof(struct ietf_mpa_v1)) { - nes_debug(NES_DBG_CM, "The received ietf buffer was too small (%x)\n", len); - return -EINVAL; - } - - /* points to the beginning of the frame, which could be MPA V1 or V2 */ - mpa_frame = (struct ietf_mpa_v1 *)buffer; - mpa_hdr_len = sizeof(struct ietf_mpa_v1); - priv_data_len = ntohs(mpa_frame->priv_data_len); - - /* make sure mpa private data len is less than 512 bytes */ - if (priv_data_len > IETF_MAX_PRIV_DATA_LEN) { - nes_debug(NES_DBG_CM, "The received Length of Private" - " Data field exceeds 512 octets\n"); - return -EINVAL; - } - /* - * make sure MPA receiver interoperate with the - * received MPA version and MPA key information - * - */ - if (mpa_frame->rev != IETF_MPA_V1 && mpa_frame->rev != IETF_MPA_V2) { - nes_debug(NES_DBG_CM, "The received mpa version" - " is not supported\n"); - return -EINVAL; - } - /* - * backwards compatibility only - */ - if (mpa_frame->rev > cm_node->mpa_frame_rev) { - nes_debug(NES_DBG_CM, "The received mpa version" - " can not be interoperated\n"); - return -EINVAL; - } else { - cm_node->mpa_frame_rev = mpa_frame->rev; - } - - if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) { - if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE)) { - nes_debug(NES_DBG_CM, "Unexpected MPA Key received \n"); - return -EINVAL; - } - } else { - if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE)) { - nes_debug(NES_DBG_CM, "Unexpected MPA Key received \n"); - return -EINVAL; - } - } - - if (priv_data_len + mpa_hdr_len != len) { - nes_debug(NES_DBG_CM, "The received ietf buffer was not right" - " complete (%x + %x != %x)\n", - priv_data_len, mpa_hdr_len, len); - return -EINVAL; - } - /* make sure it does not exceed the max size */ - if (len > MAX_CM_BUFFER) { - nes_debug(NES_DBG_CM, "The received ietf buffer was too large" - " (%x + %x != %x)\n", - priv_data_len, mpa_hdr_len, len); - return -EINVAL; - } - - cm_node->mpa_frame_size = priv_data_len; - - switch (mpa_frame->rev) { - case IETF_MPA_V2: { - u16 ird_size; - u16 ord_size; - u16 rtr_ctrl_ird; - u16 rtr_ctrl_ord; - - mpa_v2_frame = (struct ietf_mpa_v2 *)buffer; - mpa_hdr_len += IETF_RTR_MSG_SIZE; - cm_node->mpa_frame_size -= IETF_RTR_MSG_SIZE; - rtr_msg = &mpa_v2_frame->rtr_msg; - - /* parse rtr message */ - rtr_ctrl_ird = ntohs(rtr_msg->ctrl_ird); - rtr_ctrl_ord = ntohs(rtr_msg->ctrl_ord); - ird_size = rtr_ctrl_ird & IETF_NO_IRD_ORD; - ord_size = rtr_ctrl_ord & IETF_NO_IRD_ORD; - - if (!(rtr_ctrl_ird & IETF_PEER_TO_PEER)) { - /* send reset */ - return -EINVAL; - } - if (ird_size == IETF_NO_IRD_ORD || ord_size == IETF_NO_IRD_ORD) - cm_node->mpav2_ird_ord = IETF_NO_IRD_ORD; - - if (cm_node->mpav2_ird_ord != IETF_NO_IRD_ORD) { - /* responder */ - if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) { - /* we are still negotiating */ - if (ord_size > NES_MAX_IRD) { - cm_node->ird_size = NES_MAX_IRD; - } else { - cm_node->ird_size = ord_size; - if (ord_size == 0 && - (rtr_ctrl_ord & 
IETF_RDMA0_READ)) { - cm_node->ird_size = 1; - nes_debug(NES_DBG_CM, - "%s: Remote peer doesn't support RDMA0_READ (ord=%u)\n", - __func__, ord_size); - } - } - if (ird_size > NES_MAX_ORD) - cm_node->ord_size = NES_MAX_ORD; - else - cm_node->ord_size = ird_size; - } else { /* initiator */ - if (ord_size > NES_MAX_IRD) { - nes_debug(NES_DBG_CM, - "%s: Unable to support the requested (ord =%u)\n", - __func__, ord_size); - return -EINVAL; - } - cm_node->ird_size = ord_size; - - if (ird_size > NES_MAX_ORD) { - cm_node->ord_size = NES_MAX_ORD; - } else { - if (ird_size == 0 && - (rtr_ctrl_ord & IETF_RDMA0_READ)) { - nes_debug(NES_DBG_CM, - "%s: Remote peer doesn't support RDMA0_READ (ird=%u)\n", - __func__, ird_size); - return -EINVAL; - } else { - cm_node->ord_size = ird_size; - } - } - } - } - - if (rtr_ctrl_ord & IETF_RDMA0_READ) { - cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO; - - } else if (rtr_ctrl_ord & IETF_RDMA0_WRITE) { - cm_node->send_rdma0_op = SEND_RDMA_WRITE_ZERO; - } else { /* Not supported RDMA0 operation */ - return -EINVAL; - } - break; - } - case IETF_MPA_V1: - default: - break; - } - - /* copy entire MPA frame to our cm_node's frame */ - memcpy(cm_node->mpa_frame_buf, buffer + mpa_hdr_len, cm_node->mpa_frame_size); - - if (mpa_frame->flags & IETF_MPA_FLAGS_REJECT) - *type = NES_MPA_REQUEST_REJECT; - return 0; -} - - -/** - * form_cm_frame - get a free packet and build empty frame Use - * node info to build. - */ -static void form_cm_frame(struct sk_buff *skb, - struct nes_cm_node *cm_node, void *options, u32 optionsize, - void *data, u32 datasize, u8 flags) -{ - struct tcphdr *tcph; - struct iphdr *iph; - struct ethhdr *ethh; - u8 *buf; - u16 packetsize = sizeof(*iph); - - packetsize += sizeof(*tcph); - packetsize += optionsize + datasize; - - skb_trim(skb, 0); - memset(skb->data, 0x00, ETH_HLEN + sizeof(*iph) + sizeof(*tcph)); - - buf = skb_put(skb, packetsize + ETH_HLEN); - - ethh = (struct ethhdr *)buf; - buf += ETH_HLEN; - - iph = (struct iphdr *)buf; - buf += sizeof(*iph); - tcph = (struct tcphdr *)buf; - skb_reset_mac_header(skb); - skb_set_network_header(skb, ETH_HLEN); - skb_set_transport_header(skb, ETH_HLEN + sizeof(*iph)); - buf += sizeof(*tcph); - - skb->ip_summed = CHECKSUM_PARTIAL; - if (!(cm_node->netdev->features & NETIF_F_IP_CSUM)) - skb->ip_summed = CHECKSUM_NONE; - skb->protocol = htons(0x800); - skb->data_len = 0; - skb->mac_len = ETH_HLEN; - - memcpy(ethh->h_dest, cm_node->rem_mac, ETH_ALEN); - memcpy(ethh->h_source, cm_node->loc_mac, ETH_ALEN); - ethh->h_proto = htons(0x0800); - - iph->version = IPVERSION; - iph->ihl = 5; /* 5 * 4Byte words, IP headr len */ - iph->tos = 0; - iph->tot_len = htons(packetsize); - iph->id = htons(++cm_node->tcp_cntxt.loc_id); - - iph->frag_off = htons(0x4000); - iph->ttl = 0x40; - iph->protocol = 0x06; /* IPPROTO_TCP */ - - iph->saddr = htonl(cm_node->loc_addr); - iph->daddr = htonl(cm_node->rem_addr); - - tcph->source = htons(cm_node->loc_port); - tcph->dest = htons(cm_node->rem_port); - tcph->seq = htonl(cm_node->tcp_cntxt.loc_seq_num); - - if (flags & SET_ACK) { - cm_node->tcp_cntxt.loc_ack_num = cm_node->tcp_cntxt.rcv_nxt; - tcph->ack_seq = htonl(cm_node->tcp_cntxt.loc_ack_num); - tcph->ack = 1; - } else { - tcph->ack_seq = 0; - } - - if (flags & SET_SYN) { - cm_node->tcp_cntxt.loc_seq_num++; - tcph->syn = 1; - } else { - cm_node->tcp_cntxt.loc_seq_num += datasize; - } - - if (flags & SET_FIN) { - cm_node->tcp_cntxt.loc_seq_num++; - tcph->fin = 1; - } - - if (flags & SET_RST) - tcph->rst = 1; - - tcph->doff = 
(u16)((sizeof(*tcph) + optionsize + 3) >> 2); - tcph->window = htons(cm_node->tcp_cntxt.rcv_wnd); - tcph->urg_ptr = 0; - if (optionsize) - memcpy(buf, options, optionsize); - buf += optionsize; - if (datasize) - memcpy(buf, data, datasize); - - skb_shinfo(skb)->nr_frags = 0; - cm_packets_created++; -} - -/** - * print_core - dump a cm core - */ -static void print_core(struct nes_cm_core *core) -{ - nes_debug(NES_DBG_CM, "---------------------------------------------\n"); - nes_debug(NES_DBG_CM, "CM Core -- (core = %p )\n", core); - if (!core) - return; - nes_debug(NES_DBG_CM, "---------------------------------------------\n"); - - nes_debug(NES_DBG_CM, "State : %u \n", core->state); - - nes_debug(NES_DBG_CM, "Listen Nodes : %u \n", atomic_read(&core->listen_node_cnt)); - nes_debug(NES_DBG_CM, "Active Nodes : %u \n", atomic_read(&core->node_cnt)); - - nes_debug(NES_DBG_CM, "core : %p \n", core); - - nes_debug(NES_DBG_CM, "-------------- end core ---------------\n"); -} - -static void record_ird_ord(struct nes_cm_node *cm_node, - u16 conn_ird, u16 conn_ord) -{ - if (conn_ird > NES_MAX_IRD) - conn_ird = NES_MAX_IRD; - - if (conn_ord > NES_MAX_ORD) - conn_ord = NES_MAX_ORD; - - cm_node->ird_size = conn_ird; - cm_node->ord_size = conn_ord; -} - -/** - * cm_build_mpa_frame - build a MPA V1 frame or MPA V2 frame - */ -static int cm_build_mpa_frame(struct nes_cm_node *cm_node, u8 **start_buff, - u16 *buff_len, u8 *pci_mem, u8 mpa_key) -{ - int ret = 0; - - *start_buff = (pci_mem) ? pci_mem : &cm_node->mpa_frame_buf[0]; - - switch (cm_node->mpa_frame_rev) { - case IETF_MPA_V1: - *start_buff = (u8 *)*start_buff + sizeof(struct ietf_rtr_msg); - *buff_len = sizeof(struct ietf_mpa_v1) + cm_node->mpa_frame_size; - build_mpa_v1(cm_node, *start_buff, mpa_key); - break; - case IETF_MPA_V2: - *buff_len = sizeof(struct ietf_mpa_v2) + cm_node->mpa_frame_size; - build_mpa_v2(cm_node, *start_buff, mpa_key); - break; - default: - ret = -EINVAL; - } - return ret; -} - -/** - * build_mpa_v2 - build a MPA V2 frame - */ -static void build_mpa_v2(struct nes_cm_node *cm_node, - void *start_addr, u8 mpa_key) -{ - struct ietf_mpa_v2 *mpa_frame = (struct ietf_mpa_v2 *)start_addr; - struct ietf_rtr_msg *rtr_msg = &mpa_frame->rtr_msg; - u16 ctrl_ird; - u16 ctrl_ord; - - /* initialize the upper 5 bytes of the frame */ - build_mpa_v1(cm_node, start_addr, mpa_key); - mpa_frame->flags |= IETF_MPA_V2_FLAG; /* set a bit to indicate MPA V2 */ - mpa_frame->priv_data_len += htons(IETF_RTR_MSG_SIZE); - - /* initialize RTR msg */ - if (cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) { - ctrl_ird = IETF_NO_IRD_ORD; - ctrl_ord = IETF_NO_IRD_ORD; - } else { - ctrl_ird = cm_node->ird_size & IETF_NO_IRD_ORD; - ctrl_ord = cm_node->ord_size & IETF_NO_IRD_ORD; - } - ctrl_ird |= IETF_PEER_TO_PEER; - - switch (mpa_key) { - case MPA_KEY_REQUEST: - ctrl_ord |= IETF_RDMA0_WRITE; - ctrl_ord |= IETF_RDMA0_READ; - break; - case MPA_KEY_REPLY: - switch (cm_node->send_rdma0_op) { - case SEND_RDMA_WRITE_ZERO: - ctrl_ord |= IETF_RDMA0_WRITE; - break; - case SEND_RDMA_READ_ZERO: - ctrl_ord |= IETF_RDMA0_READ; - break; - } - } - rtr_msg->ctrl_ird = htons(ctrl_ird); - rtr_msg->ctrl_ord = htons(ctrl_ord); -} - -/** - * build_mpa_v1 - build a MPA V1 frame - */ -static void build_mpa_v1(struct nes_cm_node *cm_node, void *start_addr, u8 mpa_key) -{ - struct ietf_mpa_v1 *mpa_frame = (struct ietf_mpa_v1 *)start_addr; - - switch (mpa_key) { - case MPA_KEY_REQUEST: - memcpy(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE); - break; - case MPA_KEY_REPLY: - 
memcpy(mpa_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE); - break; - } - mpa_frame->flags = IETF_MPA_FLAGS_CRC; - mpa_frame->rev = cm_node->mpa_frame_rev; - mpa_frame->priv_data_len = htons(cm_node->mpa_frame_size); -} - -static void build_rdma0_msg(struct nes_cm_node *cm_node, struct nes_qp **nesqp_addr) -{ - u64 u64temp; - struct nes_qp *nesqp = *nesqp_addr; - struct nes_hw_qp_wqe *wqe = &nesqp->hwqp.sq_vbase[0]; - - u64temp = (unsigned long)nesqp->nesuqp_addr; - u64temp |= NES_SW_CONTEXT_ALIGN >> 1; - set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, u64temp); - - wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_LOW_IDX] = 0; - wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_HIGH_IDX] = 0; - - switch (cm_node->send_rdma0_op) { - case SEND_RDMA_WRITE_ZERO: - nes_debug(NES_DBG_CM, "Sending first write.\n"); - wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = - cpu_to_le32(NES_IWARP_SQ_OP_RDMAW); - wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX] = 0; - wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] = 0; - wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 0; - break; - - case SEND_RDMA_READ_ZERO: - default: - if (cm_node->send_rdma0_op != SEND_RDMA_READ_ZERO) - WARN(1, "Unsupported RDMA0 len operation=%u\n", - cm_node->send_rdma0_op); - nes_debug(NES_DBG_CM, "Sending first rdma operation.\n"); - wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = - cpu_to_le32(NES_IWARP_SQ_OP_RDMAR); - wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX] = 1; - wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_TO_HIGH_IDX] = 0; - wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX] = 0; - wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_STAG_IDX] = 1; - wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 1; - break; - } - - if (nesqp->sq_kmapped) { - nesqp->sq_kmapped = 0; - kunmap(nesqp->page); - } - - /*use the reserved spot on the WQ for the extra first WQE*/ - nesqp->nesqp_context->ird_ord_sizes &= cpu_to_le32(~(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT | - NES_QPCONTEXT_ORDIRD_WRPDU | - NES_QPCONTEXT_ORDIRD_ALSMM)); - nesqp->skip_lsmm = 1; - nesqp->hwqp.sq_tail = 0; -} - -/** - * schedule_nes_timer - * note - cm_node needs to be protected before calling this. 
Encase in: - * rem_ref_cm_node(cm_core, cm_node);add_ref_cm_node(cm_node); - */ -int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb, - enum nes_timer_type type, int send_retrans, - int close_when_complete) -{ - unsigned long flags; - struct nes_cm_core *cm_core = cm_node->cm_core; - struct nes_timer_entry *new_send; - int ret = 0; - - new_send = kzalloc(sizeof(*new_send), GFP_ATOMIC); - if (!new_send) - return -ENOMEM; - - /* new_send->timetosend = currenttime */ - new_send->retrycount = NES_DEFAULT_RETRYS; - new_send->retranscount = NES_DEFAULT_RETRANS; - new_send->skb = skb; - new_send->timetosend = jiffies; - new_send->type = type; - new_send->netdev = cm_node->netdev; - new_send->send_retrans = send_retrans; - new_send->close_when_complete = close_when_complete; - - if (type == NES_TIMER_TYPE_CLOSE) { - new_send->timetosend += (HZ / 10); - if (cm_node->recv_entry) { - kfree(new_send); - WARN_ON(1); - return -EINVAL; - } - cm_node->recv_entry = new_send; - } - - if (type == NES_TIMER_TYPE_SEND) { - new_send->seq_num = ntohl(tcp_hdr(skb)->seq); - refcount_inc(&new_send->skb->users); - spin_lock_irqsave(&cm_node->retrans_list_lock, flags); - cm_node->send_entry = new_send; - add_ref_cm_node(cm_node); - spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); - new_send->timetosend = jiffies + NES_RETRY_TIMEOUT; - - ret = nes_nic_cm_xmit(new_send->skb, cm_node->netdev); - if (ret != NETDEV_TX_OK) { - nes_debug(NES_DBG_CM, "Error sending packet %p " - "(jiffies = %lu)\n", new_send, jiffies); - new_send->timetosend = jiffies; - ret = NETDEV_TX_OK; - } else { - cm_packets_sent++; - if (!send_retrans) { - cleanup_retrans_entry(cm_node); - if (close_when_complete) - rem_ref_cm_node(cm_core, cm_node); - return ret; - } - } - } - - if (!timer_pending(&cm_core->tcp_timer)) - mod_timer(&cm_core->tcp_timer, new_send->timetosend); - - return ret; -} - -static void nes_retrans_expired(struct nes_cm_node *cm_node) -{ - struct iw_cm_id *cm_id = cm_node->cm_id; - enum nes_cm_node_state state = cm_node->state; - cm_node->state = NES_CM_STATE_CLOSED; - - switch (state) { - case NES_CM_STATE_SYN_RCVD: - case NES_CM_STATE_CLOSING: - rem_ref_cm_node(cm_node->cm_core, cm_node); - break; - case NES_CM_STATE_LAST_ACK: - case NES_CM_STATE_FIN_WAIT1: - if (cm_node->cm_id) - cm_id->rem_ref(cm_id); - send_reset(cm_node, NULL); - break; - default: - add_ref_cm_node(cm_node); - send_reset(cm_node, NULL); - create_event(cm_node, NES_CM_EVENT_ABORTED); - } -} - -static void handle_recv_entry(struct nes_cm_node *cm_node, u32 rem_node) -{ - struct nes_timer_entry *recv_entry = cm_node->recv_entry; - struct iw_cm_id *cm_id = cm_node->cm_id; - struct nes_qp *nesqp; - unsigned long qplockflags; - - if (!recv_entry) - return; - nesqp = (struct nes_qp *)recv_entry->skb; - if (nesqp) { - spin_lock_irqsave(&nesqp->lock, qplockflags); - if (nesqp->cm_id) { - nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, " - "refcount = %d: HIT A " - "NES_TIMER_TYPE_CLOSE with something " - "to do!!!\n", nesqp->hwqp.qp_id, cm_id, - atomic_read(&nesqp->refcount)); - nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED; - nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT; - nesqp->ibqp_state = IB_QPS_ERR; - spin_unlock_irqrestore(&nesqp->lock, qplockflags); - nes_cm_disconn(nesqp); - } else { - spin_unlock_irqrestore(&nesqp->lock, qplockflags); - nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, " - "refcount = %d: HIT A " - "NES_TIMER_TYPE_CLOSE with nothing " - "to do!!!\n", nesqp->hwqp.qp_id, cm_id, - atomic_read(&nesqp->refcount)); - } - } else if 
(rem_node) { - /* TIME_WAIT state */ - rem_ref_cm_node(cm_node->cm_core, cm_node); - } - if (cm_node->cm_id) - cm_id->rem_ref(cm_id); - kfree(recv_entry); - cm_node->recv_entry = NULL; -} - -/** - * nes_cm_timer_tick - */ -static void nes_cm_timer_tick(struct timer_list *unused) -{ - unsigned long flags; - unsigned long nexttimeout = jiffies + NES_LONG_TIME; - struct nes_cm_node *cm_node; - struct nes_timer_entry *send_entry, *recv_entry; - struct list_head *list_core_temp; - struct list_head *list_node; - struct nes_cm_core *cm_core = g_cm_core; - u32 settimer = 0; - unsigned long timetosend; - int ret = NETDEV_TX_OK; - - struct list_head timer_list; - - INIT_LIST_HEAD(&timer_list); - spin_lock_irqsave(&cm_core->ht_lock, flags); - - list_for_each_safe(list_node, list_core_temp, - &cm_core->connected_nodes) { - cm_node = container_of(list_node, struct nes_cm_node, list); - if ((cm_node->recv_entry) || (cm_node->send_entry)) { - add_ref_cm_node(cm_node); - list_add(&cm_node->timer_entry, &timer_list); - } - } - spin_unlock_irqrestore(&cm_core->ht_lock, flags); - - list_for_each_safe(list_node, list_core_temp, &timer_list) { - cm_node = container_of(list_node, struct nes_cm_node, - timer_entry); - recv_entry = cm_node->recv_entry; - - if (recv_entry) { - if (time_after(recv_entry->timetosend, jiffies)) { - if (nexttimeout > recv_entry->timetosend || - !settimer) { - nexttimeout = recv_entry->timetosend; - settimer = 1; - } - } else { - handle_recv_entry(cm_node, 1); - } - } - - spin_lock_irqsave(&cm_node->retrans_list_lock, flags); - do { - send_entry = cm_node->send_entry; - if (!send_entry) - break; - if (time_after(send_entry->timetosend, jiffies)) { - if (cm_node->state != NES_CM_STATE_TSA) { - if ((nexttimeout > - send_entry->timetosend) || - !settimer) { - nexttimeout = - send_entry->timetosend; - settimer = 1; - } - } else { - free_retrans_entry(cm_node); - } - break; - } - - if ((cm_node->state == NES_CM_STATE_TSA) || - (cm_node->state == NES_CM_STATE_CLOSED)) { - free_retrans_entry(cm_node); - break; - } - - if (!send_entry->retranscount || - !send_entry->retrycount) { - cm_packets_dropped++; - free_retrans_entry(cm_node); - - spin_unlock_irqrestore( - &cm_node->retrans_list_lock, flags); - nes_retrans_expired(cm_node); - cm_node->state = NES_CM_STATE_CLOSED; - spin_lock_irqsave(&cm_node->retrans_list_lock, - flags); - break; - } - refcount_inc(&send_entry->skb->users); - cm_packets_retrans++; - nes_debug(NES_DBG_CM, "Retransmitting send_entry %p " - "for node %p, jiffies = %lu, time to send = " - "%lu, retranscount = %u, send_entry->seq_num = " - "0x%08X, cm_node->tcp_cntxt.rem_ack_num = " - "0x%08X\n", send_entry, cm_node, jiffies, - send_entry->timetosend, - send_entry->retranscount, - send_entry->seq_num, - cm_node->tcp_cntxt.rem_ack_num); - - spin_unlock_irqrestore(&cm_node->retrans_list_lock, - flags); - ret = nes_nic_cm_xmit(send_entry->skb, cm_node->netdev); - spin_lock_irqsave(&cm_node->retrans_list_lock, flags); - if (ret != NETDEV_TX_OK) { - nes_debug(NES_DBG_CM, "rexmit failed for " - "node=%p\n", cm_node); - cm_packets_bounced++; - send_entry->retrycount--; - nexttimeout = jiffies + NES_SHORT_TIME; - settimer = 1; - break; - } else { - cm_packets_sent++; - } - nes_debug(NES_DBG_CM, "Packet Sent: retrans count = " - "%u, retry count = %u.\n", - send_entry->retranscount, - send_entry->retrycount); - if (send_entry->send_retrans) { - send_entry->retranscount--; - timetosend = (NES_RETRY_TIMEOUT << - (NES_DEFAULT_RETRANS - send_entry->retranscount)); - - 
send_entry->timetosend = jiffies + - min(timetosend, NES_MAX_TIMEOUT); - if (nexttimeout > send_entry->timetosend || - !settimer) { - nexttimeout = send_entry->timetosend; - settimer = 1; - } - } else { - int close_when_complete; - close_when_complete = - send_entry->close_when_complete; - nes_debug(NES_DBG_CM, "cm_node=%p state=%d\n", - cm_node, cm_node->state); - free_retrans_entry(cm_node); - if (close_when_complete) - rem_ref_cm_node(cm_node->cm_core, - cm_node); - } - } while (0); - - spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); - rem_ref_cm_node(cm_node->cm_core, cm_node); - } - - if (settimer) { - if (!timer_pending(&cm_core->tcp_timer)) - mod_timer(&cm_core->tcp_timer, nexttimeout); - } -} - - -/** - * send_syn - */ -static int send_syn(struct nes_cm_node *cm_node, u32 sendack, - struct sk_buff *skb) -{ - int ret; - int flags = SET_SYN; - char optionsbuffer[sizeof(struct option_mss) + - sizeof(struct option_windowscale) + sizeof(struct option_base) + - TCP_OPTIONS_PADDING]; - - int optionssize = 0; - /* Sending MSS option */ - union all_known_options *options; - - if (!cm_node) - return -EINVAL; - - options = (union all_known_options *)&optionsbuffer[optionssize]; - options->as_mss.optionnum = OPTION_NUMBER_MSS; - options->as_mss.length = sizeof(struct option_mss); - options->as_mss.mss = htons(cm_node->tcp_cntxt.mss); - optionssize += sizeof(struct option_mss); - - options = (union all_known_options *)&optionsbuffer[optionssize]; - options->as_windowscale.optionnum = OPTION_NUMBER_WINDOW_SCALE; - options->as_windowscale.length = sizeof(struct option_windowscale); - options->as_windowscale.shiftcount = cm_node->tcp_cntxt.rcv_wscale; - optionssize += sizeof(struct option_windowscale); - - if (sendack && !(NES_DRV_OPT_SUPRESS_OPTION_BC & nes_drv_opt)) { - options = (union all_known_options *)&optionsbuffer[optionssize]; - options->as_base.optionnum = OPTION_NUMBER_WRITE0; - options->as_base.length = sizeof(struct option_base); - optionssize += sizeof(struct option_base); - /* we need the size to be a multiple of 4 */ - options = (union all_known_options *)&optionsbuffer[optionssize]; - options->as_end = 1; - optionssize += 1; - options = (union all_known_options *)&optionsbuffer[optionssize]; - options->as_end = 1; - optionssize += 1; - } - - options = (union all_known_options *)&optionsbuffer[optionssize]; - options->as_end = OPTION_NUMBER_END; - optionssize += 1; - - if (!skb) - skb = dev_alloc_skb(MAX_CM_BUFFER); - if (!skb) { - nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); - return -1; - } - - if (sendack) - flags |= SET_ACK; - - form_cm_frame(skb, cm_node, optionsbuffer, optionssize, NULL, 0, flags); - ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0); - - return ret; -} - - -/** - * send_reset - */ -static int send_reset(struct nes_cm_node *cm_node, struct sk_buff *skb) -{ - int ret; - int flags = SET_RST | SET_ACK; - - if (!skb) - skb = dev_alloc_skb(MAX_CM_BUFFER); - if (!skb) { - nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); - return -ENOMEM; - } - - form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, flags); - ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 0, 1); - - return ret; -} - - -/** - * send_ack - */ -static int send_ack(struct nes_cm_node *cm_node, struct sk_buff *skb) -{ - int ret; - - if (!skb) - skb = dev_alloc_skb(MAX_CM_BUFFER); - - if (!skb) { - nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); - return -1; - } - - form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, SET_ACK); - ret = schedule_nes_timer(cm_node, 
skb, NES_TIMER_TYPE_SEND, 0, 0); - - return ret; -} - - -/** - * send_fin - */ -static int send_fin(struct nes_cm_node *cm_node, struct sk_buff *skb) -{ - int ret; - - /* if we didn't get a frame get one */ - if (!skb) - skb = dev_alloc_skb(MAX_CM_BUFFER); - - if (!skb) { - nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); - return -1; - } - - form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, SET_ACK | SET_FIN); - ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0); - - return ret; -} - - -/** - * find_node - find a cm node that matches the reference cm node - */ -static struct nes_cm_node *find_node(struct nes_cm_core *cm_core, - u16 rem_port, nes_addr_t rem_addr, u16 loc_port, nes_addr_t loc_addr) -{ - unsigned long flags; - struct list_head *hte; - struct nes_cm_node *cm_node; - - /* get a handle on the hte */ - hte = &cm_core->connected_nodes; - - /* walk list and find cm_node associated with this session ID */ - spin_lock_irqsave(&cm_core->ht_lock, flags); - list_for_each_entry(cm_node, hte, list) { - /* compare quad, return node handle if a match */ - nes_debug(NES_DBG_CM, "finding node %x:%x =? %x:%x ^ %x:%x =? %x:%x\n", - cm_node->loc_addr, cm_node->loc_port, - loc_addr, loc_port, - cm_node->rem_addr, cm_node->rem_port, - rem_addr, rem_port); - if ((cm_node->loc_addr == loc_addr) && - (cm_node->loc_port == loc_port) && - (cm_node->rem_addr == rem_addr) && - (cm_node->rem_port == rem_port)) { - add_ref_cm_node(cm_node); - spin_unlock_irqrestore(&cm_core->ht_lock, flags); - return cm_node; - } - } - spin_unlock_irqrestore(&cm_core->ht_lock, flags); - - /* no owner node */ - return NULL; -} - - -/** - * find_listener - find a cm node listening on this addr-port pair - */ -static struct nes_cm_listener *find_listener(struct nes_cm_core *cm_core, - nes_addr_t dst_addr, u16 dst_port, - enum nes_cm_listener_state listener_state) -{ - unsigned long flags; - struct nes_cm_listener *listen_node; - nes_addr_t listen_addr; - u16 listen_port; - - /* walk list and find cm_node associated with this session ID */ - spin_lock_irqsave(&cm_core->listen_list_lock, flags); - list_for_each_entry(listen_node, &cm_core->listen_list.list, list) { - listen_addr = listen_node->loc_addr; - listen_port = listen_node->loc_port; - - /* compare node pair, return node handle if a match */ - if (((listen_addr == dst_addr) || - listen_addr == 0x00000000) && - (listen_port == dst_port) && - (listener_state & listen_node->listener_state)) { - atomic_inc(&listen_node->ref_count); - spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); - return listen_node; - } - } - spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); - - /* no listener */ - return NULL; -} - -/** - * add_hte_node - add a cm node to the hash table - */ -static int add_hte_node(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) -{ - unsigned long flags; - struct list_head *hte; - - if (!cm_node || !cm_core) - return -EINVAL; - - nes_debug(NES_DBG_CM, "Adding Node %p to Active Connection HT\n", - cm_node); - - spin_lock_irqsave(&cm_core->ht_lock, flags); - - /* get a handle on the hash table element (list head for this slot) */ - hte = &cm_core->connected_nodes; - list_add_tail(&cm_node->list, hte); - atomic_inc(&cm_core->ht_node_cnt); - - spin_unlock_irqrestore(&cm_core->ht_lock, flags); - - return 0; -} - - -/** - * mini_cm_dec_refcnt_listen - */ -static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, - struct nes_cm_listener *listener, int free_hanging_nodes) -{ - int ret = -EINVAL; - int err = 0; - unsigned 
long flags; - struct list_head *list_pos = NULL; - struct list_head *list_temp = NULL; - struct nes_cm_node *cm_node = NULL; - struct list_head reset_list; - - nes_debug(NES_DBG_CM, "attempting listener= %p free_nodes= %d, " - "refcnt=%d\n", listener, free_hanging_nodes, - atomic_read(&listener->ref_count)); - /* free non-accelerated child nodes for this listener */ - INIT_LIST_HEAD(&reset_list); - if (free_hanging_nodes) { - spin_lock_irqsave(&cm_core->ht_lock, flags); - list_for_each_safe(list_pos, list_temp, - &g_cm_core->connected_nodes) { - cm_node = container_of(list_pos, struct nes_cm_node, - list); - if ((cm_node->listener == listener) && - (!cm_node->accelerated)) { - add_ref_cm_node(cm_node); - list_add(&cm_node->reset_entry, &reset_list); - } - } - spin_unlock_irqrestore(&cm_core->ht_lock, flags); - } - - list_for_each_safe(list_pos, list_temp, &reset_list) { - cm_node = container_of(list_pos, struct nes_cm_node, - reset_entry); - { - struct nes_cm_node *loopback = cm_node->loopbackpartner; - enum nes_cm_node_state old_state; - if (NES_CM_STATE_FIN_WAIT1 <= cm_node->state) { - rem_ref_cm_node(cm_node->cm_core, cm_node); - } else { - if (!loopback) { - cleanup_retrans_entry(cm_node); - err = send_reset(cm_node, NULL); - if (err) { - cm_node->state = - NES_CM_STATE_CLOSED; - WARN_ON(1); - } else { - old_state = cm_node->state; - cm_node->state = NES_CM_STATE_LISTENER_DESTROYED; - if (old_state != NES_CM_STATE_MPAREQ_RCVD) - rem_ref_cm_node( - cm_node->cm_core, - cm_node); - } - } else { - struct nes_cm_event event; - - event.cm_node = loopback; - event.cm_info.rem_addr = - loopback->rem_addr; - event.cm_info.loc_addr = - loopback->loc_addr; - event.cm_info.rem_port = - loopback->rem_port; - event.cm_info.loc_port = - loopback->loc_port; - event.cm_info.cm_id = loopback->cm_id; - add_ref_cm_node(loopback); - loopback->state = NES_CM_STATE_CLOSED; - cm_event_connect_error(&event); - cm_node->state = NES_CM_STATE_LISTENER_DESTROYED; - - rem_ref_cm_node(cm_node->cm_core, - cm_node); - - } - } - } - } - - spin_lock_irqsave(&cm_core->listen_list_lock, flags); - if (!atomic_dec_return(&listener->ref_count)) { - list_del(&listener->list); - - /* decrement our listen node count */ - atomic_dec(&cm_core->listen_node_cnt); - - spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); - - if (listener->nesvnic) { - nes_manage_apbvt(listener->nesvnic, - listener->loc_port, - PCI_FUNC(listener->nesvnic->nesdev->pcidev->devfn), - NES_MANAGE_APBVT_DEL); - - nes_debug(NES_DBG_NLMSG, - "Delete APBVT loc_port = %04X\n", - listener->loc_port); - } - - nes_debug(NES_DBG_CM, "destroying listener (%p)\n", listener); - - kfree(listener); - listener = NULL; - ret = 0; - atomic_inc(&cm_listens_destroyed); - } else { - spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); - } - if (listener) { - if (atomic_read(&listener->pend_accepts_cnt) > 0) - nes_debug(NES_DBG_CM, "destroying listener (%p)" - " with non-zero pending accepts=%u\n", - listener, atomic_read(&listener->pend_accepts_cnt)); - } - - return ret; -} - - -/** - * mini_cm_del_listen - */ -static int mini_cm_del_listen(struct nes_cm_core *cm_core, - struct nes_cm_listener *listener) -{ - listener->listener_state = NES_CM_LISTENER_PASSIVE_STATE; - listener->cm_id = NULL; /* going to be destroyed pretty soon */ - return mini_cm_dec_refcnt_listen(cm_core, listener, 1); -} - - -/** - * mini_cm_accelerated - */ -static inline int mini_cm_accelerated(struct nes_cm_core *cm_core, - struct nes_cm_node *cm_node) -{ - cm_node->accelerated = true; - - 
if (cm_node->accept_pend) { - BUG_ON(!cm_node->listener); - atomic_dec(&cm_node->listener->pend_accepts_cnt); - cm_node->accept_pend = 0; - BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0); - } - - if (!timer_pending(&cm_core->tcp_timer)) - mod_timer(&cm_core->tcp_timer, (jiffies + NES_SHORT_TIME)); - - return 0; -} - - -/** - * nes_addr_resolve_neigh - */ -static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpindex) -{ - struct rtable *rt; - struct neighbour *neigh; - int rc = arpindex; - struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter; - __be32 dst_ipaddr = htonl(dst_ip); - - rt = ip_route_output(&init_net, dst_ipaddr, nesvnic->local_ipaddr, 0, 0); - if (IS_ERR(rt)) { - printk(KERN_ERR "%s: ip_route_output_key failed for 0x%08X\n", - __func__, dst_ip); - return rc; - } - - neigh = dst_neigh_lookup(&rt->dst, &dst_ipaddr); - - rcu_read_lock(); - if (neigh) { - if (neigh->nud_state & NUD_VALID) { - nes_debug(NES_DBG_CM, "Neighbor MAC address for 0x%08X" - " is %pM, Gateway is 0x%08X \n", dst_ip, - neigh->ha, ntohl(rt->rt_gw4)); - - if (arpindex >= 0) { - if (ether_addr_equal(nesadapter->arp_table[arpindex].mac_addr, neigh->ha)) { - /* Mac address same as in nes_arp_table */ - goto out; - } - - nes_manage_arp_cache(nesvnic->netdev, - nesadapter->arp_table[arpindex].mac_addr, - dst_ip, NES_ARP_DELETE); - } - - nes_manage_arp_cache(nesvnic->netdev, neigh->ha, - dst_ip, NES_ARP_ADD); - rc = nes_arp_table(nesvnic->nesdev, dst_ip, NULL, - NES_ARP_RESOLVE); - } else { - neigh_event_send(neigh, NULL); - } - } -out: - rcu_read_unlock(); - - if (neigh) - neigh_release(neigh); - - ip_rt_put(rt); - return rc; -} - -/** - * make_cm_node - create a new instance of a cm node - */ -static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, - struct nes_vnic *nesvnic, struct nes_cm_info *cm_info, - struct nes_cm_listener *listener) -{ - struct nes_cm_node *cm_node; - int oldarpindex = 0; - int arpindex = 0; - struct nes_device *nesdev; - struct nes_adapter *nesadapter; - - /* create an hte and cm_node for this instance */ - cm_node = kzalloc(sizeof(*cm_node), GFP_ATOMIC); - if (!cm_node) - return NULL; - - /* set our node specific transport info */ - if (listener) { - cm_node->loc_addr = listener->loc_addr; - cm_node->loc_port = listener->loc_port; - } else { - cm_node->loc_addr = cm_info->loc_addr; - cm_node->loc_port = cm_info->loc_port; - } - cm_node->rem_addr = cm_info->rem_addr; - cm_node->rem_port = cm_info->rem_port; - - cm_node->mpa_frame_rev = mpa_version; - cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO; - cm_node->mpav2_ird_ord = 0; - cm_node->ird_size = 0; - cm_node->ord_size = 0; - - nes_debug(NES_DBG_CM, "Make node addresses : loc = %pI4:%x, rem = %pI4:%x\n", - &cm_node->loc_addr, cm_node->loc_port, - &cm_node->rem_addr, cm_node->rem_port); - cm_node->listener = listener; - if (listener) - cm_node->tos = listener->tos; - cm_node->netdev = nesvnic->netdev; - cm_node->cm_id = cm_info->cm_id; - memcpy(cm_node->loc_mac, nesvnic->netdev->dev_addr, ETH_ALEN); - - nes_debug(NES_DBG_CM, "listener=%p, cm_id=%p\n", cm_node->listener, - cm_node->cm_id); - - spin_lock_init(&cm_node->retrans_list_lock); - - cm_node->loopbackpartner = NULL; - atomic_set(&cm_node->ref_count, 1); - /* associate our parent CM core */ - cm_node->cm_core = cm_core; - cm_node->tcp_cntxt.loc_id = NES_CM_DEF_LOCAL_ID; - cm_node->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE; - cm_node->tcp_cntxt.rcv_wnd = NES_CM_DEFAULT_RCV_WND_SCALED >> - 
NES_CM_DEFAULT_RCV_WND_SCALE; - cm_node->tcp_cntxt.loc_seq_num = secure_tcp_seq(htonl(cm_node->loc_addr), - htonl(cm_node->rem_addr), - htons(cm_node->loc_port), - htons(cm_node->rem_port)); - cm_node->tcp_cntxt.mss = nesvnic->max_frame_size - sizeof(struct iphdr) - - sizeof(struct tcphdr) - ETH_HLEN - VLAN_HLEN; - cm_node->tcp_cntxt.rcv_nxt = 0; - /* get a unique session ID , add thread_id to an upcounter to handle race */ - atomic_inc(&cm_core->node_cnt); - cm_node->conn_type = cm_info->conn_type; - cm_node->apbvt_set = 0; - cm_node->accept_pend = 0; - - cm_node->nesvnic = nesvnic; - /* get some device handles, for arp lookup */ - nesdev = nesvnic->nesdev; - nesadapter = nesdev->nesadapter; - - cm_node->loopbackpartner = NULL; - - /* get the mac addr for the remote node */ - oldarpindex = nes_arp_table(nesdev, cm_node->rem_addr, - NULL, NES_ARP_RESOLVE); - arpindex = nes_addr_resolve_neigh(nesvnic, cm_node->rem_addr, - oldarpindex); - if (arpindex < 0) { - kfree(cm_node); - return NULL; - } - - /* copy the mac addr to node context */ - memcpy(cm_node->rem_mac, nesadapter->arp_table[arpindex].mac_addr, ETH_ALEN); - nes_debug(NES_DBG_CM, "Remote mac addr from arp table: %pM\n", - cm_node->rem_mac); - - add_hte_node(cm_core, cm_node); - atomic_inc(&cm_nodes_created); - - return cm_node; -} - - -/** - * add_ref_cm_node - destroy an instance of a cm node - */ -static int add_ref_cm_node(struct nes_cm_node *cm_node) -{ - atomic_inc(&cm_node->ref_count); - return 0; -} - - -/** - * rem_ref_cm_node - destroy an instance of a cm node - */ -static int rem_ref_cm_node(struct nes_cm_core *cm_core, - struct nes_cm_node *cm_node) -{ - unsigned long flags; - struct nes_qp *nesqp; - - if (!cm_node) - return -EINVAL; - - spin_lock_irqsave(&cm_node->cm_core->ht_lock, flags); - if (atomic_dec_return(&cm_node->ref_count)) { - spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags); - return 0; - } - list_del(&cm_node->list); - atomic_dec(&cm_core->ht_node_cnt); - spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags); - - /* if the node is destroyed before connection was accelerated */ - if (!cm_node->accelerated && cm_node->accept_pend) { - BUG_ON(!cm_node->listener); - atomic_dec(&cm_node->listener->pend_accepts_cnt); - BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0); - } - WARN_ON(cm_node->send_entry); - if (cm_node->recv_entry) - handle_recv_entry(cm_node, 0); - if (cm_node->listener) { - mini_cm_dec_refcnt_listen(cm_core, cm_node->listener, 0); - } else { - if (cm_node->apbvt_set && cm_node->nesvnic) { - nes_manage_apbvt(cm_node->nesvnic, cm_node->loc_port, - PCI_FUNC(cm_node->nesvnic->nesdev->pcidev->devfn), - NES_MANAGE_APBVT_DEL); - } - nes_debug(NES_DBG_NLMSG, "Delete APBVT loc_port = %04X\n", - cm_node->loc_port); - } - - atomic_dec(&cm_core->node_cnt); - atomic_inc(&cm_nodes_destroyed); - nesqp = cm_node->nesqp; - if (nesqp) { - nesqp->cm_node = NULL; - nes_rem_ref(&nesqp->ibqp); - cm_node->nesqp = NULL; - } - - kfree(cm_node); - return 0; -} - -/** - * process_options - */ -static int process_options(struct nes_cm_node *cm_node, u8 *optionsloc, - u32 optionsize, u32 syn_packet) -{ - u32 tmp; - u32 offset = 0; - union all_known_options *all_options; - char got_mss_option = 0; - - while (offset < optionsize) { - all_options = (union all_known_options *)(optionsloc + offset); - switch (all_options->as_base.optionnum) { - case OPTION_NUMBER_END: - offset = optionsize; - break; - case OPTION_NUMBER_NONE: - offset += 1; - continue; - case OPTION_NUMBER_MSS: - nes_debug(NES_DBG_CM, 
"%s: MSS Length: %d Offset: %d " - "Size: %d\n", __func__, - all_options->as_mss.length, offset, optionsize); - got_mss_option = 1; - if (all_options->as_mss.length != 4) { - return 1; - } else { - tmp = ntohs(all_options->as_mss.mss); - if (tmp > 0 && tmp < - cm_node->tcp_cntxt.mss) - cm_node->tcp_cntxt.mss = tmp; - } - break; - case OPTION_NUMBER_WINDOW_SCALE: - cm_node->tcp_cntxt.snd_wscale = - all_options->as_windowscale.shiftcount; - break; - default: - nes_debug(NES_DBG_CM, "TCP Option not understood: %x\n", - all_options->as_base.optionnum); - break; - } - offset += all_options->as_base.length; - } - if ((!got_mss_option) && (syn_packet)) - cm_node->tcp_cntxt.mss = NES_CM_DEFAULT_MSS; - return 0; -} - -static void drop_packet(struct sk_buff *skb) -{ - atomic_inc(&cm_accel_dropped_pkts); - dev_kfree_skb_any(skb); -} - -static void handle_fin_pkt(struct nes_cm_node *cm_node) -{ - nes_debug(NES_DBG_CM, "Received FIN, cm_node = %p, state = %u. " - "refcnt=%d\n", cm_node, cm_node->state, - atomic_read(&cm_node->ref_count)); - switch (cm_node->state) { - case NES_CM_STATE_SYN_RCVD: - case NES_CM_STATE_SYN_SENT: - case NES_CM_STATE_ESTABLISHED: - case NES_CM_STATE_MPAREJ_RCVD: - cm_node->tcp_cntxt.rcv_nxt++; - cleanup_retrans_entry(cm_node); - cm_node->state = NES_CM_STATE_LAST_ACK; - send_fin(cm_node, NULL); - break; - case NES_CM_STATE_MPAREQ_SENT: - create_event(cm_node, NES_CM_EVENT_ABORTED); - cm_node->tcp_cntxt.rcv_nxt++; - cleanup_retrans_entry(cm_node); - cm_node->state = NES_CM_STATE_CLOSED; - add_ref_cm_node(cm_node); - send_reset(cm_node, NULL); - break; - case NES_CM_STATE_FIN_WAIT1: - cm_node->tcp_cntxt.rcv_nxt++; - cleanup_retrans_entry(cm_node); - cm_node->state = NES_CM_STATE_CLOSING; - send_ack(cm_node, NULL); - /* Wait for ACK as this is simultaneous close.. - * After we receive ACK, do not send anything.. - * Just rm the node.. Done.. */ - break; - case NES_CM_STATE_FIN_WAIT2: - cm_node->tcp_cntxt.rcv_nxt++; - cleanup_retrans_entry(cm_node); - cm_node->state = NES_CM_STATE_TIME_WAIT; - send_ack(cm_node, NULL); - schedule_nes_timer(cm_node, NULL, NES_TIMER_TYPE_CLOSE, 1, 0); - break; - case NES_CM_STATE_TIME_WAIT: - cm_node->tcp_cntxt.rcv_nxt++; - cleanup_retrans_entry(cm_node); - cm_node->state = NES_CM_STATE_CLOSED; - rem_ref_cm_node(cm_node->cm_core, cm_node); - break; - case NES_CM_STATE_TSA: - default: - nes_debug(NES_DBG_CM, "Error Rcvd FIN for node-%p state = %d\n", - cm_node, cm_node->state); - break; - } -} - - -static void handle_rst_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, - struct tcphdr *tcph) -{ - - int reset = 0; /* whether to send reset in case of err.. */ - atomic_inc(&cm_resets_recvd); - nes_debug(NES_DBG_CM, "Received Reset, cm_node = %p, state = %u." 
- " refcnt=%d\n", cm_node, cm_node->state, - atomic_read(&cm_node->ref_count)); - cleanup_retrans_entry(cm_node); - switch (cm_node->state) { - case NES_CM_STATE_SYN_SENT: - case NES_CM_STATE_MPAREQ_SENT: - nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p " - "listener=%p state=%d\n", __func__, __LINE__, cm_node, - cm_node->listener, cm_node->state); - switch (cm_node->mpa_frame_rev) { - case IETF_MPA_V2: - cm_node->mpa_frame_rev = IETF_MPA_V1; - /* send a syn and goto syn sent state */ - cm_node->state = NES_CM_STATE_SYN_SENT; - if (send_syn(cm_node, 0, NULL)) { - active_open_err(cm_node, skb, reset); - } - break; - case IETF_MPA_V1: - default: - active_open_err(cm_node, skb, reset); - break; - } - break; - case NES_CM_STATE_MPAREQ_RCVD: - atomic_inc(&cm_node->passive_state); - dev_kfree_skb_any(skb); - break; - case NES_CM_STATE_ESTABLISHED: - case NES_CM_STATE_SYN_RCVD: - case NES_CM_STATE_LISTENING: - nes_debug(NES_DBG_CM, "Bad state %s[%u]\n", __func__, __LINE__); - passive_open_err(cm_node, skb, reset); - break; - case NES_CM_STATE_TSA: - active_open_err(cm_node, skb, reset); - break; - case NES_CM_STATE_CLOSED: - drop_packet(skb); - break; - case NES_CM_STATE_FIN_WAIT2: - case NES_CM_STATE_FIN_WAIT1: - case NES_CM_STATE_LAST_ACK: - cm_node->cm_id->rem_ref(cm_node->cm_id); - /* fall through */ - case NES_CM_STATE_TIME_WAIT: - cm_node->state = NES_CM_STATE_CLOSED; - rem_ref_cm_node(cm_node->cm_core, cm_node); - drop_packet(skb); - break; - default: - drop_packet(skb); - break; - } -} - - -static void handle_rcv_mpa(struct nes_cm_node *cm_node, struct sk_buff *skb) -{ - int ret = 0; - int datasize = skb->len; - u8 *dataloc = skb->data; - - enum nes_cm_event_type type = NES_CM_EVENT_UNKNOWN; - u32 res_type; - - ret = parse_mpa(cm_node, dataloc, &res_type, datasize); - if (ret) { - nes_debug(NES_DBG_CM, "didn't like MPA Request\n"); - if (cm_node->state == NES_CM_STATE_MPAREQ_SENT) { - nes_debug(NES_DBG_CM, "%s[%u] create abort for " - "cm_node=%p listener=%p state=%d\n", __func__, - __LINE__, cm_node, cm_node->listener, - cm_node->state); - active_open_err(cm_node, skb, 1); - } else { - passive_open_err(cm_node, skb, 1); - } - return; - } - - switch (cm_node->state) { - case NES_CM_STATE_ESTABLISHED: - if (res_type == NES_MPA_REQUEST_REJECT) - /*BIG problem as we are receiving the MPA.. So should - * not be REJECT.. This is Passive Open.. 
We can - * only receive it Reject for Active Open...*/ - WARN_ON(1); - cm_node->state = NES_CM_STATE_MPAREQ_RCVD; - type = NES_CM_EVENT_MPA_REQ; - atomic_set(&cm_node->passive_state, - NES_PASSIVE_STATE_INDICATED); - break; - case NES_CM_STATE_MPAREQ_SENT: - cleanup_retrans_entry(cm_node); - if (res_type == NES_MPA_REQUEST_REJECT) { - type = NES_CM_EVENT_MPA_REJECT; - cm_node->state = NES_CM_STATE_MPAREJ_RCVD; - } else { - type = NES_CM_EVENT_CONNECTED; - cm_node->state = NES_CM_STATE_TSA; - } - send_ack(cm_node, NULL); - break; - default: - WARN_ON(1); - break; - } - dev_kfree_skb_any(skb); - create_event(cm_node, type); -} - -static void indicate_pkt_err(struct nes_cm_node *cm_node, struct sk_buff *skb) -{ - switch (cm_node->state) { - case NES_CM_STATE_SYN_SENT: - case NES_CM_STATE_MPAREQ_SENT: - nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p " - "listener=%p state=%d\n", __func__, __LINE__, cm_node, - cm_node->listener, cm_node->state); - active_open_err(cm_node, skb, 1); - break; - case NES_CM_STATE_ESTABLISHED: - case NES_CM_STATE_SYN_RCVD: - passive_open_err(cm_node, skb, 1); - break; - case NES_CM_STATE_TSA: - default: - drop_packet(skb); - } -} - -static int check_syn(struct nes_cm_node *cm_node, struct tcphdr *tcph, - struct sk_buff *skb) -{ - int err; - - err = ((ntohl(tcph->ack_seq) == cm_node->tcp_cntxt.loc_seq_num)) ? 0 : 1; - if (err) - active_open_err(cm_node, skb, 1); - - return err; -} - -static int check_seq(struct nes_cm_node *cm_node, struct tcphdr *tcph, - struct sk_buff *skb) -{ - int err = 0; - u32 seq; - u32 ack_seq; - u32 loc_seq_num = cm_node->tcp_cntxt.loc_seq_num; - u32 rcv_nxt = cm_node->tcp_cntxt.rcv_nxt; - u32 rcv_wnd; - - seq = ntohl(tcph->seq); - ack_seq = ntohl(tcph->ack_seq); - rcv_wnd = cm_node->tcp_cntxt.rcv_wnd; - if (ack_seq != loc_seq_num) - err = 1; - else if (!between(seq, rcv_nxt, (rcv_nxt + rcv_wnd))) - err = 1; - if (err) { - nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p " - "listener=%p state=%d\n", __func__, __LINE__, cm_node, - cm_node->listener, cm_node->state); - indicate_pkt_err(cm_node, skb); - nes_debug(NES_DBG_CM, "seq ERROR cm_node =%p seq=0x%08X " - "rcv_nxt=0x%08X rcv_wnd=0x%x\n", cm_node, seq, rcv_nxt, - rcv_wnd); - } - return err; -} - -/* - * handle_syn_pkt() is for Passive node. The syn packet is received when a node - * is created with a listener or it may comein as rexmitted packet which in - * that case will be just dropped. 
- */ -static void handle_syn_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, - struct tcphdr *tcph) -{ - int ret; - u32 inc_sequence; - int optionsize; - - optionsize = (tcph->doff << 2) - sizeof(struct tcphdr); - skb_trim(skb, 0); - inc_sequence = ntohl(tcph->seq); - - switch (cm_node->state) { - case NES_CM_STATE_SYN_SENT: - case NES_CM_STATE_MPAREQ_SENT: - /* Rcvd syn on active open connection*/ - active_open_err(cm_node, skb, 1); - break; - case NES_CM_STATE_LISTENING: - /* Passive OPEN */ - if (atomic_read(&cm_node->listener->pend_accepts_cnt) > - cm_node->listener->backlog) { - nes_debug(NES_DBG_CM, "drop syn due to backlog " - "pressure \n"); - cm_backlog_drops++; - passive_open_err(cm_node, skb, 0); - break; - } - ret = handle_tcp_options(cm_node, tcph, skb, optionsize, - 1); - if (ret) { - passive_open_err(cm_node, skb, 0); - /* drop pkt */ - break; - } - cm_node->tcp_cntxt.rcv_nxt = inc_sequence + 1; - BUG_ON(cm_node->send_entry); - cm_node->accept_pend = 1; - atomic_inc(&cm_node->listener->pend_accepts_cnt); - - cm_node->state = NES_CM_STATE_SYN_RCVD; - send_syn(cm_node, 1, skb); - break; - case NES_CM_STATE_CLOSED: - cleanup_retrans_entry(cm_node); - add_ref_cm_node(cm_node); - send_reset(cm_node, skb); - break; - case NES_CM_STATE_TSA: - case NES_CM_STATE_ESTABLISHED: - case NES_CM_STATE_FIN_WAIT1: - case NES_CM_STATE_FIN_WAIT2: - case NES_CM_STATE_MPAREQ_RCVD: - case NES_CM_STATE_LAST_ACK: - case NES_CM_STATE_CLOSING: - case NES_CM_STATE_UNKNOWN: - default: - drop_packet(skb); - break; - } -} - -static void handle_synack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, - struct tcphdr *tcph) -{ - int ret; - u32 inc_sequence; - int optionsize; - - optionsize = (tcph->doff << 2) - sizeof(struct tcphdr); - skb_trim(skb, 0); - inc_sequence = ntohl(tcph->seq); - switch (cm_node->state) { - case NES_CM_STATE_SYN_SENT: - cleanup_retrans_entry(cm_node); - /* active open */ - if (check_syn(cm_node, tcph, skb)) - return; - cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq); - /* setup options */ - ret = handle_tcp_options(cm_node, tcph, skb, optionsize, 0); - if (ret) { - nes_debug(NES_DBG_CM, "cm_node=%p tcp_options failed\n", - cm_node); - break; - } - cleanup_retrans_entry(cm_node); - cm_node->tcp_cntxt.rcv_nxt = inc_sequence + 1; - send_mpa_request(cm_node, skb); - cm_node->state = NES_CM_STATE_MPAREQ_SENT; - break; - case NES_CM_STATE_MPAREQ_RCVD: - /* passive open, so should not be here */ - passive_open_err(cm_node, skb, 1); - break; - case NES_CM_STATE_LISTENING: - cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq); - cleanup_retrans_entry(cm_node); - cm_node->state = NES_CM_STATE_CLOSED; - send_reset(cm_node, skb); - break; - case NES_CM_STATE_CLOSED: - cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq); - cleanup_retrans_entry(cm_node); - add_ref_cm_node(cm_node); - send_reset(cm_node, skb); - break; - case NES_CM_STATE_ESTABLISHED: - case NES_CM_STATE_FIN_WAIT1: - case NES_CM_STATE_FIN_WAIT2: - case NES_CM_STATE_LAST_ACK: - case NES_CM_STATE_TSA: - case NES_CM_STATE_CLOSING: - case NES_CM_STATE_UNKNOWN: - case NES_CM_STATE_MPAREQ_SENT: - default: - drop_packet(skb); - break; - } -} - -static int handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, - struct tcphdr *tcph) -{ - int datasize = 0; - u32 inc_sequence; - int ret = 0; - int optionsize; - - optionsize = (tcph->doff << 2) - sizeof(struct tcphdr); - - if (check_seq(cm_node, tcph, skb)) - return -EINVAL; - - skb_pull(skb, tcph->doff << 2); - inc_sequence = ntohl(tcph->seq); - datasize = 
skb->len; - switch (cm_node->state) { - case NES_CM_STATE_SYN_RCVD: - /* Passive OPEN */ - cleanup_retrans_entry(cm_node); - ret = handle_tcp_options(cm_node, tcph, skb, optionsize, 1); - if (ret) - break; - cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq); - cm_node->state = NES_CM_STATE_ESTABLISHED; - if (datasize) { - cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize; - handle_rcv_mpa(cm_node, skb); - } else { /* rcvd ACK only */ - dev_kfree_skb_any(skb); - } - break; - case NES_CM_STATE_ESTABLISHED: - /* Passive OPEN */ - cleanup_retrans_entry(cm_node); - if (datasize) { - cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize; - handle_rcv_mpa(cm_node, skb); - } else { - drop_packet(skb); - } - break; - case NES_CM_STATE_MPAREQ_SENT: - cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq); - if (datasize) { - cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize; - handle_rcv_mpa(cm_node, skb); - } else { /* Could be just an ack pkt.. */ - dev_kfree_skb_any(skb); - } - break; - case NES_CM_STATE_LISTENING: - cleanup_retrans_entry(cm_node); - cm_node->state = NES_CM_STATE_CLOSED; - send_reset(cm_node, skb); - break; - case NES_CM_STATE_CLOSED: - cleanup_retrans_entry(cm_node); - add_ref_cm_node(cm_node); - send_reset(cm_node, skb); - break; - case NES_CM_STATE_LAST_ACK: - case NES_CM_STATE_CLOSING: - cleanup_retrans_entry(cm_node); - cm_node->state = NES_CM_STATE_CLOSED; - cm_node->cm_id->rem_ref(cm_node->cm_id); - rem_ref_cm_node(cm_node->cm_core, cm_node); - drop_packet(skb); - break; - case NES_CM_STATE_FIN_WAIT1: - cleanup_retrans_entry(cm_node); - drop_packet(skb); - cm_node->state = NES_CM_STATE_FIN_WAIT2; - break; - case NES_CM_STATE_SYN_SENT: - case NES_CM_STATE_FIN_WAIT2: - case NES_CM_STATE_TSA: - case NES_CM_STATE_MPAREQ_RCVD: - case NES_CM_STATE_UNKNOWN: - default: - cleanup_retrans_entry(cm_node); - drop_packet(skb); - break; - } - return ret; -} - - - -static int handle_tcp_options(struct nes_cm_node *cm_node, struct tcphdr *tcph, - struct sk_buff *skb, int optionsize, int passive) -{ - u8 *optionsloc = (u8 *)&tcph[1]; - - if (optionsize) { - if (process_options(cm_node, optionsloc, optionsize, - (u32)tcph->syn)) { - nes_debug(NES_DBG_CM, "%s: Node %p, Sending RESET\n", - __func__, cm_node); - if (passive) - passive_open_err(cm_node, skb, 1); - else - active_open_err(cm_node, skb, 1); - return 1; - } - } - - cm_node->tcp_cntxt.snd_wnd = ntohs(tcph->window) << - cm_node->tcp_cntxt.snd_wscale; - - if (cm_node->tcp_cntxt.snd_wnd > cm_node->tcp_cntxt.max_snd_wnd) - cm_node->tcp_cntxt.max_snd_wnd = cm_node->tcp_cntxt.snd_wnd; - return 0; -} - -/* - * active_open_err() will send reset() if flag set.. - * It will also send ABORT event. - */ -static void active_open_err(struct nes_cm_node *cm_node, struct sk_buff *skb, - int reset) -{ - cleanup_retrans_entry(cm_node); - if (reset) { - nes_debug(NES_DBG_CM, "ERROR active err called for cm_node=%p, " - "state=%d\n", cm_node, cm_node->state); - add_ref_cm_node(cm_node); - send_reset(cm_node, skb); - } else { - dev_kfree_skb_any(skb); - } - - cm_node->state = NES_CM_STATE_CLOSED; - create_event(cm_node, NES_CM_EVENT_ABORTED); -} - -/* - * passive_open_err() will either do a reset() or will free up the skb and - * remove the cm_node. 
- */ -static void passive_open_err(struct nes_cm_node *cm_node, struct sk_buff *skb, - int reset) -{ - cleanup_retrans_entry(cm_node); - cm_node->state = NES_CM_STATE_CLOSED; - if (reset) { - nes_debug(NES_DBG_CM, "passive_open_err sending RST for " - "cm_node=%p state =%d\n", cm_node, cm_node->state); - send_reset(cm_node, skb); - } else { - dev_kfree_skb_any(skb); - rem_ref_cm_node(cm_node->cm_core, cm_node); - } -} - -/* - * free_retrans_entry() routines assumes that the retrans_list_lock has - * been acquired before calling. - */ -static void free_retrans_entry(struct nes_cm_node *cm_node) -{ - struct nes_timer_entry *send_entry; - - send_entry = cm_node->send_entry; - if (send_entry) { - cm_node->send_entry = NULL; - dev_kfree_skb_any(send_entry->skb); - kfree(send_entry); - rem_ref_cm_node(cm_node->cm_core, cm_node); - } -} - -static void cleanup_retrans_entry(struct nes_cm_node *cm_node) -{ - unsigned long flags; - - spin_lock_irqsave(&cm_node->retrans_list_lock, flags); - free_retrans_entry(cm_node); - spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); -} - -/** - * process_packet - * Returns skb if to be freed, else it will return NULL if already used.. - */ -static void process_packet(struct nes_cm_node *cm_node, struct sk_buff *skb, - struct nes_cm_core *cm_core) -{ - enum nes_tcpip_pkt_type pkt_type = NES_PKT_TYPE_UNKNOWN; - struct tcphdr *tcph = tcp_hdr(skb); - u32 fin_set = 0; - int ret = 0; - - skb_pull(skb, ip_hdr(skb)->ihl << 2); - - nes_debug(NES_DBG_CM, "process_packet: cm_node=%p state =%d syn=%d " - "ack=%d rst=%d fin=%d\n", cm_node, cm_node->state, tcph->syn, - tcph->ack, tcph->rst, tcph->fin); - - if (tcph->rst) { - pkt_type = NES_PKT_TYPE_RST; - } else if (tcph->syn) { - pkt_type = NES_PKT_TYPE_SYN; - if (tcph->ack) - pkt_type = NES_PKT_TYPE_SYNACK; - } else if (tcph->ack) { - pkt_type = NES_PKT_TYPE_ACK; - } - if (tcph->fin) - fin_set = 1; - - switch (pkt_type) { - case NES_PKT_TYPE_SYN: - handle_syn_pkt(cm_node, skb, tcph); - break; - case NES_PKT_TYPE_SYNACK: - handle_synack_pkt(cm_node, skb, tcph); - break; - case NES_PKT_TYPE_ACK: - ret = handle_ack_pkt(cm_node, skb, tcph); - if (fin_set && !ret) - handle_fin_pkt(cm_node); - break; - case NES_PKT_TYPE_RST: - handle_rst_pkt(cm_node, skb, tcph); - break; - default: - if ((fin_set) && (!check_seq(cm_node, tcph, skb))) - handle_fin_pkt(cm_node); - drop_packet(skb); - break; - } -} - -/** - * mini_cm_listen - create a listen node with params - */ -static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core, - struct nes_vnic *nesvnic, struct nes_cm_info *cm_info) -{ - struct nes_cm_listener *listener; - unsigned long flags; - - nes_debug(NES_DBG_CM, "Search for 0x%08x : 0x%04x\n", - cm_info->loc_addr, cm_info->loc_port); - - /* cannot have multiple matching listeners */ - listener = find_listener(cm_core, cm_info->loc_addr, cm_info->loc_port, - NES_CM_LISTENER_EITHER_STATE); - - if (listener && listener->listener_state == NES_CM_LISTENER_ACTIVE_STATE) { - /* find automatically incs ref count ??? 
*/ - atomic_dec(&listener->ref_count); - nes_debug(NES_DBG_CM, "Not creating listener since it already exists\n"); - return NULL; - } - - if (!listener) { - /* create a CM listen node (1/2 node to compare incoming traffic to) */ - listener = kzalloc(sizeof(*listener), GFP_ATOMIC); - if (!listener) - return NULL; - - listener->loc_addr = cm_info->loc_addr; - listener->loc_port = cm_info->loc_port; - listener->reused_node = 0; - - atomic_set(&listener->ref_count, 1); - } - /* pasive case */ - /* find already inc'ed the ref count */ - else { - listener->reused_node = 1; - } - - listener->cm_id = cm_info->cm_id; - atomic_set(&listener->pend_accepts_cnt, 0); - listener->cm_core = cm_core; - listener->nesvnic = nesvnic; - atomic_inc(&cm_core->node_cnt); - - listener->conn_type = cm_info->conn_type; - listener->backlog = cm_info->backlog; - listener->listener_state = NES_CM_LISTENER_ACTIVE_STATE; - - if (!listener->reused_node) { - spin_lock_irqsave(&cm_core->listen_list_lock, flags); - list_add(&listener->list, &cm_core->listen_list.list); - spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); - atomic_inc(&cm_core->listen_node_cnt); - } - - nes_debug(NES_DBG_CM, "Api - listen(): addr=0x%08X, port=0x%04x," - " listener = %p, backlog = %d, cm_id = %p.\n", - cm_info->loc_addr, cm_info->loc_port, - listener, listener->backlog, listener->cm_id); - - return listener; -} - - -/** - * mini_cm_connect - make a connection node with params - */ -static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core, - struct nes_vnic *nesvnic, u16 private_data_len, - void *private_data, struct nes_cm_info *cm_info) -{ - int ret = 0; - struct nes_cm_node *cm_node; - struct nes_cm_listener *loopbackremotelistener; - struct nes_cm_node *loopbackremotenode; - struct nes_cm_info loopback_cm_info; - u8 *start_buff; - - /* create a CM connection node */ - cm_node = make_cm_node(cm_core, nesvnic, cm_info, NULL); - if (!cm_node) - return NULL; - - /* set our node side to client (active) side */ - cm_node->tcp_cntxt.client = 1; - cm_node->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE; - - if (cm_info->loc_addr == cm_info->rem_addr) { - loopbackremotelistener = find_listener(cm_core, - cm_node->loc_addr, cm_node->rem_port, - NES_CM_LISTENER_ACTIVE_STATE); - if (loopbackremotelistener == NULL) { - create_event(cm_node, NES_CM_EVENT_ABORTED); - } else { - loopback_cm_info = *cm_info; - loopback_cm_info.loc_port = cm_info->rem_port; - loopback_cm_info.rem_port = cm_info->loc_port; - loopback_cm_info.loc_port = - cm_info->rem_port; - loopback_cm_info.rem_port = - cm_info->loc_port; - loopback_cm_info.cm_id = loopbackremotelistener->cm_id; - loopbackremotenode = make_cm_node(cm_core, nesvnic, - &loopback_cm_info, loopbackremotelistener); - if (!loopbackremotenode) { - rem_ref_cm_node(cm_node->cm_core, cm_node); - return NULL; - } - atomic_inc(&cm_loopbacks); - loopbackremotenode->loopbackpartner = cm_node; - loopbackremotenode->tcp_cntxt.rcv_wscale = - NES_CM_DEFAULT_RCV_WND_SCALE; - cm_node->loopbackpartner = loopbackremotenode; - memcpy(loopbackremotenode->mpa_frame_buf, private_data, - private_data_len); - loopbackremotenode->mpa_frame_size = private_data_len; - - /* we are done handling this state. 
*/ - /* set node to a TSA state */ - cm_node->state = NES_CM_STATE_TSA; - cm_node->tcp_cntxt.rcv_nxt = - loopbackremotenode->tcp_cntxt.loc_seq_num; - loopbackremotenode->tcp_cntxt.rcv_nxt = - cm_node->tcp_cntxt.loc_seq_num; - cm_node->tcp_cntxt.max_snd_wnd = - loopbackremotenode->tcp_cntxt.rcv_wnd; - loopbackremotenode->tcp_cntxt.max_snd_wnd = - cm_node->tcp_cntxt.rcv_wnd; - cm_node->tcp_cntxt.snd_wnd = - loopbackremotenode->tcp_cntxt.rcv_wnd; - loopbackremotenode->tcp_cntxt.snd_wnd = - cm_node->tcp_cntxt.rcv_wnd; - cm_node->tcp_cntxt.snd_wscale = - loopbackremotenode->tcp_cntxt.rcv_wscale; - loopbackremotenode->tcp_cntxt.snd_wscale = - cm_node->tcp_cntxt.rcv_wscale; - loopbackremotenode->state = NES_CM_STATE_MPAREQ_RCVD; - create_event(loopbackremotenode, NES_CM_EVENT_MPA_REQ); - } - return cm_node; - } - - start_buff = &cm_node->mpa_frame_buf[0] + sizeof(struct ietf_mpa_v2); - cm_node->mpa_frame_size = private_data_len; - - memcpy(start_buff, private_data, private_data_len); - - /* send a syn and goto syn sent state */ - cm_node->state = NES_CM_STATE_SYN_SENT; - ret = send_syn(cm_node, 0, NULL); - - if (ret) { - /* error in sending the syn free up the cm_node struct */ - nes_debug(NES_DBG_CM, "Api - connect() FAILED: dest " - "addr=0x%08X, port=0x%04x, cm_node=%p, cm_id = %p.\n", - cm_node->rem_addr, cm_node->rem_port, cm_node, - cm_node->cm_id); - rem_ref_cm_node(cm_node->cm_core, cm_node); - cm_node = NULL; - } - - if (cm_node) { - nes_debug(NES_DBG_CM, "Api - connect(): dest addr=0x%08X," - "port=0x%04x, cm_node=%p, cm_id = %p.\n", - cm_node->rem_addr, cm_node->rem_port, cm_node, - cm_node->cm_id); - } - - return cm_node; -} - - -/** - * mini_cm_accept - accept a connection - * This function is never called - */ -static int mini_cm_accept(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) -{ - return 0; -} - - -/** - * mini_cm_reject - reject and teardown a connection - */ -static int mini_cm_reject(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) -{ - int ret = 0; - int err = 0; - int passive_state; - struct nes_cm_event event; - struct iw_cm_id *cm_id = cm_node->cm_id; - struct nes_cm_node *loopback = cm_node->loopbackpartner; - - nes_debug(NES_DBG_CM, "%s cm_node=%p type=%d state=%d\n", - __func__, cm_node, cm_node->tcp_cntxt.client, cm_node->state); - - if (cm_node->tcp_cntxt.client) - return ret; - cleanup_retrans_entry(cm_node); - - if (!loopback) { - passive_state = atomic_add_return(1, &cm_node->passive_state); - if (passive_state == NES_SEND_RESET_EVENT) { - cm_node->state = NES_CM_STATE_CLOSED; - rem_ref_cm_node(cm_core, cm_node); - } else { - if (cm_node->state == NES_CM_STATE_LISTENER_DESTROYED) { - rem_ref_cm_node(cm_core, cm_node); - } else { - ret = send_mpa_reject(cm_node); - if (ret) { - cm_node->state = NES_CM_STATE_CLOSED; - err = send_reset(cm_node, NULL); - if (err) - WARN_ON(1); - } else { - cm_id->add_ref(cm_id); - } - } - } - } else { - cm_node->cm_id = NULL; - if (cm_node->state == NES_CM_STATE_LISTENER_DESTROYED) { - rem_ref_cm_node(cm_core, cm_node); - rem_ref_cm_node(cm_core, loopback); - } else { - event.cm_node = loopback; - event.cm_info.rem_addr = loopback->rem_addr; - event.cm_info.loc_addr = loopback->loc_addr; - event.cm_info.rem_port = loopback->rem_port; - event.cm_info.loc_port = loopback->loc_port; - event.cm_info.cm_id = loopback->cm_id; - cm_event_mpa_reject(&event); - rem_ref_cm_node(cm_core, cm_node); - loopback->state = NES_CM_STATE_CLOSING; - - cm_id = loopback->cm_id; - rem_ref_cm_node(cm_core, loopback); - 
cm_id->rem_ref(cm_id); - } - } - - return ret; -} - - -/** - * mini_cm_close - */ -static int mini_cm_close(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) -{ - int ret = 0; - - if (!cm_core || !cm_node) - return -EINVAL; - - switch (cm_node->state) { - case NES_CM_STATE_SYN_RCVD: - case NES_CM_STATE_SYN_SENT: - case NES_CM_STATE_ONE_SIDE_ESTABLISHED: - case NES_CM_STATE_ESTABLISHED: - case NES_CM_STATE_ACCEPTING: - case NES_CM_STATE_MPAREQ_SENT: - case NES_CM_STATE_MPAREQ_RCVD: - cleanup_retrans_entry(cm_node); - send_reset(cm_node, NULL); - break; - case NES_CM_STATE_CLOSE_WAIT: - cm_node->state = NES_CM_STATE_LAST_ACK; - send_fin(cm_node, NULL); - break; - case NES_CM_STATE_FIN_WAIT1: - case NES_CM_STATE_FIN_WAIT2: - case NES_CM_STATE_LAST_ACK: - case NES_CM_STATE_TIME_WAIT: - case NES_CM_STATE_CLOSING: - ret = -1; - break; - case NES_CM_STATE_LISTENING: - cleanup_retrans_entry(cm_node); - send_reset(cm_node, NULL); - break; - case NES_CM_STATE_MPAREJ_RCVD: - case NES_CM_STATE_UNKNOWN: - case NES_CM_STATE_INITED: - case NES_CM_STATE_CLOSED: - case NES_CM_STATE_LISTENER_DESTROYED: - ret = rem_ref_cm_node(cm_core, cm_node); - break; - case NES_CM_STATE_TSA: - if (cm_node->send_entry) - printk(KERN_ERR "ERROR Close got called from STATE_TSA " - "send_entry=%p\n", cm_node->send_entry); - ret = rem_ref_cm_node(cm_core, cm_node); - break; - } - return ret; -} - - -/** - * recv_pkt - recv an ETHERNET packet, and process it through CM - * node state machine - */ -static int mini_cm_recv_pkt(struct nes_cm_core *cm_core, - struct nes_vnic *nesvnic, struct sk_buff *skb) -{ - struct nes_cm_node *cm_node = NULL; - struct nes_cm_listener *listener = NULL; - struct iphdr *iph; - struct tcphdr *tcph; - struct nes_cm_info nfo; - int skb_handled = 1; - __be32 tmp_daddr, tmp_saddr; - - if (!skb) - return 0; - if (skb->len < sizeof(struct iphdr) + sizeof(struct tcphdr)) - return 0; - - iph = (struct iphdr *)skb->data; - tcph = (struct tcphdr *)(skb->data + sizeof(struct iphdr)); - - nfo.loc_addr = ntohl(iph->daddr); - nfo.loc_port = ntohs(tcph->dest); - nfo.rem_addr = ntohl(iph->saddr); - nfo.rem_port = ntohs(tcph->source); - - tmp_daddr = cpu_to_be32(iph->daddr); - tmp_saddr = cpu_to_be32(iph->saddr); - - nes_debug(NES_DBG_CM, "Received packet: dest=%pI4:0x%04X src=%pI4:0x%04X\n", - &tmp_daddr, tcph->dest, &tmp_saddr, tcph->source); - - do { - cm_node = find_node(cm_core, - nfo.rem_port, nfo.rem_addr, - nfo.loc_port, nfo.loc_addr); - - if (!cm_node) { - /* Only type of packet accepted are for */ - /* the PASSIVE open (syn only) */ - if ((!tcph->syn) || (tcph->ack)) { - skb_handled = 0; - break; - } - listener = find_listener(cm_core, nfo.loc_addr, - nfo.loc_port, - NES_CM_LISTENER_ACTIVE_STATE); - if (!listener) { - nfo.cm_id = NULL; - nfo.conn_type = 0; - nes_debug(NES_DBG_CM, "Unable to find listener for the pkt\n"); - skb_handled = 0; - break; - } - nfo.cm_id = listener->cm_id; - nfo.conn_type = listener->conn_type; - cm_node = make_cm_node(cm_core, nesvnic, &nfo, - listener); - if (!cm_node) { - nes_debug(NES_DBG_CM, "Unable to allocate " - "node\n"); - cm_packets_dropped++; - atomic_dec(&listener->ref_count); - dev_kfree_skb_any(skb); - break; - } - if (!tcph->rst && !tcph->fin) { - cm_node->state = NES_CM_STATE_LISTENING; - } else { - cm_packets_dropped++; - rem_ref_cm_node(cm_core, cm_node); - dev_kfree_skb_any(skb); - break; - } - add_ref_cm_node(cm_node); - } else if (cm_node->state == NES_CM_STATE_TSA) { - if (cm_node->nesqp->pau_mode) - nes_queue_mgt_skbs(skb, nesvnic, 
cm_node->nesqp); - else { - rem_ref_cm_node(cm_core, cm_node); - atomic_inc(&cm_accel_dropped_pkts); - dev_kfree_skb_any(skb); - } - break; - } - skb_reset_network_header(skb); - skb_set_transport_header(skb, sizeof(*tcph)); - skb->len = ntohs(iph->tot_len); - process_packet(cm_node, skb, cm_core); - rem_ref_cm_node(cm_core, cm_node); - } while (0); - return skb_handled; -} - - -/** - * nes_cm_alloc_core - allocate a top level instance of a cm core - */ -static struct nes_cm_core *nes_cm_alloc_core(void) -{ - struct nes_cm_core *cm_core; - - /* setup the CM core */ - /* alloc top level core control structure */ - cm_core = kzalloc(sizeof(*cm_core), GFP_KERNEL); - if (!cm_core) - return NULL; - - INIT_LIST_HEAD(&cm_core->connected_nodes); - timer_setup(&cm_core->tcp_timer, nes_cm_timer_tick, 0); - - cm_core->mtu = NES_CM_DEFAULT_MTU; - cm_core->state = NES_CM_STATE_INITED; - cm_core->free_tx_pkt_max = NES_CM_DEFAULT_FREE_PKTS; - - atomic_set(&cm_core->events_posted, 0); - - cm_core->api = &nes_cm_api; - - spin_lock_init(&cm_core->ht_lock); - spin_lock_init(&cm_core->listen_list_lock); - - INIT_LIST_HEAD(&cm_core->listen_list.list); - - nes_debug(NES_DBG_CM, "Init CM Core completed -- cm_core=%p\n", cm_core); - - nes_debug(NES_DBG_CM, "Enable QUEUE EVENTS\n"); - cm_core->event_wq = alloc_ordered_workqueue("nesewq", 0); - if (!cm_core->event_wq) - goto out_free_cmcore; - cm_core->post_event = nes_cm_post_event; - nes_debug(NES_DBG_CM, "Enable QUEUE DISCONNECTS\n"); - cm_core->disconn_wq = alloc_ordered_workqueue("nesdwq", 0); - if (!cm_core->disconn_wq) - goto out_free_wq; - - print_core(cm_core); - return cm_core; - -out_free_wq: - destroy_workqueue(cm_core->event_wq); -out_free_cmcore: - kfree(cm_core); - return NULL; -} - - -/** - * mini_cm_dealloc_core - deallocate a top level instance of a cm core - */ -static int mini_cm_dealloc_core(struct nes_cm_core *cm_core) -{ - nes_debug(NES_DBG_CM, "De-Alloc CM Core (%p)\n", cm_core); - - if (!cm_core) - return -EINVAL; - - barrier(); - - if (timer_pending(&cm_core->tcp_timer)) - del_timer(&cm_core->tcp_timer); - - destroy_workqueue(cm_core->event_wq); - destroy_workqueue(cm_core->disconn_wq); - nes_debug(NES_DBG_CM, "\n"); - kfree(cm_core); - - return 0; -} - - -/** - * mini_cm_get - */ -static int mini_cm_get(struct nes_cm_core *cm_core) -{ - return cm_core->state; -} - - -/** - * mini_cm_set - */ -static int mini_cm_set(struct nes_cm_core *cm_core, u32 type, u32 value) -{ - int ret = 0; - - switch (type) { - case NES_CM_SET_PKT_SIZE: - cm_core->mtu = value; - break; - case NES_CM_SET_FREE_PKT_Q_SIZE: - cm_core->free_tx_pkt_max = value; - break; - default: - /* unknown set option */ - ret = -EINVAL; - } - - return ret; -} - - -/** - * nes_cm_init_tsa_conn setup HW; MPA frames must be - * successfully exchanged when this is called - */ -static int nes_cm_init_tsa_conn(struct nes_qp *nesqp, struct nes_cm_node *cm_node) -{ - int ret = 0; - - if (!nesqp) - return -EINVAL; - - nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_IPV4 | - NES_QPCONTEXT_MISC_NO_NAGLE | NES_QPCONTEXT_MISC_DO_NOT_FRAG | - NES_QPCONTEXT_MISC_DROS); - - if (cm_node->tcp_cntxt.snd_wscale || cm_node->tcp_cntxt.rcv_wscale) - nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_WSCALE); - - nesqp->nesqp_context->misc2 |= cpu_to_le32(64 << NES_QPCONTEXT_MISC2_TTL_SHIFT); - - nesqp->nesqp_context->misc2 |= cpu_to_le32( - cm_node->tos << NES_QPCONTEXT_MISC2_TOS_SHIFT); - - nesqp->nesqp_context->mss |= cpu_to_le32(((u32)cm_node->tcp_cntxt.mss) << 16); - - 
nesqp->nesqp_context->tcp_state_flow_label |= cpu_to_le32( - (u32)NES_QPCONTEXT_TCPSTATE_EST << NES_QPCONTEXT_TCPFLOW_TCP_STATE_SHIFT); - - nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32( - (cm_node->tcp_cntxt.snd_wscale << NES_QPCONTEXT_PDWSCALE_SND_WSCALE_SHIFT) & - NES_QPCONTEXT_PDWSCALE_SND_WSCALE_MASK); - - nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32( - (cm_node->tcp_cntxt.rcv_wscale << NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_SHIFT) & - NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_MASK); - - nesqp->nesqp_context->keepalive = cpu_to_le32(0x80); - nesqp->nesqp_context->ts_recent = 0; - nesqp->nesqp_context->ts_age = 0; - nesqp->nesqp_context->snd_nxt = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); - nesqp->nesqp_context->snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.snd_wnd); - nesqp->nesqp_context->rcv_nxt = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt); - nesqp->nesqp_context->rcv_wnd = cpu_to_le32(cm_node->tcp_cntxt.rcv_wnd << - cm_node->tcp_cntxt.rcv_wscale); - nesqp->nesqp_context->snd_max = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); - nesqp->nesqp_context->snd_una = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); - nesqp->nesqp_context->srtt = 0; - nesqp->nesqp_context->rttvar = cpu_to_le32(0x6); - nesqp->nesqp_context->ssthresh = cpu_to_le32(0x3FFFC000); - nesqp->nesqp_context->cwnd = cpu_to_le32(2 * cm_node->tcp_cntxt.mss); - nesqp->nesqp_context->snd_wl1 = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt); - nesqp->nesqp_context->snd_wl2 = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); - nesqp->nesqp_context->max_snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.max_snd_wnd); - - nes_debug(NES_DBG_CM, "QP%u: rcv_nxt = 0x%08X, snd_nxt = 0x%08X," - " Setting MSS to %u, PDWscale = 0x%08X, rcv_wnd = %u, context misc = 0x%08X.\n", - nesqp->hwqp.qp_id, le32_to_cpu(nesqp->nesqp_context->rcv_nxt), - le32_to_cpu(nesqp->nesqp_context->snd_nxt), - cm_node->tcp_cntxt.mss, le32_to_cpu(nesqp->nesqp_context->pd_index_wscale), - le32_to_cpu(nesqp->nesqp_context->rcv_wnd), - le32_to_cpu(nesqp->nesqp_context->misc)); - nes_debug(NES_DBG_CM, " snd_wnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->snd_wnd)); - nes_debug(NES_DBG_CM, " snd_cwnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->cwnd)); - nes_debug(NES_DBG_CM, " max_swnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->max_snd_wnd)); - - nes_debug(NES_DBG_CM, "Change cm_node state to TSA\n"); - cm_node->state = NES_CM_STATE_TSA; - - return ret; -} - - -/** - * nes_cm_disconn - */ -int nes_cm_disconn(struct nes_qp *nesqp) -{ - struct disconn_work *work; - - work = kzalloc(sizeof *work, GFP_ATOMIC); - if (!work) - return -ENOMEM; /* Timer will clean up */ - - nes_add_ref(&nesqp->ibqp); - work->nesqp = nesqp; - INIT_WORK(&work->work, nes_disconnect_worker); - queue_work(g_cm_core->disconn_wq, &work->work); - return 0; -} - - -/** - * nes_disconnect_worker - */ -static void nes_disconnect_worker(struct work_struct *work) -{ - struct disconn_work *dwork = container_of(work, struct disconn_work, work); - struct nes_qp *nesqp = dwork->nesqp; - - kfree(dwork); - nes_debug(NES_DBG_CM, "processing AEQE id 0x%04X for QP%u.\n", - nesqp->last_aeq, nesqp->hwqp.qp_id); - nes_cm_disconn_true(nesqp); - nes_rem_ref(&nesqp->ibqp); -} - - -/** - * nes_cm_disconn_true - */ -static int nes_cm_disconn_true(struct nes_qp *nesqp) -{ - unsigned long flags; - int ret = 0; - struct iw_cm_id *cm_id; - struct iw_cm_event cm_event; - struct nes_vnic *nesvnic; - u16 last_ae; - u8 original_hw_tcp_state; - u8 original_ibqp_state; - int disconn_status = 0; - int issue_disconn = 0; - int issue_close = 0; - 
int issue_flush = 0; - u32 flush_q = NES_CQP_FLUSH_RQ; - struct ib_event ibevent; - - if (!nesqp) { - nes_debug(NES_DBG_CM, "disconnect_worker nesqp is NULL\n"); - return -1; - } - - spin_lock_irqsave(&nesqp->lock, flags); - cm_id = nesqp->cm_id; - /* make sure we havent already closed this connection */ - if (!cm_id) { - nes_debug(NES_DBG_CM, "QP%u disconnect_worker cmid is NULL\n", - nesqp->hwqp.qp_id); - spin_unlock_irqrestore(&nesqp->lock, flags); - return -1; - } - - nesvnic = to_nesvnic(nesqp->ibqp.device); - nes_debug(NES_DBG_CM, "Disconnecting QP%u\n", nesqp->hwqp.qp_id); - - original_hw_tcp_state = nesqp->hw_tcp_state; - original_ibqp_state = nesqp->ibqp_state; - last_ae = nesqp->last_aeq; - - if (nesqp->term_flags) { - issue_disconn = 1; - issue_close = 1; - nesqp->cm_id = NULL; - del_timer(&nesqp->terminate_timer); - if (nesqp->flush_issued == 0) { - nesqp->flush_issued = 1; - issue_flush = 1; - } - } else if ((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) || - ((original_ibqp_state == IB_QPS_RTS) && - (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { - issue_disconn = 1; - if (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET) - disconn_status = -ECONNRESET; - } - - if (((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSED) || - (original_hw_tcp_state == NES_AEQE_TCP_STATE_TIME_WAIT) || - (last_ae == NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) || - (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { - issue_close = 1; - nesqp->cm_id = NULL; - if (nesqp->flush_issued == 0) { - nesqp->flush_issued = 1; - issue_flush = 1; - } - } - - spin_unlock_irqrestore(&nesqp->lock, flags); - - if ((issue_flush) && (nesqp->destroyed == 0)) { - /* Flush the queue(s) */ - if (nesqp->hw_iwarp_state >= NES_AEQE_IWARP_STATE_TERMINATE) - flush_q |= NES_CQP_FLUSH_SQ; - flush_wqes(nesvnic->nesdev, nesqp, flush_q, 1); - - if (nesqp->term_flags) { - ibevent.device = nesqp->ibqp.device; - ibevent.event = nesqp->terminate_eventtype; - ibevent.element.qp = &nesqp->ibqp; - if (nesqp->ibqp.event_handler) - nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); - } - } - - if ((cm_id) && (cm_id->event_handler)) { - if (issue_disconn) { - atomic_inc(&cm_disconnects); - cm_event.event = IW_CM_EVENT_DISCONNECT; - cm_event.status = disconn_status; - cm_event.local_addr = cm_id->m_local_addr; - cm_event.remote_addr = cm_id->m_remote_addr; - cm_event.private_data = NULL; - cm_event.private_data_len = 0; - - nes_debug(NES_DBG_CM, "Generating a CM Disconnect Event" - " for QP%u, SQ Head = %u, SQ Tail = %u. 
" - "cm_id = %p, refcount = %u.\n", - nesqp->hwqp.qp_id, nesqp->hwqp.sq_head, - nesqp->hwqp.sq_tail, cm_id, - atomic_read(&nesqp->refcount)); - - ret = cm_id->event_handler(cm_id, &cm_event); - if (ret) - nes_debug(NES_DBG_CM, "OFA CM event_handler " - "returned, ret=%d\n", ret); - } - - if (issue_close) { - atomic_inc(&cm_closes); - nes_disconnect(nesqp, 1); - - cm_id->provider_data = nesqp; - /* Send up the close complete event */ - cm_event.event = IW_CM_EVENT_CLOSE; - cm_event.status = 0; - cm_event.provider_data = cm_id->provider_data; - cm_event.local_addr = cm_id->m_local_addr; - cm_event.remote_addr = cm_id->m_remote_addr; - cm_event.private_data = NULL; - cm_event.private_data_len = 0; - - ret = cm_id->event_handler(cm_id, &cm_event); - if (ret) - nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); - - cm_id->rem_ref(cm_id); - } - } - - return 0; -} - - -/** - * nes_disconnect - */ -static int nes_disconnect(struct nes_qp *nesqp, int abrupt) -{ - int ret = 0; - struct nes_vnic *nesvnic; - struct nes_device *nesdev; - struct nes_ib_device *nesibdev; - - nesvnic = to_nesvnic(nesqp->ibqp.device); - if (!nesvnic) - return -EINVAL; - - nesdev = nesvnic->nesdev; - nesibdev = nesvnic->nesibdev; - - nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n", - netdev_refcnt_read(nesvnic->netdev)); - - if (nesqp->active_conn) { - - /* indicate this connection is NOT active */ - nesqp->active_conn = 0; - } else { - /* Need to free the Last Streaming Mode Message */ - if (nesqp->ietf_frame) { - if (nesqp->lsmm_mr) - nesibdev->ibdev.ops.dereg_mr(nesqp->lsmm_mr, - NULL); - pci_free_consistent(nesdev->pcidev, - nesqp->private_data_len + nesqp->ietf_frame_size, - nesqp->ietf_frame, nesqp->ietf_frame_pbase); - } - } - - /* close the CM node down if it is still active */ - if (nesqp->cm_node) { - nes_debug(NES_DBG_CM, "Call close API\n"); - - g_cm_core->api->close(g_cm_core, nesqp->cm_node); - } - - return ret; -} - - -/** - * nes_accept - */ -int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) -{ - u64 u64temp; - struct ib_qp *ibqp; - struct nes_qp *nesqp; - struct nes_vnic *nesvnic; - struct nes_device *nesdev; - struct nes_cm_node *cm_node; - struct nes_adapter *adapter; - struct ib_qp_attr attr; - struct iw_cm_event cm_event; - struct nes_hw_qp_wqe *wqe; - struct nes_v4_quad nes_quad; - u32 crc_value; - int ret; - int passive_state; - struct ib_mr *ibmr = NULL; - struct nes_pd *nespd; - u64 tagged_offset; - u8 mpa_frame_offset = 0; - struct ietf_mpa_v2 *mpa_v2_frame; - u8 start_addr = 0; - u8 *start_ptr = &start_addr; - u8 **start_buff = &start_ptr; - u16 buff_len = 0; - struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr; - struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->m_remote_addr; - - ibqp = nes_get_qp(cm_id->device, conn_param->qpn); - if (!ibqp) - return -EINVAL; - - /* get all our handles */ - nesqp = to_nesqp(ibqp); - nesvnic = to_nesvnic(nesqp->ibqp.device); - nesdev = nesvnic->nesdev; - adapter = nesdev->nesadapter; - - cm_node = (struct nes_cm_node *)cm_id->provider_data; - nes_debug(NES_DBG_CM, "nes_accept: cm_node= %p nesvnic=%p, netdev=%p," - "%s\n", cm_node, nesvnic, nesvnic->netdev, - nesvnic->netdev->name); - - if (NES_CM_STATE_LISTENER_DESTROYED == cm_node->state) { - if (cm_node->loopbackpartner) - rem_ref_cm_node(cm_node->cm_core, cm_node->loopbackpartner); - rem_ref_cm_node(cm_node->cm_core, cm_node); - return -EINVAL; - } - - passive_state = atomic_add_return(1, &cm_node->passive_state); - if (passive_state == 
NES_SEND_RESET_EVENT) { - rem_ref_cm_node(cm_node->cm_core, cm_node); - return -ECONNRESET; - } - /* associate the node with the QP */ - nesqp->cm_node = (void *)cm_node; - cm_node->nesqp = nesqp; - - - nes_debug(NES_DBG_CM, "QP%u, cm_node=%p, jiffies = %lu listener = %p\n", - nesqp->hwqp.qp_id, cm_node, jiffies, cm_node->listener); - atomic_inc(&cm_accepts); - - nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n", - netdev_refcnt_read(nesvnic->netdev)); - - nesqp->ietf_frame_size = sizeof(struct ietf_mpa_v2); - /* allocate the ietf frame and space for private data */ - nesqp->ietf_frame = pci_alloc_consistent(nesdev->pcidev, - nesqp->ietf_frame_size + conn_param->private_data_len, - &nesqp->ietf_frame_pbase); - - if (!nesqp->ietf_frame) { - nes_debug(NES_DBG_CM, "Unable to allocate memory for private data\n"); - return -ENOMEM; - } - mpa_v2_frame = (struct ietf_mpa_v2 *)nesqp->ietf_frame; - - if (cm_node->mpa_frame_rev == IETF_MPA_V1) - mpa_frame_offset = 4; - - if (cm_node->mpa_frame_rev == IETF_MPA_V1 || - cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) { - record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord); - } - - memcpy(mpa_v2_frame->priv_data, conn_param->private_data, - conn_param->private_data_len); - - cm_build_mpa_frame(cm_node, start_buff, &buff_len, nesqp->ietf_frame, MPA_KEY_REPLY); - nesqp->private_data_len = conn_param->private_data_len; - - /* setup our first outgoing iWarp send WQE (the IETF frame response) */ - wqe = &nesqp->hwqp.sq_vbase[0]; - - if (raddr->sin_addr.s_addr != laddr->sin_addr.s_addr) { - u64temp = (unsigned long)nesqp; - nespd = nesqp->nespd; - tagged_offset = (u64)(unsigned long)*start_buff; - ibmr = nes_reg_phys_mr(&nespd->ibpd, - nesqp->ietf_frame_pbase + mpa_frame_offset, - buff_len, IB_ACCESS_LOCAL_WRITE, - &tagged_offset); - if (IS_ERR(ibmr)) { - nes_debug(NES_DBG_CM, "Unable to register memory region" - "for lSMM for cm_node = %p \n", - cm_node); - pci_free_consistent(nesdev->pcidev, - nesqp->private_data_len + nesqp->ietf_frame_size, - nesqp->ietf_frame, nesqp->ietf_frame_pbase); - return PTR_ERR(ibmr); - } - - ibmr->pd = &nespd->ibpd; - ibmr->device = nespd->ibpd.device; - nesqp->lsmm_mr = ibmr; - - u64temp |= NES_SW_CONTEXT_ALIGN >> 1; - set_wqe_64bit_value(wqe->wqe_words, - NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, - u64temp); - wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = - cpu_to_le32(NES_IWARP_SQ_WQE_STREAMING | - NES_IWARP_SQ_WQE_WRPDU); - wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX] = - cpu_to_le32(buff_len); - set_wqe_64bit_value(wqe->wqe_words, - NES_IWARP_SQ_WQE_FRAG0_LOW_IDX, - (u64)(unsigned long)(*start_buff)); - wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] = - cpu_to_le32(buff_len); - wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = ibmr->lkey; - if (nesqp->sq_kmapped) { - nesqp->sq_kmapped = 0; - kunmap(nesqp->page); - } - - nesqp->nesqp_context->ird_ord_sizes |= - cpu_to_le32(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT | - NES_QPCONTEXT_ORDIRD_WRPDU); - } else { - nesqp->nesqp_context->ird_ord_sizes |= - cpu_to_le32(NES_QPCONTEXT_ORDIRD_WRPDU); - } - nesqp->skip_lsmm = 1; - - /* Cache the cm_id in the qp */ - nesqp->cm_id = cm_id; - cm_node->cm_id = cm_id; - - /* nesqp->cm_node = (void *)cm_id->provider_data; */ - cm_id->provider_data = nesqp; - nesqp->active_conn = 0; - - if (cm_node->state == NES_CM_STATE_TSA) - nes_debug(NES_DBG_CM, "Already state = TSA for cm_node=%p\n", - cm_node); - - nes_cm_init_tsa_conn(nesqp, cm_node); - - nesqp->nesqp_context->tcpPorts[0] = - cpu_to_le16(cm_node->loc_port); - nesqp->nesqp_context->tcpPorts[1] = - 
cpu_to_le16(cm_node->rem_port); - - nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->rem_addr); - - nesqp->nesqp_context->misc2 |= cpu_to_le32( - (u32)PCI_FUNC(nesdev->pcidev->devfn) << - NES_QPCONTEXT_MISC2_SRC_IP_SHIFT); - - nesqp->nesqp_context->arp_index_vlan |= - cpu_to_le32(nes_arp_table(nesdev, - le32_to_cpu(nesqp->nesqp_context->ip0), NULL, - NES_ARP_RESOLVE) << 16); - - nesqp->nesqp_context->ts_val_delta = cpu_to_le32( - jiffies - nes_read_indexed(nesdev, NES_IDX_TCP_NOW)); - - nesqp->nesqp_context->ird_index = cpu_to_le32(nesqp->hwqp.qp_id); - - nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32( - ((u32)1 << NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT)); - nesqp->nesqp_context->ird_ord_sizes |= - cpu_to_le32((u32)cm_node->ord_size); - - memset(&nes_quad, 0, sizeof(nes_quad)); - nes_quad.DstIpAdrIndex = - cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); - nes_quad.SrcIpadr = htonl(cm_node->rem_addr); - nes_quad.TcpPorts[0] = htons(cm_node->rem_port); - nes_quad.TcpPorts[1] = htons(cm_node->loc_port); - - /* Produce hash key */ - crc_value = get_crc_value(&nes_quad); - nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff); - nes_debug(NES_DBG_CM, "HTE Index = 0x%08X, CRC = 0x%08X\n", - nesqp->hte_index, nesqp->hte_index & adapter->hte_index_mask); - - nesqp->hte_index &= adapter->hte_index_mask; - nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index); - - cm_node->cm_core->api->accelerated(cm_node->cm_core, cm_node); - - nes_debug(NES_DBG_CM, "QP%u, Destination IP = 0x%08X:0x%04X, local = " - "0x%08X:0x%04X, rcv_nxt=0x%08X, snd_nxt=0x%08X, mpa + " - "private data length=%u.\n", nesqp->hwqp.qp_id, - ntohl(raddr->sin_addr.s_addr), ntohs(raddr->sin_port), - ntohl(laddr->sin_addr.s_addr), ntohs(laddr->sin_port), - le32_to_cpu(nesqp->nesqp_context->rcv_nxt), - le32_to_cpu(nesqp->nesqp_context->snd_nxt), - buff_len); - - /* notify OF layer that accept event was successful */ - cm_id->add_ref(cm_id); - nes_add_ref(&nesqp->ibqp); - - cm_event.event = IW_CM_EVENT_ESTABLISHED; - cm_event.status = 0; - cm_event.provider_data = (void *)nesqp; - cm_event.local_addr = cm_id->m_local_addr; - cm_event.remote_addr = cm_id->m_remote_addr; - cm_event.private_data = NULL; - cm_event.private_data_len = 0; - cm_event.ird = cm_node->ird_size; - cm_event.ord = cm_node->ord_size; - - ret = cm_id->event_handler(cm_id, &cm_event); - attr.qp_state = IB_QPS_RTS; - nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL); - if (cm_node->loopbackpartner) { - cm_node->loopbackpartner->mpa_frame_size = - nesqp->private_data_len; - /* copy entire MPA frame to our cm_node's frame */ - memcpy(cm_node->loopbackpartner->mpa_frame_buf, - conn_param->private_data, conn_param->private_data_len); - create_event(cm_node->loopbackpartner, NES_CM_EVENT_CONNECTED); - } - if (ret) - printk(KERN_ERR "%s[%u] OFA CM event_handler returned, " - "ret=%d\n", __func__, __LINE__, ret); - - return 0; -} - - -/** - * nes_reject - */ -int nes_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) -{ - struct nes_cm_node *cm_node; - struct nes_cm_node *loopback; - struct nes_cm_core *cm_core; - u8 *start_buff; - - atomic_inc(&cm_rejects); - cm_node = (struct nes_cm_node *)cm_id->provider_data; - loopback = cm_node->loopbackpartner; - cm_core = cm_node->cm_core; - cm_node->cm_id = cm_id; - - if (pdata_len + sizeof(struct ietf_mpa_v2) > MAX_CM_BUFFER) - return -EINVAL; - - if (loopback) { - memcpy(&loopback->mpa_frame.priv_data, pdata, pdata_len); - loopback->mpa_frame.priv_data_len = pdata_len; - loopback->mpa_frame_size 
= pdata_len; - } else { - start_buff = &cm_node->mpa_frame_buf[0] + sizeof(struct ietf_mpa_v2); - cm_node->mpa_frame_size = pdata_len; - memcpy(start_buff, pdata, pdata_len); - } - return cm_core->api->reject(cm_core, cm_node); -} - - -/** - * nes_connect - * setup and launch cm connect node - */ -int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) -{ - struct ib_qp *ibqp; - struct nes_qp *nesqp; - struct nes_vnic *nesvnic; - struct nes_device *nesdev; - struct nes_cm_node *cm_node; - struct nes_cm_info cm_info; - int apbvt_set = 0; - struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr; - struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->m_remote_addr; - - if (cm_id->remote_addr.ss_family != AF_INET) - return -ENOSYS; - ibqp = nes_get_qp(cm_id->device, conn_param->qpn); - if (!ibqp) - return -EINVAL; - nesqp = to_nesqp(ibqp); - if (!nesqp) - return -EINVAL; - nesvnic = to_nesvnic(nesqp->ibqp.device); - if (!nesvnic) - return -EINVAL; - nesdev = nesvnic->nesdev; - if (!nesdev) - return -EINVAL; - - if (!laddr->sin_port || !raddr->sin_port) - return -EINVAL; - - nes_debug(NES_DBG_CM, "QP%u, current IP = 0x%08X, Destination IP = " - "0x%08X:0x%04X, local = 0x%08X:0x%04X.\n", nesqp->hwqp.qp_id, - ntohl(nesvnic->local_ipaddr), ntohl(raddr->sin_addr.s_addr), - ntohs(raddr->sin_port), ntohl(laddr->sin_addr.s_addr), - ntohs(laddr->sin_port)); - - atomic_inc(&cm_connects); - nesqp->active_conn = 1; - - /* cache the cm_id in the qp */ - nesqp->cm_id = cm_id; - cm_id->provider_data = nesqp; - nesqp->private_data_len = conn_param->private_data_len; - - nes_debug(NES_DBG_CM, "requested ord = 0x%08X.\n", (u32)conn_param->ord); - nes_debug(NES_DBG_CM, "mpa private data len =%u\n", - conn_param->private_data_len); - - /* set up the connection params for the node */ - cm_info.loc_addr = ntohl(laddr->sin_addr.s_addr); - cm_info.loc_port = ntohs(laddr->sin_port); - cm_info.rem_addr = ntohl(raddr->sin_addr.s_addr); - cm_info.rem_port = ntohs(raddr->sin_port); - cm_info.cm_id = cm_id; - cm_info.conn_type = NES_CM_IWARP_CONN_TYPE; - - if (laddr->sin_addr.s_addr != raddr->sin_addr.s_addr) { - nes_manage_apbvt(nesvnic, cm_info.loc_port, - PCI_FUNC(nesdev->pcidev->devfn), - NES_MANAGE_APBVT_ADD); - apbvt_set = 1; - } - - cm_id->add_ref(cm_id); - - /* create a connect CM node connection */ - cm_node = g_cm_core->api->connect(g_cm_core, nesvnic, - conn_param->private_data_len, (void *)conn_param->private_data, - &cm_info); - if (!cm_node) { - if (apbvt_set) - nes_manage_apbvt(nesvnic, cm_info.loc_port, - PCI_FUNC(nesdev->pcidev->devfn), - NES_MANAGE_APBVT_DEL); - - nes_debug(NES_DBG_NLMSG, "Delete loc_port = %04X\n", - cm_info.loc_port); - cm_id->rem_ref(cm_id); - return -ENOMEM; - } - - record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord); - if (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO && - cm_node->ord_size == 0) - cm_node->ord_size = 1; - - cm_node->apbvt_set = apbvt_set; - cm_node->tos = cm_id->tos; - nesqp->cm_node = cm_node; - cm_node->nesqp = nesqp; - nes_add_ref(&nesqp->ibqp); - - return 0; -} - - -/** - * nes_create_listen - */ -int nes_create_listen(struct iw_cm_id *cm_id, int backlog) -{ - struct nes_vnic *nesvnic; - struct nes_cm_listener *cm_node; - struct nes_cm_info cm_info; - int err; - struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr; - - nes_debug(NES_DBG_CM, "cm_id = %p, local port = 0x%04X.\n", - cm_id, ntohs(laddr->sin_port)); - - if (cm_id->m_local_addr.ss_family != AF_INET) - return -ENOSYS; - nesvnic 
= to_nesvnic(cm_id->device); - if (!nesvnic) - return -EINVAL; - - nes_debug(NES_DBG_CM, "nesvnic=%p, netdev=%p, %s\n", - nesvnic, nesvnic->netdev, nesvnic->netdev->name); - - nes_debug(NES_DBG_CM, "nesvnic->local_ipaddr=0x%08x, sin_addr.s_addr=0x%08x\n", - nesvnic->local_ipaddr, laddr->sin_addr.s_addr); - - /* setup listen params in our api call struct */ - cm_info.loc_addr = ntohl(nesvnic->local_ipaddr); - cm_info.loc_port = ntohs(laddr->sin_port); - cm_info.backlog = backlog; - cm_info.cm_id = cm_id; - - cm_info.conn_type = NES_CM_IWARP_CONN_TYPE; - - cm_node = g_cm_core->api->listen(g_cm_core, nesvnic, &cm_info); - if (!cm_node) { - printk(KERN_ERR "%s[%u] Error returned from listen API call\n", - __func__, __LINE__); - return -ENOMEM; - } - - cm_id->provider_data = cm_node; - cm_node->tos = cm_id->tos; - - if (!cm_node->reused_node) { - err = nes_manage_apbvt(nesvnic, cm_node->loc_port, - PCI_FUNC(nesvnic->nesdev->pcidev->devfn), - NES_MANAGE_APBVT_ADD); - if (err) { - printk(KERN_ERR "nes_manage_apbvt call returned %d.\n", - err); - g_cm_core->api->stop_listener(g_cm_core, (void *)cm_node); - return err; - } - atomic_inc(&cm_listens_created); - } - - cm_id->add_ref(cm_id); - cm_id->provider_data = (void *)cm_node; - - - return 0; -} - - -/** - * nes_destroy_listen - */ -int nes_destroy_listen(struct iw_cm_id *cm_id) -{ - if (cm_id->provider_data) - g_cm_core->api->stop_listener(g_cm_core, cm_id->provider_data); - else - nes_debug(NES_DBG_CM, "cm_id->provider_data was NULL\n"); - - cm_id->rem_ref(cm_id); - - return 0; -} - - -/** - * nes_cm_recv - */ -int nes_cm_recv(struct sk_buff *skb, struct net_device *netdevice) -{ - int rc = 0; - - cm_packets_received++; - if ((g_cm_core) && (g_cm_core->api)) - rc = g_cm_core->api->recv_pkt(g_cm_core, netdev_priv(netdevice), skb); - else - nes_debug(NES_DBG_CM, "Unable to process packet for CM," - " cm is not setup properly.\n"); - - return rc; -} - - -/** - * nes_cm_start - * Start and init a cm core module - */ -int nes_cm_start(void) -{ - nes_debug(NES_DBG_CM, "\n"); - /* create the primary CM core, pass this handle to subsequent core inits */ - g_cm_core = nes_cm_alloc_core(); - if (g_cm_core) - return 0; - else - return -ENOMEM; -} - - -/** - * nes_cm_stop - * stop and dealloc all cm core instances - */ -int nes_cm_stop(void) -{ - g_cm_core->api->destroy_cm_core(g_cm_core); - return 0; -} - - -/** - * cm_event_connected - * handle a connected event, setup QPs and HW - */ -static void cm_event_connected(struct nes_cm_event *event) -{ - struct nes_qp *nesqp; - struct nes_vnic *nesvnic; - struct nes_device *nesdev; - struct nes_cm_node *cm_node; - struct nes_adapter *nesadapter; - struct ib_qp_attr attr; - struct iw_cm_id *cm_id; - struct iw_cm_event cm_event; - struct nes_v4_quad nes_quad; - u32 crc_value; - int ret; - struct sockaddr_in *laddr; - struct sockaddr_in *raddr; - struct sockaddr_in *cm_event_laddr; - - /* get all our handles */ - cm_node = event->cm_node; - cm_id = cm_node->cm_id; - nes_debug(NES_DBG_CM, "cm_event_connected - %p - cm_id = %p\n", cm_node, cm_id); - nesqp = (struct nes_qp *)cm_id->provider_data; - nesvnic = to_nesvnic(nesqp->ibqp.device); - nesdev = nesvnic->nesdev; - nesadapter = nesdev->nesadapter; - laddr = (struct sockaddr_in *)&cm_id->m_local_addr; - raddr = (struct sockaddr_in *)&cm_id->m_remote_addr; - cm_event_laddr = (struct sockaddr_in *)&cm_event.local_addr; - - if (nesqp->destroyed) - return; - atomic_inc(&cm_connecteds); - nes_debug(NES_DBG_CM, "QP%u attempting to connect to 0x%08X:0x%04X on" - " local 
port 0x%04X. jiffies = %lu.\n", - nesqp->hwqp.qp_id, ntohl(raddr->sin_addr.s_addr), - ntohs(raddr->sin_port), ntohs(laddr->sin_port), jiffies); - - nes_cm_init_tsa_conn(nesqp, cm_node); - - /* set the QP tsa context */ - nesqp->nesqp_context->tcpPorts[0] = - cpu_to_le16(cm_node->loc_port); - nesqp->nesqp_context->tcpPorts[1] = - cpu_to_le16(cm_node->rem_port); - nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->rem_addr); - - nesqp->nesqp_context->misc2 |= cpu_to_le32( - (u32)PCI_FUNC(nesdev->pcidev->devfn) << - NES_QPCONTEXT_MISC2_SRC_IP_SHIFT); - nesqp->nesqp_context->arp_index_vlan |= cpu_to_le32( - nes_arp_table(nesdev, - le32_to_cpu(nesqp->nesqp_context->ip0), - NULL, NES_ARP_RESOLVE) << 16); - nesqp->nesqp_context->ts_val_delta = cpu_to_le32( - jiffies - nes_read_indexed(nesdev, NES_IDX_TCP_NOW)); - nesqp->nesqp_context->ird_index = cpu_to_le32(nesqp->hwqp.qp_id); - nesqp->nesqp_context->ird_ord_sizes |= - cpu_to_le32((u32)1 << - NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT); - nesqp->nesqp_context->ird_ord_sizes |= - cpu_to_le32((u32)cm_node->ord_size); - - /* Adjust tail for not having a LSMM */ - /*nesqp->hwqp.sq_tail = 1;*/ - - build_rdma0_msg(cm_node, &nesqp); - - nes_write32(nesdev->regs + NES_WQE_ALLOC, - (1 << 24) | 0x00800000 | nesqp->hwqp.qp_id); - - memset(&nes_quad, 0, sizeof(nes_quad)); - - nes_quad.DstIpAdrIndex = - cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); - nes_quad.SrcIpadr = htonl(cm_node->rem_addr); - nes_quad.TcpPorts[0] = htons(cm_node->rem_port); - nes_quad.TcpPorts[1] = htons(cm_node->loc_port); - - /* Produce hash key */ - crc_value = get_crc_value(&nes_quad); - nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff); - nes_debug(NES_DBG_CM, "HTE Index = 0x%08X, After CRC = 0x%08X\n", - nesqp->hte_index, nesqp->hte_index & nesadapter->hte_index_mask); - - nesqp->hte_index &= nesadapter->hte_index_mask; - nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index); - - nesqp->ietf_frame = &cm_node->mpa_frame; - nesqp->private_data_len = (u8)cm_node->mpa_frame_size; - cm_node->cm_core->api->accelerated(cm_node->cm_core, cm_node); - - /* notify OF layer we successfully created the requested connection */ - cm_event.event = IW_CM_EVENT_CONNECT_REPLY; - cm_event.status = 0; - cm_event.provider_data = cm_id->provider_data; - cm_event_laddr->sin_family = AF_INET; - cm_event_laddr->sin_port = laddr->sin_port; - cm_event.remote_addr = cm_id->m_remote_addr; - - cm_event.private_data = (void *)event->cm_node->mpa_frame_buf; - cm_event.private_data_len = (u8)event->cm_node->mpa_frame_size; - cm_event.ird = cm_node->ird_size; - cm_event.ord = cm_node->ord_size; - - cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.loc_addr); - ret = cm_id->event_handler(cm_id, &cm_event); - nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); - - if (ret) - printk(KERN_ERR "%s[%u] OFA CM event_handler returned, " - "ret=%d\n", __func__, __LINE__, ret); - attr.qp_state = IB_QPS_RTS; - nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL); - - nes_debug(NES_DBG_CM, "Exiting connect thread for QP%u. 
jiffies = " - "%lu\n", nesqp->hwqp.qp_id, jiffies); - - return; -} - - -/** - * cm_event_connect_error - */ -static void cm_event_connect_error(struct nes_cm_event *event) -{ - struct nes_qp *nesqp; - struct iw_cm_id *cm_id; - struct iw_cm_event cm_event; - /* struct nes_cm_info cm_info; */ - int ret; - - if (!event->cm_node) - return; - - cm_id = event->cm_node->cm_id; - if (!cm_id) - return; - - nes_debug(NES_DBG_CM, "cm_node=%p, cm_id=%p\n", event->cm_node, cm_id); - nesqp = cm_id->provider_data; - - if (!nesqp) - return; - - /* notify OF layer about this connection error event */ - /* cm_id->rem_ref(cm_id); */ - nesqp->cm_id = NULL; - cm_id->provider_data = NULL; - cm_event.event = IW_CM_EVENT_CONNECT_REPLY; - cm_event.status = -ECONNRESET; - cm_event.provider_data = cm_id->provider_data; - cm_event.local_addr = cm_id->m_local_addr; - cm_event.remote_addr = cm_id->m_remote_addr; - cm_event.private_data = NULL; - cm_event.private_data_len = 0; - -#ifdef CONFIG_INFINIBAND_NES_DEBUG - { - struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *) - &cm_event.local_addr; - struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *) - &cm_event.remote_addr; - nes_debug(NES_DBG_CM, "call CM_EVENT REJECTED, local_addr=%08x, remote_addr=%08x\n", - cm_event_laddr->sin_addr.s_addr, cm_event_raddr->sin_addr.s_addr); - } -#endif - - ret = cm_id->event_handler(cm_id, &cm_event); - nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); - if (ret) - printk(KERN_ERR "%s[%u] OFA CM event_handler returned, " - "ret=%d\n", __func__, __LINE__, ret); - cm_id->rem_ref(cm_id); - - rem_ref_cm_node(event->cm_node->cm_core, event->cm_node); - return; -} - - -/** - * cm_event_reset - */ -static void cm_event_reset(struct nes_cm_event *event) -{ - struct nes_qp *nesqp; - struct iw_cm_id *cm_id; - struct iw_cm_event cm_event; - /* struct nes_cm_info cm_info; */ - int ret; - - if (!event->cm_node) - return; - - if (!event->cm_node->cm_id) - return; - - cm_id = event->cm_node->cm_id; - - nes_debug(NES_DBG_CM, "%p - cm_id = %p\n", event->cm_node, cm_id); - nesqp = cm_id->provider_data; - if (!nesqp) - return; - - nesqp->cm_id = NULL; - /* cm_id->provider_data = NULL; */ - cm_event.event = IW_CM_EVENT_DISCONNECT; - cm_event.status = -ECONNRESET; - cm_event.provider_data = cm_id->provider_data; - cm_event.local_addr = cm_id->m_local_addr; - cm_event.remote_addr = cm_id->m_remote_addr; - cm_event.private_data = NULL; - cm_event.private_data_len = 0; - - cm_id->add_ref(cm_id); - ret = cm_id->event_handler(cm_id, &cm_event); - atomic_inc(&cm_closes); - cm_event.event = IW_CM_EVENT_CLOSE; - cm_event.status = 0; - cm_event.provider_data = cm_id->provider_data; - cm_event.local_addr = cm_id->m_local_addr; - cm_event.remote_addr = cm_id->m_remote_addr; - cm_event.private_data = NULL; - cm_event.private_data_len = 0; - nes_debug(NES_DBG_CM, "NODE %p Generating CLOSE\n", event->cm_node); - ret = cm_id->event_handler(cm_id, &cm_event); - - nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); - - - /* notify OF layer about this connection error event */ - cm_id->rem_ref(cm_id); - - return; -} - - -/** - * cm_event_mpa_req - */ -static void cm_event_mpa_req(struct nes_cm_event *event) -{ - struct iw_cm_id *cm_id; - struct iw_cm_event cm_event; - int ret; - struct nes_cm_node *cm_node; - struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *) - &cm_event.local_addr; - struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *) - &cm_event.remote_addr; - - cm_node = event->cm_node; - if 
(!cm_node) - return; - cm_id = cm_node->cm_id; - - atomic_inc(&cm_connect_reqs); - nes_debug(NES_DBG_CM, "cm_node = %p - cm_id = %p, jiffies = %lu\n", - cm_node, cm_id, jiffies); - - cm_event.event = IW_CM_EVENT_CONNECT_REQUEST; - cm_event.status = 0; - cm_event.provider_data = (void *)cm_node; - - cm_event_laddr->sin_family = AF_INET; - cm_event_laddr->sin_port = htons(event->cm_info.loc_port); - cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.loc_addr); - - cm_event_raddr->sin_family = AF_INET; - cm_event_raddr->sin_port = htons(event->cm_info.rem_port); - cm_event_raddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr); - cm_event.private_data = cm_node->mpa_frame_buf; - cm_event.private_data_len = (u8)cm_node->mpa_frame_size; - if (cm_node->mpa_frame_rev == IETF_MPA_V1) { - cm_event.ird = NES_MAX_IRD; - cm_event.ord = NES_MAX_ORD; - } else { - cm_event.ird = cm_node->ird_size; - cm_event.ord = cm_node->ord_size; - } - - ret = cm_id->event_handler(cm_id, &cm_event); - if (ret) - printk(KERN_ERR "%s[%u] OFA CM event_handler returned, ret=%d\n", - __func__, __LINE__, ret); - return; -} - - -static void cm_event_mpa_reject(struct nes_cm_event *event) -{ - struct iw_cm_id *cm_id; - struct iw_cm_event cm_event; - struct nes_cm_node *cm_node; - int ret; - struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *) - &cm_event.local_addr; - struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *) - &cm_event.remote_addr; - - cm_node = event->cm_node; - if (!cm_node) - return; - cm_id = cm_node->cm_id; - - atomic_inc(&cm_connect_reqs); - nes_debug(NES_DBG_CM, "cm_node = %p - cm_id = %p, jiffies = %lu\n", - cm_node, cm_id, jiffies); - - cm_event.event = IW_CM_EVENT_CONNECT_REPLY; - cm_event.status = -ECONNREFUSED; - cm_event.provider_data = cm_id->provider_data; - - cm_event_laddr->sin_family = AF_INET; - cm_event_laddr->sin_port = htons(event->cm_info.loc_port); - cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.loc_addr); - - cm_event_raddr->sin_family = AF_INET; - cm_event_raddr->sin_port = htons(event->cm_info.rem_port); - cm_event_raddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr); - - cm_event.private_data = cm_node->mpa_frame_buf; - cm_event.private_data_len = (u8)cm_node->mpa_frame_size; - - nes_debug(NES_DBG_CM, "call CM_EVENT_MPA_REJECTED, local_addr=%08x, " - "remove_addr=%08x\n", - cm_event_laddr->sin_addr.s_addr, - cm_event_raddr->sin_addr.s_addr); - - ret = cm_id->event_handler(cm_id, &cm_event); - if (ret) - printk(KERN_ERR "%s[%u] OFA CM event_handler returned, ret=%d\n", - __func__, __LINE__, ret); - - return; -} - - -static void nes_cm_event_handler(struct work_struct *); - -/** - * nes_cm_post_event - * post an event to the cm event handler - */ -static int nes_cm_post_event(struct nes_cm_event *event) -{ - atomic_inc(&event->cm_node->cm_core->events_posted); - add_ref_cm_node(event->cm_node); - event->cm_info.cm_id->add_ref(event->cm_info.cm_id); - INIT_WORK(&event->event_work, nes_cm_event_handler); - nes_debug(NES_DBG_CM, "cm_node=%p queue_work, event=%p\n", - event->cm_node, event); - - queue_work(event->cm_node->cm_core->event_wq, &event->event_work); - - nes_debug(NES_DBG_CM, "Exit\n"); - return 0; -} - - -/** - * nes_cm_event_handler - * worker function to handle cm events - * will free instance of nes_cm_event - */ -static void nes_cm_event_handler(struct work_struct *work) -{ - struct nes_cm_event *event = container_of(work, struct nes_cm_event, - event_work); - struct nes_cm_core *cm_core; - - if ((!event) || (!event->cm_node) || 
(!event->cm_node->cm_core)) - return; - - cm_core = event->cm_node->cm_core; - nes_debug(NES_DBG_CM, "event=%p, event->type=%u, events posted=%u\n", - event, event->type, atomic_read(&cm_core->events_posted)); - - switch (event->type) { - case NES_CM_EVENT_MPA_REQ: - cm_event_mpa_req(event); - nes_debug(NES_DBG_CM, "cm_node=%p CM Event: MPA REQUEST\n", - event->cm_node); - break; - case NES_CM_EVENT_RESET: - nes_debug(NES_DBG_CM, "cm_node = %p CM Event: RESET\n", - event->cm_node); - cm_event_reset(event); - break; - case NES_CM_EVENT_CONNECTED: - if ((!event->cm_node->cm_id) || - (event->cm_node->state != NES_CM_STATE_TSA)) - break; - cm_event_connected(event); - nes_debug(NES_DBG_CM, "CM Event: CONNECTED\n"); - break; - case NES_CM_EVENT_MPA_REJECT: - if ((!event->cm_node->cm_id) || - (event->cm_node->state == NES_CM_STATE_TSA)) - break; - cm_event_mpa_reject(event); - nes_debug(NES_DBG_CM, "CM Event: REJECT\n"); - break; - - case NES_CM_EVENT_ABORTED: - if ((!event->cm_node->cm_id) || - (event->cm_node->state == NES_CM_STATE_TSA)) - break; - cm_event_connect_error(event); - nes_debug(NES_DBG_CM, "CM Event: ABORTED\n"); - break; - case NES_CM_EVENT_DROPPED_PKT: - nes_debug(NES_DBG_CM, "CM Event: DROPPED PKT\n"); - break; - default: - nes_debug(NES_DBG_CM, "CM Event: UNKNOWN EVENT TYPE\n"); - break; - } - - atomic_dec(&cm_core->events_posted); - event->cm_info.cm_id->rem_ref(event->cm_info.cm_id); - rem_ref_cm_node(cm_core, event->cm_node); - kfree(event); - - return; -} diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h deleted file mode 100644 index b9cc02b4e8d5..000000000000 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ /dev/null @@ -1,470 +0,0 @@ -/* - * Copyright (c) 2006 - 2014 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - */ - -#ifndef NES_CM_H -#define NES_CM_H - -#define QUEUE_EVENTS - -#define NES_MANAGE_APBVT_DEL 0 -#define NES_MANAGE_APBVT_ADD 1 - -#define NES_MPA_REQUEST_ACCEPT 1 -#define NES_MPA_REQUEST_REJECT 2 - -/* IETF MPA -- defines, enums, structs */ -#define IEFT_MPA_KEY_REQ "MPA ID Req Frame" -#define IEFT_MPA_KEY_REP "MPA ID Rep Frame" -#define IETF_MPA_KEY_SIZE 16 -#define IETF_MPA_VERSION 1 -#define IETF_MAX_PRIV_DATA_LEN 512 -#define IETF_MPA_FRAME_SIZE 20 -#define IETF_RTR_MSG_SIZE 4 -#define IETF_MPA_V2_FLAG 0x10 - -/* IETF RTR MSG Fields */ -#define IETF_PEER_TO_PEER 0x8000 -#define IETF_FLPDU_ZERO_LEN 0x4000 -#define IETF_RDMA0_WRITE 0x8000 -#define IETF_RDMA0_READ 0x4000 -#define IETF_NO_IRD_ORD 0x3FFF -#define NES_MAX_IRD 0x40 -#define NES_MAX_ORD 0x7F - -enum ietf_mpa_flags { - IETF_MPA_FLAGS_MARKERS = 0x80, /* receive Markers */ - IETF_MPA_FLAGS_CRC = 0x40, /* receive Markers */ - IETF_MPA_FLAGS_REJECT = 0x20, /* Reject */ -}; - -struct ietf_mpa_v1 { - u8 key[IETF_MPA_KEY_SIZE]; - u8 flags; - u8 rev; - __be16 priv_data_len; - u8 priv_data[0]; -}; - -#define ietf_mpa_req_resp_frame ietf_mpa_frame - -struct ietf_rtr_msg { - __be16 ctrl_ird; - __be16 ctrl_ord; -}; - -struct ietf_mpa_v2 { - u8 key[IETF_MPA_KEY_SIZE]; - u8 flags; - u8 rev; - __be16 priv_data_len; - struct ietf_rtr_msg rtr_msg; - u8 priv_data[0]; -}; - -struct nes_v4_quad { - u32 rsvd0; - __le32 DstIpAdrIndex; /* Only most significant 5 bits are valid */ - __be32 SrcIpadr; - __be16 TcpPorts[2]; /* src is low, dest is high */ -}; - -struct nes_cm_node; -enum nes_timer_type { - NES_TIMER_TYPE_SEND, - NES_TIMER_TYPE_RECV, - NES_TIMER_NODE_CLEANUP, - NES_TIMER_TYPE_CLOSE, -}; - -#define NES_PASSIVE_STATE_INDICATED 0 -#define NES_DO_NOT_SEND_RESET_EVENT 1 -#define NES_SEND_RESET_EVENT 2 - -#define MAX_NES_IFS 4 - -#define SET_ACK 1 -#define SET_SYN 2 -#define SET_FIN 4 -#define SET_RST 8 - -#define TCP_OPTIONS_PADDING 3 - -struct option_base { - u8 optionnum; - u8 length; -}; - -enum option_numbers { - OPTION_NUMBER_END, - OPTION_NUMBER_NONE, - OPTION_NUMBER_MSS, - OPTION_NUMBER_WINDOW_SCALE, - OPTION_NUMBER_SACK_PERM, - OPTION_NUMBER_SACK, - OPTION_NUMBER_WRITE0 = 0xbc -}; - -struct option_mss { - u8 optionnum; - u8 length; - __be16 mss; -}; - -struct option_windowscale { - u8 optionnum; - u8 length; - u8 shiftcount; -}; - -union all_known_options { - char as_end; - struct option_base as_base; - struct option_mss as_mss; - struct option_windowscale as_windowscale; -}; - -struct nes_timer_entry { - struct list_head list; - unsigned long timetosend; /* jiffies */ - struct sk_buff *skb; - u32 type; - u32 retrycount; - u32 retranscount; - u32 context; - u32 seq_num; - u32 send_retrans; - int close_when_complete; - struct net_device *netdev; -}; - -#define NES_DEFAULT_RETRYS 64 -#define NES_DEFAULT_RETRANS 8 -#ifdef CONFIG_INFINIBAND_NES_DEBUG -#define NES_RETRY_TIMEOUT (1000*HZ/1000) -#else -#define NES_RETRY_TIMEOUT (3000*HZ/1000) -#endif -#define NES_SHORT_TIME (10) -#define NES_LONG_TIME (2000*HZ/1000) -#define NES_MAX_TIMEOUT ((unsigned long) (12*HZ)) - -#define NES_CM_HASHTABLE_SIZE 1024 -#define NES_CM_TCP_TIMER_INTERVAL 3000 -#define NES_CM_DEFAULT_MTU 1540 -#define NES_CM_DEFAULT_FRAME_CNT 10 -#define NES_CM_THREAD_STACK_SIZE 256 -#define NES_CM_DEFAULT_RCV_WND 64240 // before we know that window scaling is allowed -#define NES_CM_DEFAULT_RCV_WND_SCALED 256960 // after we know that window scaling is allowed -#define NES_CM_DEFAULT_RCV_WND_SCALE 2 -#define NES_CM_DEFAULT_FREE_PKTS 0x000A -#define 
NES_CM_FREE_PKT_LO_WATERMARK 2 - -#define NES_CM_DEFAULT_MSS 536 - -#define NES_CM_DEF_SEQ 0x159bf75f -#define NES_CM_DEF_LOCAL_ID 0x3b47 - -#define NES_CM_DEF_SEQ2 0x18ed5740 -#define NES_CM_DEF_LOCAL_ID2 0xb807 -#define MAX_CM_BUFFER (IETF_MPA_FRAME_SIZE + IETF_RTR_MSG_SIZE + IETF_MAX_PRIV_DATA_LEN) - -typedef u32 nes_addr_t; - -#define nes_cm_tsa_context nes_qp_context - -struct nes_qp; - -/* cm node transition states */ -enum nes_cm_node_state { - NES_CM_STATE_UNKNOWN, - NES_CM_STATE_INITED, - NES_CM_STATE_LISTENING, - NES_CM_STATE_SYN_RCVD, - NES_CM_STATE_SYN_SENT, - NES_CM_STATE_ONE_SIDE_ESTABLISHED, - NES_CM_STATE_ESTABLISHED, - NES_CM_STATE_ACCEPTING, - NES_CM_STATE_MPAREQ_SENT, - NES_CM_STATE_MPAREQ_RCVD, - NES_CM_STATE_MPAREJ_RCVD, - NES_CM_STATE_TSA, - NES_CM_STATE_FIN_WAIT1, - NES_CM_STATE_FIN_WAIT2, - NES_CM_STATE_CLOSE_WAIT, - NES_CM_STATE_TIME_WAIT, - NES_CM_STATE_LAST_ACK, - NES_CM_STATE_CLOSING, - NES_CM_STATE_LISTENER_DESTROYED, - NES_CM_STATE_CLOSED -}; - -enum mpa_frame_version { - IETF_MPA_V1 = 1, - IETF_MPA_V2 = 2 -}; - -enum mpa_frame_key { - MPA_KEY_REQUEST, - MPA_KEY_REPLY -}; - -enum send_rdma0 { - SEND_RDMA_READ_ZERO = 1, - SEND_RDMA_WRITE_ZERO = 2 -}; - -enum nes_tcpip_pkt_type { - NES_PKT_TYPE_UNKNOWN, - NES_PKT_TYPE_SYN, - NES_PKT_TYPE_SYNACK, - NES_PKT_TYPE_ACK, - NES_PKT_TYPE_FIN, - NES_PKT_TYPE_RST -}; - - -/* type of nes connection */ -enum nes_cm_conn_type { - NES_CM_IWARP_CONN_TYPE, -}; - -/* CM context params */ -struct nes_cm_tcp_context { - u8 client; - - u32 loc_seq_num; - u32 loc_ack_num; - u32 rem_ack_num; - u32 rcv_nxt; - - u32 loc_id; - u32 rem_id; - - u32 snd_wnd; - u32 max_snd_wnd; - - u32 rcv_wnd; - u32 mss; - u8 snd_wscale; - u8 rcv_wscale; - - struct nes_cm_tsa_context tsa_cntxt; -}; - - -enum nes_cm_listener_state { - NES_CM_LISTENER_PASSIVE_STATE = 1, - NES_CM_LISTENER_ACTIVE_STATE = 2, - NES_CM_LISTENER_EITHER_STATE = 3 -}; - -struct nes_cm_listener { - struct list_head list; - struct nes_cm_core *cm_core; - u8 loc_mac[ETH_ALEN]; - nes_addr_t loc_addr; - u16 loc_port; - struct iw_cm_id *cm_id; - enum nes_cm_conn_type conn_type; - atomic_t ref_count; - struct nes_vnic *nesvnic; - atomic_t pend_accepts_cnt; - int backlog; - enum nes_cm_listener_state listener_state; - u32 reused_node; - u8 tos; -}; - -/* per connection node and node state information */ -struct nes_cm_node { - nes_addr_t loc_addr, rem_addr; - u16 loc_port, rem_port; - - u8 loc_mac[ETH_ALEN]; - u8 rem_mac[ETH_ALEN]; - - enum nes_cm_node_state state; - struct nes_cm_tcp_context tcp_cntxt; - struct nes_cm_core *cm_core; - struct sk_buff_head resend_list; - atomic_t ref_count; - struct net_device *netdev; - - struct nes_cm_node *loopbackpartner; - - struct nes_timer_entry *send_entry; - struct nes_timer_entry *recv_entry; - spinlock_t retrans_list_lock; - enum send_rdma0 send_rdma0_op; - - union { - struct ietf_mpa_v1 mpa_frame; - struct ietf_mpa_v2 mpa_v2_frame; - u8 mpa_frame_buf[MAX_CM_BUFFER]; - }; - enum mpa_frame_version mpa_frame_rev; - u16 ird_size; - u16 ord_size; - u16 mpav2_ird_ord; - - u16 mpa_frame_size; - struct iw_cm_id *cm_id; - struct list_head list; - bool accelerated; - struct nes_cm_listener *listener; - enum nes_cm_conn_type conn_type; - struct nes_vnic *nesvnic; - int apbvt_set; - int accept_pend; - struct list_head timer_entry; - struct list_head reset_entry; - struct nes_qp *nesqp; - atomic_t passive_state; - u8 tos; -}; - -/* structure for client or CM to fill when making CM api calls. */ -/* - only need to set relevant data, based on op. 
*/ -struct nes_cm_info { - union { - struct iw_cm_id *cm_id; - struct net_device *netdev; - }; - - u16 loc_port; - u16 rem_port; - nes_addr_t loc_addr; - nes_addr_t rem_addr; - enum nes_cm_conn_type conn_type; - int backlog; -}; - -/* CM event codes */ -enum nes_cm_event_type { - NES_CM_EVENT_UNKNOWN, - NES_CM_EVENT_ESTABLISHED, - NES_CM_EVENT_MPA_REQ, - NES_CM_EVENT_MPA_CONNECT, - NES_CM_EVENT_MPA_ACCEPT, - NES_CM_EVENT_MPA_REJECT, - NES_CM_EVENT_MPA_ESTABLISHED, - NES_CM_EVENT_CONNECTED, - NES_CM_EVENT_CLOSED, - NES_CM_EVENT_RESET, - NES_CM_EVENT_DROPPED_PKT, - NES_CM_EVENT_CLOSE_IMMED, - NES_CM_EVENT_CLOSE_HARD, - NES_CM_EVENT_CLOSE_CLEAN, - NES_CM_EVENT_ABORTED, - NES_CM_EVENT_SEND_FIRST -}; - -/* event to post to CM event handler */ -struct nes_cm_event { - enum nes_cm_event_type type; - - struct nes_cm_info cm_info; - struct work_struct event_work; - struct nes_cm_node *cm_node; -}; - -struct nes_cm_core { - enum nes_cm_node_state state; - - atomic_t listen_node_cnt; - struct nes_cm_node listen_list; - spinlock_t listen_list_lock; - - u32 mtu; - u32 free_tx_pkt_max; - u32 rx_pkt_posted; - atomic_t ht_node_cnt; - struct list_head connected_nodes; - /* struct list_head hashtable[NES_CM_HASHTABLE_SIZE]; */ - spinlock_t ht_lock; - - struct timer_list tcp_timer; - - const struct nes_cm_ops *api; - - int (*post_event)(struct nes_cm_event *event); - atomic_t events_posted; - struct workqueue_struct *event_wq; - struct workqueue_struct *disconn_wq; - - atomic_t node_cnt; - u64 aborted_connects; - u32 options; - - struct nes_cm_node *current_listen_node; -}; - - -#define NES_CM_SET_PKT_SIZE (1 << 1) -#define NES_CM_SET_FREE_PKT_Q_SIZE (1 << 2) - -/* CM ops/API for client interface */ -struct nes_cm_ops { - int (*accelerated)(struct nes_cm_core *, struct nes_cm_node *); - struct nes_cm_listener * (*listen)(struct nes_cm_core *, struct nes_vnic *, - struct nes_cm_info *); - int (*stop_listener)(struct nes_cm_core *, struct nes_cm_listener *); - struct nes_cm_node * (*connect)(struct nes_cm_core *, - struct nes_vnic *, u16, void *, - struct nes_cm_info *); - int (*close)(struct nes_cm_core *, struct nes_cm_node *); - int (*accept)(struct nes_cm_core *, struct nes_cm_node *); - int (*reject)(struct nes_cm_core *, struct nes_cm_node *); - int (*recv_pkt)(struct nes_cm_core *, struct nes_vnic *, - struct sk_buff *); - int (*destroy_cm_core)(struct nes_cm_core *); - int (*get)(struct nes_cm_core *); - int (*set)(struct nes_cm_core *, u32, u32); -}; - -int schedule_nes_timer(struct nes_cm_node *, struct sk_buff *, - enum nes_timer_type, int, int); - -int nes_accept(struct iw_cm_id *, struct iw_cm_conn_param *); -int nes_reject(struct iw_cm_id *, const void *, u8); -int nes_connect(struct iw_cm_id *, struct iw_cm_conn_param *); -int nes_create_listen(struct iw_cm_id *, int); -int nes_destroy_listen(struct iw_cm_id *); - -int nes_cm_recv(struct sk_buff *, struct net_device *); -int nes_cm_start(void); -int nes_cm_stop(void); -int nes_add_ref_cm_node(struct nes_cm_node *cm_node); -int nes_rem_ref_cm_node(struct nes_cm_node *cm_node); - -#endif /* NES_CM_H */ diff --git a/drivers/infiniband/hw/nes/nes_context.h b/drivers/infiniband/hw/nes/nes_context.h deleted file mode 100644 index a69eef16d72d..000000000000 --- a/drivers/infiniband/hw/nes/nes_context.h +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef NES_CONTEXT_H -#define NES_CONTEXT_H - -struct nes_qp_context { - __le32 misc; - __le32 cqs; - __le32 sq_addr_low; - __le32 sq_addr_high; - __le32 rq_addr_low; - __le32 rq_addr_high; - __le32 misc2; - __le16 tcpPorts[2]; - __le32 ip0; - __le32 ip1; - __le32 ip2; - __le32 ip3; - __le32 mss; - __le32 arp_index_vlan; - __le32 tcp_state_flow_label; - __le32 pd_index_wscale; - __le32 keepalive; - u32 ts_recent; - u32 ts_age; - __le32 snd_nxt; - __le32 snd_wnd; - __le32 rcv_nxt; - __le32 rcv_wnd; - __le32 snd_max; - __le32 snd_una; - u32 srtt; - __le32 rttvar; - __le32 ssthresh; - __le32 cwnd; - __le32 snd_wl1; - __le32 snd_wl2; - __le32 max_snd_wnd; - __le32 ts_val_delta; - u32 retransmit; - u32 probe_cnt; - u32 hte_index; - __le32 q2_addr_low; - __le32 q2_addr_high; - __le32 ird_index; - u32 Rsvd3; - __le32 ird_ord_sizes; - u32 mrkr_offset; - __le32 aeq_token_low; - __le32 aeq_token_high; -}; - -/* QP Context Misc Field */ - -#define NES_QPCONTEXT_MISC_IWARP_VER_MASK 0x00000003 -#define NES_QPCONTEXT_MISC_IWARP_VER_SHIFT 0 -#define NES_QPCONTEXT_MISC_EFB_SIZE_MASK 0x000000C0 -#define NES_QPCONTEXT_MISC_EFB_SIZE_SHIFT 6 -#define NES_QPCONTEXT_MISC_RQ_SIZE_MASK 0x00000300 -#define NES_QPCONTEXT_MISC_RQ_SIZE_SHIFT 8 -#define NES_QPCONTEXT_MISC_SQ_SIZE_MASK 0x00000c00 -#define NES_QPCONTEXT_MISC_SQ_SIZE_SHIFT 10 -#define NES_QPCONTEXT_MISC_PCI_FCN_MASK 0x00007000 -#define NES_QPCONTEXT_MISC_PCI_FCN_SHIFT 12 -#define NES_QPCONTEXT_MISC_DUP_ACKS_MASK 0x00070000 -#define NES_QPCONTEXT_MISC_DUP_ACKS_SHIFT 16 - -enum nes_qp_context_misc_bits { - NES_QPCONTEXT_MISC_RX_WQE_SIZE = 0x00000004, - NES_QPCONTEXT_MISC_IPV4 = 0x00000008, - NES_QPCONTEXT_MISC_DO_NOT_FRAG = 0x00000010, - NES_QPCONTEXT_MISC_INSERT_VLAN = 0x00000020, - NES_QPCONTEXT_MISC_DROS = 0x00008000, - NES_QPCONTEXT_MISC_WSCALE = 0x00080000, - NES_QPCONTEXT_MISC_KEEPALIVE = 0x00100000, - NES_QPCONTEXT_MISC_TIMESTAMP = 0x00200000, - NES_QPCONTEXT_MISC_SACK = 0x00400000, - NES_QPCONTEXT_MISC_RDMA_WRITE_EN = 0x00800000, - NES_QPCONTEXT_MISC_RDMA_READ_EN = 0x01000000, - NES_QPCONTEXT_MISC_WBIND_EN = 0x10000000, - NES_QPCONTEXT_MISC_FAST_REGISTER_EN = 0x20000000, - NES_QPCONTEXT_MISC_PRIV_EN = 0x40000000, - NES_QPCONTEXT_MISC_NO_NAGLE = 0x80000000 -}; - -enum nes_qp_acc_wq_sizes { - HCONTEXT_TSA_WQ_SIZE_4 = 0, 
- HCONTEXT_TSA_WQ_SIZE_32 = 1, - HCONTEXT_TSA_WQ_SIZE_128 = 2, - HCONTEXT_TSA_WQ_SIZE_512 = 3 -}; - -/* QP Context Misc2 Fields */ -#define NES_QPCONTEXT_MISC2_TTL_MASK 0x000000ff -#define NES_QPCONTEXT_MISC2_TTL_SHIFT 0 -#define NES_QPCONTEXT_MISC2_HOP_LIMIT_MASK 0x000000ff -#define NES_QPCONTEXT_MISC2_HOP_LIMIT_SHIFT 0 -#define NES_QPCONTEXT_MISC2_LIMIT_MASK 0x00000300 -#define NES_QPCONTEXT_MISC2_LIMIT_SHIFT 8 -#define NES_QPCONTEXT_MISC2_NIC_INDEX_MASK 0x0000fc00 -#define NES_QPCONTEXT_MISC2_NIC_INDEX_SHIFT 10 -#define NES_QPCONTEXT_MISC2_SRC_IP_MASK 0x001f0000 -#define NES_QPCONTEXT_MISC2_SRC_IP_SHIFT 16 -#define NES_QPCONTEXT_MISC2_TOS_MASK 0xff000000 -#define NES_QPCONTEXT_MISC2_TOS_SHIFT 24 -#define NES_QPCONTEXT_MISC2_TRAFFIC_CLASS_MASK 0xff000000 -#define NES_QPCONTEXT_MISC2_TRAFFIC_CLASS_SHIFT 24 - -/* QP Context Tcp State/Flow Label Fields */ -#define NES_QPCONTEXT_TCPFLOW_FLOW_LABEL_MASK 0x000fffff -#define NES_QPCONTEXT_TCPFLOW_FLOW_LABEL_SHIFT 0 -#define NES_QPCONTEXT_TCPFLOW_TCP_STATE_MASK 0xf0000000 -#define NES_QPCONTEXT_TCPFLOW_TCP_STATE_SHIFT 28 - -enum nes_qp_tcp_state { - NES_QPCONTEXT_TCPSTATE_CLOSED = 1, - NES_QPCONTEXT_TCPSTATE_EST = 5, - NES_QPCONTEXT_TCPSTATE_TIME_WAIT = 11, -}; - -/* QP Context PD Index/wscale Fields */ -#define NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_MASK 0x0000000f -#define NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_SHIFT 0 -#define NES_QPCONTEXT_PDWSCALE_SND_WSCALE_MASK 0x00000f00 -#define NES_QPCONTEXT_PDWSCALE_SND_WSCALE_SHIFT 8 -#define NES_QPCONTEXT_PDWSCALE_PDINDEX_MASK 0xffff0000 -#define NES_QPCONTEXT_PDWSCALE_PDINDEX_SHIFT 16 - -/* QP Context Keepalive Fields */ -#define NES_QPCONTEXT_KEEPALIVE_DELTA_MASK 0x0000ffff -#define NES_QPCONTEXT_KEEPALIVE_DELTA_SHIFT 0 -#define NES_QPCONTEXT_KEEPALIVE_PROBE_CNT_MASK 0x00ff0000 -#define NES_QPCONTEXT_KEEPALIVE_PROBE_CNT_SHIFT 16 -#define NES_QPCONTEXT_KEEPALIVE_INTV_MASK 0xff000000 -#define NES_QPCONTEXT_KEEPALIVE_INTV_SHIFT 24 - -/* QP Context ORD/IRD Fields */ -#define NES_QPCONTEXT_ORDIRD_ORDSIZE_MASK 0x0000007f -#define NES_QPCONTEXT_ORDIRD_ORDSIZE_SHIFT 0 -#define NES_QPCONTEXT_ORDIRD_IRDSIZE_MASK 0x00030000 -#define NES_QPCONTEXT_ORDIRD_IRDSIZE_SHIFT 16 -#define NES_QPCONTEXT_ORDIRD_IWARP_MODE_MASK 0x30000000 -#define NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT 28 - -enum nes_ord_ird_bits { - NES_QPCONTEXT_ORDIRD_WRPDU = 0x02000000, - NES_QPCONTEXT_ORDIRD_LSMM_PRESENT = 0x04000000, - NES_QPCONTEXT_ORDIRD_ALSMM = 0x08000000, - NES_QPCONTEXT_ORDIRD_AAH = 0x40000000, - NES_QPCONTEXT_ORDIRD_RNMC = 0x80000000 -}; - -enum nes_iwarp_qp_state { - NES_QPCONTEXT_IWARP_STATE_NONEXIST = 0, - NES_QPCONTEXT_IWARP_STATE_IDLE = 1, - NES_QPCONTEXT_IWARP_STATE_RTS = 2, - NES_QPCONTEXT_IWARP_STATE_CLOSING = 3, - NES_QPCONTEXT_IWARP_STATE_TERMINATE = 5, - NES_QPCONTEXT_IWARP_STATE_ERROR = 6 -}; - - -#endif /* NES_CONTEXT_H */ diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c deleted file mode 100644 index 5517e392bc01..000000000000 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ /dev/null @@ -1,3887 +0,0 @@ -/* - * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -#include -#include -#include -#include -#include -#include - -#include "nes.h" - -static int wide_ppm_offset; -module_param(wide_ppm_offset, int, 0644); -MODULE_PARM_DESC(wide_ppm_offset, "Increase CX4 interface clock ppm offset, 0=100ppm (default), 1=300ppm"); - -static u32 crit_err_count; -u32 int_mod_timer_init; -u32 int_mod_cq_depth_256; -u32 int_mod_cq_depth_128; -u32 int_mod_cq_depth_32; -u32 int_mod_cq_depth_24; -u32 int_mod_cq_depth_16; -u32 int_mod_cq_depth_4; -u32 int_mod_cq_depth_1; -static const u8 nes_max_critical_error_count = 100; -#include "nes_cm.h" - -static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq); -static void nes_init_csr_ne020(struct nes_device *nesdev, u8 hw_rev, u8 port_count); -static int nes_init_serdes(struct nes_device *nesdev, u8 hw_rev, u8 port_count, - struct nes_adapter *nesadapter, u8 OneG_Mode); -static void nes_nic_napi_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq); -static void nes_process_aeq(struct nes_device *nesdev, struct nes_hw_aeq *aeq); -static void nes_process_ceq(struct nes_device *nesdev, struct nes_hw_ceq *ceq); -static void nes_process_iwarp_aeqe(struct nes_device *nesdev, - struct nes_hw_aeqe *aeqe); -static void process_critical_error(struct nes_device *nesdev); -static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number); -static unsigned int nes_reset_adapter_ne020(struct nes_device *nesdev, u8 *OneG_Mode); -static void nes_terminate_start_timer(struct nes_qp *nesqp); - -static const char *const nes_iwarp_state_str[] = { - "Non-Existent", - "Idle", - "RTS", - "Closing", - "RSVD1", - "Terminate", - "Error", - "RSVD2", -}; - -static const char *const nes_tcp_state_str[] = { - "Non-Existent", - "Closed", - "Listen", - "SYN Sent", - "SYN Rcvd", - "Established", - "Close Wait", - "FIN Wait 1", - "Closing", - "Last Ack", - "FIN Wait 2", - "Time Wait", - "RSVD1", - "RSVD2", - "RSVD3", - "RSVD4", -}; - -static inline void print_ip(struct nes_cm_node *cm_node) -{ - unsigned char *rem_addr; - if (cm_node) { - rem_addr = (unsigned char *)&cm_node->rem_addr; - printk(KERN_ERR PFX "Remote IP addr: %pI4\n", rem_addr); - } -} - -/** - * nes_nic_init_timer_defaults - */ -void nes_nic_init_timer_defaults(struct nes_device *nesdev, u8 jumbomode) -{ - 
unsigned long flags; - struct nes_adapter *nesadapter = nesdev->nesadapter; - struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; - - spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); - - shared_timer->timer_in_use_min = NES_NIC_FAST_TIMER_LOW; - shared_timer->timer_in_use_max = NES_NIC_FAST_TIMER_HIGH; - if (jumbomode) { - shared_timer->threshold_low = DEFAULT_JUMBO_NES_QL_LOW; - shared_timer->threshold_target = DEFAULT_JUMBO_NES_QL_TARGET; - shared_timer->threshold_high = DEFAULT_JUMBO_NES_QL_HIGH; - } else { - shared_timer->threshold_low = DEFAULT_NES_QL_LOW; - shared_timer->threshold_target = DEFAULT_NES_QL_TARGET; - shared_timer->threshold_high = DEFAULT_NES_QL_HIGH; - } - - /* todo use netdev->mtu to set thresholds */ - spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); -} - - -/** - * nes_nic_init_timer - */ -static void nes_nic_init_timer(struct nes_device *nesdev) -{ - unsigned long flags; - struct nes_adapter *nesadapter = nesdev->nesadapter; - struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; - - spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); - - if (shared_timer->timer_in_use_old == 0) { - nesdev->deepcq_count = 0; - shared_timer->timer_direction_upward = 0; - shared_timer->timer_direction_downward = 0; - shared_timer->timer_in_use = NES_NIC_FAST_TIMER; - shared_timer->timer_in_use_old = 0; - - } - if (shared_timer->timer_in_use != shared_timer->timer_in_use_old) { - shared_timer->timer_in_use_old = shared_timer->timer_in_use; - nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, - 0x80000000 | ((u32)(shared_timer->timer_in_use*8))); - } - /* todo use netdev->mtu to set thresholds */ - spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); -} - - -/** - * nes_nic_tune_timer - */ -static void nes_nic_tune_timer(struct nes_device *nesdev) -{ - unsigned long flags; - struct nes_adapter *nesadapter = nesdev->nesadapter; - struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; - u16 cq_count = nesdev->currcq_count; - - spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); - - if (shared_timer->cq_count_old <= cq_count) - shared_timer->cq_direction_downward = 0; - else - shared_timer->cq_direction_downward++; - shared_timer->cq_count_old = cq_count; - if (shared_timer->cq_direction_downward > NES_NIC_CQ_DOWNWARD_TREND) { - if (cq_count <= shared_timer->threshold_low && - shared_timer->threshold_low > 4) { - shared_timer->threshold_low = shared_timer->threshold_low/2; - shared_timer->cq_direction_downward=0; - nesdev->currcq_count = 0; - spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); - return; - } - } - - if (cq_count > 1) { - nesdev->deepcq_count += cq_count; - if (cq_count <= shared_timer->threshold_low) { /* increase timer gently */ - shared_timer->timer_direction_upward++; - shared_timer->timer_direction_downward = 0; - } else if (cq_count <= shared_timer->threshold_target) { /* balanced */ - shared_timer->timer_direction_upward = 0; - shared_timer->timer_direction_downward = 0; - } else if (cq_count <= shared_timer->threshold_high) { /* decrease timer gently */ - shared_timer->timer_direction_downward++; - shared_timer->timer_direction_upward = 0; - } else if (cq_count <= (shared_timer->threshold_high) * 2) { - shared_timer->timer_in_use -= 2; - shared_timer->timer_direction_upward = 0; - shared_timer->timer_direction_downward++; - } else { - shared_timer->timer_in_use -= 4; - shared_timer->timer_direction_upward = 0; - shared_timer->timer_direction_downward++; - } - - if 
(shared_timer->timer_direction_upward > 3 ) { /* using history */ - shared_timer->timer_in_use += 3; - shared_timer->timer_direction_upward = 0; - shared_timer->timer_direction_downward = 0; - } - if (shared_timer->timer_direction_downward > 5) { /* using history */ - shared_timer->timer_in_use -= 4 ; - shared_timer->timer_direction_downward = 0; - shared_timer->timer_direction_upward = 0; - } - } - - /* boundary checking */ - if (shared_timer->timer_in_use > shared_timer->threshold_high) - shared_timer->timer_in_use = shared_timer->threshold_high; - else if (shared_timer->timer_in_use < shared_timer->threshold_low) - shared_timer->timer_in_use = shared_timer->threshold_low; - - nesdev->currcq_count = 0; - - spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); -} - - -/** - * nes_init_adapter - initialize adapter - */ -struct nes_adapter *nes_init_adapter(struct nes_device *nesdev, u8 hw_rev) { - struct nes_adapter *nesadapter = NULL; - unsigned long num_pds; - u32 u32temp; - u32 port_count; - u16 max_rq_wrs; - u16 max_sq_wrs; - u32 max_mr; - u32 max_256pbl; - u32 max_4kpbl; - u32 max_qp; - u32 max_irrq; - u32 max_cq; - u32 hte_index_mask; - u32 adapter_size; - u32 arp_table_size; - u16 vendor_id; - u16 device_id; - u8 OneG_Mode; - u8 func_index; - - /* search the list of existing adapters */ - list_for_each_entry(nesadapter, &nes_adapter_list, list) { - nes_debug(NES_DBG_INIT, "Searching Adapter list for PCI devfn = 0x%X," - " adapter PCI slot/bus = %u/%u, pci devices PCI slot/bus = %u/%u, .\n", - nesdev->pcidev->devfn, - PCI_SLOT(nesadapter->devfn), - nesadapter->bus_number, - PCI_SLOT(nesdev->pcidev->devfn), - nesdev->pcidev->bus->number ); - if ((PCI_SLOT(nesadapter->devfn) == PCI_SLOT(nesdev->pcidev->devfn)) && - (nesadapter->bus_number == nesdev->pcidev->bus->number)) { - nesadapter->ref_count++; - return nesadapter; - } - } - - /* no adapter found */ - num_pds = pci_resource_len(nesdev->pcidev, BAR_1) >> PAGE_SHIFT; - if ((hw_rev != NE020_REV) && (hw_rev != NE020_REV1)) { - nes_debug(NES_DBG_INIT, "NE020 driver detected unknown hardware revision 0x%x\n", - hw_rev); - return NULL; - } - - nes_debug(NES_DBG_INIT, "Determine Soft Reset, QP_control=0x%x, CPU0=0x%x, CPU1=0x%x, CPU2=0x%x\n", - nes_read_indexed(nesdev, NES_IDX_QP_CONTROL + PCI_FUNC(nesdev->pcidev->devfn) * 8), - nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS), - nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS + 4), - nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS + 8)); - - nes_debug(NES_DBG_INIT, "Reset and init NE020\n"); - - - if ((port_count = nes_reset_adapter_ne020(nesdev, &OneG_Mode)) == 0) - return NULL; - - max_qp = nes_read_indexed(nesdev, NES_IDX_QP_CTX_SIZE); - nes_debug(NES_DBG_INIT, "QP_CTX_SIZE=%u\n", max_qp); - - u32temp = nes_read_indexed(nesdev, NES_IDX_QUAD_HASH_TABLE_SIZE); - if (max_qp > ((u32)1 << (u32temp & 0x001f))) { - nes_debug(NES_DBG_INIT, "Reducing Max QPs to %u due to hash table size = 0x%08X\n", - max_qp, u32temp); - max_qp = (u32)1 << (u32temp & 0x001f); - } - - hte_index_mask = ((u32)1 << ((u32temp & 0x001f)+1))-1; - nes_debug(NES_DBG_INIT, "Max QP = %u, hte_index_mask = 0x%08X.\n", - max_qp, hte_index_mask); - - u32temp = nes_read_indexed(nesdev, NES_IDX_IRRQ_COUNT); - - max_irrq = 1 << (u32temp & 0x001f); - - if (max_qp > max_irrq) { - max_qp = max_irrq; - nes_debug(NES_DBG_INIT, "Reducing Max QPs to %u due to Available Q1s.\n", - max_qp); - } - - /* there should be no reason to allocate more pds than qps */ - if (num_pds > max_qp) - num_pds = max_qp; - - u32temp = 
nes_read_indexed(nesdev, NES_IDX_MRT_SIZE); - max_mr = (u32)8192 << (u32temp & 0x7); - - u32temp = nes_read_indexed(nesdev, NES_IDX_PBL_REGION_SIZE); - max_256pbl = (u32)1 << (u32temp & 0x0000001f); - max_4kpbl = (u32)1 << ((u32temp >> 16) & 0x0000001f); - max_cq = nes_read_indexed(nesdev, NES_IDX_CQ_CTX_SIZE); - - u32temp = nes_read_indexed(nesdev, NES_IDX_ARP_CACHE_SIZE); - arp_table_size = 1 << u32temp; - - adapter_size = (sizeof(struct nes_adapter) + - (sizeof(unsigned long)-1)) & (~(sizeof(unsigned long)-1)); - adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_qp); - adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_mr); - adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_cq); - adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(num_pds); - adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(arp_table_size); - adapter_size += sizeof(struct nes_qp **) * max_qp; - - /* allocate a new adapter struct */ - nesadapter = kzalloc(adapter_size, GFP_KERNEL); - if (!nesadapter) - return NULL; - - nes_debug(NES_DBG_INIT, "Allocating new nesadapter @ %p, size = %u (actual size = %u).\n", - nesadapter, (u32)sizeof(struct nes_adapter), adapter_size); - - if (nes_read_eeprom_values(nesdev, nesadapter)) { - printk(KERN_ERR PFX "Unable to read EEPROM data.\n"); - kfree(nesadapter); - return NULL; - } - - nesadapter->vendor_id = (((u32) nesadapter->mac_addr_high) << 8) | - (nesadapter->mac_addr_low >> 24); - - pci_bus_read_config_word(nesdev->pcidev->bus, nesdev->pcidev->devfn, - PCI_DEVICE_ID, &device_id); - nesadapter->vendor_part_id = device_id; - - if (nes_init_serdes(nesdev, hw_rev, port_count, nesadapter, - OneG_Mode)) { - kfree(nesadapter); - return NULL; - } - nes_init_csr_ne020(nesdev, hw_rev, port_count); - - memset(nesadapter->pft_mcast_map, 255, - sizeof nesadapter->pft_mcast_map); - - /* populate the new nesadapter */ - nesadapter->nesdev = nesdev; - nesadapter->devfn = nesdev->pcidev->devfn; - nesadapter->bus_number = nesdev->pcidev->bus->number; - nesadapter->ref_count = 1; - nesadapter->timer_int_req = 0xffff0000; - nesadapter->OneG_Mode = OneG_Mode; - nesadapter->doorbell_start = nesdev->doorbell_region; - - /* nesadapter->tick_delta = clk_divisor; */ - nesadapter->hw_rev = hw_rev; - nesadapter->port_count = port_count; - - nesadapter->max_qp = max_qp; - nesadapter->hte_index_mask = hte_index_mask; - nesadapter->max_irrq = max_irrq; - nesadapter->max_mr = max_mr; - nesadapter->max_256pbl = max_256pbl - 1; - nesadapter->max_4kpbl = max_4kpbl - 1; - nesadapter->max_cq = max_cq; - nesadapter->free_256pbl = max_256pbl - 1; - nesadapter->free_4kpbl = max_4kpbl - 1; - nesadapter->max_pd = num_pds; - nesadapter->arp_table_size = arp_table_size; - - nesadapter->et_pkt_rate_low = NES_TIMER_ENABLE_LIMIT; - if (nes_drv_opt & NES_DRV_OPT_DISABLE_INT_MOD) { - nesadapter->et_use_adaptive_rx_coalesce = 0; - nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT; - nesadapter->et_rx_coalesce_usecs_irq = interrupt_mod_interval; - } else { - nesadapter->et_use_adaptive_rx_coalesce = 1; - nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT_DYNAMIC; - nesadapter->et_rx_coalesce_usecs_irq = 0; - printk(PFX "%s: Using Adaptive Interrupt Moderation\n", __func__); - } - /* Setup and enable the periodic timer */ - if (nesadapter->et_rx_coalesce_usecs_irq) - nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, 0x80000000 | - ((u32)(nesadapter->et_rx_coalesce_usecs_irq * 8))); - else - nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, 0x00000000); - - nesadapter->base_pd = 1; - - 
nesadapter->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | - IB_DEVICE_MEM_WINDOW | - IB_DEVICE_MEM_MGT_EXTENSIONS; - - nesadapter->allocated_qps = (unsigned long *)&(((unsigned char *)nesadapter) - [(sizeof(struct nes_adapter)+(sizeof(unsigned long)-1))&(~(sizeof(unsigned long)-1))]); - nesadapter->allocated_cqs = &nesadapter->allocated_qps[BITS_TO_LONGS(max_qp)]; - nesadapter->allocated_mrs = &nesadapter->allocated_cqs[BITS_TO_LONGS(max_cq)]; - nesadapter->allocated_pds = &nesadapter->allocated_mrs[BITS_TO_LONGS(max_mr)]; - nesadapter->allocated_arps = &nesadapter->allocated_pds[BITS_TO_LONGS(num_pds)]; - nesadapter->qp_table = (struct nes_qp **)(&nesadapter->allocated_arps[BITS_TO_LONGS(arp_table_size)]); - - - /* mark the usual suspect QPs, MR and CQs as in use */ - for (u32temp = 0; u32temp < NES_FIRST_QPN; u32temp++) { - set_bit(u32temp, nesadapter->allocated_qps); - set_bit(u32temp, nesadapter->allocated_cqs); - } - set_bit(0, nesadapter->allocated_mrs); - - for (u32temp = 0; u32temp < 20; u32temp++) - set_bit(u32temp, nesadapter->allocated_pds); - u32temp = nes_read_indexed(nesdev, NES_IDX_QP_MAX_CFG_SIZES); - - max_rq_wrs = ((u32temp >> 8) & 3); - switch (max_rq_wrs) { - case 0: - max_rq_wrs = 4; - break; - case 1: - max_rq_wrs = 16; - break; - case 2: - max_rq_wrs = 32; - break; - case 3: - max_rq_wrs = 512; - break; - } - - max_sq_wrs = (u32temp & 3); - switch (max_sq_wrs) { - case 0: - max_sq_wrs = 4; - break; - case 1: - max_sq_wrs = 16; - break; - case 2: - max_sq_wrs = 32; - break; - case 3: - max_sq_wrs = 512; - break; - } - nesadapter->max_qp_wr = min(max_rq_wrs, max_sq_wrs); - nesadapter->max_irrq_wr = (u32temp >> 16) & 3; - - nesadapter->max_sge = 4; - nesadapter->max_cqe = 32766; - - if (nes_read_eeprom_values(nesdev, nesadapter)) { - printk(KERN_ERR PFX "Unable to read EEPROM data.\n"); - kfree(nesadapter); - return NULL; - } - - u32temp = nes_read_indexed(nesdev, NES_IDX_TCP_TIMER_CONFIG); - nes_write_indexed(nesdev, NES_IDX_TCP_TIMER_CONFIG, - (u32temp & 0xff000000) | (nesadapter->tcp_timer_core_clk_divisor & 0x00ffffff)); - - /* setup port configuration */ - if (nesadapter->port_count == 1) { - nesadapter->log_port = 0x00000000; - if (nes_drv_opt & NES_DRV_OPT_DUAL_LOGICAL_PORT) - nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000002); - else - nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000003); - } else { - if (nesadapter->phy_type[0] == NES_PHY_TYPE_PUMA_1G) { - nesadapter->log_port = 0x000000D8; - } else { - if (nesadapter->port_count == 2) - nesadapter->log_port = 0x00000044; - else - nesadapter->log_port = 0x000000e4; - } - nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000003); - } - - nes_write_indexed(nesdev, NES_IDX_NIC_LOGPORT_TO_PHYPORT, - nesadapter->log_port); - nes_debug(NES_DBG_INIT, "Probe time, LOG2PHY=%u\n", - nes_read_indexed(nesdev, NES_IDX_NIC_LOGPORT_TO_PHYPORT)); - - spin_lock_init(&nesadapter->resource_lock); - spin_lock_init(&nesadapter->phy_lock); - spin_lock_init(&nesadapter->pbl_lock); - spin_lock_init(&nesadapter->periodic_timer_lock); - - INIT_LIST_HEAD(&nesadapter->nesvnic_list[0]); - INIT_LIST_HEAD(&nesadapter->nesvnic_list[1]); - INIT_LIST_HEAD(&nesadapter->nesvnic_list[2]); - INIT_LIST_HEAD(&nesadapter->nesvnic_list[3]); - - if ((!nesadapter->OneG_Mode) && (nesadapter->port_count == 2)) { - u32 pcs_control_status0, pcs_control_status1; - u32 reset_value; - u32 i = 0; - u32 int_cnt = 0; - u32 ext_cnt = 0; - unsigned long flags; - u32 j = 0; - - pcs_control_status0 = nes_read_indexed(nesdev, - 
NES_IDX_PHY_PCS_CONTROL_STATUS0); - pcs_control_status1 = nes_read_indexed(nesdev, - NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); - - for (i = 0; i < NES_MAX_LINK_CHECK; i++) { - pcs_control_status0 = nes_read_indexed(nesdev, - NES_IDX_PHY_PCS_CONTROL_STATUS0); - pcs_control_status1 = nes_read_indexed(nesdev, - NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); - if ((0x0F000100 == (pcs_control_status0 & 0x0F000100)) - || (0x0F000100 == (pcs_control_status1 & 0x0F000100))) - int_cnt++; - usleep_range(1000, 2000); - } - if (int_cnt > 1) { - spin_lock_irqsave(&nesadapter->phy_lock, flags); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8); - mh_detected++; - reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); - reset_value |= 0x0000003d; - nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value); - - while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) - & 0x00000040) != 0x00000040) && (j++ < 5000)); - spin_unlock_irqrestore(&nesadapter->phy_lock, flags); - - pcs_control_status0 = nes_read_indexed(nesdev, - NES_IDX_PHY_PCS_CONTROL_STATUS0); - pcs_control_status1 = nes_read_indexed(nesdev, - NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); - - for (i = 0; i < NES_MAX_LINK_CHECK; i++) { - pcs_control_status0 = nes_read_indexed(nesdev, - NES_IDX_PHY_PCS_CONTROL_STATUS0); - pcs_control_status1 = nes_read_indexed(nesdev, - NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); - if ((0x0F000100 == (pcs_control_status0 & 0x0F000100)) - || (0x0F000100 == (pcs_control_status1 & 0x0F000100))) { - if (++ext_cnt > int_cnt) { - spin_lock_irqsave(&nesadapter->phy_lock, flags); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, - 0x0000F088); - mh_detected++; - reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); - reset_value |= 0x0000003d; - nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value); - - while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) - & 0x00000040) != 0x00000040) && (j++ < 5000)); - spin_unlock_irqrestore(&nesadapter->phy_lock, flags); - break; - } - } - usleep_range(1000, 2000); - } - } - } - - if (nesadapter->hw_rev == NE020_REV) { - timer_setup(&nesadapter->mh_timer, nes_mh_fix, 0); - nesadapter->mh_timer.expires = jiffies + (HZ/5); /* 1 second */ - add_timer(&nesadapter->mh_timer); - } else { - nes_write32(nesdev->regs+NES_INTF_INT_STAT, 0x0f000000); - } - - timer_setup(&nesadapter->lc_timer, nes_clc, 0); - nesadapter->lc_timer.expires = jiffies + 3600 * HZ; /* 1 hour */ - add_timer(&nesadapter->lc_timer); - - list_add_tail(&nesadapter->list, &nes_adapter_list); - - for (func_index = 0; func_index < 8; func_index++) { - pci_bus_read_config_word(nesdev->pcidev->bus, - PCI_DEVFN(PCI_SLOT(nesdev->pcidev->devfn), - func_index), 0, &vendor_id); - if (vendor_id == 0xffff) - break; - } - nes_debug(NES_DBG_INIT, "%s %d functions found for %s.\n", __func__, - func_index, pci_name(nesdev->pcidev)); - nesadapter->adapter_fcn_count = func_index; - - return nesadapter; -} - - -/** - * nes_reset_adapter_ne020 - */ -static unsigned int nes_reset_adapter_ne020(struct nes_device *nesdev, u8 *OneG_Mode) -{ - u32 port_count; - u32 u32temp; - u32 i; - - u32temp = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); - port_count = ((u32temp & 0x00000300) >> 8) + 1; - /* TODO: assuming that both SERDES are set the same for now */ - *OneG_Mode = (u32temp & 0x00003c00) ? 
0 : 1; - nes_debug(NES_DBG_INIT, "Initial Software Reset = 0x%08X, port_count=%u\n", - u32temp, port_count); - if (*OneG_Mode) - nes_debug(NES_DBG_INIT, "Running in 1G mode.\n"); - u32temp &= 0xff00ffc0; - switch (port_count) { - case 1: - u32temp |= 0x00ee0000; - break; - case 2: - u32temp |= 0x00cc0000; - break; - case 4: - u32temp |= 0x00000000; - break; - default: - return 0; - break; - } - - /* check and do full reset if needed */ - if (nes_read_indexed(nesdev, NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))) { - nes_debug(NES_DBG_INIT, "Issuing Full Soft reset = 0x%08X\n", u32temp | 0xd); - nes_write32(nesdev->regs+NES_SOFTWARE_RESET, u32temp | 0xd); - - i = 0; - while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) & 0x00000040) == 0) && i++ < 10000) - mdelay(1); - if (i > 10000) { - nes_debug(NES_DBG_INIT, "Did not see full soft reset done.\n"); - return 0; - } - - i = 0; - while ((nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS) != 0x80) && i++ < 10000) - mdelay(1); - if (i > 10000) { - printk(KERN_ERR PFX "Internal CPU not ready, status = %02X\n", - nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS)); - return 0; - } - } - - /* port reset */ - switch (port_count) { - case 1: - u32temp |= 0x00ee0010; - break; - case 2: - u32temp |= 0x00cc0030; - break; - case 4: - u32temp |= 0x00000030; - break; - } - - nes_debug(NES_DBG_INIT, "Issuing Port Soft reset = 0x%08X\n", u32temp | 0xd); - nes_write32(nesdev->regs+NES_SOFTWARE_RESET, u32temp | 0xd); - - i = 0; - while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) & 0x00000040) == 0) && i++ < 10000) - mdelay(1); - if (i > 10000) { - nes_debug(NES_DBG_INIT, "Did not see port soft reset done.\n"); - return 0; - } - - /* serdes 0 */ - i = 0; - while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0) - & 0x0000000f)) != 0x0000000f) && i++ < 5000) - mdelay(1); - if (i > 5000) { - nes_debug(NES_DBG_INIT, "Serdes 0 not ready, status=%x\n", u32temp); - return 0; - } - - /* serdes 1 */ - if (port_count > 1) { - i = 0; - while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS1) - & 0x0000000f)) != 0x0000000f) && i++ < 5000) - mdelay(1); - if (i > 5000) { - nes_debug(NES_DBG_INIT, "Serdes 1 not ready, status=%x\n", u32temp); - return 0; - } - } - - return port_count; -} - - -/** - * nes_init_serdes - */ -static int nes_init_serdes(struct nes_device *nesdev, u8 hw_rev, u8 port_count, - struct nes_adapter *nesadapter, u8 OneG_Mode) -{ - int i; - u32 u32temp; - u32 sds; - - if (hw_rev != NE020_REV) { - /* init serdes 0 */ - switch (nesadapter->phy_type[0]) { - case NES_PHY_TYPE_CX4: - if (wide_ppm_offset) - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000FFFAA); - else - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF); - break; - case NES_PHY_TYPE_KR: - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x00000000); - break; - case NES_PHY_TYPE_PUMA_1G: - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF); - sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0); - sds |= 0x00000100; - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, sds); - break; - default: - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF); - break; - } - - if (!OneG_Mode) - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0, 0x11110000); - - if (port_count < 2) - return 0; - - /* init serdes 1 */ - if (!(OneG_Mode && 
(nesadapter->phy_type[1] != NES_PHY_TYPE_PUMA_1G))) - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000000FF); - - switch (nesadapter->phy_type[1]) { - case NES_PHY_TYPE_ARGUS: - case NES_PHY_TYPE_SFP_D: - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x00000000); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x00000000); - break; - case NES_PHY_TYPE_CX4: - if (wide_ppm_offset) - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000FFFAA); - break; - case NES_PHY_TYPE_KR: - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x00000000); - break; - case NES_PHY_TYPE_PUMA_1G: - sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1); - sds |= 0x000000100; - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, sds); - } - if (!OneG_Mode) { - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE1, 0x11110000); - sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1); - sds &= 0xFFFFFFBF; - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, sds); - } - } else { - /* init serdes 0 */ - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, 0x00000008); - i = 0; - while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0) - & 0x0000000f)) != 0x0000000f) && i++ < 5000) - mdelay(1); - if (i > 5000) { - nes_debug(NES_DBG_PHY, "Init: serdes 0 not ready, status=%x\n", u32temp); - return 1; - } - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x000bdef7); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE0, 0x9ce73000); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE0, 0x0ff00000); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET0, 0x00000000); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS0, 0x00000000); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0, 0x00000000); - if (OneG_Mode) - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0182222); - else - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0042222); - - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000ff); - if (port_count > 1) { - /* init serdes 1 */ - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x00000048); - i = 0; - while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS1) - & 0x0000000f)) != 0x0000000f) && (i++ < 5000)) - mdelay(1); - if (i > 5000) { - printk("%s: Init: serdes 1 not ready, status=%x\n", __func__, u32temp); - /* return 1; */ - } - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x000bdef7); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE1, 0x9ce73000); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE1, 0x0ff00000); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET1, 0x00000000); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS1, 0x00000000); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL1, 0x00000000); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL1, 0xf0002222); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000000ff); - } - } - return 0; -} - - -/** - * nes_init_csr_ne020 - * Initialize registers for ne020 hardware - */ -static void nes_init_csr_ne020(struct nes_device *nesdev, u8 hw_rev, u8 port_count) -{ - u32 u32temp; - - nes_debug(NES_DBG_INIT, "port_count=%d\n", port_count); - - nes_write_indexed(nesdev, 0x000001E4, 0x00000007); - /* nes_write_indexed(nesdev, 0x000001E8, 0x000208C4); */ - nes_write_indexed(nesdev, 0x000001E8, 0x00020874); - nes_write_indexed(nesdev, 
0x000001D8, 0x00048002); - /* nes_write_indexed(nesdev, 0x000001D8, 0x0004B002); */ - nes_write_indexed(nesdev, 0x000001FC, 0x00050005); - nes_write_indexed(nesdev, 0x00000600, 0x55555555); - nes_write_indexed(nesdev, 0x00000604, 0x55555555); - - /* TODO: move these MAC register settings to NIC bringup */ - nes_write_indexed(nesdev, 0x00002000, 0x00000001); - nes_write_indexed(nesdev, 0x00002004, 0x00000001); - nes_write_indexed(nesdev, 0x00002008, 0x0000FFFF); - nes_write_indexed(nesdev, 0x0000200C, 0x00000001); - nes_write_indexed(nesdev, 0x00002010, 0x000003c1); - nes_write_indexed(nesdev, 0x0000201C, 0x75345678); - if (port_count > 1) { - nes_write_indexed(nesdev, 0x00002200, 0x00000001); - nes_write_indexed(nesdev, 0x00002204, 0x00000001); - nes_write_indexed(nesdev, 0x00002208, 0x0000FFFF); - nes_write_indexed(nesdev, 0x0000220C, 0x00000001); - nes_write_indexed(nesdev, 0x00002210, 0x000003c1); - nes_write_indexed(nesdev, 0x0000221C, 0x75345678); - nes_write_indexed(nesdev, 0x00000908, 0x20000001); - } - if (port_count > 2) { - nes_write_indexed(nesdev, 0x00002400, 0x00000001); - nes_write_indexed(nesdev, 0x00002404, 0x00000001); - nes_write_indexed(nesdev, 0x00002408, 0x0000FFFF); - nes_write_indexed(nesdev, 0x0000240C, 0x00000001); - nes_write_indexed(nesdev, 0x00002410, 0x000003c1); - nes_write_indexed(nesdev, 0x0000241C, 0x75345678); - nes_write_indexed(nesdev, 0x00000910, 0x20000001); - - nes_write_indexed(nesdev, 0x00002600, 0x00000001); - nes_write_indexed(nesdev, 0x00002604, 0x00000001); - nes_write_indexed(nesdev, 0x00002608, 0x0000FFFF); - nes_write_indexed(nesdev, 0x0000260C, 0x00000001); - nes_write_indexed(nesdev, 0x00002610, 0x000003c1); - nes_write_indexed(nesdev, 0x0000261C, 0x75345678); - nes_write_indexed(nesdev, 0x00000918, 0x20000001); - } - - nes_write_indexed(nesdev, 0x00005000, 0x00018000); - /* nes_write_indexed(nesdev, 0x00005000, 0x00010000); */ - nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG1, (wqm_quanta << 1) | - 0x00000001); - nes_write_indexed(nesdev, 0x00005008, 0x1F1F1F1F); - nes_write_indexed(nesdev, 0x00005010, 0x1F1F1F1F); - nes_write_indexed(nesdev, 0x00005018, 0x1F1F1F1F); - nes_write_indexed(nesdev, 0x00005020, 0x1F1F1F1F); - nes_write_indexed(nesdev, 0x00006090, 0xFFFFFFFF); - - /* TODO: move this to code, get from EEPROM */ - nes_write_indexed(nesdev, 0x00000900, 0x20000001); - nes_write_indexed(nesdev, 0x000060C0, 0x0000028e); - nes_write_indexed(nesdev, 0x000060C8, 0x00000020); - - nes_write_indexed(nesdev, 0x000001EC, 0x7b2625a0); - /* nes_write_indexed(nesdev, 0x000001EC, 0x5f2625a0); */ - - if (hw_rev != NE020_REV) { - u32temp = nes_read_indexed(nesdev, 0x000008e8); - u32temp |= 0x80000000; - nes_write_indexed(nesdev, 0x000008e8, u32temp); - u32temp = nes_read_indexed(nesdev, 0x000021f8); - u32temp &= 0x7fffffff; - u32temp |= 0x7fff0010; - nes_write_indexed(nesdev, 0x000021f8, u32temp); - if (port_count > 1) { - u32temp = nes_read_indexed(nesdev, 0x000023f8); - u32temp &= 0x7fffffff; - u32temp |= 0x7fff0010; - nes_write_indexed(nesdev, 0x000023f8, u32temp); - } - } -} - - -/** - * nes_destroy_adapter - destroy the adapter structure - */ -void nes_destroy_adapter(struct nes_adapter *nesadapter) -{ - struct nes_adapter *tmp_adapter; - - list_for_each_entry(tmp_adapter, &nes_adapter_list, list) { - nes_debug(NES_DBG_SHUTDOWN, "Nes Adapter list entry = 0x%p.\n", - tmp_adapter); - } - - nesadapter->ref_count--; - if (!nesadapter->ref_count) { - if (nesadapter->hw_rev == NE020_REV) { - del_timer(&nesadapter->mh_timer); - } - 
del_timer(&nesadapter->lc_timer); - - list_del(&nesadapter->list); - kfree(nesadapter); - } -} - - -/** - * nes_init_cqp - */ -int nes_init_cqp(struct nes_device *nesdev) -{ - struct nes_adapter *nesadapter = nesdev->nesadapter; - struct nes_hw_cqp_qp_context *cqp_qp_context; - struct nes_hw_cqp_wqe *cqp_wqe; - struct nes_hw_ceq *ceq; - struct nes_hw_ceq *nic_ceq; - struct nes_hw_aeq *aeq; - void *vmem; - dma_addr_t pmem; - u32 count=0; - u32 cqp_head; - u64 u64temp; - u32 u32temp; - - /* allocate CQP memory */ - /* Need to add max_cq to the aeq size once cq overflow checking is added back */ - /* SQ is 512 byte aligned, others are 256 byte aligned */ - nesdev->cqp_mem_size = 512 + - (sizeof(struct nes_hw_cqp_wqe) * NES_CQP_SQ_SIZE) + - (sizeof(struct nes_hw_cqe) * NES_CCQ_SIZE) + - max(((u32)sizeof(struct nes_hw_ceqe) * NES_CCEQ_SIZE), (u32)256) + - max(((u32)sizeof(struct nes_hw_ceqe) * NES_NIC_CEQ_SIZE), (u32)256) + - (sizeof(struct nes_hw_aeqe) * nesadapter->max_qp) + - sizeof(struct nes_hw_cqp_qp_context); - - nesdev->cqp_vbase = pci_zalloc_consistent(nesdev->pcidev, - nesdev->cqp_mem_size, - &nesdev->cqp_pbase); - if (!nesdev->cqp_vbase) { - nes_debug(NES_DBG_INIT, "Unable to allocate memory for host descriptor rings\n"); - return -ENOMEM; - } - - /* Allocate a twice the number of CQP requests as the SQ size */ - nesdev->nes_cqp_requests = kzalloc(sizeof(struct nes_cqp_request) * - 2 * NES_CQP_SQ_SIZE, GFP_KERNEL); - if (!nesdev->nes_cqp_requests) { - pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, nesdev->cqp.sq_vbase, - nesdev->cqp.sq_pbase); - return -ENOMEM; - } - - nes_debug(NES_DBG_INIT, "Allocated CQP structures at %p (phys = %016lX), size = %u.\n", - nesdev->cqp_vbase, (unsigned long)nesdev->cqp_pbase, nesdev->cqp_mem_size); - - spin_lock_init(&nesdev->cqp.lock); - init_waitqueue_head(&nesdev->cqp.waitq); - - /* Setup Various Structures */ - vmem = (void *)(((unsigned long)nesdev->cqp_vbase + (512 - 1)) & - ~(unsigned long)(512 - 1)); - pmem = (dma_addr_t)(((unsigned long long)nesdev->cqp_pbase + (512 - 1)) & - ~(unsigned long long)(512 - 1)); - - nesdev->cqp.sq_vbase = vmem; - nesdev->cqp.sq_pbase = pmem; - nesdev->cqp.sq_size = NES_CQP_SQ_SIZE; - nesdev->cqp.sq_head = 0; - nesdev->cqp.sq_tail = 0; - nesdev->cqp.qp_id = PCI_FUNC(nesdev->pcidev->devfn); - - vmem += (sizeof(struct nes_hw_cqp_wqe) * nesdev->cqp.sq_size); - pmem += (sizeof(struct nes_hw_cqp_wqe) * nesdev->cqp.sq_size); - - nesdev->ccq.cq_vbase = vmem; - nesdev->ccq.cq_pbase = pmem; - nesdev->ccq.cq_size = NES_CCQ_SIZE; - nesdev->ccq.cq_head = 0; - nesdev->ccq.ce_handler = nes_cqp_ce_handler; - nesdev->ccq.cq_number = PCI_FUNC(nesdev->pcidev->devfn); - - vmem += (sizeof(struct nes_hw_cqe) * nesdev->ccq.cq_size); - pmem += (sizeof(struct nes_hw_cqe) * nesdev->ccq.cq_size); - - nesdev->ceq_index = PCI_FUNC(nesdev->pcidev->devfn); - ceq = &nesadapter->ceq[nesdev->ceq_index]; - ceq->ceq_vbase = vmem; - ceq->ceq_pbase = pmem; - ceq->ceq_size = NES_CCEQ_SIZE; - ceq->ceq_head = 0; - - vmem += max(((u32)sizeof(struct nes_hw_ceqe) * ceq->ceq_size), (u32)256); - pmem += max(((u32)sizeof(struct nes_hw_ceqe) * ceq->ceq_size), (u32)256); - - nesdev->nic_ceq_index = PCI_FUNC(nesdev->pcidev->devfn) + 8; - nic_ceq = &nesadapter->ceq[nesdev->nic_ceq_index]; - nic_ceq->ceq_vbase = vmem; - nic_ceq->ceq_pbase = pmem; - nic_ceq->ceq_size = NES_NIC_CEQ_SIZE; - nic_ceq->ceq_head = 0; - - vmem += max(((u32)sizeof(struct nes_hw_ceqe) * nic_ceq->ceq_size), (u32)256); - pmem += max(((u32)sizeof(struct nes_hw_ceqe) * 
nic_ceq->ceq_size), (u32)256); - - aeq = &nesadapter->aeq[PCI_FUNC(nesdev->pcidev->devfn)]; - aeq->aeq_vbase = vmem; - aeq->aeq_pbase = pmem; - aeq->aeq_size = nesadapter->max_qp; - aeq->aeq_head = 0; - - /* Setup QP Context */ - vmem += (sizeof(struct nes_hw_aeqe) * aeq->aeq_size); - pmem += (sizeof(struct nes_hw_aeqe) * aeq->aeq_size); - - cqp_qp_context = vmem; - cqp_qp_context->context_words[0] = - cpu_to_le32((PCI_FUNC(nesdev->pcidev->devfn) << 12) + (2 << 10)); - cqp_qp_context->context_words[1] = 0; - cqp_qp_context->context_words[2] = cpu_to_le32((u32)nesdev->cqp.sq_pbase); - cqp_qp_context->context_words[3] = cpu_to_le32(((u64)nesdev->cqp.sq_pbase) >> 32); - - - /* Write the address to Create CQP */ - if ((sizeof(dma_addr_t) > 4)) { - nes_write_indexed(nesdev, - NES_IDX_CREATE_CQP_HIGH + (PCI_FUNC(nesdev->pcidev->devfn) * 8), - ((u64)pmem) >> 32); - } else { - nes_write_indexed(nesdev, - NES_IDX_CREATE_CQP_HIGH + (PCI_FUNC(nesdev->pcidev->devfn) * 8), 0); - } - nes_write_indexed(nesdev, - NES_IDX_CREATE_CQP_LOW + (PCI_FUNC(nesdev->pcidev->devfn) * 8), - (u32)pmem); - - INIT_LIST_HEAD(&nesdev->cqp_avail_reqs); - INIT_LIST_HEAD(&nesdev->cqp_pending_reqs); - - for (count = 0; count < 2*NES_CQP_SQ_SIZE; count++) { - init_waitqueue_head(&nesdev->nes_cqp_requests[count].waitq); - list_add_tail(&nesdev->nes_cqp_requests[count].list, &nesdev->cqp_avail_reqs); - } - - /* Write Create CCQ WQE */ - cqp_head = nesdev->cqp.sq_head++; - cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, - (NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID | - NES_CQP_CQ_CHK_OVERFLOW | ((u32)nesdev->ccq.cq_size << 16))); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, - (nesdev->ccq.cq_number | - ((u32)nesdev->ceq_index << 16))); - u64temp = (u64)nesdev->ccq.cq_pbase; - set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); - cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0; - u64temp = (unsigned long)&nesdev->ccq; - cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = - cpu_to_le32((u32)(u64temp >> 1)); - cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = - cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF); - cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0; - - /* Write Create CEQ WQE */ - cqp_head = nesdev->cqp.sq_head++; - cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, - (NES_CQP_CREATE_CEQ + ((u32)nesdev->ceq_index << 8))); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX, ceq->ceq_size); - u64temp = (u64)ceq->ceq_pbase; - set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); - - /* Write Create AEQ WQE */ - cqp_head = nesdev->cqp.sq_head++; - cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, - (NES_CQP_CREATE_AEQ + ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 8))); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_AEQ_WQE_ELEMENT_COUNT_IDX, aeq->aeq_size); - u64temp = (u64)aeq->aeq_pbase; - set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); - - /* Write Create NIC CEQ WQE */ - cqp_head = nesdev->cqp.sq_head++; - cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, - 
(NES_CQP_CREATE_CEQ + ((u32)nesdev->nic_ceq_index << 8))); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX, nic_ceq->ceq_size); - u64temp = (u64)nic_ceq->ceq_pbase; - set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); - - /* Poll until CCQP done */ - count = 0; - do { - if (count++ > 1000) { - printk(KERN_ERR PFX "Error creating CQP\n"); - pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, - nesdev->cqp_vbase, nesdev->cqp_pbase); - return -1; - } - udelay(10); - } while (!(nes_read_indexed(nesdev, - NES_IDX_QP_CONTROL + (PCI_FUNC(nesdev->pcidev->devfn) * 8)) & (1 << 8))); - - nes_debug(NES_DBG_INIT, "CQP Status = 0x%08X\n", nes_read_indexed(nesdev, - NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))); - - u32temp = 0x04800000; - nes_write32(nesdev->regs+NES_WQE_ALLOC, u32temp | nesdev->cqp.qp_id); - - /* wait for the CCQ, CEQ, and AEQ to get created */ - count = 0; - do { - if (count++ > 1000) { - printk(KERN_ERR PFX "Error creating CCQ, CEQ, and AEQ\n"); - pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, - nesdev->cqp_vbase, nesdev->cqp_pbase); - return -1; - } - udelay(10); - } while (((nes_read_indexed(nesdev, - NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)) & (15<<8)) != (15<<8))); - - /* dump the QP status value */ - nes_debug(NES_DBG_INIT, "QP Status = 0x%08X\n", nes_read_indexed(nesdev, - NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))); - - nesdev->cqp.sq_tail++; - - return 0; -} - - -/** - * nes_destroy_cqp - */ -int nes_destroy_cqp(struct nes_device *nesdev) -{ - struct nes_hw_cqp_wqe *cqp_wqe; - u32 count = 0; - u32 cqp_head; - unsigned long flags; - - do { - if (count++ > 1000) - break; - udelay(10); - } while (!(nesdev->cqp.sq_head == nesdev->cqp.sq_tail)); - - /* Reset CCQ */ - nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_RESET | - nesdev->ccq.cq_number); - - /* Disable device interrupts */ - nes_write32(nesdev->regs+NES_INT_MASK, 0x7fffffff); - - spin_lock_irqsave(&nesdev->cqp.lock, flags); - - /* Destroy the AEQ */ - cqp_head = nesdev->cqp.sq_head++; - nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; - cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; - cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_AEQ | - ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 8)); - cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_HIGH_IDX] = 0; - - /* Destroy the NIC CEQ */ - cqp_head = nesdev->cqp.sq_head++; - nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; - cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; - cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CEQ | - ((u32)nesdev->nic_ceq_index << 8)); - - /* Destroy the CEQ */ - cqp_head = nesdev->cqp.sq_head++; - nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; - cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; - cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CEQ | - (nesdev->ceq_index << 8)); - - /* Destroy the CCQ */ - cqp_head = nesdev->cqp.sq_head++; - nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; - cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; - cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CQ); - cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesdev->ccq.cq_number | - ((u32)nesdev->ceq_index << 16)); - - /* Destroy CQP */ - cqp_head = nesdev->cqp.sq_head++; - nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; - cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; - cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_QP | - NES_CQP_QP_TYPE_CQP); - 
cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesdev->cqp.qp_id); - - barrier(); - /* Ring doorbell (5 WQEs) */ - nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x05800000 | nesdev->cqp.qp_id); - - spin_unlock_irqrestore(&nesdev->cqp.lock, flags); - - /* wait for the CCQ, CEQ, and AEQ to get destroyed */ - count = 0; - do { - if (count++ > 1000) { - printk(KERN_ERR PFX "Function%d: Error destroying CCQ, CEQ, and AEQ\n", - PCI_FUNC(nesdev->pcidev->devfn)); - break; - } - udelay(10); - } while (((nes_read_indexed(nesdev, - NES_IDX_QP_CONTROL + (PCI_FUNC(nesdev->pcidev->devfn)*8)) & (15 << 8)) != 0)); - - /* dump the QP status value */ - nes_debug(NES_DBG_SHUTDOWN, "Function%d: QP Status = 0x%08X\n", - PCI_FUNC(nesdev->pcidev->devfn), - nes_read_indexed(nesdev, - NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))); - - kfree(nesdev->nes_cqp_requests); - - /* Free the control structures */ - pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, nesdev->cqp.sq_vbase, - nesdev->cqp.sq_pbase); - - return 0; -} - - -/** - * nes_init_1g_phy - */ -static int nes_init_1g_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index) -{ - u32 counter = 0; - u16 phy_data; - int ret = 0; - - nes_read_1G_phy_reg(nesdev, 1, phy_index, &phy_data); - nes_write_1G_phy_reg(nesdev, 23, phy_index, 0xb000); - - /* Reset the PHY */ - nes_write_1G_phy_reg(nesdev, 0, phy_index, 0x8000); - udelay(100); - counter = 0; - do { - nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); - if (counter++ > 100) { - ret = -1; - break; - } - } while (phy_data & 0x8000); - - /* Setting no phy loopback */ - phy_data &= 0xbfff; - phy_data |= 0x1140; - nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data); - nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); - nes_read_1G_phy_reg(nesdev, 0x17, phy_index, &phy_data); - nes_read_1G_phy_reg(nesdev, 0x1e, phy_index, &phy_data); - - /* Setting the interrupt mask */ - nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data); - nes_write_1G_phy_reg(nesdev, 0x19, phy_index, 0xffee); - nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data); - - /* turning on flow control */ - nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data); - nes_write_1G_phy_reg(nesdev, 4, phy_index, (phy_data & ~(0x03E0)) | 0xc00); - nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data); - - /* Clear Half duplex */ - nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data); - nes_write_1G_phy_reg(nesdev, 9, phy_index, phy_data & ~(0x0100)); - nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data); - - nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); - nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data | 0x0300); - - return ret; -} - - -/** - * nes_init_2025_phy - */ -static int nes_init_2025_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index) -{ - u32 temp_phy_data = 0; - u32 temp_phy_data2 = 0; - u32 counter = 0; - u32 sds; - u32 mac_index = nesdev->mac_index; - int ret = 0; - unsigned int first_attempt = 1; - - /* Check firmware heartbeat */ - nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); - temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - udelay(1500); - nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); - temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - - if (temp_phy_data != temp_phy_data2) { - nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd); - temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - if ((temp_phy_data & 0xff) > 0x20) - return 0; - printk(PFX "Reinitialize external PHY\n"); - } - - /* no heartbeat, 
configure the PHY */ - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0x0000, 0x8000); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0000); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052); - - switch (phy_type) { - case NES_PHY_TYPE_ARGUS: - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0008); - nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0001); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0098); - nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00); - - /* setup LEDs */ - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x0007); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x000A); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0009); - break; - - case NES_PHY_TYPE_SFP_D: - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x0004); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0038); - nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0013); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0098); - nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00); - - /* setup LEDs */ - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x0007); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x000A); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0009); - break; - - case NES_PHY_TYPE_KR: - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0010); - nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0013); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0080); - nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00); - - /* setup LEDs */ - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x000B); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x0003); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0004); - - nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0022, 0x406D); - nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0023, 0x0020); - break; - } - - nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0028, 0xA528); - - /* Bring PHY out of reset */ - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0002); - - /* Check for heartbeat */ - counter = 0; - mdelay(690); - nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); - temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - do { - if (counter++ > 150) { - printk(PFX "No PHY heartbeat\n"); - break; - } - mdelay(1); - nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); - temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - } while (temp_phy_data2 == temp_phy_data); - - /* wait for tracking */ - counter = 0; - do { - nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd); - temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - if (counter++ > 300) { - if (((temp_phy_data & 0xff) == 0x0) && first_attempt) { - first_attempt = 0; - counter = 0; - /* reset AMCC PHY and try again */ - 
nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x00c0); - nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x0040); - continue; - } else { - ret = 1; - break; - } - } - mdelay(10); - } while ((temp_phy_data & 0xff) < 0x30); - - /* setup signal integrity */ - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd003, 0x0000); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00D, 0x00FE); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00E, 0x0032); - if (phy_type == NES_PHY_TYPE_KR) { - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00F, 0x000C); - } else { - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00F, 0x0002); - nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc314, 0x0063); - } - - /* reset serdes */ - sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200); - sds |= 0x1; - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200, sds); - sds &= 0xfffffffe; - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200, sds); - - counter = 0; - while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET) & 0x00000040) != 0x00000040) - && (counter++ < 5000)) - ; - - return ret; -} - - -/** - * nes_init_phy - */ -int nes_init_phy(struct nes_device *nesdev) -{ - struct nes_adapter *nesadapter = nesdev->nesadapter; - u32 mac_index = nesdev->mac_index; - u32 tx_config = 0; - unsigned long flags; - u8 phy_type = nesadapter->phy_type[mac_index]; - u8 phy_index = nesadapter->phy_index[mac_index]; - int ret = 0; - - tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG); - if (phy_type == NES_PHY_TYPE_1G) { - /* setup 1G MDIO operation */ - tx_config &= 0xFFFFFFE3; - tx_config |= 0x04; - } else { - /* setup 10G MDIO operation */ - tx_config &= 0xFFFFFFE3; - tx_config |= 0x1D; - } - nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config); - - spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags); - - switch (phy_type) { - case NES_PHY_TYPE_1G: - ret = nes_init_1g_phy(nesdev, phy_type, phy_index); - break; - case NES_PHY_TYPE_ARGUS: - case NES_PHY_TYPE_SFP_D: - case NES_PHY_TYPE_KR: - ret = nes_init_2025_phy(nesdev, phy_type, phy_index); - break; - } - - spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags); - - return ret; -} - - -/** - * nes_replenish_nic_rq - */ -static void nes_replenish_nic_rq(struct nes_vnic *nesvnic) -{ - unsigned long flags; - dma_addr_t bus_address; - struct sk_buff *skb; - struct nes_hw_nic_rq_wqe *nic_rqe; - struct nes_hw_nic *nesnic; - struct nes_device *nesdev; - struct nes_rskb_cb *cb; - u32 rx_wqes_posted = 0; - - nesnic = &nesvnic->nic; - nesdev = nesvnic->nesdev; - spin_lock_irqsave(&nesnic->rq_lock, flags); - if (nesnic->replenishing_rq !=0) { - if (((nesnic->rq_size-1) == atomic_read(&nesvnic->rx_skbs_needed)) && - (atomic_read(&nesvnic->rx_skb_timer_running) == 0)) { - atomic_set(&nesvnic->rx_skb_timer_running, 1); - spin_unlock_irqrestore(&nesnic->rq_lock, flags); - nesvnic->rq_wqes_timer.expires = jiffies + (HZ/2); /* 1/2 second */ - add_timer(&nesvnic->rq_wqes_timer); - } else - spin_unlock_irqrestore(&nesnic->rq_lock, flags); - return; - } - nesnic->replenishing_rq = 1; - spin_unlock_irqrestore(&nesnic->rq_lock, flags); - do { - skb = dev_alloc_skb(nesvnic->max_frame_size); - if (skb) { - skb->dev = nesvnic->netdev; - - bus_address = pci_map_single(nesdev->pcidev, - skb->data, nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); - cb = (struct nes_rskb_cb *)&skb->cb[0]; - cb->busaddr = bus_address; - cb->maplen = nesvnic->max_frame_size; - - nic_rqe = 
&nesnic->rq_vbase[nesvnic->nic.rq_head]; - nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = - cpu_to_le32(nesvnic->max_frame_size); - nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0; - nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = - cpu_to_le32((u32)bus_address); - nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = - cpu_to_le32((u32)((u64)bus_address >> 32)); - nesnic->rx_skb[nesnic->rq_head] = skb; - nesnic->rq_head++; - nesnic->rq_head &= nesnic->rq_size - 1; - atomic_dec(&nesvnic->rx_skbs_needed); - barrier(); - if (++rx_wqes_posted == 255) { - nes_write32(nesdev->regs+NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesnic->qp_id); - rx_wqes_posted = 0; - } - } else { - spin_lock_irqsave(&nesnic->rq_lock, flags); - if (((nesnic->rq_size-1) == atomic_read(&nesvnic->rx_skbs_needed)) && - (atomic_read(&nesvnic->rx_skb_timer_running) == 0)) { - atomic_set(&nesvnic->rx_skb_timer_running, 1); - spin_unlock_irqrestore(&nesnic->rq_lock, flags); - nesvnic->rq_wqes_timer.expires = jiffies + (HZ/2); /* 1/2 second */ - add_timer(&nesvnic->rq_wqes_timer); - } else - spin_unlock_irqrestore(&nesnic->rq_lock, flags); - break; - } - } while (atomic_read(&nesvnic->rx_skbs_needed)); - barrier(); - if (rx_wqes_posted) - nes_write32(nesdev->regs+NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesnic->qp_id); - nesnic->replenishing_rq = 0; -} - - -/** - * nes_rq_wqes_timeout - */ -static void nes_rq_wqes_timeout(struct timer_list *t) -{ - struct nes_vnic *nesvnic = from_timer(nesvnic, t, rq_wqes_timer); - printk("%s: Timer fired.\n", __func__); - atomic_set(&nesvnic->rx_skb_timer_running, 0); - if (atomic_read(&nesvnic->rx_skbs_needed)) - nes_replenish_nic_rq(nesvnic); -} - - -/** - * nes_init_nic_qp - */ -int nes_init_nic_qp(struct nes_device *nesdev, struct net_device *netdev) -{ - struct nes_hw_cqp_wqe *cqp_wqe; - struct nes_hw_nic_sq_wqe *nic_sqe; - struct nes_hw_nic_qp_context *nic_context; - struct sk_buff *skb; - struct nes_hw_nic_rq_wqe *nic_rqe; - struct nes_vnic *nesvnic = netdev_priv(netdev); - unsigned long flags; - void *vmem; - dma_addr_t pmem; - u64 u64temp; - int ret; - u32 cqp_head; - u32 counter; - u32 wqe_count; - struct nes_rskb_cb *cb; - u8 jumbomode=0; - - /* Allocate fragment, SQ, RQ, and CQ; Reuse CEQ based on the PCI function */ - nesvnic->nic_mem_size = 256 + - (NES_NIC_WQ_SIZE * sizeof(struct nes_first_frag)) + - (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe)) + - (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe)) + - (NES_NIC_WQ_SIZE * 2 * sizeof(struct nes_hw_nic_cqe)) + - sizeof(struct nes_hw_nic_qp_context); - - nesvnic->nic_vbase = pci_zalloc_consistent(nesdev->pcidev, - nesvnic->nic_mem_size, - &nesvnic->nic_pbase); - if (!nesvnic->nic_vbase) { - nes_debug(NES_DBG_INIT, "Unable to allocate memory for NIC host descriptor rings\n"); - return -ENOMEM; - } - nes_debug(NES_DBG_INIT, "Allocated NIC QP structures at %p (phys = %016lX), size = %u.\n", - nesvnic->nic_vbase, (unsigned long)nesvnic->nic_pbase, nesvnic->nic_mem_size); - - vmem = (void *)(((unsigned long)nesvnic->nic_vbase + (256 - 1)) & - ~(unsigned long)(256 - 1)); - pmem = (dma_addr_t)(((unsigned long long)nesvnic->nic_pbase + (256 - 1)) & - ~(unsigned long long)(256 - 1)); - - /* Setup the first Fragment buffers */ - nesvnic->nic.first_frag_vbase = vmem; - - for (counter = 0; counter < NES_NIC_WQ_SIZE; counter++) { - nesvnic->nic.frag_paddr[counter] = pmem; - pmem += sizeof(struct nes_first_frag); - } - - /* setup the SQ */ - vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_first_frag)); - - nesvnic->nic.sq_vbase = 
(void *)vmem; - nesvnic->nic.sq_pbase = pmem; - nesvnic->nic.sq_head = 0; - nesvnic->nic.sq_tail = 0; - nesvnic->nic.sq_size = NES_NIC_WQ_SIZE; - for (counter = 0; counter < NES_NIC_WQ_SIZE; counter++) { - nic_sqe = &nesvnic->nic.sq_vbase[counter]; - nic_sqe->wqe_words[NES_NIC_SQ_WQE_MISC_IDX] = - cpu_to_le32(NES_NIC_SQ_WQE_DISABLE_CHKSUM | - NES_NIC_SQ_WQE_COMPLETION); - nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX] = - cpu_to_le32((u32)NES_FIRST_FRAG_SIZE << 16); - nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX] = - cpu_to_le32((u32)nesvnic->nic.frag_paddr[counter]); - nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX] = - cpu_to_le32((u32)((u64)nesvnic->nic.frag_paddr[counter] >> 32)); - } - - nesvnic->get_cqp_request = nes_get_cqp_request; - nesvnic->post_cqp_request = nes_post_cqp_request; - nesvnic->mcrq_mcast_filter = NULL; - - spin_lock_init(&nesvnic->nic.rq_lock); - - /* setup the RQ */ - vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe)); - pmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe)); - - - nesvnic->nic.rq_vbase = vmem; - nesvnic->nic.rq_pbase = pmem; - nesvnic->nic.rq_head = 0; - nesvnic->nic.rq_tail = 0; - nesvnic->nic.rq_size = NES_NIC_WQ_SIZE; - - /* setup the CQ */ - vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe)); - pmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe)); - - if (nesdev->nesadapter->netdev_count > 2) - nesvnic->mcrq_qp_id = nesvnic->nic_index + 32; - else - nesvnic->mcrq_qp_id = nesvnic->nic.qp_id + 4; - - nesvnic->nic_cq.cq_vbase = vmem; - nesvnic->nic_cq.cq_pbase = pmem; - nesvnic->nic_cq.cq_head = 0; - nesvnic->nic_cq.cq_size = NES_NIC_WQ_SIZE * 2; - - nesvnic->nic_cq.ce_handler = nes_nic_napi_ce_handler; - - /* Send CreateCQ request to CQP */ - spin_lock_irqsave(&nesdev->cqp.lock, flags); - cqp_head = nesdev->cqp.sq_head; - - cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - - cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32( - NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID | - ((u32)nesvnic->nic_cq.cq_size << 16)); - cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32( - nesvnic->nic_cq.cq_number | ((u32)nesdev->nic_ceq_index << 16)); - u64temp = (u64)nesvnic->nic_cq.cq_pbase; - set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); - cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0; - u64temp = (unsigned long)&nesvnic->nic_cq; - cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = cpu_to_le32((u32)(u64temp >> 1)); - cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = - cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF); - cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0; - if (++cqp_head >= nesdev->cqp.sq_size) - cqp_head = 0; - cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - - /* Send CreateQP request to CQP */ - nic_context = (void *)(&nesvnic->nic_cq.cq_vbase[nesvnic->nic_cq.cq_size]); - nic_context->context_words[NES_NIC_CTX_MISC_IDX] = - cpu_to_le32((u32)NES_NIC_CTX_SIZE | - ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 12)); - nes_debug(NES_DBG_INIT, "RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x%08X, RX_WINDOW_BUFFER_SIZE = 0x%08X\n", - nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE), - nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE)); - if (nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE) != 0) { - nic_context->context_words[NES_NIC_CTX_MISC_IDX] |= cpu_to_le32(NES_NIC_BACK_STORE); - } - - u64temp = (u64)nesvnic->nic.sq_pbase; - 
nic_context->context_words[NES_NIC_CTX_SQ_LOW_IDX] = cpu_to_le32((u32)u64temp); - nic_context->context_words[NES_NIC_CTX_SQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32)); - u64temp = (u64)nesvnic->nic.rq_pbase; - nic_context->context_words[NES_NIC_CTX_RQ_LOW_IDX] = cpu_to_le32((u32)u64temp); - nic_context->context_words[NES_NIC_CTX_RQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32)); - - cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_CREATE_QP | - NES_CQP_QP_TYPE_NIC); - cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesvnic->nic.qp_id); - u64temp = (u64)nesvnic->nic_cq.cq_pbase + - (nesvnic->nic_cq.cq_size * sizeof(struct nes_hw_nic_cqe)); - set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); - - if (++cqp_head >= nesdev->cqp.sq_size) - cqp_head = 0; - nesdev->cqp.sq_head = cqp_head; - - barrier(); - - /* Ring doorbell (2 WQEs) */ - nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id); - - spin_unlock_irqrestore(&nesdev->cqp.lock, flags); - nes_debug(NES_DBG_INIT, "Waiting for create NIC QP%u to complete.\n", - nesvnic->nic.qp_id); - - ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head), - NES_EVENT_TIMEOUT); - nes_debug(NES_DBG_INIT, "Create NIC QP%u completed, wait_event_timeout ret = %u.\n", - nesvnic->nic.qp_id, ret); - if (!ret) { - nes_debug(NES_DBG_INIT, "NIC QP%u create timeout expired\n", nesvnic->nic.qp_id); - pci_free_consistent(nesdev->pcidev, nesvnic->nic_mem_size, nesvnic->nic_vbase, - nesvnic->nic_pbase); - return -EIO; - } - - /* Populate the RQ */ - for (counter = 0; counter < (NES_NIC_WQ_SIZE - 1); counter++) { - skb = dev_alloc_skb(nesvnic->max_frame_size); - if (!skb) { - nes_debug(NES_DBG_INIT, "%s: out of memory for receive skb\n", netdev->name); - - nes_destroy_nic_qp(nesvnic); - return -ENOMEM; - } - - skb->dev = netdev; - - pmem = pci_map_single(nesdev->pcidev, skb->data, - nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); - cb = (struct nes_rskb_cb *)&skb->cb[0]; - cb->busaddr = pmem; - cb->maplen = nesvnic->max_frame_size; - - nic_rqe = &nesvnic->nic.rq_vbase[counter]; - nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = cpu_to_le32(nesvnic->max_frame_size); - nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0; - nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = cpu_to_le32((u32)pmem); - nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = cpu_to_le32((u32)((u64)pmem >> 32)); - nesvnic->nic.rx_skb[counter] = skb; - } - - wqe_count = NES_NIC_WQ_SIZE - 1; - nesvnic->nic.rq_head = wqe_count; - barrier(); - do { - counter = min(wqe_count, ((u32)255)); - wqe_count -= counter; - nes_write32(nesdev->regs+NES_WQE_ALLOC, (counter << 24) | nesvnic->nic.qp_id); - } while (wqe_count); - timer_setup(&nesvnic->rq_wqes_timer, nes_rq_wqes_timeout, 0); - nes_debug(NES_DBG_INIT, "NAPI support Enabled\n"); - if (nesdev->nesadapter->et_use_adaptive_rx_coalesce) - { - nes_nic_init_timer(nesdev); - if (netdev->mtu > 1500) - jumbomode = 1; - nes_nic_init_timer_defaults(nesdev, jumbomode); - } - if ((nesdev->nesadapter->allow_unaligned_fpdus) && - (nes_init_mgt_qp(nesdev, netdev, nesvnic))) { - nes_debug(NES_DBG_INIT, "%s: Out of memory for pau nic\n", - netdev->name); - nes_destroy_nic_qp(nesvnic); - return -ENOMEM; - } - - return 0; -} - - -/** - * nes_destroy_nic_qp - */ -void nes_destroy_nic_qp(struct nes_vnic *nesvnic) -{ - u64 u64temp; - dma_addr_t bus_address; - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_hw_cqp_wqe *cqp_wqe; - struct nes_hw_nic_sq_wqe *nic_sqe; - __le16 
*wqe_fragment_length; - u16 wqe_fragment_index; - u32 cqp_head; - u32 wqm_cfg0; - unsigned long flags; - struct sk_buff *rx_skb; - struct nes_rskb_cb *cb; - int ret; - - if (nesdev->nesadapter->allow_unaligned_fpdus) - nes_destroy_mgt(nesvnic); - - /* clear wqe stall before destroying NIC QP */ - wqm_cfg0 = nes_read_indexed(nesdev, NES_IDX_WQM_CONFIG0); - nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG0, wqm_cfg0 & 0xFFFF7FFF); - - /* Free remaining NIC receive buffers */ - while (nesvnic->nic.rq_head != nesvnic->nic.rq_tail) { - rx_skb = nesvnic->nic.rx_skb[nesvnic->nic.rq_tail]; - cb = (struct nes_rskb_cb *)&rx_skb->cb[0]; - pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen, - PCI_DMA_FROMDEVICE); - - dev_kfree_skb(nesvnic->nic.rx_skb[nesvnic->nic.rq_tail++]); - nesvnic->nic.rq_tail &= (nesvnic->nic.rq_size - 1); - } - - /* Free remaining NIC transmit buffers */ - while (nesvnic->nic.sq_head != nesvnic->nic.sq_tail) { - nic_sqe = &nesvnic->nic.sq_vbase[nesvnic->nic.sq_tail]; - wqe_fragment_index = 1; - wqe_fragment_length = (__le16 *) - &nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX]; - /* bump past the vlan tag */ - wqe_fragment_length++; - if (le16_to_cpu(wqe_fragment_length[wqe_fragment_index]) != 0) { - u64temp = (u64)le32_to_cpu( - nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX+ - wqe_fragment_index*2]); - u64temp += ((u64)le32_to_cpu( - nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX - + wqe_fragment_index*2]))<<32; - bus_address = (dma_addr_t)u64temp; - if (test_and_clear_bit(nesvnic->nic.sq_tail, - nesvnic->nic.first_frag_overflow)) { - pci_unmap_single(nesdev->pcidev, - bus_address, - le16_to_cpu(wqe_fragment_length[ - wqe_fragment_index++]), - PCI_DMA_TODEVICE); - } - for (; wqe_fragment_index < 5; wqe_fragment_index++) { - if (wqe_fragment_length[wqe_fragment_index]) { - u64temp = le32_to_cpu( - nic_sqe->wqe_words[ - NES_NIC_SQ_WQE_FRAG0_LOW_IDX+ - wqe_fragment_index*2]); - u64temp += ((u64)le32_to_cpu( - nic_sqe->wqe_words[ - NES_NIC_SQ_WQE_FRAG0_HIGH_IDX+ - wqe_fragment_index*2]))<<32; - bus_address = (dma_addr_t)u64temp; - pci_unmap_page(nesdev->pcidev, - bus_address, - le16_to_cpu( - wqe_fragment_length[ - wqe_fragment_index]), - PCI_DMA_TODEVICE); - } else - break; - } - } - if (nesvnic->nic.tx_skb[nesvnic->nic.sq_tail]) - dev_kfree_skb( - nesvnic->nic.tx_skb[nesvnic->nic.sq_tail]); - - nesvnic->nic.sq_tail = (nesvnic->nic.sq_tail + 1) - & (nesvnic->nic.sq_size - 1); - } - - spin_lock_irqsave(&nesdev->cqp.lock, flags); - - /* Destroy NIC QP */ - cqp_head = nesdev->cqp.sq_head; - cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, - (NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_NIC)); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, - nesvnic->nic.qp_id); - - if (++cqp_head >= nesdev->cqp.sq_size) - cqp_head = 0; - - cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; - - /* Destroy NIC CQ */ - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, - (NES_CQP_DESTROY_CQ | ((u32)nesvnic->nic_cq.cq_size << 16))); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, - (nesvnic->nic_cq.cq_number | ((u32)nesdev->nic_ceq_index << 16))); - - if (++cqp_head >= nesdev->cqp.sq_size) - cqp_head = 0; - - nesdev->cqp.sq_head = cqp_head; - barrier(); - - /* Ring doorbell (2 WQEs) */ - nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id); - - spin_unlock_irqrestore(&nesdev->cqp.lock, flags); - 
nes_debug(NES_DBG_SHUTDOWN, "Waiting for CQP, cqp_head=%u, cqp.sq_head=%u," - " cqp.sq_tail=%u, cqp.sq_size=%u\n", - cqp_head, nesdev->cqp.sq_head, - nesdev->cqp.sq_tail, nesdev->cqp.sq_size); - - ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head), - NES_EVENT_TIMEOUT); - - nes_debug(NES_DBG_SHUTDOWN, "Destroy NIC QP returned, wait_event_timeout ret = %u, cqp_head=%u," - " cqp.sq_head=%u, cqp.sq_tail=%u\n", - ret, cqp_head, nesdev->cqp.sq_head, nesdev->cqp.sq_tail); - if (!ret) { - nes_debug(NES_DBG_SHUTDOWN, "NIC QP%u destroy timeout expired\n", - nesvnic->nic.qp_id); - } - - pci_free_consistent(nesdev->pcidev, nesvnic->nic_mem_size, nesvnic->nic_vbase, - nesvnic->nic_pbase); - - /* restore old wqm_cfg0 value */ - nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG0, wqm_cfg0); -} - -/** - * nes_napi_isr - */ -int nes_napi_isr(struct nes_device *nesdev) -{ - struct nes_adapter *nesadapter = nesdev->nesadapter; - u32 int_stat; - - if (nesdev->napi_isr_ran) { - /* interrupt status has already been read in ISR */ - int_stat = nesdev->int_stat; - } else { - int_stat = nes_read32(nesdev->regs + NES_INT_STAT); - nesdev->int_stat = int_stat; - nesdev->napi_isr_ran = 1; - } - - int_stat &= nesdev->int_req; - /* iff NIC, process here, else wait for DPC */ - if ((int_stat) && ((int_stat & 0x0000ff00) == int_stat)) { - nesdev->napi_isr_ran = 0; - nes_write32(nesdev->regs + NES_INT_STAT, - (int_stat & - ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0 | NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3))); - - /* Process the CEQs */ - nes_process_ceq(nesdev, &nesdev->nesadapter->ceq[nesdev->nic_ceq_index]); - - if (unlikely((((nesadapter->et_rx_coalesce_usecs_irq) && - (!nesadapter->et_use_adaptive_rx_coalesce)) || - ((nesadapter->et_use_adaptive_rx_coalesce) && - (nesdev->deepcq_count > nesadapter->et_pkt_rate_low))))) { - if ((nesdev->int_req & NES_INT_TIMER) == 0) { - /* Enable Periodic timer interrupts */ - nesdev->int_req |= NES_INT_TIMER; - /* ack any pending periodic timer interrupts so we don't get an immediate interrupt */ - /* TODO: need to also ack other unused periodic timer values, get from nesadapter */ - nes_write32(nesdev->regs+NES_TIMER_STAT, - nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req)); - nes_write32(nesdev->regs+NES_INTF_INT_MASK, - ~(nesdev->intf_int_req | NES_INTF_PERIODIC_TIMER)); - } - - if (unlikely(nesadapter->et_use_adaptive_rx_coalesce)) - { - nes_nic_init_timer(nesdev); - } - /* Enable interrupts, except CEQs */ - nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req)); - } else { - /* Enable interrupts, make sure timer is off */ - nesdev->int_req &= ~NES_INT_TIMER; - nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req)); - nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req); - } - nesdev->deepcq_count = 0; - return 1; - } else { - return 0; - } -} - -static void process_critical_error(struct nes_device *nesdev) -{ - u32 debug_error; - u32 nes_idx_debug_error_masks0 = 0; - u16 error_module = 0; - - debug_error = nes_read_indexed(nesdev, NES_IDX_DEBUG_ERROR_CONTROL_STATUS); - printk(KERN_ERR PFX "Critical Error reported by device!!! 
0x%02X\n", - (u16)debug_error); - nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_CONTROL_STATUS, - 0x01010000 | (debug_error & 0x0000ffff)); - if (crit_err_count++ > 10) - nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS1, 1 << 0x17); - error_module = (u16) (debug_error & 0x1F00) >> 8; - if (++nesdev->nesadapter->crit_error_count[error_module-1] >= - nes_max_critical_error_count) { - printk(KERN_ERR PFX "Masking off critical error for module " - "0x%02X\n", (u16)error_module); - nes_idx_debug_error_masks0 = nes_read_indexed(nesdev, - NES_IDX_DEBUG_ERROR_MASKS0); - nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS0, - nes_idx_debug_error_masks0 | (1 << error_module)); - } -} -/** - * nes_dpc - */ -void nes_dpc(unsigned long param) -{ - struct nes_device *nesdev = (struct nes_device *)param; - struct nes_adapter *nesadapter = nesdev->nesadapter; - u32 counter; - u32 loop_counter = 0; - u32 int_status_bit; - u32 int_stat; - u32 timer_stat; - u32 temp_int_stat; - u32 intf_int_stat; - u32 processed_intf_int = 0; - u16 processed_timer_int = 0; - u16 completion_ints = 0; - u16 timer_ints = 0; - - /* nes_debug(NES_DBG_ISR, "\n"); */ - - do { - timer_stat = 0; - if (nesdev->napi_isr_ran) { - nesdev->napi_isr_ran = 0; - int_stat = nesdev->int_stat; - } else - int_stat = nes_read32(nesdev->regs+NES_INT_STAT); - if (processed_intf_int != 0) - int_stat &= nesdev->int_req & ~NES_INT_INTF; - else - int_stat &= nesdev->int_req; - if (processed_timer_int == 0) { - processed_timer_int = 1; - if (int_stat & NES_INT_TIMER) { - timer_stat = nes_read32(nesdev->regs + NES_TIMER_STAT); - if ((timer_stat & nesdev->timer_int_req) == 0) { - int_stat &= ~NES_INT_TIMER; - } - } - } else { - int_stat &= ~NES_INT_TIMER; - } - - if (int_stat) { - if (int_stat & ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0| - NES_INT_MAC1|NES_INT_MAC2 | NES_INT_MAC3)) { - /* Ack the interrupts */ - nes_write32(nesdev->regs+NES_INT_STAT, - (int_stat & ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0| - NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3))); - } - - temp_int_stat = int_stat; - for (counter = 0, int_status_bit = 1; counter < 16; counter++) { - if (int_stat & int_status_bit) { - nes_process_ceq(nesdev, &nesadapter->ceq[counter]); - temp_int_stat &= ~int_status_bit; - completion_ints = 1; - } - if (!(temp_int_stat & 0x0000ffff)) - break; - int_status_bit <<= 1; - } - - /* Process the AEQ for this pci function */ - int_status_bit = 1 << (16 + PCI_FUNC(nesdev->pcidev->devfn)); - if (int_stat & int_status_bit) { - nes_process_aeq(nesdev, &nesadapter->aeq[PCI_FUNC(nesdev->pcidev->devfn)]); - } - - /* Process the MAC interrupt for this pci function */ - int_status_bit = 1 << (24 + nesdev->mac_index); - if (int_stat & int_status_bit) { - nes_process_mac_intr(nesdev, nesdev->mac_index); - } - - if (int_stat & NES_INT_TIMER) { - if (timer_stat & nesdev->timer_int_req) { - nes_write32(nesdev->regs + NES_TIMER_STAT, - (timer_stat & nesdev->timer_int_req) | - ~(nesdev->nesadapter->timer_int_req)); - timer_ints = 1; - } - } - - if (int_stat & NES_INT_INTF) { - processed_intf_int = 1; - intf_int_stat = nes_read32(nesdev->regs+NES_INTF_INT_STAT); - intf_int_stat &= nesdev->intf_int_req; - if (NES_INTF_INT_CRITERR & intf_int_stat) { - process_critical_error(nesdev); - } - if (NES_INTF_INT_PCIERR & intf_int_stat) { - printk(KERN_ERR PFX "PCI Error reported by device!!!\n"); - BUG(); - } - if (NES_INTF_INT_AEQ_OFLOW & intf_int_stat) { - printk(KERN_ERR PFX "AEQ Overflow reported by device!!!\n"); - BUG(); - } - 
nes_write32(nesdev->regs+NES_INTF_INT_STAT, intf_int_stat); - } - - if (int_stat & NES_INT_TSW) { - } - } - /* Don't use the interface interrupt bit stay in loop */ - int_stat &= ~NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0 | - NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3; - } while ((int_stat != 0) && (loop_counter++ < MAX_DPC_ITERATIONS)); - - if (timer_ints == 1) { - if ((nesadapter->et_rx_coalesce_usecs_irq) || (nesadapter->et_use_adaptive_rx_coalesce)) { - if (completion_ints == 0) { - nesdev->timer_only_int_count++; - if (nesdev->timer_only_int_count>=nesadapter->timer_int_limit) { - nesdev->timer_only_int_count = 0; - nesdev->int_req &= ~NES_INT_TIMER; - nes_write32(nesdev->regs + NES_INTF_INT_MASK, ~(nesdev->intf_int_req)); - nes_write32(nesdev->regs + NES_INT_MASK, ~nesdev->int_req); - } else { - nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req)); - } - } else { - if (unlikely(nesadapter->et_use_adaptive_rx_coalesce)) - { - nes_nic_init_timer(nesdev); - } - nesdev->timer_only_int_count = 0; - nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req)); - } - } else { - nesdev->timer_only_int_count = 0; - nesdev->int_req &= ~NES_INT_TIMER; - nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req)); - nes_write32(nesdev->regs+NES_TIMER_STAT, - nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req)); - nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req); - } - } else { - if ( (completion_ints == 1) && - (((nesadapter->et_rx_coalesce_usecs_irq) && - (!nesadapter->et_use_adaptive_rx_coalesce)) || - ((nesdev->deepcq_count > nesadapter->et_pkt_rate_low) && - (nesadapter->et_use_adaptive_rx_coalesce) )) ) { - /* nes_debug(NES_DBG_ISR, "Enabling periodic timer interrupt.\n" ); */ - nesdev->timer_only_int_count = 0; - nesdev->int_req |= NES_INT_TIMER; - nes_write32(nesdev->regs+NES_TIMER_STAT, - nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req)); - nes_write32(nesdev->regs+NES_INTF_INT_MASK, - ~(nesdev->intf_int_req | NES_INTF_PERIODIC_TIMER)); - nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req)); - } else { - nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req); - } - } - nesdev->deepcq_count = 0; -} - - -/** - * nes_process_ceq - */ -static void nes_process_ceq(struct nes_device *nesdev, struct nes_hw_ceq *ceq) -{ - u64 u64temp; - struct nes_hw_cq *cq; - u32 head; - u32 ceq_size; - - /* nes_debug(NES_DBG_CQ, "\n"); */ - head = ceq->ceq_head; - ceq_size = ceq->ceq_size; - - do { - if (le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX]) & - NES_CEQE_VALID) { - u64temp = (((u64)(le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX]))) << 32) | - ((u64)(le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_LOW_IDX]))); - u64temp <<= 1; - cq = *((struct nes_hw_cq **)&u64temp); - /* nes_debug(NES_DBG_CQ, "pCQ = %p\n", cq); */ - barrier(); - ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX] = 0; - - /* call the event handler */ - cq->ce_handler(nesdev, cq); - - if (++head >= ceq_size) - head = 0; - } else { - break; - } - - } while (1); - - ceq->ceq_head = head; -} - - -/** - * nes_process_aeq - */ -static void nes_process_aeq(struct nes_device *nesdev, struct nes_hw_aeq *aeq) -{ - /* u64 u64temp; */ - u32 head; - u32 aeq_size; - u32 aeqe_misc; - u32 aeqe_cq_id; - struct nes_hw_aeqe volatile *aeqe; - - head = aeq->aeq_head; - aeq_size = aeq->aeq_size; - - do { - aeqe = &aeq->aeq_vbase[head]; - if ((le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]) & 
NES_AEQE_VALID) == 0) - break; - aeqe_misc = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); - aeqe_cq_id = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]); - if (aeqe_misc & (NES_AEQE_QP|NES_AEQE_CQ)) { - if (aeqe_cq_id >= NES_FIRST_QPN) { - /* dealing with an accelerated QP related AE */ - /* - * u64temp = (((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX]))) << 32) | - * ((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]))); - */ - nes_process_iwarp_aeqe(nesdev, (struct nes_hw_aeqe *)aeqe); - } else { - /* TODO: dealing with a CQP related AE */ - nes_debug(NES_DBG_AEQ, "Processing CQP related AE, misc = 0x%04X\n", - (u16)(aeqe_misc >> 16)); - } - } - - aeqe->aeqe_words[NES_AEQE_MISC_IDX] = 0; - - if (++head >= aeq_size) - head = 0; - - nes_write32(nesdev->regs + NES_AEQ_ALLOC, 1 << 16); - } - while (1); - aeq->aeq_head = head; -} - -static void nes_reset_link(struct nes_device *nesdev, u32 mac_index) -{ - struct nes_adapter *nesadapter = nesdev->nesadapter; - u32 reset_value; - u32 i=0; - u32 u32temp; - - if (nesadapter->hw_rev == NE020_REV) { - return; - } - mh_detected++; - - reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); - - if ((mac_index == 0) || ((mac_index == 1) && (nesadapter->OneG_Mode))) - reset_value |= 0x0000001d; - else - reset_value |= 0x0000002d; - - if (4 <= (nesadapter->link_interrupt_count[mac_index] / ((u16)NES_MAX_LINK_INTERRUPTS))) { - if ((!nesadapter->OneG_Mode) && (nesadapter->port_count == 2)) { - nesadapter->link_interrupt_count[0] = 0; - nesadapter->link_interrupt_count[1] = 0; - u32temp = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1); - if (0x00000040 & u32temp) - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F088); - else - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8); - - reset_value |= 0x0000003d; - } - nesadapter->link_interrupt_count[mac_index] = 0; - } - - nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value); - - while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) - & 0x00000040) != 0x00000040) && (i++ < 5000)); - - if (0x0000003d == (reset_value & 0x0000003d)) { - u32 pcs_control_status0, pcs_control_status1; - - for (i = 0; i < 10; i++) { - pcs_control_status0 = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0); - pcs_control_status1 = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); - if (((0x0F000000 == (pcs_control_status0 & 0x0F000000)) - && (pcs_control_status0 & 0x00100000)) - || ((0x0F000000 == (pcs_control_status1 & 0x0F000000)) - && (pcs_control_status1 & 0x00100000))) - continue; - else - break; - } - if (10 == i) { - u32temp = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1); - if (0x00000040 & u32temp) - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F088); - else - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8); - - nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value); - - while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET) - & 0x00000040) != 0x00000040) && (i++ < 5000)); - } - } -} - -/** - * nes_process_mac_intr - */ -static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number) -{ - unsigned long flags; - u32 pcs_control_status; - struct nes_adapter *nesadapter = nesdev->nesadapter; - struct nes_vnic *nesvnic; - u32 mac_status; - u32 mac_index = nesdev->mac_index; - u32 u32temp; - u16 phy_data; - u16 temp_phy_data; - u32 pcs_val = 0x0f0f0000; - u32 pcs_mask = 0x0f1f0000; - u32 cdr_ctrl; - - 
spin_lock_irqsave(&nesadapter->phy_lock, flags); - if (nesadapter->mac_sw_state[mac_number] != NES_MAC_SW_IDLE) { - spin_unlock_irqrestore(&nesadapter->phy_lock, flags); - return; - } - nesadapter->mac_sw_state[mac_number] = NES_MAC_SW_INTERRUPT; - - /* ack the MAC interrupt */ - mac_status = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (mac_index * 0x200)); - /* Clear the interrupt */ - nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (mac_index * 0x200), mac_status); - - nes_debug(NES_DBG_PHY, "MAC%u interrupt status = 0x%X.\n", mac_number, mac_status); - - if (mac_status & (NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT)) { - nesdev->link_status_interrupts++; - if (0 == (++nesadapter->link_interrupt_count[mac_index] % ((u16)NES_MAX_LINK_INTERRUPTS))) - nes_reset_link(nesdev, mac_index); - - /* read the PHY interrupt status register */ - if ((nesadapter->OneG_Mode) && - (nesadapter->phy_type[mac_index] != NES_PHY_TYPE_PUMA_1G)) { - do { - nes_read_1G_phy_reg(nesdev, 0x1a, - nesadapter->phy_index[mac_index], &phy_data); - nes_debug(NES_DBG_PHY, "Phy%d data from register 0x1a = 0x%X.\n", - nesadapter->phy_index[mac_index], phy_data); - } while (phy_data&0x8000); - - temp_phy_data = 0; - do { - nes_read_1G_phy_reg(nesdev, 0x11, - nesadapter->phy_index[mac_index], &phy_data); - nes_debug(NES_DBG_PHY, "Phy%d data from register 0x11 = 0x%X.\n", - nesadapter->phy_index[mac_index], phy_data); - if (temp_phy_data == phy_data) - break; - temp_phy_data = phy_data; - } while (1); - - nes_read_1G_phy_reg(nesdev, 0x1e, - nesadapter->phy_index[mac_index], &phy_data); - nes_debug(NES_DBG_PHY, "Phy%d data from register 0x1e = 0x%X.\n", - nesadapter->phy_index[mac_index], phy_data); - - nes_read_1G_phy_reg(nesdev, 1, - nesadapter->phy_index[mac_index], &phy_data); - nes_debug(NES_DBG_PHY, "1G phy%u data from register 1 = 0x%X\n", - nesadapter->phy_index[mac_index], phy_data); - - if (temp_phy_data & 0x1000) { - nes_debug(NES_DBG_PHY, "The Link is up according to the PHY\n"); - phy_data = 4; - } else { - nes_debug(NES_DBG_PHY, "The Link is down according to the PHY\n"); - } - } - nes_debug(NES_DBG_PHY, "Eth SERDES Common Status: 0=0x%08X, 1=0x%08X\n", - nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0), - nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0+0x200)); - - if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_PUMA_1G) { - switch (mac_index) { - case 1: - case 3: - pcs_control_status = nes_read_indexed(nesdev, - NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); - break; - default: - pcs_control_status = nes_read_indexed(nesdev, - NES_IDX_PHY_PCS_CONTROL_STATUS0); - break; - } - } else { - pcs_control_status = nes_read_indexed(nesdev, - NES_IDX_PHY_PCS_CONTROL_STATUS0 + ((mac_index & 1) * 0x200)); - pcs_control_status = nes_read_indexed(nesdev, - NES_IDX_PHY_PCS_CONTROL_STATUS0 + ((mac_index & 1) * 0x200)); - } - - nes_debug(NES_DBG_PHY, "PCS PHY Control/Status%u: 0x%08X\n", - mac_index, pcs_control_status); - if ((nesadapter->OneG_Mode) && - (nesadapter->phy_type[mac_index] != NES_PHY_TYPE_PUMA_1G)) { - u32temp = 0x01010000; - if (nesadapter->port_count > 2) { - u32temp |= 0x02020000; - } - if ((pcs_control_status & u32temp)!= u32temp) { - phy_data = 0; - nes_debug(NES_DBG_PHY, "PCS says the link is down\n"); - } - } else { - switch (nesadapter->phy_type[mac_index]) { - case NES_PHY_TYPE_ARGUS: - case NES_PHY_TYPE_SFP_D: - case NES_PHY_TYPE_KR: - /* clear the alarms */ - nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0x0008); - nes_read_10G_phy_reg(nesdev, 
nesadapter->phy_index[mac_index], 4, 0xc001); - nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc002); - nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc005); - nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc006); - nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003); - nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9004); - nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9005); - /* check link status */ - nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003); - temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - - nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021); - nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021); - phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - - phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0; - - nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n", - __func__, phy_data, nesadapter->mac_link_down[mac_index] ? "DOWN" : "UP"); - break; - - case NES_PHY_TYPE_PUMA_1G: - if (mac_index < 2) - pcs_val = pcs_mask = 0x01010000; - else - pcs_val = pcs_mask = 0x02020000; - /* fall through */ - default: - phy_data = (pcs_val == (pcs_control_status & pcs_mask)) ? 0x4 : 0x0; - break; - } - } - - if (phy_data & 0x0004) { - if (wide_ppm_offset && - (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_CX4) && - (nesadapter->hw_rev != NE020_REV)) { - cdr_ctrl = nes_read_indexed(nesdev, - NES_IDX_ETH_SERDES_CDR_CONTROL0 + - mac_index * 0x200); - nes_write_indexed(nesdev, - NES_IDX_ETH_SERDES_CDR_CONTROL0 + - mac_index * 0x200, - cdr_ctrl | 0x000F0000); - } - nesadapter->mac_link_down[mac_index] = 0; - list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) { - nes_debug(NES_DBG_PHY, "The Link is UP!!. linkup was %d\n", - nesvnic->linkup); - if (nesvnic->linkup == 0) { - printk(PFX "The Link is now up for port %s, netdev %p.\n", - nesvnic->netdev->name, nesvnic->netdev); - if (netif_queue_stopped(nesvnic->netdev)) - netif_start_queue(nesvnic->netdev); - nesvnic->linkup = 1; - netif_carrier_on(nesvnic->netdev); - - spin_lock(&nesvnic->port_ibevent_lock); - if (nesvnic->of_device_registered) { - if (nesdev->iw_status == 0) { - nesdev->iw_status = 1; - nes_port_ibevent(nesvnic); - } - } - spin_unlock(&nesvnic->port_ibevent_lock); - } - } - } else { - if (wide_ppm_offset && - (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_CX4) && - (nesadapter->hw_rev != NE020_REV)) { - cdr_ctrl = nes_read_indexed(nesdev, - NES_IDX_ETH_SERDES_CDR_CONTROL0 + - mac_index * 0x200); - nes_write_indexed(nesdev, - NES_IDX_ETH_SERDES_CDR_CONTROL0 + - mac_index * 0x200, - cdr_ctrl & 0xFFF0FFFF); - } - nesadapter->mac_link_down[mac_index] = 1; - list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) { - nes_debug(NES_DBG_PHY, "The Link is Down!!. 
linkup was %d\n", - nesvnic->linkup); - if (nesvnic->linkup == 1) { - printk(PFX "The Link is now down for port %s, netdev %p.\n", - nesvnic->netdev->name, nesvnic->netdev); - if (!(netif_queue_stopped(nesvnic->netdev))) - netif_stop_queue(nesvnic->netdev); - nesvnic->linkup = 0; - netif_carrier_off(nesvnic->netdev); - - spin_lock(&nesvnic->port_ibevent_lock); - if (nesvnic->of_device_registered) { - if (nesdev->iw_status == 1) { - nesdev->iw_status = 0; - nes_port_ibevent(nesvnic); - } - } - spin_unlock(&nesvnic->port_ibevent_lock); - } - } - } - if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_SFP_D) { - nesdev->link_recheck = 1; - mod_delayed_work(system_wq, &nesdev->work, - NES_LINK_RECHECK_DELAY); - } - } - - spin_unlock_irqrestore(&nesadapter->phy_lock, flags); - - nesadapter->mac_sw_state[mac_number] = NES_MAC_SW_IDLE; -} - -void nes_recheck_link_status(struct work_struct *work) -{ - unsigned long flags; - struct nes_device *nesdev = container_of(work, struct nes_device, work.work); - struct nes_adapter *nesadapter = nesdev->nesadapter; - struct nes_vnic *nesvnic; - u32 mac_index = nesdev->mac_index; - u16 phy_data; - u16 temp_phy_data; - - spin_lock_irqsave(&nesadapter->phy_lock, flags); - - /* check link status */ - nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003); - temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - - nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021); - nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021); - phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - - phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0; - - nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n", - __func__, phy_data, - nesadapter->mac_link_down[mac_index] ? 
"DOWN" : "UP"); - - if (phy_data & 0x0004) { - nesadapter->mac_link_down[mac_index] = 0; - list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) { - if (nesvnic->linkup == 0) { - printk(PFX "The Link is now up for port %s, netdev %p.\n", - nesvnic->netdev->name, nesvnic->netdev); - if (netif_queue_stopped(nesvnic->netdev)) - netif_start_queue(nesvnic->netdev); - nesvnic->linkup = 1; - netif_carrier_on(nesvnic->netdev); - - spin_lock(&nesvnic->port_ibevent_lock); - if (nesvnic->of_device_registered) { - if (nesdev->iw_status == 0) { - nesdev->iw_status = 1; - nes_port_ibevent(nesvnic); - } - } - spin_unlock(&nesvnic->port_ibevent_lock); - } - } - - } else { - nesadapter->mac_link_down[mac_index] = 1; - list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) { - if (nesvnic->linkup == 1) { - printk(PFX "The Link is now down for port %s, netdev %p.\n", - nesvnic->netdev->name, nesvnic->netdev); - if (!(netif_queue_stopped(nesvnic->netdev))) - netif_stop_queue(nesvnic->netdev); - nesvnic->linkup = 0; - netif_carrier_off(nesvnic->netdev); - - spin_lock(&nesvnic->port_ibevent_lock); - if (nesvnic->of_device_registered) { - if (nesdev->iw_status == 1) { - nesdev->iw_status = 0; - nes_port_ibevent(nesvnic); - } - } - spin_unlock(&nesvnic->port_ibevent_lock); - } - } - } - if (nesdev->link_recheck++ < NES_LINK_RECHECK_MAX) - schedule_delayed_work(&nesdev->work, NES_LINK_RECHECK_DELAY); - else - nesdev->link_recheck = 0; - - spin_unlock_irqrestore(&nesadapter->phy_lock, flags); -} - - -static void nes_nic_napi_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) -{ - struct nes_vnic *nesvnic = container_of(cq, struct nes_vnic, nic_cq); - - napi_schedule(&nesvnic->napi); -} - - -/* The MAX_RQES_TO_PROCESS defines how many max read requests to complete before -* getting out of nic_ce_handler -*/ -#define MAX_RQES_TO_PROCESS 384 - -/** - * nes_nic_ce_handler - */ -void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) -{ - u64 u64temp; - dma_addr_t bus_address; - struct nes_hw_nic *nesnic; - struct nes_vnic *nesvnic = container_of(cq, struct nes_vnic, nic_cq); - struct nes_adapter *nesadapter = nesdev->nesadapter; - struct nes_hw_nic_rq_wqe *nic_rqe; - struct nes_hw_nic_sq_wqe *nic_sqe; - struct sk_buff *skb; - struct sk_buff *rx_skb; - struct nes_rskb_cb *cb; - __le16 *wqe_fragment_length; - u32 head; - u32 cq_size; - u32 rx_pkt_size; - u32 cqe_count=0; - u32 cqe_errv; - u32 cqe_misc; - u16 wqe_fragment_index = 1; /* first fragment (0) is used by copy buffer */ - u16 vlan_tag; - u16 pkt_type; - u16 rqes_processed = 0; - u8 sq_cqes = 0; - - head = cq->cq_head; - cq_size = cq->cq_size; - cq->cqes_pending = 1; - do { - if (le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]) & - NES_NIC_CQE_VALID) { - nesnic = &nesvnic->nic; - cqe_misc = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]); - if (cqe_misc & NES_NIC_CQE_SQ) { - sq_cqes++; - wqe_fragment_index = 1; - nic_sqe = &nesnic->sq_vbase[nesnic->sq_tail]; - skb = nesnic->tx_skb[nesnic->sq_tail]; - wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX]; - /* bump past the vlan tag */ - wqe_fragment_length++; - if (le16_to_cpu(wqe_fragment_length[wqe_fragment_index]) != 0) { - u64temp = (u64) le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX + - wqe_fragment_index * 2]); - u64temp += ((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX + - wqe_fragment_index * 2])) << 32; - bus_address = (dma_addr_t)u64temp; 
- if (test_and_clear_bit(nesnic->sq_tail, nesnic->first_frag_overflow)) { - pci_unmap_single(nesdev->pcidev, - bus_address, - le16_to_cpu(wqe_fragment_length[wqe_fragment_index++]), - PCI_DMA_TODEVICE); - } - for (; wqe_fragment_index < 5; wqe_fragment_index++) { - if (wqe_fragment_length[wqe_fragment_index]) { - u64temp = le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX + - wqe_fragment_index * 2]); - u64temp += ((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX - + wqe_fragment_index * 2])) <<32; - bus_address = (dma_addr_t)u64temp; - pci_unmap_page(nesdev->pcidev, - bus_address, - le16_to_cpu(wqe_fragment_length[wqe_fragment_index]), - PCI_DMA_TODEVICE); - } else - break; - } - } - if (skb) - dev_kfree_skb_any(skb); - nesnic->sq_tail++; - nesnic->sq_tail &= nesnic->sq_size-1; - if (sq_cqes > 128) { - barrier(); - /* restart the queue if it had been stopped */ - if (netif_queue_stopped(nesvnic->netdev)) - netif_wake_queue(nesvnic->netdev); - sq_cqes = 0; - } - } else { - rqes_processed ++; - - cq->rx_cqes_completed++; - cq->rx_pkts_indicated++; - rx_pkt_size = cqe_misc & 0x0000ffff; - nic_rqe = &nesnic->rq_vbase[nesnic->rq_tail]; - /* Get the skb */ - rx_skb = nesnic->rx_skb[nesnic->rq_tail]; - nic_rqe = &nesnic->rq_vbase[nesvnic->nic.rq_tail]; - bus_address = (dma_addr_t)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX]); - bus_address += ((u64)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX])) << 32; - pci_unmap_single(nesdev->pcidev, bus_address, - nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); - cb = (struct nes_rskb_cb *)&rx_skb->cb[0]; - cb->busaddr = 0; - /* rx_skb->tail = rx_skb->data + rx_pkt_size; */ - /* rx_skb->len = rx_pkt_size; */ - rx_skb->len = 0; /* TODO: see if this is necessary */ - skb_put(rx_skb, rx_pkt_size); - rx_skb->protocol = eth_type_trans(rx_skb, nesvnic->netdev); - nesnic->rq_tail++; - nesnic->rq_tail &= nesnic->rq_size - 1; - - atomic_inc(&nesvnic->rx_skbs_needed); - if (atomic_read(&nesvnic->rx_skbs_needed) > (nesvnic->nic.rq_size>>1)) { - nes_write32(nesdev->regs+NES_CQE_ALLOC, - cq->cq_number | (cqe_count << 16)); - /* nesadapter->tune_timer.cq_count += cqe_count; */ - nesdev->currcq_count += cqe_count; - cqe_count = 0; - nes_replenish_nic_rq(nesvnic); - } - pkt_type = (u16)(le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_TAG_PKT_TYPE_IDX])); - cqe_errv = (cqe_misc & NES_NIC_CQE_ERRV_MASK) >> NES_NIC_CQE_ERRV_SHIFT; - rx_skb->ip_summed = CHECKSUM_NONE; - - if ((NES_PKT_TYPE_TCPV4_BITS == (pkt_type & NES_PKT_TYPE_TCPV4_MASK)) || - (NES_PKT_TYPE_UDPV4_BITS == (pkt_type & NES_PKT_TYPE_UDPV4_MASK))) { - if ((cqe_errv & - (NES_NIC_ERRV_BITS_IPV4_CSUM_ERR | NES_NIC_ERRV_BITS_TCPUDP_CSUM_ERR | - NES_NIC_ERRV_BITS_IPH_ERR | NES_NIC_ERRV_BITS_WQE_OVERRUN)) == 0) { - if (nesvnic->netdev->features & NETIF_F_RXCSUM) - rx_skb->ip_summed = CHECKSUM_UNNECESSARY; - } else - nes_debug(NES_DBG_CQ, "%s: unsuccessfully checksummed TCP or UDP packet." 
- " errv = 0x%X, pkt_type = 0x%X.\n", - nesvnic->netdev->name, cqe_errv, pkt_type); - - } else if ((pkt_type & NES_PKT_TYPE_IPV4_MASK) == NES_PKT_TYPE_IPV4_BITS) { - if ((cqe_errv & - (NES_NIC_ERRV_BITS_IPV4_CSUM_ERR | NES_NIC_ERRV_BITS_IPH_ERR | - NES_NIC_ERRV_BITS_WQE_OVERRUN)) == 0) { - if (nesvnic->netdev->features & NETIF_F_RXCSUM) { - rx_skb->ip_summed = CHECKSUM_UNNECESSARY; - /* nes_debug(NES_DBG_CQ, "%s: Reporting successfully checksummed IPv4 packet.\n", - nesvnic->netdev->name); */ - } - } else - nes_debug(NES_DBG_CQ, "%s: unsuccessfully checksummed TCP or UDP packet." - " errv = 0x%X, pkt_type = 0x%X.\n", - nesvnic->netdev->name, cqe_errv, pkt_type); - } - /* nes_debug(NES_DBG_CQ, "pkt_type=%x, APBVT_MASK=%x\n", - pkt_type, (pkt_type & NES_PKT_TYPE_APBVT_MASK)); */ - - if ((pkt_type & NES_PKT_TYPE_APBVT_MASK) == NES_PKT_TYPE_APBVT_BITS) { - if (nes_cm_recv(rx_skb, nesvnic->netdev)) - rx_skb = NULL; - } - if (rx_skb == NULL) - goto skip_rx_indicate0; - - - if (cqe_misc & NES_NIC_CQE_TAG_VALID) { - vlan_tag = (u16)(le32_to_cpu( - cq->cq_vbase[head].cqe_words[NES_NIC_CQE_TAG_PKT_TYPE_IDX]) - >> 16); - nes_debug(NES_DBG_CQ, "%s: Reporting stripped VLAN packet. Tag = 0x%04X\n", - nesvnic->netdev->name, vlan_tag); - - __vlan_hwaccel_put_tag(rx_skb, htons(ETH_P_8021Q), vlan_tag); - } - napi_gro_receive(&nesvnic->napi, rx_skb); - -skip_rx_indicate0: - ; - /* nesvnic->netstats.rx_packets++; */ - /* nesvnic->netstats.rx_bytes += rx_pkt_size; */ - } - - cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX] = 0; - /* Accounting... */ - cqe_count++; - if (++head >= cq_size) - head = 0; - if (cqe_count == 255) { - /* Replenish Nic CQ */ - nes_write32(nesdev->regs+NES_CQE_ALLOC, - cq->cq_number | (cqe_count << 16)); - /* nesdev->nesadapter->tune_timer.cq_count += cqe_count; */ - nesdev->currcq_count += cqe_count; - cqe_count = 0; - } - - if (cq->rx_cqes_completed >= nesvnic->budget) - break; - } else { - cq->cqes_pending = 0; - break; - } - - } while (1); - - if (sq_cqes) { - barrier(); - /* restart the queue if it had been stopped */ - if (netif_queue_stopped(nesvnic->netdev)) - netif_wake_queue(nesvnic->netdev); - } - cq->cq_head = head; - /* nes_debug(NES_DBG_CQ, "CQ%u Processed = %u cqes, new head = %u.\n", - cq->cq_number, cqe_count, cq->cq_head); */ - cq->cqe_allocs_pending = cqe_count; - if (unlikely(nesadapter->et_use_adaptive_rx_coalesce)) - { - /* nesdev->nesadapter->tune_timer.cq_count += cqe_count; */ - nesdev->currcq_count += cqe_count; - nes_nic_tune_timer(nesdev); - } - if (atomic_read(&nesvnic->rx_skbs_needed)) - nes_replenish_nic_rq(nesvnic); -} - - - -/** - * nes_cqp_ce_handler - */ -static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq) -{ - u64 u64temp; - unsigned long flags; - struct nes_hw_cqp *cqp = NULL; - struct nes_cqp_request *cqp_request; - struct nes_hw_cqp_wqe *cqp_wqe; - u32 head; - u32 cq_size; - u32 cqe_count=0; - u32 error_code; - u32 opcode; - u32 ctx_index; - /* u32 counter; */ - - head = cq->cq_head; - cq_size = cq->cq_size; - - do { - /* process the CQE */ - /* nes_debug(NES_DBG_CQP, "head=%u cqe_words=%08X\n", head, - le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX])); */ - - opcode = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]); - if (opcode & NES_CQE_VALID) { - cqp = &nesdev->cqp; - - error_code = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_ERROR_CODE_IDX]); - if (error_code) { - nes_debug(NES_DBG_CQP, "Bad Completion code for opcode 0x%02X from CQP," - " Major/Minor codes = 0x%04X:%04X.\n", - 
le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX])&0x3f, - (u16)(error_code >> 16), - (u16)error_code); - } - - u64temp = (((u64)(le32_to_cpu(cq->cq_vbase[head]. - cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX]))) << 32) | - ((u64)(le32_to_cpu(cq->cq_vbase[head]. - cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]))); - - cqp_request = (struct nes_cqp_request *)(unsigned long)u64temp; - if (cqp_request) { - if (cqp_request->waiting) { - /* nes_debug(NES_DBG_CQP, "%s: Waking up requestor\n"); */ - cqp_request->major_code = (u16)(error_code >> 16); - cqp_request->minor_code = (u16)error_code; - barrier(); - cqp_request->request_done = 1; - wake_up(&cqp_request->waitq); - nes_put_cqp_request(nesdev, cqp_request); - } else { - if (cqp_request->callback) - cqp_request->cqp_callback(nesdev, cqp_request); - nes_free_cqp_request(nesdev, cqp_request); - } - } else { - wake_up(&nesdev->cqp.waitq); - } - - cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0; - nes_write32(nesdev->regs + NES_CQE_ALLOC, cq->cq_number | (1 << 16)); - if (++cqp->sq_tail >= cqp->sq_size) - cqp->sq_tail = 0; - - /* Accounting... */ - cqe_count++; - if (++head >= cq_size) - head = 0; - } else { - break; - } - } while (1); - cq->cq_head = head; - - spin_lock_irqsave(&nesdev->cqp.lock, flags); - while ((!list_empty(&nesdev->cqp_pending_reqs)) && - ((((nesdev->cqp.sq_tail+nesdev->cqp.sq_size)-nesdev->cqp.sq_head) & - (nesdev->cqp.sq_size - 1)) != 1)) { - cqp_request = list_entry(nesdev->cqp_pending_reqs.next, - struct nes_cqp_request, list); - list_del_init(&cqp_request->list); - head = nesdev->cqp.sq_head++; - nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; - cqp_wqe = &nesdev->cqp.sq_vbase[head]; - memcpy(cqp_wqe, &cqp_request->cqp_wqe, sizeof(*cqp_wqe)); - barrier(); - - opcode = le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX]); - if ((opcode & NES_CQP_OPCODE_MASK) == NES_CQP_DOWNLOAD_SEGMENT) - ctx_index = NES_CQP_WQE_DL_COMP_CTX_LOW_IDX; - else - ctx_index = NES_CQP_WQE_COMP_CTX_LOW_IDX; - cqp_wqe->wqe_words[ctx_index] = - cpu_to_le32((u32)((unsigned long)cqp_request)); - cqp_wqe->wqe_words[ctx_index + 1] = - cpu_to_le32((u32)(upper_32_bits((unsigned long)cqp_request))); - nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) put on CQPs SQ wqe%u.\n", - cqp_request, le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f, head); - /* Ring doorbell (1 WQEs) */ - barrier(); - nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id); - } - spin_unlock_irqrestore(&nesdev->cqp.lock, flags); - - /* Arm the CCQ */ - nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | - cq->cq_number); - nes_read32(nesdev->regs+NES_CQE_ALLOC); -} - -static u8 *locate_mpa(u8 *pkt, u32 aeq_info) -{ - if (aeq_info & NES_AEQE_Q2_DATA_ETHERNET) { - /* skip over ethernet header */ - pkt += ETH_HLEN; - - /* Skip over IP and TCP headers */ - pkt += 4 * (pkt[0] & 0x0f); - pkt += 4 * ((pkt[12] >> 4) & 0x0f); - } - return pkt; -} - -/* Determine if incoming error pkt is rdma layer */ -static u32 iwarp_opcode(struct nes_qp *nesqp, u32 aeq_info) -{ - u8 *pkt; - u16 *mpa; - u32 opcode = 0xffffffff; - - if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) { - pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET; - mpa = (u16 *)locate_mpa(pkt, aeq_info); - opcode = be16_to_cpu(mpa[1]) & 0xf; - } - - return opcode; -} - -/* Build iWARP terminate header */ -static int nes_bld_terminate_hdr(struct nes_qp *nesqp, u16 async_event_id, u32 aeq_info) -{ - u8 *pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET; - u16 ddp_seg_len; - int copy_len = 0; - u8 
is_tagged = 0; - u8 flush_code = 0; - struct nes_terminate_hdr *termhdr; - - termhdr = (struct nes_terminate_hdr *)nesqp->hwqp.q2_vbase; - memset(termhdr, 0, 64); - - if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) { - - /* Use data from offending packet to fill in ddp & rdma hdrs */ - pkt = locate_mpa(pkt, aeq_info); - ddp_seg_len = be16_to_cpu(*(u16 *)pkt); - if (ddp_seg_len) { - copy_len = 2; - termhdr->hdrct = DDP_LEN_FLAG; - if (pkt[2] & 0x80) { - is_tagged = 1; - if (ddp_seg_len >= TERM_DDP_LEN_TAGGED) { - copy_len += TERM_DDP_LEN_TAGGED; - termhdr->hdrct |= DDP_HDR_FLAG; - } - } else { - if (ddp_seg_len >= TERM_DDP_LEN_UNTAGGED) { - copy_len += TERM_DDP_LEN_UNTAGGED; - termhdr->hdrct |= DDP_HDR_FLAG; - } - - if (ddp_seg_len >= (TERM_DDP_LEN_UNTAGGED + TERM_RDMA_LEN)) { - if ((pkt[3] & RDMA_OPCODE_MASK) == RDMA_READ_REQ_OPCODE) { - copy_len += TERM_RDMA_LEN; - termhdr->hdrct |= RDMA_HDR_FLAG; - } - } - } - } - } - - switch (async_event_id) { - case NES_AEQE_AEID_AMP_UNALLOCATED_STAG: - switch (iwarp_opcode(nesqp, aeq_info)) { - case IWARP_OPCODE_WRITE: - flush_code = IB_WC_LOC_PROT_ERR; - termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; - termhdr->error_code = DDP_TAGGED_INV_STAG; - break; - default: - flush_code = IB_WC_REM_ACCESS_ERR; - termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; - termhdr->error_code = RDMAP_INV_STAG; - } - break; - case NES_AEQE_AEID_AMP_INVALID_STAG: - flush_code = IB_WC_REM_ACCESS_ERR; - termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; - termhdr->error_code = RDMAP_INV_STAG; - break; - case NES_AEQE_AEID_AMP_BAD_QP: - flush_code = IB_WC_LOC_QP_OP_ERR; - termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; - termhdr->error_code = DDP_UNTAGGED_INV_QN; - break; - case NES_AEQE_AEID_AMP_BAD_STAG_KEY: - case NES_AEQE_AEID_AMP_BAD_STAG_INDEX: - switch (iwarp_opcode(nesqp, aeq_info)) { - case IWARP_OPCODE_SEND_INV: - case IWARP_OPCODE_SEND_SE_INV: - flush_code = IB_WC_REM_OP_ERR; - termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; - termhdr->error_code = RDMAP_CANT_INV_STAG; - break; - default: - flush_code = IB_WC_REM_ACCESS_ERR; - termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; - termhdr->error_code = RDMAP_INV_STAG; - } - break; - case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION: - if (aeq_info & (NES_AEQE_Q2_DATA_ETHERNET | NES_AEQE_Q2_DATA_MPA)) { - flush_code = IB_WC_LOC_PROT_ERR; - termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; - termhdr->error_code = DDP_TAGGED_BOUNDS; - } else { - flush_code = IB_WC_REM_ACCESS_ERR; - termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; - termhdr->error_code = RDMAP_INV_BOUNDS; - } - break; - case NES_AEQE_AEID_AMP_RIGHTS_VIOLATION: - case NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS: - case NES_AEQE_AEID_PRIV_OPERATION_DENIED: - flush_code = IB_WC_REM_ACCESS_ERR; - termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; - termhdr->error_code = RDMAP_ACCESS; - break; - case NES_AEQE_AEID_AMP_TO_WRAP: - flush_code = IB_WC_REM_ACCESS_ERR; - termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; - termhdr->error_code = RDMAP_TO_WRAP; - break; - case NES_AEQE_AEID_AMP_BAD_PD: - switch (iwarp_opcode(nesqp, aeq_info)) { - case IWARP_OPCODE_WRITE: - flush_code = IB_WC_LOC_PROT_ERR; - termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; - termhdr->error_code = DDP_TAGGED_UNASSOC_STAG; - break; - case IWARP_OPCODE_SEND_INV: - case IWARP_OPCODE_SEND_SE_INV: - flush_code = IB_WC_REM_ACCESS_ERR; - termhdr->layer_etype = 
(LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; - termhdr->error_code = RDMAP_CANT_INV_STAG; - break; - default: - flush_code = IB_WC_REM_ACCESS_ERR; - termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; - termhdr->error_code = RDMAP_UNASSOC_STAG; - } - break; - case NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH: - flush_code = IB_WC_LOC_LEN_ERR; - termhdr->layer_etype = (LAYER_MPA << 4) | DDP_LLP; - termhdr->error_code = MPA_MARKER; - break; - case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR: - flush_code = IB_WC_GENERAL_ERR; - termhdr->layer_etype = (LAYER_MPA << 4) | DDP_LLP; - termhdr->error_code = MPA_CRC; - break; - case NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE: - case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL: - flush_code = IB_WC_LOC_LEN_ERR; - termhdr->layer_etype = (LAYER_DDP << 4) | DDP_CATASTROPHIC; - termhdr->error_code = DDP_CATASTROPHIC_LOCAL; - break; - case NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC: - case NES_AEQE_AEID_DDP_NO_L_BIT: - flush_code = IB_WC_FATAL_ERR; - termhdr->layer_etype = (LAYER_DDP << 4) | DDP_CATASTROPHIC; - termhdr->error_code = DDP_CATASTROPHIC_LOCAL; - break; - case NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN: - case NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID: - flush_code = IB_WC_GENERAL_ERR; - termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; - termhdr->error_code = DDP_UNTAGGED_INV_MSN_RANGE; - break; - case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER: - flush_code = IB_WC_LOC_LEN_ERR; - termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; - termhdr->error_code = DDP_UNTAGGED_INV_TOO_LONG; - break; - case NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION: - flush_code = IB_WC_GENERAL_ERR; - if (is_tagged) { - termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; - termhdr->error_code = DDP_TAGGED_INV_DDP_VER; - } else { - termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; - termhdr->error_code = DDP_UNTAGGED_INV_DDP_VER; - } - break; - case NES_AEQE_AEID_DDP_UBE_INVALID_MO: - flush_code = IB_WC_GENERAL_ERR; - termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; - termhdr->error_code = DDP_UNTAGGED_INV_MO; - break; - case NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE: - flush_code = IB_WC_REM_OP_ERR; - termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; - termhdr->error_code = DDP_UNTAGGED_INV_MSN_NO_BUF; - break; - case NES_AEQE_AEID_DDP_UBE_INVALID_QN: - flush_code = IB_WC_GENERAL_ERR; - termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; - termhdr->error_code = DDP_UNTAGGED_INV_QN; - break; - case NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION: - flush_code = IB_WC_GENERAL_ERR; - termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; - termhdr->error_code = RDMAP_INV_RDMAP_VER; - break; - case NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE: - flush_code = IB_WC_LOC_QP_OP_ERR; - termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; - termhdr->error_code = RDMAP_UNEXPECTED_OP; - break; - default: - flush_code = IB_WC_FATAL_ERR; - termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; - termhdr->error_code = RDMAP_UNSPECIFIED; - break; - } - - if (copy_len) - memcpy(termhdr + 1, pkt, copy_len); - - if ((flush_code) && ((NES_AEQE_INBOUND_RDMA & aeq_info) == 0)) { - if (aeq_info & NES_AEQE_SQ) - nesqp->term_sq_flush_code = flush_code; - else - nesqp->term_rq_flush_code = flush_code; - } - - return sizeof(struct nes_terminate_hdr) + copy_len; -} - -static void nes_terminate_connection(struct nes_device *nesdev, struct nes_qp *nesqp, - struct 
nes_hw_aeqe *aeqe, enum ib_event_type eventtype) -{ - u64 context; - unsigned long flags; - u32 aeq_info; - u16 async_event_id; - u8 tcp_state; - u8 iwarp_state; - u32 termlen = 0; - u32 mod_qp_flags = NES_CQP_QP_IWARP_STATE_TERMINATE | - NES_CQP_QP_TERM_DONT_SEND_FIN; - struct nes_adapter *nesadapter = nesdev->nesadapter; - - if (nesqp->term_flags & NES_TERM_SENT) - return; /* Sanity check */ - - aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); - tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT; - iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT; - async_event_id = (u16)aeq_info; - - context = (unsigned long)nesadapter->qp_table[le32_to_cpu( - aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]) - NES_FIRST_QPN]; - if (!context) { - WARN_ON(!context); - return; - } - - nesqp = (struct nes_qp *)(unsigned long)context; - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = iwarp_state; - nesqp->hw_tcp_state = tcp_state; - nesqp->last_aeq = async_event_id; - nesqp->terminate_eventtype = eventtype; - spin_unlock_irqrestore(&nesqp->lock, flags); - - if (nesadapter->send_term_ok) - termlen = nes_bld_terminate_hdr(nesqp, async_event_id, aeq_info); - else - mod_qp_flags |= NES_CQP_QP_TERM_DONT_SEND_TERM_MSG; - - if (!nesdev->iw_status) { - nesqp->term_flags = NES_TERM_DONE; - nes_hw_modify_qp(nesdev, nesqp, NES_CQP_QP_IWARP_STATE_ERROR, 0, 0); - nes_cm_disconn(nesqp); - } else { - nes_terminate_start_timer(nesqp); - nesqp->term_flags |= NES_TERM_SENT; - nes_hw_modify_qp(nesdev, nesqp, mod_qp_flags, termlen, 0); - } -} - -static void nes_terminate_send_fin(struct nes_device *nesdev, - struct nes_qp *nesqp, struct nes_hw_aeqe *aeqe) -{ - u32 aeq_info; - u16 async_event_id; - u8 tcp_state; - u8 iwarp_state; - unsigned long flags; - - aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); - tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT; - iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT; - async_event_id = (u16)aeq_info; - - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = iwarp_state; - nesqp->hw_tcp_state = tcp_state; - nesqp->last_aeq = async_event_id; - spin_unlock_irqrestore(&nesqp->lock, flags); - - /* Send the fin only */ - nes_hw_modify_qp(nesdev, nesqp, NES_CQP_QP_IWARP_STATE_TERMINATE | - NES_CQP_QP_TERM_DONT_SEND_TERM_MSG, 0, 0); -} - -/* Cleanup after a terminate sent or received */ -static void nes_terminate_done(struct nes_qp *nesqp, int timeout_occurred) -{ - u32 next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR; - unsigned long flags; - struct nes_vnic *nesvnic = to_nesvnic(nesqp->ibqp.device); - struct nes_device *nesdev = nesvnic->nesdev; - u8 first_time = 0; - - spin_lock_irqsave(&nesqp->lock, flags); - if (nesqp->hte_added) { - nesqp->hte_added = 0; - next_iwarp_state |= NES_CQP_QP_DEL_HTE; - } - - first_time = (nesqp->term_flags & NES_TERM_DONE) == 0; - nesqp->term_flags |= NES_TERM_DONE; - spin_unlock_irqrestore(&nesqp->lock, flags); - - /* Make sure we go through this only once */ - if (first_time) { - if (timeout_occurred == 0) - del_timer(&nesqp->terminate_timer); - else - next_iwarp_state |= NES_CQP_QP_RESET; - - nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0); - nes_cm_disconn(nesqp); - } -} - -static void nes_terminate_received(struct nes_device *nesdev, - struct nes_qp *nesqp, struct nes_hw_aeqe *aeqe) -{ - u32 aeq_info; - u8 *pkt; - u32 *mpa; - u8 ddp_ctl; - u8 rdma_ctl; - u16 aeq_id = 0; - - 
aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); - if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) { - /* Terminate is not a performance path so the silicon */ - /* did not validate the frame - do it now */ - pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET; - mpa = (u32 *)locate_mpa(pkt, aeq_info); - ddp_ctl = (be32_to_cpu(mpa[0]) >> 8) & 0xff; - rdma_ctl = be32_to_cpu(mpa[0]) & 0xff; - if ((ddp_ctl & 0xc0) != 0x40) - aeq_id = NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC; - else if ((ddp_ctl & 0x03) != 1) - aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION; - else if (be32_to_cpu(mpa[2]) != 2) - aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_QN; - else if (be32_to_cpu(mpa[3]) != 1) - aeq_id = NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN; - else if (be32_to_cpu(mpa[4]) != 0) - aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_MO; - else if ((rdma_ctl & 0xc0) != 0x40) - aeq_id = NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION; - - if (aeq_id) { - /* Bad terminate recvd - send back a terminate */ - aeq_info = (aeq_info & 0xffff0000) | aeq_id; - aeqe->aeqe_words[NES_AEQE_MISC_IDX] = cpu_to_le32(aeq_info); - nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL); - return; - } - } - - nesqp->term_flags |= NES_TERM_RCVD; - nesqp->terminate_eventtype = IB_EVENT_QP_FATAL; - nes_terminate_start_timer(nesqp); - nes_terminate_send_fin(nesdev, nesqp, aeqe); -} - -/* Timeout routine in case terminate fails to complete */ -void nes_terminate_timeout(struct timer_list *t) -{ - struct nes_qp *nesqp = from_timer(nesqp, t, terminate_timer); - - nes_terminate_done(nesqp, 1); -} - -/* Set a timer in case hw cannot complete the terminate sequence */ -static void nes_terminate_start_timer(struct nes_qp *nesqp) -{ - mod_timer(&nesqp->terminate_timer, (jiffies + HZ)); -} - -/** - * nes_process_iwarp_aeqe - */ -static void nes_process_iwarp_aeqe(struct nes_device *nesdev, - struct nes_hw_aeqe *aeqe) -{ - u64 context; - unsigned long flags; - struct nes_qp *nesqp; - struct nes_hw_cq *hw_cq; - struct nes_cq *nescq; - int resource_allocated; - struct nes_adapter *nesadapter = nesdev->nesadapter; - u32 aeq_info; - u32 next_iwarp_state = 0; - u32 aeqe_cq_id; - u16 async_event_id; - u8 tcp_state; - u8 iwarp_state; - struct ib_event ibevent; - - nes_debug(NES_DBG_AEQ, "\n"); - aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); - if ((NES_AEQE_INBOUND_RDMA & aeq_info) || (!(NES_AEQE_QP & aeq_info))) { - context = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]); - context += ((u64)le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX])) << 32; - } else { - context = (unsigned long)nesadapter->qp_table[le32_to_cpu( - aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]) - NES_FIRST_QPN]; - BUG_ON(!context); - } - - /* context is nesqp unless async_event_id == CQ ERROR */ - nesqp = (struct nes_qp *)(unsigned long)context; - async_event_id = (u16)aeq_info; - tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT; - iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT; - nes_debug(NES_DBG_AEQ, "aeid = 0x%04X, qp-cq id = %d, aeqe = %p," - " Tcp state = %s, iWARP state = %s\n", - async_event_id, - le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]), aeqe, - nes_tcp_state_str[tcp_state], nes_iwarp_state_str[iwarp_state]); - - aeqe_cq_id = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]); - if (aeq_info & NES_AEQE_QP) { - if (!nes_is_resource_allocated(nesadapter, - nesadapter->allocated_qps, - aeqe_cq_id)) - return; - } - - switch (async_event_id) { - case 
NES_AEQE_AEID_LLP_FIN_RECEIVED: - if (nesqp->term_flags) - return; /* Ignore it, wait for close complete */ - - if (atomic_inc_return(&nesqp->close_timer_started) == 1) { - if ((tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) && - (nesqp->ibqp_state == IB_QPS_RTS)) { - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = iwarp_state; - nesqp->hw_tcp_state = tcp_state; - nesqp->last_aeq = async_event_id; - next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING; - nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING; - spin_unlock_irqrestore(&nesqp->lock, flags); - nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0); - nes_cm_disconn(nesqp); - } - nesqp->cm_id->add_ref(nesqp->cm_id); - schedule_nes_timer(nesqp->cm_node, (struct sk_buff *)nesqp, - NES_TIMER_TYPE_CLOSE, 1, 0); - nes_debug(NES_DBG_AEQ, "QP%u Not decrementing QP refcount (%d)," - " need ae to finish up, original_last_aeq = 0x%04X." - " last_aeq = 0x%04X, scheduling timer. TCP state = %d\n", - nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), - async_event_id, nesqp->last_aeq, tcp_state); - } - break; - case NES_AEQE_AEID_LLP_CLOSE_COMPLETE: - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = iwarp_state; - nesqp->hw_tcp_state = tcp_state; - nesqp->last_aeq = async_event_id; - spin_unlock_irqrestore(&nesqp->lock, flags); - nes_cm_disconn(nesqp); - break; - - case NES_AEQE_AEID_RESET_SENT: - tcp_state = NES_AEQE_TCP_STATE_CLOSED; - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = iwarp_state; - nesqp->hw_tcp_state = tcp_state; - nesqp->last_aeq = async_event_id; - nesqp->hte_added = 0; - spin_unlock_irqrestore(&nesqp->lock, flags); - next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_DEL_HTE; - nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0); - nes_cm_disconn(nesqp); - break; - - case NES_AEQE_AEID_LLP_CONNECTION_RESET: - if (atomic_read(&nesqp->close_timer_started)) - return; - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = iwarp_state; - nesqp->hw_tcp_state = tcp_state; - nesqp->last_aeq = async_event_id; - spin_unlock_irqrestore(&nesqp->lock, flags); - nes_cm_disconn(nesqp); - break; - - case NES_AEQE_AEID_TERMINATE_SENT: - nes_terminate_send_fin(nesdev, nesqp, aeqe); - break; - - case NES_AEQE_AEID_LLP_TERMINATE_RECEIVED: - nes_terminate_received(nesdev, nesqp, aeqe); - break; - - case NES_AEQE_AEID_AMP_BAD_STAG_KEY: - case NES_AEQE_AEID_AMP_BAD_STAG_INDEX: - case NES_AEQE_AEID_AMP_UNALLOCATED_STAG: - case NES_AEQE_AEID_AMP_INVALID_STAG: - case NES_AEQE_AEID_AMP_RIGHTS_VIOLATION: - case NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS: - case NES_AEQE_AEID_PRIV_OPERATION_DENIED: - case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER: - case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION: - case NES_AEQE_AEID_AMP_TO_WRAP: - printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_ACCESS_ERR\n", - nesqp->hwqp.qp_id, async_event_id); - nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_ACCESS_ERR); - break; - - case NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE: - case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL: - case NES_AEQE_AEID_DDP_UBE_INVALID_MO: - case NES_AEQE_AEID_DDP_UBE_INVALID_QN: - if (iwarp_opcode(nesqp, aeq_info) > IWARP_OPCODE_TERM) { - aeq_info &= 0xffff0000; - aeq_info |= NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE; - aeqe->aeqe_words[NES_AEQE_MISC_IDX] = cpu_to_le32(aeq_info); - } - /* fall through */ - case NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE: - case NES_AEQE_AEID_LLP_TOO_MANY_RETRIES: - case 
NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE: - case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR: - case NES_AEQE_AEID_AMP_BAD_QP: - case NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH: - case NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC: - case NES_AEQE_AEID_DDP_NO_L_BIT: - case NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN: - case NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID: - case NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION: - case NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION: - case NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE: - case NES_AEQE_AEID_AMP_BAD_PD: - case NES_AEQE_AEID_AMP_FASTREG_SHARED: - case NES_AEQE_AEID_AMP_FASTREG_VALID_STAG: - case NES_AEQE_AEID_AMP_FASTREG_MW_STAG: - case NES_AEQE_AEID_AMP_FASTREG_INVALID_RIGHTS: - case NES_AEQE_AEID_AMP_FASTREG_PBL_TABLE_OVERFLOW: - case NES_AEQE_AEID_AMP_FASTREG_INVALID_LENGTH: - case NES_AEQE_AEID_AMP_INVALIDATE_SHARED: - case NES_AEQE_AEID_AMP_INVALIDATE_MR_WITH_BOUND_WINDOWS: - case NES_AEQE_AEID_AMP_MWBIND_VALID_STAG: - case NES_AEQE_AEID_AMP_MWBIND_OF_MR_STAG: - case NES_AEQE_AEID_AMP_MWBIND_TO_ZERO_BASED_STAG: - case NES_AEQE_AEID_AMP_MWBIND_TO_MW_STAG: - case NES_AEQE_AEID_AMP_MWBIND_INVALID_RIGHTS: - case NES_AEQE_AEID_AMP_MWBIND_INVALID_BOUNDS: - case NES_AEQE_AEID_AMP_MWBIND_TO_INVALID_PARENT: - case NES_AEQE_AEID_AMP_MWBIND_BIND_DISABLED: - case NES_AEQE_AEID_BAD_CLOSE: - case NES_AEQE_AEID_RDMA_READ_WHILE_ORD_ZERO: - case NES_AEQE_AEID_STAG_ZERO_INVALID: - case NES_AEQE_AEID_ROE_INVALID_RDMA_READ_REQUEST: - case NES_AEQE_AEID_ROE_INVALID_RDMA_WRITE_OR_READ_RESP: - printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_FATAL\n", - nesqp->hwqp.qp_id, async_event_id); - print_ip(nesqp->cm_node); - if (!atomic_read(&nesqp->close_timer_started)) - nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL); - break; - - case NES_AEQE_AEID_CQ_OPERATION_ERROR: - context <<= 1; - nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u, %p\n", - le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]), (void *)(unsigned long)context); - resource_allocated = nes_is_resource_allocated(nesadapter, nesadapter->allocated_cqs, - le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX])); - if (resource_allocated) { - printk(KERN_ERR PFX "%s: Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u\n", - __func__, le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX])); - hw_cq = (struct nes_hw_cq *)(unsigned long)context; - if (hw_cq) { - nescq = container_of(hw_cq, struct nes_cq, hw_cq); - if (nescq->ibcq.event_handler) { - ibevent.device = nescq->ibcq.device; - ibevent.event = IB_EVENT_CQ_ERR; - ibevent.element.cq = &nescq->ibcq; - nescq->ibcq.event_handler(&ibevent, nescq->ibcq.cq_context); - } - } - } - break; - - default: - nes_debug(NES_DBG_AEQ, "Processing an iWARP related AE for QP, misc = 0x%04X\n", - async_event_id); - break; - } - -} - -/** - * nes_iwarp_ce_handler - */ -void nes_iwarp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *hw_cq) -{ - struct nes_cq *nescq = container_of(hw_cq, struct nes_cq, hw_cq); - - /* nes_debug(NES_DBG_CQ, "Processing completion event for iWARP CQ%u.\n", - nescq->hw_cq.cq_number); */ - nes_write32(nesdev->regs+NES_CQ_ACK, nescq->hw_cq.cq_number); - - if (nescq->ibcq.comp_handler) - nescq->ibcq.comp_handler(&nescq->ibcq, nescq->ibcq.cq_context); - - return; -} - - -/** - * nes_manage_apbvt() - */ -int nes_manage_apbvt(struct nes_vnic *nesvnic, u32 accel_local_port, - u32 nic_index, u32 add_port) -{ - struct nes_device 
*nesdev = nesvnic->nesdev; - struct nes_hw_cqp_wqe *cqp_wqe; - struct nes_cqp_request *cqp_request; - int ret = 0; - u16 major_code; - - /* Send manage APBVT request to CQP */ - cqp_request = nes_get_cqp_request(nesdev); - if (cqp_request == NULL) { - nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n"); - return -ENOMEM; - } - cqp_request->waiting = 1; - cqp_wqe = &cqp_request->cqp_wqe; - - nes_debug(NES_DBG_QP, "%s APBV for local port=%u(0x%04x), nic_index=%u\n", - (add_port == NES_MANAGE_APBVT_ADD) ? "ADD" : "DEL", - accel_local_port, accel_local_port, nic_index); - - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, (NES_CQP_MANAGE_APBVT | - ((add_port == NES_MANAGE_APBVT_ADD) ? NES_CQP_APBVT_ADD : 0))); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, - ((nic_index << NES_CQP_APBVT_NIC_SHIFT) | accel_local_port)); - - nes_debug(NES_DBG_QP, "Waiting for CQP completion for APBVT.\n"); - - atomic_set(&cqp_request->refcount, 2); - nes_post_cqp_request(nesdev, cqp_request); - - if (add_port == NES_MANAGE_APBVT_ADD) - ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), - NES_EVENT_TIMEOUT); - nes_debug(NES_DBG_QP, "Completed, ret=%u, CQP Major:Minor codes = 0x%04X:0x%04X\n", - ret, cqp_request->major_code, cqp_request->minor_code); - major_code = cqp_request->major_code; - - nes_put_cqp_request(nesdev, cqp_request); - - if (!ret) - return -ETIME; - else if (major_code) - return -EIO; - else - return 0; -} - - -/** - * nes_manage_arp_cache - */ -void nes_manage_arp_cache(struct net_device *netdev, unsigned char *mac_addr, - u32 ip_addr, u32 action) -{ - struct nes_hw_cqp_wqe *cqp_wqe; - struct nes_vnic *nesvnic = netdev_priv(netdev); - struct nes_device *nesdev; - struct nes_cqp_request *cqp_request; - int arp_index; - - nesdev = nesvnic->nesdev; - arp_index = nes_arp_table(nesdev, ip_addr, mac_addr, action); - if (arp_index == -1) { - return; - } - - /* update the ARP entry */ - cqp_request = nes_get_cqp_request(nesdev); - if (cqp_request == NULL) { - nes_debug(NES_DBG_NETDEV, "Failed to get a cqp_request.\n"); - return; - } - cqp_request->waiting = 0; - cqp_wqe = &cqp_request->cqp_wqe; - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - - cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32( - NES_CQP_MANAGE_ARP_CACHE | NES_CQP_ARP_PERM); - cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32( - (u32)PCI_FUNC(nesdev->pcidev->devfn) << NES_CQP_ARP_AEQ_INDEX_SHIFT); - cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(arp_index); - - if (action == NES_ARP_ADD) { - cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_ARP_VALID); - cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX] = cpu_to_le32( - (((u32)mac_addr[2]) << 24) | (((u32)mac_addr[3]) << 16) | - (((u32)mac_addr[4]) << 8) | (u32)mac_addr[5]); - cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_HIGH_IDX] = cpu_to_le32( - (((u32)mac_addr[0]) << 8) | (u32)mac_addr[1]); - } else { - cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX] = 0; - cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_HIGH_IDX] = 0; - } - - nes_debug(NES_DBG_NETDEV, "Not waiting for CQP, cqp.sq_head=%u, cqp.sq_tail=%u\n", - nesdev->cqp.sq_head, nesdev->cqp.sq_tail); - - atomic_set(&cqp_request->refcount, 1); - nes_post_cqp_request(nesdev, cqp_request); -} - - -/** - * flush_wqes - */ -void flush_wqes(struct nes_device *nesdev, struct nes_qp *nesqp, - u32 which_wq, u32 wait_completion) -{ - struct nes_cqp_request *cqp_request; - struct nes_hw_cqp_wqe *cqp_wqe; - u32 
sq_code = (NES_IWARP_CQE_MAJOR_FLUSH << 16) | NES_IWARP_CQE_MINOR_FLUSH; - u32 rq_code = (NES_IWARP_CQE_MAJOR_FLUSH << 16) | NES_IWARP_CQE_MINOR_FLUSH; - int ret; - - cqp_request = nes_get_cqp_request(nesdev); - if (cqp_request == NULL) { - nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n"); - return; - } - if (wait_completion) { - cqp_request->waiting = 1; - atomic_set(&cqp_request->refcount, 2); - } else { - cqp_request->waiting = 0; - } - cqp_wqe = &cqp_request->cqp_wqe; - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - - /* If wqe in error was identified, set code to be put into cqe */ - if ((nesqp->term_sq_flush_code) && (which_wq & NES_CQP_FLUSH_SQ)) { - which_wq |= NES_CQP_FLUSH_MAJ_MIN; - sq_code = (CQE_MAJOR_DRV << 16) | nesqp->term_sq_flush_code; - nesqp->term_sq_flush_code = 0; - } - - if ((nesqp->term_rq_flush_code) && (which_wq & NES_CQP_FLUSH_RQ)) { - which_wq |= NES_CQP_FLUSH_MAJ_MIN; - rq_code = (CQE_MAJOR_DRV << 16) | nesqp->term_rq_flush_code; - nesqp->term_rq_flush_code = 0; - } - - if (which_wq & NES_CQP_FLUSH_MAJ_MIN) { - cqp_wqe->wqe_words[NES_CQP_QP_WQE_FLUSH_SQ_CODE] = cpu_to_le32(sq_code); - cqp_wqe->wqe_words[NES_CQP_QP_WQE_FLUSH_RQ_CODE] = cpu_to_le32(rq_code); - } - - cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = - cpu_to_le32(NES_CQP_FLUSH_WQES | which_wq); - cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesqp->hwqp.qp_id); - - nes_post_cqp_request(nesdev, cqp_request); - - if (wait_completion) { - /* Wait for CQP */ - ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), - NES_EVENT_TIMEOUT); - nes_debug(NES_DBG_QP, "Flush SQ QP WQEs completed, ret=%u," - " CQP Major:Minor codes = 0x%04X:0x%04X\n", - ret, cqp_request->major_code, cqp_request->minor_code); - nes_put_cqp_request(nesdev, cqp_request); - } -} diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h deleted file mode 100644 index 3c56470816a8..000000000000 --- a/drivers/infiniband/hw/nes/nes_hw.h +++ /dev/null @@ -1,1380 +0,0 @@ -/* -* Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. -* -* This software is available to you under a choice of one of two -* licenses. You may choose to be licensed under the terms of the GNU -* General Public License (GPL) Version 2, available from the file -* COPYING in the main directory of this source tree, or the -* OpenIB.org BSD license below: -* -* Redistribution and use in source and binary forms, with or -* without modification, are permitted provided that the following -* conditions are met: -* -* - Redistributions of source code must retain the above -* copyright notice, this list of conditions and the following -* disclaimer. -* -* - Redistributions in binary form must reproduce the above -* copyright notice, this list of conditions and the following -* disclaimer in the documentation and/or other materials -* provided with the distribution. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ - -#ifndef __NES_HW_H -#define __NES_HW_H - -#define NES_PHY_TYPE_CX4 1 -#define NES_PHY_TYPE_1G 2 -#define NES_PHY_TYPE_ARGUS 4 -#define NES_PHY_TYPE_PUMA_1G 5 -#define NES_PHY_TYPE_PUMA_10G 6 -#define NES_PHY_TYPE_GLADIUS 7 -#define NES_PHY_TYPE_SFP_D 8 -#define NES_PHY_TYPE_KR 9 - -#define NES_MULTICAST_PF_MAX 8 -#define NES_A0 3 - -#define NES_ENABLE_PAU 0x07000001 -#define NES_DISABLE_PAU 0x07000000 -#define NES_PAU_COUNTER 10 -#define NES_CQP_OPCODE_MASK 0x3f - -enum pci_regs { - NES_INT_STAT = 0x0000, - NES_INT_MASK = 0x0004, - NES_INT_PENDING = 0x0008, - NES_INTF_INT_STAT = 0x000C, - NES_INTF_INT_MASK = 0x0010, - NES_TIMER_STAT = 0x0014, - NES_PERIODIC_CONTROL = 0x0018, - NES_ONE_SHOT_CONTROL = 0x001C, - NES_EEPROM_COMMAND = 0x0020, - NES_EEPROM_DATA = 0x0024, - NES_FLASH_COMMAND = 0x0028, - NES_FLASH_DATA = 0x002C, - NES_SOFTWARE_RESET = 0x0030, - NES_CQ_ACK = 0x0034, - NES_WQE_ALLOC = 0x0040, - NES_CQE_ALLOC = 0x0044, - NES_AEQ_ALLOC = 0x0048 -}; - -enum indexed_regs { - NES_IDX_CREATE_CQP_LOW = 0x0000, - NES_IDX_CREATE_CQP_HIGH = 0x0004, - NES_IDX_QP_CONTROL = 0x0040, - NES_IDX_FLM_CONTROL = 0x0080, - NES_IDX_INT_CPU_STATUS = 0x00a0, - NES_IDX_GPR_TRIGGER = 0x00bc, - NES_IDX_GPIO_CONTROL = 0x00f0, - NES_IDX_GPIO_DATA = 0x00f4, - NES_IDX_GPR2 = 0x010c, - NES_IDX_TCP_CONFIG0 = 0x01e4, - NES_IDX_TCP_TIMER_CONFIG = 0x01ec, - NES_IDX_TCP_NOW = 0x01f0, - NES_IDX_QP_MAX_CFG_SIZES = 0x0200, - NES_IDX_QP_CTX_SIZE = 0x0218, - NES_IDX_TCP_TIMER_SIZE0 = 0x0238, - NES_IDX_TCP_TIMER_SIZE1 = 0x0240, - NES_IDX_ARP_CACHE_SIZE = 0x0258, - NES_IDX_CQ_CTX_SIZE = 0x0260, - NES_IDX_MRT_SIZE = 0x0278, - NES_IDX_PBL_REGION_SIZE = 0x0280, - NES_IDX_IRRQ_COUNT = 0x02b0, - NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x02f0, - NES_IDX_RX_WINDOW_BUFFER_SIZE = 0x0300, - NES_IDX_DST_IP_ADDR = 0x0400, - NES_IDX_PCIX_DIAG = 0x08e8, - NES_IDX_MPP_DEBUG = 0x0a00, - NES_IDX_PORT_RX_DISCARDS = 0x0a30, - NES_IDX_PORT_TX_DISCARDS = 0x0a34, - NES_IDX_MPP_LB_DEBUG = 0x0b00, - NES_IDX_DENALI_CTL_22 = 0x1058, - NES_IDX_MAC_TX_CONTROL = 0x2000, - NES_IDX_MAC_TX_CONFIG = 0x2004, - NES_IDX_MAC_TX_PAUSE_QUANTA = 0x2008, - NES_IDX_MAC_RX_CONTROL = 0x200c, - NES_IDX_MAC_RX_CONFIG = 0x2010, - NES_IDX_MAC_EXACT_MATCH_BOTTOM = 0x201c, - NES_IDX_MAC_MDIO_CONTROL = 0x2084, - NES_IDX_MAC_TX_OCTETS_LOW = 0x2100, - NES_IDX_MAC_TX_OCTETS_HIGH = 0x2104, - NES_IDX_MAC_TX_FRAMES_LOW = 0x2108, - NES_IDX_MAC_TX_FRAMES_HIGH = 0x210c, - NES_IDX_MAC_TX_PAUSE_FRAMES = 0x2118, - NES_IDX_MAC_TX_ERRORS = 0x2138, - NES_IDX_MAC_RX_OCTETS_LOW = 0x213c, - NES_IDX_MAC_RX_OCTETS_HIGH = 0x2140, - NES_IDX_MAC_RX_FRAMES_LOW = 0x2144, - NES_IDX_MAC_RX_FRAMES_HIGH = 0x2148, - NES_IDX_MAC_RX_BC_FRAMES_LOW = 0x214c, - NES_IDX_MAC_RX_MC_FRAMES_HIGH = 0x2150, - NES_IDX_MAC_RX_PAUSE_FRAMES = 0x2154, - NES_IDX_MAC_RX_SHORT_FRAMES = 0x2174, - NES_IDX_MAC_RX_OVERSIZED_FRAMES = 0x2178, - NES_IDX_MAC_RX_JABBER_FRAMES = 0x217c, - NES_IDX_MAC_RX_CRC_ERR_FRAMES = 0x2180, - NES_IDX_MAC_RX_LENGTH_ERR_FRAMES = 0x2184, - NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES = 0x2188, - NES_IDX_MAC_INT_STATUS = 0x21f0, - NES_IDX_MAC_INT_MASK = 0x21f4, - NES_IDX_PHY_PCS_CONTROL_STATUS0 = 0x2800, - NES_IDX_PHY_PCS_CONTROL_STATUS1 = 0x2a00, - NES_IDX_ETH_SERDES_COMMON_CONTROL0 = 0x2808, - NES_IDX_ETH_SERDES_COMMON_CONTROL1 = 0x2a08, - NES_IDX_ETH_SERDES_COMMON_STATUS0 = 0x280c, - NES_IDX_ETH_SERDES_COMMON_STATUS1 = 0x2a0c, - NES_IDX_ETH_SERDES_TX_EMP0 = 0x2810, - NES_IDX_ETH_SERDES_TX_EMP1 = 0x2a10, - NES_IDX_ETH_SERDES_TX_DRIVE0 = 0x2814, - NES_IDX_ETH_SERDES_TX_DRIVE1 = 0x2a14, - 
NES_IDX_ETH_SERDES_RX_MODE0 = 0x2818, - NES_IDX_ETH_SERDES_RX_MODE1 = 0x2a18, - NES_IDX_ETH_SERDES_RX_SIGDET0 = 0x281c, - NES_IDX_ETH_SERDES_RX_SIGDET1 = 0x2a1c, - NES_IDX_ETH_SERDES_BYPASS0 = 0x2820, - NES_IDX_ETH_SERDES_BYPASS1 = 0x2a20, - NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0 = 0x2824, - NES_IDX_ETH_SERDES_LOOPBACK_CONTROL1 = 0x2a24, - NES_IDX_ETH_SERDES_RX_EQ_CONTROL0 = 0x2828, - NES_IDX_ETH_SERDES_RX_EQ_CONTROL1 = 0x2a28, - NES_IDX_ETH_SERDES_RX_EQ_STATUS0 = 0x282c, - NES_IDX_ETH_SERDES_RX_EQ_STATUS1 = 0x2a2c, - NES_IDX_ETH_SERDES_CDR_RESET0 = 0x2830, - NES_IDX_ETH_SERDES_CDR_RESET1 = 0x2a30, - NES_IDX_ETH_SERDES_CDR_CONTROL0 = 0x2834, - NES_IDX_ETH_SERDES_CDR_CONTROL1 = 0x2a34, - NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0 = 0x2838, - NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE1 = 0x2a38, - NES_IDX_ENDNODE0_NSTAT_RX_DISCARD = 0x3080, - NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO = 0x3000, - NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI = 0x3004, - NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO = 0x3008, - NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI = 0x300c, - NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO = 0x7000, - NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI = 0x7004, - NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO = 0x7008, - NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI = 0x700c, - NES_IDX_WQM_CONFIG0 = 0x5000, - NES_IDX_WQM_CONFIG1 = 0x5004, - NES_IDX_CM_CONFIG = 0x5100, - NES_IDX_NIC_LOGPORT_TO_PHYPORT = 0x6000, - NES_IDX_NIC_PHYPORT_TO_USW = 0x6008, - NES_IDX_NIC_ACTIVE = 0x6010, - NES_IDX_NIC_UNICAST_ALL = 0x6018, - NES_IDX_NIC_MULTICAST_ALL = 0x6020, - NES_IDX_NIC_MULTICAST_ENABLE = 0x6028, - NES_IDX_NIC_BROADCAST_ON = 0x6030, - NES_IDX_USED_CHUNKS_TX = 0x60b0, - NES_IDX_TX_POOL_SIZE = 0x60b8, - NES_IDX_QUAD_HASH_TABLE_SIZE = 0x6148, - NES_IDX_PERFECT_FILTER_LOW = 0x6200, - NES_IDX_PERFECT_FILTER_HIGH = 0x6204, - NES_IDX_IPV4_TCP_REXMITS = 0x7080, - NES_IDX_DEBUG_ERROR_CONTROL_STATUS = 0x913c, - NES_IDX_DEBUG_ERROR_MASKS0 = 0x9140, - NES_IDX_DEBUG_ERROR_MASKS1 = 0x9144, - NES_IDX_DEBUG_ERROR_MASKS2 = 0x9148, - NES_IDX_DEBUG_ERROR_MASKS3 = 0x914c, - NES_IDX_DEBUG_ERROR_MASKS4 = 0x9150, - NES_IDX_DEBUG_ERROR_MASKS5 = 0x9154, -}; - -#define NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE 1 -#define NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE (1 << 17) - -enum nes_cqp_opcodes { - NES_CQP_CREATE_QP = 0x00, - NES_CQP_MODIFY_QP = 0x01, - NES_CQP_DESTROY_QP = 0x02, - NES_CQP_CREATE_CQ = 0x03, - NES_CQP_MODIFY_CQ = 0x04, - NES_CQP_DESTROY_CQ = 0x05, - NES_CQP_ALLOCATE_STAG = 0x09, - NES_CQP_REGISTER_STAG = 0x0a, - NES_CQP_QUERY_STAG = 0x0b, - NES_CQP_REGISTER_SHARED_STAG = 0x0c, - NES_CQP_DEALLOCATE_STAG = 0x0d, - NES_CQP_MANAGE_ARP_CACHE = 0x0f, - NES_CQP_DOWNLOAD_SEGMENT = 0x10, - NES_CQP_SUSPEND_QPS = 0x11, - NES_CQP_UPLOAD_CONTEXT = 0x13, - NES_CQP_CREATE_CEQ = 0x16, - NES_CQP_DESTROY_CEQ = 0x18, - NES_CQP_CREATE_AEQ = 0x19, - NES_CQP_DESTROY_AEQ = 0x1b, - NES_CQP_LMI_ACCESS = 0x20, - NES_CQP_FLUSH_WQES = 0x22, - NES_CQP_MANAGE_APBVT = 0x23, - NES_CQP_MANAGE_QUAD_HASH = 0x25 -}; - -enum nes_cqp_wqe_word_idx { - NES_CQP_WQE_OPCODE_IDX = 0, - NES_CQP_WQE_ID_IDX = 1, - NES_CQP_WQE_COMP_CTX_LOW_IDX = 2, - NES_CQP_WQE_COMP_CTX_HIGH_IDX = 3, - NES_CQP_WQE_COMP_SCRATCH_LOW_IDX = 4, - NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX = 5, -}; - -enum nes_cqp_wqe_word_download_idx { /* format differs from other cqp ops */ - NES_CQP_WQE_DL_OPCODE_IDX = 0, - NES_CQP_WQE_DL_COMP_CTX_LOW_IDX = 1, - NES_CQP_WQE_DL_COMP_CTX_HIGH_IDX = 2, - NES_CQP_WQE_DL_LENGTH_0_TOTAL_IDX = 3 - /* For index values 4-15 use NES_NIC_SQ_WQE_ values */ -}; - -enum nes_cqp_cq_wqeword_idx { - NES_CQP_CQ_WQE_PBL_LOW_IDX = 6, - 
NES_CQP_CQ_WQE_PBL_HIGH_IDX = 7, - NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX = 8, - NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX = 9, - NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX = 10, -}; - -enum nes_cqp_stag_wqeword_idx { - NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX = 1, - NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX = 6, - NES_CQP_STAG_WQE_LEN_LOW_IDX = 7, - NES_CQP_STAG_WQE_STAG_IDX = 8, - NES_CQP_STAG_WQE_VA_LOW_IDX = 10, - NES_CQP_STAG_WQE_VA_HIGH_IDX = 11, - NES_CQP_STAG_WQE_PA_LOW_IDX = 12, - NES_CQP_STAG_WQE_PA_HIGH_IDX = 13, - NES_CQP_STAG_WQE_PBL_LEN_IDX = 14 -}; - -#define NES_CQP_OP_LOGICAL_PORT_SHIFT 26 -#define NES_CQP_OP_IWARP_STATE_SHIFT 28 -#define NES_CQP_OP_TERMLEN_SHIFT 28 - -enum nes_cqp_qp_bits { - NES_CQP_QP_ARP_VALID = (1<<8), - NES_CQP_QP_WINBUF_VALID = (1<<9), - NES_CQP_QP_CONTEXT_VALID = (1<<10), - NES_CQP_QP_ORD_VALID = (1<<11), - NES_CQP_QP_WINBUF_DATAIND_EN = (1<<12), - NES_CQP_QP_VIRT_WQS = (1<<13), - NES_CQP_QP_DEL_HTE = (1<<14), - NES_CQP_QP_CQS_VALID = (1<<15), - NES_CQP_QP_TYPE_TSA = 0, - NES_CQP_QP_TYPE_IWARP = (1<<16), - NES_CQP_QP_TYPE_CQP = (4<<16), - NES_CQP_QP_TYPE_NIC = (5<<16), - NES_CQP_QP_MSS_CHG = (1<<20), - NES_CQP_QP_STATIC_RESOURCES = (1<<21), - NES_CQP_QP_IGNORE_MW_BOUND = (1<<22), - NES_CQP_QP_VWQ_USE_LMI = (1<<23), - NES_CQP_QP_IWARP_STATE_IDLE = (1<netdev */ - u8 perfect_filter_index; - u8 nic_index; - u8 qp_nic_index[4]; - u8 next_qp_nic_index; - u8 of_device_registered; - u8 rdma_enabled; - struct timer_list event_timer; - enum ib_event_type delayed_event; - enum ib_event_type last_dispatched_event; - spinlock_t port_ibevent_lock; - u32 mgt_mem_size; - void *mgt_vbase; - dma_addr_t mgt_pbase; - struct nes_vnic_mgt *mgtvnic[NES_MGT_QP_COUNT]; - struct task_struct *mgt_thread; - wait_queue_head_t mgt_wait_queue; - struct sk_buff_head mgt_skb_list; - -}; - -struct nes_ib_device { - struct ib_device ibdev; - struct nes_vnic *nesvnic; - - /* Virtual RNIC Limits */ - u32 max_mr; - u32 max_qp; - u32 max_cq; - u32 max_pd; - u32 num_mr; - u32 num_qp; - u32 num_cq; - u32 num_pd; -}; - -enum nes_hdrct_flags { - DDP_LEN_FLAG = 0x80, - DDP_HDR_FLAG = 0x40, - RDMA_HDR_FLAG = 0x20 -}; - -enum nes_term_layers { - LAYER_RDMA = 0, - LAYER_DDP = 1, - LAYER_MPA = 2 -}; - -enum nes_term_error_types { - RDMAP_CATASTROPHIC = 0, - RDMAP_REMOTE_PROT = 1, - RDMAP_REMOTE_OP = 2, - DDP_CATASTROPHIC = 0, - DDP_TAGGED_BUFFER = 1, - DDP_UNTAGGED_BUFFER = 2, - DDP_LLP = 3 -}; - -enum nes_term_rdma_errors { - RDMAP_INV_STAG = 0x00, - RDMAP_INV_BOUNDS = 0x01, - RDMAP_ACCESS = 0x02, - RDMAP_UNASSOC_STAG = 0x03, - RDMAP_TO_WRAP = 0x04, - RDMAP_INV_RDMAP_VER = 0x05, - RDMAP_UNEXPECTED_OP = 0x06, - RDMAP_CATASTROPHIC_LOCAL = 0x07, - RDMAP_CATASTROPHIC_GLOBAL = 0x08, - RDMAP_CANT_INV_STAG = 0x09, - RDMAP_UNSPECIFIED = 0xff -}; - -enum nes_term_ddp_errors { - DDP_CATASTROPHIC_LOCAL = 0x00, - DDP_TAGGED_INV_STAG = 0x00, - DDP_TAGGED_BOUNDS = 0x01, - DDP_TAGGED_UNASSOC_STAG = 0x02, - DDP_TAGGED_TO_WRAP = 0x03, - DDP_TAGGED_INV_DDP_VER = 0x04, - DDP_UNTAGGED_INV_QN = 0x01, - DDP_UNTAGGED_INV_MSN_NO_BUF = 0x02, - DDP_UNTAGGED_INV_MSN_RANGE = 0x03, - DDP_UNTAGGED_INV_MO = 0x04, - DDP_UNTAGGED_INV_TOO_LONG = 0x05, - DDP_UNTAGGED_INV_DDP_VER = 0x06 -}; - -enum nes_term_mpa_errors { - MPA_CLOSED = 0x01, - MPA_CRC = 0x02, - MPA_MARKER = 0x03, - MPA_REQ_RSP = 0x04, -}; - -struct nes_terminate_hdr { - u8 layer_etype; - u8 error_code; - u8 hdrct; - u8 rsvd; -}; - -/* Used to determine how to fill in terminate error codes */ -#define IWARP_OPCODE_WRITE 0 -#define IWARP_OPCODE_READREQ 1 -#define IWARP_OPCODE_READRSP 2 -#define 
IWARP_OPCODE_SEND 3 -#define IWARP_OPCODE_SEND_INV 4 -#define IWARP_OPCODE_SEND_SE 5 -#define IWARP_OPCODE_SEND_SE_INV 6 -#define IWARP_OPCODE_TERM 7 - -/* These values are used only during terminate processing */ -#define TERM_DDP_LEN_TAGGED 14 -#define TERM_DDP_LEN_UNTAGGED 18 -#define TERM_RDMA_LEN 28 -#define RDMA_OPCODE_MASK 0x0f -#define RDMA_READ_REQ_OPCODE 1 -#define BAD_FRAME_OFFSET 64 -#define CQE_MAJOR_DRV 0x8000 - -/* Used for link status recheck after interrupt processing */ -#define NES_LINK_RECHECK_DELAY msecs_to_jiffies(50) -#define NES_LINK_RECHECK_MAX 60 - -#endif /* __NES_HW_H */ diff --git a/drivers/infiniband/hw/nes/nes_mgt.c b/drivers/infiniband/hw/nes/nes_mgt.c deleted file mode 100644 index cc4dce5c3e5f..000000000000 --- a/drivers/infiniband/hw/nes/nes_mgt.c +++ /dev/null @@ -1,1155 +0,0 @@ -/* - * Copyright (c) 2006 - 2011 Intel-NE, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - */ - -#include -#include -#include -#include -#include -#include -#include "nes.h" -#include "nes_mgt.h" - -atomic_t pau_qps_created; -atomic_t pau_qps_destroyed; - -static void nes_replenish_mgt_rq(struct nes_vnic_mgt *mgtvnic) -{ - unsigned long flags; - dma_addr_t bus_address; - struct sk_buff *skb; - struct nes_hw_nic_rq_wqe *nic_rqe; - struct nes_hw_mgt *nesmgt; - struct nes_device *nesdev; - struct nes_rskb_cb *cb; - u32 rx_wqes_posted = 0; - - nesmgt = &mgtvnic->mgt; - nesdev = mgtvnic->nesvnic->nesdev; - spin_lock_irqsave(&nesmgt->rq_lock, flags); - if (nesmgt->replenishing_rq != 0) { - if (((nesmgt->rq_size - 1) == atomic_read(&mgtvnic->rx_skbs_needed)) && - (atomic_read(&mgtvnic->rx_skb_timer_running) == 0)) { - atomic_set(&mgtvnic->rx_skb_timer_running, 1); - spin_unlock_irqrestore(&nesmgt->rq_lock, flags); - mgtvnic->rq_wqes_timer.expires = jiffies + (HZ / 2); /* 1/2 second */ - add_timer(&mgtvnic->rq_wqes_timer); - } else { - spin_unlock_irqrestore(&nesmgt->rq_lock, flags); - } - return; - } - nesmgt->replenishing_rq = 1; - spin_unlock_irqrestore(&nesmgt->rq_lock, flags); - do { - skb = dev_alloc_skb(mgtvnic->nesvnic->max_frame_size); - if (skb) { - skb->dev = mgtvnic->nesvnic->netdev; - - bus_address = pci_map_single(nesdev->pcidev, - skb->data, mgtvnic->nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); - cb = (struct nes_rskb_cb *)&skb->cb[0]; - cb->busaddr = bus_address; - cb->maplen = mgtvnic->nesvnic->max_frame_size; - - nic_rqe = &nesmgt->rq_vbase[mgtvnic->mgt.rq_head]; - nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = - cpu_to_le32(mgtvnic->nesvnic->max_frame_size); - nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0; - nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = - cpu_to_le32((u32)bus_address); - nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = - cpu_to_le32((u32)((u64)bus_address >> 32)); - nesmgt->rx_skb[nesmgt->rq_head] = skb; - nesmgt->rq_head++; - nesmgt->rq_head &= nesmgt->rq_size - 1; - atomic_dec(&mgtvnic->rx_skbs_needed); - barrier(); - if (++rx_wqes_posted == 255) { - nes_write32(nesdev->regs + NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesmgt->qp_id); - rx_wqes_posted = 0; - } - } else { - spin_lock_irqsave(&nesmgt->rq_lock, flags); - if (((nesmgt->rq_size - 1) == atomic_read(&mgtvnic->rx_skbs_needed)) && - (atomic_read(&mgtvnic->rx_skb_timer_running) == 0)) { - atomic_set(&mgtvnic->rx_skb_timer_running, 1); - spin_unlock_irqrestore(&nesmgt->rq_lock, flags); - mgtvnic->rq_wqes_timer.expires = jiffies + (HZ / 2); /* 1/2 second */ - add_timer(&mgtvnic->rq_wqes_timer); - } else { - spin_unlock_irqrestore(&nesmgt->rq_lock, flags); - } - break; - } - } while (atomic_read(&mgtvnic->rx_skbs_needed)); - barrier(); - if (rx_wqes_posted) - nes_write32(nesdev->regs + NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesmgt->qp_id); - nesmgt->replenishing_rq = 0; -} - -/** - * nes_mgt_rq_wqes_timeout - */ -static void nes_mgt_rq_wqes_timeout(struct timer_list *t) -{ - struct nes_vnic_mgt *mgtvnic = from_timer(mgtvnic, t, - rq_wqes_timer); - - atomic_set(&mgtvnic->rx_skb_timer_running, 0); - if (atomic_read(&mgtvnic->rx_skbs_needed)) - nes_replenish_mgt_rq(mgtvnic); -} - -/** - * nes_mgt_free_skb - unmap and free skb - */ -static void nes_mgt_free_skb(struct nes_device *nesdev, struct sk_buff *skb, u32 dir) -{ - struct nes_rskb_cb *cb; - - cb = (struct nes_rskb_cb *)&skb->cb[0]; - pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen, dir); - cb->busaddr = 0; - dev_kfree_skb_any(skb); -} - -/** - * nes_download_callback - handle download completions - */ 
-static void nes_download_callback(struct nes_device *nesdev, struct nes_cqp_request *cqp_request) -{ - struct pau_fpdu_info *fpdu_info = cqp_request->cqp_callback_pointer; - struct nes_qp *nesqp = fpdu_info->nesqp; - struct sk_buff *skb; - int i; - - for (i = 0; i < fpdu_info->frag_cnt; i++) { - skb = fpdu_info->frags[i].skb; - if (fpdu_info->frags[i].cmplt) { - nes_mgt_free_skb(nesdev, skb, PCI_DMA_TODEVICE); - nes_rem_ref_cm_node(nesqp->cm_node); - } - } - - if (fpdu_info->hdr_vbase) - pci_free_consistent(nesdev->pcidev, fpdu_info->hdr_len, - fpdu_info->hdr_vbase, fpdu_info->hdr_pbase); - kfree(fpdu_info); -} - -/** - * nes_get_seq - Get the seq, ack_seq and window from the packet - */ -static u32 nes_get_seq(struct sk_buff *skb, u32 *ack, u16 *wnd, u32 *fin_rcvd, u32 *rst_rcvd) -{ - struct nes_rskb_cb *cb = (struct nes_rskb_cb *)&skb->cb[0]; - struct iphdr *iph = (struct iphdr *)(cb->data_start + ETH_HLEN); - struct tcphdr *tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); - - *ack = be32_to_cpu(tcph->ack_seq); - *wnd = be16_to_cpu(tcph->window); - *fin_rcvd = tcph->fin; - *rst_rcvd = tcph->rst; - return be32_to_cpu(tcph->seq); -} - -/** - * nes_get_next_skb - Get the next skb based on where current skb is in the queue - */ -static struct sk_buff *nes_get_next_skb(struct nes_device *nesdev, struct nes_qp *nesqp, - struct sk_buff *skb, u32 nextseq, u32 *ack, - u16 *wnd, u32 *fin_rcvd, u32 *rst_rcvd) -{ - u32 seq; - bool processacks; - struct sk_buff *old_skb; - - if (skb) { - /* Continue processing fpdu */ - skb = skb_peek_next(skb, &nesqp->pau_list); - if (!skb) - goto out; - processacks = false; - } else { - /* Starting a new one */ - if (skb_queue_empty(&nesqp->pau_list)) - goto out; - skb = skb_peek(&nesqp->pau_list); - processacks = true; - } - - while (1) { - if (skb_queue_empty(&nesqp->pau_list)) - goto out; - - seq = nes_get_seq(skb, ack, wnd, fin_rcvd, rst_rcvd); - if (seq == nextseq) { - if (skb->len || processacks) - break; - } else if (after(seq, nextseq)) { - goto out; - } - - old_skb = skb; - skb = skb_peek_next(skb, &nesqp->pau_list); - skb_unlink(old_skb, &nesqp->pau_list); - nes_mgt_free_skb(nesdev, old_skb, PCI_DMA_TODEVICE); - nes_rem_ref_cm_node(nesqp->cm_node); - if (!skb) - goto out; - } - return skb; - -out: - return NULL; -} - -/** - * get_fpdu_info - Find the next complete fpdu and return its fragments. 
- */ -static int get_fpdu_info(struct nes_device *nesdev, struct nes_qp *nesqp, - struct pau_fpdu_info **pau_fpdu_info) -{ - struct sk_buff *skb; - struct iphdr *iph; - struct tcphdr *tcph; - struct nes_rskb_cb *cb; - struct pau_fpdu_info *fpdu_info = NULL; - struct pau_fpdu_frag frags[MAX_FPDU_FRAGS]; - u32 fpdu_len = 0; - u32 tmp_len; - int frag_cnt = 0; - u32 tot_len; - u32 frag_tot; - u32 ack; - u32 fin_rcvd; - u32 rst_rcvd; - u16 wnd; - int i; - int rc = 0; - - *pau_fpdu_info = NULL; - - skb = nes_get_next_skb(nesdev, nesqp, NULL, nesqp->pau_rcv_nxt, &ack, &wnd, &fin_rcvd, &rst_rcvd); - if (!skb) - goto out; - - cb = (struct nes_rskb_cb *)&skb->cb[0]; - if (skb->len) { - fpdu_len = be16_to_cpu(*(__be16 *) skb->data) + MPA_FRAMING; - fpdu_len = (fpdu_len + 3) & 0xfffffffc; - tmp_len = fpdu_len; - - /* See if we have all of the fpdu */ - frag_tot = 0; - memset(&frags, 0, sizeof frags); - for (i = 0; i < MAX_FPDU_FRAGS; i++) { - frags[i].physaddr = cb->busaddr; - frags[i].physaddr += skb->data - cb->data_start; - frags[i].frag_len = min(tmp_len, skb->len); - frags[i].skb = skb; - frags[i].cmplt = (skb->len == frags[i].frag_len); - frag_tot += frags[i].frag_len; - frag_cnt++; - - tmp_len -= frags[i].frag_len; - if (tmp_len == 0) - break; - - skb = nes_get_next_skb(nesdev, nesqp, skb, - nesqp->pau_rcv_nxt + frag_tot, &ack, &wnd, &fin_rcvd, &rst_rcvd); - if (!skb) - goto out; - if (rst_rcvd) { - /* rst received in the middle of fpdu */ - for (; i >= 0; i--) { - skb_unlink(frags[i].skb, &nesqp->pau_list); - nes_mgt_free_skb(nesdev, frags[i].skb, PCI_DMA_TODEVICE); - } - cb = (struct nes_rskb_cb *)&skb->cb[0]; - frags[0].physaddr = cb->busaddr; - frags[0].physaddr += skb->data - cb->data_start; - frags[0].frag_len = skb->len; - frags[0].skb = skb; - frags[0].cmplt = true; - frag_cnt = 1; - break; - } - - cb = (struct nes_rskb_cb *)&skb->cb[0]; - } - } else { - /* no data */ - frags[0].physaddr = cb->busaddr; - frags[0].frag_len = 0; - frags[0].skb = skb; - frags[0].cmplt = true; - frag_cnt = 1; - } - - /* Found one */ - fpdu_info = kzalloc(sizeof(*fpdu_info), GFP_ATOMIC); - if (!fpdu_info) { - rc = -ENOMEM; - goto out; - } - - fpdu_info->cqp_request = nes_get_cqp_request(nesdev); - if (fpdu_info->cqp_request == NULL) { - nes_debug(NES_DBG_PAU, "Failed to get a cqp_request.\n"); - rc = -ENOMEM; - goto out; - } - - cb = (struct nes_rskb_cb *)&frags[0].skb->cb[0]; - iph = (struct iphdr *)(cb->data_start + ETH_HLEN); - tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); - fpdu_info->hdr_len = (((unsigned char *)tcph) + 4 * (tcph->doff)) - cb->data_start; - fpdu_info->data_len = fpdu_len; - tot_len = fpdu_info->hdr_len + fpdu_len - ETH_HLEN; - - if (frags[0].cmplt) { - fpdu_info->hdr_pbase = cb->busaddr; - fpdu_info->hdr_vbase = NULL; - } else { - fpdu_info->hdr_vbase = pci_alloc_consistent(nesdev->pcidev, - fpdu_info->hdr_len, &fpdu_info->hdr_pbase); - if (!fpdu_info->hdr_vbase) { - nes_debug(NES_DBG_PAU, "Unable to allocate memory for pau first frag\n"); - rc = -ENOMEM; - goto out; - } - - /* Copy hdrs, adjusting len and seqnum */ - memcpy(fpdu_info->hdr_vbase, cb->data_start, fpdu_info->hdr_len); - iph = (struct iphdr *)(fpdu_info->hdr_vbase + ETH_HLEN); - tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); - } - - iph->tot_len = cpu_to_be16(tot_len); - iph->saddr = cpu_to_be32(0x7f000001); - - tcph->seq = cpu_to_be32(nesqp->pau_rcv_nxt); - tcph->ack_seq = cpu_to_be32(ack); - tcph->window = cpu_to_be16(wnd); - - nesqp->pau_rcv_nxt += fpdu_len + fin_rcvd; - - memcpy(fpdu_info->frags, 
frags, sizeof(fpdu_info->frags)); - fpdu_info->frag_cnt = frag_cnt; - fpdu_info->nesqp = nesqp; - *pau_fpdu_info = fpdu_info; - - /* Update skb's for next pass */ - for (i = 0; i < frag_cnt; i++) { - cb = (struct nes_rskb_cb *)&frags[i].skb->cb[0]; - skb_pull(frags[i].skb, frags[i].frag_len); - - if (frags[i].skb->len == 0) { - /* Pull skb off the list - it will be freed in the callback */ - if (!skb_queue_empty(&nesqp->pau_list)) - skb_unlink(frags[i].skb, &nesqp->pau_list); - } else { - /* Last skb still has data so update the seq */ - iph = (struct iphdr *)(cb->data_start + ETH_HLEN); - tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); - tcph->seq = cpu_to_be32(nesqp->pau_rcv_nxt); - } - } - -out: - if (rc) { - if (fpdu_info) { - if (fpdu_info->cqp_request) - nes_put_cqp_request(nesdev, fpdu_info->cqp_request); - kfree(fpdu_info); - } - } - return rc; -} - -/** - * forward_fpdu - send complete fpdus, one at a time - */ -static int forward_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp) -{ - struct nes_device *nesdev = nesvnic->nesdev; - struct pau_fpdu_info *fpdu_info; - struct nes_hw_cqp_wqe *cqp_wqe; - struct nes_cqp_request *cqp_request; - unsigned long flags; - u64 u64tmp; - u32 u32tmp; - int rc; - - while (1) { - spin_lock_irqsave(&nesqp->pau_lock, flags); - rc = get_fpdu_info(nesdev, nesqp, &fpdu_info); - if (rc || (fpdu_info == NULL)) { - spin_unlock_irqrestore(&nesqp->pau_lock, flags); - return rc; - } - - cqp_request = fpdu_info->cqp_request; - cqp_wqe = &cqp_request->cqp_wqe; - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_DL_OPCODE_IDX, - NES_CQP_DOWNLOAD_SEGMENT | - (((u32)nesvnic->logical_port) << NES_CQP_OP_LOGICAL_PORT_SHIFT)); - - u32tmp = fpdu_info->hdr_len << 16; - u32tmp |= fpdu_info->hdr_len + (u32)fpdu_info->data_len; - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_DL_LENGTH_0_TOTAL_IDX, - u32tmp); - - u32tmp = (fpdu_info->frags[1].frag_len << 16) | fpdu_info->frags[0].frag_len; - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_LENGTH_2_1_IDX, - u32tmp); - - u32tmp = (fpdu_info->frags[3].frag_len << 16) | fpdu_info->frags[2].frag_len; - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_LENGTH_4_3_IDX, - u32tmp); - - u64tmp = (u64)fpdu_info->hdr_pbase; - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX, - lower_32_bits(u64tmp)); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_HIGH_IDX, - upper_32_bits(u64tmp)); - - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX, - lower_32_bits(fpdu_info->frags[0].physaddr)); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_HIGH_IDX, - upper_32_bits(fpdu_info->frags[0].physaddr)); - - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG2_LOW_IDX, - lower_32_bits(fpdu_info->frags[1].physaddr)); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG2_HIGH_IDX, - upper_32_bits(fpdu_info->frags[1].physaddr)); - - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG3_LOW_IDX, - lower_32_bits(fpdu_info->frags[2].physaddr)); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG3_HIGH_IDX, - upper_32_bits(fpdu_info->frags[2].physaddr)); - - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG4_LOW_IDX, - lower_32_bits(fpdu_info->frags[3].physaddr)); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG4_HIGH_IDX, - upper_32_bits(fpdu_info->frags[3].physaddr)); - - cqp_request->cqp_callback_pointer = fpdu_info; - cqp_request->callback = 1; - 
cqp_request->cqp_callback = nes_download_callback; - - atomic_set(&cqp_request->refcount, 1); - nes_post_cqp_request(nesdev, cqp_request); - spin_unlock_irqrestore(&nesqp->pau_lock, flags); - } - - return 0; -} - -static void process_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp) -{ - int again = 1; - unsigned long flags; - - do { - /* Ignore rc - if it failed, tcp retries will cause it to try again */ - forward_fpdus(nesvnic, nesqp); - - spin_lock_irqsave(&nesqp->pau_lock, flags); - if (nesqp->pau_pending) { - nesqp->pau_pending = 0; - } else { - nesqp->pau_busy = 0; - again = 0; - } - - spin_unlock_irqrestore(&nesqp->pau_lock, flags); - } while (again); -} - -/** - * queue_fpdus - Handle fpdu's that hw passed up to sw - */ -static void queue_fpdus(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp) -{ - struct sk_buff *tmpskb; - struct nes_rskb_cb *cb; - struct iphdr *iph; - struct tcphdr *tcph; - unsigned char *tcph_end; - u32 rcv_nxt; - u32 rcv_wnd; - u32 seqnum; - u32 len; - bool process_it = false; - unsigned long flags; - - /* Move data ptr to after tcp header */ - iph = (struct iphdr *)skb->data; - tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); - seqnum = be32_to_cpu(tcph->seq); - tcph_end = (((char *)tcph) + (4 * tcph->doff)); - - len = be16_to_cpu(iph->tot_len); - if (skb->len > len) - skb_trim(skb, len); - skb_pull(skb, tcph_end - skb->data); - - /* Initialize tracking values */ - cb = (struct nes_rskb_cb *)&skb->cb[0]; - cb->seqnum = seqnum; - - /* Make sure data is in the receive window */ - rcv_nxt = nesqp->pau_rcv_nxt; - rcv_wnd = le32_to_cpu(nesqp->nesqp_context->rcv_wnd); - if (!between(seqnum, rcv_nxt, (rcv_nxt + rcv_wnd))) { - nes_mgt_free_skb(nesvnic->nesdev, skb, PCI_DMA_TODEVICE); - nes_rem_ref_cm_node(nesqp->cm_node); - return; - } - - spin_lock_irqsave(&nesqp->pau_lock, flags); - - if (nesqp->pau_busy) - nesqp->pau_pending = 1; - else - nesqp->pau_busy = 1; - - /* Queue skb by sequence number */ - if (skb_queue_len(&nesqp->pau_list) == 0) { - __skb_queue_head(&nesqp->pau_list, skb); - } else { - skb_queue_walk(&nesqp->pau_list, tmpskb) { - cb = (struct nes_rskb_cb *)&tmpskb->cb[0]; - if (before(seqnum, cb->seqnum)) - break; - } - __skb_insert(skb, tmpskb->prev, tmpskb, &nesqp->pau_list); - } - if (nesqp->pau_state == PAU_READY) - process_it = true; - spin_unlock_irqrestore(&nesqp->pau_lock, flags); - - if (process_it) - process_fpdus(nesvnic, nesqp); - - return; -} - -/** - * mgt_thread - Handle mgt skbs in a safe context - */ -static int mgt_thread(void *context) -{ - struct nes_vnic *nesvnic = context; - struct sk_buff *skb; - struct nes_rskb_cb *cb; - - while (!kthread_should_stop()) { - wait_event_interruptible(nesvnic->mgt_wait_queue, - skb_queue_len(&nesvnic->mgt_skb_list) || kthread_should_stop()); - while ((skb_queue_len(&nesvnic->mgt_skb_list)) && !kthread_should_stop()) { - skb = skb_dequeue(&nesvnic->mgt_skb_list); - cb = (struct nes_rskb_cb *)&skb->cb[0]; - cb->data_start = skb->data - ETH_HLEN; - cb->busaddr = pci_map_single(nesvnic->nesdev->pcidev, cb->data_start, - nesvnic->max_frame_size, PCI_DMA_TODEVICE); - queue_fpdus(skb, nesvnic, cb->nesqp); - } - } - - /* Closing down so delete any entries on the queue */ - while (skb_queue_len(&nesvnic->mgt_skb_list)) { - skb = skb_dequeue(&nesvnic->mgt_skb_list); - cb = (struct nes_rskb_cb *)&skb->cb[0]; - nes_rem_ref_cm_node(cb->nesqp->cm_node); - dev_kfree_skb_any(skb); - } - return 0; -} - -/** - * nes_queue_skbs - Queue skb so it can be handled in a thread context - */ 
-void nes_queue_mgt_skbs(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp) -{ - struct nes_rskb_cb *cb; - - cb = (struct nes_rskb_cb *)&skb->cb[0]; - cb->nesqp = nesqp; - skb_queue_tail(&nesvnic->mgt_skb_list, skb); - wake_up_interruptible(&nesvnic->mgt_wait_queue); -} - -void nes_destroy_pau_qp(struct nes_device *nesdev, struct nes_qp *nesqp) -{ - struct sk_buff *skb; - unsigned long flags; - atomic_inc(&pau_qps_destroyed); - - /* Free packets that have not yet been forwarded */ - /* Lock is acquired by skb_dequeue when removing the skb */ - spin_lock_irqsave(&nesqp->pau_lock, flags); - while (skb_queue_len(&nesqp->pau_list)) { - skb = skb_dequeue(&nesqp->pau_list); - nes_mgt_free_skb(nesdev, skb, PCI_DMA_TODEVICE); - nes_rem_ref_cm_node(nesqp->cm_node); - } - spin_unlock_irqrestore(&nesqp->pau_lock, flags); -} - -static void nes_chg_qh_handler(struct nes_device *nesdev, struct nes_cqp_request *cqp_request) -{ - struct pau_qh_chg *qh_chg = cqp_request->cqp_callback_pointer; - struct nes_cqp_request *new_request; - struct nes_hw_cqp_wqe *cqp_wqe; - struct nes_adapter *nesadapter; - struct nes_qp *nesqp; - struct nes_v4_quad nes_quad; - u32 crc_value; - u64 u64temp; - - nesadapter = nesdev->nesadapter; - nesqp = qh_chg->nesqp; - - /* Should we handle the bad completion */ - if (cqp_request->major_code) - WARN(1, PFX "Invalid cqp_request major_code=0x%x\n", - cqp_request->major_code); - - switch (nesqp->pau_state) { - case PAU_DEL_QH: - /* Old hash code deleted, now set the new one */ - nesqp->pau_state = PAU_ADD_LB_QH; - new_request = nes_get_cqp_request(nesdev); - if (new_request == NULL) { - nes_debug(NES_DBG_PAU, "Failed to get a new_request.\n"); - WARN_ON(1); - return; - } - - memset(&nes_quad, 0, sizeof(nes_quad)); - nes_quad.DstIpAdrIndex = - cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); - nes_quad.SrcIpadr = cpu_to_be32(0x7f000001); - nes_quad.TcpPorts[0] = swab16(nesqp->nesqp_context->tcpPorts[1]); - nes_quad.TcpPorts[1] = swab16(nesqp->nesqp_context->tcpPorts[0]); - - /* Produce hash key */ - crc_value = get_crc_value(&nes_quad); - nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff); - nes_debug(NES_DBG_PAU, "new HTE Index = 0x%08X, CRC = 0x%08X\n", - nesqp->hte_index, nesqp->hte_index & nesadapter->hte_index_mask); - - nesqp->hte_index &= nesadapter->hte_index_mask; - nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index); - nesqp->nesqp_context->ip0 = cpu_to_le32(0x7f000001); - nesqp->nesqp_context->rcv_nxt = cpu_to_le32(nesqp->pau_rcv_nxt); - - cqp_wqe = &new_request->cqp_wqe; - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - set_wqe_32bit_value(cqp_wqe->wqe_words, - NES_CQP_WQE_OPCODE_IDX, NES_CQP_MANAGE_QUAD_HASH | - NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_CONTEXT_VALID | NES_CQP_QP_IWARP_STATE_RTS); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); - u64temp = (u64)nesqp->nesqp_context_pbase; - set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); - - nes_debug(NES_DBG_PAU, "Waiting for CQP completion for adding the quad hash.\n"); - - new_request->cqp_callback_pointer = qh_chg; - new_request->callback = 1; - new_request->cqp_callback = nes_chg_qh_handler; - atomic_set(&new_request->refcount, 1); - nes_post_cqp_request(nesdev, new_request); - break; - - case PAU_ADD_LB_QH: - /* Start processing the queued fpdu's */ - nesqp->pau_state = PAU_READY; - process_fpdus(qh_chg->nesvnic, qh_chg->nesqp); - kfree(qh_chg); - break; - } -} - -/** - * nes_change_quad_hash - */ -static int 
nes_change_quad_hash(struct nes_device *nesdev, - struct nes_vnic *nesvnic, struct nes_qp *nesqp) -{ - struct nes_cqp_request *cqp_request = NULL; - struct pau_qh_chg *qh_chg = NULL; - u64 u64temp; - struct nes_hw_cqp_wqe *cqp_wqe; - int ret = 0; - - cqp_request = nes_get_cqp_request(nesdev); - if (cqp_request == NULL) { - nes_debug(NES_DBG_PAU, "Failed to get a cqp_request.\n"); - ret = -ENOMEM; - goto chg_qh_err; - } - - qh_chg = kmalloc(sizeof *qh_chg, GFP_ATOMIC); - if (!qh_chg) { - ret = -ENOMEM; - goto chg_qh_err; - } - qh_chg->nesdev = nesdev; - qh_chg->nesvnic = nesvnic; - qh_chg->nesqp = nesqp; - nesqp->pau_state = PAU_DEL_QH; - - cqp_wqe = &cqp_request->cqp_wqe; - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - set_wqe_32bit_value(cqp_wqe->wqe_words, - NES_CQP_WQE_OPCODE_IDX, NES_CQP_MANAGE_QUAD_HASH | NES_CQP_QP_DEL_HTE | - NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_CONTEXT_VALID | NES_CQP_QP_IWARP_STATE_RTS); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); - u64temp = (u64)nesqp->nesqp_context_pbase; - set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); - - nes_debug(NES_DBG_PAU, "Waiting for CQP completion for deleting the quad hash.\n"); - - cqp_request->cqp_callback_pointer = qh_chg; - cqp_request->callback = 1; - cqp_request->cqp_callback = nes_chg_qh_handler; - atomic_set(&cqp_request->refcount, 1); - nes_post_cqp_request(nesdev, cqp_request); - - return ret; - -chg_qh_err: - kfree(qh_chg); - if (cqp_request) - nes_put_cqp_request(nesdev, cqp_request); - return ret; -} - -/** - * nes_mgt_ce_handler - * This management code deals with any packed and unaligned (pau) fpdu's - * that the hardware cannot handle. - */ -static void nes_mgt_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) -{ - struct nes_vnic_mgt *mgtvnic = container_of(cq, struct nes_vnic_mgt, mgt_cq); - struct nes_adapter *nesadapter = nesdev->nesadapter; - u32 head; - u32 cq_size; - u32 cqe_count = 0; - u32 cqe_misc; - u32 qp_id = 0; - u32 skbs_needed; - unsigned long context; - struct nes_qp *nesqp; - struct sk_buff *rx_skb; - struct nes_rskb_cb *cb; - - head = cq->cq_head; - cq_size = cq->cq_size; - - while (1) { - cqe_misc = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]); - if (!(cqe_misc & NES_NIC_CQE_VALID)) - break; - - nesqp = NULL; - if (cqe_misc & NES_NIC_CQE_ACCQP_VALID) { - qp_id = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_ACCQP_ID_IDX]); - qp_id &= 0x001fffff; - if (qp_id < nesadapter->max_qp) { - context = (unsigned long)nesadapter->qp_table[qp_id - NES_FIRST_QPN]; - nesqp = (struct nes_qp *)context; - } - } - - if (nesqp) { - if (nesqp->pau_mode == false) { - nesqp->pau_mode = true; /* First time for this qp */ - nesqp->pau_rcv_nxt = le32_to_cpu( - cq->cq_vbase[head].cqe_words[NES_NIC_CQE_HASH_RCVNXT]); - skb_queue_head_init(&nesqp->pau_list); - spin_lock_init(&nesqp->pau_lock); - atomic_inc(&pau_qps_created); - nes_change_quad_hash(nesdev, mgtvnic->nesvnic, nesqp); - } - - rx_skb = mgtvnic->mgt.rx_skb[mgtvnic->mgt.rq_tail]; - rx_skb->len = 0; - skb_put(rx_skb, cqe_misc & 0x0000ffff); - rx_skb->protocol = eth_type_trans(rx_skb, mgtvnic->nesvnic->netdev); - cb = (struct nes_rskb_cb *)&rx_skb->cb[0]; - pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen, PCI_DMA_FROMDEVICE); - cb->busaddr = 0; - mgtvnic->mgt.rq_tail++; - mgtvnic->mgt.rq_tail &= mgtvnic->mgt.rq_size - 1; - - nes_add_ref_cm_node(nesqp->cm_node); - nes_queue_mgt_skbs(rx_skb, mgtvnic->nesvnic, nesqp); - } else { - printk(KERN_ERR PFX 
"Invalid QP %d for packed/unaligned handling\n", qp_id); - } - - cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX] = 0; - cqe_count++; - if (++head >= cq_size) - head = 0; - - if (cqe_count == 255) { - /* Replenish mgt CQ */ - nes_write32(nesdev->regs + NES_CQE_ALLOC, cq->cq_number | (cqe_count << 16)); - nesdev->currcq_count += cqe_count; - cqe_count = 0; - } - - skbs_needed = atomic_inc_return(&mgtvnic->rx_skbs_needed); - if (skbs_needed > (mgtvnic->mgt.rq_size >> 1)) - nes_replenish_mgt_rq(mgtvnic); - } - - cq->cq_head = head; - nes_write32(nesdev->regs + NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | - cq->cq_number | (cqe_count << 16)); - nes_read32(nesdev->regs + NES_CQE_ALLOC); - nesdev->currcq_count += cqe_count; -} - -/** - * nes_init_mgt_qp - */ -int nes_init_mgt_qp(struct nes_device *nesdev, struct net_device *netdev, struct nes_vnic *nesvnic) -{ - struct nes_vnic_mgt *mgtvnic; - u32 counter; - void *vmem; - dma_addr_t pmem; - struct nes_hw_cqp_wqe *cqp_wqe; - u32 cqp_head; - unsigned long flags; - struct nes_hw_nic_qp_context *mgt_context; - u64 u64temp; - struct nes_hw_nic_rq_wqe *mgt_rqe; - struct sk_buff *skb; - u32 wqe_count; - struct nes_rskb_cb *cb; - u32 mgt_mem_size; - void *mgt_vbase; - dma_addr_t mgt_pbase; - int i; - int ret; - - /* Allocate space the all mgt QPs once */ - mgtvnic = kcalloc(NES_MGT_QP_COUNT, sizeof(struct nes_vnic_mgt), - GFP_KERNEL); - if (!mgtvnic) - return -ENOMEM; - - /* Allocate fragment, RQ, and CQ; Reuse CEQ based on the PCI function */ - /* We are not sending from this NIC so sq is not allocated */ - mgt_mem_size = 256 + - (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe)) + - (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_cqe)) + - sizeof(struct nes_hw_nic_qp_context); - mgt_mem_size = (mgt_mem_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); - mgt_vbase = pci_alloc_consistent(nesdev->pcidev, NES_MGT_QP_COUNT * mgt_mem_size, &mgt_pbase); - if (!mgt_vbase) { - kfree(mgtvnic); - nes_debug(NES_DBG_INIT, "Unable to allocate memory for mgt host descriptor rings\n"); - return -ENOMEM; - } - - nesvnic->mgt_mem_size = NES_MGT_QP_COUNT * mgt_mem_size; - nesvnic->mgt_vbase = mgt_vbase; - nesvnic->mgt_pbase = mgt_pbase; - - skb_queue_head_init(&nesvnic->mgt_skb_list); - init_waitqueue_head(&nesvnic->mgt_wait_queue); - nesvnic->mgt_thread = kthread_run(mgt_thread, nesvnic, "nes_mgt_thread"); - - for (i = 0; i < NES_MGT_QP_COUNT; i++) { - mgtvnic->nesvnic = nesvnic; - mgtvnic->mgt.qp_id = nesdev->mac_index + NES_MGT_QP_OFFSET + i; - memset(mgt_vbase, 0, mgt_mem_size); - nes_debug(NES_DBG_INIT, "Allocated mgt QP structures at %p (phys = %016lX), size = %u.\n", - mgt_vbase, (unsigned long)mgt_pbase, mgt_mem_size); - - vmem = (void *)(((unsigned long)mgt_vbase + (256 - 1)) & - ~(unsigned long)(256 - 1)); - pmem = (dma_addr_t)(((unsigned long long)mgt_pbase + (256 - 1)) & - ~(unsigned long long)(256 - 1)); - - spin_lock_init(&mgtvnic->mgt.rq_lock); - - /* setup the RQ */ - mgtvnic->mgt.rq_vbase = vmem; - mgtvnic->mgt.rq_pbase = pmem; - mgtvnic->mgt.rq_head = 0; - mgtvnic->mgt.rq_tail = 0; - mgtvnic->mgt.rq_size = NES_MGT_WQ_COUNT; - - /* setup the CQ */ - vmem += (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe)); - pmem += (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe)); - - mgtvnic->mgt_cq.cq_number = mgtvnic->mgt.qp_id; - mgtvnic->mgt_cq.cq_vbase = vmem; - mgtvnic->mgt_cq.cq_pbase = pmem; - mgtvnic->mgt_cq.cq_head = 0; - mgtvnic->mgt_cq.cq_size = NES_MGT_WQ_COUNT; - - mgtvnic->mgt_cq.ce_handler = nes_mgt_ce_handler; - - /* Send CreateCQ request to CQP */ 
- spin_lock_irqsave(&nesdev->cqp.lock, flags); - cqp_head = nesdev->cqp.sq_head; - - cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - - cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32( - NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID | - ((u32)mgtvnic->mgt_cq.cq_size << 16)); - cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32( - mgtvnic->mgt_cq.cq_number | ((u32)nesdev->ceq_index << 16)); - u64temp = (u64)mgtvnic->mgt_cq.cq_pbase; - set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); - cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0; - u64temp = (unsigned long)&mgtvnic->mgt_cq; - cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = cpu_to_le32((u32)(u64temp >> 1)); - cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = - cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF); - cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0; - - if (++cqp_head >= nesdev->cqp.sq_size) - cqp_head = 0; - cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - - /* Send CreateQP request to CQP */ - mgt_context = (void *)(&mgtvnic->mgt_cq.cq_vbase[mgtvnic->mgt_cq.cq_size]); - mgt_context->context_words[NES_NIC_CTX_MISC_IDX] = - cpu_to_le32((u32)NES_MGT_CTX_SIZE | - ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 12)); - nes_debug(NES_DBG_INIT, "RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x%08X, RX_WINDOW_BUFFER_SIZE = 0x%08X\n", - nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE), - nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE)); - if (nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE) != 0) - mgt_context->context_words[NES_NIC_CTX_MISC_IDX] |= cpu_to_le32(NES_NIC_BACK_STORE); - - u64temp = (u64)mgtvnic->mgt.rq_pbase; - mgt_context->context_words[NES_NIC_CTX_SQ_LOW_IDX] = cpu_to_le32((u32)u64temp); - mgt_context->context_words[NES_NIC_CTX_SQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32)); - u64temp = (u64)mgtvnic->mgt.rq_pbase; - mgt_context->context_words[NES_NIC_CTX_RQ_LOW_IDX] = cpu_to_le32((u32)u64temp); - mgt_context->context_words[NES_NIC_CTX_RQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32)); - - cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_CREATE_QP | - NES_CQP_QP_TYPE_NIC); - cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(mgtvnic->mgt.qp_id); - u64temp = (u64)mgtvnic->mgt_cq.cq_pbase + - (mgtvnic->mgt_cq.cq_size * sizeof(struct nes_hw_nic_cqe)); - set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); - - if (++cqp_head >= nesdev->cqp.sq_size) - cqp_head = 0; - nesdev->cqp.sq_head = cqp_head; - - barrier(); - - /* Ring doorbell (2 WQEs) */ - nes_write32(nesdev->regs + NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id); - - spin_unlock_irqrestore(&nesdev->cqp.lock, flags); - nes_debug(NES_DBG_INIT, "Waiting for create MGT QP%u to complete.\n", - mgtvnic->mgt.qp_id); - - ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head), - NES_EVENT_TIMEOUT); - nes_debug(NES_DBG_INIT, "Create MGT QP%u completed, wait_event_timeout ret = %u.\n", - mgtvnic->mgt.qp_id, ret); - if (!ret) { - nes_debug(NES_DBG_INIT, "MGT QP%u create timeout expired\n", mgtvnic->mgt.qp_id); - if (i == 0) { - pci_free_consistent(nesdev->pcidev, nesvnic->mgt_mem_size, nesvnic->mgt_vbase, - nesvnic->mgt_pbase); - kfree(mgtvnic); - } else { - nes_destroy_mgt(nesvnic); - } - return -EIO; - } - - /* Populate the RQ */ - for (counter = 0; counter < (NES_MGT_WQ_COUNT - 1); counter++) { - skb = dev_alloc_skb(nesvnic->max_frame_size); 
- if (!skb) { - nes_debug(NES_DBG_INIT, "%s: out of memory for receive skb\n", netdev->name); - return -ENOMEM; - } - - skb->dev = netdev; - - pmem = pci_map_single(nesdev->pcidev, skb->data, - nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); - cb = (struct nes_rskb_cb *)&skb->cb[0]; - cb->busaddr = pmem; - cb->maplen = nesvnic->max_frame_size; - - mgt_rqe = &mgtvnic->mgt.rq_vbase[counter]; - mgt_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = cpu_to_le32((u32)nesvnic->max_frame_size); - mgt_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0; - mgt_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = cpu_to_le32((u32)pmem); - mgt_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = cpu_to_le32((u32)((u64)pmem >> 32)); - mgtvnic->mgt.rx_skb[counter] = skb; - } - - timer_setup(&mgtvnic->rq_wqes_timer, nes_mgt_rq_wqes_timeout, - 0); - - wqe_count = NES_MGT_WQ_COUNT - 1; - mgtvnic->mgt.rq_head = wqe_count; - barrier(); - do { - counter = min(wqe_count, ((u32)255)); - wqe_count -= counter; - nes_write32(nesdev->regs + NES_WQE_ALLOC, (counter << 24) | mgtvnic->mgt.qp_id); - } while (wqe_count); - - nes_write32(nesdev->regs + NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | - mgtvnic->mgt_cq.cq_number); - nes_read32(nesdev->regs + NES_CQE_ALLOC); - - mgt_vbase += mgt_mem_size; - mgt_pbase += mgt_mem_size; - nesvnic->mgtvnic[i] = mgtvnic++; - } - return 0; -} - - -void nes_destroy_mgt(struct nes_vnic *nesvnic) -{ - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_vnic_mgt *mgtvnic; - struct nes_vnic_mgt *first_mgtvnic; - unsigned long flags; - struct nes_hw_cqp_wqe *cqp_wqe; - u32 cqp_head; - struct sk_buff *rx_skb; - int i; - int ret; - - kthread_stop(nesvnic->mgt_thread); - - /* Free remaining NIC receive buffers */ - first_mgtvnic = nesvnic->mgtvnic[0]; - for (i = 0; i < NES_MGT_QP_COUNT; i++) { - mgtvnic = nesvnic->mgtvnic[i]; - if (mgtvnic == NULL) - continue; - - while (mgtvnic->mgt.rq_head != mgtvnic->mgt.rq_tail) { - rx_skb = mgtvnic->mgt.rx_skb[mgtvnic->mgt.rq_tail]; - nes_mgt_free_skb(nesdev, rx_skb, PCI_DMA_FROMDEVICE); - mgtvnic->mgt.rq_tail++; - mgtvnic->mgt.rq_tail &= (mgtvnic->mgt.rq_size - 1); - } - - spin_lock_irqsave(&nesdev->cqp.lock, flags); - - /* Destroy NIC QP */ - cqp_head = nesdev->cqp.sq_head; - cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, - (NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_NIC)); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, - mgtvnic->mgt.qp_id); - - if (++cqp_head >= nesdev->cqp.sq_size) - cqp_head = 0; - - cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; - - /* Destroy NIC CQ */ - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, - (NES_CQP_DESTROY_CQ | ((u32)mgtvnic->mgt_cq.cq_size << 16))); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, - (mgtvnic->mgt_cq.cq_number | ((u32)nesdev->ceq_index << 16))); - - if (++cqp_head >= nesdev->cqp.sq_size) - cqp_head = 0; - - nesdev->cqp.sq_head = cqp_head; - barrier(); - - /* Ring doorbell (2 WQEs) */ - nes_write32(nesdev->regs + NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id); - - spin_unlock_irqrestore(&nesdev->cqp.lock, flags); - nes_debug(NES_DBG_SHUTDOWN, "Waiting for CQP, cqp_head=%u, cqp.sq_head=%u," - " cqp.sq_tail=%u, cqp.sq_size=%u\n", - cqp_head, nesdev->cqp.sq_head, - nesdev->cqp.sq_tail, nesdev->cqp.sq_size); - - ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head), - NES_EVENT_TIMEOUT); - - 
nes_debug(NES_DBG_SHUTDOWN, "Destroy MGT QP returned, wait_event_timeout ret = %u, cqp_head=%u," - " cqp.sq_head=%u, cqp.sq_tail=%u\n", - ret, cqp_head, nesdev->cqp.sq_head, nesdev->cqp.sq_tail); - if (!ret) - nes_debug(NES_DBG_SHUTDOWN, "MGT QP%u destroy timeout expired\n", - mgtvnic->mgt.qp_id); - - nesvnic->mgtvnic[i] = NULL; - } - - if (nesvnic->mgt_vbase) { - pci_free_consistent(nesdev->pcidev, nesvnic->mgt_mem_size, nesvnic->mgt_vbase, - nesvnic->mgt_pbase); - nesvnic->mgt_vbase = NULL; - nesvnic->mgt_pbase = 0; - } - - kfree(first_mgtvnic); -} diff --git a/drivers/infiniband/hw/nes/nes_mgt.h b/drivers/infiniband/hw/nes/nes_mgt.h deleted file mode 100644 index 4f7f701c4a81..000000000000 --- a/drivers/infiniband/hw/nes/nes_mgt.h +++ /dev/null @@ -1,97 +0,0 @@ -/* -* Copyright (c) 2006 - 2011 Intel-NE, Inc. All rights reserved. -* -* This software is available to you under a choice of one of two -* licenses. You may choose to be licensed under the terms of the GNU -* General Public License (GPL) Version 2, available from the file -* COPYING in the main directory of this source tree, or the -* OpenIB.org BSD license below: -* -* Redistribution and use in source and binary forms, with or -* without modification, are permitted provided that the following -* conditions are met: -* -* - Redistributions of source code must retain the above -* copyright notice, this list of conditions and the following -* disclaimer. -* -* - Redistributions in binary form must reproduce the above -* copyright notice, this list of conditions and the following -* disclaimer in the documentation and/or other materials -* provided with the distribution. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. 
-*/ - -#ifndef __NES_MGT_H -#define __NES_MGT_H - -#define MPA_FRAMING 6 /* length is 2 bytes, crc is 4 bytes */ - -int nes_init_mgt_qp(struct nes_device *nesdev, struct net_device *netdev, struct nes_vnic *nesvnic); -void nes_queue_mgt_skbs(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp); -void nes_destroy_mgt(struct nes_vnic *nesvnic); -void nes_destroy_pau_qp(struct nes_device *nesdev, struct nes_qp *nesqp); - -struct nes_hw_mgt { - struct nes_hw_nic_rq_wqe *rq_vbase; /* virtual address of rq */ - dma_addr_t rq_pbase; /* PCI memory for host rings */ - struct sk_buff *rx_skb[NES_NIC_WQ_SIZE]; - u16 qp_id; - u16 sq_head; - u16 rq_head; - u16 rq_tail; - u16 rq_size; - u8 replenishing_rq; - u8 reserved; - spinlock_t rq_lock; -}; - -struct nes_vnic_mgt { - struct nes_vnic *nesvnic; - struct nes_hw_mgt mgt; - struct nes_hw_nic_cq mgt_cq; - atomic_t rx_skbs_needed; - struct timer_list rq_wqes_timer; - atomic_t rx_skb_timer_running; -}; - -#define MAX_FPDU_FRAGS 4 -struct pau_fpdu_frag { - struct sk_buff *skb; - u64 physaddr; - u32 frag_len; - bool cmplt; -}; - -struct pau_fpdu_info { - struct nes_qp *nesqp; - struct nes_cqp_request *cqp_request; - void *hdr_vbase; - dma_addr_t hdr_pbase; - int hdr_len; - u16 data_len; - u16 frag_cnt; - struct pau_fpdu_frag frags[MAX_FPDU_FRAGS]; -}; - -enum pau_qh_state { - PAU_DEL_QH, - PAU_ADD_LB_QH, - PAU_READY -}; - -struct pau_qh_chg { - struct nes_device *nesdev; - struct nes_vnic *nesvnic; - struct nes_qp *nesqp; -}; - -#endif /* __NES_MGT_H */ diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c deleted file mode 100644 index 16f33454c198..000000000000 --- a/drivers/infiniband/hw/nes/nes_nic.c +++ /dev/null @@ -1,1870 +0,0 @@ -/* - * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "nes.h" - -static struct nic_qp_map nic_qp_mapping_0[] = { - {16,0,0,1},{24,4,0,0},{28,8,0,0},{32,12,0,0}, - {20,2,2,1},{26,6,2,0},{30,10,2,0},{34,14,2,0}, - {18,1,1,1},{25,5,1,0},{29,9,1,0},{33,13,1,0}, - {22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0} -}; - -static struct nic_qp_map nic_qp_mapping_1[] = { - {18,1,1,1},{25,5,1,0},{29,9,1,0},{33,13,1,0}, - {22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0} -}; - -static struct nic_qp_map nic_qp_mapping_2[] = { - {20,2,2,1},{26,6,2,0},{30,10,2,0},{34,14,2,0} -}; - -static struct nic_qp_map nic_qp_mapping_3[] = { - {22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0} -}; - -static struct nic_qp_map nic_qp_mapping_4[] = { - {28,8,0,0},{32,12,0,0} -}; - -static struct nic_qp_map nic_qp_mapping_5[] = { - {29,9,1,0},{33,13,1,0} -}; - -static struct nic_qp_map nic_qp_mapping_6[] = { - {30,10,2,0},{34,14,2,0} -}; - -static struct nic_qp_map nic_qp_mapping_7[] = { - {31,11,3,0},{35,15,3,0} -}; - -static struct nic_qp_map *nic_qp_mapping_per_function[] = { - nic_qp_mapping_0, nic_qp_mapping_1, nic_qp_mapping_2, nic_qp_mapping_3, - nic_qp_mapping_4, nic_qp_mapping_5, nic_qp_mapping_6, nic_qp_mapping_7 -}; - -static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK - | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN; -static int debug = -1; -static int nics_per_function = 1; - -/** - * nes_netdev_poll - */ -static int nes_netdev_poll(struct napi_struct *napi, int budget) -{ - struct nes_vnic *nesvnic = container_of(napi, struct nes_vnic, napi); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_hw_nic_cq *nescq = &nesvnic->nic_cq; - - nesvnic->budget = budget; - nescq->cqes_pending = 0; - nescq->rx_cqes_completed = 0; - nescq->cqe_allocs_pending = 0; - nescq->rx_pkts_indicated = 0; - - nes_nic_ce_handler(nesdev, nescq); - - if (nescq->cqes_pending == 0) { - napi_complete(napi); - /* clear out completed cqes and arm */ - nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | - nescq->cq_number | (nescq->cqe_allocs_pending << 16)); - nes_read32(nesdev->regs+NES_CQE_ALLOC); - } else { - /* clear out completed cqes but don't arm */ - nes_write32(nesdev->regs+NES_CQE_ALLOC, - nescq->cq_number | (nescq->cqe_allocs_pending << 16)); - nes_debug(NES_DBG_NETDEV, "%s: exiting with work pending\n", - nesvnic->netdev->name); - } - return nescq->rx_pkts_indicated; -} - - -/** - * nes_netdev_open - Activate the network interface; ifconfig - * ethx up. 
- */ -static int nes_netdev_open(struct net_device *netdev) -{ - u32 macaddr_low; - u16 macaddr_high; - struct nes_vnic *nesvnic = netdev_priv(netdev); - struct nes_device *nesdev = nesvnic->nesdev; - int ret; - int i; - struct nes_vnic *first_nesvnic = NULL; - u32 nic_active_bit; - u32 nic_active; - struct list_head *list_pos, *list_temp; - unsigned long flags; - - if (nesvnic->netdev_open == 1) - return 0; - - if (netif_msg_ifup(nesvnic)) - printk(KERN_INFO PFX "%s: enabling interface\n", netdev->name); - - ret = nes_init_nic_qp(nesdev, netdev); - if (ret) { - return ret; - } - - netif_carrier_off(netdev); - netif_stop_queue(netdev); - - if ((!nesvnic->of_device_registered) && (nesvnic->rdma_enabled)) { - nesvnic->nesibdev = nes_init_ofa_device(netdev); - if (nesvnic->nesibdev == NULL) { - printk(KERN_ERR PFX "%s: nesvnic->nesibdev alloc failed", netdev->name); - } else { - nesvnic->nesibdev->nesvnic = nesvnic; - ret = nes_register_ofa_device(nesvnic->nesibdev); - if (ret) { - printk(KERN_ERR PFX "%s: Unable to register RDMA device, ret = %d\n", - netdev->name, ret); - } - } - } - /* Set packet filters */ - nic_active_bit = 1 << nesvnic->nic_index; - nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_ACTIVE); - nic_active |= nic_active_bit; - nes_write_indexed(nesdev, NES_IDX_NIC_ACTIVE, nic_active); - nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE); - nic_active |= nic_active_bit; - nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE, nic_active); - nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON); - nic_active |= nic_active_bit; - nes_write_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON, nic_active); - - macaddr_high = ((u16)netdev->dev_addr[0]) << 8; - macaddr_high += (u16)netdev->dev_addr[1]; - - macaddr_low = ((u32)netdev->dev_addr[2]) << 24; - macaddr_low += ((u32)netdev->dev_addr[3]) << 16; - macaddr_low += ((u32)netdev->dev_addr[4]) << 8; - macaddr_low += (u32)netdev->dev_addr[5]; - - /* Program the various MAC regs */ - for (i = 0; i < NES_MAX_PORT_COUNT; i++) { - if (nesvnic->qp_nic_index[i] == 0xf) { - break; - } - nes_debug(NES_DBG_NETDEV, "i=%d, perfect filter table index= %d, PERF FILTER LOW" - " (Addr:%08X) = %08X, HIGH = %08X.\n", - i, nesvnic->qp_nic_index[i], - NES_IDX_PERFECT_FILTER_LOW+ - (nesvnic->qp_nic_index[i] * 8), - macaddr_low, - (u32)macaddr_high | NES_MAC_ADDR_VALID | - ((((u32)nesvnic->nic_index) << 16))); - nes_write_indexed(nesdev, - NES_IDX_PERFECT_FILTER_LOW + (nesvnic->qp_nic_index[i] * 8), - macaddr_low); - nes_write_indexed(nesdev, - NES_IDX_PERFECT_FILTER_HIGH + (nesvnic->qp_nic_index[i] * 8), - (u32)macaddr_high | NES_MAC_ADDR_VALID | - ((((u32)nesvnic->nic_index) << 16))); - } - - - nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | - nesvnic->nic_cq.cq_number); - nes_read32(nesdev->regs+NES_CQE_ALLOC); - list_for_each_safe(list_pos, list_temp, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]) { - first_nesvnic = container_of(list_pos, struct nes_vnic, list); - if (first_nesvnic->netdev_open == 1) - break; - } - if (first_nesvnic->netdev_open == 0) { - nes_debug(NES_DBG_INIT, "Setting up MAC interrupt mask.\n"); - nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK + (0x200 * nesdev->mac_index), - ~(NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT | - NES_MAC_INT_TX_UNDERFLOW | NES_MAC_INT_TX_ERROR)); - first_nesvnic = nesvnic; - } - - if (first_nesvnic->linkup) { - /* Enable network packets */ - nesvnic->linkup = 1; - netif_start_queue(netdev); - netif_carrier_on(netdev); - } - - 
spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags); - if (nesdev->nesadapter->phy_type[nesdev->mac_index] == NES_PHY_TYPE_SFP_D) { - nesdev->link_recheck = 1; - mod_delayed_work(system_wq, &nesdev->work, - NES_LINK_RECHECK_DELAY); - } - spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags); - - spin_lock_irqsave(&nesvnic->port_ibevent_lock, flags); - if (nesvnic->of_device_registered) { - nesdev->nesadapter->send_term_ok = 1; - if (nesvnic->linkup == 1) { - if (nesdev->iw_status == 0) { - nesdev->iw_status = 1; - nes_port_ibevent(nesvnic); - } - } else { - nesdev->iw_status = 0; - } - } - spin_unlock_irqrestore(&nesvnic->port_ibevent_lock, flags); - - napi_enable(&nesvnic->napi); - nesvnic->netdev_open = 1; - - return 0; -} - - -/** - * nes_netdev_stop - */ -static int nes_netdev_stop(struct net_device *netdev) -{ - struct nes_vnic *nesvnic = netdev_priv(netdev); - struct nes_device *nesdev = nesvnic->nesdev; - u32 nic_active_mask; - u32 nic_active; - struct nes_vnic *first_nesvnic = NULL; - struct list_head *list_pos, *list_temp; - unsigned long flags; - - nes_debug(NES_DBG_SHUTDOWN, "nesvnic=%p, nesdev=%p, netdev=%p %s\n", - nesvnic, nesdev, netdev, netdev->name); - if (nesvnic->netdev_open == 0) - return 0; - - if (netif_msg_ifdown(nesvnic)) - printk(KERN_INFO PFX "%s: disabling interface\n", netdev->name); - netif_carrier_off(netdev); - - /* Disable network packets */ - napi_disable(&nesvnic->napi); - netif_stop_queue(netdev); - list_for_each_safe(list_pos, list_temp, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]) { - first_nesvnic = container_of(list_pos, struct nes_vnic, list); - if ((first_nesvnic->netdev_open == 1) && (first_nesvnic != nesvnic)) - break; - } - - if ((first_nesvnic->netdev_open == 1) && (first_nesvnic != nesvnic) && - (PCI_FUNC(first_nesvnic->nesdev->pcidev->devfn) != - PCI_FUNC(nesvnic->nesdev->pcidev->devfn))) { - nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK+ - (0x200*nesdev->mac_index), 0xffffffff); - nes_write_indexed(first_nesvnic->nesdev, - NES_IDX_MAC_INT_MASK+ - (0x200*first_nesvnic->nesdev->mac_index), - ~(NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT | - NES_MAC_INT_TX_UNDERFLOW | NES_MAC_INT_TX_ERROR)); - } else { - nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK+(0x200*nesdev->mac_index), 0xffffffff); - } - - nic_active_mask = ~((u32)(1 << nesvnic->nic_index)); - nes_write_indexed(nesdev, NES_IDX_PERFECT_FILTER_HIGH+ - (nesvnic->perfect_filter_index*8), 0); - nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_ACTIVE); - nic_active &= nic_active_mask; - nes_write_indexed(nesdev, NES_IDX_NIC_ACTIVE, nic_active); - nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL); - nic_active &= nic_active_mask; - nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active); - nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE); - nic_active &= nic_active_mask; - nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE, nic_active); - nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); - nic_active &= nic_active_mask; - nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); - nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON); - nic_active &= nic_active_mask; - nes_write_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON, nic_active); - - spin_lock_irqsave(&nesvnic->port_ibevent_lock, flags); - if (nesvnic->of_device_registered) { - nesdev->nesadapter->send_term_ok = 0; - nesdev->iw_status = 0; - if (nesvnic->linkup == 1) - nes_port_ibevent(nesvnic); - } - 
del_timer_sync(&nesvnic->event_timer); - nesvnic->event_timer.function = NULL; - spin_unlock_irqrestore(&nesvnic->port_ibevent_lock, flags); - - nes_destroy_nic_qp(nesvnic); - - nesvnic->netdev_open = 0; - - return 0; -} - - -/** - * nes_nic_send - */ -static bool nes_nic_send(struct sk_buff *skb, struct net_device *netdev) -{ - struct nes_vnic *nesvnic = netdev_priv(netdev); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_hw_nic *nesnic = &nesvnic->nic; - struct nes_hw_nic_sq_wqe *nic_sqe; - struct tcphdr *tcph; - __le16 *wqe_fragment_length; - u32 wqe_misc; - u16 wqe_fragment_index = 1; /* first fragment (0) is used by copy buffer */ - u16 skb_fragment_index; - dma_addr_t bus_address; - - nic_sqe = &nesnic->sq_vbase[nesnic->sq_head]; - wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX]; - - /* setup the VLAN tag if present */ - if (skb_vlan_tag_present(skb)) { - nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... VLAN = %08X\n", - netdev->name, skb_vlan_tag_get(skb)); - wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE; - wqe_fragment_length[0] = (__force __le16) skb_vlan_tag_get(skb); - } else - wqe_misc = 0; - - /* bump past the vlan tag */ - wqe_fragment_length++; - /* wqe_fragment_address = (u64 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX]; */ - wqe_misc |= NES_NIC_SQ_WQE_COMPLETION; - - if (skb->ip_summed == CHECKSUM_PARTIAL) { - if (skb_is_gso(skb)) { - tcph = tcp_hdr(skb); - /* nes_debug(NES_DBG_NIC_TX, "%s: TSO request... is_gso = %u seg size = %u\n", - netdev->name, skb_is_gso(skb), skb_shinfo(skb)->gso_size); */ - wqe_misc |= NES_NIC_SQ_WQE_LSO_ENABLE | (u16)skb_shinfo(skb)->gso_size; - set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_LSO_INFO_IDX, - ((u32)tcph->doff) | - (((u32)(((unsigned char *)tcph) - skb->data)) << 4)); - } - } else { /* CHECKSUM_HW */ - wqe_misc |= NES_NIC_SQ_WQE_DISABLE_CHKSUM; - } - - set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX, - skb->len); - memcpy(&nesnic->first_frag_vbase[nesnic->sq_head].buffer, - skb->data, min(((unsigned int)NES_FIRST_FRAG_SIZE), skb_headlen(skb))); - wqe_fragment_length[0] = cpu_to_le16(min(((unsigned int)NES_FIRST_FRAG_SIZE), - skb_headlen(skb))); - wqe_fragment_length[1] = 0; - if (skb_headlen(skb) > NES_FIRST_FRAG_SIZE) { - if ((skb_shinfo(skb)->nr_frags + 1) > 4) { - nes_debug(NES_DBG_NIC_TX, "%s: Packet with %u fragments not sent, skb_headlen=%u\n", - netdev->name, skb_shinfo(skb)->nr_frags + 2, skb_headlen(skb)); - kfree_skb(skb); - nesvnic->tx_sw_dropped++; - return false; - } - set_bit(nesnic->sq_head, nesnic->first_frag_overflow); - bus_address = pci_map_single(nesdev->pcidev, skb->data + NES_FIRST_FRAG_SIZE, - skb_headlen(skb) - NES_FIRST_FRAG_SIZE, PCI_DMA_TODEVICE); - wqe_fragment_length[wqe_fragment_index++] = - cpu_to_le16(skb_headlen(skb) - NES_FIRST_FRAG_SIZE); - wqe_fragment_length[wqe_fragment_index] = 0; - set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX, - ((u64)(bus_address))); - nesnic->tx_skb[nesnic->sq_head] = skb; - } - - if (skb_headlen(skb) == skb->len) { - if (skb_headlen(skb) <= NES_FIRST_FRAG_SIZE) { - nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_2_1_IDX] = 0; - nesnic->tx_skb[nesnic->sq_head] = skb; - } - } else { - /* Deal with Fragments */ - nesnic->tx_skb[nesnic->sq_head] = skb; - for (skb_fragment_index = 0; skb_fragment_index < skb_shinfo(skb)->nr_frags; - skb_fragment_index++) { - skb_frag_t *frag = - &skb_shinfo(skb)->frags[skb_fragment_index]; - bus_address = 
skb_frag_dma_map(&nesdev->pcidev->dev, - frag, 0, skb_frag_size(frag), - DMA_TO_DEVICE); - wqe_fragment_length[wqe_fragment_index] = - cpu_to_le16(skb_frag_size(&skb_shinfo(skb)->frags[skb_fragment_index])); - set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX+(2*wqe_fragment_index), - bus_address); - wqe_fragment_index++; - if (wqe_fragment_index < 5) - wqe_fragment_length[wqe_fragment_index] = 0; - } - } - - set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_MISC_IDX, wqe_misc); - nesnic->sq_head++; - nesnic->sq_head &= nesnic->sq_size - 1; - return true; -} - - -/** - * nes_netdev_start_xmit - */ -static netdev_tx_t nes_netdev_start_xmit(struct sk_buff *skb, struct net_device *netdev) -{ - struct nes_vnic *nesvnic = netdev_priv(netdev); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_hw_nic *nesnic = &nesvnic->nic; - struct nes_hw_nic_sq_wqe *nic_sqe; - struct tcphdr *tcph; - /* struct udphdr *udph; */ -#define NES_MAX_TSO_FRAGS MAX_SKB_FRAGS - /* 64K segment plus overflow on each side */ - dma_addr_t tso_bus_address[NES_MAX_TSO_FRAGS]; - dma_addr_t bus_address; - u32 tso_frag_index; - u32 tso_frag_count; - u32 tso_wqe_length; - u32 curr_tcp_seq; - u32 wqe_count=1; - struct iphdr *iph; - __le16 *wqe_fragment_length; - u32 nr_frags; - u32 original_first_length; - /* u64 *wqe_fragment_address; */ - /* first fragment (0) is used by copy buffer */ - u16 wqe_fragment_index=1; - u16 hoffset; - u16 nhoffset; - u16 wqes_needed; - u16 wqes_available; - u32 wqe_misc; - - /* - * nes_debug(NES_DBG_NIC_TX, "%s Request to tx NIC packet length %u, headlen %u," - * " (%u frags), tso_size=%u\n", - * netdev->name, skb->len, skb_headlen(skb), - * skb_shinfo(skb)->nr_frags, skb_is_gso(skb)); - */ - - if (netif_queue_stopped(netdev)) - return NETDEV_TX_BUSY; - - /* Check if SQ is full */ - if ((((nesnic->sq_tail+(nesnic->sq_size*2))-nesnic->sq_head) & (nesnic->sq_size - 1)) == 1) { - if (!netif_queue_stopped(netdev)) { - netif_stop_queue(netdev); - barrier(); - if ((((((volatile u16)nesnic->sq_tail)+(nesnic->sq_size*2))-nesnic->sq_head) & (nesnic->sq_size - 1)) != 1) { - netif_start_queue(netdev); - goto sq_no_longer_full; - } - } - nesvnic->sq_full++; - return NETDEV_TX_BUSY; - } - -sq_no_longer_full: - nr_frags = skb_shinfo(skb)->nr_frags; - if (skb_headlen(skb) > NES_FIRST_FRAG_SIZE) { - nr_frags++; - } - /* Check if too many fragments */ - if (unlikely((nr_frags > 4))) { - if (skb_is_gso(skb)) { - nesvnic->segmented_tso_requests++; - nesvnic->tso_requests++; - /* Basically 4 fragments available per WQE with extended fragments */ - wqes_needed = nr_frags >> 2; - wqes_needed += (nr_frags&3)?1:0; - wqes_available = (((nesnic->sq_tail+nesnic->sq_size)-nesnic->sq_head) - 1) & - (nesnic->sq_size - 1); - - if (unlikely(wqes_needed > wqes_available)) { - if (!netif_queue_stopped(netdev)) { - netif_stop_queue(netdev); - barrier(); - wqes_available = (((((volatile u16)nesnic->sq_tail)+nesnic->sq_size)-nesnic->sq_head) - 1) & - (nesnic->sq_size - 1); - if (wqes_needed <= wqes_available) { - netif_start_queue(netdev); - goto tso_sq_no_longer_full; - } - } - nesvnic->sq_full++; - nes_debug(NES_DBG_NIC_TX, "%s: HNIC SQ full- TSO request has too many frags!\n", - netdev->name); - return NETDEV_TX_BUSY; - } -tso_sq_no_longer_full: - /* Map all the buffers */ - for (tso_frag_count=0; tso_frag_count < skb_shinfo(skb)->nr_frags; - tso_frag_count++) { - skb_frag_t *frag = - &skb_shinfo(skb)->frags[tso_frag_count]; - tso_bus_address[tso_frag_count] = - skb_frag_dma_map(&nesdev->pcidev->dev, 
- frag, 0, skb_frag_size(frag), - DMA_TO_DEVICE); - } - - tso_frag_index = 0; - curr_tcp_seq = ntohl(tcp_hdr(skb)->seq); - hoffset = skb_transport_header(skb) - skb->data; - nhoffset = skb_network_header(skb) - skb->data; - original_first_length = hoffset + ((((struct tcphdr *)skb_transport_header(skb))->doff)<<2); - - for (wqe_count=0; wqe_count<((u32)wqes_needed); wqe_count++) { - tso_wqe_length = 0; - nic_sqe = &nesnic->sq_vbase[nesnic->sq_head]; - wqe_fragment_length = - (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX]; - /* setup the VLAN tag if present */ - if (skb_vlan_tag_present(skb)) { - nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... VLAN = %08X\n", - netdev->name, - skb_vlan_tag_get(skb)); - wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE; - wqe_fragment_length[0] = (__force __le16) skb_vlan_tag_get(skb); - } else - wqe_misc = 0; - - /* bump past the vlan tag */ - wqe_fragment_length++; - - /* Assumes header totally fits in allocated buffer and is in first fragment */ - if (original_first_length > NES_FIRST_FRAG_SIZE) { - nes_debug(NES_DBG_NIC_TX, "ERROR: SKB header too big, headlen=%u, FIRST_FRAG_SIZE=%u\n", - original_first_length, NES_FIRST_FRAG_SIZE); - nes_debug(NES_DBG_NIC_TX, "%s Request to tx NIC packet length %u, headlen %u," - " (%u frags), is_gso = %u tso_size=%u\n", - netdev->name, - skb->len, skb_headlen(skb), - skb_shinfo(skb)->nr_frags, skb_is_gso(skb), skb_shinfo(skb)->gso_size); - } - memcpy(&nesnic->first_frag_vbase[nesnic->sq_head].buffer, - skb->data, min(((unsigned int)NES_FIRST_FRAG_SIZE), - original_first_length)); - iph = (struct iphdr *) - (&nesnic->first_frag_vbase[nesnic->sq_head].buffer[nhoffset]); - tcph = (struct tcphdr *) - (&nesnic->first_frag_vbase[nesnic->sq_head].buffer[hoffset]); - if ((wqe_count+1)!=(u32)wqes_needed) { - tcph->fin = 0; - tcph->psh = 0; - tcph->rst = 0; - tcph->urg = 0; - } - if (wqe_count) { - tcph->syn = 0; - } - tcph->seq = htonl(curr_tcp_seq); - wqe_fragment_length[0] = cpu_to_le16(min(((unsigned int)NES_FIRST_FRAG_SIZE), - original_first_length)); - - wqe_fragment_index = 1; - if ((wqe_count==0) && (skb_headlen(skb) > original_first_length)) { - set_bit(nesnic->sq_head, nesnic->first_frag_overflow); - bus_address = pci_map_single(nesdev->pcidev, skb->data + original_first_length, - skb_headlen(skb) - original_first_length, PCI_DMA_TODEVICE); - wqe_fragment_length[wqe_fragment_index++] = - cpu_to_le16(skb_headlen(skb) - original_first_length); - wqe_fragment_length[wqe_fragment_index] = 0; - set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX, - bus_address); - tso_wqe_length += skb_headlen(skb) - - original_first_length; - } - while (wqe_fragment_index < 5) { - wqe_fragment_length[wqe_fragment_index] = - cpu_to_le16(skb_frag_size(&skb_shinfo(skb)->frags[tso_frag_index])); - set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX+(2*wqe_fragment_index), - (u64)tso_bus_address[tso_frag_index]); - wqe_fragment_index++; - tso_wqe_length += skb_frag_size(&skb_shinfo(skb)->frags[tso_frag_index++]); - if (wqe_fragment_index < 5) - wqe_fragment_length[wqe_fragment_index] = 0; - if (tso_frag_index == tso_frag_count) - break; - } - if ((wqe_count+1) == (u32)wqes_needed) { - nesnic->tx_skb[nesnic->sq_head] = skb; - } else { - nesnic->tx_skb[nesnic->sq_head] = NULL; - } - wqe_misc |= NES_NIC_SQ_WQE_COMPLETION | (u16)skb_shinfo(skb)->gso_size; - if ((tso_wqe_length + original_first_length) > skb_shinfo(skb)->gso_size) { - wqe_misc |= NES_NIC_SQ_WQE_LSO_ENABLE; - } else { - iph->tot_len = 
htons(tso_wqe_length + original_first_length - nhoffset); - } - - set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_MISC_IDX, - wqe_misc); - set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_LSO_INFO_IDX, - ((u32)tcph->doff) | (((u32)hoffset) << 4)); - - set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX, - tso_wqe_length + original_first_length); - curr_tcp_seq += tso_wqe_length; - nesnic->sq_head++; - nesnic->sq_head &= nesnic->sq_size-1; - } - } else { - hoffset = skb_transport_header(skb) - skb->data; - nhoffset = skb_network_header(skb) - skb->data; - if (skb_linearize(skb)) { - nesvnic->tx_sw_dropped++; - kfree_skb(skb); - return NETDEV_TX_OK; - } - nesvnic->linearized_skbs++; - skb_set_transport_header(skb, hoffset); - skb_set_network_header(skb, nhoffset); - if (!nes_nic_send(skb, netdev)) - return NETDEV_TX_OK; - } - } else { - if (!nes_nic_send(skb, netdev)) - return NETDEV_TX_OK; - } - - barrier(); - - if (wqe_count) - nes_write32(nesdev->regs+NES_WQE_ALLOC, - (wqe_count << 24) | (1 << 23) | nesvnic->nic.qp_id); - - netif_trans_update(netdev); - - return NETDEV_TX_OK; -} - - -/** - * nes_netdev_get_stats - */ -static struct net_device_stats *nes_netdev_get_stats(struct net_device *netdev) -{ - struct nes_vnic *nesvnic = netdev_priv(netdev); - struct nes_device *nesdev = nesvnic->nesdev; - u64 u64temp; - u32 u32temp; - - u32temp = nes_read_indexed(nesdev, - NES_IDX_ENDNODE0_NSTAT_RX_DISCARD + (nesvnic->nic_index*0x200)); - nesvnic->netstats.rx_dropped += u32temp; - nesvnic->endnode_nstat_rx_discard += u32temp; - - u64temp = (u64)nes_read_indexed(nesdev, - NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO + (nesvnic->nic_index*0x200)); - u64temp += ((u64)nes_read_indexed(nesdev, - NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI + (nesvnic->nic_index*0x200))) << 32; - - nesvnic->endnode_nstat_rx_octets += u64temp; - nesvnic->netstats.rx_bytes += u64temp; - - u64temp = (u64)nes_read_indexed(nesdev, - NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO + (nesvnic->nic_index*0x200)); - u64temp += ((u64)nes_read_indexed(nesdev, - NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI + (nesvnic->nic_index*0x200))) << 32; - - nesvnic->endnode_nstat_rx_frames += u64temp; - nesvnic->netstats.rx_packets += u64temp; - - u64temp = (u64)nes_read_indexed(nesdev, - NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO + (nesvnic->nic_index*0x200)); - u64temp += ((u64)nes_read_indexed(nesdev, - NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI + (nesvnic->nic_index*0x200))) << 32; - - nesvnic->endnode_nstat_tx_octets += u64temp; - nesvnic->netstats.tx_bytes += u64temp; - - u64temp = (u64)nes_read_indexed(nesdev, - NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO + (nesvnic->nic_index*0x200)); - u64temp += ((u64)nes_read_indexed(nesdev, - NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI + (nesvnic->nic_index*0x200))) << 32; - - nesvnic->endnode_nstat_tx_frames += u64temp; - nesvnic->netstats.tx_packets += u64temp; - - u32temp = nes_read_indexed(nesdev, - NES_IDX_MAC_RX_SHORT_FRAMES + (nesvnic->nesdev->mac_index*0x200)); - nesvnic->netstats.rx_dropped += u32temp; - nesvnic->nesdev->mac_rx_errors += u32temp; - nesvnic->nesdev->mac_rx_short_frames += u32temp; - - u32temp = nes_read_indexed(nesdev, - NES_IDX_MAC_RX_OVERSIZED_FRAMES + (nesvnic->nesdev->mac_index*0x200)); - nesvnic->netstats.rx_dropped += u32temp; - nesvnic->nesdev->mac_rx_errors += u32temp; - nesvnic->nesdev->mac_rx_oversized_frames += u32temp; - - u32temp = nes_read_indexed(nesdev, - NES_IDX_MAC_RX_JABBER_FRAMES + (nesvnic->nesdev->mac_index*0x200)); - nesvnic->netstats.rx_dropped += u32temp; - 
nesvnic->nesdev->mac_rx_errors += u32temp; - nesvnic->nesdev->mac_rx_jabber_frames += u32temp; - - u32temp = nes_read_indexed(nesdev, - NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); - nesvnic->netstats.rx_dropped += u32temp; - nesvnic->nesdev->mac_rx_errors += u32temp; - nesvnic->nesdev->mac_rx_symbol_err_frames += u32temp; - - u32temp = nes_read_indexed(nesdev, - NES_IDX_MAC_RX_LENGTH_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); - nesvnic->netstats.rx_length_errors += u32temp; - nesvnic->nesdev->mac_rx_errors += u32temp; - - u32temp = nes_read_indexed(nesdev, - NES_IDX_MAC_RX_CRC_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); - nesvnic->nesdev->mac_rx_errors += u32temp; - nesvnic->nesdev->mac_rx_crc_errors += u32temp; - nesvnic->netstats.rx_crc_errors += u32temp; - - u32temp = nes_read_indexed(nesdev, - NES_IDX_MAC_TX_ERRORS + (nesvnic->nesdev->mac_index*0x200)); - nesvnic->nesdev->mac_tx_errors += u32temp; - nesvnic->netstats.tx_errors += u32temp; - - return &nesvnic->netstats; -} - - -/** - * nes_netdev_tx_timeout - */ -static void nes_netdev_tx_timeout(struct net_device *netdev) -{ - struct nes_vnic *nesvnic = netdev_priv(netdev); - - if (netif_msg_timer(nesvnic)) - nes_debug(NES_DBG_NIC_TX, "%s: tx timeout\n", netdev->name); -} - - -/** - * nes_netdev_set_mac_address - */ -static int nes_netdev_set_mac_address(struct net_device *netdev, void *p) -{ - struct nes_vnic *nesvnic = netdev_priv(netdev); - struct nes_device *nesdev = nesvnic->nesdev; - struct sockaddr *mac_addr = p; - int i; - u32 macaddr_low; - u16 macaddr_high; - - if (!is_valid_ether_addr(mac_addr->sa_data)) - return -EADDRNOTAVAIL; - - memcpy(netdev->dev_addr, mac_addr->sa_data, netdev->addr_len); - printk(PFX "%s: Address length = %d, Address = %pM\n", - __func__, netdev->addr_len, mac_addr->sa_data); - macaddr_high = ((u16)netdev->dev_addr[0]) << 8; - macaddr_high += (u16)netdev->dev_addr[1]; - macaddr_low = ((u32)netdev->dev_addr[2]) << 24; - macaddr_low += ((u32)netdev->dev_addr[3]) << 16; - macaddr_low += ((u32)netdev->dev_addr[4]) << 8; - macaddr_low += (u32)netdev->dev_addr[5]; - - for (i = 0; i < NES_MAX_PORT_COUNT; i++) { - if (nesvnic->qp_nic_index[i] == 0xf) { - break; - } - nes_write_indexed(nesdev, - NES_IDX_PERFECT_FILTER_LOW + (nesvnic->qp_nic_index[i] * 8), - macaddr_low); - nes_write_indexed(nesdev, - NES_IDX_PERFECT_FILTER_HIGH + (nesvnic->qp_nic_index[i] * 8), - (u32)macaddr_high | NES_MAC_ADDR_VALID | - ((((u32)nesvnic->nic_index) << 16))); - } - return 0; -} - - -static void set_allmulti(struct nes_device *nesdev, u32 nic_active_bit) -{ - u32 nic_active; - - nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL); - nic_active |= nic_active_bit; - nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active); - nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); - nic_active &= ~nic_active_bit; - nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); -} - -#define get_addr(addrs, index) ((addrs) + (index) * ETH_ALEN) - -/** - * nes_netdev_set_multicast_list - */ -static void nes_netdev_set_multicast_list(struct net_device *netdev) -{ - struct nes_vnic *nesvnic = netdev_priv(netdev); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter; - u32 nic_active_bit; - u32 nic_active; - u32 perfect_filter_register_address; - u32 macaddr_low; - u16 macaddr_high; - u8 mc_all_on = 0; - u8 mc_index; - int mc_nic_index = -1; - u8 pft_entries_preallocated = max(nesadapter->adapter_fcn_count * - 
nics_per_function, 4); - u8 max_pft_entries_avaiable = NES_PFT_SIZE - pft_entries_preallocated; - unsigned long flags; - int mc_count = netdev_mc_count(netdev); - - spin_lock_irqsave(&nesadapter->resource_lock, flags); - nic_active_bit = 1 << nesvnic->nic_index; - - if (netdev->flags & IFF_PROMISC) { - nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL); - nic_active |= nic_active_bit; - nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active); - nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); - nic_active |= nic_active_bit; - nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); - mc_all_on = 1; - } else if ((netdev->flags & IFF_ALLMULTI) || - (nesvnic->nic_index > 3)) { - set_allmulti(nesdev, nic_active_bit); - mc_all_on = 1; - } else { - nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL); - nic_active &= ~nic_active_bit; - nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active); - nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); - nic_active &= ~nic_active_bit; - nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); - } - - nes_debug(NES_DBG_NIC_RX, "Number of MC entries = %d, Promiscuous = %d, All Multicast = %d.\n", - mc_count, !!(netdev->flags & IFF_PROMISC), - !!(netdev->flags & IFF_ALLMULTI)); - if (!mc_all_on) { - char *addrs; - int i; - struct netdev_hw_addr *ha; - - addrs = kmalloc_array(mc_count, ETH_ALEN, GFP_ATOMIC); - if (!addrs) { - set_allmulti(nesdev, nic_active_bit); - goto unlock; - } - i = 0; - netdev_for_each_mc_addr(ha, netdev) - memcpy(get_addr(addrs, i++), ha->addr, ETH_ALEN); - - perfect_filter_register_address = NES_IDX_PERFECT_FILTER_LOW + - pft_entries_preallocated * 0x8; - for (i = 0, mc_index = 0; mc_index < max_pft_entries_avaiable; - mc_index++) { - while (i < mc_count && nesvnic->mcrq_mcast_filter && - ((mc_nic_index = nesvnic->mcrq_mcast_filter(nesvnic, - get_addr(addrs, i++))) == 0)); - if (mc_nic_index < 0) - mc_nic_index = nesvnic->nic_index; - while (nesadapter->pft_mcast_map[mc_index] < 16 && - nesadapter->pft_mcast_map[mc_index] != - nesvnic->nic_index && - mc_index < max_pft_entries_avaiable) { - nes_debug(NES_DBG_NIC_RX, - "mc_index=%d skipping nic_index=%d, used for=%d\n", - mc_index, nesvnic->nic_index, - nesadapter->pft_mcast_map[mc_index]); - mc_index++; - } - if (mc_index >= max_pft_entries_avaiable) - break; - if (i < mc_count) { - char *addr = get_addr(addrs, i++); - - nes_debug(NES_DBG_NIC_RX, "Assigning MC Address %pM to register 0x%04X nic_idx=%d\n", - addr, - perfect_filter_register_address+(mc_index * 8), - mc_nic_index); - macaddr_high = ((u8) addr[0]) << 8; - macaddr_high += (u8) addr[1]; - macaddr_low = ((u8) addr[2]) << 24; - macaddr_low += ((u8) addr[3]) << 16; - macaddr_low += ((u8) addr[4]) << 8; - macaddr_low += (u8) addr[5]; - - nes_write_indexed(nesdev, - perfect_filter_register_address+(mc_index * 8), - macaddr_low); - nes_write_indexed(nesdev, - perfect_filter_register_address+4+(mc_index * 8), - (u32)macaddr_high | NES_MAC_ADDR_VALID | - ((((u32)(1<<mc_nic_index)) << 16))); - nesadapter->pft_mcast_map[mc_index] = - nesvnic->nic_index; - } else { - nes_debug(NES_DBG_NIC_RX, "Clearing MC Address at register 0x%04X\n", - perfect_filter_register_address+(mc_index * 8)); - nes_write_indexed(nesdev, - perfect_filter_register_address+4+(mc_index * 8), - 0); - nesadapter->pft_mcast_map[mc_index] = 255; - } - } - kfree(addrs); - /* PFT is not large enough */ - if (i < mc_count) - set_allmulti(nesdev, nic_active_bit); - } - -unlock: - 
spin_unlock_irqrestore(&nesadapter->resource_lock, flags); -} - - -/** - * nes_netdev_change_mtu - */ -static int nes_netdev_change_mtu(struct net_device *netdev, int new_mtu) -{ - struct nes_vnic *nesvnic = netdev_priv(netdev); - struct nes_device *nesdev = nesvnic->nesdev; - u8 jumbomode = 0; - u32 nic_active; - u32 nic_active_bit; - u32 uc_all_active; - u32 mc_all_active; - - netdev->mtu = new_mtu; - nesvnic->max_frame_size = new_mtu + VLAN_ETH_HLEN; - - if (netdev->mtu > ETH_DATA_LEN) { - jumbomode=1; - } - nes_nic_init_timer_defaults(nesdev, jumbomode); - - if (netif_running(netdev)) { - nic_active_bit = 1 << nesvnic->nic_index; - mc_all_active = nes_read_indexed(nesdev, - NES_IDX_NIC_MULTICAST_ALL) & nic_active_bit; - uc_all_active = nes_read_indexed(nesdev, - NES_IDX_NIC_UNICAST_ALL) & nic_active_bit; - - nes_netdev_stop(netdev); - nes_netdev_open(netdev); - - nic_active = nes_read_indexed(nesdev, - NES_IDX_NIC_MULTICAST_ALL); - nic_active |= mc_all_active; - nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, - nic_active); - - nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); - nic_active |= uc_all_active; - nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); - } - - return 0; -} - - -static const char nes_ethtool_stringset[][ETH_GSTRING_LEN] = { - "Link Change Interrupts", - "Linearized SKBs", - "T/GSO Requests", - "Pause Frames Sent", - "Pause Frames Received", - "Internal Routing Errors", - "SQ SW Dropped SKBs", - "SQ Full", - "Segmented TSO Requests", - "Rx Symbol Errors", - "Rx Jabber Errors", - "Rx Oversized Frames", - "Rx Short Frames", - "Rx Length Errors", - "Rx CRC Errors", - "Rx Port Discard", - "Endnode Rx Discards", - "Endnode Rx Octets", - "Endnode Rx Frames", - "Endnode Tx Octets", - "Endnode Tx Frames", - "Tx Errors", - "mh detected", - "mh pauses", - "Retransmission Count", - "CM Connects", - "CM Accepts", - "Disconnects", - "Connected Events", - "Connect Requests", - "CM Rejects", - "ModifyQP Timeouts", - "CreateQPs", - "SW DestroyQPs", - "DestroyQPs", - "CM Closes", - "CM Packets Sent", - "CM Packets Bounced", - "CM Packets Created", - "CM Packets Rcvd", - "CM Packets Dropped", - "CM Packets Retrans", - "CM Listens Created", - "CM Listens Destroyed", - "CM Backlog Drops", - "CM Loopbacks", - "CM Nodes Created", - "CM Nodes Destroyed", - "CM Accel Drops", - "CM Resets Received", - "Free 4Kpbls", - "Free 256pbls", - "Timer Inits", - "PAU CreateQPs", - "PAU DestroyQPs", -}; -#define NES_ETHTOOL_STAT_COUNT ARRAY_SIZE(nes_ethtool_stringset) - - -/** - * nes_netdev_get_sset_count - */ -static int nes_netdev_get_sset_count(struct net_device *netdev, int stringset) -{ - if (stringset == ETH_SS_STATS) - return NES_ETHTOOL_STAT_COUNT; - else - return -EINVAL; -} - - -/** - * nes_netdev_get_strings - */ -static void nes_netdev_get_strings(struct net_device *netdev, u32 stringset, - u8 *ethtool_strings) -{ - if (stringset == ETH_SS_STATS) - memcpy(ethtool_strings, - &nes_ethtool_stringset, - sizeof(nes_ethtool_stringset)); -} - - -/** - * nes_netdev_get_ethtool_stats - */ - -static void nes_netdev_get_ethtool_stats(struct net_device *netdev, - struct ethtool_stats *target_ethtool_stats, u64 *target_stat_values) -{ - u64 u64temp; - struct nes_vnic *nesvnic = netdev_priv(netdev); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_adapter *nesadapter = nesdev->nesadapter; - u32 nic_count; - u32 u32temp; - u32 index = 0; - - target_ethtool_stats->n_stats = NES_ETHTOOL_STAT_COUNT; - target_stat_values[index] = 
nesvnic->nesdev->link_status_interrupts; - target_stat_values[++index] = nesvnic->linearized_skbs; - target_stat_values[++index] = nesvnic->tso_requests; - - u32temp = nes_read_indexed(nesdev, - NES_IDX_MAC_TX_PAUSE_FRAMES + (nesvnic->nesdev->mac_index*0x200)); - nesvnic->nesdev->mac_pause_frames_sent += u32temp; - target_stat_values[++index] = nesvnic->nesdev->mac_pause_frames_sent; - - u32temp = nes_read_indexed(nesdev, - NES_IDX_MAC_RX_PAUSE_FRAMES + (nesvnic->nesdev->mac_index*0x200)); - nesvnic->nesdev->mac_pause_frames_received += u32temp; - - u32temp = nes_read_indexed(nesdev, - NES_IDX_PORT_RX_DISCARDS + (nesvnic->nesdev->mac_index*0x40)); - nesvnic->nesdev->port_rx_discards += u32temp; - nesvnic->netstats.rx_dropped += u32temp; - - u32temp = nes_read_indexed(nesdev, - NES_IDX_PORT_TX_DISCARDS + (nesvnic->nesdev->mac_index*0x40)); - nesvnic->nesdev->port_tx_discards += u32temp; - nesvnic->netstats.tx_dropped += u32temp; - - u32temp = nes_read_indexed(nesdev, - NES_IDX_MAC_RX_SHORT_FRAMES + (nesvnic->nesdev->mac_index*0x200)); - nesvnic->netstats.rx_dropped += u32temp; - nesvnic->nesdev->mac_rx_errors += u32temp; - nesvnic->nesdev->mac_rx_short_frames += u32temp; - - u32temp = nes_read_indexed(nesdev, - NES_IDX_MAC_RX_OVERSIZED_FRAMES + (nesvnic->nesdev->mac_index*0x200)); - nesvnic->netstats.rx_dropped += u32temp; - nesvnic->nesdev->mac_rx_errors += u32temp; - nesvnic->nesdev->mac_rx_oversized_frames += u32temp; - - u32temp = nes_read_indexed(nesdev, - NES_IDX_MAC_RX_JABBER_FRAMES + (nesvnic->nesdev->mac_index*0x200)); - nesvnic->netstats.rx_dropped += u32temp; - nesvnic->nesdev->mac_rx_errors += u32temp; - nesvnic->nesdev->mac_rx_jabber_frames += u32temp; - - u32temp = nes_read_indexed(nesdev, - NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); - nesvnic->netstats.rx_dropped += u32temp; - nesvnic->nesdev->mac_rx_errors += u32temp; - nesvnic->nesdev->mac_rx_symbol_err_frames += u32temp; - - u32temp = nes_read_indexed(nesdev, - NES_IDX_MAC_RX_LENGTH_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); - nesvnic->netstats.rx_length_errors += u32temp; - nesvnic->nesdev->mac_rx_errors += u32temp; - - u32temp = nes_read_indexed(nesdev, - NES_IDX_MAC_RX_CRC_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); - nesvnic->nesdev->mac_rx_errors += u32temp; - nesvnic->nesdev->mac_rx_crc_errors += u32temp; - nesvnic->netstats.rx_crc_errors += u32temp; - - u32temp = nes_read_indexed(nesdev, - NES_IDX_MAC_TX_ERRORS + (nesvnic->nesdev->mac_index*0x200)); - nesvnic->nesdev->mac_tx_errors += u32temp; - nesvnic->netstats.tx_errors += u32temp; - - for (nic_count = 0; nic_count < NES_MAX_PORT_COUNT; nic_count++) { - if (nesvnic->qp_nic_index[nic_count] == 0xf) - break; - - u32temp = nes_read_indexed(nesdev, - NES_IDX_ENDNODE0_NSTAT_RX_DISCARD + - (nesvnic->qp_nic_index[nic_count]*0x200)); - nesvnic->netstats.rx_dropped += u32temp; - nesvnic->endnode_nstat_rx_discard += u32temp; - - u64temp = (u64)nes_read_indexed(nesdev, - NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO + - (nesvnic->qp_nic_index[nic_count]*0x200)); - u64temp += ((u64)nes_read_indexed(nesdev, - NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI + - (nesvnic->qp_nic_index[nic_count]*0x200))) << 32; - - nesvnic->endnode_nstat_rx_octets += u64temp; - nesvnic->netstats.rx_bytes += u64temp; - - u64temp = (u64)nes_read_indexed(nesdev, - NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO + - (nesvnic->qp_nic_index[nic_count]*0x200)); - u64temp += ((u64)nes_read_indexed(nesdev, - NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI + - (nesvnic->qp_nic_index[nic_count]*0x200))) << 
32; - - nesvnic->endnode_nstat_rx_frames += u64temp; - nesvnic->netstats.rx_packets += u64temp; - - u64temp = (u64)nes_read_indexed(nesdev, - NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO + - (nesvnic->qp_nic_index[nic_count]*0x200)); - u64temp += ((u64)nes_read_indexed(nesdev, - NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI + - (nesvnic->qp_nic_index[nic_count]*0x200))) << 32; - - nesvnic->endnode_nstat_tx_octets += u64temp; - nesvnic->netstats.tx_bytes += u64temp; - - u64temp = (u64)nes_read_indexed(nesdev, - NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO + - (nesvnic->qp_nic_index[nic_count]*0x200)); - u64temp += ((u64)nes_read_indexed(nesdev, - NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI + - (nesvnic->qp_nic_index[nic_count]*0x200))) << 32; - - nesvnic->endnode_nstat_tx_frames += u64temp; - nesvnic->netstats.tx_packets += u64temp; - - u32temp = nes_read_indexed(nesdev, - NES_IDX_IPV4_TCP_REXMITS + (nesvnic->qp_nic_index[nic_count]*0x200)); - nesvnic->endnode_ipv4_tcp_retransmits += u32temp; - } - - target_stat_values[++index] = nesvnic->nesdev->mac_pause_frames_received; - target_stat_values[++index] = nesdev->nesadapter->nic_rx_eth_route_err; - target_stat_values[++index] = nesvnic->tx_sw_dropped; - target_stat_values[++index] = nesvnic->sq_full; - target_stat_values[++index] = nesvnic->segmented_tso_requests; - target_stat_values[++index] = nesvnic->nesdev->mac_rx_symbol_err_frames; - target_stat_values[++index] = nesvnic->nesdev->mac_rx_jabber_frames; - target_stat_values[++index] = nesvnic->nesdev->mac_rx_oversized_frames; - target_stat_values[++index] = nesvnic->nesdev->mac_rx_short_frames; - target_stat_values[++index] = nesvnic->netstats.rx_length_errors; - target_stat_values[++index] = nesvnic->nesdev->mac_rx_crc_errors; - target_stat_values[++index] = nesvnic->nesdev->port_rx_discards; - target_stat_values[++index] = nesvnic->endnode_nstat_rx_discard; - target_stat_values[++index] = nesvnic->endnode_nstat_rx_octets; - target_stat_values[++index] = nesvnic->endnode_nstat_rx_frames; - target_stat_values[++index] = nesvnic->endnode_nstat_tx_octets; - target_stat_values[++index] = nesvnic->endnode_nstat_tx_frames; - target_stat_values[++index] = nesvnic->nesdev->mac_tx_errors; - target_stat_values[++index] = mh_detected; - target_stat_values[++index] = mh_pauses_sent; - target_stat_values[++index] = nesvnic->endnode_ipv4_tcp_retransmits; - target_stat_values[++index] = atomic_read(&cm_connects); - target_stat_values[++index] = atomic_read(&cm_accepts); - target_stat_values[++index] = atomic_read(&cm_disconnects); - target_stat_values[++index] = atomic_read(&cm_connecteds); - target_stat_values[++index] = atomic_read(&cm_connect_reqs); - target_stat_values[++index] = atomic_read(&cm_rejects); - target_stat_values[++index] = atomic_read(&mod_qp_timouts); - target_stat_values[++index] = atomic_read(&qps_created); - target_stat_values[++index] = atomic_read(&sw_qps_destroyed); - target_stat_values[++index] = atomic_read(&qps_destroyed); - target_stat_values[++index] = atomic_read(&cm_closes); - target_stat_values[++index] = cm_packets_sent; - target_stat_values[++index] = cm_packets_bounced; - target_stat_values[++index] = cm_packets_created; - target_stat_values[++index] = cm_packets_received; - target_stat_values[++index] = cm_packets_dropped; - target_stat_values[++index] = cm_packets_retrans; - target_stat_values[++index] = atomic_read(&cm_listens_created); - target_stat_values[++index] = atomic_read(&cm_listens_destroyed); - target_stat_values[++index] = cm_backlog_drops; - target_stat_values[++index] = 
atomic_read(&cm_loopbacks); - target_stat_values[++index] = atomic_read(&cm_nodes_created); - target_stat_values[++index] = atomic_read(&cm_nodes_destroyed); - target_stat_values[++index] = atomic_read(&cm_accel_dropped_pkts); - target_stat_values[++index] = atomic_read(&cm_resets_recvd); - target_stat_values[++index] = nesadapter->free_4kpbl; - target_stat_values[++index] = nesadapter->free_256pbl; - target_stat_values[++index] = int_mod_timer_init; - target_stat_values[++index] = atomic_read(&pau_qps_created); - target_stat_values[++index] = atomic_read(&pau_qps_destroyed); -} - -/** - * nes_netdev_get_drvinfo - */ -static void nes_netdev_get_drvinfo(struct net_device *netdev, - struct ethtool_drvinfo *drvinfo) -{ - struct nes_vnic *nesvnic = netdev_priv(netdev); - struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter; - - strlcpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver)); - strlcpy(drvinfo->bus_info, pci_name(nesvnic->nesdev->pcidev), - sizeof(drvinfo->bus_info)); - snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), - "%u.%u", nesadapter->firmware_version >> 16, - nesadapter->firmware_version & 0x000000ff); - strlcpy(drvinfo->version, DRV_VERSION, sizeof(drvinfo->version)); -} - - -/** - * nes_netdev_set_coalesce - */ -static int nes_netdev_set_coalesce(struct net_device *netdev, - struct ethtool_coalesce *et_coalesce) -{ - struct nes_vnic *nesvnic = netdev_priv(netdev); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_adapter *nesadapter = nesdev->nesadapter; - struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; - unsigned long flags; - - spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); - if (et_coalesce->rx_max_coalesced_frames_low) { - shared_timer->threshold_low = et_coalesce->rx_max_coalesced_frames_low; - } - if (et_coalesce->rx_max_coalesced_frames_irq) { - shared_timer->threshold_target = et_coalesce->rx_max_coalesced_frames_irq; - } - if (et_coalesce->rx_max_coalesced_frames_high) { - shared_timer->threshold_high = et_coalesce->rx_max_coalesced_frames_high; - } - if (et_coalesce->rx_coalesce_usecs_low) { - shared_timer->timer_in_use_min = et_coalesce->rx_coalesce_usecs_low; - } - if (et_coalesce->rx_coalesce_usecs_high) { - shared_timer->timer_in_use_max = et_coalesce->rx_coalesce_usecs_high; - } - spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); - - /* using this to drive total interrupt moderation */ - nesadapter->et_rx_coalesce_usecs_irq = et_coalesce->rx_coalesce_usecs_irq; - if (et_coalesce->use_adaptive_rx_coalesce) { - nesadapter->et_use_adaptive_rx_coalesce = 1; - nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT_DYNAMIC; - nesadapter->et_rx_coalesce_usecs_irq = 0; - if (et_coalesce->pkt_rate_low) { - nesadapter->et_pkt_rate_low = et_coalesce->pkt_rate_low; - } - } else { - nesadapter->et_use_adaptive_rx_coalesce = 0; - nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT; - if (nesadapter->et_rx_coalesce_usecs_irq) { - nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, - 0x80000000 | ((u32)(nesadapter->et_rx_coalesce_usecs_irq*8))); - } - } - return 0; -} - - -/** - * nes_netdev_get_coalesce - */ -static int nes_netdev_get_coalesce(struct net_device *netdev, - struct ethtool_coalesce *et_coalesce) -{ - struct nes_vnic *nesvnic = netdev_priv(netdev); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_adapter *nesadapter = nesdev->nesadapter; - struct ethtool_coalesce temp_et_coalesce; - struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; - unsigned long flags; - - 
memset(&temp_et_coalesce, 0, sizeof(temp_et_coalesce)); - temp_et_coalesce.rx_coalesce_usecs_irq = nesadapter->et_rx_coalesce_usecs_irq; - temp_et_coalesce.use_adaptive_rx_coalesce = nesadapter->et_use_adaptive_rx_coalesce; - temp_et_coalesce.rate_sample_interval = nesadapter->et_rate_sample_interval; - temp_et_coalesce.pkt_rate_low = nesadapter->et_pkt_rate_low; - spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); - temp_et_coalesce.rx_max_coalesced_frames_low = shared_timer->threshold_low; - temp_et_coalesce.rx_max_coalesced_frames_irq = shared_timer->threshold_target; - temp_et_coalesce.rx_max_coalesced_frames_high = shared_timer->threshold_high; - temp_et_coalesce.rx_coalesce_usecs_low = shared_timer->timer_in_use_min; - temp_et_coalesce.rx_coalesce_usecs_high = shared_timer->timer_in_use_max; - if (nesadapter->et_use_adaptive_rx_coalesce) { - temp_et_coalesce.rx_coalesce_usecs_irq = shared_timer->timer_in_use; - } - spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); - memcpy(et_coalesce, &temp_et_coalesce, sizeof(*et_coalesce)); - return 0; -} - - -/** - * nes_netdev_get_pauseparam - */ -static void nes_netdev_get_pauseparam(struct net_device *netdev, - struct ethtool_pauseparam *et_pauseparam) -{ - struct nes_vnic *nesvnic = netdev_priv(netdev); - - et_pauseparam->autoneg = 0; - et_pauseparam->rx_pause = (nesvnic->nesdev->disable_rx_flow_control == 0) ? 1:0; - et_pauseparam->tx_pause = (nesvnic->nesdev->disable_tx_flow_control == 0) ? 1:0; -} - - -/** - * nes_netdev_set_pauseparam - */ -static int nes_netdev_set_pauseparam(struct net_device *netdev, - struct ethtool_pauseparam *et_pauseparam) -{ - struct nes_vnic *nesvnic = netdev_priv(netdev); - struct nes_device *nesdev = nesvnic->nesdev; - u32 u32temp; - - if (et_pauseparam->autoneg) { - /* TODO: should return unsupported */ - return 0; - } - if ((et_pauseparam->tx_pause == 1) && (nesdev->disable_tx_flow_control == 1)) { - u32temp = nes_read_indexed(nesdev, - NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200)); - u32temp |= NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE; - nes_write_indexed(nesdev, - NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200), u32temp); - nesdev->disable_tx_flow_control = 0; - } else if ((et_pauseparam->tx_pause == 0) && (nesdev->disable_tx_flow_control == 0)) { - u32temp = nes_read_indexed(nesdev, - NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200)); - u32temp &= ~NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE; - nes_write_indexed(nesdev, - NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200), u32temp); - nesdev->disable_tx_flow_control = 1; - } - if ((et_pauseparam->rx_pause == 1) && (nesdev->disable_rx_flow_control == 1)) { - u32temp = nes_read_indexed(nesdev, - NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40)); - u32temp &= ~NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE; - nes_write_indexed(nesdev, - NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40), u32temp); - nesdev->disable_rx_flow_control = 0; - } else if ((et_pauseparam->rx_pause == 0) && (nesdev->disable_rx_flow_control == 0)) { - u32temp = nes_read_indexed(nesdev, - NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40)); - u32temp |= NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE; - nes_write_indexed(nesdev, - NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40), u32temp); - nesdev->disable_rx_flow_control = 1; - } - - return 0; -} - - -/** - * nes_netdev_get_settings - */ -static int nes_netdev_get_link_ksettings(struct net_device *netdev, - struct ethtool_link_ksettings *cmd) -{ - struct nes_vnic *nesvnic = netdev_priv(netdev); - struct nes_device *nesdev = nesvnic->nesdev; - struct 
nes_adapter *nesadapter = nesdev->nesadapter; - u32 mac_index = nesdev->mac_index; - u8 phy_type = nesadapter->phy_type[mac_index]; - u8 phy_index = nesadapter->phy_index[mac_index]; - u16 phy_data; - u32 supported, advertising; - - cmd->base.duplex = DUPLEX_FULL; - cmd->base.port = PORT_MII; - - if (nesadapter->OneG_Mode) { - cmd->base.speed = SPEED_1000; - if (phy_type == NES_PHY_TYPE_PUMA_1G) { - supported = SUPPORTED_1000baseT_Full; - advertising = ADVERTISED_1000baseT_Full; - cmd->base.autoneg = AUTONEG_DISABLE; - cmd->base.phy_address = mac_index; - } else { - unsigned long flags; - - supported = SUPPORTED_1000baseT_Full - | SUPPORTED_Autoneg; - advertising = ADVERTISED_1000baseT_Full - | ADVERTISED_Autoneg; - spin_lock_irqsave(&nesadapter->phy_lock, flags); - nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); - spin_unlock_irqrestore(&nesadapter->phy_lock, flags); - if (phy_data & 0x1000) - cmd->base.autoneg = AUTONEG_ENABLE; - else - cmd->base.autoneg = AUTONEG_DISABLE; - cmd->base.phy_address = phy_index; - } - ethtool_convert_legacy_u32_to_link_mode( - cmd->link_modes.supported, supported); - ethtool_convert_legacy_u32_to_link_mode( - cmd->link_modes.advertising, advertising); - return 0; - } - if ((phy_type == NES_PHY_TYPE_ARGUS) || - (phy_type == NES_PHY_TYPE_SFP_D) || - (phy_type == NES_PHY_TYPE_KR)) { - cmd->base.port = PORT_FIBRE; - supported = SUPPORTED_FIBRE; - advertising = ADVERTISED_FIBRE; - cmd->base.phy_address = phy_index; - } else { - supported = SUPPORTED_10000baseT_Full; - advertising = ADVERTISED_10000baseT_Full; - cmd->base.phy_address = mac_index; - } - cmd->base.speed = SPEED_10000; - cmd->base.autoneg = AUTONEG_DISABLE; - ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.supported, - supported); - ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.advertising, - advertising); - - return 0; -} - - -/** - * nes_netdev_set_settings - */ -static int -nes_netdev_set_link_ksettings(struct net_device *netdev, - const struct ethtool_link_ksettings *cmd) -{ - struct nes_vnic *nesvnic = netdev_priv(netdev); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_adapter *nesadapter = nesdev->nesadapter; - - if ((nesadapter->OneG_Mode) && - (nesadapter->phy_type[nesdev->mac_index] != NES_PHY_TYPE_PUMA_1G)) { - unsigned long flags; - u16 phy_data; - u8 phy_index = nesadapter->phy_index[nesdev->mac_index]; - - spin_lock_irqsave(&nesadapter->phy_lock, flags); - nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); - if (cmd->base.autoneg) { - /* Turn on Full duplex, Autoneg, and restart autonegotiation */ - phy_data |= 0x1300; - } else { - /* Turn off autoneg */ - phy_data &= ~0x1000; - } - nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data); - spin_unlock_irqrestore(&nesadapter->phy_lock, flags); - } - - return 0; -} - - -static const struct ethtool_ops nes_ethtool_ops = { - .get_link = ethtool_op_get_link, - .get_strings = nes_netdev_get_strings, - .get_sset_count = nes_netdev_get_sset_count, - .get_ethtool_stats = nes_netdev_get_ethtool_stats, - .get_drvinfo = nes_netdev_get_drvinfo, - .get_coalesce = nes_netdev_get_coalesce, - .set_coalesce = nes_netdev_set_coalesce, - .get_pauseparam = nes_netdev_get_pauseparam, - .set_pauseparam = nes_netdev_set_pauseparam, - .get_link_ksettings = nes_netdev_get_link_ksettings, - .set_link_ksettings = nes_netdev_set_link_ksettings, -}; - -static void nes_vlan_mode(struct net_device *netdev, struct nes_device *nesdev, netdev_features_t features) -{ - struct nes_adapter *nesadapter = nesdev->nesadapter; - u32 
u32temp; - unsigned long flags; - - spin_lock_irqsave(&nesadapter->phy_lock, flags); - - nes_debug(NES_DBG_NETDEV, "%s: %s\n", __func__, netdev->name); - - /* Enable/Disable VLAN Stripping */ - u32temp = nes_read_indexed(nesdev, NES_IDX_PCIX_DIAG); - if (features & NETIF_F_HW_VLAN_CTAG_RX) - u32temp &= 0xfdffffff; - else - u32temp |= 0x02000000; - - nes_write_indexed(nesdev, NES_IDX_PCIX_DIAG, u32temp); - spin_unlock_irqrestore(&nesadapter->phy_lock, flags); -} - -static netdev_features_t nes_fix_features(struct net_device *netdev, netdev_features_t features) -{ - /* - * Since there is no support for separate rx/tx vlan accel - * enable/disable make sure tx flag is always in same state as rx. - */ - if (features & NETIF_F_HW_VLAN_CTAG_RX) - features |= NETIF_F_HW_VLAN_CTAG_TX; - else - features &= ~NETIF_F_HW_VLAN_CTAG_TX; - - return features; -} - -static int nes_set_features(struct net_device *netdev, netdev_features_t features) -{ - struct nes_vnic *nesvnic = netdev_priv(netdev); - struct nes_device *nesdev = nesvnic->nesdev; - u32 changed = netdev->features ^ features; - - if (changed & NETIF_F_HW_VLAN_CTAG_RX) - nes_vlan_mode(netdev, nesdev, features); - - return 0; -} - -static const struct net_device_ops nes_netdev_ops = { - .ndo_open = nes_netdev_open, - .ndo_stop = nes_netdev_stop, - .ndo_start_xmit = nes_netdev_start_xmit, - .ndo_get_stats = nes_netdev_get_stats, - .ndo_tx_timeout = nes_netdev_tx_timeout, - .ndo_set_mac_address = nes_netdev_set_mac_address, - .ndo_set_rx_mode = nes_netdev_set_multicast_list, - .ndo_change_mtu = nes_netdev_change_mtu, - .ndo_validate_addr = eth_validate_addr, - .ndo_fix_features = nes_fix_features, - .ndo_set_features = nes_set_features, -}; - -/** - * nes_netdev_init - initialize network device - */ -struct net_device *nes_netdev_init(struct nes_device *nesdev, - void __iomem *mmio_addr) -{ - u64 u64temp; - struct nes_vnic *nesvnic; - struct net_device *netdev; - struct nic_qp_map *curr_qp_map; - u8 phy_type = nesdev->nesadapter->phy_type[nesdev->mac_index]; - - netdev = alloc_etherdev(sizeof(struct nes_vnic)); - if (!netdev) { - printk(KERN_ERR PFX "nesvnic etherdev alloc failed"); - return NULL; - } - nesvnic = netdev_priv(netdev); - - nes_debug(NES_DBG_INIT, "netdev = %p, %s\n", netdev, netdev->name); - - SET_NETDEV_DEV(netdev, &nesdev->pcidev->dev); - - netdev->watchdog_timeo = NES_TX_TIMEOUT; - netdev->irq = nesdev->pcidev->irq; - netdev->max_mtu = NES_MAX_MTU; - netdev->hard_header_len = ETH_HLEN; - netdev->addr_len = ETH_ALEN; - netdev->type = ARPHRD_ETHER; - netdev->netdev_ops = &nes_netdev_ops; - netdev->ethtool_ops = &nes_ethtool_ops; - netif_napi_add(netdev, &nesvnic->napi, nes_netdev_poll, 128); - nes_debug(NES_DBG_INIT, "Enabling VLAN Insert/Delete.\n"); - - /* Fill in the port structure */ - nesvnic->netdev = netdev; - nesvnic->nesdev = nesdev; - nesvnic->msg_enable = netif_msg_init(debug, default_msg); - nesvnic->netdev_index = nesdev->netdev_count; - nesvnic->perfect_filter_index = nesdev->nesadapter->netdev_count; - nesvnic->max_frame_size = netdev->mtu + netdev->hard_header_len + VLAN_HLEN; - - curr_qp_map = nic_qp_mapping_per_function[PCI_FUNC(nesdev->pcidev->devfn)]; - nesvnic->nic.qp_id = curr_qp_map[nesdev->netdev_count].qpid; - nesvnic->nic_index = curr_qp_map[nesdev->netdev_count].nic_index; - nesvnic->logical_port = curr_qp_map[nesdev->netdev_count].logical_port; - - /* Setup the burned in MAC address */ - u64temp = (u64)nesdev->nesadapter->mac_addr_low; - u64temp += ((u64)nesdev->nesadapter->mac_addr_high) << 32; - 
u64temp += nesvnic->nic_index; - netdev->dev_addr[0] = (u8)(u64temp>>40); - netdev->dev_addr[1] = (u8)(u64temp>>32); - netdev->dev_addr[2] = (u8)(u64temp>>24); - netdev->dev_addr[3] = (u8)(u64temp>>16); - netdev->dev_addr[4] = (u8)(u64temp>>8); - netdev->dev_addr[5] = (u8)u64temp; - - netdev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_RX; - if ((nesvnic->logical_port < 2) || (nesdev->nesadapter->hw_rev != NE020_REV)) - netdev->hw_features |= NETIF_F_TSO; - - netdev->features = netdev->hw_features | NETIF_F_HIGHDMA | NETIF_F_HW_VLAN_CTAG_TX; - - nes_debug(NES_DBG_INIT, "nesvnic = %p, reported features = 0x%lX, QPid = %d," - " nic_index = %d, logical_port = %d, mac_index = %d.\n", - nesvnic, (unsigned long)netdev->features, nesvnic->nic.qp_id, - nesvnic->nic_index, nesvnic->logical_port, nesdev->mac_index); - - if (nesvnic->nesdev->nesadapter->port_count == 1 && - nesvnic->nesdev->nesadapter->adapter_fcn_count == 1) { - - nesvnic->qp_nic_index[0] = nesvnic->nic_index; - nesvnic->qp_nic_index[1] = nesvnic->nic_index + 1; - if (nes_drv_opt & NES_DRV_OPT_DUAL_LOGICAL_PORT) { - nesvnic->qp_nic_index[2] = 0xf; - nesvnic->qp_nic_index[3] = 0xf; - } else { - nesvnic->qp_nic_index[2] = nesvnic->nic_index + 2; - nesvnic->qp_nic_index[3] = nesvnic->nic_index + 3; - } - } else { - if (nesvnic->nesdev->nesadapter->port_count == 2 || - (nesvnic->nesdev->nesadapter->port_count == 1 && - nesvnic->nesdev->nesadapter->adapter_fcn_count == 2)) { - nesvnic->qp_nic_index[0] = nesvnic->nic_index; - nesvnic->qp_nic_index[1] = nesvnic->nic_index - + 2; - nesvnic->qp_nic_index[2] = 0xf; - nesvnic->qp_nic_index[3] = 0xf; - } else { - nesvnic->qp_nic_index[0] = nesvnic->nic_index; - nesvnic->qp_nic_index[1] = 0xf; - nesvnic->qp_nic_index[2] = 0xf; - nesvnic->qp_nic_index[3] = 0xf; - } - } - nesvnic->next_qp_nic_index = 0; - - if (nesdev->netdev_count == 0) { - nesvnic->rdma_enabled = 1; - } else { - nesvnic->rdma_enabled = 0; - } - nesvnic->nic_cq.cq_number = nesvnic->nic.qp_id; - timer_setup(&nesvnic->event_timer, NULL, 0); - spin_lock_init(&nesvnic->tx_lock); - spin_lock_init(&nesvnic->port_ibevent_lock); - nesdev->netdev[nesdev->netdev_count] = netdev; - - nes_debug(NES_DBG_INIT, "Adding nesvnic (%p) to the adapters nesvnic_list for MAC%d.\n", - nesvnic, nesdev->mac_index); - list_add_tail(&nesvnic->list, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]); - - if ((nesdev->netdev_count == 0) && - ((PCI_FUNC(nesdev->pcidev->devfn) == nesdev->mac_index) || - ((phy_type == NES_PHY_TYPE_PUMA_1G) && - (((PCI_FUNC(nesdev->pcidev->devfn) == 1) && (nesdev->mac_index == 2)) || - ((PCI_FUNC(nesdev->pcidev->devfn) == 2) && (nesdev->mac_index == 1)))))) { - u32 u32temp; - u32 link_mask = 0; - u32 link_val = 0; - u16 temp_phy_data; - u16 phy_data = 0; - unsigned long flags; - - u32temp = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + - (0x200 * (nesdev->mac_index & 1))); - if (phy_type != NES_PHY_TYPE_PUMA_1G) { - u32temp |= 0x00200000; - nes_write_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + - (0x200 * (nesdev->mac_index & 1)), u32temp); - } - - /* Check and set linkup here. 
This is for back to back */ - /* configuration where second port won't get link interrupt */ - switch (phy_type) { - case NES_PHY_TYPE_PUMA_1G: - if (nesdev->mac_index < 2) { - link_mask = 0x01010000; - link_val = 0x01010000; - } else { - link_mask = 0x02020000; - link_val = 0x02020000; - } - break; - case NES_PHY_TYPE_SFP_D: - spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags); - nes_read_10G_phy_reg(nesdev, - nesdev->nesadapter->phy_index[nesdev->mac_index], - 1, 0x9003); - temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - nes_read_10G_phy_reg(nesdev, - nesdev->nesadapter->phy_index[nesdev->mac_index], - 3, 0x0021); - nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - nes_read_10G_phy_reg(nesdev, - nesdev->nesadapter->phy_index[nesdev->mac_index], - 3, 0x0021); - phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags); - phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0; - break; - default: - link_mask = 0x0f1f0000; - link_val = 0x0f0f0000; - break; - } - - u32temp = nes_read_indexed(nesdev, - NES_IDX_PHY_PCS_CONTROL_STATUS0 + - (0x200 * (nesdev->mac_index & 1))); - - if (phy_type == NES_PHY_TYPE_SFP_D) { - if (phy_data & 0x0004) - nesvnic->linkup = 1; - } else { - if ((u32temp & link_mask) == link_val) - nesvnic->linkup = 1; - } - - /* clear the MAC interrupt status, assumes direct logical to physical mapping */ - u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (0x200 * nesdev->mac_index)); - nes_debug(NES_DBG_INIT, "Phy interrupt status = 0x%X.\n", u32temp); - nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (0x200 * nesdev->mac_index), u32temp); - - nes_init_phy(nesdev); - } - - nes_vlan_mode(netdev, nesdev, netdev->features); - - return netdev; -} - - -/** - * nes_netdev_destroy - destroy network device structure - */ -void nes_netdev_destroy(struct net_device *netdev) -{ - struct nes_vnic *nesvnic = netdev_priv(netdev); - - /* make sure 'stop' method is called by Linux stack */ - /* nes_netdev_stop(netdev); */ - - list_del(&nesvnic->list); - - if (nesvnic->of_device_registered) { - nes_destroy_ofa_device(nesvnic->nesibdev); - } - - free_netdev(netdev); -} - - -/** - * nes_nic_cm_xmit -- CM calls this to send out pkts - */ -int nes_nic_cm_xmit(struct sk_buff *skb, struct net_device *netdev) -{ - int ret; - - skb->dev = netdev; - ret = dev_queue_xmit(skb); - if (ret) { - nes_debug(NES_DBG_CM, "Bad return code from dev_queue_xmit %d\n", ret); - } - - return ret; -} diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c deleted file mode 100644 index e976292fc6c0..000000000000 --- a/drivers/infiniband/hw/nes/nes_utils.c +++ /dev/null @@ -1,913 +0,0 @@ -/* - * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. 
- * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "nes.h" - -static u16 nes_read16_eeprom(void __iomem *addr, u16 offset); - -u32 mh_detected; -u32 mh_pauses_sent; - -static u32 nes_set_pau(struct nes_device *nesdev) -{ - u32 ret = 0; - u32 counter; - - nes_write_indexed(nesdev, NES_IDX_GPR2, NES_ENABLE_PAU); - nes_write_indexed(nesdev, NES_IDX_GPR_TRIGGER, 1); - - for (counter = 0; counter < NES_PAU_COUNTER; counter++) { - udelay(30); - if (!nes_read_indexed(nesdev, NES_IDX_GPR2)) { - printk(KERN_INFO PFX "PAU is supported.\n"); - break; - } - nes_write_indexed(nesdev, NES_IDX_GPR_TRIGGER, 1); - } - if (counter == NES_PAU_COUNTER) { - printk(KERN_INFO PFX "PAU is not supported.\n"); - return -EPERM; - } - return ret; -} - -/** - * nes_read_eeprom_values - - */ -int nes_read_eeprom_values(struct nes_device *nesdev, struct nes_adapter *nesadapter) -{ - u32 mac_addr_low; - u16 mac_addr_high; - u16 eeprom_data; - u16 eeprom_offset; - u16 next_section_address; - u16 sw_section_ver; - u8 major_ver = 0; - u8 minor_ver = 0; - - /* TODO: deal with EEPROM endian issues */ - if (nesadapter->firmware_eeprom_offset == 0) { - /* Read the EEPROM Parameters */ - eeprom_data = nes_read16_eeprom(nesdev->regs, 0); - nes_debug(NES_DBG_HW, "EEPROM Offset 0 = 0x%04X\n", eeprom_data); - eeprom_offset = 2 + (((eeprom_data & 0x007f) << 3) << - ((eeprom_data & 0x0080) >> 7)); - nes_debug(NES_DBG_HW, "Firmware Offset = 0x%04X\n", eeprom_offset); - nesadapter->firmware_eeprom_offset = eeprom_offset; - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 4); - if (eeprom_data != 0x5746) { - nes_debug(NES_DBG_HW, "Not a valid Firmware Image = 0x%04X\n", eeprom_data); - return -1; - } - - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); - nes_debug(NES_DBG_HW, "EEPROM Offset %u = 0x%04X\n", - eeprom_offset + 2, eeprom_data); - eeprom_offset += ((eeprom_data & 0x00ff) << 3) << ((eeprom_data & 0x0100) >> 8); - nes_debug(NES_DBG_HW, "Software Offset = 0x%04X\n", eeprom_offset); - nesadapter->software_eeprom_offset = eeprom_offset; - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 4); - if (eeprom_data != 0x5753) { - printk("Not a valid Software Image = 0x%04X\n", eeprom_data); - return -1; - } - sw_section_ver = nes_read16_eeprom(nesdev->regs, nesadapter->software_eeprom_offset + 6); - nes_debug(NES_DBG_HW, "Software section version number = 0x%04X\n", - sw_section_ver); - - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); - nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", - eeprom_offset + 2, eeprom_data); - next_section_address = eeprom_offset + (((eeprom_data & 0x00ff) << 3) << - ((eeprom_data 
& 0x0100) >> 8)); - eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); - if (eeprom_data != 0x414d) { - nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x414d but was 0x%04X\n", - eeprom_data); - goto no_fw_rev; - } - eeprom_offset = next_section_address; - - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); - nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", - eeprom_offset + 2, eeprom_data); - next_section_address = eeprom_offset + (((eeprom_data & 0x00ff) << 3) << - ((eeprom_data & 0x0100) >> 8)); - eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); - if (eeprom_data != 0x4f52) { - nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x4f52 but was 0x%04X\n", - eeprom_data); - goto no_fw_rev; - } - eeprom_offset = next_section_address; - - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); - nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", - eeprom_offset + 2, eeprom_data); - next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3); - eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); - if (eeprom_data != 0x5746) { - nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x5746 but was 0x%04X\n", - eeprom_data); - goto no_fw_rev; - } - eeprom_offset = next_section_address; - - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); - nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", - eeprom_offset + 2, eeprom_data); - next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3); - eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); - if (eeprom_data != 0x5753) { - nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x5753 but was 0x%04X\n", - eeprom_data); - goto no_fw_rev; - } - eeprom_offset = next_section_address; - - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); - nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", - eeprom_offset + 2, eeprom_data); - next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3); - eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); - if (eeprom_data != 0x414d) { - nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x414d but was 0x%04X\n", - eeprom_data); - goto no_fw_rev; - } - eeprom_offset = next_section_address; - - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); - nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", - eeprom_offset + 2, eeprom_data); - next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3); - eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); - if (eeprom_data != 0x464e) { - nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x464e but was 0x%04X\n", - eeprom_data); - goto no_fw_rev; - } - eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 8); - printk(PFX "Firmware version %u.%u\n", (u8)(eeprom_data>>8), (u8)eeprom_data); - major_ver = (u8)(eeprom_data >> 8); - minor_ver = (u8)(eeprom_data); - - if (nes_drv_opt & NES_DRV_OPT_DISABLE_VIRT_WQ) { - nes_debug(NES_DBG_HW, "Virtual WQs have been disabled\n"); - } else if (((major_ver == 2) && (minor_ver > 21)) || ((major_ver > 2) && (major_ver != 255))) { - nesadapter->virtwq = 1; - } - if (((major_ver == 3) && (minor_ver >= 16)) || (major_ver > 3)) - nesadapter->send_term_ok = 1; - - if (nes_drv_opt & NES_DRV_OPT_ENABLE_PAU) { - if (!nes_set_pau(nesdev)) - nesadapter->allow_unaligned_fpdus = 1; - } - - 
nesadapter->firmware_version = (((u32)(u8)(eeprom_data>>8)) << 16) + - (u32)((u8)eeprom_data); - - eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 10); - printk(PFX "EEPROM version %u.%u\n", (u8)(eeprom_data>>8), (u8)eeprom_data); - nesadapter->eeprom_version = (((u32)(u8)(eeprom_data>>8)) << 16) + - (u32)((u8)eeprom_data); - -no_fw_rev: - /* eeprom is valid */ - eeprom_offset = nesadapter->software_eeprom_offset; - eeprom_offset += 8; - nesadapter->netdev_max = (u8)nes_read16_eeprom(nesdev->regs, eeprom_offset); - eeprom_offset += 2; - mac_addr_high = nes_read16_eeprom(nesdev->regs, eeprom_offset); - eeprom_offset += 2; - mac_addr_low = (u32)nes_read16_eeprom(nesdev->regs, eeprom_offset); - eeprom_offset += 2; - mac_addr_low <<= 16; - mac_addr_low += (u32)nes_read16_eeprom(nesdev->regs, eeprom_offset); - nes_debug(NES_DBG_HW, "Base MAC Address = 0x%04X%08X\n", - mac_addr_high, mac_addr_low); - nes_debug(NES_DBG_HW, "MAC Address count = %u\n", nesadapter->netdev_max); - - nesadapter->mac_addr_low = mac_addr_low; - nesadapter->mac_addr_high = mac_addr_high; - - /* Read the Phy Type array */ - eeprom_offset += 10; - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); - nesadapter->phy_type[0] = (u8)(eeprom_data >> 8); - nesadapter->phy_type[1] = (u8)eeprom_data; - - /* Read the port array */ - eeprom_offset += 2; - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); - nesadapter->phy_type[2] = (u8)(eeprom_data >> 8); - nesadapter->phy_type[3] = (u8)eeprom_data; - /* port_count is set by soft reset reg */ - nes_debug(NES_DBG_HW, "port_count = %u, port 0 -> %u, port 1 -> %u," - " port 2 -> %u, port 3 -> %u\n", - nesadapter->port_count, - nesadapter->phy_type[0], nesadapter->phy_type[1], - nesadapter->phy_type[2], nesadapter->phy_type[3]); - - /* Read PD config array */ - eeprom_offset += 10; - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); - nesadapter->pd_config_size[0] = eeprom_data; - eeprom_offset += 2; - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); - nesadapter->pd_config_base[0] = eeprom_data; - nes_debug(NES_DBG_HW, "PD0 config, size=0x%04x, base=0x%04x\n", - nesadapter->pd_config_size[0], nesadapter->pd_config_base[0]); - - eeprom_offset += 2; - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); - nesadapter->pd_config_size[1] = eeprom_data; - eeprom_offset += 2; - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); - nesadapter->pd_config_base[1] = eeprom_data; - nes_debug(NES_DBG_HW, "PD1 config, size=0x%04x, base=0x%04x\n", - nesadapter->pd_config_size[1], nesadapter->pd_config_base[1]); - - eeprom_offset += 2; - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); - nesadapter->pd_config_size[2] = eeprom_data; - eeprom_offset += 2; - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); - nesadapter->pd_config_base[2] = eeprom_data; - nes_debug(NES_DBG_HW, "PD2 config, size=0x%04x, base=0x%04x\n", - nesadapter->pd_config_size[2], nesadapter->pd_config_base[2]); - - eeprom_offset += 2; - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); - nesadapter->pd_config_size[3] = eeprom_data; - eeprom_offset += 2; - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); - nesadapter->pd_config_base[3] = eeprom_data; - nes_debug(NES_DBG_HW, "PD3 config, size=0x%04x, base=0x%04x\n", - nesadapter->pd_config_size[3], nesadapter->pd_config_base[3]); - - /* Read Rx Pool Size */ - eeprom_offset += 22; /* 46 */ - eeprom_data = nes_read16_eeprom(nesdev->regs, 
eeprom_offset); - eeprom_offset += 2; - nesadapter->rx_pool_size = (((u32)eeprom_data) << 16) + - nes_read16_eeprom(nesdev->regs, eeprom_offset); - nes_debug(NES_DBG_HW, "rx_pool_size = 0x%08X\n", nesadapter->rx_pool_size); - - eeprom_offset += 2; - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); - eeprom_offset += 2; - nesadapter->tx_pool_size = (((u32)eeprom_data) << 16) + - nes_read16_eeprom(nesdev->regs, eeprom_offset); - nes_debug(NES_DBG_HW, "tx_pool_size = 0x%08X\n", nesadapter->tx_pool_size); - - eeprom_offset += 2; - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); - eeprom_offset += 2; - nesadapter->rx_threshold = (((u32)eeprom_data) << 16) + - nes_read16_eeprom(nesdev->regs, eeprom_offset); - nes_debug(NES_DBG_HW, "rx_threshold = 0x%08X\n", nesadapter->rx_threshold); - - eeprom_offset += 2; - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); - eeprom_offset += 2; - nesadapter->tcp_timer_core_clk_divisor = (((u32)eeprom_data) << 16) + - nes_read16_eeprom(nesdev->regs, eeprom_offset); - nes_debug(NES_DBG_HW, "tcp_timer_core_clk_divisor = 0x%08X\n", - nesadapter->tcp_timer_core_clk_divisor); - - eeprom_offset += 2; - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); - eeprom_offset += 2; - nesadapter->iwarp_config = (((u32)eeprom_data) << 16) + - nes_read16_eeprom(nesdev->regs, eeprom_offset); - nes_debug(NES_DBG_HW, "iwarp_config = 0x%08X\n", nesadapter->iwarp_config); - - eeprom_offset += 2; - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); - eeprom_offset += 2; - nesadapter->cm_config = (((u32)eeprom_data) << 16) + - nes_read16_eeprom(nesdev->regs, eeprom_offset); - nes_debug(NES_DBG_HW, "cm_config = 0x%08X\n", nesadapter->cm_config); - - eeprom_offset += 2; - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); - eeprom_offset += 2; - nesadapter->sws_timer_config = (((u32)eeprom_data) << 16) + - nes_read16_eeprom(nesdev->regs, eeprom_offset); - nes_debug(NES_DBG_HW, "sws_timer_config = 0x%08X\n", nesadapter->sws_timer_config); - - eeprom_offset += 2; - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); - eeprom_offset += 2; - nesadapter->tcp_config1 = (((u32)eeprom_data) << 16) + - nes_read16_eeprom(nesdev->regs, eeprom_offset); - nes_debug(NES_DBG_HW, "tcp_config1 = 0x%08X\n", nesadapter->tcp_config1); - - eeprom_offset += 2; - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); - eeprom_offset += 2; - nesadapter->wqm_wat = (((u32)eeprom_data) << 16) + - nes_read16_eeprom(nesdev->regs, eeprom_offset); - nes_debug(NES_DBG_HW, "wqm_wat = 0x%08X\n", nesadapter->wqm_wat); - - eeprom_offset += 2; - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); - eeprom_offset += 2; - nesadapter->core_clock = (((u32)eeprom_data) << 16) + - nes_read16_eeprom(nesdev->regs, eeprom_offset); - nes_debug(NES_DBG_HW, "core_clock = 0x%08X\n", nesadapter->core_clock); - - if ((sw_section_ver) && (nesadapter->hw_rev != NE020_REV)) { - eeprom_offset += 2; - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); - nesadapter->phy_index[0] = (eeprom_data & 0xff00)>>8; - nesadapter->phy_index[1] = eeprom_data & 0x00ff; - eeprom_offset += 2; - eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); - nesadapter->phy_index[2] = (eeprom_data & 0xff00)>>8; - nesadapter->phy_index[3] = eeprom_data & 0x00ff; - } else { - nesadapter->phy_index[0] = 4; - nesadapter->phy_index[1] = 5; - nesadapter->phy_index[2] = 6; - nesadapter->phy_index[3] = 7; - } - nes_debug(NES_DBG_HW, "Phy address map = 0 > 
%u, 1 > %u, 2 > %u, 3 > %u\n", - nesadapter->phy_index[0],nesadapter->phy_index[1], - nesadapter->phy_index[2],nesadapter->phy_index[3]); - } - - return 0; -} - - -/** - * nes_read16_eeprom - */ -static u16 nes_read16_eeprom(void __iomem *addr, u16 offset) -{ - writel(NES_EEPROM_READ_REQUEST + (offset >> 1), - (void __iomem *)addr + NES_EEPROM_COMMAND); - - do { - } while (readl((void __iomem *)addr + NES_EEPROM_COMMAND) & - NES_EEPROM_READ_REQUEST); - - return readw((void __iomem *)addr + NES_EEPROM_DATA); -} - - -/** - * nes_write_1G_phy_reg - */ -void nes_write_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16 data) -{ - u32 u32temp; - u32 counter; - - nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, - 0x50020000 | data | ((u32)phy_reg << 18) | ((u32)phy_addr << 23)); - for (counter = 0; counter < 100 ; counter++) { - udelay(30); - u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); - if (u32temp & 1) { - /* nes_debug(NES_DBG_PHY, "Phy interrupt status = 0x%X.\n", u32temp); */ - nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); - break; - } - } - if (!(u32temp & 1)) - nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", - u32temp); -} - - -/** - * nes_read_1G_phy_reg - * This routine only issues the read, the data must be read - * separately. - */ -void nes_read_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16 *data) -{ - u32 u32temp; - u32 counter; - - /* nes_debug(NES_DBG_PHY, "phy addr = %d, mac_index = %d\n", - phy_addr, nesdev->mac_index); */ - - nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, - 0x60020000 | ((u32)phy_reg << 18) | ((u32)phy_addr << 23)); - for (counter = 0; counter < 100 ; counter++) { - udelay(30); - u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); - if (u32temp & 1) { - /* nes_debug(NES_DBG_PHY, "Phy interrupt status = 0x%X.\n", u32temp); */ - nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); - break; - } - } - if (!(u32temp & 1)) { - nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", - u32temp); - *data = 0xffff; - } else { - *data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - } -} - - -/** - * nes_write_10G_phy_reg - */ -void nes_write_10G_phy_reg(struct nes_device *nesdev, u16 phy_addr, u8 dev_addr, u16 phy_reg, - u16 data) -{ - u32 port_addr; - u32 u32temp; - u32 counter; - - port_addr = phy_addr; - - /* set address */ - nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, - 0x00020000 | (u32)phy_reg | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23)); - for (counter = 0; counter < 100 ; counter++) { - udelay(30); - u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); - if (u32temp & 1) { - nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); - break; - } - } - if (!(u32temp & 1)) - nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", - u32temp); - - /* set data */ - nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, - 0x10020000 | (u32)data | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23)); - for (counter = 0; counter < 100 ; counter++) { - udelay(30); - u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); - if (u32temp & 1) { - nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); - break; - } - } - if (!(u32temp & 1)) - nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", - u32temp); -} - - -/** - * nes_read_10G_phy_reg - * This routine only issues the read, the data must be read - * separately. 
- */ -void nes_read_10G_phy_reg(struct nes_device *nesdev, u8 phy_addr, u8 dev_addr, u16 phy_reg) -{ - u32 port_addr; - u32 u32temp; - u32 counter; - - port_addr = phy_addr; - - /* set address */ - nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, - 0x00020000 | (u32)phy_reg | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23)); - for (counter = 0; counter < 100 ; counter++) { - udelay(30); - u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); - if (u32temp & 1) { - nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); - break; - } - } - if (!(u32temp & 1)) - nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", - u32temp); - - /* issue read */ - nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, - 0x30020000 | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23)); - for (counter = 0; counter < 100 ; counter++) { - udelay(30); - u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); - if (u32temp & 1) { - nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); - break; - } - } - if (!(u32temp & 1)) - nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", - u32temp); -} - - -/** - * nes_get_cqp_request - */ -struct nes_cqp_request *nes_get_cqp_request(struct nes_device *nesdev) -{ - unsigned long flags; - struct nes_cqp_request *cqp_request = NULL; - - if (!list_empty(&nesdev->cqp_avail_reqs)) { - spin_lock_irqsave(&nesdev->cqp.lock, flags); - if (!list_empty(&nesdev->cqp_avail_reqs)) { - cqp_request = list_entry(nesdev->cqp_avail_reqs.next, - struct nes_cqp_request, list); - list_del_init(&cqp_request->list); - } - spin_unlock_irqrestore(&nesdev->cqp.lock, flags); - } - if (cqp_request == NULL) { - cqp_request = kzalloc(sizeof(struct nes_cqp_request), GFP_ATOMIC); - if (cqp_request) { - cqp_request->dynamic = 1; - INIT_LIST_HEAD(&cqp_request->list); - } - } - - if (cqp_request) { - init_waitqueue_head(&cqp_request->waitq); - cqp_request->waiting = 0; - cqp_request->request_done = 0; - cqp_request->callback = 0; - nes_debug(NES_DBG_CQP, "Got cqp request %p from the available list \n", - cqp_request); - } - - return cqp_request; -} - -void nes_free_cqp_request(struct nes_device *nesdev, - struct nes_cqp_request *cqp_request) -{ - unsigned long flags; - - nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) freed.\n", - cqp_request, - le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX]) & 0x3f); - - if (cqp_request->dynamic) { - kfree(cqp_request); - } else { - spin_lock_irqsave(&nesdev->cqp.lock, flags); - list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); - spin_unlock_irqrestore(&nesdev->cqp.lock, flags); - } -} - -void nes_put_cqp_request(struct nes_device *nesdev, - struct nes_cqp_request *cqp_request) -{ - if (atomic_dec_and_test(&cqp_request->refcount)) - nes_free_cqp_request(nesdev, cqp_request); -} - - -/** - * nes_post_cqp_request - */ -void nes_post_cqp_request(struct nes_device *nesdev, - struct nes_cqp_request *cqp_request) -{ - struct nes_hw_cqp_wqe *cqp_wqe; - unsigned long flags; - u32 cqp_head; - u64 u64temp; - u32 opcode; - int ctx_index = NES_CQP_WQE_COMP_CTX_LOW_IDX; - - spin_lock_irqsave(&nesdev->cqp.lock, flags); - - if (((((nesdev->cqp.sq_tail+(nesdev->cqp.sq_size*2))-nesdev->cqp.sq_head) & - (nesdev->cqp.sq_size - 1)) != 1) - && (list_empty(&nesdev->cqp_pending_reqs))) { - cqp_head = nesdev->cqp.sq_head++; - nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; - cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; - memcpy(cqp_wqe, &cqp_request->cqp_wqe, sizeof(*cqp_wqe)); - opcode = 
le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX]); - if ((opcode & NES_CQP_OPCODE_MASK) == NES_CQP_DOWNLOAD_SEGMENT) - ctx_index = NES_CQP_WQE_DL_COMP_CTX_LOW_IDX; - barrier(); - u64temp = (unsigned long)cqp_request; - set_wqe_64bit_value(cqp_wqe->wqe_words, ctx_index, u64temp); - nes_debug(NES_DBG_CQP, "CQP request (opcode 0x%02X), line 1 = 0x%08X put on CQPs SQ," - " request = %p, cqp_head = %u, cqp_tail = %u, cqp_size = %u," - " waiting = %d, refcount = %d.\n", - opcode & NES_CQP_OPCODE_MASK, - le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX]), cqp_request, - nesdev->cqp.sq_head, nesdev->cqp.sq_tail, nesdev->cqp.sq_size, - cqp_request->waiting, atomic_read(&cqp_request->refcount)); - - barrier(); - - /* Ring doorbell (1 WQEs) */ - nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id); - - barrier(); - } else { - nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X), line 1 = 0x%08X" - " put on the pending queue.\n", - cqp_request, - le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f, - le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_ID_IDX])); - list_add_tail(&cqp_request->list, &nesdev->cqp_pending_reqs); - } - - spin_unlock_irqrestore(&nesdev->cqp.lock, flags); - - return; -} - -/** - * nes_arp_table - */ -int nes_arp_table(struct nes_device *nesdev, u32 ip_addr, u8 *mac_addr, u32 action) -{ - struct nes_adapter *nesadapter = nesdev->nesadapter; - int arp_index; - int err = 0; - __be32 tmp_addr; - - for (arp_index = 0; (u32) arp_index < nesadapter->arp_table_size; arp_index++) { - if (nesadapter->arp_table[arp_index].ip_addr == ip_addr) - break; - } - - if (action == NES_ARP_ADD) { - if (arp_index != nesadapter->arp_table_size) { - return -1; - } - - arp_index = 0; - err = nes_alloc_resource(nesadapter, nesadapter->allocated_arps, - nesadapter->arp_table_size, (u32 *)&arp_index, &nesadapter->next_arp_index, NES_RESOURCE_ARP); - if (err) { - nes_debug(NES_DBG_NETDEV, "nes_alloc_resource returned error = %u\n", err); - return err; - } - nes_debug(NES_DBG_NETDEV, "ADD, arp_index=%d\n", arp_index); - - nesadapter->arp_table[arp_index].ip_addr = ip_addr; - memcpy(nesadapter->arp_table[arp_index].mac_addr, mac_addr, ETH_ALEN); - return arp_index; - } - - /* DELETE or RESOLVE */ - if (arp_index == nesadapter->arp_table_size) { - tmp_addr = cpu_to_be32(ip_addr); - nes_debug(NES_DBG_NETDEV, "MAC for %pI4 not in ARP table - cannot %s\n", - &tmp_addr, action == NES_ARP_RESOLVE ? 
"resolve" : "delete"); - return -1; - } - - if (action == NES_ARP_RESOLVE) { - nes_debug(NES_DBG_NETDEV, "RESOLVE, arp_index=%d\n", arp_index); - return arp_index; - } - - if (action == NES_ARP_DELETE) { - nes_debug(NES_DBG_NETDEV, "DELETE, arp_index=%d\n", arp_index); - nesadapter->arp_table[arp_index].ip_addr = 0; - eth_zero_addr(nesadapter->arp_table[arp_index].mac_addr); - nes_free_resource(nesadapter, nesadapter->allocated_arps, arp_index); - return arp_index; - } - - return -1; -} - - -/** - * nes_mh_fix - */ -void nes_mh_fix(struct timer_list *t) -{ - struct nes_adapter *nesadapter = from_timer(nesadapter, t, mh_timer); - struct nes_device *nesdev = nesadapter->nesdev; - unsigned long flags; - struct nes_vnic *nesvnic; - u32 used_chunks_tx; - u32 temp_used_chunks_tx; - u32 temp_last_used_chunks_tx; - u32 used_chunks_mask; - u32 mac_tx_frames_low; - u32 mac_tx_frames_high; - u32 mac_tx_pauses; - u32 reset_value; - u32 tx_control; - u32 tx_config; - u32 tx_pause_quanta; - u32 rx_control; - u32 rx_config; - u32 mac_exact_match; - u32 mpp_debug; - u32 i=0; - u32 chunks_tx_progress = 0; - - spin_lock_irqsave(&nesadapter->phy_lock, flags); - if ((nesadapter->mac_sw_state[0] != NES_MAC_SW_IDLE) || (nesadapter->mac_link_down[0])) { - spin_unlock_irqrestore(&nesadapter->phy_lock, flags); - goto no_mh_work; - } - nesadapter->mac_sw_state[0] = NES_MAC_SW_MH; - spin_unlock_irqrestore(&nesadapter->phy_lock, flags); - do { - mac_tx_frames_low = nes_read_indexed(nesdev, NES_IDX_MAC_TX_FRAMES_LOW); - mac_tx_frames_high = nes_read_indexed(nesdev, NES_IDX_MAC_TX_FRAMES_HIGH); - mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES); - used_chunks_tx = nes_read_indexed(nesdev, NES_IDX_USED_CHUNKS_TX); - nesdev->mac_pause_frames_sent += mac_tx_pauses; - used_chunks_mask = 0; - temp_used_chunks_tx = used_chunks_tx; - temp_last_used_chunks_tx = nesdev->last_used_chunks_tx; - - if (nesdev->netdev[0]) { - nesvnic = netdev_priv(nesdev->netdev[0]); - } else { - break; - } - - for (i=0; i<4; i++) { - used_chunks_mask <<= 8; - if (nesvnic->qp_nic_index[i] != 0xff) { - used_chunks_mask |= 0xff; - if ((temp_used_chunks_tx&0xff)<(temp_last_used_chunks_tx&0xff)) { - chunks_tx_progress = 1; - } - } - temp_used_chunks_tx >>= 8; - temp_last_used_chunks_tx >>= 8; - } - if ((mac_tx_frames_low) || (mac_tx_frames_high) || - (!(used_chunks_tx&used_chunks_mask)) || - (!(nesdev->last_used_chunks_tx&used_chunks_mask)) || - (chunks_tx_progress) ) { - nesdev->last_used_chunks_tx = used_chunks_tx; - break; - } - nesdev->last_used_chunks_tx = used_chunks_tx; - barrier(); - - nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, 0x00000005); - mh_pauses_sent++; - mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES); - if (mac_tx_pauses) { - nesdev->mac_pause_frames_sent += mac_tx_pauses; - break; - } - - tx_control = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONTROL); - tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG); - tx_pause_quanta = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_QUANTA); - rx_control = nes_read_indexed(nesdev, NES_IDX_MAC_RX_CONTROL); - rx_config = nes_read_indexed(nesdev, NES_IDX_MAC_RX_CONFIG); - mac_exact_match = nes_read_indexed(nesdev, NES_IDX_MAC_EXACT_MATCH_BOTTOM); - mpp_debug = nes_read_indexed(nesdev, NES_IDX_MPP_DEBUG); - - /* one last ditch effort to avoid a false positive */ - mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES); - if (mac_tx_pauses) { - nesdev->last_mac_tx_pauses = nesdev->mac_pause_frames_sent; - nes_debug(NES_DBG_HW, 
"failsafe caught slow outbound pause\n"); - break; - } - mh_detected++; - - nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, 0x00000000); - nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, 0x00000000); - reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); - - nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value | 0x0000001d); - - while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) - & 0x00000040) != 0x00000040) && (i++ < 5000)) { - /* mdelay(1); */ - } - - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, 0x00000008); - nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0); - - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x000bdef7); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE0, 0x9ce73000); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE0, 0x0ff00000); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET0, 0x00000000); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS0, 0x00000000); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0, 0x00000000); - if (nesadapter->OneG_Mode) { - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0182222); - } else { - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0042222); - } - nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_STATUS0); - nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000ff); - - nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, tx_control); - nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config); - nes_write_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_QUANTA, tx_pause_quanta); - nes_write_indexed(nesdev, NES_IDX_MAC_RX_CONTROL, rx_control); - nes_write_indexed(nesdev, NES_IDX_MAC_RX_CONFIG, rx_config); - nes_write_indexed(nesdev, NES_IDX_MAC_EXACT_MATCH_BOTTOM, mac_exact_match); - nes_write_indexed(nesdev, NES_IDX_MPP_DEBUG, mpp_debug); - - } while (0); - - nesadapter->mac_sw_state[0] = NES_MAC_SW_IDLE; -no_mh_work: - nesdev->nesadapter->mh_timer.expires = jiffies + (HZ/5); - add_timer(&nesdev->nesadapter->mh_timer); -} - -/** - * nes_clc - */ -void nes_clc(struct timer_list *t) -{ - struct nes_adapter *nesadapter = from_timer(nesadapter, t, lc_timer); - unsigned long flags; - - spin_lock_irqsave(&nesadapter->phy_lock, flags); - nesadapter->link_interrupt_count[0] = 0; - nesadapter->link_interrupt_count[1] = 0; - nesadapter->link_interrupt_count[2] = 0; - nesadapter->link_interrupt_count[3] = 0; - spin_unlock_irqrestore(&nesadapter->phy_lock, flags); - - nesadapter->lc_timer.expires = jiffies + 3600 * HZ; /* 1 hour */ - add_timer(&nesadapter->lc_timer); -} - - -/** - * nes_dump_mem - */ -void nes_dump_mem(unsigned int dump_debug_level, void *addr, int length) -{ - if (!(nes_debug_level & dump_debug_level)) { - return; - } - - if (length > 0x100) { - nes_debug(dump_debug_level, "Length truncated from %x to %x\n", length, 0x100); - length = 0x100; - } - nes_debug(dump_debug_level, "Address=0x%p, length=0x%x (%d)\n", addr, length, length); - - print_hex_dump(KERN_ERR, PFX, DUMP_PREFIX_NONE, 16, 1, addr, length, true); -} diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c deleted file mode 100644 index 0420203820f6..000000000000 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ /dev/null @@ -1,3721 +0,0 @@ -/* - * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "nes.h" - -#include - -atomic_t mod_qp_timouts; -atomic_t qps_created; -atomic_t sw_qps_destroyed; - -static void nes_unregister_ofa_device(struct nes_ib_device *nesibdev); -static int nes_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata); - -/** - * nes_alloc_mw - */ -static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd, enum ib_mw_type type, - struct ib_udata *udata) -{ - struct nes_pd *nespd = to_nespd(ibpd); - struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_adapter *nesadapter = nesdev->nesadapter; - struct nes_cqp_request *cqp_request; - struct nes_mr *nesmr; - struct ib_mw *ibmw; - struct nes_hw_cqp_wqe *cqp_wqe; - int ret; - u32 stag; - u32 stag_index = 0; - u32 next_stag_index = 0; - u32 driver_key = 0; - u8 stag_key = 0; - - if (type != IB_MW_TYPE_1) - return ERR_PTR(-EINVAL); - - get_random_bytes(&next_stag_index, sizeof(next_stag_index)); - stag_key = (u8)next_stag_index; - - driver_key = 0; - - next_stag_index >>= 8; - next_stag_index %= nesadapter->max_mr; - - ret = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, - nesadapter->max_mr, &stag_index, &next_stag_index, NES_RESOURCE_MW); - if (ret) { - return ERR_PTR(ret); - } - - nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); - if (!nesmr) { - nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - return ERR_PTR(-ENOMEM); - } - - stag = stag_index << 8; - stag |= driver_key; - stag += (u32)stag_key; - - nes_debug(NES_DBG_MR, "Registering STag 0x%08X, index = 0x%08X\n", - stag, stag_index); - - /* Register the region with the adapter */ - cqp_request = nes_get_cqp_request(nesdev); - if (cqp_request == NULL) { - kfree(nesmr); - nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - return ERR_PTR(-ENOMEM); - } - - cqp_request->waiting = 1; - cqp_wqe = &cqp_request->cqp_wqe; - - cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = - cpu_to_le32( NES_CQP_ALLOCATE_STAG | NES_CQP_STAG_RIGHTS_REMOTE_READ | - NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_VA_TO | - NES_CQP_STAG_REM_ACC_EN); - - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - set_wqe_32bit_value(cqp_wqe->wqe_words, 
NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX, (nespd->pd_id & 0x00007fff)); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag); - - atomic_set(&cqp_request->refcount, 2); - nes_post_cqp_request(nesdev, cqp_request); - - /* Wait for CQP */ - ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), - NES_EVENT_TIMEOUT); - nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u," - " CQP Major:Minor codes = 0x%04X:0x%04X.\n", - stag, ret, cqp_request->major_code, cqp_request->minor_code); - if ((!ret) || (cqp_request->major_code)) { - nes_put_cqp_request(nesdev, cqp_request); - kfree(nesmr); - nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - if (!ret) { - return ERR_PTR(-ETIME); - } else { - return ERR_PTR(-ENOMEM); - } - } - nes_put_cqp_request(nesdev, cqp_request); - - nesmr->ibmw.rkey = stag; - nesmr->mode = IWNES_MEMREG_TYPE_MW; - ibmw = &nesmr->ibmw; - nesmr->pbl_4k = 0; - nesmr->pbls_used = 0; - - return ibmw; -} - - -/** - * nes_dealloc_mw - */ -static int nes_dealloc_mw(struct ib_mw *ibmw) -{ - struct nes_mr *nesmr = to_nesmw(ibmw); - struct nes_vnic *nesvnic = to_nesvnic(ibmw->device); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_adapter *nesadapter = nesdev->nesadapter; - struct nes_hw_cqp_wqe *cqp_wqe; - struct nes_cqp_request *cqp_request; - int err = 0; - int ret; - - /* Deallocate the window with the adapter */ - cqp_request = nes_get_cqp_request(nesdev); - if (cqp_request == NULL) { - nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n"); - return -ENOMEM; - } - cqp_request->waiting = 1; - cqp_wqe = &cqp_request->cqp_wqe; - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, NES_CQP_DEALLOCATE_STAG); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ibmw->rkey); - - atomic_set(&cqp_request->refcount, 2); - nes_post_cqp_request(nesdev, cqp_request); - - /* Wait for CQP */ - nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X to complete.\n", - ibmw->rkey); - ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done), - NES_EVENT_TIMEOUT); - nes_debug(NES_DBG_MR, "Deallocate STag completed, wait_event_timeout ret = %u," - " CQP Major:Minor codes = 0x%04X:0x%04X.\n", - ret, cqp_request->major_code, cqp_request->minor_code); - if (!ret) - err = -ETIME; - else if (cqp_request->major_code) - err = -EIO; - - nes_put_cqp_request(nesdev, cqp_request); - - nes_free_resource(nesadapter, nesadapter->allocated_mrs, - (ibmw->rkey & 0x0fffff00) >> 8); - kfree(nesmr); - - return err; -} - - -/* - * nes_alloc_fast_mr - */ -static int alloc_fast_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd, - u32 stag, u32 page_count) -{ - struct nes_hw_cqp_wqe *cqp_wqe; - struct nes_cqp_request *cqp_request; - unsigned long flags; - int ret; - struct nes_adapter *nesadapter = nesdev->nesadapter; - u32 opcode = 0; - u16 major_code; - u64 region_length = page_count * PAGE_SIZE; - - - cqp_request = nes_get_cqp_request(nesdev); - if (cqp_request == NULL) { - nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n"); - return -ENOMEM; - } - nes_debug(NES_DBG_MR, "alloc_fast_reg_mr: page_count = %d, " - "region_length = %llu\n", - page_count, region_length); - cqp_request->waiting = 1; - cqp_wqe = &cqp_request->cqp_wqe; - - spin_lock_irqsave(&nesadapter->pbl_lock, flags); - if (nesadapter->free_4kpbl > 0) { - nesadapter->free_4kpbl--; - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - } else { - /* No 
4kpbl's available: */ - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - nes_debug(NES_DBG_MR, "Out of Pbls\n"); - nes_free_cqp_request(nesdev, cqp_request); - return -ENOMEM; - } - - opcode = NES_CQP_ALLOCATE_STAG | NES_CQP_STAG_MR | - NES_CQP_STAG_PBL_BLK_SIZE | NES_CQP_STAG_VA_TO | - NES_CQP_STAG_REM_ACC_EN; - /* - * The current OFED API does not support the zero based TO option. - * If added then need to changed the NES_CQP_STAG_VA* option. Also, - * the API does not support that ability to have the MR set for local - * access only when created and not allow the SQ op to override. Given - * this the remote enable must be set here. - */ - - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX, 1); - - cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] = - cpu_to_le32((u32)(region_length >> 8) & 0xff000000); - cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] |= - cpu_to_le32(nespd->pd_id & 0x00007fff); - - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag); - set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_VA_LOW_IDX, 0); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_LOW_IDX, 0); - set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, 0); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX, (page_count * 8)); - cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_STAG_PBL_BLK_SIZE); - barrier(); - - atomic_set(&cqp_request->refcount, 2); - nes_post_cqp_request(nesdev, cqp_request); - - /* Wait for CQP */ - ret = wait_event_timeout(cqp_request->waitq, - (0 != cqp_request->request_done), - NES_EVENT_TIMEOUT); - - nes_debug(NES_DBG_MR, "Allocate STag 0x%08X completed, " - "wait_event_timeout ret = %u, CQP Major:Minor codes = " - "0x%04X:0x%04X.\n", stag, ret, cqp_request->major_code, - cqp_request->minor_code); - major_code = cqp_request->major_code; - nes_put_cqp_request(nesdev, cqp_request); - - if (!ret || major_code) { - spin_lock_irqsave(&nesadapter->pbl_lock, flags); - nesadapter->free_4kpbl++; - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - } - - if (!ret) - return -ETIME; - else if (major_code) - return -EIO; - return 0; -} - -/* - * nes_alloc_mr - */ -static struct ib_mr *nes_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, - u32 max_num_sg, struct ib_udata *udata) -{ - struct nes_pd *nespd = to_nespd(ibpd); - struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_adapter *nesadapter = nesdev->nesadapter; - - u32 next_stag_index; - u8 stag_key = 0; - u32 driver_key = 0; - int err = 0; - u32 stag_index = 0; - struct nes_mr *nesmr; - u32 stag; - int ret; - struct ib_mr *ibmr; - - if (mr_type != IB_MR_TYPE_MEM_REG) - return ERR_PTR(-EINVAL); - - if (max_num_sg > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64))) - return ERR_PTR(-E2BIG); - -/* - * Note: Set to always use a fixed length single page entry PBL. This is to allow - * for the fast_reg_mr operation to always know the size of the PBL. 
- */ - if (max_num_sg > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64))) - return ERR_PTR(-E2BIG); - - get_random_bytes(&next_stag_index, sizeof(next_stag_index)); - stag_key = (u8)next_stag_index; - next_stag_index >>= 8; - next_stag_index %= nesadapter->max_mr; - - err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, - nesadapter->max_mr, &stag_index, - &next_stag_index, NES_RESOURCE_FAST_MR); - if (err) - return ERR_PTR(err); - - nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); - if (!nesmr) { - nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - return ERR_PTR(-ENOMEM); - } - - stag = stag_index << 8; - stag |= driver_key; - stag += (u32)stag_key; - - nes_debug(NES_DBG_MR, "Allocating STag 0x%08X index = 0x%08X\n", - stag, stag_index); - - ret = alloc_fast_reg_mr(nesdev, nespd, stag, max_num_sg); - - if (ret == 0) { - nesmr->ibmr.rkey = stag; - nesmr->ibmr.lkey = stag; - nesmr->mode = IWNES_MEMREG_TYPE_FMEM; - ibmr = &nesmr->ibmr; - } else { - kfree(nesmr); - nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - return ERR_PTR(-ENOMEM); - } - - nesmr->pages = pci_alloc_consistent(nesdev->pcidev, - max_num_sg * sizeof(u64), - &nesmr->paddr); - if (!nesmr->paddr) - goto err; - - nesmr->max_pages = max_num_sg; - - return ibmr; - -err: - nes_dereg_mr(ibmr, udata); - - return ERR_PTR(-ENOMEM); -} - -static int nes_set_page(struct ib_mr *ibmr, u64 addr) -{ - struct nes_mr *nesmr = to_nesmr(ibmr); - - if (unlikely(nesmr->npages == nesmr->max_pages)) - return -ENOMEM; - - nesmr->pages[nesmr->npages++] = cpu_to_le64(addr); - - return 0; -} - -static int nes_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, - int sg_nents, unsigned int *sg_offset) -{ - struct nes_mr *nesmr = to_nesmr(ibmr); - - nesmr->npages = 0; - - return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, nes_set_page); -} - -/** - * nes_query_device - */ -static int nes_query_device(struct ib_device *ibdev, struct ib_device_attr *props, - struct ib_udata *uhw) -{ - struct nes_vnic *nesvnic = to_nesvnic(ibdev); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_ib_device *nesibdev = nesvnic->nesibdev; - - if (uhw->inlen || uhw->outlen) - return -EINVAL; - - memset(props, 0, sizeof(*props)); - memcpy(&props->sys_image_guid, nesvnic->netdev->dev_addr, 6); - - props->fw_ver = nesdev->nesadapter->firmware_version; - props->device_cap_flags = nesdev->nesadapter->device_cap_flags; - props->vendor_id = nesdev->nesadapter->vendor_id; - props->vendor_part_id = nesdev->nesadapter->vendor_part_id; - props->hw_ver = nesdev->nesadapter->hw_rev; - props->max_mr_size = 0x80000000; - props->max_qp = nesibdev->max_qp; - props->max_qp_wr = nesdev->nesadapter->max_qp_wr - 2; - props->max_send_sge = nesdev->nesadapter->max_sge; - props->max_recv_sge = nesdev->nesadapter->max_sge; - props->max_cq = nesibdev->max_cq; - props->max_cqe = nesdev->nesadapter->max_cqe; - props->max_mr = nesibdev->max_mr; - props->max_mw = nesibdev->max_mr; - props->max_pd = nesibdev->max_pd; - props->max_sge_rd = 1; - switch (nesdev->nesadapter->max_irrq_wr) { - case 0: - props->max_qp_rd_atom = 2; - break; - case 1: - props->max_qp_rd_atom = 8; - break; - case 2: - props->max_qp_rd_atom = 32; - break; - case 3: - props->max_qp_rd_atom = 64; - break; - default: - props->max_qp_rd_atom = 0; - } - props->max_qp_init_rd_atom = props->max_qp_rd_atom; - props->atomic_cap = IB_ATOMIC_NONE; - props->max_map_per_fmr = 1; - - return 0; -} - - -/** - * nes_query_port - */ -static int nes_query_port(struct ib_device *ibdev, u8 port, 
struct ib_port_attr *props) -{ - struct nes_vnic *nesvnic = to_nesvnic(ibdev); - struct net_device *netdev = nesvnic->netdev; - - /* props being zeroed by the caller, avoid zeroing it here */ - - props->max_mtu = IB_MTU_4096; - props->active_mtu = ib_mtu_int_to_enum(netdev->mtu); - - props->lid = 1; - if (netif_queue_stopped(netdev)) - props->state = IB_PORT_DOWN; - else if (nesvnic->linkup) - props->state = IB_PORT_ACTIVE; - else - props->state = IB_PORT_DOWN; - props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP | - IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; - props->gid_tbl_len = 1; - props->pkey_tbl_len = 1; - props->active_width = IB_WIDTH_4X; - props->active_speed = IB_SPEED_SDR; - props->max_msg_sz = 0x80000000; - - return 0; -} - -/** - * nes_query_pkey - */ -static int nes_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) -{ - *pkey = 0; - return 0; -} - - -/** - * nes_query_gid - */ -static int nes_query_gid(struct ib_device *ibdev, u8 port, - int index, union ib_gid *gid) -{ - struct nes_vnic *nesvnic = to_nesvnic(ibdev); - - memset(&(gid->raw[0]), 0, sizeof(gid->raw)); - memcpy(&(gid->raw[0]), nesvnic->netdev->dev_addr, 6); - - return 0; -} - - -/** - * nes_alloc_ucontext - Allocate the user context data structure. This keeps track - * of all objects associated with a particular user-mode client. - */ -static int nes_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) -{ - struct ib_device *ibdev = uctx->device; - struct nes_vnic *nesvnic = to_nesvnic(ibdev); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_adapter *nesadapter = nesdev->nesadapter; - struct nes_alloc_ucontext_req req; - struct nes_alloc_ucontext_resp uresp = {}; - struct nes_ucontext *nes_ucontext = to_nesucontext(uctx); - struct nes_ib_device *nesibdev = nesvnic->nesibdev; - - - if (ib_copy_from_udata(&req, udata, sizeof(struct nes_alloc_ucontext_req))) { - printk(KERN_ERR PFX "Invalid structure size on allocate user context.\n"); - return -EINVAL; - } - - if (req.userspace_ver != NES_ABI_USERSPACE_VER) { - printk(KERN_ERR PFX "Invalid userspace driver version detected. 
Detected version %d, should be %d\n", - req.userspace_ver, NES_ABI_USERSPACE_VER); - return -EINVAL; - } - - - uresp.max_qps = nesibdev->max_qp; - uresp.max_pds = nesibdev->max_pd; - uresp.wq_size = nesdev->nesadapter->max_qp_wr * 2; - uresp.virtwq = nesadapter->virtwq; - uresp.kernel_ver = NES_ABI_KERNEL_VER; - - nes_ucontext->nesdev = nesdev; - nes_ucontext->mmap_wq_offset = uresp.max_pds; - nes_ucontext->mmap_cq_offset = nes_ucontext->mmap_wq_offset + - ((sizeof(struct nes_hw_qp_wqe) * uresp.max_qps * 2) + PAGE_SIZE-1) / - PAGE_SIZE; - - - if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) - return -EFAULT; - - INIT_LIST_HEAD(&nes_ucontext->cq_reg_mem_list); - INIT_LIST_HEAD(&nes_ucontext->qp_reg_mem_list); - return 0; -} - -/** - * nes_dealloc_ucontext - */ -static void nes_dealloc_ucontext(struct ib_ucontext *context) -{ - return; -} - -/** - * nes_mmap - */ -static int nes_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) -{ - unsigned long index; - struct nes_vnic *nesvnic = to_nesvnic(context->device); - struct nes_device *nesdev = nesvnic->nesdev; - /* struct nes_adapter *nesadapter = nesdev->nesadapter; */ - struct nes_ucontext *nes_ucontext; - struct nes_qp *nesqp; - - nes_ucontext = to_nesucontext(context); - - - if (vma->vm_pgoff >= nes_ucontext->mmap_wq_offset) { - index = (vma->vm_pgoff - nes_ucontext->mmap_wq_offset) * PAGE_SIZE; - index /= ((sizeof(struct nes_hw_qp_wqe) * nesdev->nesadapter->max_qp_wr * 2) + - PAGE_SIZE-1) & (~(PAGE_SIZE-1)); - if (!test_bit(index, nes_ucontext->allocated_wqs)) { - nes_debug(NES_DBG_MMAP, "wq %lu not allocated\n", index); - return -EFAULT; - } - nesqp = nes_ucontext->mmap_nesqp[index]; - if (nesqp == NULL) { - nes_debug(NES_DBG_MMAP, "wq %lu has a NULL QP base.\n", index); - return -EFAULT; - } - if (remap_pfn_range(vma, vma->vm_start, - virt_to_phys(nesqp->hwqp.sq_vbase) >> PAGE_SHIFT, - vma->vm_end - vma->vm_start, - vma->vm_page_prot)) { - nes_debug(NES_DBG_MMAP, "remap_pfn_range failed.\n"); - return -EAGAIN; - } - vma->vm_private_data = nesqp; - return 0; - } else { - index = vma->vm_pgoff; - if (!test_bit(index, nes_ucontext->allocated_doorbells)) - return -EFAULT; - - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - if (io_remap_pfn_range(vma, vma->vm_start, - (nesdev->doorbell_start + - ((nes_ucontext->mmap_db_index[index] - nesdev->base_doorbell_index) * 4096)) - >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot)) - return -EAGAIN; - vma->vm_private_data = nes_ucontext; - return 0; - } - - return -ENOSYS; -} - - -/** - * nes_alloc_pd - */ -static int nes_alloc_pd(struct ib_pd *pd, struct ib_udata *udata) -{ - struct ib_device *ibdev = pd->device; - struct nes_pd *nespd = to_nespd(pd); - struct nes_vnic *nesvnic = to_nesvnic(ibdev); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_adapter *nesadapter = nesdev->nesadapter; - struct nes_alloc_pd_resp uresp; - u32 pd_num = 0; - int err; - struct nes_ucontext *nesucontext = rdma_udata_to_drv_context( - udata, struct nes_ucontext, ibucontext); - - nes_debug( - NES_DBG_PD, - "nesvnic=%p, netdev=%p %s, ibdev=%p, context=%p, netdev refcnt=%u\n", - nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev, - &nesucontext->ibucontext, netdev_refcnt_read(nesvnic->netdev)); - - err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds, - nesadapter->max_pd, &pd_num, &nesadapter->next_pd, NES_RESOURCE_PD); - if (err) - return err; - - nes_debug(NES_DBG_PD, "Allocating PD (%p) for ib device %s\n", - nespd, dev_name(&nesvnic->nesibdev->ibdev.dev)); - - 
nespd->pd_id = (pd_num << (PAGE_SHIFT-12)) + nesadapter->base_pd; - - if (udata) { - nespd->mmap_db_index = find_next_zero_bit(nesucontext->allocated_doorbells, - NES_MAX_USER_DB_REGIONS, nesucontext->first_free_db); - nes_debug(NES_DBG_PD, "find_first_zero_biton doorbells returned %u, mapping pd_id %u.\n", - nespd->mmap_db_index, nespd->pd_id); - if (nespd->mmap_db_index >= NES_MAX_USER_DB_REGIONS) { - nes_debug(NES_DBG_PD, "mmap_db_index > MAX\n"); - nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num); - return -ENOMEM; - } - - uresp.pd_id = nespd->pd_id; - uresp.mmap_db_index = nespd->mmap_db_index; - if (ib_copy_to_udata(udata, &uresp, sizeof (struct nes_alloc_pd_resp))) { - nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num); - return -EFAULT; - } - - set_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells); - nesucontext->mmap_db_index[nespd->mmap_db_index] = nespd->pd_id; - nesucontext->first_free_db = nespd->mmap_db_index + 1; - } - - nes_debug(NES_DBG_PD, "PD%u structure located @%p.\n", nespd->pd_id, nespd); - return 0; -} - - -/** - * nes_dealloc_pd - */ -static void nes_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) -{ - struct nes_ucontext *nesucontext; - struct nes_pd *nespd = to_nespd(ibpd); - struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_adapter *nesadapter = nesdev->nesadapter; - - if (udata) { - nesucontext = - rdma_udata_to_drv_context( - udata, - struct nes_ucontext, - ibucontext); - nes_debug(NES_DBG_PD, "Clearing bit %u from allocated doorbells\n", - nespd->mmap_db_index); - clear_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells); - nesucontext->mmap_db_index[nespd->mmap_db_index] = 0; - if (nesucontext->first_free_db > nespd->mmap_db_index) { - nesucontext->first_free_db = nespd->mmap_db_index; - } - } - - nes_debug(NES_DBG_PD, "Deallocating PD%u structure located @%p.\n", - nespd->pd_id, nespd); - nes_free_resource(nesadapter, nesadapter->allocated_pds, - (nespd->pd_id-nesadapter->base_pd)>>(PAGE_SHIFT-12)); -} - - -/** - * nes_get_encoded_size - */ -static inline u8 nes_get_encoded_size(int *size) -{ - u8 encoded_size = 0; - if (*size <= 32) { - *size = 32; - encoded_size = 1; - } else if (*size <= 128) { - *size = 128; - encoded_size = 2; - } else if (*size <= 512) { - *size = 512; - encoded_size = 3; - } - return (encoded_size); -} - - - -/** - * nes_setup_virt_qp - */ -static int nes_setup_virt_qp(struct nes_qp *nesqp, struct nes_pbl *nespbl, - struct nes_vnic *nesvnic, int sq_size, int rq_size) -{ - unsigned long flags; - void *mem; - __le64 *pbl = NULL; - __le64 *tpbl; - __le64 *pblbuffer; - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_adapter *nesadapter = nesdev->nesadapter; - u32 pbl_entries; - u8 rq_pbl_entries; - u8 sq_pbl_entries; - - pbl_entries = nespbl->pbl_size >> 3; - nes_debug(NES_DBG_QP, "Userspace PBL, pbl_size=%u, pbl_entries = %d pbl_vbase=%p, pbl_pbase=%lx\n", - nespbl->pbl_size, pbl_entries, - (void *)nespbl->pbl_vbase, - (unsigned long) nespbl->pbl_pbase); - pbl = (__le64 *) nespbl->pbl_vbase; /* points to first pbl entry */ - /* now lets set the sq_vbase as well as rq_vbase addrs we will assign */ - /* the first pbl to be fro the rq_vbase... 
*/ - rq_pbl_entries = (rq_size * sizeof(struct nes_hw_qp_wqe)) >> 12; - sq_pbl_entries = (sq_size * sizeof(struct nes_hw_qp_wqe)) >> 12; - nesqp->hwqp.sq_pbase = (le32_to_cpu(((__le32 *)pbl)[0])) | ((u64)((le32_to_cpu(((__le32 *)pbl)[1]))) << 32); - if (!nespbl->page) { - nes_debug(NES_DBG_QP, "QP nespbl->page is NULL \n"); - kfree(nespbl); - return -ENOMEM; - } - - nesqp->hwqp.sq_vbase = kmap(nespbl->page); - nesqp->page = nespbl->page; - if (!nesqp->hwqp.sq_vbase) { - nes_debug(NES_DBG_QP, "QP sq_vbase kmap failed\n"); - kfree(nespbl); - return -ENOMEM; - } - - /* Now to get to sq.. we need to calculate how many */ - /* PBL entries were used by the rq.. */ - pbl += sq_pbl_entries; - nesqp->hwqp.rq_pbase = (le32_to_cpu(((__le32 *)pbl)[0])) | ((u64)((le32_to_cpu(((__le32 *)pbl)[1]))) << 32); - /* nesqp->hwqp.rq_vbase = bus_to_virt(*pbl); */ - /*nesqp->hwqp.rq_vbase = phys_to_virt(*pbl); */ - - nes_debug(NES_DBG_QP, "QP sq_vbase= %p sq_pbase=%lx rq_vbase=%p rq_pbase=%lx\n", - nesqp->hwqp.sq_vbase, (unsigned long) nesqp->hwqp.sq_pbase, - nesqp->hwqp.rq_vbase, (unsigned long) nesqp->hwqp.rq_pbase); - spin_lock_irqsave(&nesadapter->pbl_lock, flags); - if (!nesadapter->free_256pbl) { - pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, - nespbl->pbl_pbase); - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - kunmap(nesqp->page); - kfree(nespbl); - return -ENOMEM; - } - nesadapter->free_256pbl--; - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - - nesqp->pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 256, &nesqp->pbl_pbase); - pblbuffer = nesqp->pbl_vbase; - if (!nesqp->pbl_vbase) { - /* memory allocated during nes_reg_user_mr() */ - pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, - nespbl->pbl_pbase); - kfree(nespbl); - spin_lock_irqsave(&nesadapter->pbl_lock, flags); - nesadapter->free_256pbl++; - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - kunmap(nesqp->page); - return -ENOMEM; - } - memset(nesqp->pbl_vbase, 0, 256); - /* fill in the page address in the pbl buffer.. 
*/ - tpbl = pblbuffer + 16; - pbl = (__le64 *)nespbl->pbl_vbase; - while (sq_pbl_entries--) - *tpbl++ = *pbl++; - tpbl = pblbuffer; - while (rq_pbl_entries--) - *tpbl++ = *pbl++; - - /* done with memory allocated during nes_reg_user_mr() */ - pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, - nespbl->pbl_pbase); - kfree(nespbl); - - nesqp->qp_mem_size = - max((u32)sizeof(struct nes_qp_context), ((u32)256)) + 256; /* this is Q2 */ - /* Round up to a multiple of a page */ - nesqp->qp_mem_size += PAGE_SIZE - 1; - nesqp->qp_mem_size &= ~(PAGE_SIZE - 1); - - mem = pci_alloc_consistent(nesdev->pcidev, nesqp->qp_mem_size, - &nesqp->hwqp.q2_pbase); - - if (!mem) { - pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase); - nesqp->pbl_vbase = NULL; - spin_lock_irqsave(&nesadapter->pbl_lock, flags); - nesadapter->free_256pbl++; - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - kunmap(nesqp->page); - return -ENOMEM; - } - nesqp->sq_kmapped = 1; - nesqp->hwqp.q2_vbase = mem; - mem += 256; - memset(nesqp->hwqp.q2_vbase, 0, 256); - nesqp->nesqp_context = mem; - memset(nesqp->nesqp_context, 0, sizeof(*nesqp->nesqp_context)); - nesqp->nesqp_context_pbase = nesqp->hwqp.q2_pbase + 256; - - return 0; -} - - -/** - * nes_setup_mmap_qp - */ -static int nes_setup_mmap_qp(struct nes_qp *nesqp, struct nes_vnic *nesvnic, - int sq_size, int rq_size) -{ - void *mem; - struct nes_device *nesdev = nesvnic->nesdev; - - nesqp->qp_mem_size = (sizeof(struct nes_hw_qp_wqe) * sq_size) + - (sizeof(struct nes_hw_qp_wqe) * rq_size) + - max((u32)sizeof(struct nes_qp_context), ((u32)256)) + - 256; /* this is Q2 */ - /* Round up to a multiple of a page */ - nesqp->qp_mem_size += PAGE_SIZE - 1; - nesqp->qp_mem_size &= ~(PAGE_SIZE - 1); - - mem = pci_alloc_consistent(nesdev->pcidev, nesqp->qp_mem_size, - &nesqp->hwqp.sq_pbase); - if (!mem) - return -ENOMEM; - nes_debug(NES_DBG_QP, "PCI consistent memory for " - "host descriptor rings located @ %p (pa = 0x%08lX.) size = %u.\n", - mem, (unsigned long)nesqp->hwqp.sq_pbase, nesqp->qp_mem_size); - - memset(mem, 0, nesqp->qp_mem_size); - - nesqp->hwqp.sq_vbase = mem; - mem += sizeof(struct nes_hw_qp_wqe) * sq_size; - - nesqp->hwqp.rq_vbase = mem; - nesqp->hwqp.rq_pbase = nesqp->hwqp.sq_pbase + - sizeof(struct nes_hw_qp_wqe) * sq_size; - mem += sizeof(struct nes_hw_qp_wqe) * rq_size; - - nesqp->hwqp.q2_vbase = mem; - nesqp->hwqp.q2_pbase = nesqp->hwqp.rq_pbase + - sizeof(struct nes_hw_qp_wqe) * rq_size; - mem += 256; - memset(nesqp->hwqp.q2_vbase, 0, 256); - - nesqp->nesqp_context = mem; - nesqp->nesqp_context_pbase = nesqp->hwqp.q2_pbase + 256; - memset(nesqp->nesqp_context, 0, sizeof(*nesqp->nesqp_context)); - return 0; -} - - -/** - * nes_free_qp_mem() is to free up the qp's pci_alloc_consistent() memory. 
- */ -static void nes_free_qp_mem(struct nes_device *nesdev, - struct nes_qp *nesqp, int virt_wqs) -{ - unsigned long flags; - struct nes_adapter *nesadapter = nesdev->nesadapter; - if (!virt_wqs) { - pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, - nesqp->hwqp.sq_vbase, nesqp->hwqp.sq_pbase); - }else { - spin_lock_irqsave(&nesadapter->pbl_lock, flags); - nesadapter->free_256pbl++; - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, nesqp->hwqp.q2_vbase, nesqp->hwqp.q2_pbase); - pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase ); - nesqp->pbl_vbase = NULL; - if (nesqp->sq_kmapped) { - nesqp->sq_kmapped = 0; - kunmap(nesqp->page); - } - } -} - - -/** - * nes_create_qp - */ -static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, - struct ib_qp_init_attr *init_attr, struct ib_udata *udata) -{ - u64 u64temp= 0; - u64 u64nesqp = 0; - struct nes_pd *nespd = to_nespd(ibpd); - struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_adapter *nesadapter = nesdev->nesadapter; - struct nes_qp *nesqp; - struct nes_cq *nescq; - struct nes_ucontext *nes_ucontext = rdma_udata_to_drv_context( - udata, struct nes_ucontext, ibucontext); - struct nes_hw_cqp_wqe *cqp_wqe; - struct nes_cqp_request *cqp_request; - struct nes_create_qp_req req; - struct nes_create_qp_resp uresp; - struct nes_pbl *nespbl = NULL; - u32 qp_num = 0; - u32 opcode = 0; - /* u32 counter = 0; */ - void *mem; - unsigned long flags; - int ret; - int err; - int virt_wqs = 0; - int sq_size; - int rq_size; - u8 sq_encoded_size; - u8 rq_encoded_size; - /* int counter; */ - - if (init_attr->create_flags) - return ERR_PTR(-EINVAL); - - atomic_inc(&qps_created); - switch (init_attr->qp_type) { - case IB_QPT_RC: - if (nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) { - init_attr->cap.max_inline_data = 0; - } else { - init_attr->cap.max_inline_data = 64; - } - sq_size = init_attr->cap.max_send_wr; - rq_size = init_attr->cap.max_recv_wr; - - /* check if the encoded sizes are OK or not... */ - sq_encoded_size = nes_get_encoded_size(&sq_size); - rq_encoded_size = nes_get_encoded_size(&rq_size); - - if ((!sq_encoded_size) || (!rq_encoded_size)) { - nes_debug(NES_DBG_QP, "ERROR bad rq (%u) or sq (%u) size\n", - rq_size, sq_size); - return ERR_PTR(-EINVAL); - } - - init_attr->cap.max_send_wr = sq_size -2; - init_attr->cap.max_recv_wr = rq_size -1; - nes_debug(NES_DBG_QP, "RQ size=%u, SQ Size=%u\n", rq_size, sq_size); - - ret = nes_alloc_resource(nesadapter, nesadapter->allocated_qps, - nesadapter->max_qp, &qp_num, &nesadapter->next_qp, NES_RESOURCE_QP); - if (ret) { - return ERR_PTR(ret); - } - - /* Need 512 (actually now 1024) byte alignment on this structure */ - mem = kzalloc(sizeof(*nesqp)+NES_SW_CONTEXT_ALIGN-1, GFP_KERNEL); - if (!mem) { - nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); - return ERR_PTR(-ENOMEM); - } - u64nesqp = (unsigned long)mem; - u64nesqp += ((u64)NES_SW_CONTEXT_ALIGN) - 1; - u64temp = ((u64)NES_SW_CONTEXT_ALIGN) - 1; - u64nesqp &= ~u64temp; - nesqp = (struct nes_qp *)(unsigned long)u64nesqp; - /* nes_debug(NES_DBG_QP, "nesqp=%p, allocated buffer=%p. 
Rounded to closest %u\n", - nesqp, mem, NES_SW_CONTEXT_ALIGN); */ - nesqp->allocated_buffer = mem; - - if (udata) { - if (ib_copy_from_udata(&req, udata, sizeof(struct nes_create_qp_req))) { - nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); - kfree(nesqp->allocated_buffer); - nes_debug(NES_DBG_QP, "ib_copy_from_udata() Failed \n"); - return ERR_PTR(-EFAULT); - } - if (req.user_wqe_buffers) { - virt_wqs = 1; - } - if (req.user_qp_buffer) - nesqp->nesuqp_addr = req.user_qp_buffer; - - nesqp->user_mode = 1; - if (virt_wqs) { - err = 1; - list_for_each_entry(nespbl, &nes_ucontext->qp_reg_mem_list, list) { - if (nespbl->user_base == (unsigned long )req.user_wqe_buffers) { - list_del(&nespbl->list); - err = 0; - nes_debug(NES_DBG_QP, "Found PBL for virtual QP. nespbl=%p. user_base=0x%lx\n", - nespbl, nespbl->user_base); - break; - } - } - if (err) { - nes_debug(NES_DBG_QP, "Didn't Find PBL for virtual QP. address = %llx.\n", - (long long unsigned int)req.user_wqe_buffers); - nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); - kfree(nesqp->allocated_buffer); - return ERR_PTR(-EFAULT); - } - } - - nesqp->mmap_sq_db_index = - find_next_zero_bit(nes_ucontext->allocated_wqs, - NES_MAX_USER_WQ_REGIONS, nes_ucontext->first_free_wq); - /* nes_debug(NES_DBG_QP, "find_first_zero_biton wqs returned %u\n", - nespd->mmap_db_index); */ - if (nesqp->mmap_sq_db_index >= NES_MAX_USER_WQ_REGIONS) { - nes_debug(NES_DBG_QP, - "db index > max user regions, failing create QP\n"); - nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); - if (virt_wqs) { - pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, - nespbl->pbl_pbase); - kfree(nespbl); - } - kfree(nesqp->allocated_buffer); - return ERR_PTR(-ENOMEM); - } - set_bit(nesqp->mmap_sq_db_index, nes_ucontext->allocated_wqs); - nes_ucontext->mmap_nesqp[nesqp->mmap_sq_db_index] = nesqp; - nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index + 1; - } - err = (!virt_wqs) ? 
nes_setup_mmap_qp(nesqp, nesvnic, sq_size, rq_size) : - nes_setup_virt_qp(nesqp, nespbl, nesvnic, sq_size, rq_size); - if (err) { - nes_debug(NES_DBG_QP, - "error geting qp mem code = %d\n", err); - nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); - kfree(nesqp->allocated_buffer); - return ERR_PTR(-ENOMEM); - } - - nesqp->hwqp.sq_size = sq_size; - nesqp->hwqp.sq_encoded_size = sq_encoded_size; - nesqp->hwqp.sq_head = 1; - nesqp->hwqp.rq_size = rq_size; - nesqp->hwqp.rq_encoded_size = rq_encoded_size; - /* nes_debug(NES_DBG_QP, "nesqp->nesqp_context_pbase = %p\n", - (void *)nesqp->nesqp_context_pbase); - */ - nesqp->hwqp.qp_id = qp_num; - nesqp->ibqp.qp_num = nesqp->hwqp.qp_id; - nesqp->nespd = nespd; - - nescq = to_nescq(init_attr->send_cq); - nesqp->nesscq = nescq; - nescq = to_nescq(init_attr->recv_cq); - nesqp->nesrcq = nescq; - - nesqp->nesqp_context->misc |= cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << - NES_QPCONTEXT_MISC_PCI_FCN_SHIFT); - nesqp->nesqp_context->misc |= cpu_to_le32((u32)nesqp->hwqp.rq_encoded_size << - NES_QPCONTEXT_MISC_RQ_SIZE_SHIFT); - nesqp->nesqp_context->misc |= cpu_to_le32((u32)nesqp->hwqp.sq_encoded_size << - NES_QPCONTEXT_MISC_SQ_SIZE_SHIFT); - if (!udata) { - nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_PRIV_EN); - nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_FAST_REGISTER_EN); - } - nesqp->nesqp_context->cqs = cpu_to_le32(nesqp->nesscq->hw_cq.cq_number + - ((u32)nesqp->nesrcq->hw_cq.cq_number << 16)); - u64temp = (u64)nesqp->hwqp.sq_pbase; - nesqp->nesqp_context->sq_addr_low = cpu_to_le32((u32)u64temp); - nesqp->nesqp_context->sq_addr_high = cpu_to_le32((u32)(u64temp >> 32)); - - - if (!virt_wqs) { - u64temp = (u64)nesqp->hwqp.sq_pbase; - nesqp->nesqp_context->sq_addr_low = cpu_to_le32((u32)u64temp); - nesqp->nesqp_context->sq_addr_high = cpu_to_le32((u32)(u64temp >> 32)); - u64temp = (u64)nesqp->hwqp.rq_pbase; - nesqp->nesqp_context->rq_addr_low = cpu_to_le32((u32)u64temp); - nesqp->nesqp_context->rq_addr_high = cpu_to_le32((u32)(u64temp >> 32)); - } else { - u64temp = (u64)nesqp->pbl_pbase; - nesqp->nesqp_context->rq_addr_low = cpu_to_le32((u32)u64temp); - nesqp->nesqp_context->rq_addr_high = cpu_to_le32((u32)(u64temp >> 32)); - } - - /* nes_debug(NES_DBG_QP, "next_qp_nic_index=%u, using nic_index=%d\n", - nesvnic->next_qp_nic_index, - nesvnic->qp_nic_index[nesvnic->next_qp_nic_index]); */ - spin_lock_irqsave(&nesdev->cqp.lock, flags); - nesqp->nesqp_context->misc2 |= cpu_to_le32( - (u32)nesvnic->qp_nic_index[nesvnic->next_qp_nic_index] << - NES_QPCONTEXT_MISC2_NIC_INDEX_SHIFT); - nesvnic->next_qp_nic_index++; - if ((nesvnic->next_qp_nic_index > 3) || - (nesvnic->qp_nic_index[nesvnic->next_qp_nic_index] == 0xf)) { - nesvnic->next_qp_nic_index = 0; - } - spin_unlock_irqrestore(&nesdev->cqp.lock, flags); - - nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32((u32)nesqp->nespd->pd_id << 16); - u64temp = (u64)nesqp->hwqp.q2_pbase; - nesqp->nesqp_context->q2_addr_low = cpu_to_le32((u32)u64temp); - nesqp->nesqp_context->q2_addr_high = cpu_to_le32((u32)(u64temp >> 32)); - nesqp->nesqp_context->aeq_token_low = cpu_to_le32((u32)((unsigned long)(nesqp))); - nesqp->nesqp_context->aeq_token_high = cpu_to_le32((u32)(upper_32_bits((unsigned long)(nesqp)))); - nesqp->nesqp_context->ird_ord_sizes = cpu_to_le32(NES_QPCONTEXT_ORDIRD_ALSMM | - NES_QPCONTEXT_ORDIRD_AAH | - ((((u32)nesadapter->max_irrq_wr) << - NES_QPCONTEXT_ORDIRD_IRDSIZE_SHIFT) & NES_QPCONTEXT_ORDIRD_IRDSIZE_MASK)); - if (disable_mpa_crc) { - 
nes_debug(NES_DBG_QP, "Disabling MPA crc checking due to module option.\n"); - nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(NES_QPCONTEXT_ORDIRD_RNMC); - } - - - /* Create the QP */ - cqp_request = nes_get_cqp_request(nesdev); - if (cqp_request == NULL) { - nes_debug(NES_DBG_QP, "Failed to get a cqp_request\n"); - nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); - nes_free_qp_mem(nesdev, nesqp,virt_wqs); - kfree(nesqp->allocated_buffer); - return ERR_PTR(-ENOMEM); - } - cqp_request->waiting = 1; - cqp_wqe = &cqp_request->cqp_wqe; - - if (!virt_wqs) { - opcode = NES_CQP_CREATE_QP | NES_CQP_QP_TYPE_IWARP | - NES_CQP_QP_IWARP_STATE_IDLE; - } else { - opcode = NES_CQP_CREATE_QP | NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_VIRT_WQS | - NES_CQP_QP_IWARP_STATE_IDLE; - } - opcode |= NES_CQP_QP_CQS_VALID; - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); - - u64temp = (u64)nesqp->nesqp_context_pbase; - set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); - - atomic_set(&cqp_request->refcount, 2); - nes_post_cqp_request(nesdev, cqp_request); - - /* Wait for CQP */ - nes_debug(NES_DBG_QP, "Waiting for create iWARP QP%u to complete.\n", - nesqp->hwqp.qp_id); - ret = wait_event_timeout(cqp_request->waitq, - (cqp_request->request_done != 0), NES_EVENT_TIMEOUT); - nes_debug(NES_DBG_QP, "Create iwarp QP%u completed, wait_event_timeout ret=%u," - " nesdev->cqp_head = %u, nesdev->cqp.sq_tail = %u," - " CQP Major:Minor codes = 0x%04X:0x%04X.\n", - nesqp->hwqp.qp_id, ret, nesdev->cqp.sq_head, nesdev->cqp.sq_tail, - cqp_request->major_code, cqp_request->minor_code); - if ((!ret) || (cqp_request->major_code)) { - nes_put_cqp_request(nesdev, cqp_request); - nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); - nes_free_qp_mem(nesdev, nesqp,virt_wqs); - kfree(nesqp->allocated_buffer); - if (!ret) { - return ERR_PTR(-ETIME); - } else { - return ERR_PTR(-EIO); - } - } - - nes_put_cqp_request(nesdev, cqp_request); - - if (udata) { - uresp.mmap_sq_db_index = nesqp->mmap_sq_db_index; - uresp.mmap_rq_db_index = 0; - uresp.actual_sq_size = sq_size; - uresp.actual_rq_size = rq_size; - uresp.qp_id = nesqp->hwqp.qp_id; - uresp.nes_drv_opt = nes_drv_opt; - if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) { - nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); - nes_free_qp_mem(nesdev, nesqp,virt_wqs); - kfree(nesqp->allocated_buffer); - return ERR_PTR(-EFAULT); - } - } - - nes_debug(NES_DBG_QP, "QP%u structure located @%p.Size = %u.\n", - nesqp->hwqp.qp_id, nesqp, (u32)sizeof(*nesqp)); - spin_lock_init(&nesqp->lock); - nes_add_ref(&nesqp->ibqp); - break; - default: - nes_debug(NES_DBG_QP, "Invalid QP type: %d\n", init_attr->qp_type); - return ERR_PTR(-EINVAL); - } - init_completion(&nesqp->sq_drained); - init_completion(&nesqp->rq_drained); - - nesqp->sig_all = (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR); - timer_setup(&nesqp->terminate_timer, nes_terminate_timeout, 0); - - /* update the QP table */ - nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp; - nes_debug(NES_DBG_QP, "netdev refcnt=%u\n", - netdev_refcnt_read(nesvnic->netdev)); - - return &nesqp->ibqp; -} - -/** - * nes_clean_cq - */ -static void nes_clean_cq(struct nes_qp *nesqp, struct nes_cq *nescq) -{ - u32 cq_head; - u32 lo; - u32 hi; - u64 u64temp; - unsigned long flags = 0; - - spin_lock_irqsave(&nescq->lock, flags); - 
- cq_head = nescq->hw_cq.cq_head; - while (le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_VALID) { - rmb(); - lo = le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]); - hi = le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX]); - u64temp = (((u64)hi) << 32) | ((u64)lo); - u64temp &= ~(NES_SW_CONTEXT_ALIGN-1); - if (u64temp == (u64)(unsigned long)nesqp) { - /* Zero the context value so cqe will be ignored */ - nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX] = 0; - nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX] = 0; - } - - if (++cq_head >= nescq->hw_cq.cq_size) - cq_head = 0; - } - - spin_unlock_irqrestore(&nescq->lock, flags); -} - - -/** - * nes_destroy_qp - */ -static int nes_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) -{ - struct nes_qp *nesqp = to_nesqp(ibqp); - struct nes_ucontext *nes_ucontext; - struct ib_qp_attr attr; - struct iw_cm_id *cm_id; - struct iw_cm_event cm_event; - int ret = 0; - - atomic_inc(&sw_qps_destroyed); - nesqp->destroyed = 1; - - /* Blow away the connection if it exists. */ - if (nesqp->ibqp_state >= IB_QPS_INIT && nesqp->ibqp_state <= IB_QPS_RTS) { - /* if (nesqp->ibqp_state == IB_QPS_RTS) { */ - attr.qp_state = IB_QPS_ERR; - nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL); - } - - if (((nesqp->ibqp_state == IB_QPS_INIT) || - (nesqp->ibqp_state == IB_QPS_RTR)) && (nesqp->cm_id)) { - cm_id = nesqp->cm_id; - cm_event.event = IW_CM_EVENT_CONNECT_REPLY; - cm_event.status = -ETIMEDOUT; - cm_event.local_addr = cm_id->local_addr; - cm_event.remote_addr = cm_id->remote_addr; - cm_event.private_data = NULL; - cm_event.private_data_len = 0; - - nes_debug(NES_DBG_QP, "Generating a CM Timeout Event for " - "QP%u. cm_id = %p, refcount = %u. 
\n", - nesqp->hwqp.qp_id, cm_id, atomic_read(&nesqp->refcount)); - - cm_id->rem_ref(cm_id); - ret = cm_id->event_handler(cm_id, &cm_event); - if (ret) - nes_debug(NES_DBG_QP, "OFA CM event_handler returned, ret=%d\n", ret); - } - - if (nesqp->user_mode) { - if (udata) { - nes_ucontext = - rdma_udata_to_drv_context( - udata, - struct nes_ucontext, - ibucontext); - clear_bit(nesqp->mmap_sq_db_index, nes_ucontext->allocated_wqs); - nes_ucontext->mmap_nesqp[nesqp->mmap_sq_db_index] = NULL; - if (nes_ucontext->first_free_wq > nesqp->mmap_sq_db_index) { - nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index; - } - } - if (nesqp->pbl_pbase && nesqp->sq_kmapped) { - nesqp->sq_kmapped = 0; - kunmap(nesqp->page); - } - } else { - /* Clean any pending completions from the cq(s) */ - if (nesqp->nesscq) - nes_clean_cq(nesqp, nesqp->nesscq); - - if ((nesqp->nesrcq) && (nesqp->nesrcq != nesqp->nesscq)) - nes_clean_cq(nesqp, nesqp->nesrcq); - } - nes_rem_ref(&nesqp->ibqp); - return 0; -} - - -/** - * nes_create_cq - */ -static int nes_create_cq(struct ib_cq *ibcq, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) -{ - struct ib_device *ibdev = ibcq->device; - int entries = attr->cqe; - u64 u64temp; - struct nes_vnic *nesvnic = to_nesvnic(ibdev); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_adapter *nesadapter = nesdev->nesadapter; - struct nes_cq *nescq = to_nescq(ibcq); - struct nes_ucontext *nes_ucontext = NULL; - struct nes_cqp_request *cqp_request; - void *mem = NULL; - struct nes_hw_cqp_wqe *cqp_wqe; - struct nes_pbl *nespbl = NULL; - struct nes_create_cq_req req; - struct nes_create_cq_resp resp; - u32 cq_num = 0; - u32 opcode = 0; - u32 pbl_entries = 1; - int err; - unsigned long flags; - int ret; - - if (attr->flags) - return -EINVAL; - - if (entries > nesadapter->max_cqe) - return -EINVAL; - - err = nes_alloc_resource(nesadapter, nesadapter->allocated_cqs, - nesadapter->max_cq, &cq_num, &nesadapter->next_cq, NES_RESOURCE_CQ); - if (err) - return err; - - nescq->hw_cq.cq_size = max(entries + 1, 5); - nescq->hw_cq.cq_number = cq_num; - nescq->ibcq.cqe = nescq->hw_cq.cq_size - 1; - - if (udata) { - struct nes_ucontext *nes_ucontext = rdma_udata_to_drv_context( - udata, struct nes_ucontext, ibucontext); - - if (ib_copy_from_udata(&req, udata, - sizeof(struct nes_create_cq_req))) { - nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - return -EFAULT; - } - nesvnic->mcrq_ucontext = nes_ucontext; - nes_ucontext->mcrqf = req.mcrqf; - if (nes_ucontext->mcrqf) { - if (nes_ucontext->mcrqf & 0x80000000) - nescq->hw_cq.cq_number = nesvnic->nic.qp_id + 28 + 2 * ((nes_ucontext->mcrqf & 0xf) - 1); - else if (nes_ucontext->mcrqf & 0x40000000) - nescq->hw_cq.cq_number = nes_ucontext->mcrqf & 0xffff; - else - nescq->hw_cq.cq_number = nesvnic->mcrq_qp_id + nes_ucontext->mcrqf-1; - nescq->mcrqf = nes_ucontext->mcrqf; - nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - } - err = 1; - list_for_each_entry(nespbl, &nes_ucontext->cq_reg_mem_list, list) { - if (nespbl->user_base == (unsigned long )req.user_cq_buffer) { - list_del(&nespbl->list); - err = 0; - nes_debug(NES_DBG_CQ, "Found PBL for virtual CQ. 
nespbl=%p.\n", - nespbl); - break; - } - } - if (err) { - nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - return -EFAULT; - } - - pbl_entries = nespbl->pbl_size >> 3; - nescq->cq_mem_size = 0; - } else { - nescq->cq_mem_size = nescq->hw_cq.cq_size * sizeof(struct nes_hw_cqe); - nes_debug(NES_DBG_CQ, "Attempting to allocate pci memory (%u entries, %u bytes) for CQ%u.\n", - entries, nescq->cq_mem_size, nescq->hw_cq.cq_number); - - /* allocate the physical buffer space */ - mem = pci_zalloc_consistent(nesdev->pcidev, nescq->cq_mem_size, - &nescq->hw_cq.cq_pbase); - if (!mem) { - printk(KERN_ERR PFX "Unable to allocate pci memory for cq\n"); - nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - return -ENOMEM; - } - - nescq->hw_cq.cq_vbase = mem; - nescq->hw_cq.cq_head = 0; - } - - nescq->hw_cq.ce_handler = nes_iwarp_ce_handler; - spin_lock_init(&nescq->lock); - - /* send CreateCQ request to CQP */ - cqp_request = nes_get_cqp_request(nesdev); - if (cqp_request == NULL) { - nes_debug(NES_DBG_CQ, "Failed to get a cqp_request.\n"); - if (!udata) - pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem, - nescq->hw_cq.cq_pbase); - else { - pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, - nespbl->pbl_vbase, nespbl->pbl_pbase); - kfree(nespbl); - } - - nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - return -ENOMEM; - } - cqp_request->waiting = 1; - cqp_wqe = &cqp_request->cqp_wqe; - - opcode = NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID | - NES_CQP_CQ_CHK_OVERFLOW | - NES_CQP_CQ_CEQE_MASK | ((u32)nescq->hw_cq.cq_size << 16); - - spin_lock_irqsave(&nesadapter->pbl_lock, flags); - - if (pbl_entries != 1) { - if (pbl_entries > 32) { - /* use 4k pbl */ - nes_debug(NES_DBG_CQ, "pbl_entries=%u, use a 4k PBL\n", pbl_entries); - if (nesadapter->free_4kpbl == 0) { - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - nes_free_cqp_request(nesdev, cqp_request); - if (!udata) - pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem, - nescq->hw_cq.cq_pbase); - else { - pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, - nespbl->pbl_vbase, nespbl->pbl_pbase); - kfree(nespbl); - } - nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - return -ENOMEM; - } else { - opcode |= (NES_CQP_CQ_VIRT | NES_CQP_CQ_4KB_CHUNK); - nescq->virtual_cq = 2; - nesadapter->free_4kpbl--; - } - } else { - /* use 256 byte pbl */ - nes_debug(NES_DBG_CQ, "pbl_entries=%u, use a 256 byte PBL\n", pbl_entries); - if (nesadapter->free_256pbl == 0) { - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - nes_free_cqp_request(nesdev, cqp_request); - if (!udata) - pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem, - nescq->hw_cq.cq_pbase); - else { - pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, - nespbl->pbl_vbase, nespbl->pbl_pbase); - kfree(nespbl); - } - nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - return -ENOMEM; - } else { - opcode |= NES_CQP_CQ_VIRT; - nescq->virtual_cq = 1; - nesadapter->free_256pbl--; - } - } - } - - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, - (nescq->hw_cq.cq_number | ((u32)nesdev->ceq_index << 16))); - - if (udata) { - if (pbl_entries != 1) - u64temp = (u64)nespbl->pbl_pbase; - else - u64temp = le64_to_cpu(nespbl->pbl_vbase[0]); - set_wqe_32bit_value(cqp_wqe->wqe_words, 
NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX, - nes_ucontext->mmap_db_index[0]); - } else { - u64temp = (u64)nescq->hw_cq.cq_pbase; - cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0; - } - set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); - cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0; - u64temp = (u64)(unsigned long)&nescq->hw_cq; - cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = - cpu_to_le32((u32)(u64temp >> 1)); - cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = - cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF); - - atomic_set(&cqp_request->refcount, 2); - nes_post_cqp_request(nesdev, cqp_request); - - /* Wait for CQP */ - nes_debug(NES_DBG_CQ, "Waiting for create iWARP CQ%u to complete.\n", - nescq->hw_cq.cq_number); - ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done), - NES_EVENT_TIMEOUT * 2); - nes_debug(NES_DBG_CQ, "Create iWARP CQ%u completed, wait_event_timeout ret = %d.\n", - nescq->hw_cq.cq_number, ret); - if ((!ret) || (cqp_request->major_code)) { - nes_put_cqp_request(nesdev, cqp_request); - if (!udata) - pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem, - nescq->hw_cq.cq_pbase); - else { - pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, - nespbl->pbl_vbase, nespbl->pbl_pbase); - kfree(nespbl); - } - nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - return -EIO; - } - nes_put_cqp_request(nesdev, cqp_request); - - if (udata) { - /* free the nespbl */ - pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, - nespbl->pbl_pbase); - kfree(nespbl); - resp.cq_id = nescq->hw_cq.cq_number; - resp.cq_size = nescq->hw_cq.cq_size; - resp.mmap_db_index = 0; - if (ib_copy_to_udata(udata, &resp, - sizeof(resp) - sizeof(resp.reserved))) { - nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - return -EFAULT; - } - } - - return 0; -} - -/** - * nes_destroy_cq - */ -static void nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) -{ - struct nes_cq *nescq; - struct nes_device *nesdev; - struct nes_vnic *nesvnic; - struct nes_adapter *nesadapter; - struct nes_hw_cqp_wqe *cqp_wqe; - struct nes_cqp_request cqp_request = {}; - unsigned long flags; - u32 opcode = 0; - - nescq = to_nescq(ib_cq); - nesvnic = to_nesvnic(ib_cq->device); - nesdev = nesvnic->nesdev; - nesadapter = nesdev->nesadapter; - - nes_debug(NES_DBG_CQ, "Destroy CQ%u\n", nescq->hw_cq.cq_number); - - /* Send DestroyCQ request to CQP */ - INIT_LIST_HEAD(&cqp_request.list); - init_waitqueue_head(&cqp_request.waitq); - - cqp_request.waiting = 1; - cqp_wqe = &cqp_request.cqp_wqe; - opcode = NES_CQP_DESTROY_CQ | (nescq->hw_cq.cq_size << 16); - spin_lock_irqsave(&nesadapter->pbl_lock, flags); - if (nescq->virtual_cq == 1) { - nesadapter->free_256pbl++; - if (nesadapter->free_256pbl > nesadapter->max_256pbl) { - printk(KERN_ERR PFX "%s: free 256B PBLs(%u) has exceeded the max(%u)\n", - __func__, nesadapter->free_256pbl, nesadapter->max_256pbl); - } - } else if (nescq->virtual_cq == 2) { - nesadapter->free_4kpbl++; - if (nesadapter->free_4kpbl > nesadapter->max_4kpbl) { - printk(KERN_ERR PFX "%s: free 4K PBLs(%u) has exceeded the max(%u)\n", - __func__, nesadapter->free_4kpbl, nesadapter->max_4kpbl); - } - opcode |= NES_CQP_CQ_4KB_CHUNK; - } - - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); - set_wqe_32bit_value(cqp_wqe->wqe_words, 
NES_CQP_WQE_ID_IDX, - (nescq->hw_cq.cq_number | ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 16))); - if (!nescq->mcrqf) - nes_free_resource(nesadapter, nesadapter->allocated_cqs, nescq->hw_cq.cq_number); - - nes_post_cqp_request(nesdev, &cqp_request); - - /* Wait for CQP */ - nes_debug(NES_DBG_CQ, "Waiting for destroy iWARP CQ%u to complete.\n", - nescq->hw_cq.cq_number); - wait_event_timeout(cqp_request.waitq, cqp_request.request_done, - NES_EVENT_TIMEOUT); - nes_debug( - NES_DBG_CQ, - "Destroy iWARP CQ%u completed CQP Major:Minor codes = 0x%04X:0x%04X.\n", - nescq->hw_cq.cq_number, cqp_request.major_code, - cqp_request.minor_code); - - if (nescq->cq_mem_size) - pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, - nescq->hw_cq.cq_vbase, nescq->hw_cq.cq_pbase); -} - -/** - * root_256 - */ -static u32 root_256(struct nes_device *nesdev, - struct nes_root_vpbl *root_vpbl, - struct nes_root_vpbl *new_root, - u16 pbl_count_4k) -{ - u64 leaf_pbl; - int i, j, k; - - if (pbl_count_4k == 1) { - new_root->pbl_vbase = pci_alloc_consistent(nesdev->pcidev, - 512, &new_root->pbl_pbase); - - if (new_root->pbl_vbase == NULL) - return 0; - - leaf_pbl = (u64)root_vpbl->pbl_pbase; - for (i = 0; i < 16; i++) { - new_root->pbl_vbase[i].pa_low = - cpu_to_le32((u32)leaf_pbl); - new_root->pbl_vbase[i].pa_high = - cpu_to_le32((u32)((((u64)leaf_pbl) >> 32))); - leaf_pbl += 256; - } - } else { - for (i = 3; i >= 0; i--) { - j = i * 16; - root_vpbl->pbl_vbase[j] = root_vpbl->pbl_vbase[i]; - leaf_pbl = le32_to_cpu(root_vpbl->pbl_vbase[j].pa_low) + - (((u64)le32_to_cpu(root_vpbl->pbl_vbase[j].pa_high)) - << 32); - for (k = 1; k < 16; k++) { - leaf_pbl += 256; - root_vpbl->pbl_vbase[j + k].pa_low = - cpu_to_le32((u32)leaf_pbl); - root_vpbl->pbl_vbase[j + k].pa_high = - cpu_to_le32((u32)((((u64)leaf_pbl) >> 32))); - } - } - } - - return 1; -} - - -/** - * nes_reg_mr - */ -static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd, - u32 stag, u64 region_length, struct nes_root_vpbl *root_vpbl, - dma_addr_t single_buffer, u16 pbl_count_4k, - u16 residual_page_count_4k, int acc, u64 *iova_start, - u16 *actual_pbl_cnt, u8 *used_4k_pbls) -{ - struct nes_hw_cqp_wqe *cqp_wqe; - struct nes_cqp_request *cqp_request; - unsigned long flags; - int ret; - struct nes_adapter *nesadapter = nesdev->nesadapter; - uint pg_cnt = 0; - u16 pbl_count_256 = 0; - u16 pbl_count = 0; - u8 use_256_pbls = 0; - u8 use_4k_pbls = 0; - u16 use_two_level = (pbl_count_4k > 1) ? 
1 : 0; - struct nes_root_vpbl new_root = { 0, NULL, NULL }; - u32 opcode = 0; - u16 major_code; - - /* Register the region with the adapter */ - cqp_request = nes_get_cqp_request(nesdev); - if (cqp_request == NULL) { - nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n"); - return -ENOMEM; - } - cqp_request->waiting = 1; - cqp_wqe = &cqp_request->cqp_wqe; - - if (pbl_count_4k) { - spin_lock_irqsave(&nesadapter->pbl_lock, flags); - - pg_cnt = ((pbl_count_4k - 1) * 512) + residual_page_count_4k; - pbl_count_256 = (pg_cnt + 31) / 32; - if (pg_cnt <= 32) { - if (pbl_count_256 <= nesadapter->free_256pbl) - use_256_pbls = 1; - else if (pbl_count_4k <= nesadapter->free_4kpbl) - use_4k_pbls = 1; - } else if (pg_cnt <= 2048) { - if (((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) && - (nesadapter->free_4kpbl > (nesadapter->max_4kpbl >> 1))) { - use_4k_pbls = 1; - } else if ((pbl_count_256 + 1) <= nesadapter->free_256pbl) { - use_256_pbls = 1; - use_two_level = 1; - } else if ((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) { - use_4k_pbls = 1; - } - } else { - if ((pbl_count_4k + 1) <= nesadapter->free_4kpbl) - use_4k_pbls = 1; - } - - if (use_256_pbls) { - pbl_count = pbl_count_256; - nesadapter->free_256pbl -= pbl_count + use_two_level; - } else if (use_4k_pbls) { - pbl_count = pbl_count_4k; - nesadapter->free_4kpbl -= pbl_count + use_two_level; - } else { - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - nes_debug(NES_DBG_MR, "Out of Pbls\n"); - nes_free_cqp_request(nesdev, cqp_request); - return -ENOMEM; - } - - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - } - - if (use_256_pbls && use_two_level) { - if (root_256(nesdev, root_vpbl, &new_root, pbl_count_4k) == 1) { - if (new_root.pbl_pbase != 0) - root_vpbl = &new_root; - } else { - spin_lock_irqsave(&nesadapter->pbl_lock, flags); - nesadapter->free_256pbl += pbl_count_256 + use_two_level; - use_256_pbls = 0; - - if (pbl_count_4k == 1) - use_two_level = 0; - pbl_count = pbl_count_4k; - - if ((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) { - nesadapter->free_4kpbl -= pbl_count + use_two_level; - use_4k_pbls = 1; - } - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - - if (use_4k_pbls == 0) - return -ENOMEM; - } - } - - opcode = NES_CQP_REGISTER_STAG | NES_CQP_STAG_RIGHTS_LOCAL_READ | - NES_CQP_STAG_VA_TO | NES_CQP_STAG_MR; - if (acc & IB_ACCESS_LOCAL_WRITE) - opcode |= NES_CQP_STAG_RIGHTS_LOCAL_WRITE; - if (acc & IB_ACCESS_REMOTE_WRITE) - opcode |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_REM_ACC_EN; - if (acc & IB_ACCESS_REMOTE_READ) - opcode |= NES_CQP_STAG_RIGHTS_REMOTE_READ | NES_CQP_STAG_REM_ACC_EN; - if (acc & IB_ACCESS_MW_BIND) - opcode |= NES_CQP_STAG_RIGHTS_WINDOW_BIND | NES_CQP_STAG_REM_ACC_EN; - - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); - set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_VA_LOW_IDX, *iova_start); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_LOW_IDX, region_length); - - cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] = - cpu_to_le32((u32)(region_length >> 8) & 0xff000000); - cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] |= - cpu_to_le32(nespd->pd_id & 0x00007fff); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag); - - if (pbl_count == 0) { - set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, single_buffer); - } else { - set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, 
root_vpbl->pbl_pbase); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX, pbl_count); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX, (pg_cnt * 8)); - - if (use_4k_pbls) - cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_STAG_PBL_BLK_SIZE); - } - barrier(); - - nes_post_cqp_request(nesdev, cqp_request); - - /* Wait for CQP */ - ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done), - NES_EVENT_TIMEOUT); - nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u," - " CQP Major:Minor codes = 0x%04X:0x%04X.\n", - stag, ret, cqp_request->major_code, cqp_request->minor_code); - major_code = cqp_request->major_code; - nes_put_cqp_request(nesdev, cqp_request); - - if ((!ret || major_code) && pbl_count != 0) { - spin_lock_irqsave(&nesadapter->pbl_lock, flags); - if (use_256_pbls) - nesadapter->free_256pbl += pbl_count + use_two_level; - else if (use_4k_pbls) - nesadapter->free_4kpbl += pbl_count + use_two_level; - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - } - if (new_root.pbl_pbase) - pci_free_consistent(nesdev->pcidev, 512, new_root.pbl_vbase, - new_root.pbl_pbase); - - if (!ret) - return -ETIME; - else if (major_code) - return -EIO; - - *actual_pbl_cnt = pbl_count + use_two_level; - *used_4k_pbls = use_4k_pbls; - return 0; -} - - -/** - * nes_reg_phys_mr - */ -struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, u64 addr, u64 size, - int acc, u64 *iova_start) -{ - u64 region_length; - struct nes_pd *nespd = to_nespd(ib_pd); - struct nes_vnic *nesvnic = to_nesvnic(ib_pd->device); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_adapter *nesadapter = nesdev->nesadapter; - struct nes_mr *nesmr; - struct ib_mr *ibmr; - struct nes_vpbl vpbl; - struct nes_root_vpbl root_vpbl; - u32 stag; - unsigned long mask; - u32 stag_index = 0; - u32 next_stag_index = 0; - u32 driver_key = 0; - int err = 0; - int ret = 0; - u16 pbl_count = 0; - u8 single_page = 1; - u8 stag_key = 0; - - region_length = 0; - vpbl.pbl_vbase = NULL; - root_vpbl.pbl_vbase = NULL; - root_vpbl.pbl_pbase = 0; - - get_random_bytes(&next_stag_index, sizeof(next_stag_index)); - stag_key = (u8)next_stag_index; - - driver_key = 0; - - next_stag_index >>= 8; - next_stag_index %= nesadapter->max_mr; - - if ((addr ^ *iova_start) & ~PAGE_MASK) - return ERR_PTR(-EINVAL); - - err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, nesadapter->max_mr, - &stag_index, &next_stag_index, NES_RESOURCE_PHYS_MR); - if (err) { - return ERR_PTR(err); - } - - nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); - if (!nesmr) { - nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - return ERR_PTR(-ENOMEM); - } - - /* Allocate a 4K buffer for the PBL */ - vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, - &vpbl.pbl_pbase); - nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%016lX\n", - vpbl.pbl_vbase, (unsigned long)vpbl.pbl_pbase); - if (!vpbl.pbl_vbase) { - nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - ibmr = ERR_PTR(-ENOMEM); - kfree(nesmr); - goto reg_phys_err; - } - - - mask = !size; - - if (mask & ~PAGE_MASK) { - nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - nes_debug(NES_DBG_MR, "Invalid buffer addr or size\n"); - ibmr = ERR_PTR(-EINVAL); - kfree(nesmr); - goto reg_phys_err; - } - - region_length += size; - vpbl.pbl_vbase[0].pa_low = cpu_to_le32((u32)addr & PAGE_MASK); - vpbl.pbl_vbase[0].pa_high = 
cpu_to_le32((u32)((((u64)addr) >> 32))); - - stag = stag_index << 8; - stag |= driver_key; - stag += (u32)stag_key; - - nes_debug(NES_DBG_MR, "Registering STag 0x%08X, VA = 0x%016lX," - " length = 0x%016lX, index = 0x%08X\n", - stag, (unsigned long)*iova_start, (unsigned long)region_length, stag_index); - - /* Make the leaf PBL the root if only one PBL */ - root_vpbl.pbl_pbase = vpbl.pbl_pbase; - - if (single_page) { - pbl_count = 0; - } else { - pbl_count = 1; - } - ret = nes_reg_mr(nesdev, nespd, stag, region_length, &root_vpbl, - addr, pbl_count, 1, acc, iova_start, - &nesmr->pbls_used, &nesmr->pbl_4k); - - if (ret == 0) { - nesmr->ibmr.rkey = stag; - nesmr->ibmr.lkey = stag; - nesmr->mode = IWNES_MEMREG_TYPE_MEM; - ibmr = &nesmr->ibmr; - } else { - kfree(nesmr); - ibmr = ERR_PTR(-ENOMEM); - } - -reg_phys_err: - /* single PBL case */ - pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, vpbl.pbl_pbase); - return ibmr; -} - - -/** - * nes_get_dma_mr - */ -static struct ib_mr *nes_get_dma_mr(struct ib_pd *pd, int acc) -{ - u64 kva = 0; - - nes_debug(NES_DBG_MR, "\n"); - - return nes_reg_phys_mr(pd, 0, 0xffffffffffULL, acc, &kva); -} - -/** - * nes_reg_user_mr - */ -static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, - u64 virt, int acc, struct ib_udata *udata) -{ - u64 iova_start; - __le64 *pbl; - u64 region_length; - dma_addr_t last_dma_addr = 0; - dma_addr_t first_dma_addr = 0; - struct nes_pd *nespd = to_nespd(pd); - struct nes_vnic *nesvnic = to_nesvnic(pd->device); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_adapter *nesadapter = nesdev->nesadapter; - struct ib_mr *ibmr = ERR_PTR(-EINVAL); - struct sg_dma_page_iter dma_iter; - struct nes_ucontext *nes_ucontext = rdma_udata_to_drv_context( - udata, struct nes_ucontext, ibucontext); - struct nes_pbl *nespbl; - struct nes_mr *nesmr; - struct ib_umem *region; - struct nes_mem_reg_req req; - struct nes_vpbl vpbl; - struct nes_root_vpbl root_vpbl; - int page_index; - int page_count = 0; - int err, pbl_depth = 0; - int ret; - u32 stag; - u32 stag_index = 0; - u32 next_stag_index; - u32 driver_key; - u32 root_pbl_index = 0; - u32 cur_pbl_index = 0; - u32 skip_pages; - u16 pbl_count; - u8 single_page = 1; - u8 stag_key; - - region = ib_umem_get(udata, start, length, acc, 0); - if (IS_ERR(region)) { - return (struct ib_mr *)region; - } - - nes_debug( - NES_DBG_MR, - "User base = 0x%lX, Virt base = 0x%lX, length = %u, offset = %u, page size = %lu.\n", - (unsigned long)start, (unsigned long)virt, (u32)length, - ib_umem_offset(region), PAGE_SIZE); - - skip_pages = ((u32)ib_umem_offset(region)) >> 12; - - if (ib_copy_from_udata(&req, udata, sizeof(req))) { - ib_umem_release(region); - return ERR_PTR(-EFAULT); - } - nes_debug(NES_DBG_MR, "Memory Registration type = %08X.\n", req.reg_type); - - switch (req.reg_type) { - case IWNES_MEMREG_TYPE_MEM: - pbl_depth = 0; - region_length = 0; - vpbl.pbl_vbase = NULL; - root_vpbl.pbl_vbase = NULL; - root_vpbl.pbl_pbase = 0; - - get_random_bytes(&next_stag_index, sizeof(next_stag_index)); - stag_key = (u8)next_stag_index; - - driver_key = next_stag_index & 0x70000000; - - next_stag_index >>= 8; - next_stag_index %= nesadapter->max_mr; - - err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, - nesadapter->max_mr, &stag_index, &next_stag_index, NES_RESOURCE_USER_MR); - if (err) { - ib_umem_release(region); - return ERR_PTR(err); - } - - nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); - if (!nesmr) { - ib_umem_release(region); - 
nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - return ERR_PTR(-ENOMEM); - } - nesmr->region = region; - - for_each_sg_dma_page (region->sg_head.sgl, &dma_iter, region->nmap, 0) { - - region_length += PAGE_SIZE; - region_length -= skip_pages << 12; - skip_pages = 0; - if ((page_count != 0) && (page_count << 12) - (ib_umem_offset(region) & (4096 - 1)) >= region->length) - goto enough_pages; - if ((page_count & 0x01FF) == 0) { - if (page_count >= 1024 * 512) { - ib_umem_release(region); - nes_free_resource(nesadapter, - nesadapter->allocated_mrs, stag_index); - kfree(nesmr); - ibmr = ERR_PTR(-E2BIG); - goto reg_user_mr_err; - } - if (root_pbl_index == 1) { - root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, - 8192, &root_vpbl.pbl_pbase); - nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n", - root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase); - if (!root_vpbl.pbl_vbase) { - ib_umem_release(region); - pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, - vpbl.pbl_pbase); - nes_free_resource(nesadapter, nesadapter->allocated_mrs, - stag_index); - kfree(nesmr); - ibmr = ERR_PTR(-ENOMEM); - goto reg_user_mr_err; - } - root_vpbl.leaf_vpbl = kcalloc(1024, - sizeof(*root_vpbl.leaf_vpbl), - GFP_KERNEL); - if (!root_vpbl.leaf_vpbl) { - ib_umem_release(region); - pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, - root_vpbl.pbl_pbase); - pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, - vpbl.pbl_pbase); - nes_free_resource(nesadapter, nesadapter->allocated_mrs, - stag_index); - kfree(nesmr); - ibmr = ERR_PTR(-ENOMEM); - goto reg_user_mr_err; - } - root_vpbl.pbl_vbase[0].pa_low = - cpu_to_le32((u32)vpbl.pbl_pbase); - root_vpbl.pbl_vbase[0].pa_high = - cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32))); - root_vpbl.leaf_vpbl[0] = vpbl; - } - vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, - &vpbl.pbl_pbase); - nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%08X\n", - vpbl.pbl_vbase, (unsigned int)vpbl.pbl_pbase); - if (!vpbl.pbl_vbase) { - ib_umem_release(region); - nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - ibmr = ERR_PTR(-ENOMEM); - kfree(nesmr); - goto reg_user_mr_err; - } - if (1 <= root_pbl_index) { - root_vpbl.pbl_vbase[root_pbl_index].pa_low = - cpu_to_le32((u32)vpbl.pbl_pbase); - root_vpbl.pbl_vbase[root_pbl_index].pa_high = - cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32))); - root_vpbl.leaf_vpbl[root_pbl_index] = vpbl; - } - root_pbl_index++; - cur_pbl_index = 0; - } - if (single_page) { - if (page_count != 0) { - if ((last_dma_addr + 4096) != sg_page_iter_dma_address(&dma_iter)) - single_page = 0; - last_dma_addr = sg_page_iter_dma_address(&dma_iter); - } else { - first_dma_addr = sg_page_iter_dma_address(&dma_iter); - last_dma_addr = first_dma_addr; - } - } - - vpbl.pbl_vbase[cur_pbl_index].pa_low = - cpu_to_le32((u32)(sg_page_iter_dma_address(&dma_iter))); - vpbl.pbl_vbase[cur_pbl_index].pa_high = - cpu_to_le32((u32)((u64)(sg_page_iter_dma_address(&dma_iter)))); - cur_pbl_index++; - page_count++; - } - -enough_pages: - nes_debug(NES_DBG_MR, "calculating stag, stag_index=0x%08x, driver_key=0x%08x," - " stag_key=0x%08x\n", - stag_index, driver_key, stag_key); - stag = stag_index << 8; - stag |= driver_key; - stag += (u32)stag_key; - - iova_start = virt; - /* Make the leaf PBL the root if only one PBL */ - if (root_pbl_index == 1) { - root_vpbl.pbl_pbase = vpbl.pbl_pbase; - } - - if (single_page) { - pbl_count = 0; - } else { - pbl_count = root_pbl_index; - 
first_dma_addr = 0; - } - nes_debug(NES_DBG_MR, "Registering STag 0x%08X, VA = 0x%08X, length = 0x%08X," - " index = 0x%08X, region->length=0x%08llx, pbl_count = %u\n", - stag, (unsigned int)iova_start, - (unsigned int)region_length, stag_index, - (unsigned long long)region->length, pbl_count); - ret = nes_reg_mr(nesdev, nespd, stag, region->length, &root_vpbl, - first_dma_addr, pbl_count, (u16)cur_pbl_index, acc, - &iova_start, &nesmr->pbls_used, &nesmr->pbl_4k); - - nes_debug(NES_DBG_MR, "ret=%d\n", ret); - - if (ret == 0) { - nesmr->ibmr.rkey = stag; - nesmr->ibmr.lkey = stag; - nesmr->mode = IWNES_MEMREG_TYPE_MEM; - ibmr = &nesmr->ibmr; - } else { - ib_umem_release(region); - kfree(nesmr); - ibmr = ERR_PTR(-ENOMEM); - } - -reg_user_mr_err: - /* free the resources */ - if (root_pbl_index == 1) { - pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, - vpbl.pbl_pbase); - } else { - for (page_index=0; page_index<root_pbl_index; page_index++) { - pci_free_consistent(nesdev->pcidev, 4096, - root_vpbl.leaf_vpbl[page_index].pbl_vbase, - root_vpbl.leaf_vpbl[page_index].pbl_pbase); - } - kfree(root_vpbl.leaf_vpbl); - pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, - root_vpbl.pbl_pbase); - } - - nes_debug(NES_DBG_MR, "Leaving, ibmr=%p", ibmr); - - return ibmr; - case IWNES_MEMREG_TYPE_QP: - case IWNES_MEMREG_TYPE_CQ: - if (!region->length) { - nes_debug(NES_DBG_MR, "Unable to register zero length region for CQ\n"); - ib_umem_release(region); - return ERR_PTR(-EINVAL); - } - nespbl = kzalloc(sizeof(*nespbl), GFP_KERNEL); - if (!nespbl) { - ib_umem_release(region); - return ERR_PTR(-ENOMEM); - } - nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); - if (!nesmr) { - ib_umem_release(region); - kfree(nespbl); - return ERR_PTR(-ENOMEM); - } - nesmr->region = region; - pbl_depth = region->length >> 12; - pbl_depth += (region->length & (4096-1)) ?
1 : 0; - nespbl->pbl_size = pbl_depth*sizeof(u64); - if (req.reg_type == IWNES_MEMREG_TYPE_QP) { - nes_debug(NES_DBG_MR, "Attempting to allocate QP PBL memory"); - } else { - nes_debug(NES_DBG_MR, "Attempting to allocate CP PBL memory"); - } - - nes_debug(NES_DBG_MR, " %u bytes, %u entries.\n", - nespbl->pbl_size, pbl_depth); - pbl = pci_alloc_consistent(nesdev->pcidev, nespbl->pbl_size, - &nespbl->pbl_pbase); - if (!pbl) { - ib_umem_release(region); - kfree(nesmr); - kfree(nespbl); - nes_debug(NES_DBG_MR, "Unable to allocate PBL memory\n"); - return ERR_PTR(-ENOMEM); - } - - nespbl->pbl_vbase = (u64 *)pbl; - nespbl->user_base = start; - nes_debug(NES_DBG_MR, "Allocated PBL memory, %u bytes, pbl_pbase=%lx," - " pbl_vbase=%p user_base=0x%lx\n", - nespbl->pbl_size, (unsigned long) nespbl->pbl_pbase, - (void *) nespbl->pbl_vbase, nespbl->user_base); - - nespbl->page = sg_page(region->sg_head.sgl); - for_each_sg_dma_page(region->sg_head.sgl, &dma_iter, region->nmap, 0) { - ((__le32 *)pbl)[0] = cpu_to_le32((u32)(sg_page_iter_dma_address(&dma_iter))); - ((__le32 *)pbl)[1] = cpu_to_le32(((u64)(sg_page_iter_dma_address(&dma_iter)))>>32); - nes_debug(NES_DBG_MR, "pbl=%p, *pbl=0x%016llx, 0x%08x%08x\n", pbl, - (unsigned long long)*pbl, - le32_to_cpu(((__le32 *)pbl)[1]), le32_to_cpu(((__le32 *)pbl)[0])); - pbl++; - } - - if (req.reg_type == IWNES_MEMREG_TYPE_QP) { - list_add_tail(&nespbl->list, &nes_ucontext->qp_reg_mem_list); - } else { - list_add_tail(&nespbl->list, &nes_ucontext->cq_reg_mem_list); - } - nesmr->ibmr.rkey = -1; - nesmr->ibmr.lkey = -1; - nesmr->mode = req.reg_type; - return &nesmr->ibmr; - } - - ib_umem_release(region); - return ERR_PTR(-ENOSYS); -} - - -/** - * nes_dereg_mr - */ -static int nes_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata) -{ - struct nes_mr *nesmr = to_nesmr(ib_mr); - struct nes_vnic *nesvnic = to_nesvnic(ib_mr->device); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_adapter *nesadapter = nesdev->nesadapter; - struct nes_hw_cqp_wqe *cqp_wqe; - struct nes_cqp_request *cqp_request; - unsigned long flags; - int ret; - u16 major_code; - u16 minor_code; - - - if (nesmr->pages) - pci_free_consistent(nesdev->pcidev, - nesmr->max_pages * sizeof(u64), - nesmr->pages, - nesmr->paddr); - - if (nesmr->region) { - ib_umem_release(nesmr->region); - } - if (nesmr->mode != IWNES_MEMREG_TYPE_MEM) { - kfree(nesmr); - return 0; - } - - /* Deallocate the region with the adapter */ - - cqp_request = nes_get_cqp_request(nesdev); - if (cqp_request == NULL) { - nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n"); - return -ENOMEM; - } - cqp_request->waiting = 1; - cqp_wqe = &cqp_request->cqp_wqe; - - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, - NES_CQP_DEALLOCATE_STAG | NES_CQP_STAG_VA_TO | - NES_CQP_STAG_DEALLOC_PBLS | NES_CQP_STAG_MR); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ib_mr->rkey); - - atomic_set(&cqp_request->refcount, 2); - nes_post_cqp_request(nesdev, cqp_request); - - /* Wait for CQP */ - nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X completed\n", ib_mr->rkey); - ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), - NES_EVENT_TIMEOUT); - nes_debug(NES_DBG_MR, "Deallocate STag 0x%08X completed, wait_event_timeout ret = %u," - " CQP Major:Minor codes = 0x%04X:0x%04X\n", - ib_mr->rkey, ret, cqp_request->major_code, cqp_request->minor_code); - - major_code = cqp_request->major_code; - minor_code = cqp_request->minor_code; - - 
nes_put_cqp_request(nesdev, cqp_request); - - if (!ret) { - nes_debug(NES_DBG_MR, "Timeout waiting to destroy STag," - " ib_mr=%p, rkey = 0x%08X\n", - ib_mr, ib_mr->rkey); - return -ETIME; - } else if (major_code) { - nes_debug(NES_DBG_MR, "Error (0x%04X:0x%04X) while attempting" - " to destroy STag, ib_mr=%p, rkey = 0x%08X\n", - major_code, minor_code, ib_mr, ib_mr->rkey); - return -EIO; - } - - if (nesmr->pbls_used != 0) { - spin_lock_irqsave(&nesadapter->pbl_lock, flags); - if (nesmr->pbl_4k) { - nesadapter->free_4kpbl += nesmr->pbls_used; - if (nesadapter->free_4kpbl > nesadapter->max_4kpbl) - printk(KERN_ERR PFX "free 4KB PBLs(%u) has " - "exceeded the max(%u)\n", - nesadapter->free_4kpbl, - nesadapter->max_4kpbl); - } else { - nesadapter->free_256pbl += nesmr->pbls_used; - if (nesadapter->free_256pbl > nesadapter->max_256pbl) - printk(KERN_ERR PFX "free 256B PBLs(%u) has " - "exceeded the max(%u)\n", - nesadapter->free_256pbl, - nesadapter->max_256pbl); - } - spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); - } - nes_free_resource(nesadapter, nesadapter->allocated_mrs, - (ib_mr->rkey & 0x0fffff00) >> 8); - - kfree(nesmr); - - return 0; -} - - -/** - * show_rev - */ -static ssize_t hw_rev_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct nes_ib_device *nesibdev = - rdma_device_to_drv_device(dev, struct nes_ib_device, ibdev); - struct nes_vnic *nesvnic = nesibdev->nesvnic; - - nes_debug(NES_DBG_INIT, "\n"); - return sprintf(buf, "%x\n", nesvnic->nesdev->nesadapter->hw_rev); -} -static DEVICE_ATTR_RO(hw_rev); - -/** - * show_hca - */ -static ssize_t hca_type_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - nes_debug(NES_DBG_INIT, "\n"); - return sprintf(buf, "NES020\n"); -} -static DEVICE_ATTR_RO(hca_type); - -/** - * show_board - */ -static ssize_t board_id_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - nes_debug(NES_DBG_INIT, "\n"); - return sprintf(buf, "%.*s\n", 32, "NES020 Board ID"); -} -static DEVICE_ATTR_RO(board_id); - -static struct attribute *nes_dev_attributes[] = { - &dev_attr_hw_rev.attr, - &dev_attr_hca_type.attr, - &dev_attr_board_id.attr, - NULL -}; - -static const struct attribute_group nes_attr_group = { - .attrs = nes_dev_attributes, -}; - -/** - * nes_query_qp - */ -static int nes_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, - int attr_mask, struct ib_qp_init_attr *init_attr) -{ - struct nes_qp *nesqp = to_nesqp(ibqp); - - nes_debug(NES_DBG_QP, "\n"); - - attr->qp_access_flags = 0; - attr->cap.max_send_wr = nesqp->hwqp.sq_size; - attr->cap.max_recv_wr = nesqp->hwqp.rq_size; - attr->cap.max_recv_sge = 1; - if (nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) - attr->cap.max_inline_data = 0; - else - attr->cap.max_inline_data = 64; - - init_attr->event_handler = nesqp->ibqp.event_handler; - init_attr->qp_context = nesqp->ibqp.qp_context; - init_attr->send_cq = nesqp->ibqp.send_cq; - init_attr->recv_cq = nesqp->ibqp.recv_cq; - init_attr->srq = nesqp->ibqp.srq; - init_attr->cap = attr->cap; - - return 0; -} - - -/** - * nes_hw_modify_qp - */ -int nes_hw_modify_qp(struct nes_device *nesdev, struct nes_qp *nesqp, - u32 next_iwarp_state, u32 termlen, u32 wait_completion) -{ - struct nes_hw_cqp_wqe *cqp_wqe; - /* struct iw_cm_id *cm_id = nesqp->cm_id; */ - /* struct iw_cm_event cm_event; */ - struct nes_cqp_request *cqp_request; - int ret; - u16 major_code; - - nes_debug(NES_DBG_MOD_QP, "QP%u, refcount=%d\n", - nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount)); - - cqp_request = 
nes_get_cqp_request(nesdev); - if (cqp_request == NULL) { - nes_debug(NES_DBG_MOD_QP, "Failed to get a cqp_request.\n"); - return -ENOMEM; - } - if (wait_completion) { - cqp_request->waiting = 1; - } else { - cqp_request->waiting = 0; - } - cqp_wqe = &cqp_request->cqp_wqe; - - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, - NES_CQP_MODIFY_QP | NES_CQP_QP_TYPE_IWARP | next_iwarp_state); - nes_debug(NES_DBG_MOD_QP, "using next_iwarp_state=%08x, wqe_words=%08x\n", - next_iwarp_state, le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX])); - nes_fill_init_cqp_wqe(cqp_wqe, nesdev); - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); - set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, (u64)nesqp->nesqp_context_pbase); - - /* If sending a terminate message, fill in the length (in words) */ - if (((next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK) == NES_CQP_QP_IWARP_STATE_TERMINATE) && - !(next_iwarp_state & NES_CQP_QP_TERM_DONT_SEND_TERM_MSG)) { - termlen = ((termlen + 3) >> 2) << NES_CQP_OP_TERMLEN_SHIFT; - set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_NEW_MSS_IDX, termlen); - } - - atomic_set(&cqp_request->refcount, 2); - nes_post_cqp_request(nesdev, cqp_request); - - /* Wait for CQP */ - if (wait_completion) { - /* nes_debug(NES_DBG_MOD_QP, "Waiting for modify iWARP QP%u to complete.\n", - nesqp->hwqp.qp_id); */ - ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), - NES_EVENT_TIMEOUT); - nes_debug(NES_DBG_MOD_QP, "Modify iwarp QP%u completed, wait_event_timeout ret=%u, " - "CQP Major:Minor codes = 0x%04X:0x%04X.\n", - nesqp->hwqp.qp_id, ret, cqp_request->major_code, cqp_request->minor_code); - major_code = cqp_request->major_code; - if (major_code) { - nes_debug(NES_DBG_MOD_QP, "Modify iwarp QP%u failed" - "CQP Major:Minor codes = 0x%04X:0x%04X, intended next state = 0x%08X.\n", - nesqp->hwqp.qp_id, cqp_request->major_code, - cqp_request->minor_code, next_iwarp_state); - } - - nes_put_cqp_request(nesdev, cqp_request); - - if (!ret) - return -ETIME; - else if (major_code) - return -EIO; - else - return 0; - } else { - return 0; - } -} - - -/** - * nes_modify_qp - */ -int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, - int attr_mask, struct ib_udata *udata) -{ - struct nes_qp *nesqp = to_nesqp(ibqp); - struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); - struct nes_device *nesdev = nesvnic->nesdev; - /* u32 cqp_head; */ - /* u32 counter; */ - u32 next_iwarp_state = 0; - int err; - unsigned long qplockflags; - int ret; - u16 original_last_aeq; - u8 issue_modify_qp = 0; - u8 dont_wait = 0; - - nes_debug(NES_DBG_MOD_QP, "QP%u: QP State=%u, cur QP State=%u," - " iwarp_state=0x%X, refcount=%d\n", - nesqp->hwqp.qp_id, attr->qp_state, nesqp->ibqp_state, - nesqp->iwarp_state, atomic_read(&nesqp->refcount)); - - spin_lock_irqsave(&nesqp->lock, qplockflags); - - nes_debug(NES_DBG_MOD_QP, "QP%u: hw_iwarp_state=0x%X, hw_tcp_state=0x%X," - " QP Access Flags=0x%X, attr_mask = 0x%0x\n", - nesqp->hwqp.qp_id, nesqp->hw_iwarp_state, - nesqp->hw_tcp_state, attr->qp_access_flags, attr_mask); - - if (attr_mask & IB_QP_STATE) { - switch (attr->qp_state) { - case IB_QPS_INIT: - nes_debug(NES_DBG_MOD_QP, "QP%u: new state = init\n", - nesqp->hwqp.qp_id); - if (nesqp->iwarp_state > (u32)NES_CQP_QP_IWARP_STATE_IDLE) { - spin_unlock_irqrestore(&nesqp->lock, qplockflags); - return -EINVAL; - } - next_iwarp_state = NES_CQP_QP_IWARP_STATE_IDLE; - issue_modify_qp = 1; - break; - case IB_QPS_RTR: - 
nes_debug(NES_DBG_MOD_QP, "QP%u: new state = rtr\n", - nesqp->hwqp.qp_id); - if (nesqp->iwarp_state>(u32)NES_CQP_QP_IWARP_STATE_IDLE) { - spin_unlock_irqrestore(&nesqp->lock, qplockflags); - return -EINVAL; - } - next_iwarp_state = NES_CQP_QP_IWARP_STATE_IDLE; - issue_modify_qp = 1; - break; - case IB_QPS_RTS: - nes_debug(NES_DBG_MOD_QP, "QP%u: new state = rts\n", - nesqp->hwqp.qp_id); - if (nesqp->iwarp_state>(u32)NES_CQP_QP_IWARP_STATE_RTS) { - spin_unlock_irqrestore(&nesqp->lock, qplockflags); - return -EINVAL; - } - if (nesqp->cm_id == NULL) { - nes_debug(NES_DBG_MOD_QP, "QP%u: Failing attempt to move QP to RTS without a CM_ID. \n", - nesqp->hwqp.qp_id ); - spin_unlock_irqrestore(&nesqp->lock, qplockflags); - return -EINVAL; - } - next_iwarp_state = NES_CQP_QP_IWARP_STATE_RTS; - if (nesqp->iwarp_state != NES_CQP_QP_IWARP_STATE_RTS) - next_iwarp_state |= NES_CQP_QP_CONTEXT_VALID | - NES_CQP_QP_ARP_VALID | NES_CQP_QP_ORD_VALID; - issue_modify_qp = 1; - nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_ESTABLISHED; - nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_RTS; - nesqp->hte_added = 1; - break; - case IB_QPS_SQD: - issue_modify_qp = 1; - nes_debug(NES_DBG_MOD_QP, "QP%u: new state=closing. SQ head=%u, SQ tail=%u\n", - nesqp->hwqp.qp_id, nesqp->hwqp.sq_head, nesqp->hwqp.sq_tail); - if (nesqp->iwarp_state == (u32)NES_CQP_QP_IWARP_STATE_CLOSING) { - spin_unlock_irqrestore(&nesqp->lock, qplockflags); - return 0; - } else { - if (nesqp->iwarp_state > (u32)NES_CQP_QP_IWARP_STATE_CLOSING) { - nes_debug(NES_DBG_MOD_QP, "QP%u: State change to closing" - " ignored due to current iWARP state\n", - nesqp->hwqp.qp_id); - spin_unlock_irqrestore(&nesqp->lock, qplockflags); - return -EINVAL; - } - if (nesqp->hw_iwarp_state != NES_AEQE_IWARP_STATE_RTS) { - nes_debug(NES_DBG_MOD_QP, "QP%u: State change to closing" - " already done based on hw state.\n", - nesqp->hwqp.qp_id); - issue_modify_qp = 0; - } - switch (nesqp->hw_iwarp_state) { - case NES_AEQE_IWARP_STATE_CLOSING: - next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING; - break; - case NES_AEQE_IWARP_STATE_TERMINATE: - next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE; - break; - case NES_AEQE_IWARP_STATE_ERROR: - next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR; - break; - default: - next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING; - nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING; - break; - } - } - break; - case IB_QPS_SQE: - nes_debug(NES_DBG_MOD_QP, "QP%u: new state = terminate\n", - nesqp->hwqp.qp_id); - if (nesqp->iwarp_state>=(u32)NES_CQP_QP_IWARP_STATE_TERMINATE) { - spin_unlock_irqrestore(&nesqp->lock, qplockflags); - return -EINVAL; - } - /* next_iwarp_state = (NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000); */ - next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE; - nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_TERMINATE; - issue_modify_qp = 1; - break; - case IB_QPS_ERR: - case IB_QPS_RESET: - if (nesqp->iwarp_state == (u32)NES_CQP_QP_IWARP_STATE_ERROR) { - spin_unlock_irqrestore(&nesqp->lock, qplockflags); - return -EINVAL; - } - nes_debug(NES_DBG_MOD_QP, "QP%u: new state = error\n", - nesqp->hwqp.qp_id); - if (nesqp->term_flags) - del_timer(&nesqp->terminate_timer); - - next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR; - /* next_iwarp_state = (NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000); */ - if (nesqp->hte_added) { - nes_debug(NES_DBG_MOD_QP, "set CQP_QP_DEL_HTE\n"); - next_iwarp_state |= NES_CQP_QP_DEL_HTE; - nesqp->hte_added = 0; - } - if ((nesqp->hw_tcp_state > NES_AEQE_TCP_STATE_CLOSED) && - (nesdev->iw_status) && - 
(nesqp->hw_tcp_state != NES_AEQE_TCP_STATE_TIME_WAIT)) { - next_iwarp_state |= NES_CQP_QP_RESET; - } else { - nes_debug(NES_DBG_MOD_QP, "QP%u NOT setting NES_CQP_QP_RESET since TCP state = %u\n", - nesqp->hwqp.qp_id, nesqp->hw_tcp_state); - dont_wait = 1; - } - issue_modify_qp = 1; - nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_ERROR; - break; - default: - spin_unlock_irqrestore(&nesqp->lock, qplockflags); - return -EINVAL; - break; - } - - nesqp->ibqp_state = attr->qp_state; - nesqp->iwarp_state = next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK; - nes_debug(NES_DBG_MOD_QP, "Change nesqp->iwarp_state=%08x\n", - nesqp->iwarp_state); - } - - if (attr_mask & IB_QP_ACCESS_FLAGS) { - if (attr->qp_access_flags & IB_ACCESS_LOCAL_WRITE) { - nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN | - NES_QPCONTEXT_MISC_RDMA_READ_EN); - issue_modify_qp = 1; - } - if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) { - nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN); - issue_modify_qp = 1; - } - if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) { - nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_READ_EN); - issue_modify_qp = 1; - } - if (attr->qp_access_flags & IB_ACCESS_MW_BIND) { - nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_WBIND_EN); - issue_modify_qp = 1; - } - - if (nesqp->user_mode) { - nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN | - NES_QPCONTEXT_MISC_RDMA_READ_EN); - issue_modify_qp = 1; - } - } - - original_last_aeq = nesqp->last_aeq; - spin_unlock_irqrestore(&nesqp->lock, qplockflags); - - nes_debug(NES_DBG_MOD_QP, "issue_modify_qp=%u\n", issue_modify_qp); - - ret = 0; - - - if (issue_modify_qp) { - nes_debug(NES_DBG_MOD_QP, "call nes_hw_modify_qp\n"); - ret = nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 1); - if (ret) - nes_debug(NES_DBG_MOD_QP, "nes_hw_modify_qp (next_iwarp_state = 0x%08X)" - " failed for QP%u.\n", - next_iwarp_state, nesqp->hwqp.qp_id); - - } - - if ((issue_modify_qp) && (nesqp->ibqp_state > IB_QPS_RTS)) { - nes_debug(NES_DBG_MOD_QP, "QP%u Issued ModifyQP refcount (%d)," - " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n", - nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), - original_last_aeq, nesqp->last_aeq); - if (!ret || original_last_aeq != NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) { - if (dont_wait) { - if (nesqp->cm_id && nesqp->hw_tcp_state != 0) { - nes_debug(NES_DBG_MOD_QP, "QP%u Queuing fake disconnect for QP refcount (%d)," - " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n", - nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), - original_last_aeq, nesqp->last_aeq); - /* this one is for the cm_disconnect thread */ - spin_lock_irqsave(&nesqp->lock, qplockflags); - nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED; - nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT; - spin_unlock_irqrestore(&nesqp->lock, qplockflags); - nes_cm_disconn(nesqp); - } else { - nes_debug(NES_DBG_MOD_QP, "QP%u No fake disconnect, QP refcount=%d\n", - nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount)); - } - } else { - spin_lock_irqsave(&nesqp->lock, qplockflags); - if (nesqp->cm_id) { - /* These two are for the timer thread */ - if (atomic_inc_return(&nesqp->close_timer_started) == 1) { - nesqp->cm_id->add_ref(nesqp->cm_id); - nes_debug(NES_DBG_MOD_QP, "QP%u Not decrementing QP refcount (%d)," - " need ae to finish up, original_last_aeq = 0x%04X." 
- " last_aeq = 0x%04X, scheduling timer.\n", - nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), - original_last_aeq, nesqp->last_aeq); - schedule_nes_timer(nesqp->cm_node, (struct sk_buff *) nesqp, NES_TIMER_TYPE_CLOSE, 1, 0); - } - spin_unlock_irqrestore(&nesqp->lock, qplockflags); - } else { - spin_unlock_irqrestore(&nesqp->lock, qplockflags); - nes_debug(NES_DBG_MOD_QP, "QP%u Not decrementing QP refcount (%d)," - " need ae to finish up, original_last_aeq = 0x%04X." - " last_aeq = 0x%04X.\n", - nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), - original_last_aeq, nesqp->last_aeq); - } - } - } else { - nes_debug(NES_DBG_MOD_QP, "QP%u Decrementing QP refcount (%d), No ae to finish up," - " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n", - nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), - original_last_aeq, nesqp->last_aeq); - } - } else { - nes_debug(NES_DBG_MOD_QP, "QP%u Decrementing QP refcount (%d), No ae to finish up," - " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n", - nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), - original_last_aeq, nesqp->last_aeq); - } - - err = 0; - - nes_debug(NES_DBG_MOD_QP, "QP%u Leaving, refcount=%d\n", - nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount)); - - return err; -} - -static inline void -fill_wqe_sg_send(struct nes_hw_qp_wqe *wqe, const struct ib_send_wr *ib_wr, - u32 uselkey) -{ - int sge_index; - int total_payload_length = 0; - for (sge_index = 0; sge_index < ib_wr->num_sge; sge_index++) { - set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX+(sge_index*4), - ib_wr->sg_list[sge_index].addr); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_LENGTH0_IDX + (sge_index*4), - ib_wr->sg_list[sge_index].length); - if (uselkey) - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX + (sge_index*4), - (ib_wr->sg_list[sge_index].lkey)); - else - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX + (sge_index*4), 0); - - total_payload_length += ib_wr->sg_list[sge_index].length; - } - nes_debug(NES_DBG_IW_TX, "UC UC UC, sending total_payload_length=%u \n", - total_payload_length); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX, - total_payload_length); -} - -/** - * nes_post_send - */ -static int nes_post_send(struct ib_qp *ibqp, const struct ib_send_wr *ib_wr, - const struct ib_send_wr **bad_wr) -{ - u64 u64temp; - unsigned long flags = 0; - struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_qp *nesqp = to_nesqp(ibqp); - struct nes_hw_qp_wqe *wqe; - int err = 0; - u32 qsize = nesqp->hwqp.sq_size; - u32 head; - u32 wqe_misc = 0; - u32 wqe_count = 0; - u32 counter; - - if (nesqp->ibqp_state > IB_QPS_RTS) { - err = -EINVAL; - goto out; - } - - spin_lock_irqsave(&nesqp->lock, flags); - - head = nesqp->hwqp.sq_head; - - while (ib_wr) { - /* Check for QP error */ - if (nesqp->term_flags) { - err = -EINVAL; - break; - } - - /* Check for SQ overflow */ - if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) { - err = -ENOMEM; - break; - } - - wqe = &nesqp->hwqp.sq_vbase[head]; - /* nes_debug(NES_DBG_IW_TX, "processing sq wqe for QP%u at %p, head = %u.\n", - nesqp->hwqp.qp_id, wqe, head); */ - nes_fill_init_qp_wqe(wqe, nesqp, head); - u64temp = (u64)(ib_wr->wr_id); - set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX, - u64temp); - switch (ib_wr->opcode) { - case IB_WR_SEND: - case IB_WR_SEND_WITH_INV: - if (IB_WR_SEND == ib_wr->opcode) { - if (ib_wr->send_flags & IB_SEND_SOLICITED) 
- wqe_misc = NES_IWARP_SQ_OP_SENDSE; - else - wqe_misc = NES_IWARP_SQ_OP_SEND; - } else { - if (ib_wr->send_flags & IB_SEND_SOLICITED) - wqe_misc = NES_IWARP_SQ_OP_SENDSEINV; - else - wqe_misc = NES_IWARP_SQ_OP_SENDINV; - - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX, - ib_wr->ex.invalidate_rkey); - } - - if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { - err = -EINVAL; - break; - } - - if (ib_wr->send_flags & IB_SEND_FENCE) - wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE; - - if ((ib_wr->send_flags & IB_SEND_INLINE) && - ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) && - (ib_wr->sg_list[0].length <= 64)) { - memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX], - (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX, - ib_wr->sg_list[0].length); - wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA; - } else { - fill_wqe_sg_send(wqe, ib_wr, 1); - } - - break; - case IB_WR_RDMA_WRITE: - wqe_misc = NES_IWARP_SQ_OP_RDMAW; - if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { - nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=%u\n", - ib_wr->num_sge, nesdev->nesadapter->max_sge); - err = -EINVAL; - break; - } - - if (ib_wr->send_flags & IB_SEND_FENCE) - wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE; - - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX, - rdma_wr(ib_wr)->rkey); - set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX, - rdma_wr(ib_wr)->remote_addr); - - if ((ib_wr->send_flags & IB_SEND_INLINE) && - ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) && - (ib_wr->sg_list[0].length <= 64)) { - memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX], - (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX, - ib_wr->sg_list[0].length); - wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA; - } else { - fill_wqe_sg_send(wqe, ib_wr, 1); - } - - wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX] = - wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX]; - break; - case IB_WR_RDMA_READ: - case IB_WR_RDMA_READ_WITH_INV: - /* iWARP only supports 1 sge for RDMA reads */ - if (ib_wr->num_sge > 1) { - nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=1\n", - ib_wr->num_sge); - err = -EINVAL; - break; - } - if (ib_wr->opcode == IB_WR_RDMA_READ) { - wqe_misc = NES_IWARP_SQ_OP_RDMAR; - } else { - wqe_misc = NES_IWARP_SQ_OP_RDMAR_LOCINV; - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX, - ib_wr->ex.invalidate_rkey); - } - - set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX, - rdma_wr(ib_wr)->remote_addr); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX, - rdma_wr(ib_wr)->rkey); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX, - ib_wr->sg_list->length); - set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX, - ib_wr->sg_list->addr); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX, - ib_wr->sg_list->lkey); - break; - case IB_WR_LOCAL_INV: - wqe_misc = NES_IWARP_SQ_OP_LOCINV; - set_wqe_32bit_value(wqe->wqe_words, - NES_IWARP_SQ_LOCINV_WQE_INV_STAG_IDX, - ib_wr->ex.invalidate_rkey); - break; - case IB_WR_REG_MR: - { - struct nes_mr *mr = to_nesmr(reg_wr(ib_wr)->mr); - int page_shift = ilog2(reg_wr(ib_wr)->mr->page_size); - int flags = reg_wr(ib_wr)->access; - - if (mr->npages > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64))) { - nes_debug(NES_DBG_IW_TX, "SQ_FMR: 
bad page_list_len\n"); - err = -EINVAL; - break; - } - wqe_misc = NES_IWARP_SQ_OP_FAST_REG; - set_wqe_64bit_value(wqe->wqe_words, - NES_IWARP_SQ_FMR_WQE_VA_FBO_LOW_IDX, - mr->ibmr.iova); - set_wqe_32bit_value(wqe->wqe_words, - NES_IWARP_SQ_FMR_WQE_LENGTH_LOW_IDX, - lower_32_bits(mr->ibmr.length)); - set_wqe_32bit_value(wqe->wqe_words, - NES_IWARP_SQ_FMR_WQE_LENGTH_HIGH_IDX, 0); - set_wqe_32bit_value(wqe->wqe_words, - NES_IWARP_SQ_FMR_WQE_MR_STAG_IDX, - reg_wr(ib_wr)->key); - - if (page_shift == 12) { - wqe_misc |= NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_4K; - } else if (page_shift == 21) { - wqe_misc |= NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_2M; - } else { - nes_debug(NES_DBG_IW_TX, "Invalid page shift," - " ib_wr=%u, max=1\n", ib_wr->num_sge); - err = -EINVAL; - break; - } - - /* Set access_flags */ - wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_READ; - if (flags & IB_ACCESS_LOCAL_WRITE) - wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_WRITE; - - if (flags & IB_ACCESS_REMOTE_WRITE) - wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_WRITE; - - if (flags & IB_ACCESS_REMOTE_READ) - wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_READ; - - if (flags & IB_ACCESS_MW_BIND) - wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_WINDOW_BIND; - - /* Fill in PBL info: */ - set_wqe_64bit_value(wqe->wqe_words, - NES_IWARP_SQ_FMR_WQE_PBL_ADDR_LOW_IDX, - mr->paddr); - - set_wqe_32bit_value(wqe->wqe_words, - NES_IWARP_SQ_FMR_WQE_PBL_LENGTH_IDX, - mr->npages * 8); - - nes_debug(NES_DBG_IW_TX, "SQ_REG_MR: iova_start: %llx, " - "length: %lld, rkey: %0x, pgl_paddr: %llx, " - "page_list_len: %u, wqe_misc: %x\n", - (unsigned long long) mr->ibmr.iova, - mr->ibmr.length, - reg_wr(ib_wr)->key, - (unsigned long long) mr->paddr, - mr->npages, - wqe_misc); - break; - } - default: - /* error */ - err = -EINVAL; - break; - } - - if (err) - break; - - if ((ib_wr->send_flags & IB_SEND_SIGNALED) || nesqp->sig_all) - wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL; - - wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = cpu_to_le32(wqe_misc); - - ib_wr = ib_wr->next; - head++; - wqe_count++; - if (head >= qsize) - head = 0; - - } - - nesqp->hwqp.sq_head = head; - barrier(); - while (wqe_count) { - counter = min(wqe_count, ((u32)255)); - wqe_count -= counter; - nes_write32(nesdev->regs + NES_WQE_ALLOC, - (counter << 24) | 0x00800000 | nesqp->hwqp.qp_id); - } - - spin_unlock_irqrestore(&nesqp->lock, flags); - -out: - if (err) - *bad_wr = ib_wr; - return err; -} - - -/** - * nes_post_recv - */ -static int nes_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *ib_wr, - const struct ib_recv_wr **bad_wr) -{ - u64 u64temp; - unsigned long flags = 0; - struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_qp *nesqp = to_nesqp(ibqp); - struct nes_hw_qp_wqe *wqe; - int err = 0; - int sge_index; - u32 qsize = nesqp->hwqp.rq_size; - u32 head; - u32 wqe_count = 0; - u32 counter; - u32 total_payload_length; - - if (nesqp->ibqp_state > IB_QPS_RTS) { - err = -EINVAL; - goto out; - } - - spin_lock_irqsave(&nesqp->lock, flags); - - head = nesqp->hwqp.rq_head; - - while (ib_wr) { - /* Check for QP error */ - if (nesqp->term_flags) { - err = -EINVAL; - break; - } - - if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { - err = -EINVAL; - break; - } - /* Check for RQ overflow */ - if (((head + (2 * qsize) - nesqp->hwqp.rq_tail) % qsize) == (qsize - 1)) { - err = -ENOMEM; - break; - } - - nes_debug(NES_DBG_IW_RX, "ibwr sge count = %u.\n", ib_wr->num_sge); - wqe = &nesqp->hwqp.rq_vbase[head]; 
- - /* nes_debug(NES_DBG_IW_RX, "QP%u:processing rq wqe at %p, head = %u.\n", - nesqp->hwqp.qp_id, wqe, head); */ - nes_fill_init_qp_wqe(wqe, nesqp, head); - u64temp = (u64)(ib_wr->wr_id); - set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX, - u64temp); - total_payload_length = 0; - for (sge_index=0; sge_index < ib_wr->num_sge; sge_index++) { - set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_FRAG0_LOW_IDX+(sge_index*4), - ib_wr->sg_list[sge_index].addr); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_LENGTH0_IDX+(sge_index*4), - ib_wr->sg_list[sge_index].length); - set_wqe_32bit_value(wqe->wqe_words,NES_IWARP_RQ_WQE_STAG0_IDX+(sge_index*4), - ib_wr->sg_list[sge_index].lkey); - - total_payload_length += ib_wr->sg_list[sge_index].length; - } - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_TOTAL_PAYLOAD_IDX, - total_payload_length); - - ib_wr = ib_wr->next; - head++; - wqe_count++; - if (head >= qsize) - head = 0; - } - - nesqp->hwqp.rq_head = head; - barrier(); - while (wqe_count) { - counter = min(wqe_count, ((u32)255)); - wqe_count -= counter; - nes_write32(nesdev->regs+NES_WQE_ALLOC, (counter<<24) | nesqp->hwqp.qp_id); - } - - spin_unlock_irqrestore(&nesqp->lock, flags); - -out: - if (err) - *bad_wr = ib_wr; - return err; -} - -/** - * nes_drain_sq - drain sq - * @ibqp: pointer to ibqp - */ -static void nes_drain_sq(struct ib_qp *ibqp) -{ - struct nes_qp *nesqp = to_nesqp(ibqp); - - if (nesqp->hwqp.sq_tail != nesqp->hwqp.sq_head) - wait_for_completion(&nesqp->sq_drained); -} - -/** - * nes_drain_rq - drain rq - * @ibqp: pointer to ibqp - */ -static void nes_drain_rq(struct ib_qp *ibqp) -{ - struct nes_qp *nesqp = to_nesqp(ibqp); - - if (nesqp->hwqp.rq_tail != nesqp->hwqp.rq_head) - wait_for_completion(&nesqp->rq_drained); -} - -/** - * nes_poll_cq - */ -static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) -{ - u64 u64temp; - u64 wrid; - unsigned long flags = 0; - struct nes_vnic *nesvnic = to_nesvnic(ibcq->device); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_cq *nescq = to_nescq(ibcq); - struct nes_qp *nesqp; - struct nes_hw_cqe cqe; - u32 head; - u32 wq_tail = 0; - u32 cq_size; - u32 cqe_count = 0; - u32 wqe_index; - u32 u32temp; - u32 move_cq_head = 1; - u32 err_code; - - nes_debug(NES_DBG_CQ, "\n"); - - spin_lock_irqsave(&nescq->lock, flags); - - head = nescq->hw_cq.cq_head; - cq_size = nescq->hw_cq.cq_size; - - while (cqe_count < num_entries) { - if ((le32_to_cpu(nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]) & - NES_CQE_VALID) == 0) - break; - - /* - * Make sure we read CQ entry contents *after* - * we've checked the valid bit. 
- */ - rmb(); - - cqe = nescq->hw_cq.cq_vbase[head]; - u32temp = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]); - wqe_index = u32temp & (nesdev->nesadapter->max_qp_wr - 1); - u32temp &= ~(NES_SW_CONTEXT_ALIGN-1); - /* parse CQE, get completion context from WQE (either rq or sq) */ - u64temp = (((u64)(le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX])))<<32) | - ((u64)u32temp); - - if (u64temp) { - nesqp = (struct nes_qp *)(unsigned long)u64temp; - memset(entry, 0, sizeof *entry); - if (cqe.cqe_words[NES_CQE_ERROR_CODE_IDX] == 0) { - entry->status = IB_WC_SUCCESS; - } else { - err_code = le32_to_cpu(cqe.cqe_words[NES_CQE_ERROR_CODE_IDX]); - if (NES_IWARP_CQE_MAJOR_DRV == (err_code >> 16)) { - entry->status = err_code & 0x0000ffff; - - /* The rest of the cqe's will be marked as flushed */ - nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_ERROR_CODE_IDX] = - cpu_to_le32((NES_IWARP_CQE_MAJOR_FLUSH << 16) | - NES_IWARP_CQE_MINOR_FLUSH); - } else - entry->status = IB_WC_WR_FLUSH_ERR; - } - - entry->qp = &nesqp->ibqp; - entry->src_qp = nesqp->hwqp.qp_id; - - if (le32_to_cpu(cqe.cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_SQ) { - if (nesqp->skip_lsmm) { - nesqp->skip_lsmm = 0; - nesqp->hwqp.sq_tail++; - } - - /* Working on a SQ Completion*/ - wrid = (((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wqe_index]. - wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_HIGH_IDX]))) << 32) | - ((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wqe_index]. - wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX]))); - entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index]. - wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX]); - - switch (le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index]. - wqe_words[NES_IWARP_SQ_WQE_MISC_IDX]) & 0x3f) { - case NES_IWARP_SQ_OP_RDMAW: - nes_debug(NES_DBG_CQ, "Operation = RDMA WRITE.\n"); - entry->opcode = IB_WC_RDMA_WRITE; - break; - case NES_IWARP_SQ_OP_RDMAR: - nes_debug(NES_DBG_CQ, "Operation = RDMA READ.\n"); - entry->opcode = IB_WC_RDMA_READ; - entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index]. 
- wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX]); - break; - case NES_IWARP_SQ_OP_SENDINV: - case NES_IWARP_SQ_OP_SENDSEINV: - case NES_IWARP_SQ_OP_SEND: - case NES_IWARP_SQ_OP_SENDSE: - nes_debug(NES_DBG_CQ, "Operation = Send.\n"); - entry->opcode = IB_WC_SEND; - break; - case NES_IWARP_SQ_OP_LOCINV: - entry->opcode = IB_WC_LOCAL_INV; - break; - case NES_IWARP_SQ_OP_FAST_REG: - entry->opcode = IB_WC_REG_MR; - break; - } - - nesqp->hwqp.sq_tail = (wqe_index+1)&(nesqp->hwqp.sq_size - 1); - if ((entry->status != IB_WC_SUCCESS) && (nesqp->hwqp.sq_tail != nesqp->hwqp.sq_head)) { - move_cq_head = 0; - wq_tail = nesqp->hwqp.sq_tail; - } - } else { - /* Working on a RQ Completion*/ - entry->byte_len = le32_to_cpu(cqe.cqe_words[NES_CQE_PAYLOAD_LENGTH_IDX]); - wrid = ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wqe_index].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_LOW_IDX]))) | - ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wqe_index].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_HIGH_IDX]))<<32); - entry->opcode = IB_WC_RECV; - - nesqp->hwqp.rq_tail = (wqe_index+1)&(nesqp->hwqp.rq_size - 1); - if ((entry->status != IB_WC_SUCCESS) && (nesqp->hwqp.rq_tail != nesqp->hwqp.rq_head)) { - move_cq_head = 0; - wq_tail = nesqp->hwqp.rq_tail; - } - } - - if (nesqp->iwarp_state > NES_CQP_QP_IWARP_STATE_RTS) { - if (nesqp->hwqp.sq_tail == nesqp->hwqp.sq_head) - complete(&nesqp->sq_drained); - if (nesqp->hwqp.rq_tail == nesqp->hwqp.rq_head) - complete(&nesqp->rq_drained); - } - - entry->wr_id = wrid; - entry++; - cqe_count++; - } - - if (move_cq_head) { - nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0; - if (++head >= cq_size) - head = 0; - nescq->polled_completions++; - - if ((nescq->polled_completions > (cq_size / 2)) || - (nescq->polled_completions == 255)) { - nes_debug(NES_DBG_CQ, "CQ%u Issuing CQE Allocate since more than half of cqes" - " are pending %u of %u.\n", - nescq->hw_cq.cq_number, nescq->polled_completions, cq_size); - nes_write32(nesdev->regs+NES_CQE_ALLOC, - nescq->hw_cq.cq_number | (nescq->polled_completions << 16)); - nescq->polled_completions = 0; - } - } else { - /* Update the wqe index and set status to flush */ - wqe_index = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]); - wqe_index = (wqe_index & (~(nesdev->nesadapter->max_qp_wr - 1))) | wq_tail; - nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX] = - cpu_to_le32(wqe_index); - move_cq_head = 1; /* ready for next pass */ - } - } - - if (nescq->polled_completions) { - nes_write32(nesdev->regs+NES_CQE_ALLOC, - nescq->hw_cq.cq_number | (nescq->polled_completions << 16)); - nescq->polled_completions = 0; - } - - nescq->hw_cq.cq_head = head; - nes_debug(NES_DBG_CQ, "Reporting %u completions for CQ%u.\n", - cqe_count, nescq->hw_cq.cq_number); - - spin_unlock_irqrestore(&nescq->lock, flags); - - return cqe_count; -} - - -/** - * nes_req_notify_cq - */ -static int nes_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) - { - struct nes_vnic *nesvnic = to_nesvnic(ibcq->device); - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_cq *nescq = to_nescq(ibcq); - u32 cq_arm; - - nes_debug(NES_DBG_CQ, "Requesting notification for CQ%u.\n", - nescq->hw_cq.cq_number); - - cq_arm = nescq->hw_cq.cq_number; - if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_NEXT_COMP) - cq_arm |= NES_CQE_ALLOC_NOTIFY_NEXT; - else if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) - cq_arm |= NES_CQE_ALLOC_NOTIFY_SE; - else - return -EINVAL; - - nes_write32(nesdev->regs+NES_CQE_ALLOC, cq_arm); - 
nes_read32(nesdev->regs+NES_CQE_ALLOC); - - return 0; -} - -static int nes_port_immutable(struct ib_device *ibdev, u8 port_num, - struct ib_port_immutable *immutable) -{ - struct ib_port_attr attr; - int err; - - immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; - - err = nes_query_port(ibdev, port_num, &attr); - if (err) - return err; - - immutable->pkey_tbl_len = attr.pkey_tbl_len; - immutable->gid_tbl_len = attr.gid_tbl_len; - - return 0; -} - -static void get_dev_fw_str(struct ib_device *dev, char *str) -{ - struct nes_ib_device *nesibdev = - container_of(dev, struct nes_ib_device, ibdev); - struct nes_vnic *nesvnic = nesibdev->nesvnic; - - nes_debug(NES_DBG_INIT, "\n"); - snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u", - (nesvnic->nesdev->nesadapter->firmware_version >> 16), - (nesvnic->nesdev->nesadapter->firmware_version & 0x000000ff)); -} - -static const struct ib_device_ops nes_dev_ops = { - .owner = THIS_MODULE, - .driver_id = RDMA_DRIVER_NES, - /* NOTE: Older kernels wrongly use 0 for the uverbs_abi_ver */ - .uverbs_abi_ver = NES_ABI_USERSPACE_VER, - - .alloc_mr = nes_alloc_mr, - .alloc_mw = nes_alloc_mw, - .alloc_pd = nes_alloc_pd, - .alloc_ucontext = nes_alloc_ucontext, - .create_cq = nes_create_cq, - .create_qp = nes_create_qp, - .dealloc_mw = nes_dealloc_mw, - .dealloc_pd = nes_dealloc_pd, - .dealloc_ucontext = nes_dealloc_ucontext, - .dereg_mr = nes_dereg_mr, - .destroy_cq = nes_destroy_cq, - .destroy_qp = nes_destroy_qp, - .drain_rq = nes_drain_rq, - .drain_sq = nes_drain_sq, - .get_dev_fw_str = get_dev_fw_str, - .get_dma_mr = nes_get_dma_mr, - .get_port_immutable = nes_port_immutable, - .iw_accept = nes_accept, - .iw_add_ref = nes_add_ref, - .iw_connect = nes_connect, - .iw_create_listen = nes_create_listen, - .iw_destroy_listen = nes_destroy_listen, - .iw_get_qp = nes_get_qp, - .iw_reject = nes_reject, - .iw_rem_ref = nes_rem_ref, - .map_mr_sg = nes_map_mr_sg, - .mmap = nes_mmap, - .modify_qp = nes_modify_qp, - .poll_cq = nes_poll_cq, - .post_recv = nes_post_recv, - .post_send = nes_post_send, - .query_device = nes_query_device, - .query_gid = nes_query_gid, - .query_pkey = nes_query_pkey, - .query_port = nes_query_port, - .query_qp = nes_query_qp, - .reg_user_mr = nes_reg_user_mr, - .req_notify_cq = nes_req_notify_cq, - INIT_RDMA_OBJ_SIZE(ib_pd, nes_pd, ibpd), - INIT_RDMA_OBJ_SIZE(ib_cq, nes_cq, ibcq), - INIT_RDMA_OBJ_SIZE(ib_ucontext, nes_ucontext, ibucontext), -}; - -/** - * nes_init_ofa_device - */ -struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev) -{ - struct nes_ib_device *nesibdev; - struct nes_vnic *nesvnic = netdev_priv(netdev); - struct nes_device *nesdev = nesvnic->nesdev; - - nesibdev = ib_alloc_device(nes_ib_device, ibdev); - if (nesibdev == NULL) { - return NULL; - } - - nesibdev->ibdev.node_type = RDMA_NODE_RNIC; - memset(&nesibdev->ibdev.node_guid, 0, sizeof(nesibdev->ibdev.node_guid)); - memcpy(&nesibdev->ibdev.node_guid, netdev->dev_addr, 6); - - nesibdev->ibdev.uverbs_cmd_mask = - (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | - (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | - (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | - (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | - (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | - (1ull << IB_USER_VERBS_CMD_REG_MR) | - (1ull << IB_USER_VERBS_CMD_DEREG_MR) | - (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | - (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | - (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | - (1ull << IB_USER_VERBS_CMD_CREATE_AH) | - (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | - (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | - 
(1ull << IB_USER_VERBS_CMD_CREATE_QP) | - (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | - (1ull << IB_USER_VERBS_CMD_POLL_CQ) | - (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | - (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | - (1ull << IB_USER_VERBS_CMD_BIND_MW) | - (1ull << IB_USER_VERBS_CMD_DEALLOC_MW) | - (1ull << IB_USER_VERBS_CMD_POST_RECV) | - (1ull << IB_USER_VERBS_CMD_POST_SEND); - - nesibdev->ibdev.phys_port_cnt = 1; - nesibdev->ibdev.num_comp_vectors = 1; - nesibdev->ibdev.dev.parent = &nesdev->pcidev->dev; - - ib_set_device_ops(&nesibdev->ibdev, &nes_dev_ops); - memcpy(nesibdev->ibdev.iw_ifname, netdev->name, - sizeof(nesibdev->ibdev.iw_ifname)); - - return nesibdev; -} - - -/** - * nes_handle_delayed_event - */ -static void nes_handle_delayed_event(struct timer_list *t) -{ - struct nes_vnic *nesvnic = from_timer(nesvnic, t, event_timer); - - if (nesvnic->delayed_event != nesvnic->last_dispatched_event) { - struct ib_event event; - - event.device = &nesvnic->nesibdev->ibdev; - if (!event.device) - goto stop_timer; - event.event = nesvnic->delayed_event; - event.element.port_num = nesvnic->logical_port + 1; - ib_dispatch_event(&event); - } - -stop_timer: - nesvnic->event_timer.function = NULL; -} - - -void nes_port_ibevent(struct nes_vnic *nesvnic) -{ - struct nes_ib_device *nesibdev = nesvnic->nesibdev; - struct nes_device *nesdev = nesvnic->nesdev; - struct ib_event event; - event.device = &nesibdev->ibdev; - event.element.port_num = nesvnic->logical_port + 1; - event.event = nesdev->iw_status ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; - - if (!nesvnic->event_timer.function) { - ib_dispatch_event(&event); - nesvnic->last_dispatched_event = event.event; - nesvnic->event_timer.function = nes_handle_delayed_event; - nesvnic->event_timer.expires = jiffies + NES_EVENT_DELAY; - add_timer(&nesvnic->event_timer); - } else { - mod_timer(&nesvnic->event_timer, jiffies + NES_EVENT_DELAY); - } - nesvnic->delayed_event = event.event; -} - - -/** - * nes_destroy_ofa_device - */ -void nes_destroy_ofa_device(struct nes_ib_device *nesibdev) -{ - nes_unregister_ofa_device(nesibdev); - - ib_dealloc_device(&nesibdev->ibdev); -} - - -/** - * nes_register_ofa_device - */ -int nes_register_ofa_device(struct nes_ib_device *nesibdev) -{ - struct nes_vnic *nesvnic = nesibdev->nesvnic; - struct nes_device *nesdev = nesvnic->nesdev; - struct nes_adapter *nesadapter = nesdev->nesadapter; - int ret; - - rdma_set_device_sysfs_group(&nesvnic->nesibdev->ibdev, &nes_attr_group); - ret = ib_register_device(&nesvnic->nesibdev->ibdev, "nes%d"); - if (ret) { - return ret; - } - - /* Get the resources allocated to this device */ - nesibdev->max_cq = (nesadapter->max_cq-NES_FIRST_QPN) / nesadapter->port_count; - nesibdev->max_mr = nesadapter->max_mr / nesadapter->port_count; - nesibdev->max_qp = (nesadapter->max_qp-NES_FIRST_QPN) / nesadapter->port_count; - nesibdev->max_pd = nesadapter->max_pd / nesadapter->port_count; - - nesvnic->of_device_registered = 1; - - return 0; -} - - -/** - * nes_unregister_ofa_device - */ -static void nes_unregister_ofa_device(struct nes_ib_device *nesibdev) -{ - struct nes_vnic *nesvnic = nesibdev->nesvnic; - - if (nesvnic->of_device_registered) - ib_unregister_device(&nesibdev->ibdev); - - nesvnic->of_device_registered = 0; -} diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h deleted file mode 100644 index 114a9b59fefd..000000000000 --- a/drivers/infiniband/hw/nes/nes_verbs.h +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright (c) 2006 - 2011 Intel Corporation. 
All rights reserved. - * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -#ifndef NES_VERBS_H -#define NES_VERBS_H - -struct nes_device; - -#define NES_MAX_USER_DB_REGIONS 4096 -#define NES_MAX_USER_WQ_REGIONS 4096 - -#define NES_TERM_SENT 0x01 -#define NES_TERM_RCVD 0x02 -#define NES_TERM_DONE 0x04 - -struct nes_ucontext { - struct ib_ucontext ibucontext; - struct nes_device *nesdev; - unsigned long mmap_wq_offset; - unsigned long mmap_cq_offset; /* to be removed */ - int index; /* rnic index (minor) */ - unsigned long allocated_doorbells[BITS_TO_LONGS(NES_MAX_USER_DB_REGIONS)]; - u16 mmap_db_index[NES_MAX_USER_DB_REGIONS]; - u16 first_free_db; - unsigned long allocated_wqs[BITS_TO_LONGS(NES_MAX_USER_WQ_REGIONS)]; - struct nes_qp *mmap_nesqp[NES_MAX_USER_WQ_REGIONS]; - u16 first_free_wq; - struct list_head cq_reg_mem_list; - struct list_head qp_reg_mem_list; - u32 mcrqf; -}; - -struct nes_pd { - struct ib_pd ibpd; - u16 pd_id; - atomic_t sqp_count; - u16 mmap_db_index; -}; - -struct nes_mr { - union { - struct ib_mr ibmr; - struct ib_mw ibmw; - struct ib_fmr ibfmr; - }; - struct ib_umem *region; - u16 pbls_used; - u8 mode; - u8 pbl_4k; - __le64 *pages; - dma_addr_t paddr; - u32 max_pages; - u32 npages; -}; - -struct nes_hw_pb { - __le32 pa_low; - __le32 pa_high; -}; - -struct nes_vpbl { - dma_addr_t pbl_pbase; - struct nes_hw_pb *pbl_vbase; -}; - -struct nes_root_vpbl { - dma_addr_t pbl_pbase; - struct nes_hw_pb *pbl_vbase; - struct nes_vpbl *leaf_vpbl; -}; - -struct nes_fmr { - struct nes_mr nesmr; - u32 leaf_pbl_cnt; - struct nes_root_vpbl root_vpbl; - struct ib_qp *ib_qp; - int access_rights; - struct ib_fmr_attr attr; -}; - -struct nes_av; - -struct nes_cq { - struct ib_cq ibcq; - struct nes_hw_cq hw_cq; - u32 polled_completions; - u32 cq_mem_size; - spinlock_t lock; - u8 virtual_cq; - u8 pad[3]; - u32 mcrqf; -}; - -struct nes_wq { - spinlock_t lock; -}; - -struct disconn_work { - struct work_struct work; - struct nes_qp *nesqp; -}; - -struct iw_cm_id; -struct ietf_mpa_frame; - -struct nes_qp { - struct ib_qp ibqp; - void *allocated_buffer; - struct iw_cm_id *cm_id; - struct nes_cq *nesscq; - struct nes_cq *nesrcq; - struct nes_pd *nespd; - void 
*cm_node; /* handle of the node this QP is associated with */ - void *ietf_frame; - u8 ietf_frame_size; - dma_addr_t ietf_frame_pbase; - struct ib_mr *lsmm_mr; - struct nes_hw_qp hwqp; - struct work_struct work; - enum ib_qp_state ibqp_state; - u32 iwarp_state; - u32 hte_index; - u32 last_aeq; - u32 qp_mem_size; - atomic_t refcount; - atomic_t close_timer_started; - u32 mmap_sq_db_index; - u32 mmap_rq_db_index; - spinlock_t lock; - spinlock_t pau_lock; - struct nes_qp_context *nesqp_context; - dma_addr_t nesqp_context_pbase; - void *pbl_vbase; - dma_addr_t pbl_pbase; - struct page *page; - struct timer_list terminate_timer; - enum ib_event_type terminate_eventtype; - struct sk_buff_head pau_list; - u32 pau_rcv_nxt; - u16 active_conn:1; - u16 skip_lsmm:1; - u16 user_mode:1; - u16 hte_added:1; - u16 flush_issued:1; - u16 destroyed:1; - u16 sig_all:1; - u16 pau_mode:1; - u16 rsvd:8; - u16 private_data_len; - u16 term_sq_flush_code; - u16 term_rq_flush_code; - u8 hw_iwarp_state; - u8 hw_tcp_state; - u8 term_flags; - u8 sq_kmapped; - u8 pau_busy; - u8 pau_pending; - u8 pau_state; - __u64 nesuqp_addr; - struct completion sq_drained; - struct completion rq_drained; -}; - -struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, - u64 addr, u64 size, int acc, u64 *iova_start); - -#endif /* NES_VERBS_H */ From 5d60c11154116e2127374d4178e952649612b69b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 13 Jun 2019 21:38:17 -0300 Subject: [PATCH 058/194] RDMA: Move rdma_node_type to uapi/ This enum is exposed over the sysfs file 'node_type' and over netlink via RDMA_NLDEV_ATTR_DEV_NODE_TYPE, so declare it in the uapi headers. Signed-off-by: Jason Gunthorpe Signed-off-by: Doug Ledford --- drivers/infiniband/core/verbs.c | 2 +- include/rdma/ib_verbs.h | 13 +------------ include/uapi/rdma/rdma_netlink.h | 12 ++++++++++++ 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 585e100706aa..588f1d195fd2 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -209,7 +209,7 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate) EXPORT_SYMBOL(ib_rate_to_mbps); __attribute_const__ enum rdma_transport_type -rdma_node_get_transport(enum rdma_node_type node_type) +rdma_node_get_transport(unsigned int node_type) { if (node_type == RDMA_NODE_USNIC) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index f357e03a85a6..973514ea17a7 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -132,17 +132,6 @@ struct ib_gid_attr { u8 port_num; }; -enum rdma_node_type { - /* IB values map to NodeInfo:NodeType. */ - RDMA_NODE_IB_CA = 1, - RDMA_NODE_IB_SWITCH, - RDMA_NODE_IB_ROUTER, - RDMA_NODE_RNIC, - RDMA_NODE_USNIC, - RDMA_NODE_USNIC_UDP, - RDMA_NODE_UNSPECIFIED, -}; - enum { /* set the local administered indication */ IB_SA_WELL_KNOWN_GUID = BIT_ULL(57) | 2, @@ -164,7 +153,7 @@ enum rdma_protocol_type { }; __attribute_const__ enum rdma_transport_type -rdma_node_get_transport(enum rdma_node_type node_type); +rdma_node_get_transport(unsigned int node_type); enum rdma_network_type { RDMA_NETWORK_IB, diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 41db51367efa..f588e8551c6c 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -147,6 +147,18 @@ enum { IWPM_NLA_HELLO_MAX }; +/* For RDMA_NLDEV_ATTR_DEV_NODE_TYPE */ +enum { + /* IB values map to NodeInfo:NodeType. 
*/ + RDMA_NODE_IB_CA = 1, + RDMA_NODE_IB_SWITCH, + RDMA_NODE_IB_ROUTER, + RDMA_NODE_RNIC, + RDMA_NODE_USNIC, + RDMA_NODE_USNIC_UDP, + RDMA_NODE_UNSPECIFIED, +}; + /* * Local service operations: * RESOLVE - The client requests the local service to resolve a path. From 0e2d00eb6fd45f2a645f4874286bdc5b4b53782b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 13 Jun 2019 21:38:18 -0300 Subject: [PATCH 059/194] RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload Allow userspace to issue a netlink query against the ib_device for something like "uverbs" and get back the char dev name, inode major/minor, and interface ABI information for "uverbs0". Since we are now in netlink this can also trigger a module autoload to make the uverbs device come into existence. Largely this will let us replace searching and reading inside sysfs to setup devices, and provides an alternative (using driver_id) to device name based provider binding for things like rxe. Signed-off-by: Jason Gunthorpe Signed-off-by: Doug Ledford --- drivers/infiniband/core/core_priv.h | 9 +++ drivers/infiniband/core/device.c | 98 +++++++++++++++++++++++++++++ drivers/infiniband/core/nldev.c | 94 +++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 4 ++ include/rdma/rdma_netlink.h | 2 + include/uapi/rdma/rdma_netlink.h | 14 +++++ 6 files changed, 221 insertions(+) diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index ff40a450b5d2..a953c2fa2e78 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -88,6 +88,15 @@ typedef int (*nldev_callback)(struct ib_device *device, int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, struct netlink_callback *cb); +struct ib_client_nl_info { + struct sk_buff *nl_msg; + struct device *cdev; + unsigned int port; + u64 abi; +}; +int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, + struct ib_client_nl_info *res); + enum ib_cache_gid_default_mode { IB_CACHE_GID_DEFAULT_MODE_SET, IB_CACHE_GID_DEFAULT_MODE_DELETE diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index abb169f31d0f..7db8566cdb89 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1726,6 +1726,104 @@ void ib_unregister_client(struct ib_client *client) } EXPORT_SYMBOL(ib_unregister_client); +static int __ib_get_global_client_nl_info(const char *client_name, + struct ib_client_nl_info *res) +{ + struct ib_client *client; + unsigned long index; + int ret = -ENOENT; + + down_read(&clients_rwsem); + xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { + if (strcmp(client->name, client_name) != 0) + continue; + if (!client->get_global_nl_info) { + ret = -EOPNOTSUPP; + break; + } + ret = client->get_global_nl_info(res); + if (WARN_ON(ret == -ENOENT)) + ret = -EINVAL; + if (!ret && res->cdev) + get_device(res->cdev); + break; + } + up_read(&clients_rwsem); + return ret; +} + +static int __ib_get_client_nl_info(struct ib_device *ibdev, + const char *client_name, + struct ib_client_nl_info *res) +{ + unsigned long index; + void *client_data; + int ret = -ENOENT; + + down_read(&ibdev->client_data_rwsem); + xan_for_each_marked (&ibdev->client_data, index, client_data, + CLIENT_DATA_REGISTERED) { + struct ib_client *client = xa_load(&clients, index); + + if (!client || strcmp(client->name, client_name) != 0) + continue; + if (!client->get_nl_info) { + ret = -EOPNOTSUPP; + break; + } + ret = client->get_nl_info(ibdev, client_data, res); 
+ if (WARN_ON(ret == -ENOENT)) + ret = -EINVAL; + + /* + * The cdev is guaranteed valid as long as we are inside the + * client_data_rwsem as remove_one can't be called. Keep it + * valid for the caller. + */ + if (!ret && res->cdev) + get_device(res->cdev); + break; + } + up_read(&ibdev->client_data_rwsem); + + return ret; +} + +/** + * ib_get_client_nl_info - Fetch the nl_info from a client + * @device - IB device + * @client_name - Name of the client + * @res - Result of the query + */ +int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, + struct ib_client_nl_info *res) +{ + int ret; + + if (ibdev) + ret = __ib_get_client_nl_info(ibdev, client_name, res); + else + ret = __ib_get_global_client_nl_info(client_name, res); +#ifdef CONFIG_MODULES + if (ret == -ENOENT) { + request_module("rdma-client-%s", client_name); + if (ibdev) + ret = __ib_get_client_nl_info(ibdev, client_name, res); + else + ret = __ib_get_global_client_nl_info(client_name, res); + } +#endif + if (ret) { + if (ret == -ENOENT) + return -EOPNOTSUPP; + return ret; + } + + if (WARN_ON(!res->cdev)) + return -EINVAL; + return 0; +} + /** * ib_set_client_data - Set IB client context * @device:Device to set context for diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 69188cbbd99b..16b5d6d4dd1c 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -120,6 +120,12 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_DEV_PROTOCOL] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, [RDMA_NLDEV_NET_NS_FD] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_CHARDEV] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_CHARDEV_ABI] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_CHARDEV_TYPE] = { .type = NLA_NUL_STRING, + .len = 128 }, + [RDMA_NLDEV_ATTR_CHARDEV_NAME] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -1347,6 +1353,91 @@ static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, return 0; } +static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + char client_name[IB_DEVICE_NAME_MAX]; + struct ib_client_nl_info data = {}; + struct ib_device *ibdev = NULL; + struct sk_buff *msg; + u32 index; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, + extack); + if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE]) + return -EINVAL; + + if (nla_strlcpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE], + sizeof(client_name)) >= sizeof(client_name)) + return -EINVAL; + + if (tb[RDMA_NLDEV_ATTR_DEV_INDEX]) { + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + ibdev = ib_device_get_by_index(sock_net(skb->sk), index); + if (!ibdev) + return -EINVAL; + + if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { + data.port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(ibdev, data.port)) { + err = -EINVAL; + goto out_put; + } + } else { + data.port = -1; + } + } else if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { + return -EINVAL; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + err = -ENOMEM; + goto out_put; + } + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_GET_CHARDEV), + 0, 0); + + data.nl_msg = msg; + err = ib_get_client_nl_info(ibdev, client_name, &data); + if (err) + goto out_nlmsg; + + err = 
nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV, + huge_encode_dev(data.cdev->devt), + RDMA_NLDEV_ATTR_PAD); + if (err) + goto out_data; + err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV_ABI, data.abi, + RDMA_NLDEV_ATTR_PAD); + if (err) + goto out_data; + if (nla_put_string(msg, RDMA_NLDEV_ATTR_CHARDEV_NAME, + dev_name(data.cdev))) { + err = -EMSGSIZE; + goto out_data; + } + + nlmsg_end(msg, nlh); + put_device(data.cdev); + if (ibdev) + ib_device_put(ibdev); + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + +out_data: + put_device(data.cdev); +out_nlmsg: + nlmsg_free(msg); +out_put: + if (ibdev) + ib_device_put(ibdev); + return err; +} + static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -1404,6 +1495,9 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_get_doit, .dump = nldev_get_dumpit, }, + [RDMA_NLDEV_CMD_GET_CHARDEV] = { + .doit = nldev_get_chardev, + }, [RDMA_NLDEV_CMD_SET] = { .doit = nldev_set_doit, .flags = RDMA_NL_ADMIN_PERM, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 973514ea17a7..a1265e9ce2d1 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2684,10 +2684,14 @@ struct ib_device { u32 iw_driver_flags; }; +struct ib_client_nl_info; struct ib_client { const char *name; void (*add) (struct ib_device *); void (*remove)(struct ib_device *, void *client_data); + int (*get_nl_info)(struct ib_device *ibdev, void *client_data, + struct ib_client_nl_info *res); + int (*get_global_nl_info)(struct ib_client_nl_info *res); /* Returns the net_dev belonging to this ib_client and matching the * given parameters. diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index 10732ab31ba2..c7acbe083428 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -110,4 +110,6 @@ void rdma_link_register(struct rdma_link_ops *ops); void rdma_link_unregister(struct rdma_link_ops *ops); #define MODULE_ALIAS_RDMA_LINK(type) MODULE_ALIAS("rdma-link-" type) +#define MODULE_ALIAS_RDMA_CLIENT(type) MODULE_ALIAS("rdma-client-" type) + #endif /* _RDMA_NETLINK_H */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index f588e8551c6c..9903db21a42c 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -279,6 +279,8 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_RES_PD_GET, /* can dump */ + RDMA_NLDEV_CMD_GET_CHARDEV, + RDMA_NLDEV_NUM_OPS }; @@ -491,6 +493,18 @@ enum rdma_nldev_attr { */ RDMA_NLDEV_NET_NS_FD, /* u32 */ + /* + * Information about a chardev. + * CHARDEV_TYPE is the name of the chardev ABI (ie uverbs, umad, etc) + * CHARDEV_ABI signals the ABI revision (historical) + * CHARDEV_NAME is the kernel name for the /dev/ file (no directory) + * CHARDEV is the 64 bit dev_t for the inode + */ + RDMA_NLDEV_ATTR_CHARDEV_TYPE, /* string */ + RDMA_NLDEV_ATTR_CHARDEV_NAME, /* string */ + RDMA_NLDEV_ATTR_CHARDEV_ABI, /* u64 */ + RDMA_NLDEV_ATTR_CHARDEV, /* u64 */ + /* * Always the end */ From 8f71bb0030b8816f57be142f95b3c7189c6eaf4c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 13 Jun 2019 21:38:19 -0300 Subject: [PATCH 060/194] RDMA: Report available cdevs through RDMA_NLDEV_CMD_GET_CHARDEV Update the struct ib_client for all modules exporting cdevs related to the ibdevice to also implement RDMA_NLDEV_CMD_GET_CHARDEV. All cdevs are now autoloadable and discoverable by userspace over netlink instead of relying on sysfs. 
uverbs also exposes the DRIVER_ID for drivers that are able to support driver id binding in rdma-core. Signed-off-by: Jason Gunthorpe Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/device.c | 3 ++ drivers/infiniband/core/nldev.c | 1 + drivers/infiniband/core/ucma.c | 23 +++++++++ drivers/infiniband/core/user_mad.c | 51 ++++++++++++++++++-- drivers/infiniband/core/uverbs_main.c | 32 +++++++++++- drivers/infiniband/hw/cxgb3/iwch_provider.c | 1 + drivers/infiniband/hw/hns/hns_roce_main.c | 1 + drivers/infiniband/hw/mthca/mthca_provider.c | 1 + include/rdma/ib_verbs.h | 1 + include/uapi/rdma/rdma_netlink.h | 1 + 10 files changed, 110 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 7db8566cdb89..1de4ae5d5e0e 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2428,6 +2428,9 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) if (ops->uverbs_abi_ver) dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; + dev_ops->uverbs_no_driver_id_binding |= + ops->uverbs_no_driver_id_binding; + SET_DEVICE_OP(dev_ops, add_gid); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 16b5d6d4dd1c..3cad72a609ff 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -126,6 +126,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { .len = 128 }, [RDMA_NLDEV_ATTR_CHARDEV_NAME] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, + [RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID] = { .type = NLA_U32 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 39823c842202..0274e9b704be 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -52,6 +52,8 @@ #include #include #include +#include +#include "core_priv.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access"); @@ -1788,6 +1790,19 @@ static struct miscdevice ucma_misc = { .fops = &ucma_fops, }; +static int ucma_get_global_nl_info(struct ib_client_nl_info *res) +{ + res->abi = RDMA_USER_CM_ABI_VERSION; + res->cdev = ucma_misc.this_device; + return 0; +} + +static struct ib_client rdma_cma_client = { + .name = "rdma_cm", + .get_global_nl_info = ucma_get_global_nl_info, +}; +MODULE_ALIAS_RDMA_CLIENT("rdma_cm"); + static ssize_t show_abi_version(struct device *dev, struct device_attribute *attr, char *buf) @@ -1816,7 +1831,14 @@ static int __init ucma_init(void) ret = -ENOMEM; goto err2; } + + ret = ib_register_client(&rdma_cma_client); + if (ret) + goto err3; + return 0; +err3: + unregister_net_sysctl_table(ucma_ctl_table_hdr); err2: device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); err1: @@ -1826,6 +1848,7 @@ err1: static void __exit ucma_cleanup(void) { + ib_unregister_client(&rdma_cma_client); unregister_net_sysctl_table(ucma_ctl_table_hdr); device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); misc_deregister(&ucma_misc); diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 671f07ba1fad..547090b41cfb 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -54,6 +54,7 @@ #include #include +#include #include "core_priv.h" @@ -1124,11 +1125,48 @@ static const struct file_operations 
umad_sm_fops = { .llseek = no_llseek, }; +static int ib_umad_get_nl_info(struct ib_device *ibdev, void *client_data, + struct ib_client_nl_info *res) +{ + struct ib_umad_device *umad_dev = client_data; + + if (!rdma_is_port_valid(ibdev, res->port)) + return -EINVAL; + + res->abi = IB_USER_MAD_ABI_VERSION; + res->cdev = &umad_dev->ports[res->port - rdma_start_port(ibdev)].dev; + + return 0; +} + static struct ib_client umad_client = { .name = "umad", .add = ib_umad_add_one, - .remove = ib_umad_remove_one + .remove = ib_umad_remove_one, + .get_nl_info = ib_umad_get_nl_info, }; +MODULE_ALIAS_RDMA_CLIENT("umad"); + +static int ib_issm_get_nl_info(struct ib_device *ibdev, void *client_data, + struct ib_client_nl_info *res) +{ + struct ib_umad_device *umad_dev = + ib_get_client_data(ibdev, &umad_client); + + if (!rdma_is_port_valid(ibdev, res->port)) + return -EINVAL; + + res->abi = IB_USER_MAD_ABI_VERSION; + res->cdev = &umad_dev->ports[res->port - rdma_start_port(ibdev)].sm_dev; + + return 0; +} + +static struct ib_client issm_client = { + .name = "issm", + .get_nl_info = ib_issm_get_nl_info, +}; +MODULE_ALIAS_RDMA_CLIENT("issm"); static ssize_t ibdev_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -1387,13 +1425,17 @@ static int __init ib_umad_init(void) } ret = ib_register_client(&umad_client); - if (ret) { - pr_err("couldn't register ib_umad client\n"); + if (ret) goto out_class; - } + + ret = ib_register_client(&issm_client); + if (ret) + goto out_client; return 0; +out_client: + ib_unregister_client(&umad_client); out_class: class_unregister(&umad_class); @@ -1411,6 +1453,7 @@ out: static void __exit ib_umad_cleanup(void) { + ib_unregister_client(&issm_client); ib_unregister_client(&umad_client); class_unregister(&umad_class); unregister_chrdev_region(base_umad_dev, diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 870b3dd35aac..11c13c1381cf 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -51,6 +51,7 @@ #include #include +#include #include "uverbs.h" #include "core_priv.h" @@ -1148,12 +1149,41 @@ static const struct file_operations uverbs_mmap_fops = { .compat_ioctl = ib_uverbs_ioctl, }; +static int ib_uverbs_get_nl_info(struct ib_device *ibdev, void *client_data, + struct ib_client_nl_info *res) +{ + struct ib_uverbs_device *uverbs_dev = client_data; + int ret; + + if (res->port != -1) + return -EINVAL; + + res->abi = ibdev->ops.uverbs_abi_ver; + res->cdev = &uverbs_dev->dev; + + /* + * To support DRIVER_ID binding in userspace some of the driver need + * upgrading to expose their PCI dependent revision information + * through get_context instead of relying on modalias matching. When + * the drivers are fixed they can drop this flag. 
+ */ + if (!ibdev->ops.uverbs_no_driver_id_binding) { + ret = nla_put_u32(res->nl_msg, RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID, + ibdev->ops.driver_id); + if (ret) + return ret; + } + return 0; +} + static struct ib_client uverbs_client = { .name = "uverbs", .no_kverbs_req = true, .add = ib_uverbs_add_one, - .remove = ib_uverbs_remove_one + .remove = ib_uverbs_remove_one, + .get_nl_info = ib_uverbs_get_nl_info, }; +MODULE_ALIAS_RDMA_CLIENT("uverbs"); static ssize_t ibdev_show(struct device *device, struct device_attribute *attr, char *buf) diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index acba96f289cc..810fa96af2e9 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1230,6 +1230,7 @@ static const struct ib_device_ops iwch_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_CXGB3, .uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION, + .uverbs_no_driver_id_binding = 1, .alloc_hw_stats = iwch_alloc_stats, .alloc_mr = iwch_alloc_mr, diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 3e45b119b0eb..c0e819ed8c9b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -417,6 +417,7 @@ static const struct ib_device_ops hns_roce_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_HNS, .uverbs_abi_ver = 1, + .uverbs_no_driver_id_binding = 1, .add_gid = hns_roce_add_gid, .alloc_pd = hns_roce_alloc_pd, diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index efd4e3d13ae2..d97124bee703 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1147,6 +1147,7 @@ static const struct ib_device_ops mthca_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_MTHCA, .uverbs_abi_ver = MTHCA_UVERBS_ABI_VERSION, + .uverbs_no_driver_id_binding = 1, .alloc_pd = mthca_alloc_pd, .alloc_ucontext = mthca_alloc_ucontext, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index a1265e9ce2d1..6f09fcc21d7a 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2321,6 +2321,7 @@ struct ib_device_ops { struct module *owner; enum rdma_driver_id driver_id; u32 uverbs_abi_ver; + unsigned int uverbs_no_driver_id_binding:1; int (*post_send)(struct ib_qp *qp, const struct ib_send_wr *send_wr, const struct ib_send_wr **bad_send_wr); diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 9903db21a42c..b27c02185dcc 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -504,6 +504,7 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_CHARDEV_NAME, /* string */ RDMA_NLDEV_ATTR_CHARDEV_ABI, /* u64 */ RDMA_NLDEV_ATTR_CHARDEV, /* u64 */ + RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID, /* u64 */ /* * Always the end From 5a3113d19cb0ec06fd0d068a2f2860786a770e4f Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 17 Jun 2019 16:01:38 +0200 Subject: [PATCH 061/194] IB/hfi1: Spelling s/statisfied/satisfied/ Signed-off-by: Geert Uytterhoeven Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index bdf1c313e13f..8f564b0a3333 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -477,7 +477,7 @@ static struct rvt_qp *first_qp(struct 
hfi1_ctxtdata *rcd, * Must hold the qp s_lock and the exp_lock. * * Return: - * false if either of the conditions below are statisfied: + * false if either of the conditions below are satisfied: * 1. The list is empty or * 2. The indicated qp is at the head of the list and the * HFI1_S_WAIT_TID_SPACE bit is set in qp->s_flags. From 7608bf40cf2480057ec0da31456cc428791c32ef Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 11 Jun 2019 13:09:51 -0300 Subject: [PATCH 062/194] RDMA/odp: Fix missed unlock in non-blocking invalidate_start If invalidate_start returns with EAGAIN then the umem_rwsem needs to be unlocked as no invalidate_end will be called. Cc: Fixes: ca748c39ea3f ("RDMA/umem: Get rid of per_mm->notifier_count") Signed-off-by: Jason Gunthorpe Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem_odp.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 9001cc10770a..eb9939d52818 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -149,6 +149,7 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, { struct ib_ucontext_per_mm *per_mm = container_of(mn, struct ib_ucontext_per_mm, mn); + int rc; if (mmu_notifier_range_blockable(range)) down_read(&per_mm->umem_rwsem); @@ -165,11 +166,14 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, return 0; } - return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start, - range->end, - invalidate_range_start_trampoline, - mmu_notifier_range_blockable(range), - NULL); + rc = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start, + range->end, + invalidate_range_start_trampoline, + mmu_notifier_range_blockable(range), + NULL); + if (rc) + up_read(&per_mm->umem_rwsem); + return rc; } static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start, From cecae747b6208d40d08f8336393345093536b124 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Wed, 12 Jun 2019 15:20:13 +0300 Subject: [PATCH 063/194] RDMA/mlx5: Consider eswitch encap mode When flow steering is created, then the encap support should consider the eswitch encap mode. If the eswitch flow table (FDB) supports encap then it shouldn't be supported on NIC RX flow tables. 
Fixes: 4adda1122c490 ('RDMA/mlx5: Enable decap and packet reformat on flow tables') Signed-off-by: Maor Gottlieb Reviewed-by: Petr Vorel Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 99eb4a8b0b0d..6a69be02ed0d 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -3252,11 +3253,14 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, int max_table_size; int num_entries; int num_groups; + bool esw_encap; u32 flags = 0; int priority; max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, log_max_ft_size)); + esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) != + DEVLINK_ESWITCH_ENCAP_MODE_NONE; if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { enum mlx5_flow_namespace_type fn_type; @@ -3269,10 +3273,10 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, if (ft_type == MLX5_IB_FT_RX) { fn_type = MLX5_FLOW_NAMESPACE_BYPASS; prio = &dev->flow_db->prios[priority]; - if (!dev->is_rep && + if (!dev->is_rep && !esw_encap && MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap)) flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP; - if (!dev->is_rep && + if (!dev->is_rep && !esw_encap && MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, reformat_l3_tunnel_to_l2)) flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; @@ -3282,7 +3286,7 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, log_max_ft_size)); fn_type = MLX5_FLOW_NAMESPACE_EGRESS; prio = &dev->flow_db->egress_prios[priority]; - if (!dev->is_rep && + if (!dev->is_rep && !esw_encap && MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat)) flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; } @@ -3892,6 +3896,7 @@ _get_flow_table(struct mlx5_ib_dev *dev, struct mlx5_flow_namespace *ns = NULL; struct mlx5_ib_flow_prio *prio = NULL; int max_table_size = 0; + bool esw_encap; u32 flags = 0; int priority; @@ -3900,18 +3905,21 @@ _get_flow_table(struct mlx5_ib_dev *dev, else priority = ib_prio_to_core_prio(fs_matcher->priority, false); + esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) != + DEVLINK_ESWITCH_ENCAP_MODE_NONE; if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) { max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, log_max_ft_size)); - if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap)) + if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap) && !esw_encap) flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP; if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, - reformat_l3_tunnel_to_l2)) + reformat_l3_tunnel_to_l2) && + !esw_encap) flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS) { max_table_size = BIT( MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, log_max_ft_size)); - if (MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat)) + if (MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat) && !esw_encap) flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB) { max_table_size = BIT( From 09d985bea99da4991076a53e104ec213ac518adb Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Wed, 12 Jun 2019 15:20:14 +0300 Subject: [PATCH 064/194] RDMA/mlx5: Enable decap and packet reformat on FDB If FDB flow tables support decap operation, enable it on creation, This allows to perform decapsulation of tunnelled packets by steering rules. 
If FDB flow tables support reformat operation, enable it on creation as well. Signed-off-by: Maor Gottlieb Reviewed-by: Petr Vorel Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 6a69be02ed0d..3b1985215cb9 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3924,6 +3924,11 @@ _get_flow_table(struct mlx5_ib_dev *dev, } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB) { max_table_size = BIT( MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, log_max_ft_size)); + if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, decap) && esw_encap) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP; + if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, reformat_l3_tunnel_to_l2) && + esw_encap) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; priority = FDB_BYPASS_PATH; } From 696de2e9ccec812eae8c85bd24c75bcb12104750 Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Wed, 19 Jun 2019 09:20:49 -0400 Subject: [PATCH 065/194] RDMA/netlink: Resort policy array Sort the netlink policy array by netlink attribute name. This will make it easier in the future to find the entry you are looking for when you need to make changes, or to make sure you don't add the same entry twice. Fix the whitespace while we are there. Reviewed-by: Jason Gunthorpe Signed-off-by: Doug Ledford --- drivers/infiniband/core/nldev.c | 155 ++++++++++++++++---------------- 1 file changed, 79 insertions(+), 76 deletions(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 3cad72a609ff..6006d23d0410 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -42,91 +42,94 @@ #include "cma_priv.h" #include "restrack.h" +/* + * Sort array elements by the netlink attribute name + */ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { - [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, - .len = IB_DEVICE_NAME_MAX - 1}, - [RDMA_NLDEV_ATTR_PORT_INDEX] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = NLA_NUL_STRING, - .len = IB_FW_VERSION_NAME_MAX - 1}, - [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 }, - [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, - [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 }, - [RDMA_NLDEV_ATTR_LID] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_PORT_STATE] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_RES_SUMMARY] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME] = { .type = NLA_NUL_STRING, - .len = 16 }, - [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_CHARDEV] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_CHARDEV_ABI] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_CHARDEV_NAME] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, + [RDMA_NLDEV_ATTR_CHARDEV_TYPE] = { .type = NLA_NUL_STRING, + .len = 128 }, + [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, + .len = IB_DEVICE_NAME_MAX - 1}, + [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DEV_PROTOCOL] = { .type = 
NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, + [RDMA_NLDEV_ATTR_DRIVER] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_DRIVER_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DRIVER_STRING] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, + [RDMA_NLDEV_ATTR_DRIVER_S32] = { .type = NLA_S32 }, + [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 }, + [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = NLA_NUL_STRING, + .len = IB_FW_VERSION_NAME_MAX - 1}, + [RDMA_NLDEV_ATTR_LID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, + [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_NDEV_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_NDEV_NAME] = { .type = NLA_NUL_STRING, + .len = IFNAMSIZ }, + [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_PORT_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_PORT_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_CM_ID] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_CM_IDN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_CQ] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_CQE] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CQN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CQ_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_CTXN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_DST_ADDR] = { + .len = sizeof(struct __kernel_sockaddr_storage) }, + [RDMA_NLDEV_ATTR_RES_IOVA] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = NLA_NUL_STRING, + .len = TASK_COMM_LEN }, + [RDMA_NLDEV_ATTR_RES_LKEY] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_LQPN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_MR] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_MRLEN] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_MRN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_MR_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_PD] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_PDN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_PD_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_PID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_POLL_CTX] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_PS] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_QP] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_QP_ENTRY] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_LQPN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_RKEY] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_RQPN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_RQ_PSN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_SQ_PSN] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_RES_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_SRC_ADDR] = { + .len = sizeof(struct __kernel_sockaddr_storage) }, [RDMA_NLDEV_ATTR_RES_STATE] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_RES_PID] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = NLA_NUL_STRING, - .len = TASK_COMM_LEN }, - [RDMA_NLDEV_ATTR_RES_CM_ID] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_PS] = { .type = NLA_U32 }, - 
[RDMA_NLDEV_ATTR_RES_SRC_ADDR] = { - .len = sizeof(struct __kernel_sockaddr_storage) }, - [RDMA_NLDEV_ATTR_RES_DST_ADDR] = { - .len = sizeof(struct __kernel_sockaddr_storage) }, - [RDMA_NLDEV_ATTR_RES_CQ] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_CQ_ENTRY] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_CQE] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_SUMMARY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR]= { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME]= { .type = NLA_NUL_STRING, + .len = 16 }, + [RDMA_NLDEV_ATTR_RES_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY]= { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_USECNT] = { .type = NLA_U64 }, - [RDMA_NLDEV_ATTR_RES_POLL_CTX] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_RES_MR] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_MR_ENTRY] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_RKEY] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_LKEY] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_IOVA] = { .type = NLA_U64 }, - [RDMA_NLDEV_ATTR_RES_MRLEN] = { .type = NLA_U64 }, - [RDMA_NLDEV_ATTR_RES_PD] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_PD_ENTRY] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_NDEV_INDEX] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_NDEV_NAME] = { .type = NLA_NUL_STRING, - .len = IFNAMSIZ }, - [RDMA_NLDEV_ATTR_DRIVER] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_DRIVER_ENTRY] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_DRIVER_STRING] = { .type = NLA_NUL_STRING, - .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, - [RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_DRIVER_S32] = { .type = NLA_S32 }, - [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 }, - [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 }, - [RDMA_NLDEV_ATTR_RES_PDN] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_CQN] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_MRN] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_CM_IDN] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_CTXN] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING, - .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, - [RDMA_NLDEV_SYS_ATTR_NETNS_MODE] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_DEV_PROTOCOL] = { .type = NLA_NUL_STRING, - .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, - [RDMA_NLDEV_NET_NS_FD] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_CHARDEV] = { .type = NLA_U64 }, - [RDMA_NLDEV_ATTR_CHARDEV_ABI] = { .type = NLA_U64 }, - [RDMA_NLDEV_ATTR_CHARDEV_TYPE] = { .type = NLA_NUL_STRING, - .len = 128 }, - [RDMA_NLDEV_ATTR_CHARDEV_NAME] = { .type = NLA_NUL_STRING, - .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, + [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID] = { .type = NLA_U32 }, + [RDMA_NLDEV_NET_NS_FD] = { .type = NLA_U32 }, + [RDMA_NLDEV_SYS_ATTR_NETNS_MODE] = { .type = NLA_U8 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, From 38389eaa4db192648916464b60f6086d6bbaa6de Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Sat, 8 Jun 2019 14:46:08 +0800 Subject: [PATCH 066/194] RDMA/hns: Add mtr support for mixed multihop addressing Currently, the MTT(memory translate table) design required a buffer 
space must has the same hopnum, but the hip08 hw can support mixed hopnum config in a buffer space. This patch adds the MTR(memory translate region) design for supporting mixed multihop. Signed-off-by: Xi Wang Signed-off-by: Lijun Ou Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hns/hns_roce_device.h | 36 ++ drivers/infiniband/hw/hns/hns_roce_hem.c | 460 ++++++++++++++++++++ drivers/infiniband/hw/hns/hns_roce_hem.h | 14 + drivers/infiniband/hw/hns/hns_roce_mr.c | 118 +++++ 4 files changed, 628 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 303ea7c614a8..ec2ed5cac3bc 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -341,6 +341,29 @@ struct hns_roce_mtt { enum hns_roce_mtt_type mtt_type; }; +struct hns_roce_buf_region { + int offset; /* page offset */ + u32 count; /* page count*/ + int hopnum; /* addressing hop num */ +}; + +#define HNS_ROCE_MAX_BT_REGION 3 +#define HNS_ROCE_MAX_BT_LEVEL 3 +struct hns_roce_hem_list { + struct list_head root_bt; + /* link all bt dma mem by hop config */ + struct list_head mid_bt[HNS_ROCE_MAX_BT_REGION][HNS_ROCE_MAX_BT_LEVEL]; + struct list_head btm_bt; /* link all bottom bt in @mid_bt */ + dma_addr_t root_ba; /* pointer to the root ba table */ + int bt_pg_shift; +}; + +/* memory translate region */ +struct hns_roce_mtr { + struct hns_roce_hem_list hem_list; + int buf_pg_shift; +}; + struct hns_roce_mw { struct ib_mw ibmw; u32 pdn; @@ -1111,6 +1134,19 @@ void hns_roce_mtt_cleanup(struct hns_roce_dev *hr_dev, int hns_roce_buf_write_mtt(struct hns_roce_dev *hr_dev, struct hns_roce_mtt *mtt, struct hns_roce_buf *buf); +void hns_roce_mtr_init(struct hns_roce_mtr *mtr, int bt_pg_shift, + int buf_pg_shift); +int hns_roce_mtr_attach(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, + dma_addr_t **bufs, struct hns_roce_buf_region *regions, + int region_cnt); +void hns_roce_mtr_cleanup(struct hns_roce_dev *hr_dev, + struct hns_roce_mtr *mtr); + +/* hns roce hw need current block and next block addr from mtt */ +#define MTT_MIN_COUNT 2 +int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, + int offset, u64 *mtt_buf, int mtt_max, u64 *base_addr); + int hns_roce_init_pd_table(struct hns_roce_dev *hr_dev); int hns_roce_init_mr_table(struct hns_roce_dev *hr_dev); int hns_roce_init_eq_table(struct hns_roce_dev *hr_dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index 8490a86c3ef0..d145e3ed21d4 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -1157,3 +1157,463 @@ void hns_roce_cleanup_hem(struct hns_roce_dev *hr_dev) &hr_dev->mr_table.mtt_cqe_table); hns_roce_cleanup_hem_table(hr_dev, &hr_dev->mr_table.mtt_table); } + +struct roce_hem_item { + struct list_head list; /* link all hems in the same bt level */ + struct list_head sibling; /* link all hems in last hop for mtt */ + void *addr; + dma_addr_t dma_addr; + size_t count; /* max ba numbers */ + int start; /* start buf offset in this hem */ + int end; /* end buf offset in this hem */ +}; + +static struct roce_hem_item *hem_list_alloc_item(struct hns_roce_dev *hr_dev, + int start, int end, + int count, bool exist_bt, + int bt_level) +{ + struct roce_hem_item *hem; + + hem = kzalloc(sizeof(*hem), GFP_KERNEL); + if (!hem) + return NULL; + + if (exist_bt) { + hem->addr = dma_alloc_coherent(hr_dev->dev, + count * BA_BYTE_LEN, + &hem->dma_addr, 
GFP_KERNEL); + if (!hem->addr) { + kfree(hem); + return NULL; + } + } + + hem->count = count; + hem->start = start; + hem->end = end; + INIT_LIST_HEAD(&hem->list); + INIT_LIST_HEAD(&hem->sibling); + + return hem; +} + +static void hem_list_free_item(struct hns_roce_dev *hr_dev, + struct roce_hem_item *hem, bool exist_bt) +{ + if (exist_bt) + dma_free_coherent(hr_dev->dev, hem->count * BA_BYTE_LEN, + hem->addr, hem->dma_addr); + kfree(hem); +} + +static void hem_list_free_all(struct hns_roce_dev *hr_dev, + struct list_head *head, bool exist_bt) +{ + struct roce_hem_item *hem, *temp_hem; + + list_for_each_entry_safe(hem, temp_hem, head, list) { + list_del(&hem->list); + hem_list_free_item(hr_dev, hem, exist_bt); + } +} + +static void hem_list_link_bt(struct hns_roce_dev *hr_dev, void *base_addr, + u64 table_addr) +{ + *(u64 *)(base_addr) = table_addr; +} + +/* assign L0 table address to hem from root bt */ +static void hem_list_assign_bt(struct hns_roce_dev *hr_dev, + struct roce_hem_item *hem, void *cpu_addr, + u64 phy_addr) +{ + hem->addr = cpu_addr; + hem->dma_addr = (dma_addr_t)phy_addr; +} + +static inline bool hem_list_page_is_in_range(struct roce_hem_item *hem, + int offset) +{ + return (hem->start <= offset && offset <= hem->end); +} + +static struct roce_hem_item *hem_list_search_item(struct list_head *ba_list, + int page_offset) +{ + struct roce_hem_item *hem, *temp_hem; + struct roce_hem_item *found = NULL; + + list_for_each_entry_safe(hem, temp_hem, ba_list, list) { + if (hem_list_page_is_in_range(hem, page_offset)) { + found = hem; + break; + } + } + + return found; +} + +static bool hem_list_is_bottom_bt(int hopnum, int bt_level) +{ + /* + * hopnum base address table levels + * 0 L0(buf) + * 1 L0 -> buf + * 2 L0 -> L1 -> buf + * 3 L0 -> L1 -> L2 -> buf + */ + return bt_level >= (hopnum ? 
hopnum - 1 : hopnum); +} + +/** + * calc base address entries num + * @hopnum: num of mutihop addressing + * @bt_level: base address table level + * @unit: ba entries per bt page + */ +static u32 hem_list_calc_ba_range(int hopnum, int bt_level, int unit) +{ + u32 step; + int max; + int i; + + if (hopnum <= bt_level) + return 0; + /* + * hopnum bt_level range + * 1 0 unit + * ------------ + * 2 0 unit * unit + * 2 1 unit + * ------------ + * 3 0 unit * unit * unit + * 3 1 unit * unit + * 3 2 unit + */ + step = 1; + max = hopnum - bt_level; + for (i = 0; i < max; i++) + step = step * unit; + + return step; +} + +/** + * calc the root ba entries which could cover all regions + * @regions: buf region array + * @region_cnt: array size of @regions + * @unit: ba entries per bt page + */ +int hns_roce_hem_list_calc_root_ba(const struct hns_roce_buf_region *regions, + int region_cnt, int unit) +{ + struct hns_roce_buf_region *r; + int total = 0; + int step; + int i; + + for (i = 0; i < region_cnt; i++) { + r = (struct hns_roce_buf_region *)®ions[i]; + if (r->hopnum > 1) { + step = hem_list_calc_ba_range(r->hopnum, 1, unit); + if (step > 0) + total += (r->count + step - 1) / step; + } else { + total += r->count; + } + } + + return total; +} + +static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev, + const struct hns_roce_buf_region *r, int unit, + int offset, struct list_head *mid_bt, + struct list_head *btm_bt) +{ + struct roce_hem_item *hem_ptrs[HNS_ROCE_MAX_BT_LEVEL] = { NULL }; + struct list_head temp_list[HNS_ROCE_MAX_BT_LEVEL]; + struct roce_hem_item *cur, *pre; + const int hopnum = r->hopnum; + int start_aligned; + int distance; + int ret = 0; + int max_ofs; + int level; + u32 step; + int end; + + if (hopnum <= 1) + return 0; + + if (hopnum > HNS_ROCE_MAX_BT_LEVEL) { + dev_err(hr_dev->dev, "invalid hopnum %d!\n", hopnum); + return -EINVAL; + } + + if (offset < r->offset) { + dev_err(hr_dev->dev, "invalid offset %d,min %d!\n", + offset, r->offset); + return -EINVAL; + } + + distance = offset - r->offset; + max_ofs = r->offset + r->count - 1; + for (level = 0; level < hopnum; level++) + INIT_LIST_HEAD(&temp_list[level]); + + /* config L1 bt to last bt and link them to corresponding parent */ + for (level = 1; level < hopnum; level++) { + cur = hem_list_search_item(&mid_bt[level], offset); + if (cur) { + hem_ptrs[level] = cur; + continue; + } + + step = hem_list_calc_ba_range(hopnum, level, unit); + if (step < 1) { + ret = -EINVAL; + goto err_exit; + } + + start_aligned = (distance / step) * step + r->offset; + end = min_t(int, start_aligned + step - 1, max_ofs); + cur = hem_list_alloc_item(hr_dev, start_aligned, end, unit, + true, level); + if (!cur) { + ret = -ENOMEM; + goto err_exit; + } + hem_ptrs[level] = cur; + list_add(&cur->list, &temp_list[level]); + if (hem_list_is_bottom_bt(hopnum, level)) + list_add(&cur->sibling, &temp_list[0]); + + /* link bt to parent bt */ + if (level > 1) { + pre = hem_ptrs[level - 1]; + step = (cur->start - pre->start) / step * BA_BYTE_LEN; + hem_list_link_bt(hr_dev, pre->addr + step, + cur->dma_addr); + } + } + + list_splice(&temp_list[0], btm_bt); + for (level = 1; level < hopnum; level++) + list_splice(&temp_list[level], &mid_bt[level]); + + return 0; + +err_exit: + for (level = 1; level < hopnum; level++) + hem_list_free_all(hr_dev, &temp_list[level], true); + + return ret; +} + +static int hem_list_alloc_root_bt(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_list *hem_list, int unit, + const struct hns_roce_buf_region *regions, + int region_cnt) 
+{ + struct roce_hem_item *hem, *temp_hem, *root_hem; + struct list_head temp_list[HNS_ROCE_MAX_BT_REGION]; + const struct hns_roce_buf_region *r; + struct list_head temp_root; + struct list_head temp_btm; + void *cpu_base; + u64 phy_base; + int ret = 0; + int offset; + int total; + int step; + int i; + + r = ®ions[0]; + root_hem = hem_list_search_item(&hem_list->root_bt, r->offset); + if (root_hem) + return 0; + + INIT_LIST_HEAD(&temp_root); + total = r->offset; + /* indicate to last region */ + r = ®ions[region_cnt - 1]; + root_hem = hem_list_alloc_item(hr_dev, total, r->offset + r->count - 1, + unit, true, 0); + if (!root_hem) + return -ENOMEM; + list_add(&root_hem->list, &temp_root); + + hem_list->root_ba = root_hem->dma_addr; + + INIT_LIST_HEAD(&temp_btm); + for (i = 0; i < region_cnt; i++) + INIT_LIST_HEAD(&temp_list[i]); + + total = 0; + for (i = 0; i < region_cnt && total < unit; i++) { + r = ®ions[i]; + if (!r->count) + continue; + + /* all regions's mid[x][0] shared the root_bt's trunk */ + cpu_base = root_hem->addr + total * BA_BYTE_LEN; + phy_base = root_hem->dma_addr + total * BA_BYTE_LEN; + + /* if hopnum is 0 or 1, cut a new fake hem from the root bt + * which's address share to all regions. + */ + if (hem_list_is_bottom_bt(r->hopnum, 0)) { + hem = hem_list_alloc_item(hr_dev, r->offset, + r->offset + r->count - 1, + r->count, false, 0); + if (!hem) { + ret = -ENOMEM; + goto err_exit; + } + hem_list_assign_bt(hr_dev, hem, cpu_base, phy_base); + list_add(&hem->list, &temp_list[i]); + list_add(&hem->sibling, &temp_btm); + total += r->count; + } else { + step = hem_list_calc_ba_range(r->hopnum, 1, unit); + if (step < 1) { + ret = -EINVAL; + goto err_exit; + } + /* if exist mid bt, link L1 to L0 */ + list_for_each_entry_safe(hem, temp_hem, + &hem_list->mid_bt[i][1], list) { + offset = hem->start / step * BA_BYTE_LEN; + hem_list_link_bt(hr_dev, cpu_base + offset, + hem->dma_addr); + total++; + } + } + } + + list_splice(&temp_btm, &hem_list->btm_bt); + list_splice(&temp_root, &hem_list->root_bt); + for (i = 0; i < region_cnt; i++) + list_splice(&temp_list[i], &hem_list->mid_bt[i][0]); + + return 0; + +err_exit: + for (i = 0; i < region_cnt; i++) + hem_list_free_all(hr_dev, &temp_list[i], false); + + hem_list_free_all(hr_dev, &temp_root, true); + + return ret; +} + +/* construct the base address table and link them by address hop config */ +int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_list *hem_list, + const struct hns_roce_buf_region *regions, + int region_cnt) +{ + const struct hns_roce_buf_region *r; + int ofs, end; + int ret = 0; + int unit; + int i; + + if (region_cnt > HNS_ROCE_MAX_BT_REGION) { + dev_err(hr_dev->dev, "invalid region region_cnt %d!\n", + region_cnt); + return -EINVAL; + } + + unit = (1 << hem_list->bt_pg_shift) / BA_BYTE_LEN; + for (i = 0; i < region_cnt; i++) { + r = ®ions[i]; + if (!r->count) + continue; + + end = r->offset + r->count; + for (ofs = r->offset; ofs < end; ofs += unit) { + ret = hem_list_alloc_mid_bt(hr_dev, r, unit, ofs, + hem_list->mid_bt[i], + &hem_list->btm_bt); + if (ret) { + dev_err(hr_dev->dev, + "alloc hem trunk fail ret=%d!\n", ret); + goto err_alloc; + } + } + } + + ret = hem_list_alloc_root_bt(hr_dev, hem_list, unit, regions, + region_cnt); + if (ret) + dev_err(hr_dev->dev, "alloc hem root fail ret=%d!\n", ret); + else + return 0; + +err_alloc: + hns_roce_hem_list_release(hr_dev, hem_list); + + return ret; +} + +void hns_roce_hem_list_release(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_list 
*hem_list) +{ + int i, j; + + for (i = 0; i < HNS_ROCE_MAX_BT_REGION; i++) + for (j = 0; j < HNS_ROCE_MAX_BT_LEVEL; j++) + hem_list_free_all(hr_dev, &hem_list->mid_bt[i][j], + j != 0); + + hem_list_free_all(hr_dev, &hem_list->root_bt, true); + INIT_LIST_HEAD(&hem_list->btm_bt); + hem_list->root_ba = 0; +} + +void hns_roce_hem_list_init(struct hns_roce_hem_list *hem_list, + int bt_page_order) +{ + int i, j; + + INIT_LIST_HEAD(&hem_list->root_bt); + INIT_LIST_HEAD(&hem_list->btm_bt); + for (i = 0; i < HNS_ROCE_MAX_BT_REGION; i++) + for (j = 0; j < HNS_ROCE_MAX_BT_LEVEL; j++) + INIT_LIST_HEAD(&hem_list->mid_bt[i][j]); + + hem_list->bt_pg_shift = bt_page_order; +} + +void *hns_roce_hem_list_find_mtt(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_list *hem_list, + int offset, int *mtt_cnt, u64 *phy_addr) +{ + struct list_head *head = &hem_list->btm_bt; + struct roce_hem_item *hem, *temp_hem; + void *cpu_base = NULL; + u64 phy_base = 0; + int nr = 0; + + list_for_each_entry_safe(hem, temp_hem, head, sibling) { + if (hem_list_page_is_in_range(hem, offset)) { + nr = offset - hem->start; + cpu_base = hem->addr + nr * BA_BYTE_LEN; + phy_base = hem->dma_addr + nr * BA_BYTE_LEN; + nr = hem->end + 1 - offset; + break; + } + } + + if (mtt_cnt) + *mtt_cnt = nr; + + if (phy_addr) + *phy_addr = phy_base; + + return cpu_base; +} diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.h b/drivers/infiniband/hw/hns/hns_roce_hem.h index d9d668992e49..e865fc8a18a7 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.h +++ b/drivers/infiniband/hw/hns/hns_roce_hem.h @@ -133,6 +133,20 @@ int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev, struct hns_roce_hem_mhop *mhop); bool hns_roce_check_whether_mhop(struct hns_roce_dev *hr_dev, u32 type); +void hns_roce_hem_list_init(struct hns_roce_hem_list *hem_list, + int bt_page_order); +int hns_roce_hem_list_calc_root_ba(const struct hns_roce_buf_region *regions, + int region_cnt, int unit); +int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_list *hem_list, + const struct hns_roce_buf_region *regions, + int region_cnt); +void hns_roce_hem_list_release(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_list *hem_list); +void *hns_roce_hem_list_find_mtt(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_list *hem_list, + int offset, int *mtt_cnt, u64 *phy_addr); + static inline void hns_roce_hem_first(struct hns_roce_hem *hem, struct hns_roce_hem_iter *iter) { diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 38ed4ac741b5..6db0dae18ab7 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -1496,3 +1496,121 @@ int hns_roce_dealloc_mw(struct ib_mw *ibmw) return 0; } + +void hns_roce_mtr_init(struct hns_roce_mtr *mtr, int bt_pg_shift, + int buf_pg_shift) +{ + hns_roce_hem_list_init(&mtr->hem_list, bt_pg_shift); + mtr->buf_pg_shift = buf_pg_shift; +} + +void hns_roce_mtr_cleanup(struct hns_roce_dev *hr_dev, + struct hns_roce_mtr *mtr) +{ + hns_roce_hem_list_release(hr_dev, &mtr->hem_list); +} +EXPORT_SYMBOL_GPL(hns_roce_mtr_cleanup); + +static int hns_roce_write_mtr(struct hns_roce_dev *hr_dev, + struct hns_roce_mtr *mtr, dma_addr_t *bufs, + struct hns_roce_buf_region *r) +{ + int offset; + int count; + int npage; + u64 *mtts; + int end; + int i; + + offset = r->offset; + end = offset + r->count; + npage = 0; + while (offset < end) { + mtts = hns_roce_hem_list_find_mtt(hr_dev, &mtr->hem_list, + offset, &count, NULL); + if (!mtts) + return 
-ENOBUFS; + + /* Save page addr, low 12 bits : 0 */ + for (i = 0; i < count; i++) { + if (hr_dev->hw_rev == HNS_ROCE_HW_VER1) + mtts[i] = cpu_to_le64(bufs[npage] >> + PAGE_ADDR_SHIFT); + else + mtts[i] = cpu_to_le64(bufs[npage]); + + npage++; + } + offset += count; + } + + return 0; +} + +int hns_roce_mtr_attach(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, + dma_addr_t **bufs, struct hns_roce_buf_region *regions, + int region_cnt) +{ + struct hns_roce_buf_region *r; + int ret; + int i; + + ret = hns_roce_hem_list_request(hr_dev, &mtr->hem_list, regions, + region_cnt); + if (ret) + return ret; + + for (i = 0; i < region_cnt; i++) { + r = &regions[i]; + ret = hns_roce_write_mtr(hr_dev, mtr, bufs[i], r); + if (ret) { + dev_err(hr_dev->dev, + "write mtr[%d/%d] err %d,offset=%d.\n", + i, region_cnt, ret, r->offset); + goto err_write; + } + } + + return 0; + +err_write: + hns_roce_hem_list_release(hr_dev, &mtr->hem_list); + + return ret; +} + +int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, + int offset, u64 *mtt_buf, int mtt_max, u64 *base_addr) +{ + u64 *mtts = mtt_buf; + int mtt_count; + int total = 0; + u64 *addr; + int npage; + int left; + + if (mtts == NULL || mtt_max < 1) + goto done; + + left = mtt_max; + while (left > 0) { + mtt_count = 0; + addr = hns_roce_hem_list_find_mtt(hr_dev, &mtr->hem_list, + offset + total, + &mtt_count, NULL); + if (!addr || !mtt_count) + goto done; + + npage = min(mtt_count, left); + memcpy(&mtts[total], addr, BA_BYTE_LEN * npage); + left -= npage; + total += npage; + } + +done: + if (base_addr) + *base_addr = mtr->hem_list.root_ba; + + return total; +} +EXPORT_SYMBOL_GPL(hns_roce_mtr_find); From 2ac0bc5e725e84d56b7fc8b280b2dbc9f2f10737 Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Sat, 8 Jun 2019 14:46:09 +0800 Subject: [PATCH 067/194] RDMA/hns: Add a group interfaces for optimizing buffers getting flow Currently, the code for getting umem and kmem buffers is spread across many files; this patch adds a group of interfaces to simplify the buffer getting flow.
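For illustration, the new helpers are meant to be combined roughly as follows. This is a minimal sketch for a single kernel-allocated region; hr_dev, hr_buf and npages are placeholder variables and are not part of this patch:

	struct hns_roce_buf_region region;
	dma_addr_t *buf_list[1] = { NULL };
	int count;
	int ret;

	/* describe one buffer region: hop number, start page offset, page count */
	hns_roce_init_buf_region(&region, hr_dev->caps.mtt_hop_num, 0, npages);

	/* allocate a dma_addr_t array for each region */
	ret = hns_roce_alloc_buf_list(&region, buf_list, 1);
	if (ret)
		return ret;

	/* collect the page addresses of the kernel buffer into the list */
	count = hns_roce_get_kmem_bufs(hr_dev, buf_list[0], region.count,
				       region.offset, &hr_buf);
	if (count != region.count)
		ret = -ENOBUFS;

	/* the address list is only needed while the entries are written */
	hns_roce_free_buf_list(buf_list, 1);

For user memory the flow is the same, except that hns_roce_get_umem_bufs() walks an ib_umem in blocks of the requested page shift instead of a kernel hns_roce_buf.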
Signed-off-by: Xi Wang Signed-off-by: Lijun Ou Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hns/hns_roce_alloc.c | 99 +++++++++++++++++++++ drivers/infiniband/hw/hns/hns_roce_device.h | 12 +++ 2 files changed, 111 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c index dac058d3df53..14fcc359599c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_alloc.c +++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c @@ -34,6 +34,7 @@ #include #include #include "hns_roce_device.h" +#include int hns_roce_bitmap_alloc(struct hns_roce_bitmap *bitmap, unsigned long *obj) { @@ -238,6 +239,104 @@ err_free: return -ENOMEM; } +int hns_roce_get_kmem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, + int buf_cnt, int start, struct hns_roce_buf *buf) +{ + int i, end; + int total; + + end = start + buf_cnt; + if (end > buf->npages) { + dev_err(hr_dev->dev, + "invalid kmem region,offset %d,buf_cnt %d,total %d!\n", + start, buf_cnt, buf->npages); + return -EINVAL; + } + + total = 0; + for (i = start; i < end; i++) + if (buf->nbufs == 1) + bufs[total++] = buf->direct.map + + (i << buf->page_shift); + else + bufs[total++] = buf->page_list[i].map; + + return total; +} + +int hns_roce_get_umem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, + int buf_cnt, int start, struct ib_umem *umem, + int page_shift) +{ + struct ib_block_iter biter; + int total = 0; + int idx = 0; + u64 addr; + + if (page_shift < PAGE_SHIFT) { + dev_err(hr_dev->dev, "invalid page shift %d!\n", page_shift); + return -EINVAL; + } + + /* convert system page cnt to hw page cnt */ + rdma_for_each_block(umem->sg_head.sgl, &biter, umem->nmap, + 1 << page_shift) { + addr = rdma_block_iter_dma_address(&biter); + if (idx >= start) { + bufs[total++] = addr; + if (total >= buf_cnt) + goto done; + } + idx++; + } + +done: + return total; +} + +void hns_roce_init_buf_region(struct hns_roce_buf_region *region, int hopnum, + int offset, int buf_cnt) +{ + if (hopnum == HNS_ROCE_HOP_NUM_0) + region->hopnum = 0; + else + region->hopnum = hopnum; + + region->offset = offset; + region->count = buf_cnt; +} + +void hns_roce_free_buf_list(dma_addr_t **bufs, int region_cnt) +{ + int i; + + for (i = 0; i < region_cnt; i++) { + kfree(bufs[i]); + bufs[i] = NULL; + } +} + +int hns_roce_alloc_buf_list(struct hns_roce_buf_region *regions, + dma_addr_t **bufs, int region_cnt) +{ + struct hns_roce_buf_region *r; + int i; + + for (i = 0; i < region_cnt; i++) { + r = ®ions[i]; + bufs[i] = kcalloc(r->count, sizeof(dma_addr_t), GFP_KERNEL); + if (!bufs[i]) + goto err_alloc; + } + + return 0; + +err_alloc: + hns_roce_free_buf_list(bufs, i); + + return -ENOMEM; +} + void hns_roce_cleanup_bitmap(struct hns_roce_dev *hr_dev) { if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index ec2ed5cac3bc..4974c42a77de 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -1211,6 +1211,18 @@ int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct, int hns_roce_ib_umem_write_mtt(struct hns_roce_dev *hr_dev, struct hns_roce_mtt *mtt, struct ib_umem *umem); +void hns_roce_init_buf_region(struct hns_roce_buf_region *region, int hopnum, + int offset, int buf_cnt); +int hns_roce_alloc_buf_list(struct hns_roce_buf_region *regions, + dma_addr_t **bufs, int count); +void hns_roce_free_buf_list(dma_addr_t **bufs, int count); + +int hns_roce_get_kmem_bufs(struct 
hns_roce_dev *hr_dev, dma_addr_t *bufs, + int buf_cnt, int start, struct hns_roce_buf *buf); +int hns_roce_get_umem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, + int buf_cnt, int start, struct ib_umem *umem, + int page_shift); + int hns_roce_create_srq(struct ib_srq *srq, struct ib_srq_init_attr *srq_init_attr, struct ib_udata *udata); From 8d18ad83f19b7fb67485f977a51408287e3f801f Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Sat, 8 Jun 2019 14:46:10 +0800 Subject: [PATCH 068/194] RDMA/hns: Fix bug when wqe num is larger than 16K hip08 can support up to 32768 wqes in one qp. Currently, if the wqe num is larger than 16384, the driver triggers a calltrace as follows. [21361.393725] Call trace: [21361.398605] hns_roce_v2_modify_qp+0xbcc/0x1360 [hns_roce_hw_v2] [21361.410627] hns_roce_modify_qp+0x1d8/0x2f8 [hns_roce] [21361.420906] _ib_modify_qp+0x70/0x118 [21361.428222] ib_modify_qp+0x14/0x1c [21361.435193] rt_ktest_modify_qp+0xb8/0x650 [rdma_test] [21361.445472] exec_modify_qp_cmd+0x110/0x4d8 [rdma_test] [21361.455924] rt_ktest_dispatch_cmd_3+0xa94/0x2edc [rdma_test] [21361.467422] rt_ktest_dispatch_cmd_2+0x9c/0x108 [rdma_test] [21361.478570] rt_ktest_dispatch_cmd+0x138/0x904 [rdma_test] [21361.489545] rt_ktest_dev_write+0x328/0x4b0 [rdma_test] [21361.499998] __vfs_write+0x38/0x15c [21361.506966] vfs_write+0xa8/0x1a0 [21361.513586] ksys_write+0x50/0xb0 [21361.520206] sys_write+0xc/0x14 [21361.526479] el0_svc_naked+0x30/0x34 [21361.533622] Code: 1ac10841 d37d7c22 0b000021 d37df021 (f86268c0) [21361.545815] ---[ end trace e2a1feb2c3d7f13c ]--- When the wqe num is larger than 16384, hns_roce_table_find will return an invalid mtt; this leads to a kernel paging request error if the driver tries to access it. This is a design defect of the mtt, which cannot support the max wqe num of hip08. This patch fixes it by replacing mtt with mtr for the wqe.
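The core of the change is that WQE buffer addresses are now read back through the per-QP multi-hop mtr instead of being indexed out of one flat mtt table. A simplified sketch of the new RQ base address lookup, mirroring the modify_qp path below (error handling trimmed):

	u64 mtts[MTT_MIN_COUNT] = { 0 };
	u64 wqe_sge_ba;
	u32 page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT);
	int count;

	/* copies up to MTT_MIN_COUNT BAs and optionally returns the root BA */
	count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr,
				  hr_qp->rq.offset / page_size, mtts,
				  MTT_MIN_COUNT, &wqe_sge_ba);
	if (count < 1)
		return -EINVAL;

Because the lookup goes through the per-QP mtr, it avoids the 16K limitation described above.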
Fixes: 926a01dc000d ("RDMA/hns: Add QP operations support for hip08 SoC") Signed-off-by: Xi Wang Signed-off-by: Lijun Ou Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hns/hns_roce_device.h | 11 ++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 118 +++++++----- drivers/infiniband/hw/hns/hns_roce_qp.c | 189 +++++++++++++++----- 3 files changed, 235 insertions(+), 83 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 4974c42a77de..a548b28aab63 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -660,6 +660,14 @@ struct hns_roce_qp { struct ib_umem *umem; struct hns_roce_mtt mtt; + struct hns_roce_mtr mtr; + + /* this define must less than HNS_ROCE_MAX_BT_REGION */ +#define HNS_ROCE_WQE_REGION_MAX 3 + struct hns_roce_buf_region regions[HNS_ROCE_WQE_REGION_MAX]; + int region_cnt; + int wqe_bt_pg_shift; + u32 buff_size; struct mutex mutex; u8 port; @@ -870,6 +878,9 @@ struct hns_roce_caps { u32 mtt_ba_pg_sz; u32 mtt_buf_pg_sz; u32 mtt_hop_num; + u32 wqe_sq_hop_num; + u32 wqe_sge_hop_num; + u32 wqe_rq_hop_num; u32 sccc_ba_pg_sz; u32 sccc_buf_pg_sz; u32 sccc_hop_num; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 2d27dc91a823..e7024b3e171a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1574,6 +1574,9 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) caps->mtt_ba_pg_sz = 0; caps->mtt_buf_pg_sz = 0; caps->mtt_hop_num = HNS_ROCE_MTT_HOP_NUM; + caps->wqe_sq_hop_num = 2; + caps->wqe_sge_hop_num = 1; + caps->wqe_rq_hop_num = 2; caps->cqe_ba_pg_sz = 0; caps->cqe_buf_pg_sz = 0; caps->cqe_hop_num = HNS_ROCE_CQE_HOP_NUM; @@ -3021,7 +3024,6 @@ static int hns_roce_v2_clear_hem(struct hns_roce_dev *hr_dev, } static int hns_roce_v2_qp_modify(struct hns_roce_dev *hr_dev, - struct hns_roce_mtt *mtt, enum ib_qp_state cur_state, enum ib_qp_state new_state, struct hns_roce_v2_qp_context *context, @@ -3517,6 +3519,31 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp, } } +static bool check_wqe_rq_mtt_count(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, int mtt_cnt, + u32 page_size) +{ + struct device *dev = hr_dev->dev; + + if (hr_qp->rq.wqe_cnt < 1) + return true; + + if (mtt_cnt < 1) { + dev_err(dev, "qp(0x%lx) rqwqe buf ba find failed\n", + hr_qp->qpn); + return false; + } + + if (mtt_cnt < MTT_MIN_COUNT && + (hr_qp->rq.offset + page_size) < hr_qp->buff_size) { + dev_err(dev, "qp(0x%lx) next rqwqe buf ba find failed\n", + hr_qp->qpn); + return false; + } + + return true; +} + static int modify_qp_init_to_rtr(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, struct hns_roce_v2_qp_context *context, @@ -3526,25 +3553,27 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); struct device *dev = hr_dev->dev; + u64 mtts[MTT_MIN_COUNT] = { 0 }; dma_addr_t dma_handle_3; dma_addr_t dma_handle_2; - dma_addr_t dma_handle; + u64 wqe_sge_ba; u32 page_size; u8 port_num; u64 *mtts_3; u64 *mtts_2; - u64 *mtts; + int count; u8 *dmac; u8 *smac; int port; /* Search qp buf's mtts */ - mtts = hns_roce_table_find(hr_dev, &hr_dev->mr_table.mtt_table, - hr_qp->mtt.first_seg, &dma_handle); - if (!mtts) { - dev_err(dev, "qp buf pa find failed\n"); - return -EINVAL; - } + page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT); + count = 
hns_roce_mtr_find(hr_dev, &hr_qp->mtr, + hr_qp->rq.offset / page_size, mtts, + MTT_MIN_COUNT, &wqe_sge_ba); + if (!ibqp->srq) + if (!check_wqe_rq_mtt_count(hr_dev, hr_qp, count, page_size)) + return -EINVAL; /* Search IRRL's mtts */ mtts_2 = hns_roce_table_find(hr_dev, &hr_dev->qp_table.irrl_table, @@ -3568,7 +3597,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, } dmac = (u8 *)attr->ah_attr.roce.dmac; - context->wqe_sge_ba = (u32)(dma_handle >> 3); + context->wqe_sge_ba = (u32)(wqe_sge_ba >> 3); qpc_mask->wqe_sge_ba = 0; /* @@ -3578,22 +3607,23 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, * 0 at the same time, else set them to 0x1. */ roce_set_field(context->byte_12_sq_hop, V2_QPC_BYTE_12_WQE_SGE_BA_M, - V2_QPC_BYTE_12_WQE_SGE_BA_S, dma_handle >> (32 + 3)); + V2_QPC_BYTE_12_WQE_SGE_BA_S, wqe_sge_ba >> (32 + 3)); roce_set_field(qpc_mask->byte_12_sq_hop, V2_QPC_BYTE_12_WQE_SGE_BA_M, V2_QPC_BYTE_12_WQE_SGE_BA_S, 0); roce_set_field(context->byte_12_sq_hop, V2_QPC_BYTE_12_SQ_HOP_NUM_M, V2_QPC_BYTE_12_SQ_HOP_NUM_S, - hr_dev->caps.mtt_hop_num == HNS_ROCE_HOP_NUM_0 ? - 0 : hr_dev->caps.mtt_hop_num); + hr_dev->caps.wqe_sq_hop_num == HNS_ROCE_HOP_NUM_0 ? + 0 : hr_dev->caps.wqe_sq_hop_num); roce_set_field(qpc_mask->byte_12_sq_hop, V2_QPC_BYTE_12_SQ_HOP_NUM_M, V2_QPC_BYTE_12_SQ_HOP_NUM_S, 0); roce_set_field(context->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_SGE_HOP_NUM_M, V2_QPC_BYTE_20_SGE_HOP_NUM_S, - ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > 2) ? - hr_dev->caps.mtt_hop_num : 0); + ((ibqp->qp_type == IB_QPT_GSI) || + hr_qp->sq.max_gs > HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) ? + hr_dev->caps.wqe_sge_hop_num : 0); roce_set_field(qpc_mask->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_SGE_HOP_NUM_M, V2_QPC_BYTE_20_SGE_HOP_NUM_S, 0); @@ -3601,8 +3631,8 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, roce_set_field(context->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_RQ_HOP_NUM_M, V2_QPC_BYTE_20_RQ_HOP_NUM_S, - hr_dev->caps.mtt_hop_num == HNS_ROCE_HOP_NUM_0 ? - 0 : hr_dev->caps.mtt_hop_num); + hr_dev->caps.wqe_rq_hop_num == HNS_ROCE_HOP_NUM_0 ? 
+ 0 : hr_dev->caps.wqe_rq_hop_num); roce_set_field(qpc_mask->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_RQ_HOP_NUM_M, V2_QPC_BYTE_20_RQ_HOP_NUM_S, 0); @@ -3610,7 +3640,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, roce_set_field(context->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_M, V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_S, - hr_dev->caps.mtt_ba_pg_sz + PG_SHIFT_OFFSET); + hr_qp->wqe_bt_pg_shift + PG_SHIFT_OFFSET); roce_set_field(qpc_mask->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_M, V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_S, 0); @@ -3623,29 +3653,24 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_M, V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_S, 0); - page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT); - context->rq_cur_blk_addr = (u32)(mtts[hr_qp->rq.offset / page_size] - >> PAGE_ADDR_SHIFT); + context->rq_cur_blk_addr = (u32)(mtts[0] >> PAGE_ADDR_SHIFT); qpc_mask->rq_cur_blk_addr = 0; roce_set_field(context->byte_92_srq_info, V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_M, V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_S, - mtts[hr_qp->rq.offset / page_size] - >> (32 + PAGE_ADDR_SHIFT)); + mtts[0] >> (32 + PAGE_ADDR_SHIFT)); roce_set_field(qpc_mask->byte_92_srq_info, V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_M, V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_S, 0); - context->rq_nxt_blk_addr = (u32)(mtts[hr_qp->rq.offset / page_size + 1] - >> PAGE_ADDR_SHIFT); + context->rq_nxt_blk_addr = (u32)(mtts[1] >> PAGE_ADDR_SHIFT); qpc_mask->rq_nxt_blk_addr = 0; roce_set_field(context->byte_104_rq_sge, V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_M, V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_S, - mtts[hr_qp->rq.offset / page_size + 1] - >> (32 + PAGE_ADDR_SHIFT)); + mtts[1] >> (32 + PAGE_ADDR_SHIFT)); roce_set_field(qpc_mask->byte_104_rq_sge, V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_M, V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_S, 0); @@ -3773,18 +3798,30 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); struct device *dev = hr_dev->dev; - dma_addr_t dma_handle; + u64 sge_cur_blk = 0; + u64 sq_cur_blk = 0; u32 page_size; - u64 *mtts; + int count; /* Search qp buf's mtts */ - mtts = hns_roce_table_find(hr_dev, &hr_dev->mr_table.mtt_table, - hr_qp->mtt.first_seg, &dma_handle); - if (!mtts) { - dev_err(dev, "qp buf pa find failed\n"); + count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, 0, &sq_cur_blk, 1, NULL); + if (count < 1) { + dev_err(dev, "qp(0x%lx) buf pa find failed\n", hr_qp->qpn); return -EINVAL; } + if (hr_qp->sge.offset) { + page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT); + count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, + hr_qp->sge.offset / page_size, + &sge_cur_blk, 1, NULL); + if (count < 1) { + dev_err(dev, "qp(0x%lx) sge pa find failed\n", + hr_qp->qpn); + return -EINVAL; + } + } + /* Not support alternate path and path migration */ if ((attr_mask & IB_QP_ALT_PATH) || (attr_mask & IB_QP_PATH_MIG_STATE)) { @@ -3798,38 +3835,37 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, * we should set all bits of the relevant fields in context mask to * 0 at the same time, else set them to 0x1. 
*/ - context->sq_cur_blk_addr = (u32)(mtts[0] >> PAGE_ADDR_SHIFT); + context->sq_cur_blk_addr = (u32)(sq_cur_blk >> PAGE_ADDR_SHIFT); roce_set_field(context->byte_168_irrl_idx, V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M, V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S, - mtts[0] >> (32 + PAGE_ADDR_SHIFT)); + sq_cur_blk >> (32 + PAGE_ADDR_SHIFT)); qpc_mask->sq_cur_blk_addr = 0; roce_set_field(qpc_mask->byte_168_irrl_idx, V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M, V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S, 0); - page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT); context->sq_cur_sge_blk_addr = ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) ? - ((u32)(mtts[hr_qp->sge.offset / page_size] >> + ((u32)(sge_cur_blk >> PAGE_ADDR_SHIFT)) : 0; roce_set_field(context->byte_184_irrl_idx, V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M, V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S, ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) ? - (mtts[hr_qp->sge.offset / page_size] >> + (sge_cur_blk >> (32 + PAGE_ADDR_SHIFT)) : 0); qpc_mask->sq_cur_sge_blk_addr = 0; roce_set_field(qpc_mask->byte_184_irrl_idx, V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M, V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S, 0); - context->rx_sq_cur_blk_addr = (u32)(mtts[0] >> PAGE_ADDR_SHIFT); + context->rx_sq_cur_blk_addr = (u32)(sq_cur_blk >> PAGE_ADDR_SHIFT); roce_set_field(context->byte_232_irrl_sge, V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_M, V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_S, - mtts[0] >> (32 + PAGE_ADDR_SHIFT)); + sq_cur_blk >> (32 + PAGE_ADDR_SHIFT)); qpc_mask->rx_sq_cur_blk_addr = 0; roce_set_field(qpc_mask->byte_232_irrl_sge, V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_M, @@ -4230,7 +4266,7 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, V2_QPC_BYTE_60_QP_ST_S, 0); /* SW pass context to HW */ - ret = hns_roce_v2_qp_modify(hr_dev, &hr_qp->mtt, cur_state, new_state, + ret = hns_roce_v2_qp_modify(hr_dev, cur_state, new_state, context, hr_qp); if (ret) { dev_err(dev, "hns_roce_qp_modify failed(%d)\n", ret); diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 8db2817a249e..99ec5d43b99b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -422,6 +422,91 @@ static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev, return 0; } +static int split_wqe_buf_region(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, + struct hns_roce_buf_region *regions, + int region_max, int page_shift) +{ + int page_size = 1 << page_shift; + bool is_extend_sge; + int region_cnt = 0; + int buf_size; + int buf_cnt; + + if (hr_qp->buff_size < 1 || region_max < 1) + return region_cnt; + + if (hr_qp->sge.sge_cnt > 0) + is_extend_sge = true; + else + is_extend_sge = false; + + /* sq region */ + if (is_extend_sge) + buf_size = hr_qp->sge.offset - hr_qp->sq.offset; + else + buf_size = hr_qp->rq.offset - hr_qp->sq.offset; + + if (buf_size > 0 && region_cnt < region_max) { + buf_cnt = DIV_ROUND_UP(buf_size, page_size); + hns_roce_init_buf_region(®ions[region_cnt], + hr_dev->caps.wqe_sq_hop_num, + hr_qp->sq.offset / page_size, + buf_cnt); + region_cnt++; + } + + /* sge region */ + if (is_extend_sge) { + buf_size = hr_qp->rq.offset - hr_qp->sge.offset; + if (buf_size > 0 && region_cnt < region_max) { + buf_cnt = DIV_ROUND_UP(buf_size, page_size); + hns_roce_init_buf_region(®ions[region_cnt], + hr_dev->caps.wqe_sge_hop_num, + hr_qp->sge.offset / page_size, + buf_cnt); + region_cnt++; + } + } + + /* rq region */ + buf_size = hr_qp->buff_size - 
hr_qp->rq.offset; + if (buf_size > 0) { + buf_cnt = DIV_ROUND_UP(buf_size, page_size); + hns_roce_init_buf_region(®ions[region_cnt], + hr_dev->caps.wqe_rq_hop_num, + hr_qp->rq.offset / page_size, + buf_cnt); + region_cnt++; + } + + return region_cnt; +} + +static int calc_wqe_bt_page_shift(struct hns_roce_dev *hr_dev, + struct hns_roce_buf_region *regions, + int region_cnt) +{ + int bt_pg_shift; + int ba_num; + int ret; + + bt_pg_shift = PAGE_SHIFT + hr_dev->caps.mtt_ba_pg_sz; + + /* all root ba entries must in one bt page */ + do { + ba_num = (1 << bt_pg_shift) / BA_BYTE_LEN; + ret = hns_roce_hem_list_calc_root_ba(regions, region_cnt, + ba_num); + if (ret <= ba_num) + break; + + bt_pg_shift++; + } while (ret > ba_num); + + return bt_pg_shift - PAGE_SHIFT; +} + static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev, struct ib_qp_cap *cap, struct hns_roce_qp *hr_qp) @@ -534,15 +619,17 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, struct ib_udata *udata, unsigned long sqpn, struct hns_roce_qp *hr_qp) { + dma_addr_t *buf_list[ARRAY_SIZE(hr_qp->regions)] = { 0 }; struct device *dev = hr_dev->dev; struct hns_roce_ib_create_qp ucmd; struct hns_roce_ib_create_qp_resp resp = {}; struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context( udata, struct hns_roce_ucontext, ibucontext); + struct hns_roce_buf_region *r; unsigned long qpn = 0; - int ret = 0; u32 page_shift; - u32 npages; + int buf_count; + int ret; int i; mutex_init(&hr_qp->mutex); @@ -596,6 +683,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, init_attr->cap.max_recv_sge]; } + page_shift = PAGE_SHIFT + hr_dev->caps.mtt_buf_pg_sz; if (udata) { if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { dev_err(dev, "ib_copy_from_udata error for create qp\n"); @@ -617,32 +705,28 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, ret = PTR_ERR(hr_qp->umem); goto err_rq_sge_list; } - - hr_qp->mtt.mtt_type = MTT_TYPE_WQE; - page_shift = PAGE_SHIFT; - if (hr_dev->caps.mtt_buf_pg_sz) { - npages = (ib_umem_page_count(hr_qp->umem) + - (1 << hr_dev->caps.mtt_buf_pg_sz) - 1) / - (1 << hr_dev->caps.mtt_buf_pg_sz); - page_shift += hr_dev->caps.mtt_buf_pg_sz; - ret = hns_roce_mtt_init(hr_dev, npages, - page_shift, - &hr_qp->mtt); - } else { - ret = hns_roce_mtt_init(hr_dev, - ib_umem_page_count(hr_qp->umem), - page_shift, &hr_qp->mtt); - } + hr_qp->region_cnt = split_wqe_buf_region(hr_dev, hr_qp, + hr_qp->regions, ARRAY_SIZE(hr_qp->regions), + page_shift); + ret = hns_roce_alloc_buf_list(hr_qp->regions, buf_list, + hr_qp->region_cnt); if (ret) { - dev_err(dev, "hns_roce_mtt_init error for create qp\n"); - goto err_buf; + dev_err(dev, "alloc buf_list error for create qp\n"); + goto err_alloc_list; } - ret = hns_roce_ib_umem_write_mtt(hr_dev, &hr_qp->mtt, - hr_qp->umem); - if (ret) { - dev_err(dev, "hns_roce_ib_umem_write_mtt error for create qp\n"); - goto err_mtt; + for (i = 0; i < hr_qp->region_cnt; i++) { + r = &hr_qp->regions[i]; + buf_count = hns_roce_get_umem_bufs(hr_dev, + buf_list[i], r->count, r->offset, + hr_qp->umem, page_shift); + if (buf_count != r->count) { + dev_err(dev, + "get umem buf err, expect %d,ret %d.\n", + r->count, buf_count); + ret = -ENOBUFS; + goto err_get_bufs; + } } if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SQ_RECORD_DB) && @@ -653,7 +737,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, &hr_qp->sdb); if (ret) { dev_err(dev, "sq record doorbell map failed!\n"); - goto err_mtt; + goto err_get_bufs; } /* indicate kernel supports sq 
record db */ @@ -715,7 +799,6 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, } /* Allocate QP buf */ - page_shift = PAGE_SHIFT + hr_dev->caps.mtt_buf_pg_sz; if (hns_roce_buf_alloc(hr_dev, hr_qp->buff_size, (1 << page_shift) * 2, &hr_qp->hr_buf, page_shift)) { @@ -723,21 +806,28 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, ret = -ENOMEM; goto err_db; } - - hr_qp->mtt.mtt_type = MTT_TYPE_WQE; - /* Write MTT */ - ret = hns_roce_mtt_init(hr_dev, hr_qp->hr_buf.npages, - hr_qp->hr_buf.page_shift, &hr_qp->mtt); + hr_qp->region_cnt = split_wqe_buf_region(hr_dev, hr_qp, + hr_qp->regions, ARRAY_SIZE(hr_qp->regions), + page_shift); + ret = hns_roce_alloc_buf_list(hr_qp->regions, buf_list, + hr_qp->region_cnt); if (ret) { - dev_err(dev, "hns_roce_mtt_init error for kernel create qp\n"); - goto err_buf; + dev_err(dev, "alloc buf_list error for create qp!\n"); + goto err_alloc_list; } - ret = hns_roce_buf_write_mtt(hr_dev, &hr_qp->mtt, - &hr_qp->hr_buf); - if (ret) { - dev_err(dev, "hns_roce_buf_write_mtt error for kernel create qp\n"); - goto err_mtt; + for (i = 0; i < hr_qp->region_cnt; i++) { + r = &hr_qp->regions[i]; + buf_count = hns_roce_get_kmem_bufs(hr_dev, + buf_list[i], r->count, r->offset, + &hr_qp->hr_buf); + if (buf_count != r->count) { + dev_err(dev, + "get kmem buf err, expect %d,ret %d.\n", + r->count, buf_count); + ret = -ENOBUFS; + goto err_get_bufs; + } } hr_qp->sq.wrid = kcalloc(hr_qp->sq.wqe_cnt, sizeof(u64), @@ -761,6 +851,17 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, } } + hr_qp->wqe_bt_pg_shift = calc_wqe_bt_page_shift(hr_dev, hr_qp->regions, + hr_qp->region_cnt); + hns_roce_mtr_init(&hr_qp->mtr, PAGE_SHIFT + hr_qp->wqe_bt_pg_shift, + page_shift); + ret = hns_roce_mtr_attach(hr_dev, &hr_qp->mtr, buf_list, + hr_qp->regions, hr_qp->region_cnt); + if (ret) { + dev_err(dev, "mtr attatch error for create qp\n"); + goto err_mtr; + } + if (init_attr->qp_type == IB_QPT_GSI && hr_dev->hw_rev == HNS_ROCE_HW_VER1) { /* In v1 engine, GSI QP context in RoCE engine's register */ @@ -796,6 +897,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, } hr_qp->event = hns_roce_ib_qp_event; + hns_roce_free_buf_list(buf_list, hr_qp->region_cnt); return 0; @@ -810,6 +912,9 @@ err_qpn: if (!sqpn) hns_roce_release_range_qp(hr_dev, qpn, 1); +err_mtr: + hns_roce_mtr_cleanup(hr_dev, &hr_qp->mtr); + err_wrid: if (udata) { if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && @@ -829,10 +934,10 @@ err_sq_dbmap: hns_roce_qp_has_sq(init_attr)) hns_roce_db_unmap_user(uctx, &hr_qp->sdb); -err_mtt: - hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt); +err_get_bufs: + hns_roce_free_buf_list(buf_list, hr_qp->region_cnt); -err_buf: +err_alloc_list: if (hr_qp->umem) ib_umem_release(hr_qp->umem); else From a49b1dc7ae447d7085360cd587fc1c8b9ec6c871 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 12 Jun 2019 15:27:41 +0300 Subject: [PATCH 069/194] RDMA: Convert destroy_wq to be void All callers of destroy WQ are always success and there is no need to check their return value, so convert destroy_wq to be void. 
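After the conversion a driver callback takes the following shape. This is a generic sketch only; the foo_* names are placeholders and do not refer to any real driver:

	static void foo_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata)
	{
		struct foo_rwq *rwq = to_foo_rwq(ibwq);

		/* tear down unconditionally; no status is reported back */
		foo_destroy_rq(rwq, udata);
		kfree(rwq);
	}

The core code in ib_destroy_wq() can then decrement the PD and CQ usecnt without checking a return value, as shown in the diff below.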
Signed-off-by: Leon Romanovsky Reviewed-by: Yuval Shaia Signed-off-by: Doug Ledford --- drivers/infiniband/core/verbs.c | 12 +++++------- drivers/infiniband/hw/mlx4/mlx4_ib.h | 2 +- drivers/infiniband/hw/mlx4/qp.c | 4 +--- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +- drivers/infiniband/hw/mlx5/qp.c | 4 +--- include/rdma/ib_verbs.h | 2 +- 6 files changed, 10 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 588f1d195fd2..16ef8a9bda4c 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -2235,19 +2235,17 @@ EXPORT_SYMBOL(ib_create_wq); */ int ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata) { - int err; struct ib_cq *cq = wq->cq; struct ib_pd *pd = wq->pd; if (atomic_read(&wq->usecnt)) return -EBUSY; - err = wq->device->ops.destroy_wq(wq, udata); - if (!err) { - atomic_dec(&pd->usecnt); - atomic_dec(&cq->usecnt); - } - return err; + wq->device->ops.destroy_wq(wq, udata); + atomic_dec(&pd->usecnt); + atomic_dec(&cq->usecnt); + + return 0; } EXPORT_SYMBOL(ib_destroy_wq); diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 81b3d85e5167..eb53bb4c0c91 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -906,7 +906,7 @@ void mlx4_ib_sl2vl_update(struct mlx4_ib_dev *mdev, int port); struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr, struct ib_udata *udata); -int mlx4_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata); +void mlx4_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata); int mlx4_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, u32 wq_attr_mask, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 5221c0794d1d..520364defa28 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -4248,7 +4248,7 @@ int mlx4_ib_modify_wq(struct ib_wq *ibwq, struct ib_wq_attr *wq_attr, return err; } -int mlx4_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata) +void mlx4_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(ibwq->device); struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq); @@ -4259,8 +4259,6 @@ int mlx4_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata) destroy_qp_common(dev, qp, MLX4_IB_RWQ_SRC, udata); kfree(qp); - - return 0; } struct ib_rwq_ind_table diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 35e2c8f5ae78..82cfe86087b6 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1201,7 +1201,7 @@ int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr, struct ib_udata *udata); -int mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata); +void mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata); int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, u32 wq_attr_mask, struct ib_udata *udata); struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index f6623c77443a..ae847709b3d3 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -6047,7 +6047,7 @@ err: return ERR_PTR(err); } -int mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata) +void 
mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(wq->device); struct mlx5_ib_rwq *rwq = to_mrwq(wq); @@ -6055,8 +6055,6 @@ int mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata) mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp); destroy_user_rq(dev, wq->pd, rwq, udata); kfree(rwq); - - return 0; } struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6f09fcc21d7a..805148a12660 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2509,7 +2509,7 @@ struct ib_device_ops { struct ib_wq *(*create_wq)(struct ib_pd *pd, struct ib_wq_init_attr *init_attr, struct ib_udata *udata); - int (*destroy_wq)(struct ib_wq *wq, struct ib_udata *udata); + void (*destroy_wq)(struct ib_wq *wq, struct ib_udata *udata); int (*modify_wq)(struct ib_wq *wq, struct ib_wq_attr *attr, u32 wq_attr_mask, struct ib_udata *udata); struct ib_rwq_ind_table *(*create_rwq_ind_table)( From 89a6da3cb8f30ee0aeca924d84bef688f22f883e Mon Sep 17 00:00:00 2001 From: Lang Cheng Date: Fri, 14 Jun 2019 22:56:03 +0800 Subject: [PATCH 070/194] RDMA/hns: reset function when removing module During removing the driver, we needs to notify the roce engine to stop working immediately,and symmetrically recycle the hardware resources requested during initialization. The hardware provides a command called function clear that can package these operations,so that the driver can only focus on releasing resources that applied from the operating system. This patch implements the call of this command. Signed-off-by: Lang Cheng Signed-off-by: Lijun Ou Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 44 ++++++++++++++++++++++ drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 17 +++++++++ 2 files changed, 61 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index e7024b3e171a..5c8551b54444 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1130,6 +1130,47 @@ static int hns_roce_cmq_query_hw_info(struct hns_roce_dev *hr_dev) return 0; } +static void hns_roce_function_clear(struct hns_roce_dev *hr_dev) +{ + bool fclr_write_fail_flag = false; + struct hns_roce_func_clear *resp; + struct hns_roce_cmq_desc desc; + unsigned long end; + int ret; + + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_FUNC_CLEAR, false); + resp = (struct hns_roce_func_clear *)desc.data; + + ret = hns_roce_cmq_send(hr_dev, &desc, 1); + if (ret) { + fclr_write_fail_flag = true; + dev_err(hr_dev->dev, "Func clear write failed, ret = %d.\n", + ret); + return; + } + + msleep(HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL); + end = HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS; + while (end) { + msleep(HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT); + end -= HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT; + + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_FUNC_CLEAR, + true); + + ret = hns_roce_cmq_send(hr_dev, &desc, 1); + if (ret) + continue; + + if (roce_get_bit(resp->func_done, FUNC_CLEAR_RST_FUN_DONE_S)) { + hr_dev->is_reset = true; + return; + } + } + + dev_err(hr_dev->dev, "Func clear fail.\n"); +} + static int hns_roce_query_fw_ver(struct hns_roce_dev *hr_dev) { struct hns_roce_query_fw_info *resp; @@ -1894,6 +1935,9 @@ static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev) { struct hns_roce_v2_priv *priv = hr_dev->priv; + if (hr_dev->pci_dev->revision == 0x21) + 
hns_roce_function_clear(hr_dev); + hns_roce_free_link_table(hr_dev, &priv->tpq); hns_roce_free_link_table(hr_dev, &priv->tsq); } diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index bce21fd2ebb6..478f5a5b7aa1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -241,6 +241,7 @@ enum hns_roce_opcode_type { HNS_ROCE_OPC_POST_MB = 0x8504, HNS_ROCE_OPC_QUERY_MB_ST = 0x8505, HNS_ROCE_OPC_CFG_BT_ATTR = 0x8506, + HNS_ROCE_OPC_FUNC_CLEAR = 0x8508, HNS_ROCE_OPC_CLR_SCCC = 0x8509, HNS_ROCE_OPC_QUERY_SCCC = 0x850a, HNS_ROCE_OPC_RESET_SCCC = 0x850b, @@ -1230,6 +1231,22 @@ struct hns_roce_query_fw_info { __le32 rsv[5]; }; +struct hns_roce_func_clear { + __le32 rst_funcid_en; + __le32 func_done; + __le32 rsv[4]; +}; + +#define FUNC_CLEAR_RST_FUN_DONE_S 0 +/* Each physical function manages up to 248 virtual functions; + * it takes up to 100ms for each function to execute clear; + * if an abnormal reset occurs, it is executed twice at most; + * so it takes up to 249 * 2 * 100ms. + */ +#define HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS (249 * 2 * 100) +#define HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL 40 +#define HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT 20 + struct hns_roce_cfg_llm_a { __le32 base_addr_l; __le32 base_addr_h; From 836a0fbb3e76f704ad65ddfb57f00725245e509b Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 16 Jun 2019 15:05:20 +0300 Subject: [PATCH 071/194] RDMA: Check umem pointer validity prior to release Update ib_umem_release() to behave similarly to kfree() and allow submitting NULL pointer as safe input to this function. Fixes: a52c8e2469c3 ("RDMA: Clean destroy CQ in drivers do not return errors") Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem.c | 3 ++ drivers/infiniband/hw/bnxt_re/ib_verbs.c | 29 ++++++-------------- drivers/infiniband/hw/cxgb3/iwch_provider.c | 3 +- drivers/infiniband/hw/cxgb4/mem.c | 3 +- drivers/infiniband/hw/efa/efa_verbs.c | 2 +- drivers/infiniband/hw/hns/hns_roce_cq.c | 8 ++---- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 13 ++++----- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 +- drivers/infiniband/hw/hns/hns_roce_mr.c | 4 +-- drivers/infiniband/hw/hns/hns_roce_qp.c | 5 ++-- drivers/infiniband/hw/hns/hns_roce_srq.c | 14 ++++------ drivers/infiniband/hw/i40iw/i40iw_verbs.c | 3 +- drivers/infiniband/hw/mlx4/cq.c | 14 ++++------ drivers/infiniband/hw/mlx4/qp.c | 7 ++--- drivers/infiniband/hw/mlx4/srq.c | 7 ++--- drivers/infiniband/hw/mlx5/cq.c | 20 ++++---------- drivers/infiniband/hw/mlx5/mr.c | 13 ++++----- drivers/infiniband/hw/mlx5/qp.c | 9 ++---- drivers/infiniband/hw/mthca/mthca_provider.c | 3 +- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 3 +- drivers/infiniband/hw/qedr/verbs.c | 9 ++---- drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c | 6 ++-- drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c | 3 +- drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c | 16 +++-------- drivers/infiniband/sw/rdmavt/mr.c | 3 +- drivers/infiniband/sw/rxe/rxe_mr.c | 3 +- 26 files changed, 73 insertions(+), 132 deletions(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 54628ef879f0..08da840ed7ee 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -361,6 +361,9 @@ static void __ib_umem_release_tail(struct ib_umem *umem) */ void ib_umem_release(struct ib_umem *umem) { + if (!umem) + return; + if (umem->is_odp) { ib_umem_odp_release(to_ib_umem_odp(umem)); 
__ib_umem_release_tail(umem); diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 44cc5f19df3b..a91653aabf38 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -805,10 +805,8 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata) rdev->sqp_ah = NULL; } - if (!IS_ERR_OR_NULL(qp->rumem)) - ib_umem_release(qp->rumem); - if (!IS_ERR_OR_NULL(qp->sumem)) - ib_umem_release(qp->sumem); + ib_umem_release(qp->rumem); + ib_umem_release(qp->sumem); mutex_lock(&rdev->qp_lock); list_del(&qp->list); @@ -1201,12 +1199,8 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd, qp_destroy: bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp); free_umem: - if (udata) { - if (qp->rumem) - ib_umem_release(qp->rumem); - if (qp->sumem) - ib_umem_release(qp->sumem); - } + ib_umem_release(qp->rumem); + ib_umem_release(qp->sumem); fail: kfree(qp); return ERR_PTR(rc); @@ -1302,8 +1296,7 @@ void bnxt_re_destroy_srq(struct ib_srq *ib_srq, struct ib_udata *udata) if (qplib_srq->cq) nq = qplib_srq->cq->nq; bnxt_qplib_destroy_srq(&rdev->qplib_res, qplib_srq); - if (srq->umem) - ib_umem_release(srq->umem); + ib_umem_release(srq->umem); atomic_dec(&rdev->srq_count); if (nq) nq->budget--; @@ -1412,8 +1405,7 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq, return 0; fail: - if (srq->umem) - ib_umem_release(srq->umem); + ib_umem_release(srq->umem); exit: return rc; } @@ -2528,8 +2520,7 @@ void bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) nq = cq->qplib_cq.nq; bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq); - if (!cq->umem) - ib_umem_release(cq->umem); + ib_umem_release(cq->umem); atomic_dec(&rdev->cq_count); nq->budget--; @@ -2632,8 +2623,7 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, return 0; c2fail: - if (udata) - ib_umem_release(cq->umem); + ib_umem_release(cq->umem); fail: kfree(cq->cql); return rc; @@ -3340,8 +3330,7 @@ int bnxt_re_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata) mr->npages = 0; mr->pages = NULL; } - if (!IS_ERR_OR_NULL(mr->ib_umem)) - ib_umem_release(mr->ib_umem); + ib_umem_release(mr->ib_umem); kfree(mr); atomic_dec(&rdev->mr_count); diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 810fa96af2e9..e775c1a1a450 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -346,8 +346,7 @@ static int iwch_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata) xa_erase_irq(&rhp->mrs, mmid); if (mhp->kva) kfree((void *) (unsigned long) mhp->kva); - if (mhp->umem) - ib_umem_release(mhp->umem); + ib_umem_release(mhp->umem); pr_debug("%s mmid 0x%x ptr %p\n", __func__, mmid, mhp); kfree(mhp); return 0; diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index fe3733c4026d..aa772ee0706f 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -808,8 +808,7 @@ int c4iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata) mhp->attr.pbl_size << 3); if (mhp->kva) kfree((void *) (unsigned long) mhp->kva); - if (mhp->umem) - ib_umem_release(mhp->umem); + ib_umem_release(mhp->umem); pr_debug("mmid 0x%x ptr %p\n", mmid, mhp); c4iw_put_wr_wait(mhp->wr_waitp); kfree(mhp); diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index a9372c9e4b30..5e6e5eb65cff 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ 
b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1513,8 +1513,8 @@ int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) err = efa_com_dereg_mr(&dev->edev, ¶ms); if (err) return err; - ib_umem_release(mr->umem); } + ib_umem_release(mr->umem); kfree(mr); diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 7e198c9ffbfe..6b4d8e50aabe 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -423,9 +423,8 @@ err_dbmap: err_mtt: hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt); - if (udata) - ib_umem_release(hr_cq->umem); - else + ib_umem_release(hr_cq->umem); + if (!udata) hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf, hr_cq->ib_cq.cqe); @@ -451,9 +450,8 @@ void hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) hns_roce_free_cq(hr_dev, hr_cq); hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt); + ib_umem_release(hr_cq->umem); if (udata) { - ib_umem_release(hr_cq->umem); - if (hr_cq->db_en == 1) hns_roce_db_unmap_user(rdma_udata_to_drv_context( udata, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index c899879da222..cb004190ccba 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -1163,8 +1163,7 @@ free_mr: hns_roce_bitmap_free(&hr_dev->mr_table.mtpt_bitmap, key_to_hw_index(mr->key), 0); - if (mr->umem) - ib_umem_release(mr->umem); + ib_umem_release(mr->umem); kfree(mr); @@ -3641,9 +3640,8 @@ int hns_roce_v1_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt); - if (udata) - ib_umem_release(hr_qp->umem); - else { + ib_umem_release(hr_qp->umem); + if (!udata) { kfree(hr_qp->sq.wrid); kfree(hr_qp->rq.wrid); @@ -3694,9 +3692,8 @@ static void hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt); - if (ibcq->uobject) - ib_umem_release(hr_cq->umem); - else { + ib_umem_release(hr_cq->umem); + if (!udata) { /* Free the buff of stored cq */ cq_buf_size = (ibcq->cqe + 1) * hr_dev->caps.cq_entry_sz; hns_roce_buf_free(hr_dev, cq_buf_size, &hr_cq->hr_buf.hr_buf); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 5c8551b54444..edd62b4dc0a0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -4582,7 +4582,6 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, if (hr_qp->rq.wqe_cnt && (hr_qp->rdb_en == 1)) hns_roce_db_unmap_user(context, &hr_qp->rdb); - ib_umem_release(hr_qp->umem); } else { kfree(hr_qp->sq.wrid); kfree(hr_qp->rq.wrid); @@ -4590,6 +4589,7 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, if (hr_qp->rq.wqe_cnt) hns_roce_free_db(hr_dev, &hr_qp->rdb); } + ib_umem_release(hr_qp->umem); if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) && hr_qp->rq.wqe_cnt) { diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 6db0dae18ab7..adf075183dfb 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -1298,9 +1298,7 @@ int hns_roce_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) } else { hns_roce_mr_free(hr_dev, mr); - if (mr->umem) - ib_umem_release(mr->umem); - + ib_umem_release(mr->umem); kfree(mr); } diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 
99ec5d43b99b..7e9db8236072 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -938,10 +938,9 @@ err_get_bufs: hns_roce_free_buf_list(buf_list, hr_qp->region_cnt); err_alloc_list: - if (hr_qp->umem) - ib_umem_release(hr_qp->umem); - else + if (!hr_qp->umem) hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf); + ib_umem_release(hr_qp->umem); err_db: if (!udata && hns_roce_qp_has_rq(init_attr) && diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index c222f243953a..de645be8aa48 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -380,8 +380,7 @@ err_idx_buf: hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt); err_idx_mtt: - if (udata) - ib_umem_release(srq->idx_que.umem); + ib_umem_release(srq->idx_que.umem); err_create_idx: hns_roce_buf_free(hr_dev, srq->idx_que.buf_size, @@ -392,9 +391,8 @@ err_srq_mtt: hns_roce_mtt_cleanup(hr_dev, &srq->mtt); err_buf: - if (udata) - ib_umem_release(srq->umem); - else + ib_umem_release(srq->umem); + if (!udata) hns_roce_buf_free(hr_dev, srq_buf_size, &srq->buf); return ret; @@ -408,15 +406,15 @@ void hns_roce_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) hns_roce_srq_free(hr_dev, srq); hns_roce_mtt_cleanup(hr_dev, &srq->mtt); - if (ibsrq->uobject) { + if (udata) { hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt); - ib_umem_release(srq->idx_que.umem); - ib_umem_release(srq->umem); } else { kvfree(srq->wrid); hns_roce_buf_free(hr_dev, srq->max << srq->wqe_shift, &srq->buf); } + ib_umem_release(srq->idx_que.umem); + ib_umem_release(srq->umem); } int hns_roce_init_srq_table(struct hns_roce_dev *hr_dev) diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 3100b0c31b0a..3c0c6aabc64e 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2004,8 +2004,7 @@ static int i40iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata) struct cqp_commands_info *cqp_info; u32 stag_idx; - if (iwmr->region) - ib_umem_release(iwmr->region); + ib_umem_release(iwmr->region); if (iwmr->type != IW_MEMREG_TYPE_MEM) { /* region is released. only test for userness. 
*/ diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 72f238ddafb5..a7d238d312f0 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -277,9 +277,8 @@ err_dbmap: err_mtt: mlx4_mtt_cleanup(dev->dev, &cq->buf.mtt); - if (udata) - ib_umem_release(cq->umem); - else + ib_umem_release(cq->umem); + if (!udata) mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe); err_db: @@ -468,11 +467,8 @@ err_buf: kfree(cq->resize_buf); cq->resize_buf = NULL; - if (cq->resize_umem) { - ib_umem_release(cq->resize_umem); - cq->resize_umem = NULL; - } - + ib_umem_release(cq->resize_umem); + cq->resize_umem = NULL; out: mutex_unlock(&cq->resize_mutex); @@ -494,11 +490,11 @@ void mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) struct mlx4_ib_ucontext, ibucontext), &mcq->db); - ib_umem_release(mcq->umem); } else { mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe); mlx4_db_free(dev->dev, &mcq->db); } + ib_umem_release(mcq->umem); } static void dump_cqe(void *cqe) diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 520364defa28..82aff2f2fdc2 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -1207,10 +1207,9 @@ err_mtt: mlx4_mtt_cleanup(dev->dev, &qp->mtt); err_buf: - if (qp->umem) - ib_umem_release(qp->umem); - else + if (!qp->umem) mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); + ib_umem_release(qp->umem); err_db: if (!udata && qp_has_rq(init_attr)) @@ -1421,7 +1420,6 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, mlx4_ib_db_unmap_user(mcontext, &qp->db); } - ib_umem_release(qp->umem); } else { kvfree(qp->sq.wrid); kvfree(qp->rq.wrid); @@ -1432,6 +1430,7 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, if (qp->rq.wqe_cnt) mlx4_db_free(dev->dev, &qp->db); } + ib_umem_release(qp->umem); del_gid_entries(qp); } diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index c9f555e04c9f..848db7264cc9 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -204,10 +204,9 @@ err_mtt: mlx4_mtt_cleanup(dev->dev, &srq->mtt); err_buf: - if (srq->umem) - ib_umem_release(srq->umem); - else + if (!srq->umem) mlx4_buf_free(dev->dev, buf_size, &srq->buf); + ib_umem_release(srq->umem); err_db: if (!udata) @@ -275,13 +274,13 @@ void mlx4_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata) struct mlx4_ib_ucontext, ibucontext), &msrq->db); - ib_umem_release(msrq->umem); } else { kvfree(msrq->wrid); mlx4_buf_free(dev->dev, msrq->msrq.max << msrq->msrq.wqe_shift, &msrq->buf); mlx4_db_free(dev->dev, &msrq->db); } + ib_umem_release(msrq->umem); } void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index) diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 07b73df0e1a3..22230fd7d741 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -1125,11 +1125,6 @@ static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, return 0; } -static void un_resize_user(struct mlx5_ib_cq *cq) -{ - ib_umem_release(cq->resize_umem); -} - static int resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, int entries, int cqe_size) { @@ -1152,12 +1147,6 @@ ex: return err; } -static void un_resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) -{ - free_cq_buf(dev, cq->resize_buf); - cq->resize_buf = NULL; -} - static int copy_resize_cqes(struct mlx5_ib_cq *cq) { struct mlx5_ib_dev *dev = 
to_mdev(cq->ibcq.device); @@ -1338,10 +1327,11 @@ ex_alloc: kvfree(in); ex_resize: - if (udata) - un_resize_user(cq); - else - un_resize_kernel(dev, cq); + ib_umem_release(cq->resize_umem); + if (!udata) { + free_cq_buf(dev, cq->resize_buf); + cq->resize_buf = NULL; + } ex: mutex_unlock(&cq->resize_mutex); return err; diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 4d033796dcfc..994abcebb057 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1507,10 +1507,9 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, return 0; err: - if (mr->umem) { - ib_umem_release(mr->umem); - mr->umem = NULL; - } + ib_umem_release(mr->umem); + mr->umem = NULL; + clean_mr(dev, mr); return err; } @@ -1630,10 +1629,10 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) * remove the DMA mapping. */ mlx5_mr_cache_free(dev, mr); - if (umem) { - ib_umem_release(umem); + ib_umem_release(umem); + if (umem) atomic_sub(npages, &dev->mdev->priv.reg_pages); - } + if (!mr->allocated_from_cache) kfree(mr); } diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index ae847709b3d3..12ccbd584d2a 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -790,8 +790,7 @@ static void destroy_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, atomic_dec(&dev->delay_drop.rqs_cnt); mlx5_ib_db_unmap_user(context, &rwq->db); - if (rwq->umem) - ib_umem_release(rwq->umem); + ib_umem_release(rwq->umem); } static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, @@ -977,8 +976,7 @@ err_free: kvfree(*in); err_umem: - if (ubuffer->umem) - ib_umem_release(ubuffer->umem); + ib_umem_release(ubuffer->umem); err_bfreg: if (bfregn != MLX5_IB_INVALID_BFREG) @@ -997,8 +995,7 @@ static void destroy_qp_user(struct mlx5_ib_dev *dev, struct ib_pd *pd, ibucontext); mlx5_ib_db_unmap_user(context, &qp->db); - if (base->ubuffer.umem) - ib_umem_release(base->ubuffer.umem); + ib_umem_release(base->ubuffer.umem); /* * Free only the BFREGs which are handled by the kernel. diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index d97124bee703..23554d8bf241 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -953,8 +953,7 @@ static int mthca_dereg_mr(struct ib_mr *mr, struct ib_udata *udata) struct mthca_mr *mmr = to_mmr(mr); mthca_free_mr(to_mdev(mr->device), mmr); - if (mmr->umem) - ib_umem_release(mmr->umem); + ib_umem_release(mmr->umem); kfree(mmr); return 0; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 10b35edb286b..bccc11378109 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -925,8 +925,7 @@ int ocrdma_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata) ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); /* it could be user registered memory. 
*/ - if (mr->umem) - ib_umem_release(mr->umem); + ib_umem_release(mr->umem); kfree(mr); /* Don't stop cleanup, in case FW is unresponsive */ diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 3fc7a4e901c3..27d90a84ea01 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -1572,12 +1572,10 @@ qedr_iwarp_populate_user_qp(struct qedr_dev *dev, static void qedr_cleanup_user(struct qedr_dev *dev, struct qedr_qp *qp) { - if (qp->usq.umem) - ib_umem_release(qp->usq.umem); + ib_umem_release(qp->usq.umem); qp->usq.umem = NULL; - if (qp->urq.umem) - ib_umem_release(qp->urq.umem); + ib_umem_release(qp->urq.umem); qp->urq.umem = NULL; } @@ -2680,8 +2678,7 @@ int qedr_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata) qedr_free_pbl(dev, &mr->info.pbl_info, mr->info.pbl_table); /* it could be user registered memory. */ - if (mr->umem) - ib_umem_release(mr->umem); + ib_umem_release(mr->umem); kfree(mr); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c index 38573fc0a9bf..7800e6930502 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c @@ -213,8 +213,7 @@ int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, err_page_dir: pvrdma_page_dir_cleanup(dev, &cq->pdir); err_umem: - if (!cq->is_kernel) - ib_umem_release(cq->umem); + ib_umem_release(cq->umem); err_cq: atomic_dec(&dev->num_cqs); return ret; @@ -226,8 +225,7 @@ static void pvrdma_free_cq(struct pvrdma_dev *dev, struct pvrdma_cq *cq) complete(&cq->free); wait_for_completion(&cq->free); - if (!cq->is_kernel) - ib_umem_release(cq->umem); + ib_umem_release(cq->umem); pvrdma_page_dir_cleanup(dev, &cq->pdir); } diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c index 65dc47ffb8f3..f3a3d22ee8d7 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c @@ -290,8 +290,7 @@ int pvrdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) "could not deregister mem region, error: %d\n", ret); pvrdma_page_dir_cleanup(dev, &mr->pdir); - if (mr->umem) - ib_umem_release(mr->umem); + ib_umem_release(mr->umem); kfree(mr->pages); kfree(mr); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c index 0eaaead5baec..bca6a58a442e 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c @@ -391,12 +391,8 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd, err_pdir: pvrdma_page_dir_cleanup(dev, &qp->pdir); err_umem: - if (!qp->is_kernel) { - if (qp->rumem) - ib_umem_release(qp->rumem); - if (qp->sumem) - ib_umem_release(qp->sumem); - } + ib_umem_release(qp->rumem); + ib_umem_release(qp->sumem); err_qp: kfree(qp); atomic_dec(&dev->num_qps); @@ -429,12 +425,8 @@ static void pvrdma_free_qp(struct pvrdma_qp *qp) complete(&qp->free); wait_for_completion(&qp->free); - if (!qp->is_kernel) { - if (qp->rumem) - ib_umem_release(qp->rumem); - if (qp->sumem) - ib_umem_release(qp->sumem); - } + ib_umem_release(qp->rumem); + ib_umem_release(qp->sumem); pvrdma_page_dir_cleanup(dev, &qp->pdir); diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 54f3f9c27552..db800eb2b1f5 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -560,8 +560,7 @@ int rvt_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) 
if (ret) goto out; rvt_deinit_mregion(&mr->mr); - if (mr->umem) - ib_umem_release(mr->umem); + ib_umem_release(mr->umem); kfree(mr); out: return ret; } diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index f501f72489d8..ea6a819b7167 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -96,8 +96,7 @@ void rxe_mem_cleanup(struct rxe_pool_entry *arg) struct rxe_mem *mem = container_of(arg, typeof(*mem), pelem); int i; - if (mem->umem) - ib_umem_release(mem->umem); + ib_umem_release(mem->umem); if (mem->map) { for (i = 0; i < mem->num_map; i++) From da3929218a4481fc12f9eaa30c9edb09aad5ff24 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 16 Jun 2019 15:05:58 +0300 Subject: [PATCH 072/194] RDMA/hns: Don't get stuck in endless timeout loop The "end" variable is declared as unsigned and can't be negative, which means the timeout limit is never honored, so let's convert the logic to ensure that the loop is bounded. drivers/infiniband/hw/hns/hns_roce_hw_v1.c: In function 'hns_roce_v1_clear_hem': drivers/infiniband/hw/hns/hns_roce_hw_v1.c:2471:12: warning: comparison of unsigned expression < 0 is always false [-Wtype-limits] 2471 | if (end < 0) { | ^ Fixes: 669cefb654cb ("RDMA/hns: Remove jiffies operation in disable interrupt context") Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hns/hns_roce_hem.h | 2 +- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.h b/drivers/infiniband/hw/hns/hns_roce_hem.h index e865fc8a18a7..f1ccb8f35fe5 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.h +++ b/drivers/infiniband/hw/hns/hns_roce_hem.h @@ -34,8 +34,8 @@ #ifndef _HNS_ROCE_HEM_H #define _HNS_ROCE_HEM_H -#define HW_SYNC_TIMEOUT_MSECS 500 #define HW_SYNC_SLEEP_TIME_INTERVAL 20 +#define HW_SYNC_TIMEOUT_MSECS (25 * HW_SYNC_SLEEP_TIME_INTERVAL) #define BT_CMD_SYNC_SHIFT 31 enum { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index cb004190ccba..2c0bc2536fda 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -2467,7 +2467,7 @@ static int hns_roce_v1_clear_hem(struct hns_roce_dev *hr_dev, end = HW_SYNC_TIMEOUT_MSECS; while (1) { if (readl(bt_cmd) >> BT_CMD_SYNC_SHIFT) { - if (end < 0) { + if (!end) { dev_err(dev, "Write bt_cmd err,hw_sync is not zero.\n"); spin_unlock_irqrestore(&hr_dev->bt_cmd_lock, flags); From d384742ed1afa925bb78fd50ff337538f202f19c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 14 Jun 2019 00:13:51 +0000 Subject: [PATCH 073/194] RDMA/uverbs: Use offsetofend instead of opencoding Discovered this was available already. Signed-off-by: Jason Gunthorpe Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 5c00d9a5698a..d13b87d27ce5 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -3708,9 +3708,6 @@ static int ib_uverbs_ex_modify_cq(struct uverbs_attr_bundle *attrs) * trailing driver_data flex array. In this case the size of the base struct * cannot be changed.
*/ -#define offsetof_after(_struct, _member) \ - (offsetof(_struct, _member) + sizeof(((_struct *)NULL)->_member)) - #define UAPI_DEF_WRITE_IO(req, resp) \ .write.has_resp = 1 + \ BUILD_BUG_ON_ZERO(offsetof(req, response) != 0) + \ @@ -3741,11 +3738,11 @@ static int ib_uverbs_ex_modify_cq(struct uverbs_attr_bundle *attrs) */ #define UAPI_DEF_WRITE_IO_EX(req, req_last_member, resp, resp_last_member) \ .write.has_resp = 1, \ - .write.req_size = offsetof_after(req, req_last_member), \ - .write.resp_size = offsetof_after(resp, resp_last_member) + .write.req_size = offsetofend(req, req_last_member), \ + .write.resp_size = offsetofend(resp, resp_last_member) #define UAPI_DEF_WRITE_I_EX(req, req_last_member) \ - .write.req_size = offsetof_after(req, req_last_member) + .write.req_size = offsetofend(req, req_last_member) const struct uapi_definition uverbs_def_write_intf[] = { DECLARE_UVERBS_OBJECT( From dd82e668892ead6fe97c97eabd7ba28e296052c6 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 13 Jun 2019 21:46:45 -0300 Subject: [PATCH 074/194] RDMA/odp: Do not leak dma maps when working with huge pages The ib_dma_unmap_page() must match the length of the ib_dma_map_page(), which is based on odp_shift. Otherwise iommu resources under this API will not be properly freed. Signed-off-by: Jason Gunthorpe Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem_odp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index eb9939d52818..2a75c6f8d827 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -730,7 +730,8 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, WARN_ON(!dma_addr); - ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE, + ib_dma_unmap_page(dev, dma_addr, + BIT(umem_odp->page_shift), DMA_BIDIRECTIONAL); if (dma & ODP_WRITE_ALLOWED_BIT) { struct page *head_page = compound_head(page); From f56044d686c82bd31713fc0398d68e322813dc62 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Thu, 13 Jun 2019 08:30:44 -0400 Subject: [PATCH 075/194] IB/rdmavt: Add new completion inline There is opencoded send completion logic all over the drivers. We need to convert to this routine to enforce ordering for completions. This routine fixes an ordering issue where the read of the SWQE fields necessary for creating the completion can race with a post send if the post send catches a send queue at the edge of being full. It is possible in that situation to read SWQE fields that are being written. This new routine ensures that SWQE fields are read prior to advancing the index that post send uses to determine queue fullness. Reviewed-by: Michael J.
Ruhl Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- include/rdma/rdmavt_qp.h | 72 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 68e38c20afc0..6014f1766907 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -737,6 +737,78 @@ static inline void rvt_put_qp_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe) atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount); } +/** + * rvt_qp_sqwe_incr - increment ring index + * @qp: the qp + * @val: the starting value + * + * Return: the new value wrapping as appropriate + */ +static inline u32 +rvt_qp_swqe_incr(struct rvt_qp *qp, u32 val) +{ + if (++val >= qp->s_size) + val = 0; + return val; +} + +/** + * rvt_qp_complete_swqe - insert send completion + * @qp - the qp + * @wqe - the send wqe + * @opcode - wc operation (driver dependent) + * @status - completion status + * + * Update the s_last information, and then insert a send + * completion into the completion + * queue if the qp indicates it should be done. + * + * See IBTA 10.7.3.1 for info on completion + * control. + * + * Return: new last + */ +static inline u32 +rvt_qp_complete_swqe(struct rvt_qp *qp, + struct rvt_swqe *wqe, + enum ib_wc_opcode opcode, + enum ib_wc_status status) +{ + bool need_completion; + u64 wr_id; + u32 byte_len, last; + int flags = wqe->wr.send_flags; + + rvt_put_qp_swqe(qp, wqe); + + need_completion = + !(flags & RVT_SEND_RESERVE_USED) && + (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) || + (flags & IB_SEND_SIGNALED) || + status != IB_WC_SUCCESS); + if (need_completion) { + wr_id = wqe->wr.wr_id; + byte_len = wqe->length; + /* above fields required before writing s_last */ + } + last = rvt_qp_swqe_incr(qp, qp->s_last); + /* see rvt_qp_is_avail() */ + smp_store_release(&qp->s_last, last); + if (need_completion) { + struct ib_wc w = { + .wr_id = wr_id, + .status = status, + .opcode = opcode, + .qp = &qp->ibqp, + .byte_len = byte_len, + }; + + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &w, + status != IB_WC_SUCCESS); + } + return last; +} + extern const int ib_rvt_state_ops[]; struct rvt_dev_info; From 4a9ceb7dbadf9e1435644b1f49720ee87431ce26 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Thu, 13 Jun 2019 08:30:52 -0400 Subject: [PATCH 076/194] IB/{rdmavt, qib, hfi1}: Convert to new completion API Convert all completions to use the new completion routine that fixes a race between post send and completion where fields from a SWQE can be read after SWQE has been freed. This patch also addresses issues reported in https://marc.info/?l=linux-kernel&m=155656897409107&w=2. The reserved operation path has no need for any barrier. The barrier for the other path is addressed by the smp_load_acquire() barrier. Cc: Andrea Parri Reviewed-by: Michael J. 
Ruhl Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 26 ++++----------------- drivers/infiniband/hw/qib/qib_rc.c | 26 ++++----------------- drivers/infiniband/sw/rdmavt/qp.c | 31 ++++++++----------------- include/rdma/rdmavt_qp.h | 36 ------------------------------ 4 files changed, 17 insertions(+), 102 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index a922edcf23d6..84b51cc36dbd 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -1819,23 +1819,14 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) } while (qp->s_last != qp->s_acked) { - u32 s_last; - wqe = rvt_get_swqe_ptr(qp, qp->s_last); if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 && cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) break; trdma_clean_swqe(qp, wqe); rvt_qp_wqe_unreserve(qp, wqe); - s_last = qp->s_last; - trace_hfi1_qp_send_completion(qp, wqe, s_last); - if (++s_last >= qp->s_size) - s_last = 0; - qp->s_last = s_last; - /* see post_send() */ - barrier(); - rvt_put_qp_swqe(qp, wqe); - rvt_qp_swqe_complete(qp, + trace_hfi1_qp_send_completion(qp, wqe, qp->s_last); + rvt_qp_complete_swqe(qp, wqe, ib_hfi1_wc_opcode[wqe->wr.opcode], IB_WC_SUCCESS); @@ -1879,19 +1870,10 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, trace_hfi1_rc_completion(qp, wqe->lpsn); if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 || cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { - u32 s_last; - trdma_clean_swqe(qp, wqe); - rvt_put_qp_swqe(qp, wqe); rvt_qp_wqe_unreserve(qp, wqe); - s_last = qp->s_last; - trace_hfi1_qp_send_completion(qp, wqe, s_last); - if (++s_last >= qp->s_size) - s_last = 0; - qp->s_last = s_last; - /* see post_send() */ - barrier(); - rvt_qp_swqe_complete(qp, + trace_hfi1_qp_send_completion(qp, wqe, qp->s_last); + rvt_qp_complete_swqe(qp, wqe, ib_hfi1_wc_opcode[wqe->wr.opcode], IB_WC_SUCCESS); diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index 2ac4c67f5ba1..8d9a94d6f685 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -921,20 +921,11 @@ void qib_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr) rvt_add_retry_timer(qp); while (qp->s_last != qp->s_acked) { - u32 s_last; - wqe = rvt_get_swqe_ptr(qp, qp->s_last); if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) >= 0 && qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) break; - s_last = qp->s_last; - if (++s_last >= qp->s_size) - s_last = 0; - qp->s_last = s_last; - /* see post_send() */ - barrier(); - rvt_put_qp_swqe(qp, wqe); - rvt_qp_swqe_complete(qp, + rvt_qp_complete_swqe(qp, wqe, ib_qib_wc_opcode[wqe->wr.opcode], IB_WC_SUCCESS); @@ -972,21 +963,12 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, * is finished. 
*/ if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) < 0 || - qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { - u32 s_last; - - rvt_put_qp_swqe(qp, wqe); - s_last = qp->s_last; - if (++s_last >= qp->s_size) - s_last = 0; - qp->s_last = s_last; - /* see post_send() */ - barrier(); - rvt_qp_swqe_complete(qp, + qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) + rvt_qp_complete_swqe(qp, wqe, ib_qib_wc_opcode[wqe->wr.opcode], IB_WC_SUCCESS); - } else + else this_cpu_inc(*ibp->rvp.rc_delayed_comp); qp->s_retry = qp->s_retry_cnt; diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index a60f5faea198..dfbc7d8640fb 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1853,10 +1853,9 @@ static inline int rvt_qp_is_avail( /* see rvt_qp_wqe_unreserve() */ smp_mb__before_atomic(); - reserved_used = atomic_read(&qp->s_reserved_used); if (unlikely(reserved_op)) { /* see rvt_qp_wqe_unreserve() */ - smp_mb__before_atomic(); + reserved_used = atomic_read(&qp->s_reserved_used); if (reserved_used >= rdi->dparms.reserved_operations) return -ENOMEM; return 0; @@ -1864,14 +1863,13 @@ static inline int rvt_qp_is_avail( /* non-reserved operations */ if (likely(qp->s_avail)) return 0; - slast = READ_ONCE(qp->s_last); + /* See rvt_qp_complete_swqe() */ + slast = smp_load_acquire(&qp->s_last); if (qp->s_head >= slast) avail = qp->s_size - (qp->s_head - slast); else avail = slast - qp->s_head; - /* see rvt_qp_wqe_unreserve() */ - smp_mb__before_atomic(); reserved_used = atomic_read(&qp->s_reserved_used); avail = avail - 1 - (rdi->dparms.reserved_operations - reserved_used); @@ -2664,27 +2662,16 @@ void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, enum ib_wc_status status) { u32 old_last, last; - struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + struct rvt_dev_info *rdi; if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND)) return; + rdi = ib_to_rvt(qp->ibqp.device); - last = qp->s_last; - old_last = last; - trace_rvt_qp_send_completion(qp, wqe, last); - if (++last >= qp->s_size) - last = 0; - trace_rvt_qp_send_completion(qp, wqe, last); - qp->s_last = last; - /* See post_send() */ - barrier(); - rvt_put_qp_swqe(qp, wqe); - - rvt_qp_swqe_complete(qp, - wqe, - rdi->wc_opcode[wqe->wr.opcode], - status); - + old_last = qp->s_last; + trace_rvt_qp_send_completion(qp, wqe, old_last); + last = rvt_qp_complete_swqe(qp, wqe, rdi->wc_opcode[wqe->wr.opcode], + status); if (qp->s_acked == old_last) qp->s_acked = last; if (qp->s_cur == old_last) diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 6014f1766907..84d0f36afc2f 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -565,42 +565,6 @@ static inline void rvt_qp_wqe_unreserve( extern const enum ib_wc_opcode ib_rvt_wc_opcode[]; -/** - * rvt_qp_swqe_complete() - insert send completion - * @qp - the qp - * @wqe - the send wqe - * @status - completion status - * - * Insert a send completion into the completion - * queue if the qp indicates it should be done. - * - * See IBTA 10.7.3.1 for info on completion - * control. 
- */ -static inline void rvt_qp_swqe_complete( - struct rvt_qp *qp, - struct rvt_swqe *wqe, - enum ib_wc_opcode opcode, - enum ib_wc_status status) -{ - if (unlikely(wqe->wr.send_flags & RVT_SEND_RESERVE_USED)) - return; - if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) || - (wqe->wr.send_flags & IB_SEND_SIGNALED) || - status != IB_WC_SUCCESS) { - struct ib_wc wc; - - memset(&wc, 0, sizeof(wc)); - wc.wr_id = wqe->wr.wr_id; - wc.status = status; - wc.opcode = opcode; - wc.qp = &qp->ibqp; - wc.byte_len = wqe->length; - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, - status != IB_WC_SUCCESS); - } -} - /* * Compare the lower 24 bits of the msn values. * Returns an integer <, ==, or > than zero. From 40ddb3f020834f9afb7aab31385994811f4db259 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 13 Jun 2019 12:10:12 +0300 Subject: [PATCH 077/194] RDMA/efa: Use API to get contiguous memory blocks aligned to device supported page size Use the ib_umem_find_best_pgsz() and rdma_for_each_block() API when registering an MR instead of coding it in the driver. ib_umem_find_best_pgsz() is used to find the best suitable page size which replaces the existing efa_cont_pages() implementation. rdma_for_each_block() is used to iterate the umem in aligned contiguous memory blocks. Reviewed-by: Firas JahJah Reviewed-by: Yossi Leybovich Reviewed-by: Shiraz Saleem Signed-off-by: Gal Pressman Signed-off-by: Doug Ledford --- drivers/infiniband/hw/efa/efa_verbs.c | 88 +++++++-------------------- 1 file changed, 21 insertions(+), 67 deletions(-) diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 5e6e5eb65cff..f187f1acdb55 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1011,21 +1011,15 @@ static int umem_to_page_list(struct efa_dev *dev, u8 hp_shift) { u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT); - struct sg_dma_page_iter sg_iter; - unsigned int page_idx = 0; + struct ib_block_iter biter; unsigned int hp_idx = 0; ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n", hp_cnt, pages_in_hp); - for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) { - if (page_idx % pages_in_hp == 0) { - page_list[hp_idx] = sg_page_iter_dma_address(&sg_iter); - hp_idx++; - } - - page_idx++; - } + rdma_for_each_block(umem->sg_head.sgl, &biter, umem->nmap, + BIT(hp_shift)) + page_list[hp_idx++] = rdma_block_iter_dma_address(&biter); return 0; } @@ -1356,56 +1350,6 @@ static int efa_create_pbl(struct efa_dev *dev, return 0; } -static void efa_cont_pages(struct ib_umem *umem, u64 addr, - unsigned long max_page_shift, - int *count, u8 *shift, u32 *ncont) -{ - struct scatterlist *sg; - u64 base = ~0, p = 0; - unsigned long tmp; - unsigned long m; - u64 len, pfn; - int i = 0; - int entry; - - addr = addr >> PAGE_SHIFT; - tmp = (unsigned long)addr; - m = find_first_bit(&tmp, BITS_PER_LONG); - if (max_page_shift) - m = min_t(unsigned long, max_page_shift - PAGE_SHIFT, m); - - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { - len = DIV_ROUND_UP(sg_dma_len(sg), PAGE_SIZE); - pfn = sg_dma_address(sg) >> PAGE_SHIFT; - if (base + p != pfn) { - /* - * If either the offset or the new - * base are unaligned update m - */ - tmp = (unsigned long)(pfn | p); - if (!IS_ALIGNED(tmp, 1 << m)) - m = find_first_bit(&tmp, BITS_PER_LONG); - - base = pfn; - p = 0; - } - - p += len; - i += len; - } - - if (i) { - m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m); - *ncont = DIV_ROUND_UP(i, (1 << m)); - } else { - m = 0; - *ncont = 0; - } - - 
*shift = PAGE_SHIFT + m; - *count = i; -} - struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata) @@ -1413,11 +1357,10 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, struct efa_dev *dev = to_edev(ibpd->device); struct efa_com_reg_mr_params params = {}; struct efa_com_reg_mr_result result = {}; - unsigned long max_page_shift; struct pbl_context pbl; + unsigned int pg_sz; struct efa_mr *mr; int inline_size; - int npages; int err; if (udata->inlen && @@ -1454,13 +1397,24 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, params.iova = virt_addr; params.mr_length_in_bytes = length; params.permissions = access_flags & 0x1; - max_page_shift = fls64(dev->dev_attr.page_size_cap); - efa_cont_pages(mr->umem, start, max_page_shift, &npages, - ¶ms.page_shift, ¶ms.page_num); + pg_sz = ib_umem_find_best_pgsz(mr->umem, + dev->dev_attr.page_size_cap, + virt_addr); + if (!pg_sz) { + err = -EOPNOTSUPP; + ibdev_dbg(&dev->ibdev, "Failed to find a suitable page size in page_size_cap %#llx\n", + dev->dev_attr.page_size_cap); + goto err_unmap; + } + + params.page_shift = __ffs(pg_sz); + params.page_num = DIV_ROUND_UP(length + (start & (pg_sz - 1)), + pg_sz); + ibdev_dbg(&dev->ibdev, - "start %#llx length %#llx npages %d params.page_shift %u params.page_num %u\n", - start, length, npages, params.page_shift, params.page_num); + "start %#llx length %#llx params.page_shift %u params.page_num %u\n", + start, length, params.page_shift, params.page_num); inline_size = ARRAY_SIZE(params.pbl.inline_pbl_array); if (params.page_num <= inline_size) { From b41f75724ab8aeeba1001e63c31db8623432f001 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 13 Jun 2019 12:10:13 +0300 Subject: [PATCH 078/194] RDMA/efa: Be consistent with success flow return value The EFA driver is written with success oriented flows in mind, meaning that functions should mostly end with a return 0 statement. Error flows return their error value on their own instead of assuming that the function will return the error at the end. This commit fixes a bunch of functions that were not aligned with this behavior. 
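As a rough illustration of the convention (a hedged sketch with made-up example_* names, not EFA driver code): error flows return their own error code where the failure is detected, and the success flow ends in an explicit return 0 rather than returning a status variable that merely happens to be zero.

	/* Hypothetical helpers; only the return-value style matters here. */
	static int example_setup(struct example_dev *dev)
	{
		int err;

		err = example_alloc_resources(dev);
		if (err)
			return err;	/* error flow returns its own value */

		err = example_enable(dev);
		if (err) {
			example_free_resources(dev);
			return err;
		}

		return 0;	/* success flow ends with an explicit 0 */
	}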
Reviewed-by: Firas JahJah Reviewed-by: Yossi Leybovich Signed-off-by: Gal Pressman Signed-off-by: Doug Ledford --- drivers/infiniband/hw/efa/efa_com_cmd.c | 4 ++-- drivers/infiniband/hw/efa/efa_main.c | 2 +- drivers/infiniband/hw/efa/efa_verbs.c | 6 ++++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c index 91e7f2195802..d2464c8390bb 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.c +++ b/drivers/infiniband/hw/efa/efa_com_cmd.c @@ -56,7 +56,7 @@ int efa_com_create_qp(struct efa_com_dev *edev, res->send_sub_cq_idx = cmd_completion.send_sub_cq_idx; res->recv_sub_cq_idx = cmd_completion.recv_sub_cq_idx; - return err; + return 0; } int efa_com_modify_qp(struct efa_com_dev *edev, @@ -178,7 +178,7 @@ int efa_com_create_cq(struct efa_com_dev *edev, result->cq_idx = cmd_completion.cq_idx; result->actual_depth = params->cq_depth; - return err; + return 0; } int efa_com_destroy_cq(struct efa_com_dev *edev, diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c index 46861461dd2d..dd1c6d49466f 100644 --- a/drivers/infiniband/hw/efa/efa_main.c +++ b/drivers/infiniband/hw/efa/efa_main.c @@ -100,7 +100,7 @@ static int efa_request_mgmnt_irq(struct efa_dev *dev) nr_cpumask_bits, &irq->affinity_hint_mask, irq->vector); irq_set_affinity_hint(irq->vector, &irq->affinity_hint_mask); - return err; + return 0; } static void efa_setup_mgmnt_irq(struct efa_dev *dev) diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index f187f1acdb55..7b4e0fa99817 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1594,13 +1594,15 @@ static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext, err = -EINVAL; } - if (err) + if (err) { ibdev_dbg( &dev->ibdev, "Couldn't mmap address[%#llx] length[%#llx] mmap_flag[%d] err[%d]\n", entry->address, length, entry->mmap_flag, err); + return err; + } - return err; + return 0; } int efa_mmap(struct ib_ucontext *ibucontext, From 4b06843d404855063decbccd206d3dc927280b0c Mon Sep 17 00:00:00 2001 From: Firas Jahjah Date: Thu, 13 Jun 2019 12:10:14 +0300 Subject: [PATCH 079/194] RDMA/efa: Print address on AH creation failure For debugging purposes, print destination address if failed to create AH. Signed-off-by: Firas Jahjah Reviewed-by: Yossi Leybovich Signed-off-by: Gal Pressman Signed-off-by: Doug Ledford --- drivers/infiniband/hw/efa/efa_com_cmd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c index d2464c8390bb..fad5c2ee7bb1 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.c +++ b/drivers/infiniband/hw/efa/efa_com_cmd.c @@ -300,7 +300,8 @@ int efa_com_create_ah(struct efa_com_dev *edev, (struct efa_admin_acq_entry *)&cmd_completion, sizeof(cmd_completion)); if (err) { - ibdev_err(edev->efa_dev, "Failed to create ah [%d]\n", err); + ibdev_err(edev->efa_dev, "Failed to create ah for %pI6 [%d]\n", + ah_cmd.dest_addr, err); return err; } From 36b1e47ff0c196a95d5e55a05b3f988f827cce7e Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 11 Jun 2019 18:52:37 +0300 Subject: [PATCH 080/194] RDMA/core: Introduce new header file for signature operations Ease the exhausted ib_verbs.h file and make the code more readable. 
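As a point of reference, a hedged sketch of a consumer of the relocated definitions (the function and its protection policy are illustrative only; the types and constants are the ones moved into the new header):

	#include <rdma/signature.h>

	/* Illustrative only: T10-DIF CRC protection on the wire domain,
	 * no protection on the memory domain. */
	static void example_fill_sig_attrs(struct ib_sig_attrs *attrs, u16 interval)
	{
		memset(attrs, 0, sizeof(*attrs));
		attrs->mem.sig_type = IB_SIG_TYPE_NONE;
		attrs->wire.sig_type = IB_SIG_TYPE_T10_DIF;
		attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC;
		attrs->wire.sig.dif.pi_interval = interval;
		attrs->check_mask = IB_SIG_CHECK_GUARD | IB_SIG_CHECK_REFTAG;
	}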
Signed-off-by: Max Gurtovoy Signed-off-by: Israel Rukshin Reviewed-by: Leon Romanovsky Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 112 +----------------------------------- include/rdma/signature.h | 120 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+), 111 deletions(-) create mode 100644 include/rdma/signature.h diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 805148a12660..dc59fa12669a 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -264,17 +265,6 @@ enum ib_device_cap_flags { IB_DEVICE_ALLOW_USER_UNREG = (1ULL << 37), }; -enum ib_signature_prot_cap { - IB_PROT_T10DIF_TYPE_1 = 1, - IB_PROT_T10DIF_TYPE_2 = 1 << 1, - IB_PROT_T10DIF_TYPE_3 = 1 << 2, -}; - -enum ib_signature_guard_cap { - IB_GUARD_T10DIF_CRC = 1, - IB_GUARD_T10DIF_CSUM = 1 << 1, -}; - enum ib_atomic_cap { IB_ATOMIC_NONE, IB_ATOMIC_HCA, @@ -799,106 +789,6 @@ enum ib_mr_type { IB_MR_TYPE_SG_GAPS, }; -/** - * Signature types - * IB_SIG_TYPE_NONE: Unprotected. - * IB_SIG_TYPE_T10_DIF: Type T10-DIF - */ -enum ib_signature_type { - IB_SIG_TYPE_NONE, - IB_SIG_TYPE_T10_DIF, -}; - -/** - * Signature T10-DIF block-guard types - * IB_T10DIF_CRC: Corresponds to T10-PI mandated CRC checksum rules. - * IB_T10DIF_CSUM: Corresponds to IP checksum rules. - */ -enum ib_t10_dif_bg_type { - IB_T10DIF_CRC, - IB_T10DIF_CSUM -}; - -/** - * struct ib_t10_dif_domain - Parameters specific for T10-DIF - * domain. - * @bg_type: T10-DIF block guard type (CRC|CSUM) - * @pi_interval: protection information interval. - * @bg: seed of guard computation. - * @app_tag: application tag of guard block - * @ref_tag: initial guard block reference tag. - * @ref_remap: Indicate wethear the reftag increments each block - * @app_escape: Indicate to skip block check if apptag=0xffff - * @ref_escape: Indicate to skip block check if reftag=0xffffffff - * @apptag_check_mask: check bitmask of application tag. - */ -struct ib_t10_dif_domain { - enum ib_t10_dif_bg_type bg_type; - u16 pi_interval; - u16 bg; - u16 app_tag; - u32 ref_tag; - bool ref_remap; - bool app_escape; - bool ref_escape; - u16 apptag_check_mask; -}; - -/** - * struct ib_sig_domain - Parameters for signature domain - * @sig_type: specific signauture type - * @sig: union of all signature domain attributes that may - * be used to set domain layout. - */ -struct ib_sig_domain { - enum ib_signature_type sig_type; - union { - struct ib_t10_dif_domain dif; - } sig; -}; - -/** - * struct ib_sig_attrs - Parameters for signature handover operation - * @check_mask: bitmask for signature byte check (8 bytes) - * @mem: memory domain layout desciptor. - * @wire: wire domain layout desciptor. 
- */ -struct ib_sig_attrs { - u8 check_mask; - struct ib_sig_domain mem; - struct ib_sig_domain wire; -}; - -enum ib_sig_err_type { - IB_SIG_BAD_GUARD, - IB_SIG_BAD_REFTAG, - IB_SIG_BAD_APPTAG, -}; - -/** - * Signature check masks (8 bytes in total) according to the T10-PI standard: - * -------- -------- ------------ - * | GUARD | APPTAG | REFTAG | - * | 2B | 2B | 4B | - * -------- -------- ------------ - */ -enum { - IB_SIG_CHECK_GUARD = 0xc0, - IB_SIG_CHECK_APPTAG = 0x30, - IB_SIG_CHECK_REFTAG = 0x0f, -}; - -/** - * struct ib_sig_err - signature error descriptor - */ -struct ib_sig_err { - enum ib_sig_err_type err_type; - u32 expected; - u32 actual; - u64 sig_err_offset; - u32 key; -}; - enum ib_mr_status_check { IB_MR_CHECK_SIG_STATUS = 1, }; diff --git a/include/rdma/signature.h b/include/rdma/signature.h new file mode 100644 index 000000000000..5998fe94dfd4 --- /dev/null +++ b/include/rdma/signature.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) */ +/* + * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved. + */ + +#ifndef _RDMA_SIGNATURE_H_ +#define _RDMA_SIGNATURE_H_ + +enum ib_signature_prot_cap { + IB_PROT_T10DIF_TYPE_1 = 1, + IB_PROT_T10DIF_TYPE_2 = 1 << 1, + IB_PROT_T10DIF_TYPE_3 = 1 << 2, +}; + +enum ib_signature_guard_cap { + IB_GUARD_T10DIF_CRC = 1, + IB_GUARD_T10DIF_CSUM = 1 << 1, +}; + +/** + * enum ib_signature_type - Signature types + * @IB_SIG_TYPE_NONE: Unprotected. + * @IB_SIG_TYPE_T10_DIF: Type T10-DIF + */ +enum ib_signature_type { + IB_SIG_TYPE_NONE, + IB_SIG_TYPE_T10_DIF, +}; + +/** + * enum ib_t10_dif_bg_type - Signature T10-DIF block-guard types + * @IB_T10DIF_CRC: Corresponds to T10-PI mandated CRC checksum rules. + * @IB_T10DIF_CSUM: Corresponds to IP checksum rules. + */ +enum ib_t10_dif_bg_type { + IB_T10DIF_CRC, + IB_T10DIF_CSUM, +}; + +/** + * struct ib_t10_dif_domain - Parameters specific for T10-DIF + * domain. + * @bg_type: T10-DIF block guard type (CRC|CSUM) + * @pi_interval: protection information interval. + * @bg: seed of guard computation. + * @app_tag: application tag of guard block + * @ref_tag: initial guard block reference tag. + * @ref_remap: Indicate wethear the reftag increments each block + * @app_escape: Indicate to skip block check if apptag=0xffff + * @ref_escape: Indicate to skip block check if reftag=0xffffffff + * @apptag_check_mask: check bitmask of application tag. + */ +struct ib_t10_dif_domain { + enum ib_t10_dif_bg_type bg_type; + u16 pi_interval; + u16 bg; + u16 app_tag; + u32 ref_tag; + bool ref_remap; + bool app_escape; + bool ref_escape; + u16 apptag_check_mask; +}; + +/** + * struct ib_sig_domain - Parameters for signature domain + * @sig_type: specific signauture type + * @sig: union of all signature domain attributes that may + * be used to set domain layout. + */ +struct ib_sig_domain { + enum ib_signature_type sig_type; + union { + struct ib_t10_dif_domain dif; + } sig; +}; + +/** + * struct ib_sig_attrs - Parameters for signature handover operation + * @check_mask: bitmask for signature byte check (8 bytes) + * @mem: memory domain layout descriptor. + * @wire: wire domain layout descriptor. 
+ */ +struct ib_sig_attrs { + u8 check_mask; + struct ib_sig_domain mem; + struct ib_sig_domain wire; +}; + +enum ib_sig_err_type { + IB_SIG_BAD_GUARD, + IB_SIG_BAD_REFTAG, + IB_SIG_BAD_APPTAG, +}; + +/* + * Signature check masks (8 bytes in total) according to the T10-PI standard: + * -------- -------- ------------ + * | GUARD | APPTAG | REFTAG | + * | 2B | 2B | 4B | + * -------- -------- ------------ + */ +enum { + IB_SIG_CHECK_GUARD = 0xc0, + IB_SIG_CHECK_APPTAG = 0x30, + IB_SIG_CHECK_REFTAG = 0x0f, +}; + +/* + * struct ib_sig_err - signature error descriptor + */ +struct ib_sig_err { + enum ib_sig_err_type err_type; + u32 expected; + u32 actual; + u64 sig_err_offset; + u32 key; +}; + +#endif /* _RDMA_SIGNATURE_H_ */ From a0bc099abf7b45b16cb18459f3516af8c2fea781 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 11 Jun 2019 18:52:38 +0300 Subject: [PATCH 081/194] RDMA/core: Save the MR type in the ib_mr structure This is a preparation for the signature verbs API change. This change is needed since the MR type will define, in the upcoming patches, the need for allocating internal resources in LLD for signature handover related operations. It will also help to make sure that signature related functions are called with an appropriate MR type and fail otherwise. Also introduce new mr types IB_MR_TYPE_USER, IB_MR_TYPE_DMA and IB_MR_TYPE_DM for correctness. Signed-off-by: Max Gurtovoy Signed-off-by: Israel Rukshin Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 1 + drivers/infiniband/core/uverbs_std_types_mr.c | 1 + drivers/infiniband/core/verbs.c | 2 ++ include/rdma/ib_verbs.h | 10 ++++++++++ 4 files changed, 14 insertions(+) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index d13b87d27ce5..689275c2894f 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -745,6 +745,7 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs) mr->device = pd->device; mr->pd = pd; + mr->type = IB_MR_TYPE_USER; mr->dm = NULL; mr->uobject = uobj; atomic_inc(&pd->usecnt); diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c index 610d3b9f7654..7ca79bfa3487 100644 --- a/drivers/infiniband/core/uverbs_std_types_mr.c +++ b/drivers/infiniband/core/uverbs_std_types_mr.c @@ -128,6 +128,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)( mr->device = pd->device; mr->pd = pd; + mr->type = IB_MR_TYPE_DM; mr->dm = dm; mr->uobject = uobj; atomic_inc(&pd->usecnt); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 16ef8a9bda4c..10ff85b79d25 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -299,6 +299,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, mr->device = pd->device; mr->pd = pd; + mr->type = IB_MR_TYPE_DMA; mr->uobject = NULL; mr->need_inval = false; @@ -2020,6 +2021,7 @@ struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type, mr->need_inval = false; mr->res.type = RDMA_RESTRACK_MR; rdma_restrack_kadd(&mr->res); + mr->type = mr_type; } return mr; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index dc59fa12669a..b6ec71ee4d3e 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -782,11 +782,20 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); * register any arbitrary sg lists (without * the normal mr constraints - see * 
ib_map_mr_sg) + * @IB_MR_TYPE_DM: memory region that is used for device + * memory registration + * @IB_MR_TYPE_USER: memory region that is used for the user-space + * application + * @IB_MR_TYPE_DMA: memory region that is used for DMA operations + * without address translations (VA=PA) */ enum ib_mr_type { IB_MR_TYPE_MEM_REG, IB_MR_TYPE_SIGNATURE, IB_MR_TYPE_SG_GAPS, + IB_MR_TYPE_DM, + IB_MR_TYPE_USER, + IB_MR_TYPE_DMA, }; enum ib_mr_status_check { @@ -1719,6 +1728,7 @@ struct ib_mr { u64 iova; u64 length; unsigned int page_size; + enum ib_mr_type type; bool need_inval; union { struct ib_uobject *uobject; /* user */ From 26bc7eaee94fd904d1817fee4d864f8526807465 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 11 Jun 2019 18:52:39 +0300 Subject: [PATCH 082/194] RDMA/core: Introduce IB_MR_TYPE_INTEGRITY and ib_alloc_mr_integrity API This is a preparation for signature verbs API re-design. In the new design a single MR with IB_MR_TYPE_INTEGRITY type will be used to perform the needed mapping for data integrity operations. Signed-off-by: Israel Rukshin Signed-off-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 1 + drivers/infiniband/core/verbs.c | 46 ++++++++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 10 +++++++ 3 files changed, 57 insertions(+) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 1de4ae5d5e0e..dba385410715 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2437,6 +2437,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, alloc_fmr); SET_DEVICE_OP(dev_ops, alloc_hw_stats); SET_DEVICE_OP(dev_ops, alloc_mr); + SET_DEVICE_OP(dev_ops, alloc_mr_integrity); SET_DEVICE_OP(dev_ops, alloc_mw); SET_DEVICE_OP(dev_ops, alloc_pd); SET_DEVICE_OP(dev_ops, alloc_rdma_netdev); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 10ff85b79d25..82d62bc7af65 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -2011,6 +2011,9 @@ struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type, if (!pd->device->ops.alloc_mr) return ERR_PTR(-EOPNOTSUPP); + if (WARN_ON_ONCE(mr_type == IB_MR_TYPE_INTEGRITY)) + return ERR_PTR(-EINVAL); + mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg, udata); if (!IS_ERR(mr)) { mr->device = pd->device; @@ -2028,6 +2031,49 @@ struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type, } EXPORT_SYMBOL(ib_alloc_mr_user); +/** + * ib_alloc_mr_integrity() - Allocates an integrity memory region + * @pd: protection domain associated with the region + * @max_num_data_sg: maximum data sg entries available for registration + * @max_num_meta_sg: maximum metadata sg entries available for + * registration + * + * Notes: + * Memory registration page/sg lists must not exceed max_num_sg, + * also the integrity page/sg lists must not exceed max_num_meta_sg. 
+ * + */ +struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd, + u32 max_num_data_sg, + u32 max_num_meta_sg) +{ + struct ib_mr *mr; + + if (!pd->device->ops.alloc_mr_integrity) + return ERR_PTR(-EOPNOTSUPP); + + if (!max_num_meta_sg) + return ERR_PTR(-EINVAL); + + mr = pd->device->ops.alloc_mr_integrity(pd, max_num_data_sg, + max_num_meta_sg); + if (IS_ERR(mr)) + return mr; + + mr->device = pd->device; + mr->pd = pd; + mr->dm = NULL; + mr->uobject = NULL; + atomic_inc(&pd->usecnt); + mr->need_inval = false; + mr->res.type = RDMA_RESTRACK_MR; + rdma_restrack_kadd(&mr->res); + mr->type = IB_MR_TYPE_INTEGRITY; + + return mr; +} +EXPORT_SYMBOL(ib_alloc_mr_integrity); + /* "Fast" memory regions */ struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b6ec71ee4d3e..01bc04c8e220 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -788,6 +788,8 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); * application * @IB_MR_TYPE_DMA: memory region that is used for DMA operations * without address translations (VA=PA) + * @IB_MR_TYPE_INTEGRITY: memory region that is used for + * data integrity operations */ enum ib_mr_type { IB_MR_TYPE_MEM_REG, @@ -796,6 +798,7 @@ enum ib_mr_type { IB_MR_TYPE_DM, IB_MR_TYPE_USER, IB_MR_TYPE_DMA, + IB_MR_TYPE_INTEGRITY, }; enum ib_mr_status_check { @@ -2363,6 +2366,9 @@ struct ib_device_ops { int (*dereg_mr)(struct ib_mr *mr, struct ib_udata *udata); struct ib_mr *(*alloc_mr)(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg, struct ib_udata *udata); + struct ib_mr *(*alloc_mr_integrity)(struct ib_pd *pd, + u32 max_num_data_sg, + u32 max_num_meta_sg); int (*advise_mr)(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, u32 flags, struct ib_sge *sg_list, u32 num_sge, @@ -4042,6 +4048,10 @@ static inline struct ib_mr *ib_alloc_mr(struct ib_pd *pd, return ib_alloc_mr_user(pd, mr_type, max_num_sg, NULL); } +struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd, + u32 max_num_data_sg, + u32 max_num_meta_sg); + /** * ib_update_fast_reg_key - updates the key portion of the fast_reg MR * R_Key and L_Key. From 2cdfcdd8677b277b32d32ef8976802dc5d5f883f Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 11 Jun 2019 18:52:40 +0300 Subject: [PATCH 083/194] RDMA/core: Introduce ib_map_mr_sg_pi to map data/protection sgl's This function will map the previously dma mapped SG lists for PI (protection information) and data to an appropriate memory region for future registration. The given MR must be allocated as IB_MR_TYPE_INTEGRITY. 
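For orientation, a hedged usage sketch combining this with ib_alloc_mr_integrity() from the earlier patch (pd, the scatterlists and their entry counts are assumed to exist and to already be DMA mapped; a negative return indicates an error):

	struct ib_mr *mr;
	int ret;

	mr = ib_alloc_mr_integrity(pd, data_sg_nents, meta_sg_nents);
	if (IS_ERR(mr))
		return PTR_ERR(mr);

	ret = ib_map_mr_sg_pi(mr, data_sg, data_sg_nents, NULL,
			      meta_sg, meta_sg_nents, NULL, PAGE_SIZE);
	if (ret < 0) {
		ib_dereg_mr(mr);
		return ret;
	}

	/* mr now covers both lists and is ready for registration;
	 * its lkey/rkey can then be used for the I/O. */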
Signed-off-by: Max Gurtovoy Signed-off-by: Israel Rukshin Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 1 + drivers/infiniband/core/verbs.c | 40 +++++++++++++++++++++++++++++++- include/rdma/ib_verbs.h | 9 +++++++ 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index dba385410715..669c2d58e695 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2497,6 +2497,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, iw_reject); SET_DEVICE_OP(dev_ops, iw_rem_ref); SET_DEVICE_OP(dev_ops, map_mr_sg); + SET_DEVICE_OP(dev_ops, map_mr_sg_pi); SET_DEVICE_OP(dev_ops, map_phys_fmr); SET_DEVICE_OP(dev_ops, mmap); SET_DEVICE_OP(dev_ops, modify_ah); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 82d62bc7af65..c892022aa8ea 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -2049,7 +2049,8 @@ struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd, { struct ib_mr *mr; - if (!pd->device->ops.alloc_mr_integrity) + if (!pd->device->ops.alloc_mr_integrity || + !pd->device->ops.map_mr_sg_pi) return ERR_PTR(-EOPNOTSUPP); if (!max_num_meta_sg) @@ -2430,6 +2431,43 @@ int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid, } EXPORT_SYMBOL(ib_set_vf_guid); +/** + * ib_map_mr_sg_pi() - Map the dma mapped SG lists for PI (protection + * information) and set an appropriate memory region for registration. + * @mr: memory region + * @data_sg: dma mapped scatterlist for data + * @data_sg_nents: number of entries in data_sg + * @data_sg_offset: offset in bytes into data_sg + * @meta_sg: dma mapped scatterlist for metadata + * @meta_sg_nents: number of entries in meta_sg + * @meta_sg_offset: offset in bytes into meta_sg + * @page_size: page vector desired page size + * + * Constraints: + * - The MR must be allocated with type IB_MR_TYPE_INTEGRITY. + * + * Return: 0 on success. + * + * After this completes successfully, the memory region + * is ready for registration. + */ +int ib_map_mr_sg_pi(struct ib_mr *mr, struct scatterlist *data_sg, + int data_sg_nents, unsigned int *data_sg_offset, + struct scatterlist *meta_sg, int meta_sg_nents, + unsigned int *meta_sg_offset, unsigned int page_size) +{ + if (unlikely(!mr->device->ops.map_mr_sg_pi || + WARN_ON_ONCE(mr->type != IB_MR_TYPE_INTEGRITY))) + return -EOPNOTSUPP; + + mr->page_size = page_size; + + return mr->device->ops.map_mr_sg_pi(mr, data_sg, data_sg_nents, + data_sg_offset, meta_sg, + meta_sg_nents, meta_sg_offset); +} +EXPORT_SYMBOL(ib_map_mr_sg_pi); + /** * ib_map_mr_sg() - Map the largest prefix of a dma mapped SG list * and set it the memory region. diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 01bc04c8e220..632e133e7a59 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2437,6 +2437,11 @@ struct ib_device_ops { int (*read_counters)(struct ib_counters *counters, struct ib_counters_read_attr *counters_read_attr, struct uverbs_attr_bundle *attrs); + int (*map_mr_sg_pi)(struct ib_mr *mr, struct scatterlist *data_sg, + int data_sg_nents, unsigned int *data_sg_offset, + struct scatterlist *meta_sg, int meta_sg_nents, + unsigned int *meta_sg_offset); + /** * alloc_hw_stats - Allocate a struct rdma_hw_stats and fill in the * driver initialized data. 
The struct is kfree()'ed by the sysfs @@ -4236,6 +4241,10 @@ int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset, unsigned int page_size); +int ib_map_mr_sg_pi(struct ib_mr *mr, struct scatterlist *data_sg, + int data_sg_nents, unsigned int *data_sg_offset, + struct scatterlist *meta_sg, int meta_sg_nents, + unsigned int *meta_sg_offset, unsigned int page_size); static inline int ib_map_mr_sg_zbva(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, From 7c717d3aeeaabbfddd0fe949b501595a2e3469e4 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 11 Jun 2019 18:52:41 +0300 Subject: [PATCH 084/194] RDMA/core: Add signature attrs element for ib_mr structure This element will describe the needed characteristics for the signature operation per signature enabled memory region (type IB_MR_TYPE_INTEGRITY). Also add meta_length attribute to ib_sig_attrs structure for saving the mapped metadata length (needed for the new API implementation). Signed-off-by: Max Gurtovoy Signed-off-by: Israel Rukshin Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 1 + drivers/infiniband/core/verbs.c | 13 ++++++++++++- include/rdma/ib_verbs.h | 2 +- include/rdma/signature.h | 2 ++ 4 files changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 689275c2894f..911533081db5 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -747,6 +747,7 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs) mr->pd = pd; mr->type = IB_MR_TYPE_USER; mr->dm = NULL; + mr->sig_attrs = NULL; mr->uobject = uobj; atomic_inc(&pd->usecnt); mr->res.type = RDMA_RESTRACK_MR; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index c892022aa8ea..399c0d17b2b9 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1976,6 +1976,7 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata) { struct ib_pd *pd = mr->pd; struct ib_dm *dm = mr->dm; + struct ib_sig_attrs *sig_attrs = mr->sig_attrs; int ret; rdma_restrack_del(&mr->res); @@ -1984,6 +1985,7 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata) atomic_dec(&pd->usecnt); if (dm) atomic_dec(&dm->usecnt); + kfree(sig_attrs); } return ret; @@ -2025,6 +2027,7 @@ struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type, mr->res.type = RDMA_RESTRACK_MR; rdma_restrack_kadd(&mr->res); mr->type = mr_type; + mr->sig_attrs = NULL; } return mr; @@ -2048,6 +2051,7 @@ struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd, u32 max_num_meta_sg) { struct ib_mr *mr; + struct ib_sig_attrs *sig_attrs; if (!pd->device->ops.alloc_mr_integrity || !pd->device->ops.map_mr_sg_pi) @@ -2056,10 +2060,16 @@ struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd, if (!max_num_meta_sg) return ERR_PTR(-EINVAL); + sig_attrs = kzalloc(sizeof(struct ib_sig_attrs), GFP_KERNEL); + if (!sig_attrs) + return ERR_PTR(-ENOMEM); + mr = pd->device->ops.alloc_mr_integrity(pd, max_num_data_sg, max_num_meta_sg); - if (IS_ERR(mr)) + if (IS_ERR(mr)) { + kfree(sig_attrs); return mr; + } mr->device = pd->device; mr->pd = pd; @@ -2070,6 +2080,7 @@ struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd, mr->res.type = RDMA_RESTRACK_MR; rdma_restrack_kadd(&mr->res); mr->type = IB_MR_TYPE_INTEGRITY; + mr->sig_attrs = sig_attrs; return 
mr; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 632e133e7a59..995b217a1940 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1739,7 +1739,7 @@ struct ib_mr { }; struct ib_dm *dm; - + struct ib_sig_attrs *sig_attrs; /* only for IB_MR_TYPE_INTEGRITY MRs */ /* * Implementation details of the RDMA core, don't use in drivers: */ diff --git a/include/rdma/signature.h b/include/rdma/signature.h index 5998fe94dfd4..f24cc2a1d3c5 100644 --- a/include/rdma/signature.h +++ b/include/rdma/signature.h @@ -80,11 +80,13 @@ struct ib_sig_domain { * @check_mask: bitmask for signature byte check (8 bytes) * @mem: memory domain layout descriptor. * @wire: wire domain layout descriptor. + * @meta_length: metadata length */ struct ib_sig_attrs { u8 check_mask; struct ib_sig_domain mem; struct ib_sig_domain wire; + int meta_length; }; enum ib_sig_err_type { From 6c984472bad12da18b88e9f4345f4970bbec0b3e Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 11 Jun 2019 18:52:42 +0300 Subject: [PATCH 085/194] RDMA/mlx5: Implement mlx5_ib_map_mr_sg_pi and mlx5_ib_alloc_mr_integrity mlx5_ib_map_mr_sg_pi() will map the PI and data dma mapped SG lists to the mlx5 memory region prior to the registration operation. In the new API, the mlx5 driver will allocate an internal memory region for the UMR operation to register both PI and data SG lists. The internal MR will use KLM mode in order to map 2 (possibly non-contiguous/non-align) SG lists using 1 memory key. In the new API, each ULP will use 1 memory region for the signature operation (instead of 3 in the old API). This memory region will have a key that will be exposed to remote server to perform RDMA operation. The internal memory key that will map the SG lists will stay private. 
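Schematically (not literal driver code), the resulting layout can be pictured as:

	/*
	 * exposed MR, IB_MR_TYPE_INTEGRITY   (its key is handed to the ULP and
	 *  |                                  may be exposed to the remote side)
	 *  +-- signature context (mr->sig, BSF enabled)
	 *  +-- internal pi_mr in KLM access mode (its key stays private)
	 *        KLM entries --> data SG list
	 *        KLM entries --> protection information SG list
	 */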
Signed-off-by: Max Gurtovoy Signed-off-by: Israel Rukshin Reviewed-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 2 + drivers/infiniband/hw/mlx5/mlx5_ib.h | 11 ++ drivers/infiniband/hw/mlx5/mr.c | 187 +++++++++++++++++++++++++-- 3 files changed, 189 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 3b1985215cb9..23fedff9f080 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -6146,6 +6146,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .add_gid = mlx5_ib_add_gid, .alloc_mr = mlx5_ib_alloc_mr, + .alloc_mr_integrity = mlx5_ib_alloc_mr_integrity, .alloc_pd = mlx5_ib_alloc_pd, .alloc_ucontext = mlx5_ib_alloc_ucontext, .attach_mcast = mlx5_ib_mcg_attach, @@ -6175,6 +6176,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .get_dma_mr = mlx5_ib_get_dma_mr, .get_link_layer = mlx5_ib_port_link_layer, .map_mr_sg = mlx5_ib_map_mr_sg, + .map_mr_sg_pi = mlx5_ib_map_mr_sg_pi, .mmap = mlx5_ib_mmap, .modify_cq = mlx5_ib_modify_cq, .modify_device = mlx5_ib_modify_device, diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 82cfe86087b6..7980814f355d 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -587,6 +587,9 @@ struct mlx5_ib_mr { void *descs; dma_addr_t desc_map; int ndescs; + int data_length; + int meta_ndescs; + int meta_length; int max_descs; int desc_size; int access_mode; @@ -605,6 +608,7 @@ struct mlx5_ib_mr { int access_flags; /* Needed for rereg MR */ struct mlx5_ib_mr *parent; + struct mlx5_ib_mr *pi_mr; /* Needed for IB_MR_TYPE_INTEGRITY */ atomic_t num_leaf_free; wait_queue_head_t q_leaf_free; struct mlx5_async_work cb_work; @@ -1148,8 +1152,15 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg, struct ib_udata *udata); +struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd, + u32 max_num_sg, + u32 max_num_meta_sg); int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); +int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, + int data_sg_nents, unsigned int *data_sg_offset, + struct scatterlist *meta_sg, int meta_sg_nents, + unsigned int *meta_sg_offset); int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const struct ib_mad_hdr *in, size_t in_mad_size, diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 994abcebb057..af8ae1e76fd4 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1639,16 +1639,22 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) { - dereg_mr(to_mdev(ibmr->device), to_mmr(ibmr)); + struct mlx5_ib_mr *mmr = to_mmr(ibmr); + + if (ibmr->type == IB_MR_TYPE_INTEGRITY) + dereg_mr(to_mdev(mmr->pi_mr->ibmr.device), mmr->pi_mr); + + dereg_mr(to_mdev(ibmr->device), mmr); + return 0; } -struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, - u32 max_num_sg, struct ib_udata *udata) +static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd, + u32 max_num_sg, u32 max_num_meta_sg) { struct mlx5_ib_dev *dev = to_mdev(pd->device); 
int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); - int ndescs = ALIGN(max_num_sg, 4); + int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4); struct mlx5_ib_mr *mr; void *mkc; u32 *in; @@ -1670,8 +1676,72 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, MLX5_SET(mkc, mkc, qpn, 0xffffff); MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS; + + err = mlx5_alloc_priv_descs(pd->device, mr, + ndescs, sizeof(struct mlx5_klm)); + if (err) + goto err_free_in; + mr->desc_size = sizeof(struct mlx5_klm); + mr->max_descs = ndescs; + + MLX5_SET(mkc, mkc, access_mode_1_0, mr->access_mode & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, (mr->access_mode >> 2) & 0x7); + MLX5_SET(mkc, mkc, umr_en, 1); + + mr->ibmr.pd = pd; + mr->ibmr.device = pd->device; + err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen); + if (err) + goto err_priv_descs; + + mr->mmkey.type = MLX5_MKEY_MR; + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; + mr->umem = NULL; + kfree(in); + + return mr; + +err_priv_descs: + mlx5_free_priv_descs(mr); +err_free_in: + kfree(in); +err_free: + kfree(mr); + return ERR_PTR(err); +} + +static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, u32 max_num_sg, + u32 max_num_meta_sg) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + int ndescs = ALIGN(max_num_sg, 4); + struct mlx5_ib_mr *mr; + void *mkc; + u32 *in; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + if (mr_type == IB_MR_TYPE_MEM_REG) { mr->access_mode = MLX5_MKC_ACCESS_MODE_MTT; + MLX5_SET(mkc, mkc, translations_octword_size, ndescs); MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, sizeof(struct mlx5_mtt)); @@ -1682,6 +1752,7 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, mr->max_descs = ndescs; } else if (mr_type == IB_MR_TYPE_SG_GAPS) { mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS; + MLX5_SET(mkc, mkc, translations_octword_size, ndescs); err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, sizeof(struct mlx5_klm)); @@ -1689,11 +1760,13 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, goto err_free_in; mr->desc_size = sizeof(struct mlx5_klm); mr->max_descs = ndescs; - } else if (mr_type == IB_MR_TYPE_SIGNATURE) { + } else if (mr_type == IB_MR_TYPE_SIGNATURE || + mr_type == IB_MR_TYPE_INTEGRITY) { u32 psv_index[2]; MLX5_SET(mkc, mkc, bsf_en, 1); MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE); + MLX5_SET(mkc, mkc, translations_octword_size, 4); mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); if (!mr->sig) { err = -ENOMEM; @@ -1714,6 +1787,14 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, mr->sig->sig_err_exists = false; /* Next UMR, Arm SIGERR */ ++mr->sig->sigerr_count; + if (mr_type == IB_MR_TYPE_INTEGRITY) { + mr->pi_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, + max_num_meta_sg); + if (IS_ERR(mr->pi_mr)) { + err = PTR_ERR(mr->pi_mr); + goto err_destroy_psv; + } + } } else { mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type); err = -EINVAL; @@ -1727,7 +1808,7 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum 
ib_mr_type mr_type, mr->ibmr.device = pd->device; err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen); if (err) - goto err_destroy_psv; + goto err_free_pi_mr; mr->mmkey.type = MLX5_MKEY_MR; mr->ibmr.lkey = mr->mmkey.key; @@ -1737,6 +1818,11 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, return &mr->ibmr; +err_free_pi_mr: + if (mr->pi_mr) { + dereg_mr(to_mdev(mr->pi_mr->ibmr.device), mr->pi_mr); + mr->pi_mr = NULL; + } err_destroy_psv: if (mr->sig) { if (mlx5_core_destroy_psv(dev->mdev, @@ -1758,6 +1844,19 @@ err_free: return ERR_PTR(err); } +struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, + u32 max_num_sg, struct ib_udata *udata) +{ + return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0); +} + +struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd, + u32 max_num_sg, u32 max_num_meta_sg) +{ + return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg, + max_num_meta_sg); +} + struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, struct ib_udata *udata) { @@ -1890,13 +1989,16 @@ static int mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr, struct scatterlist *sgl, unsigned short sg_nents, - unsigned int *sg_offset_p) + unsigned int *sg_offset_p, + struct scatterlist *meta_sgl, + unsigned short meta_sg_nents, + unsigned int *meta_sg_offset_p) { struct scatterlist *sg = sgl; struct mlx5_klm *klms = mr->descs; unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; u32 lkey = mr->ibmr.pd->local_dma_lkey; - int i; + int i, j = 0; mr->ibmr.iova = sg_dma_address(sg) + sg_offset; mr->ibmr.length = 0; @@ -1911,12 +2013,36 @@ mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr, sg_offset = 0; } - mr->ndescs = i; if (sg_offset_p) *sg_offset_p = sg_offset; - return i; + mr->ndescs = i; + mr->data_length = mr->ibmr.length; + + if (meta_sg_nents) { + sg = meta_sgl; + sg_offset = meta_sg_offset_p ? 
*meta_sg_offset_p : 0; + for_each_sg(meta_sgl, sg, meta_sg_nents, j) { + if (unlikely(i + j >= mr->max_descs)) + break; + klms[i + j].va = cpu_to_be64(sg_dma_address(sg) + + sg_offset); + klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) - + sg_offset); + klms[i + j].key = cpu_to_be32(lkey); + mr->ibmr.length += sg_dma_len(sg) - sg_offset; + + sg_offset = 0; + } + if (meta_sg_offset_p) + *meta_sg_offset_p = sg_offset; + + mr->meta_ndescs = j; + mr->meta_length = mr->ibmr.length - mr->data_length; + } + + return i + j; } static int mlx5_set_page(struct ib_mr *ibmr, u64 addr) @@ -1933,6 +2059,44 @@ static int mlx5_set_page(struct ib_mr *ibmr, u64 addr) return 0; } +int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, + int data_sg_nents, unsigned int *data_sg_offset, + struct scatterlist *meta_sg, int meta_sg_nents, + unsigned int *meta_sg_offset) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + struct mlx5_ib_mr *pi_mr = mr->pi_mr; + int n; + + WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY); + + pi_mr->ndescs = 0; + pi_mr->meta_ndescs = 0; + pi_mr->meta_length = 0; + + ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map, + pi_mr->desc_size * pi_mr->max_descs, + DMA_TO_DEVICE); + + n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset, + meta_sg, meta_sg_nents, meta_sg_offset); + + /* This is zero-based memory region */ + pi_mr->ibmr.iova = 0; + ibmr->length = pi_mr->ibmr.length; + ibmr->iova = pi_mr->ibmr.iova; + ibmr->sig_attrs->meta_length = pi_mr->meta_length; + + ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map, + pi_mr->desc_size * pi_mr->max_descs, + DMA_TO_DEVICE); + + if (unlikely(n != data_sg_nents + meta_sg_nents)) + return -ENOMEM; + + return 0; +} + int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset) { @@ -1946,7 +2110,8 @@ int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, DMA_TO_DEVICE); if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS) - n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset); + n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0, + NULL); else n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, mlx5_set_page); From 62e3c379d4d713dbcb70adc7c65443fd8722b33f Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 11 Jun 2019 18:52:43 +0300 Subject: [PATCH 086/194] RDMA/mlx5: Add attr for max number page list length for PI operation PI offload (protection information) is a feature that each RDMA provider can implement differently. Thus, introduce new device attribute to define the maximal length of the page list for PI fast registration operation. For example, mlx5 driver uses a single internal MR to map both data and protection SGL's, so it's equal to max_fast_reg_page_list_len / 2. 
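As an illustrative sketch (placeholder names, not code from this patch), a consumer of the new attribute would cap its per-command SG table from max_pi_fast_reg_page_list_len when protection is enabled and fall back to max_fast_reg_page_list_len otherwise, which is essentially what the iSER conversion later in this series does:

    /* Sketch: pick the fast-reg page list limit according to PI support */
    struct ib_device_attr *attr = &ib_dev->attrs;
    u32 max_num_sg;

    if (pi_enabled)
            max_num_sg = attr->max_pi_fast_reg_page_list_len;
    else
            max_num_sg = attr->max_fast_reg_page_list_len;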
Signed-off-by: Max Gurtovoy Reviewed-by: Leon Romanovsky Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 2 ++ include/rdma/ib_verbs.h | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 23fedff9f080..bd0322b61362 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1009,6 +1009,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->max_srq_sge = max_rq_sg - 1; props->max_fast_reg_page_list_len = 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size); + props->max_pi_fast_reg_page_list_len = + props->max_fast_reg_page_list_len / 2; get_atomic_caps_qp(dev, props); props->masked_atomic_cap = IB_ATOMIC_NONE; props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 995b217a1940..9169e798334f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -390,6 +390,7 @@ struct ib_device_attr { int max_srq_wr; int max_srq_sge; unsigned int max_fast_reg_page_list_len; + unsigned int max_pi_fast_reg_page_list_len; u16 max_pkeys; u8 local_ca_ack_delay; int sig_prot_cap; From 9ac7c4bcd383adbac9ad02ec033cce9a3a261d28 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 11 Jun 2019 18:52:44 +0300 Subject: [PATCH 087/194] RDMA/mlx5: Pass UMR segment flags instead of boolean UMR ctrl segment flags can vary between UMR operations. for example, using inline UMR or adding free/not-free checks for a memory key. This is a preparation commit before adding new signature API that will not need not-free checks for the internal memory key during the UMR operation. Signed-off-by: Max Gurtovoy Reviewed-by: Leon Romanovsky Reviewed-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 12ccbd584d2a..6cb94bf507b4 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4167,15 +4167,13 @@ static __be64 sig_mkey_mask(void) } static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, - struct mlx5_ib_mr *mr, bool umr_inline) + struct mlx5_ib_mr *mr, u8 flags) { int size = mr->ndescs * mr->desc_size; memset(umr, 0, sizeof(*umr)); - umr->flags = MLX5_UMR_CHECK_NOT_FREE; - if (umr_inline) - umr->flags |= MLX5_UMR_INLINE; + umr->flags = flags; umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size)); umr->mkey_mask = frwr_mkey_mask(); } @@ -4756,12 +4754,14 @@ static int set_psv_wr(struct ib_sig_domain *domain, static int set_reg_wr(struct mlx5_ib_qp *qp, const struct ib_reg_wr *wr, - void **seg, int *size, void **cur_edge) + void **seg, int *size, void **cur_edge, + bool check_not_free) { struct mlx5_ib_mr *mr = to_mmr(wr->mr); struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd); size_t mr_list_size = mr->ndescs * mr->desc_size; bool umr_inline = mr_list_size <= MLX5_IB_SQ_UMR_INLINE_THRESHOLD; + u8 flags = 0; if (unlikely(wr->wr.send_flags & IB_SEND_INLINE)) { mlx5_ib_warn(to_mdev(qp->ibqp.device), @@ -4769,7 +4769,12 @@ static int set_reg_wr(struct mlx5_ib_qp *qp, return -EINVAL; } - set_reg_umr_seg(*seg, mr, umr_inline); + if (check_not_free) + flags |= MLX5_UMR_CHECK_NOT_FREE; + if (umr_inline) + flags |= MLX5_UMR_INLINE; + + set_reg_umr_seg(*seg, mr, flags); *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; 
handle_post_send_edge(&qp->sq, seg, *size, cur_edge); @@ -5000,7 +5005,7 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, qp->sq.wr_data[idx] = IB_WR_REG_MR; ctrl->imm = cpu_to_be32(reg_wr(wr)->key); err = set_reg_wr(qp, reg_wr(wr), &seg, &size, - &cur_edge); + &cur_edge, true); if (err) { *bad_wr = wr; goto out; From 22465bba39c001ddac735531074d3d4d926d9088 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 11 Jun 2019 18:52:45 +0300 Subject: [PATCH 088/194] RDMA/mlx5: Update set_sig_data_segment attribute for new signature API Explicitly pass the sig_mr and the access flags for the mkey segment configuration. This function will be used also in the new signature API, so modify it in order to use it in both APIs. This is a preparation commit before adding new signature API. Signed-off-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 6cb94bf507b4..ce8fccb04c3c 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4645,17 +4645,15 @@ static int set_sig_data_segment(const struct ib_sig_handover_wr *wr, } static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg, - const struct ib_sig_handover_wr *wr, u32 size, - u32 length, u32 pdn) + struct ib_mr *sig_mr, int access_flags, + u32 size, u32 length, u32 pdn) { - struct ib_mr *sig_mr = wr->sig_mr; u32 sig_key = sig_mr->rkey; u8 sigerr = to_mmr(sig_mr)->sig->sigerr_count & 1; memset(seg, 0, sizeof(*seg)); - seg->flags = get_umr_flags(wr->access_flags) | - MLX5_MKC_ACCESS_MODE_KLMS; + seg->flags = get_umr_flags(access_flags) | MLX5_MKC_ACCESS_MODE_KLMS; seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 0xff) | 0xffffff00); seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 | MLX5_MKEY_BSF_EN | pdn); @@ -4712,7 +4710,8 @@ static int set_sig_umr_wr(const struct ib_send_wr *send_wr, *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; handle_post_send_edge(&qp->sq, seg, *size, cur_edge); - set_sig_mkey_segment(*seg, wr, xlt_size, region_len, pdn); + set_sig_mkey_segment(*seg, wr->sig_mr, wr->access_flags, xlt_size, + region_len, pdn); *seg += sizeof(struct mlx5_mkey_seg); *size += sizeof(struct mlx5_mkey_seg) / 16; handle_post_send_edge(&qp->sq, seg, *size, cur_edge); From 38ca87c6f1e514686d4a385246d1afe1e1f2e482 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 11 Jun 2019 18:52:46 +0300 Subject: [PATCH 089/194] RDMA/mlx5: Introduce and implement new IB_WR_REG_MR_INTEGRITY work request This new WR will be used to perform PI (protection information) handover using the new API. Using the new API, the user will post a single WR that will internally perform all the needed actions to complete PI operation. This new WR will use a memory region that was allocated as IB_MR_TYPE_INTEGRITY and was mapped using ib_map_mr_sg_pi to perform the registration. In the old API, in order to perform a signature handover operation, each ULP should perform the following: 1. Map and register the data buffers. 2. Map and register the protection buffers. 3. Post a special reg WR to configure the signature handover operation layout. 4. Invalidate the signature memory key. 5. Invalidate protection buffers memory key. 6. Invalidate data buffers memory key. In the new API, the mapping of both data and protection buffers is performed using a single call to ib_map_mr_sg_pi function. 
Also the registration of the buffers and the configuration of the signature operation layout is done by a single new work request called IB_WR_REG_MR_INTEGRITY. This patch implements this operation for mlx5 devices that are capable to offload data integrity generation/validation while performing the actual buffer transfer. This patch will not remove the old signature API that is used by the iSER initiator and target drivers. This will be done in the future. In the internal implementation, for each IB_WR_REG_MR_INTEGRITY work request, we are using a single UMR operation to register both data and protection buffers using KLM's. Afterwards, another UMR operation will describe the strided block format. These will be followed by 2 SET_PSV operations to set the memory/wire domains initial signature parameters passed by the user. In the end of the whole transaction, only the signature memory key (the one that exposed for the RDMA operation) will be invalidated. Signed-off-by: Max Gurtovoy Signed-off-by: Israel Rukshin Reviewed-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 218 +++++++++++++++++++++++++++++--- include/linux/mlx5/qp.h | 3 +- include/rdma/ib_verbs.h | 1 + 3 files changed, 201 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index ce8fccb04c3c..f6651b93e469 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4169,7 +4169,7 @@ static __be64 sig_mkey_mask(void) static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, struct mlx5_ib_mr *mr, u8 flags) { - int size = mr->ndescs * mr->desc_size; + int size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size; memset(umr, 0, sizeof(*umr)); @@ -4300,7 +4300,7 @@ static void set_reg_mkey_seg(struct mlx5_mkey_seg *seg, struct mlx5_ib_mr *mr, u32 key, int access) { - int ndescs = ALIGN(mr->ndescs, 8) >> 1; + int ndescs = ALIGN(mr->ndescs + mr->meta_ndescs, 8) >> 1; memset(seg, 0, sizeof(*seg)); @@ -4351,7 +4351,7 @@ static void set_reg_data_seg(struct mlx5_wqe_data_seg *dseg, struct mlx5_ib_mr *mr, struct mlx5_ib_pd *pd) { - int bcount = mr->desc_size * mr->ndescs; + int bcount = mr->desc_size * (mr->ndescs + mr->meta_ndescs); dseg->addr = cpu_to_be64(mr->desc_map); dseg->byte_count = cpu_to_be32(ALIGN(bcount, 64)); @@ -4544,23 +4544,52 @@ static int mlx5_set_bsf(struct ib_mr *sig_mr, return 0; } -static int set_sig_data_segment(const struct ib_sig_handover_wr *wr, - struct mlx5_ib_qp *qp, void **seg, - int *size, void **cur_edge) +static int set_sig_data_segment(const struct ib_send_wr *send_wr, + struct ib_mr *sig_mr, + struct ib_sig_attrs *sig_attrs, + struct mlx5_ib_qp *qp, void **seg, int *size, + void **cur_edge) { - struct ib_sig_attrs *sig_attrs = wr->sig_attrs; - struct ib_mr *sig_mr = wr->sig_mr; struct mlx5_bsf *bsf; - u32 data_len = wr->wr.sg_list->length; - u32 data_key = wr->wr.sg_list->lkey; - u64 data_va = wr->wr.sg_list->addr; + u32 data_len; + u32 data_key; + u64 data_va; + u32 prot_len = 0; + u32 prot_key = 0; + u64 prot_va = 0; + bool prot = false; int ret; int wqe_size; - if (!wr->prot || - (data_key == wr->prot->lkey && - data_va == wr->prot->addr && - data_len == wr->prot->length)) { + if (send_wr->opcode == IB_WR_REG_SIG_MR) { + const struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr); + + data_len = wr->wr.sg_list->length; + data_key = wr->wr.sg_list->lkey; + data_va = wr->wr.sg_list->addr; + if (wr->prot) { + prot_len = wr->prot->length; + prot_key = wr->prot->lkey; + prot_va = 
wr->prot->addr; + prot = true; + } + } else { + struct mlx5_ib_mr *mr = to_mmr(sig_mr); + struct mlx5_ib_mr *pi_mr = mr->pi_mr; + + data_len = pi_mr->data_length; + data_key = pi_mr->ibmr.lkey; + data_va = pi_mr->ibmr.iova; + if (pi_mr->meta_ndescs) { + prot_len = pi_mr->meta_length; + prot_key = pi_mr->ibmr.lkey; + prot_va = pi_mr->ibmr.iova + data_len; + prot = true; + } + } + + if (!prot || (data_key == prot_key && data_va == prot_va && + data_len == prot_len)) { /** * Source domain doesn't contain signature information * or data and protection are interleaved in memory. @@ -4594,8 +4623,6 @@ static int set_sig_data_segment(const struct ib_sig_handover_wr *wr, struct mlx5_stride_block_ctrl_seg *sblock_ctrl; struct mlx5_stride_block_entry *data_sentry; struct mlx5_stride_block_entry *prot_sentry; - u32 prot_key = wr->prot->lkey; - u64 prot_va = wr->prot->addr; u16 block_size = sig_attrs->mem.sig.dif.pi_interval; int prot_size; @@ -4673,6 +4700,56 @@ static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, umr->mkey_mask = sig_mkey_mask(); } +static int set_pi_umr_wr(const struct ib_send_wr *send_wr, + struct mlx5_ib_qp *qp, void **seg, int *size, + void **cur_edge) +{ + const struct ib_reg_wr *wr = reg_wr(send_wr); + struct mlx5_ib_mr *sig_mr = to_mmr(wr->mr); + struct mlx5_ib_mr *pi_mr = sig_mr->pi_mr; + struct ib_sig_attrs *sig_attrs = sig_mr->ibmr.sig_attrs; + u32 pdn = get_pd(qp)->pdn; + u32 xlt_size; + int region_len, ret; + + if (unlikely(send_wr->num_sge != 0) || + unlikely(wr->access & IB_ACCESS_REMOTE_ATOMIC) || + unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) || + unlikely(!sig_mr->sig->sig_status_checked)) + return -EINVAL; + + /* length of the protected region, data + protection */ + region_len = pi_mr->ibmr.length; + + /** + * KLM octoword size - if protection was provided + * then we use strided block format (3 octowords), + * else we use single KLM (1 octoword) + **/ + if (sig_attrs->mem.sig_type != IB_SIG_TYPE_NONE) + xlt_size = 0x30; + else + xlt_size = sizeof(struct mlx5_klm); + + set_sig_umr_segment(*seg, xlt_size); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + + set_sig_mkey_segment(*seg, wr->mr, wr->access, xlt_size, region_len, + pdn); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + + ret = set_sig_data_segment(send_wr, wr->mr, sig_attrs, qp, seg, size, + cur_edge); + if (ret) + return ret; + + sig_mr->sig->sig_status_checked = false; + return 0; +} static int set_sig_umr_wr(const struct ib_send_wr *send_wr, struct mlx5_ib_qp *qp, void **seg, int *size, @@ -4716,7 +4793,8 @@ static int set_sig_umr_wr(const struct ib_send_wr *send_wr, *size += sizeof(struct mlx5_mkey_seg) / 16; handle_post_send_edge(&qp->sq, seg, *size, cur_edge); - ret = set_sig_data_segment(wr, qp, seg, size, cur_edge); + ret = set_sig_data_segment(send_wr, wr->sig_mr, wr->sig_attrs, qp, seg, + size, cur_edge); if (ret) return ret; @@ -4758,7 +4836,7 @@ static int set_reg_wr(struct mlx5_ib_qp *qp, { struct mlx5_ib_mr *mr = to_mmr(wr->mr); struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd); - size_t mr_list_size = mr->ndescs * mr->desc_size; + int mr_list_size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size; bool umr_inline = mr_list_size <= MLX5_IB_SQ_UMR_INLINE_THRESHOLD; u8 flags = 0; @@ -4899,8 +4977,11 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr 
*wr, struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */ struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_core_dev *mdev = dev->mdev; + struct ib_reg_wr reg_pi_wr; struct mlx5_ib_qp *qp; struct mlx5_ib_mr *mr; + struct mlx5_ib_mr *pi_mr; + struct ib_sig_attrs *sig_attrs; struct mlx5_wqe_xrc_seg *xrc; struct mlx5_bf *bf; void *cur_edge; @@ -4954,7 +5035,8 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, goto out; } - if (wr->opcode == IB_WR_REG_MR) { + if (wr->opcode == IB_WR_REG_MR || + wr->opcode == IB_WR_REG_MR_INTEGRITY) { fence = dev->umr_fence; next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; } else { @@ -5012,6 +5094,102 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, num_sge = 0; break; + case IB_WR_REG_MR_INTEGRITY: + memset(®_pi_wr, 0, sizeof(struct ib_reg_wr)); + + mr = to_mmr(reg_wr(wr)->mr); + pi_mr = mr->pi_mr; + + reg_pi_wr.mr = &pi_mr->ibmr; + reg_pi_wr.access = reg_wr(wr)->access; + reg_pi_wr.key = pi_mr->ibmr.rkey; + + qp->sq.wr_data[idx] = IB_WR_REG_MR_INTEGRITY; + ctrl->imm = cpu_to_be32(reg_pi_wr.key); + /* UMR for data + protection registration */ + err = set_reg_wr(qp, ®_pi_wr, &seg, &size, + &cur_edge, false); + if (err) { + *bad_wr = wr; + goto out; + } + finish_wqe(qp, ctrl, seg, size, cur_edge, idx, + wr->wr_id, nreq, fence, + MLX5_OPCODE_UMR); + + err = begin_wqe(qp, &seg, &ctrl, wr, &idx, + &size, &cur_edge, nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + ctrl->imm = cpu_to_be32(mr->ibmr.rkey); + /* UMR for sig MR */ + err = set_pi_umr_wr(wr, qp, &seg, &size, + &cur_edge); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + finish_wqe(qp, ctrl, seg, size, cur_edge, idx, + wr->wr_id, nreq, fence, + MLX5_OPCODE_UMR); + + /* + * SET_PSV WQEs are not signaled and solicited + * on error + */ + sig_attrs = mr->ibmr.sig_attrs; + err = __begin_wqe(qp, &seg, &ctrl, wr, &idx, + &size, &cur_edge, nreq, false, + true); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + err = set_psv_wr(&sig_attrs->mem, + mr->sig->psv_memory.psv_idx, + &seg, &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + finish_wqe(qp, ctrl, seg, size, cur_edge, idx, + wr->wr_id, nreq, next_fence, + MLX5_OPCODE_SET_PSV); + + err = __begin_wqe(qp, &seg, &ctrl, wr, &idx, + &size, &cur_edge, nreq, false, + true); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + err = set_psv_wr(&sig_attrs->wire, + mr->sig->psv_wire.psv_idx, + &seg, &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + finish_wqe(qp, ctrl, seg, size, cur_edge, idx, + wr->wr_id, nreq, next_fence, + MLX5_OPCODE_SET_PSV); + + qp->next_fence = + MLX5_FENCE_MODE_INITIATOR_SMALL; + num_sge = 0; + goto skip_psv; + case IB_WR_REG_SIG_MR: qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR; mr = to_mmr(sig_handover_wr(wr)->sig_mr); diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 3ba4edbd17a6..08e43cd9e742 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -37,7 +37,8 @@ #include #define MLX5_INVALID_LKEY 0x100 -#define MLX5_SIG_WQE_SIZE (MLX5_SEND_WQE_BB * 5) +/* UMR (3 WQE_BB's) + SIG (3 WQE_BB's) + PSV (mem) + PSV (wire) */ +#define MLX5_SIG_WQE_SIZE (MLX5_SEND_WQE_BB * 8) #define MLX5_DIF_SIZE 8 #define MLX5_STRIDE_BLOCK_OP 0x400 #define MLX5_CPY_GRD_MASK 0xc0 diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 
9169e798334f..28db256cbdb9 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1236,6 +1236,7 @@ enum ib_wr_opcode {
 	/* These are kernel only and can not be issued by userspace */
 	IB_WR_REG_MR = 0x20,
 	IB_WR_REG_SIG_MR,
+	IB_WR_REG_MR_INTEGRITY,
 
 	/* reserve values for low level drivers' internal use.
 	 * These values will not be used at all in the ib core layer.

From b76a439982f8483beb2ffcfe93be50026940030a Mon Sep 17 00:00:00 2001
From: Israel Rukshin
Date: Tue, 11 Jun 2019 18:52:47 +0300
Subject: [PATCH 090/194] IB/iser: Use IB_WR_REG_MR_INTEGRITY for PI handover

Using this new API reduces iSER code complexity. It also reduces the
maximum number of work requests per task and the need of dealing with
multiple MRs (and their registrations and invalidations) per task. It is
done by using a single WR and a special MR type (IB_MR_TYPE_INTEGRITY)
for PI operation.

The setup of the tested benchmark:
- 2 servers with 24 cores (1 initiator and 1 target)
- 24 target sessions with 1 LUN each
- ramdisk backstore
- PI active

Performance results running fio (24 jobs, 128 iodepth) using
write_generate=0 and read_verify=0 (w/w.o patch):

bs      IOPS(read)        IOPS(write)
----    ----------        ----------
512     1236.6K/1164.3K   1357.2K/1332.8K
1k      1196.5K/1163.8K   1348.4K/1262.7K
2k      1016.7K/921950    1003.7K/931230
4k      662728/600545     595423/501513
8k      385954/384345     333775/277090
16k     222864/222820     170317/170671
32k     116869/114896     82331/82244
64k     55205/54931       40264/40021

Using write_generate=1 and read_verify=1 (w/w.o patch):

bs      IOPS(read)        IOPS(write)
----    ----------        ----------
512     1090.1K/1030.9K   1303.9K/1101.4K
1k      1057.7K/904583    1318.4K/988085
2k      965226/638799     1008.6K/692514
4k      555479/410151     542414/414517
8k      298675/224964     264729/237508
16k     133485/122481     164625/138647
32k     74329/67615       80143/78743
64k     35716/35519       39294/37334

We get performance improvement at all block sizes. The most significant
improvement is when writing 4k bs (almost 30% more iops).
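The heart of the conversion below is that one ib_reg_wr carrying the new IB_WR_REG_MR_INTEGRITY opcode replaces the old sig-handover WR plus the separate data and protection registrations. A distilled sketch, modeled on iser_reg_sig_mr() in this patch (identifiers are placeholders, error paths trimmed), might read:

    /* Sketch: single-WR PI registration per task */
    struct ib_reg_wr wr = {};
    int ret;

    ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey));

    ret = ib_map_mr_sg_pi(mr, data_sg, data_nents, NULL,
                          prot_sg, prot_nents, NULL, SZ_4K);
    if (ret)
            return ret;

    wr.wr.opcode = IB_WR_REG_MR_INTEGRITY;
    wr.wr.wr_cqe = cqe;
    wr.mr = mr;
    wr.key = mr->rkey;
    wr.access = IB_ACCESS_LOCAL_WRITE |
                IB_ACCESS_REMOTE_READ |
                IB_ACCESS_REMOTE_WRITE;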
Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/iser/iscsi_iser.h | 38 ++--- drivers/infiniband/ulp/iser/iser_initiator.c | 12 +- drivers/infiniband/ulp/iser/iser_memory.c | 98 +++++-------- drivers/infiniband/ulp/iser/iser_verbs.c | 142 ++++++------------- 4 files changed, 96 insertions(+), 194 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 36d525110fd2..6bf9eaa8ec96 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -225,13 +225,11 @@ enum iser_desc_type { ISCSI_TX_DATAOUT }; -/* Maximum number of work requests per task: - * Data memory region local invalidate + fast registration - * Protection memory region local invalidate + fast registration - * Signature memory region local invalidate + fast registration - * PDU send +/* + * Maximum number of work requests per task + * (invalidate, registration, send) */ -#define ISER_MAX_WRS 7 +#define ISER_MAX_WRS 3 /** * struct iser_tx_desc - iSER TX descriptor @@ -247,9 +245,6 @@ enum iser_desc_type { * @mapped: Is the task header mapped * @wr_idx: Current WR index * @wrs: Array of WRs per task - * @data_reg: Data buffer registration details - * @prot_reg: Protection buffer registration details - * @sig_attrs: Signature attributes */ struct iser_tx_desc { struct iser_ctrl iser_header; @@ -264,11 +259,7 @@ struct iser_tx_desc { union iser_wr { struct ib_send_wr send; struct ib_reg_wr fast_reg; - struct ib_sig_handover_wr sig; } wrs[ISER_MAX_WRS]; - struct iser_mem_reg data_reg; - struct iser_mem_reg prot_reg; - struct ib_sig_attrs sig_attrs; }; #define ISER_RX_PAD_SIZE (256 - (ISER_RX_PAYLOAD_SIZE + \ @@ -388,6 +379,7 @@ struct iser_device { * * @mr: memory region * @fmr_pool: pool of fmrs + * @sig_mr: signature memory region * @page_vec: fast reg page list used by fmr pool * @mr_valid: is mr valid indicator */ @@ -396,36 +388,22 @@ struct iser_reg_resources { struct ib_mr *mr; struct ib_fmr_pool *fmr_pool; }; + struct ib_mr *sig_mr; struct iser_page_vec *page_vec; u8 mr_valid:1; }; -/** - * struct iser_pi_context - Protection information context - * - * @rsc: protection buffer registration resources - * @sig_mr: signature enable memory region - * @sig_mr_valid: is sig_mr valid indicator - * @sig_protected: is region protected indicator - */ -struct iser_pi_context { - struct iser_reg_resources rsc; - struct ib_mr *sig_mr; - u8 sig_mr_valid:1; - u8 sig_protected:1; -}; - /** * struct iser_fr_desc - Fast registration descriptor * * @list: entry in connection fastreg pool * @rsc: data buffer registration resources - * @pi_ctx: protection information context + * @sig_protected: is region protected indicator */ struct iser_fr_desc { struct list_head list; struct iser_reg_resources rsc; - struct iser_pi_context *pi_ctx; + bool sig_protected; struct list_head all_list; }; diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index 96af06cfe0af..5cbb4b3a0566 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -592,15 +592,14 @@ void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc) static inline int iser_inv_desc(struct iser_fr_desc *desc, u32 rkey) { - if (likely(rkey == desc->rsc.mr->rkey)) { - desc->rsc.mr_valid = 0; - } else if (likely(desc->pi_ctx && rkey == desc->pi_ctx->sig_mr->rkey)) { - 
desc->pi_ctx->sig_mr_valid = 0; - } else { + if (unlikely((!desc->sig_protected && rkey != desc->rsc.mr->rkey) || + (desc->sig_protected && rkey != desc->rsc.sig_mr->rkey))) { iser_err("Bogus remote invalidation for rkey %#x\n", rkey); return -EINVAL; } + desc->rsc.mr_valid = 0; + return 0; } @@ -750,6 +749,9 @@ void iser_task_rdma_init(struct iscsi_iser_task *iser_task) iser_task->prot[ISER_DIR_IN].data_len = 0; iser_task->prot[ISER_DIR_OUT].data_len = 0; + iser_task->prot[ISER_DIR_IN].dma_nents = 0; + iser_task->prot[ISER_DIR_OUT].dma_nents = 0; + memset(&iser_task->rdma_reg[ISER_DIR_IN], 0, sizeof(struct iser_mem_reg)); memset(&iser_task->rdma_reg[ISER_DIR_OUT], 0, diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index f431c9b4065c..d66e17c2a085 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -376,16 +376,16 @@ iser_inv_rkey(struct ib_send_wr *inv_wr, static int iser_reg_sig_mr(struct iscsi_iser_task *iser_task, - struct iser_pi_context *pi_ctx, - struct iser_mem_reg *data_reg, - struct iser_mem_reg *prot_reg, + struct iser_data_buf *mem, + struct iser_data_buf *sig_mem, + struct iser_reg_resources *rsc, struct iser_mem_reg *sig_reg) { struct iser_tx_desc *tx_desc = &iser_task->desc; - struct ib_sig_attrs *sig_attrs = &tx_desc->sig_attrs; struct ib_cqe *cqe = &iser_task->iser_conn->ib_conn.reg_cqe; - struct ib_sig_handover_wr *wr; - struct ib_mr *mr = pi_ctx->sig_mr; + struct ib_mr *mr = rsc->sig_mr; + struct ib_sig_attrs *sig_attrs = mr->sig_attrs; + struct ib_reg_wr *wr; int ret; memset(sig_attrs, 0, sizeof(*sig_attrs)); @@ -395,33 +395,36 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task, iser_set_prot_checks(iser_task->sc, &sig_attrs->check_mask); - if (pi_ctx->sig_mr_valid) + if (rsc->mr_valid) iser_inv_rkey(iser_tx_next_wr(tx_desc), mr, cqe); ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey)); - wr = container_of(iser_tx_next_wr(tx_desc), struct ib_sig_handover_wr, - wr); - wr->wr.opcode = IB_WR_REG_SIG_MR; + ret = ib_map_mr_sg_pi(mr, mem->sg, mem->dma_nents, NULL, + sig_mem->sg, sig_mem->dma_nents, NULL, SZ_4K); + if (unlikely(ret)) { + iser_err("failed to map PI sg (%d)\n", + mem->dma_nents + sig_mem->dma_nents); + goto err; + } + + wr = container_of(iser_tx_next_wr(tx_desc), struct ib_reg_wr, wr); + memset(wr, 0, sizeof(*wr)); + wr->wr.opcode = IB_WR_REG_MR_INTEGRITY; wr->wr.wr_cqe = cqe; - wr->wr.sg_list = &data_reg->sge; - wr->wr.num_sge = 1; + wr->wr.num_sge = 0; wr->wr.send_flags = 0; - wr->sig_attrs = sig_attrs; - wr->sig_mr = mr; - if (scsi_prot_sg_count(iser_task->sc)) - wr->prot = &prot_reg->sge; - else - wr->prot = NULL; - wr->access_flags = IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_WRITE; - pi_ctx->sig_mr_valid = 1; + wr->mr = mr; + wr->key = mr->rkey; + wr->access = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + rsc->mr_valid = 1; sig_reg->sge.lkey = mr->lkey; sig_reg->rkey = mr->rkey; - sig_reg->sge.addr = 0; - sig_reg->sge.length = scsi_transfer_length(iser_task->sc); + sig_reg->sge.addr = mr->iova; + sig_reg->sge.length = mr->length; iser_dbg("lkey=0x%x rkey=0x%x addr=0x%llx length=%u\n", sig_reg->sge.lkey, sig_reg->rkey, sig_reg->sge.addr, @@ -477,21 +480,6 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, return 0; } -static int -iser_reg_prot_sg(struct iscsi_iser_task *task, - struct iser_data_buf *mem, - struct iser_fr_desc *desc, - bool use_dma_key, - struct iser_mem_reg *reg) 
-{ - struct iser_device *device = task->iser_conn->ib_conn.device; - - if (use_dma_key) - return iser_reg_dma(device, mem, reg); - - return device->reg_ops->reg_mem(task, mem, &desc->pi_ctx->rsc, reg); -} - static int iser_reg_data_sg(struct iscsi_iser_task *task, struct iser_data_buf *mem, @@ -515,7 +503,6 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *task, struct iser_device *device = ib_conn->device; struct iser_data_buf *mem = &task->data[dir]; struct iser_mem_reg *reg = &task->rdma_reg[dir]; - struct iser_mem_reg *data_reg; struct iser_fr_desc *desc = NULL; bool use_dma_key; int err; @@ -528,32 +515,17 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *task, reg->mem_h = desc; } - if (scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL) - data_reg = reg; - else - data_reg = &task->desc.data_reg; - - err = iser_reg_data_sg(task, mem, desc, use_dma_key, data_reg); - if (unlikely(err)) - goto err_reg; - - if (scsi_get_prot_op(task->sc) != SCSI_PROT_NORMAL) { - struct iser_mem_reg *prot_reg = &task->desc.prot_reg; - - if (scsi_prot_sg_count(task->sc)) { - mem = &task->prot[dir]; - err = iser_reg_prot_sg(task, mem, desc, - use_dma_key, prot_reg); - if (unlikely(err)) - goto err_reg; - } - - err = iser_reg_sig_mr(task, desc->pi_ctx, data_reg, - prot_reg, reg); + if (scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL) { + err = iser_reg_data_sg(task, mem, desc, use_dma_key, reg); + if (unlikely(err)) + goto err_reg; + } else { + err = iser_reg_sig_mr(task, mem, &task->prot[dir], + &desc->rsc, reg); if (unlikely(err)) goto err_reg; - desc->pi_ctx->sig_protected = 1; + desc->sig_protected = 1; } return 0; diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 4ff3d98fa6a4..ffd6bbc819f7 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -233,85 +233,6 @@ void iser_free_fmr_pool(struct ib_conn *ib_conn) kfree(desc); } -static int -iser_alloc_reg_res(struct iser_device *device, - struct ib_pd *pd, - struct iser_reg_resources *res, - unsigned int size) -{ - struct ib_device *ib_dev = device->ib_device; - enum ib_mr_type mr_type; - int ret; - - if (ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG) - mr_type = IB_MR_TYPE_SG_GAPS; - else - mr_type = IB_MR_TYPE_MEM_REG; - - res->mr = ib_alloc_mr(pd, mr_type, size); - if (IS_ERR(res->mr)) { - ret = PTR_ERR(res->mr); - iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret); - return ret; - } - res->mr_valid = 0; - - return 0; -} - -static void -iser_free_reg_res(struct iser_reg_resources *rsc) -{ - ib_dereg_mr(rsc->mr); -} - -static int -iser_alloc_pi_ctx(struct iser_device *device, - struct ib_pd *pd, - struct iser_fr_desc *desc, - unsigned int size) -{ - struct iser_pi_context *pi_ctx = NULL; - int ret; - - desc->pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL); - if (!desc->pi_ctx) - return -ENOMEM; - - pi_ctx = desc->pi_ctx; - - ret = iser_alloc_reg_res(device, pd, &pi_ctx->rsc, size); - if (ret) { - iser_err("failed to allocate reg_resources\n"); - goto alloc_reg_res_err; - } - - pi_ctx->sig_mr = ib_alloc_mr(pd, IB_MR_TYPE_SIGNATURE, 2); - if (IS_ERR(pi_ctx->sig_mr)) { - ret = PTR_ERR(pi_ctx->sig_mr); - goto sig_mr_failure; - } - pi_ctx->sig_mr_valid = 0; - desc->pi_ctx->sig_protected = 0; - - return 0; - -sig_mr_failure: - iser_free_reg_res(&pi_ctx->rsc); -alloc_reg_res_err: - kfree(desc->pi_ctx); - - return ret; -} - -static void -iser_free_pi_ctx(struct iser_pi_context *pi_ctx) -{ - iser_free_reg_res(&pi_ctx->rsc); - 
ib_dereg_mr(pi_ctx->sig_mr); - kfree(pi_ctx); -} - static struct iser_fr_desc * iser_create_fastreg_desc(struct iser_device *device, struct ib_pd *pd, @@ -319,32 +240,58 @@ iser_create_fastreg_desc(struct iser_device *device, unsigned int size) { struct iser_fr_desc *desc; + struct ib_device *ib_dev = device->ib_device; + enum ib_mr_type mr_type; int ret; desc = kzalloc(sizeof(*desc), GFP_KERNEL); if (!desc) return ERR_PTR(-ENOMEM); - ret = iser_alloc_reg_res(device, pd, &desc->rsc, size); - if (ret) - goto reg_res_alloc_failure; + if (ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG) + mr_type = IB_MR_TYPE_SG_GAPS; + else + mr_type = IB_MR_TYPE_MEM_REG; + + desc->rsc.mr = ib_alloc_mr(pd, mr_type, size); + if (IS_ERR(desc->rsc.mr)) { + ret = PTR_ERR(desc->rsc.mr); + iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret); + goto err_alloc_mr; + } if (pi_enable) { - ret = iser_alloc_pi_ctx(device, pd, desc, size); - if (ret) - goto pi_ctx_alloc_failure; + desc->rsc.sig_mr = ib_alloc_mr_integrity(pd, size, size); + if (IS_ERR(desc->rsc.sig_mr)) { + ret = PTR_ERR(desc->rsc.sig_mr); + iser_err("Failed to allocate sig_mr err=%d\n", ret); + goto err_alloc_mr_integrity; + } } + desc->rsc.mr_valid = 0; return desc; -pi_ctx_alloc_failure: - iser_free_reg_res(&desc->rsc); -reg_res_alloc_failure: +err_alloc_mr_integrity: + ib_dereg_mr(desc->rsc.mr); +err_alloc_mr: kfree(desc); return ERR_PTR(ret); } +static void iser_destroy_fastreg_desc(struct iser_fr_desc *desc) +{ + struct iser_reg_resources *res = &desc->rsc; + + ib_dereg_mr(res->mr); + if (res->sig_mr) { + ib_dereg_mr(res->sig_mr); + res->sig_mr = NULL; + } + kfree(desc); +} + /** * iser_alloc_fastreg_pool - Creates pool of fast_reg descriptors * for fast registration work requests. @@ -399,10 +346,7 @@ void iser_free_fastreg_pool(struct ib_conn *ib_conn) list_for_each_entry_safe(desc, tmp, &fr_pool->all_list, all_list) { list_del(&desc->all_list); - iser_free_reg_res(&desc->rsc); - if (desc->pi_ctx) - iser_free_pi_ctx(desc->pi_ctx); - kfree(desc); + iser_destroy_fastreg_desc(desc); ++i; } @@ -707,6 +651,7 @@ iser_calc_scsi_params(struct iser_conn *iser_conn, struct ib_device_attr *attr = &device->ib_device->attrs; unsigned short sg_tablesize, sup_sg_tablesize; unsigned short reserved_mr_pages; + u32 max_num_sg; /* * FRs without SG_GAPS or FMRs can only map up to a (device) page per @@ -720,12 +665,17 @@ iser_calc_scsi_params(struct iser_conn *iser_conn, else reserved_mr_pages = 1; + if (iser_conn->ib_conn.pi_support) + max_num_sg = attr->max_pi_fast_reg_page_list_len; + else + max_num_sg = attr->max_fast_reg_page_list_len; + sg_tablesize = DIV_ROUND_UP(max_sectors * 512, SIZE_4K); if (attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) sup_sg_tablesize = min_t( uint, ISCSI_ISER_MAX_SG_TABLESIZE, - attr->max_fast_reg_page_list_len - reserved_mr_pages); + max_num_sg - reserved_mr_pages); else sup_sg_tablesize = ISCSI_ISER_MAX_SG_TABLESIZE; @@ -1118,9 +1068,9 @@ u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, struct ib_mr_status mr_status; int ret; - if (desc && desc->pi_ctx->sig_protected) { - desc->pi_ctx->sig_protected = 0; - ret = ib_check_mr_status(desc->pi_ctx->sig_mr, + if (desc && desc->sig_protected) { + desc->sig_protected = 0; + ret = ib_check_mr_status(desc->rsc.sig_mr, IB_MR_CHECK_SIG_STATUS, &mr_status); if (ret) { pr_err("ib_check_mr_status failed, ret %d\n", ret); From b9294f8b7c4bfdad35f4eb8330974892aedd1aaf Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 11 Jun 2019 18:52:48 +0300 Subject: 
[PATCH 091/194] IB/iser: Unwind WR union at iser_tx_desc After decreasing WRs array size from 7 to 3 it is more readable to give each WR a descriptive name. Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/iser/iscsi_iser.c | 3 +- drivers/infiniband/ulp/iser/iscsi_iser.h | 34 ++++------------------- drivers/infiniband/ulp/iser/iser_memory.c | 16 ++++++----- drivers/infiniband/ulp/iser/iser_verbs.c | 12 ++++++-- 4 files changed, 27 insertions(+), 38 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index dbad8275b3bc..c7a3d75fb308 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -205,7 +205,8 @@ iser_initialize_task_headers(struct iscsi_task *task, goto out; } - tx_desc->wr_idx = 0; + tx_desc->inv_wr.next = NULL; + tx_desc->reg_wr.wr.next = NULL; tx_desc->mapped = true; tx_desc->dma_addr = dma_addr; tx_desc->tx_sg[0].addr = tx_desc->dma_addr; diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 6bf9eaa8ec96..39bf213444cb 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -225,12 +225,6 @@ enum iser_desc_type { ISCSI_TX_DATAOUT }; -/* - * Maximum number of work requests per task - * (invalidate, registration, send) - */ -#define ISER_MAX_WRS 3 - /** * struct iser_tx_desc - iSER TX descriptor * @@ -243,8 +237,9 @@ enum iser_desc_type { * unsolicited data-out or control * @num_sge: number sges used on this TX task * @mapped: Is the task header mapped - * @wr_idx: Current WR index - * @wrs: Array of WRs per task + * reg_wr: registration WR + * send_wr: send WR + * inv_wr: invalidate WR */ struct iser_tx_desc { struct iser_ctrl iser_header; @@ -255,11 +250,9 @@ struct iser_tx_desc { int num_sge; struct ib_cqe cqe; bool mapped; - u8 wr_idx; - union iser_wr { - struct ib_send_wr send; - struct ib_reg_wr fast_reg; - } wrs[ISER_MAX_WRS]; + struct ib_reg_wr reg_wr; + struct ib_send_wr send_wr; + struct ib_send_wr inv_wr; }; #define ISER_RX_PAD_SIZE (256 - (ISER_RX_PAYLOAD_SIZE + \ @@ -652,21 +645,6 @@ void iser_reg_desc_put_fmr(struct ib_conn *ib_conn, struct iser_fr_desc *desc); -static inline struct ib_send_wr * -iser_tx_next_wr(struct iser_tx_desc *tx_desc) -{ - struct ib_send_wr *cur_wr = &tx_desc->wrs[tx_desc->wr_idx].send; - struct ib_send_wr *last_wr; - - if (tx_desc->wr_idx) { - last_wr = &tx_desc->wrs[tx_desc->wr_idx - 1].send; - last_wr->next = cur_wr; - } - tx_desc->wr_idx++; - - return cur_wr; -} - static inline struct iser_conn * to_iser_conn(struct ib_conn *ib_conn) { diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index d66e17c2a085..2cc89a9b9e9b 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -365,13 +365,15 @@ iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask) static inline void iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr, - struct ib_cqe *cqe) + struct ib_cqe *cqe, + struct ib_send_wr *next_wr) { inv_wr->opcode = IB_WR_LOCAL_INV; inv_wr->wr_cqe = cqe; inv_wr->ex.invalidate_rkey = mr->rkey; inv_wr->send_flags = 0; inv_wr->num_sge = 0; + inv_wr->next = next_wr; } static int @@ -385,7 +387,7 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task, struct ib_cqe *cqe = &iser_task->iser_conn->ib_conn.reg_cqe; struct ib_mr *mr = 
rsc->sig_mr; struct ib_sig_attrs *sig_attrs = mr->sig_attrs; - struct ib_reg_wr *wr; + struct ib_reg_wr *wr = &tx_desc->reg_wr; int ret; memset(sig_attrs, 0, sizeof(*sig_attrs)); @@ -396,7 +398,7 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task, iser_set_prot_checks(iser_task->sc, &sig_attrs->check_mask); if (rsc->mr_valid) - iser_inv_rkey(iser_tx_next_wr(tx_desc), mr, cqe); + iser_inv_rkey(&tx_desc->inv_wr, mr, cqe, &wr->wr); ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey)); @@ -408,8 +410,8 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task, goto err; } - wr = container_of(iser_tx_next_wr(tx_desc), struct ib_reg_wr, wr); memset(wr, 0, sizeof(*wr)); + wr->wr.next = &tx_desc->send_wr; wr->wr.opcode = IB_WR_REG_MR_INTEGRITY; wr->wr.wr_cqe = cqe; wr->wr.num_sge = 0; @@ -441,11 +443,11 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, struct iser_tx_desc *tx_desc = &iser_task->desc; struct ib_cqe *cqe = &iser_task->iser_conn->ib_conn.reg_cqe; struct ib_mr *mr = rsc->mr; - struct ib_reg_wr *wr; + struct ib_reg_wr *wr = &tx_desc->reg_wr; int n; if (rsc->mr_valid) - iser_inv_rkey(iser_tx_next_wr(tx_desc), mr, cqe); + iser_inv_rkey(&tx_desc->inv_wr, mr, cqe, &wr->wr); ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey)); @@ -456,7 +458,7 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, return n < 0 ? n : -EINVAL; } - wr = container_of(iser_tx_next_wr(tx_desc), struct ib_reg_wr, wr); + wr->wr.next = &tx_desc->send_wr; wr->wr.opcode = IB_WR_REG_MR; wr->wr.wr_cqe = cqe; wr->wr.send_flags = 0; diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index ffd6bbc819f7..ea9cf04ad002 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -1037,7 +1037,8 @@ int iser_post_recvm(struct iser_conn *iser_conn, int count) int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, bool signal) { - struct ib_send_wr *wr = iser_tx_next_wr(tx_desc); + struct ib_send_wr *wr = &tx_desc->send_wr; + struct ib_send_wr *first_wr; int ib_ret; ib_dma_sync_single_for_device(ib_conn->device->ib_device, @@ -1051,7 +1052,14 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, wr->opcode = IB_WR_SEND; wr->send_flags = signal ? IB_SEND_SIGNALED : 0; - ib_ret = ib_post_send(ib_conn->qp, &tx_desc->wrs[0].send, NULL); + if (tx_desc->inv_wr.next) + first_wr = &tx_desc->inv_wr; + else if (tx_desc->reg_wr.wr.next) + first_wr = &tx_desc->reg_wr.wr; + else + first_wr = wr; + + ib_ret = ib_post_send(ib_conn->qp, first_wr, NULL); if (ib_ret) iser_err("ib_post_send failed, ret:%d opcode:%d\n", ib_ret, wr->opcode); From 5a6781a558cc3909851c04a0d44e3a87a35aad94 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 11 Jun 2019 18:52:49 +0300 Subject: [PATCH 092/194] RDMA/core: Add an integrity MR pool support This is a preparation for adding new signature API to the rw-API. 
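With the extra max_num_meta_sg parameter added below, a future caller that wants a pool of integrity MRs rather than plain fast-reg MRs might initialize it along these lines (sizes are placeholders, sketch only):

    /* Sketch: populate a QP's signature MR list with integrity MRs */
    ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs,
                          IB_MR_TYPE_INTEGRITY, max_num_sg, max_num_sg);
    if (ret)
            pr_err("failed to allocate %d integrity MRs\n", nr_sig_mrs);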
Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/mr_pool.c | 8 ++++++-- drivers/infiniband/core/rw.c | 4 ++-- drivers/nvme/host/rdma.c | 2 +- include/rdma/mr_pool.h | 2 +- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/mr_pool.c b/drivers/infiniband/core/mr_pool.c index 49d478b2ea94..132ff92626e1 100644 --- a/drivers/infiniband/core/mr_pool.c +++ b/drivers/infiniband/core/mr_pool.c @@ -42,14 +42,18 @@ void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list, struct ib_mr *mr) EXPORT_SYMBOL(ib_mr_pool_put); int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr, - enum ib_mr_type type, u32 max_num_sg) + enum ib_mr_type type, u32 max_num_sg, u32 max_num_meta_sg) { struct ib_mr *mr; unsigned long flags; int ret, i; for (i = 0; i < nr; i++) { - mr = ib_alloc_mr(qp->pd, type, max_num_sg); + if (type == IB_MR_TYPE_INTEGRITY) + mr = ib_alloc_mr_integrity(qp->pd, max_num_sg, + max_num_meta_sg); + else + mr = ib_alloc_mr(qp->pd, type, max_num_sg); if (IS_ERR(mr)) { ret = PTR_ERR(mr); goto out; diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index acf9ea625811..f825990bacfa 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -719,7 +719,7 @@ int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr) if (nr_mrs) { ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs, IB_MR_TYPE_MEM_REG, - rdma_rw_fr_page_list_len(dev)); + rdma_rw_fr_page_list_len(dev), 0); if (ret) { pr_err("%s: failed to allocated %d MRs\n", __func__, nr_mrs); @@ -729,7 +729,7 @@ int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr) if (nr_sig_mrs) { ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs, - IB_MR_TYPE_SIGNATURE, 2); + IB_MR_TYPE_SIGNATURE, 2, 0); if (ret) { pr_err("%s: failed to allocated %d SIG MRs\n", __func__, nr_sig_mrs); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index f383146e7d0f..0e033b621daf 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -486,7 +486,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs, queue->queue_size, IB_MR_TYPE_MEM_REG, - nvme_rdma_get_max_fr_pages(ibdev)); + nvme_rdma_get_max_fr_pages(ibdev), 0); if (ret) { dev_err(queue->ctrl->ctrl.device, "failed to initialize MR pool sized %d for QID %d\n", diff --git a/include/rdma/mr_pool.h b/include/rdma/mr_pool.h index 986010b812eb..2c042e6046d1 100644 --- a/include/rdma/mr_pool.h +++ b/include/rdma/mr_pool.h @@ -19,7 +19,7 @@ struct ib_mr *ib_mr_pool_get(struct ib_qp *qp, struct list_head *list); void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list, struct ib_mr *mr); int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr, - enum ib_mr_type type, u32 max_num_sg); + enum ib_mr_type type, u32 max_num_sg, u32 max_num_meta_sg); void ib_mr_pool_destroy(struct ib_qp *qp, struct list_head *list); #endif /* _RDMA_MR_POOL_H */ From c0a6cbb9cbccffc249743afa16e64f16c46c80b2 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 11 Jun 2019 18:52:50 +0300 Subject: [PATCH 093/194] RDMA/core: Rename signature qp create flag and signature device capability Rename IB_QP_CREATE_SIGNATURE_EN to IB_QP_CREATE_INTEGRITY_EN and IB_DEVICE_SIGNATURE_HANDOVER to IB_DEVICE_INTEGRITY_HANDOVER. 
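After the rename, the capability/flag pair reads as in the short sketch below in ULP code, mirroring the iSER and isert hunks that follow (illustrative only):

    /* Request integrity offload only when the device advertises it */
    if (ib_dev->attrs.device_cap_flags & IB_DEVICE_INTEGRITY_HANDOVER)
            init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN;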
Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/rw.c | 4 ++-- drivers/infiniband/hw/mlx5/main.c | 2 +- drivers/infiniband/hw/mlx5/mlx5_ib.h | 3 +-- drivers/infiniband/hw/mlx5/qp.c | 14 +++++++------- drivers/infiniband/ulp/iser/iser_verbs.c | 4 ++-- drivers/infiniband/ulp/isert/ib_isert.c | 4 ++-- include/rdma/ib_verbs.h | 4 ++-- 7 files changed, 17 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index f825990bacfa..b45b49a2ccfc 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -688,7 +688,7 @@ void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr) * we'll need two additional MRs for the registrations and the * invalidation. */ - if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) + if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) factor += 6; /* (inv + reg) * (data + prot + sig) */ else if (rdma_rw_can_use_mr(dev, attr->port_num)) factor += 2; /* inv + reg */ @@ -709,7 +709,7 @@ int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr) u32 nr_mrs = 0, nr_sig_mrs = 0; int ret = 0; - if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) { + if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) { nr_sig_mrs = attr->cap.max_rdma_ctxs; nr_mrs = attr->cap.max_rdma_ctxs * 2; } else if (rdma_rw_can_use_mr(dev, attr->port_num)) { diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index bd0322b61362..9aa10cfbc064 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -889,7 +889,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (MLX5_CAP_GEN(mdev, sho)) { - props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER; + props->device_cap_flags |= IB_DEVICE_INTEGRITY_HANDOVER; /* At this stage no support for signature handover */ props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 | IB_PROT_T10DIF_TYPE_2 | diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 7980814f355d..5999792b5698 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -431,8 +431,7 @@ struct mlx5_ib_qp { int create_type; - /* Store signature errors */ - bool signature_en; + bool integrity_en; struct list_head qps_list; struct list_head cq_recv_list; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index f6651b93e469..879162da63e3 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -442,9 +442,9 @@ static int calc_send_wqe(struct ib_qp_init_attr *attr) } size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg); - if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN && + if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN && ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB) < MLX5_SIG_WQE_SIZE) - return MLX5_SIG_WQE_SIZE; + return MLX5_SIG_WQE_SIZE; else return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB); } @@ -496,8 +496,8 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, sizeof(struct mlx5_wqe_inline_seg); attr->cap.max_inline_data = qp->max_inline_data; - if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) - qp->signature_en = true; + if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) + qp->integrity_en = true; wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size); qp->sq.wqe_cnt = wq_size / 
MLX5_SEND_WQE_BB; @@ -1039,7 +1039,7 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, void *qpc; int err; - if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | + if (init_attr->create_flags & ~(IB_QP_CREATE_INTEGRITY_EN | IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | IB_QP_CREATE_IPOIB_UD_LSO | IB_QP_CREATE_NETIF_QP | @@ -4714,7 +4714,7 @@ static int set_pi_umr_wr(const struct ib_send_wr *send_wr, if (unlikely(send_wr->num_sge != 0) || unlikely(wr->access & IB_ACCESS_REMOTE_ATOMIC) || - unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) || + unlikely(!sig_mr->sig) || unlikely(!qp->integrity_en) || unlikely(!sig_mr->sig->sig_status_checked)) return -EINVAL; @@ -4763,7 +4763,7 @@ static int set_sig_umr_wr(const struct ib_send_wr *send_wr, if (unlikely(wr->wr.num_sge != 1) || unlikely(wr->access_flags & IB_ACCESS_REMOTE_ATOMIC) || - unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) || + unlikely(!sig_mr->sig) || unlikely(!qp->integrity_en) || unlikely(!sig_mr->sig->sig_status_checked)) return -EINVAL; diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index ea9cf04ad002..a6548de0e218 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -399,7 +399,7 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn) init_attr.qp_type = IB_QPT_RC; if (ib_conn->pi_support) { init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1; - init_attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN; + init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN; iser_conn->max_cmds = ISER_GET_MAX_XMIT_CMDS(ISER_QP_SIG_MAX_REQ_DTOS); } else { @@ -712,7 +712,7 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id) /* connection T10-PI support */ if (iser_pi_enable) { if (!(device->ib_device->attrs.device_cap_flags & - IB_DEVICE_SIGNATURE_HANDOVER)) { + IB_DEVICE_INTEGRITY_HANDOVER)) { iser_warn("T10-PI requested but not supported on %s, " "continue without T10-PI\n", dev_name(&ib_conn->device->ib_device->dev)); diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index ffef4ac152ca..53bc505f5292 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -141,7 +141,7 @@ isert_create_qp(struct isert_conn *isert_conn, attr.sq_sig_type = IB_SIGNAL_REQ_WR; attr.qp_type = IB_QPT_RC; if (device->pi_capable) - attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN; + attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN; ret = rdma_create_qp(cma_id, device->pd, &attr); if (ret) { @@ -317,7 +317,7 @@ isert_create_device_ib_res(struct isert_device *device) /* Check signature cap */ device->pi_capable = ib_dev->attrs.device_cap_flags & - IB_DEVICE_SIGNATURE_HANDOVER ? true : false; + IB_DEVICE_INTEGRITY_HANDOVER ? 
true : false; return 0; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 28db256cbdb9..6de0ea1aafc3 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -253,7 +253,7 @@ enum ib_device_cap_flags { */ IB_DEVICE_CROSS_CHANNEL = (1 << 27), IB_DEVICE_MANAGED_FLOW_STEERING = (1 << 29), - IB_DEVICE_SIGNATURE_HANDOVER = (1 << 30), + IB_DEVICE_INTEGRITY_HANDOVER = (1 << 30), IB_DEVICE_ON_DEMAND_PAGING = (1ULL << 31), IB_DEVICE_SG_GAPS_REG = (1ULL << 32), IB_DEVICE_VIRTUAL_FUNCTION = (1ULL << 33), @@ -1056,7 +1056,7 @@ enum ib_qp_create_flags { IB_QP_CREATE_MANAGED_SEND = 1 << 3, IB_QP_CREATE_MANAGED_RECV = 1 << 4, IB_QP_CREATE_NETIF_QP = 1 << 5, - IB_QP_CREATE_SIGNATURE_EN = 1 << 6, + IB_QP_CREATE_INTEGRITY_EN = 1 << 6, /* FREE = 1 << 7, */ IB_QP_CREATE_SCATTER_FCS = 1 << 8, IB_QP_CREATE_CVLAN_STRIPPING = 1 << 9, From 185eddc45798b9f73e5470964948d79b4c8df4b7 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 11 Jun 2019 18:52:51 +0300 Subject: [PATCH 094/194] RDMA/core: Validate integrity handover device cap Protect the case that a ULP tries to allocate a QP with signature enabled flag while the LLD doesn't support this feature. While we're here, also move integrity_en attribute from mlx5_qp to ib_qp as a preparation for adding new integrity API to the rw-API (that is part of ib_core module). Signed-off-by: Max Gurtovoy Signed-off-by: Israel Rukshin Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 6 ++++++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 -- drivers/infiniband/hw/mlx5/qp.c | 7 ++----- include/rdma/ib_verbs.h | 1 + 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 399c0d17b2b9..4a04e94a72db 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1158,6 +1158,10 @@ struct ib_qp *ib_create_qp_user(struct ib_pd *pd, qp_init_attr->cap.max_recv_sge)) return ERR_PTR(-EINVAL); + if ((qp_init_attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) && + !(device->attrs.device_cap_flags & IB_DEVICE_INTEGRITY_HANDOVER)) + return ERR_PTR(-EINVAL); + /* * If the callers is using the RDMA API calculate the resources * needed for the RDMA READ/WRITE operations. 
@@ -1233,6 +1237,8 @@ struct ib_qp *ib_create_qp_user(struct ib_pd *pd, qp->max_write_sge = qp_init_attr->cap.max_send_sge; qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge, device->attrs.max_sge_rd); + if (qp_init_attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) + qp->integrity_en = true; return qp; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 5999792b5698..d418219e68c6 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -431,8 +431,6 @@ struct mlx5_ib_qp { int create_type; - bool integrity_en; - struct list_head qps_list; struct list_head cq_recv_list; struct list_head cq_send_list; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 879162da63e3..d77a64c551ea 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -496,9 +496,6 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, sizeof(struct mlx5_wqe_inline_seg); attr->cap.max_inline_data = qp->max_inline_data; - if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) - qp->integrity_en = true; - wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size); qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB; if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) { @@ -4714,7 +4711,7 @@ static int set_pi_umr_wr(const struct ib_send_wr *send_wr, if (unlikely(send_wr->num_sge != 0) || unlikely(wr->access & IB_ACCESS_REMOTE_ATOMIC) || - unlikely(!sig_mr->sig) || unlikely(!qp->integrity_en) || + unlikely(!sig_mr->sig) || unlikely(!qp->ibqp.integrity_en) || unlikely(!sig_mr->sig->sig_status_checked)) return -EINVAL; @@ -4763,7 +4760,7 @@ static int set_sig_umr_wr(const struct ib_send_wr *send_wr, if (unlikely(wr->wr.num_sge != 1) || unlikely(wr->access_flags & IB_ACCESS_REMOTE_ATOMIC) || - unlikely(!sig_mr->sig) || unlikely(!qp->integrity_en) || + unlikely(!sig_mr->sig) || unlikely(!qp->ibqp.integrity_en) || unlikely(!sig_mr->sig->sig_status_checked)) return -EINVAL; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6de0ea1aafc3..14b5eab76ed8 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1711,6 +1711,7 @@ struct ib_qp { struct ib_qp_security *qp_sec; u8 port; + bool integrity_en; /* * Implementation details of the RDMA core, don't use in drivers: */ From 6cb2d5b105c93efd453d990aa2aea3ebb9405940 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 11 Jun 2019 18:52:52 +0300 Subject: [PATCH 095/194] RDMA/rw: Introduce rdma_rw_inv_key helper This is a preparation for adding new signature API to the rw-API. 
Signed-off-by: Israel Rukshin Suggested-by: Christoph Hellwig Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/rw.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index b45b49a2ccfc..f87f0de82e24 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -65,6 +65,22 @@ static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev) return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256); } +static inline int rdma_rw_inv_key(struct rdma_rw_reg_ctx *reg) +{ + int count = 0; + + if (reg->mr->need_inval) { + reg->inv_wr.opcode = IB_WR_LOCAL_INV; + reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey; + reg->inv_wr.next = &reg->reg_wr.wr; + count++; + } else { + reg->inv_wr.next = NULL; + } + + return count; +} + /* Caller must have zero-initialized *reg. */ static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num, struct rdma_rw_reg_ctx *reg, struct scatterlist *sg, @@ -78,14 +94,7 @@ static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num, if (!reg->mr) return -EAGAIN; - if (reg->mr->need_inval) { - reg->inv_wr.opcode = IB_WR_LOCAL_INV; - reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey; - reg->inv_wr.next = &reg->reg_wr.wr; - count++; - } else { - reg->inv_wr.next = NULL; - } + count += rdma_rw_inv_key(reg); ret = ib_map_mr_sg(reg->mr, sg, nents, &offset, PAGE_SIZE); if (ret < 0 || ret < nents) { From e9a53e73a2507f3a1680538bd167b2e49533659a Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 11 Jun 2019 18:52:53 +0300 Subject: [PATCH 096/194] RDMA/rw: Use IB_WR_REG_MR_INTEGRITY for PI handover Replace the old signature handover API with the new one. The new API simplifies PI handover code complexity for ULPs and improves performance. For the RW API it will reduce the maximum number of work requests per task and the need to deal with multiple MRs (and their registrations and invalidations) per task. All the mapping and registration of the data and the protection buffers are done by the LLD using a single WR and a special MR type (IB_MR_TYPE_INTEGRITY) for the PI handover operation. The setup of the tested benchmark (using iSER ULP): - 2 servers with 24 cores (1 initiator and 1 target) - ConnectX-4/ConnectX-5 adapters - 24 target sessions with 1 LUN each - ramdisk backstore - PI active Performance results running fio (24 jobs, 128 iodepth) using write_generate=1 and read_verify=1 (w/w.o patch): bs IOPS(read) IOPS(write) ---- ---------- ---------- 512 1243.3K/1182.3K 1725.1K/1680.2K 4k 571233/528835 743293/748259 32k 72388/71086 71789/93573 Using write_generate=0 and read_verify=0 (w/w.o patch): bs IOPS(read) IOPS(write) ---- ---------- ---------- 512 1572.1K/1427.2K 1823.5K/1724.3K 4k 921992/916194 753772/768267 32k 75052/73960 73180/95484 There is a performance degradation when writing big block sizes. The degradation is caused by the complexity of combining multiple indirections and performing an RDMA READ operation through them. This will be fixed in the following patches by reducing the indirections if possible.
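For illustration, under the new API the registration step on the rw-API side collapses to roughly the following sketch (condensed from the rdma_rw_ctx_signature_init() changes below; the wrapper name is invented for the example, error unwinding, DMA mapping and MR-pool handling are omitted, and the MR is assumed to have been allocated with IB_MR_TYPE_INTEGRITY):

static int reg_pi_sketch(struct ib_mr *mr, struct ib_sig_attrs *sig_attrs,
			 struct scatterlist *sg, int sg_cnt,
			 struct scatterlist *prot_sg, int prot_sg_cnt,
			 struct ib_reg_wr *reg_wr)
{
	int ret;

	/* sig_attrs describe the memory/wire PI domains for this I/O */
	memcpy(mr->sig_attrs, sig_attrs, sizeof(struct ib_sig_attrs));

	/* a single call maps both the data and the protection SG lists */
	ret = ib_map_mr_sg_pi(mr, sg, sg_cnt, NULL, prot_sg, prot_sg_cnt,
			      NULL, SZ_4K);
	if (unlikely(ret))
		return ret;

	/* a single work request registers the whole PI transaction */
	memset(reg_wr, 0, sizeof(*reg_wr));
	reg_wr->wr.opcode = IB_WR_REG_MR_INTEGRITY;
	reg_wr->mr = mr;
	reg_wr->key = mr->lkey;
	reg_wr->access = IB_ACCESS_LOCAL_WRITE;
	return 0;
}

The RDMA READ/WRITE work request is then chained after reg_wr->wr with a single SGE covering mr->iova/mr->length, as done in rdma_rw_ctx_signature_init() below.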
Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/rw.c | 167 ++++++++++-------------- drivers/infiniband/ulp/isert/ib_isert.c | 4 +- include/rdma/rw.h | 9 -- 3 files changed, 72 insertions(+), 108 deletions(-) diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index f87f0de82e24..cc5feb301c05 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -59,10 +59,18 @@ static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num, return false; } -static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev) +static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev, + bool pi_support) { + u32 max_pages; + + if (pi_support) + max_pages = dev->attrs.max_pi_fast_reg_page_list_len; + else + max_pages = dev->attrs.max_fast_reg_page_list_len; + /* arbitrary limit to avoid allocating gigantic resources */ - return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256); + return min_t(u32, max_pages, 256); } static inline int rdma_rw_inv_key(struct rdma_rw_reg_ctx *reg) @@ -86,7 +94,8 @@ static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num, struct rdma_rw_reg_ctx *reg, struct scatterlist *sg, u32 sg_cnt, u32 offset) { - u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device, + qp->integrity_en); u32 nents = min(sg_cnt, pages_per_mr); int count = 0, ret; @@ -119,7 +128,8 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { struct rdma_rw_reg_ctx *prev = NULL; - u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device, + qp->integrity_en); int i, j, ret = 0, count = 0; ctx->nr_ops = (sg_cnt + pages_per_mr - 1) / pages_per_mr; @@ -360,9 +370,9 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { struct ib_device *dev = qp->pd->device; - u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device, + qp->integrity_en); struct ib_rdma_wr *rdma_wr; - struct ib_send_wr *prev_wr = NULL; int count = 0, ret; if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) { @@ -376,75 +386,58 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, return -ENOMEM; sg_cnt = ret; - ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir); - if (!ret) { - ret = -ENOMEM; - goto out_unmap_sg; + if (prot_sg_cnt) { + ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir); + if (!ret) { + ret = -ENOMEM; + goto out_unmap_sg; + } + prot_sg_cnt = ret; } - prot_sg_cnt = ret; ctx->type = RDMA_RW_SIG_MR; ctx->nr_ops = 1; - ctx->sig = kcalloc(1, sizeof(*ctx->sig), GFP_KERNEL); - if (!ctx->sig) { + ctx->reg = kcalloc(1, sizeof(*ctx->reg), GFP_KERNEL); + if (!ctx->reg) { ret = -ENOMEM; goto out_unmap_prot_sg; } - ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->data, sg, sg_cnt, 0); - if (ret < 0) - goto out_free_ctx; - count += ret; - prev_wr = &ctx->sig->data.reg_wr.wr; - - ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->prot, - prot_sg, prot_sg_cnt, 0); - if (ret < 0) - goto out_destroy_data_mr; - count += ret; - - if (ctx->sig->prot.inv_wr.next) - prev_wr->next = &ctx->sig->prot.inv_wr; - else - prev_wr->next = &ctx->sig->prot.reg_wr.wr; - prev_wr = &ctx->sig->prot.reg_wr.wr; - - 
ctx->sig->sig_mr = ib_mr_pool_get(qp, &qp->sig_mrs); - if (!ctx->sig->sig_mr) { + ctx->reg->mr = ib_mr_pool_get(qp, &qp->sig_mrs); + if (!ctx->reg->mr) { ret = -EAGAIN; - goto out_destroy_prot_mr; + goto out_free_ctx; } - if (ctx->sig->sig_mr->need_inval) { - memset(&ctx->sig->sig_inv_wr, 0, sizeof(ctx->sig->sig_inv_wr)); + count += rdma_rw_inv_key(ctx->reg); - ctx->sig->sig_inv_wr.opcode = IB_WR_LOCAL_INV; - ctx->sig->sig_inv_wr.ex.invalidate_rkey = ctx->sig->sig_mr->rkey; + memcpy(ctx->reg->mr->sig_attrs, sig_attrs, sizeof(struct ib_sig_attrs)); - prev_wr->next = &ctx->sig->sig_inv_wr; - prev_wr = &ctx->sig->sig_inv_wr; + ret = ib_map_mr_sg_pi(ctx->reg->mr, sg, sg_cnt, NULL, prot_sg, + prot_sg_cnt, NULL, SZ_4K); + if (unlikely(ret)) { + pr_err("failed to map PI sg (%d)\n", sg_cnt + prot_sg_cnt); + goto out_destroy_sig_mr; } - ctx->sig->sig_wr.wr.opcode = IB_WR_REG_SIG_MR; - ctx->sig->sig_wr.wr.wr_cqe = NULL; - ctx->sig->sig_wr.wr.sg_list = &ctx->sig->data.sge; - ctx->sig->sig_wr.wr.num_sge = 1; - ctx->sig->sig_wr.access_flags = IB_ACCESS_LOCAL_WRITE; - ctx->sig->sig_wr.sig_attrs = sig_attrs; - ctx->sig->sig_wr.sig_mr = ctx->sig->sig_mr; - if (prot_sg_cnt) - ctx->sig->sig_wr.prot = &ctx->sig->prot.sge; - prev_wr->next = &ctx->sig->sig_wr.wr; - prev_wr = &ctx->sig->sig_wr.wr; + ctx->reg->reg_wr.wr.opcode = IB_WR_REG_MR_INTEGRITY; + ctx->reg->reg_wr.wr.wr_cqe = NULL; + ctx->reg->reg_wr.wr.num_sge = 0; + ctx->reg->reg_wr.wr.send_flags = 0; + ctx->reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE; + if (rdma_protocol_iwarp(qp->device, port_num)) + ctx->reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE; + ctx->reg->reg_wr.mr = ctx->reg->mr; + ctx->reg->reg_wr.key = ctx->reg->mr->lkey; count++; - ctx->sig->sig_sge.addr = 0; - ctx->sig->sig_sge.length = ctx->sig->data.sge.length; - if (sig_attrs->wire.sig_type != IB_SIG_TYPE_NONE) - ctx->sig->sig_sge.length += ctx->sig->prot.sge.length; + ctx->reg->sge.addr = ctx->reg->mr->iova; + ctx->reg->sge.length = ctx->reg->mr->length; + if (sig_attrs->wire.sig_type == IB_SIG_TYPE_NONE) + ctx->reg->sge.length -= ctx->reg->mr->sig_attrs->meta_length; - rdma_wr = &ctx->sig->data.wr; - rdma_wr->wr.sg_list = &ctx->sig->sig_sge; + rdma_wr = &ctx->reg->wr; + rdma_wr->wr.sg_list = &ctx->reg->sge; rdma_wr->wr.num_sge = 1; rdma_wr->remote_addr = remote_addr; rdma_wr->rkey = rkey; @@ -452,21 +445,18 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; else rdma_wr->wr.opcode = IB_WR_RDMA_READ; - prev_wr->next = &rdma_wr->wr; - prev_wr = &rdma_wr->wr; + ctx->reg->reg_wr.wr.next = &rdma_wr->wr; count++; return count; -out_destroy_prot_mr: - if (prot_sg_cnt) - ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr); -out_destroy_data_mr: - ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr); +out_destroy_sig_mr: + ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr); out_free_ctx: - kfree(ctx->sig); + kfree(ctx->reg); out_unmap_prot_sg: - ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir); + if (prot_sg_cnt) + ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir); out_unmap_sg: ib_dma_unmap_sg(dev, sg, sg_cnt, dir); return ret; @@ -509,22 +499,8 @@ struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, switch (ctx->type) { case RDMA_RW_SIG_MR: - rdma_rw_update_lkey(&ctx->sig->data, true); - if (ctx->sig->prot.mr) - rdma_rw_update_lkey(&ctx->sig->prot, true); - - ctx->sig->sig_mr->need_inval = true; - ib_update_fast_reg_key(ctx->sig->sig_mr, - ib_inc_rkey(ctx->sig->sig_mr->lkey)); - ctx->sig->sig_sge.lkey = 
ctx->sig->sig_mr->lkey; - - if (ctx->sig->data.inv_wr.next) - first_wr = &ctx->sig->data.inv_wr; - else - first_wr = &ctx->sig->data.reg_wr.wr; - last_wr = &ctx->sig->data.wr.wr; - break; case RDMA_RW_MR: + /* fallthrough */ for (i = 0; i < ctx->nr_ops; i++) { rdma_rw_update_lkey(&ctx->reg[i], ctx->reg[i].wr.wr.opcode != @@ -641,16 +617,12 @@ void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp, if (WARN_ON_ONCE(ctx->type != RDMA_RW_SIG_MR)) return; - ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr); + ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr); + kfree(ctx->reg); + ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); - - if (ctx->sig->prot.mr) { - ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr); + if (prot_sg_cnt) ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir); - } - - ib_mr_pool_put(qp, &qp->sig_mrs, ctx->sig->sig_mr); - kfree(ctx->sig); } EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature); @@ -671,7 +643,7 @@ unsigned int rdma_rw_mr_factor(struct ib_device *device, u8 port_num, unsigned int mr_pages; if (rdma_rw_can_use_mr(device, port_num)) - mr_pages = rdma_rw_fr_page_list_len(device); + mr_pages = rdma_rw_fr_page_list_len(device, false); else mr_pages = device->attrs.max_sge_rd; return DIV_ROUND_UP(maxpages, mr_pages); @@ -697,9 +669,8 @@ void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr) * we'll need two additional MRs for the registrations and the * invalidation. */ - if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) - factor += 6; /* (inv + reg) * (data + prot + sig) */ - else if (rdma_rw_can_use_mr(dev, attr->port_num)) + if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN || + rdma_rw_can_use_mr(dev, attr->port_num)) factor += 2; /* inv + reg */ attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs; @@ -715,20 +686,22 @@ void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr) int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr) { struct ib_device *dev = qp->pd->device; - u32 nr_mrs = 0, nr_sig_mrs = 0; + u32 nr_mrs = 0, nr_sig_mrs = 0, max_num_sg = 0; int ret = 0; if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) { nr_sig_mrs = attr->cap.max_rdma_ctxs; - nr_mrs = attr->cap.max_rdma_ctxs * 2; + nr_mrs = attr->cap.max_rdma_ctxs; + max_num_sg = rdma_rw_fr_page_list_len(dev, true); } else if (rdma_rw_can_use_mr(dev, attr->port_num)) { nr_mrs = attr->cap.max_rdma_ctxs; + max_num_sg = rdma_rw_fr_page_list_len(dev, false); } if (nr_mrs) { ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs, IB_MR_TYPE_MEM_REG, - rdma_rw_fr_page_list_len(dev), 0); + max_num_sg, 0); if (ret) { pr_err("%s: failed to allocated %d MRs\n", __func__, nr_mrs); @@ -738,7 +711,7 @@ int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr) if (nr_sig_mrs) { ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs, - IB_MR_TYPE_SIGNATURE, 2, 0); + IB_MR_TYPE_INTEGRITY, max_num_sg, max_num_sg); if (ret) { pr_err("%s: failed to allocated %d SIG MRs\n", __func__, nr_sig_mrs); diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 53bc505f5292..4b4998e888b9 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -1677,7 +1677,7 @@ isert_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) isert_dbg("Cmd %p\n", isert_cmd); - ret = isert_check_pi_status(cmd, isert_cmd->rw.sig->sig_mr); + ret = isert_check_pi_status(cmd, isert_cmd->rw.reg->mr); isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); if (ret) { @@ -1723,7 +1723,7 @@ 
isert_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc) iscsit_stop_dataout_timer(cmd); if (isert_prot_cmd(isert_conn, se_cmd)) - ret = isert_check_pi_status(se_cmd, isert_cmd->rw.sig->sig_mr); + ret = isert_check_pi_status(se_cmd, isert_cmd->rw.reg->mr); isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); cmd->write_data_done = 0; diff --git a/include/rdma/rw.h b/include/rdma/rw.h index a3cbbc7b6417..bcb221241b5d 100644 --- a/include/rdma/rw.h +++ b/include/rdma/rw.h @@ -47,15 +47,6 @@ struct rdma_rw_ctx { struct ib_send_wr inv_wr; struct ib_mr *mr; } *reg; - - struct { - struct rdma_rw_reg_ctx data; - struct rdma_rw_reg_ctx prot; - struct ib_send_wr sig_inv_wr; - struct ib_mr *sig_mr; - struct ib_sge sig_sge; - struct ib_sig_handover_wr sig_wr; - } *sig; }; }; From 5c171cbe3ab3d1390290eaa85e7b371cc26b1122 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 11 Jun 2019 18:52:54 +0300 Subject: [PATCH 097/194] RDMA/mlx5: Remove unused IB_WR_REG_SIG_MR code IB_WR_REG_SIG_MR is not needed after IB_WR_REG_MR_INTEGRITY was used. Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mr.c | 15 +-- drivers/infiniband/hw/mlx5/qp.c | 154 ++-------------------- drivers/infiniband/hw/vmw_pvrdma/pvrdma.h | 2 +- include/rdma/ib_verbs.h | 19 --- 4 files changed, 17 insertions(+), 173 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index af8ae1e76fd4..36d1d6f8bb47 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1760,8 +1760,7 @@ static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd, goto err_free_in; mr->desc_size = sizeof(struct mlx5_klm); mr->max_descs = ndescs; - } else if (mr_type == IB_MR_TYPE_SIGNATURE || - mr_type == IB_MR_TYPE_INTEGRITY) { + } else if (mr_type == IB_MR_TYPE_INTEGRITY) { u32 psv_index[2]; MLX5_SET(mkc, mkc, bsf_en, 1); @@ -1787,13 +1786,11 @@ static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd, mr->sig->sig_err_exists = false; /* Next UMR, Arm SIGERR */ ++mr->sig->sigerr_count; - if (mr_type == IB_MR_TYPE_INTEGRITY) { - mr->pi_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, - max_num_meta_sg); - if (IS_ERR(mr->pi_mr)) { - err = PTR_ERR(mr->pi_mr); - goto err_destroy_psv; - } + mr->pi_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, + max_num_meta_sg); + if (IS_ERR(mr->pi_mr)) { + err = PTR_ERR(mr->pi_mr); + goto err_destroy_psv; } } else { mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index d77a64c551ea..60536c9c008f 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4557,32 +4557,17 @@ static int set_sig_data_segment(const struct ib_send_wr *send_wr, bool prot = false; int ret; int wqe_size; + struct mlx5_ib_mr *mr = to_mmr(sig_mr); + struct mlx5_ib_mr *pi_mr = mr->pi_mr; - if (send_wr->opcode == IB_WR_REG_SIG_MR) { - const struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr); - - data_len = wr->wr.sg_list->length; - data_key = wr->wr.sg_list->lkey; - data_va = wr->wr.sg_list->addr; - if (wr->prot) { - prot_len = wr->prot->length; - prot_key = wr->prot->lkey; - prot_va = wr->prot->addr; - prot = true; - } - } else { - struct mlx5_ib_mr *mr = to_mmr(sig_mr); - struct mlx5_ib_mr *pi_mr = mr->pi_mr; - - data_len = pi_mr->data_length; - data_key = pi_mr->ibmr.lkey; - data_va = pi_mr->ibmr.iova; - if (pi_mr->meta_ndescs) { - prot_len = pi_mr->meta_length; - 
prot_key = pi_mr->ibmr.lkey; - prot_va = pi_mr->ibmr.iova + data_len; - prot = true; - } + data_len = pi_mr->data_length; + data_key = pi_mr->ibmr.lkey; + data_va = pi_mr->ibmr.iova; + if (pi_mr->meta_ndescs) { + prot_len = pi_mr->meta_length; + prot_key = pi_mr->ibmr.lkey; + prot_va = pi_mr->ibmr.iova + data_len; + prot = true; } if (!prot || (data_key == prot_key && data_va == prot_va && @@ -4748,57 +4733,6 @@ static int set_pi_umr_wr(const struct ib_send_wr *send_wr, return 0; } -static int set_sig_umr_wr(const struct ib_send_wr *send_wr, - struct mlx5_ib_qp *qp, void **seg, int *size, - void **cur_edge) -{ - const struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr); - struct mlx5_ib_mr *sig_mr = to_mmr(wr->sig_mr); - u32 pdn = get_pd(qp)->pdn; - u32 xlt_size; - int region_len, ret; - - if (unlikely(wr->wr.num_sge != 1) || - unlikely(wr->access_flags & IB_ACCESS_REMOTE_ATOMIC) || - unlikely(!sig_mr->sig) || unlikely(!qp->ibqp.integrity_en) || - unlikely(!sig_mr->sig->sig_status_checked)) - return -EINVAL; - - /* length of the protected region, data + protection */ - region_len = wr->wr.sg_list->length; - if (wr->prot && - (wr->prot->lkey != wr->wr.sg_list->lkey || - wr->prot->addr != wr->wr.sg_list->addr || - wr->prot->length != wr->wr.sg_list->length)) - region_len += wr->prot->length; - - /** - * KLM octoword size - if protection was provided - * then we use strided block format (3 octowords), - * else we use single KLM (1 octoword) - **/ - xlt_size = wr->prot ? 0x30 : sizeof(struct mlx5_klm); - - set_sig_umr_segment(*seg, xlt_size); - *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); - *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; - handle_post_send_edge(&qp->sq, seg, *size, cur_edge); - - set_sig_mkey_segment(*seg, wr->sig_mr, wr->access_flags, xlt_size, - region_len, pdn); - *seg += sizeof(struct mlx5_mkey_seg); - *size += sizeof(struct mlx5_mkey_seg) / 16; - handle_post_send_edge(&qp->sq, seg, *size, cur_edge); - - ret = set_sig_data_segment(send_wr, wr->sig_mr, wr->sig_attrs, qp, seg, - size, cur_edge); - if (ret) - return ret; - - sig_mr->sig->sig_status_checked = false; - return 0; -} - static int set_psv_wr(struct ib_sig_domain *domain, u32 psv_idx, void **seg, int *size) { @@ -5187,74 +5121,6 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, num_sge = 0; goto skip_psv; - case IB_WR_REG_SIG_MR: - qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR; - mr = to_mmr(sig_handover_wr(wr)->sig_mr); - - ctrl->imm = cpu_to_be32(mr->ibmr.rkey); - err = set_sig_umr_wr(wr, qp, &seg, &size, - &cur_edge); - if (err) { - mlx5_ib_warn(dev, "\n"); - *bad_wr = wr; - goto out; - } - - finish_wqe(qp, ctrl, seg, size, cur_edge, idx, - wr->wr_id, nreq, fence, - MLX5_OPCODE_UMR); - /* - * SET_PSV WQEs are not signaled and solicited - * on error - */ - err = __begin_wqe(qp, &seg, &ctrl, wr, &idx, - &size, &cur_edge, nreq, false, - true); - if (err) { - mlx5_ib_warn(dev, "\n"); - err = -ENOMEM; - *bad_wr = wr; - goto out; - } - - err = set_psv_wr(&sig_handover_wr(wr)->sig_attrs->mem, - mr->sig->psv_memory.psv_idx, &seg, - &size); - if (err) { - mlx5_ib_warn(dev, "\n"); - *bad_wr = wr; - goto out; - } - - finish_wqe(qp, ctrl, seg, size, cur_edge, idx, - wr->wr_id, nreq, fence, - MLX5_OPCODE_SET_PSV); - err = __begin_wqe(qp, &seg, &ctrl, wr, &idx, - &size, &cur_edge, nreq, false, - true); - if (err) { - mlx5_ib_warn(dev, "\n"); - err = -ENOMEM; - *bad_wr = wr; - goto out; - } - - err = set_psv_wr(&sig_handover_wr(wr)->sig_attrs->wire, - mr->sig->psv_wire.psv_idx, &seg, - 
&size); - if (err) { - mlx5_ib_warn(dev, "\n"); - *bad_wr = wr; - goto out; - } - - finish_wqe(qp, ctrl, seg, size, cur_edge, idx, - wr->wr_id, nreq, fence, - MLX5_OPCODE_SET_PSV); - qp->next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; - num_sge = 0; - goto skip_psv; - default: break; } diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h index 3c633ab58052..c142f5e7f25f 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h @@ -456,7 +456,7 @@ static inline enum pvrdma_wr_opcode ib_wr_opcode_to_pvrdma(enum ib_wr_opcode op) return PVRDMA_WR_MASKED_ATOMIC_CMP_AND_SWP; case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD: return PVRDMA_WR_MASKED_ATOMIC_FETCH_AND_ADD; - case IB_WR_REG_SIG_MR: + case IB_WR_REG_MR_INTEGRITY: return PVRDMA_WR_REG_SIG_MR; default: return PVRDMA_WR_ERROR; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 14b5eab76ed8..e2478b74551d 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -776,9 +776,6 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); * enum ib_mr_type - memory region type * @IB_MR_TYPE_MEM_REG: memory region that is used for * normal registration - * @IB_MR_TYPE_SIGNATURE: memory region that is used for - * signature operations (data-integrity - * capable regions) * @IB_MR_TYPE_SG_GAPS: memory region that is capable to * register any arbitrary sg lists (without * the normal mr constraints - see @@ -794,7 +791,6 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); */ enum ib_mr_type { IB_MR_TYPE_MEM_REG, - IB_MR_TYPE_SIGNATURE, IB_MR_TYPE_SG_GAPS, IB_MR_TYPE_DM, IB_MR_TYPE_USER, @@ -1235,7 +1231,6 @@ enum ib_wr_opcode { /* These are kernel only and can not be issued by userspace */ IB_WR_REG_MR = 0x20, - IB_WR_REG_SIG_MR, IB_WR_REG_MR_INTEGRITY, /* reserve values for low level drivers' internal use. @@ -1346,20 +1341,6 @@ static inline const struct ib_reg_wr *reg_wr(const struct ib_send_wr *wr) return container_of(wr, struct ib_reg_wr, wr); } -struct ib_sig_handover_wr { - struct ib_send_wr wr; - struct ib_sig_attrs *sig_attrs; - struct ib_mr *sig_mr; - int access_flags; - struct ib_sge *prot; }; - -static inline const struct ib_sig_handover_wr * -sig_handover_wr(const struct ib_send_wr *wr) -{ - return container_of(wr, struct ib_sig_handover_wr, wr); -} - struct ib_recv_wr { struct ib_recv_wr *next; union { From de0ae958deb5e6af35c4c6a4679d4fe9896a98ca Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 11 Jun 2019 18:52:55 +0300 Subject: [PATCH 098/194] RDMA/mlx5: Improve PI handover performance Under some workloads there is a performance degradation when using a KLM mkey instead of an MTT mkey. This is because KLM descriptor access goes through an indirection that might require more HW resources and cycles. Using a KLM descriptor is not necessary when there are no gaps in the data/metadata sg lists. As an optimization, use an MTT mkey whenever possible. To that end, allocate an internal MTT mkey as well and choose the effective pi_mr for each transaction according to the required mapping scheme.
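The mapping-scheme selection this adds to mlx5_ib_map_mr_sg_pi() can be summarized by the sketch below (illustrative only; the wrapper name and argument plumbing are invented here, and the real code is in the mr.c hunk further down):

static int map_pi_sketch(struct mlx5_ib_mr *mr, struct ib_mr *ibmr,
			 struct scatterlist *data_sg, int data_sg_nents,
			 unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *pi_mr = mr->mtt_mr;	/* cheap, direct MTT mkey */
	int n;

	n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (n != data_sg_nents + meta_sg_nents) {
		/* SG lists have gaps - fall back to the indirect KLM mkey */
		pi_mr = mr->klm_mr;
		n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
					     data_sg_offset, meta_sg,
					     meta_sg_nents, meta_sg_offset);
		if (n != data_sg_nents + meta_sg_nents)
			return -ENOMEM;
	}
	mr->pi_mr = pi_mr;	/* remembered for building the WQE later */
	return 0;
}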
The setup of the tested benchmark (using iSER ULP): - 2 servers with 24 cores (1 initiator and 1 target) - ConnectX-4/ConnectX-5 adapters - 24 target sessions with 1 LUN each - ramdisk backstore - PI active Performance results running fio (24 jobs, 128 iodepth) using write_generate=1 and read_verify=1 (w/w.o/baseline): bs IOPS(read) IOPS(write) ---- ---------- ---------- 512 1262.4K/1243.3K/1147.1K 1732.1K/1725.1K/1423.8K 4k 570902/571233/457874 773982/743293/642080 32k 72086/72388/71933 96164/71789/93249 Using write_generate=0 and read_verify=0 (w/w.o patch): bs IOPS(read) IOPS(write) ---- ---------- ---------- 512 1600.1K/1572.1K/1393.3K 1830.3K/1823.5K/1557.2K 4k 937272/921992/762934 815304/753772/646071 32k 77369/75052/72058 97435/73180/94612 Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Suggested-by: Max Gurtovoy Suggested-by: Idan Burstein Reviewed-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 7 +- drivers/infiniband/hw/mlx5/mr.c | 181 +++++++++++++++++++++++---- drivers/infiniband/hw/mlx5/qp.c | 2 +- 3 files changed, 165 insertions(+), 25 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index d418219e68c6..405059521321 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -605,7 +605,12 @@ struct mlx5_ib_mr { int access_flags; /* Needed for rereg MR */ struct mlx5_ib_mr *parent; - struct mlx5_ib_mr *pi_mr; /* Needed for IB_MR_TYPE_INTEGRITY */ + /* Needed for IB_MR_TYPE_INTEGRITY */ + struct mlx5_ib_mr *pi_mr; + struct mlx5_ib_mr *klm_mr; + struct mlx5_ib_mr *mtt_mr; + u64 pi_iova; + atomic_t num_leaf_free; wait_queue_head_t q_leaf_free; struct mlx5_async_work cb_work; diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 36d1d6f8bb47..8b40abd0070b 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1641,8 +1641,10 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) { struct mlx5_ib_mr *mmr = to_mmr(ibmr); - if (ibmr->type == IB_MR_TYPE_INTEGRITY) - dereg_mr(to_mdev(mmr->pi_mr->ibmr.device), mmr->pi_mr); + if (ibmr->type == IB_MR_TYPE_INTEGRITY) { + dereg_mr(to_mdev(mmr->mtt_mr->ibmr.device), mmr->mtt_mr); + dereg_mr(to_mdev(mmr->klm_mr->ibmr.device), mmr->klm_mr); + } dereg_mr(to_mdev(ibmr->device), mmr); @@ -1650,7 +1652,8 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) } static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd, - u32 max_num_sg, u32 max_num_meta_sg) + u32 max_num_sg, u32 max_num_meta_sg, + int desc_size, int access_mode) { struct mlx5_ib_dev *dev = to_mdev(pd->device); int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); @@ -1673,16 +1676,17 @@ static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd, mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); MLX5_SET(mkc, mkc, free, 1); MLX5_SET(mkc, mkc, translations_octword_size, ndescs); + if (access_mode == MLX5_MKC_ACCESS_MODE_MTT) + MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); MLX5_SET(mkc, mkc, qpn, 0xffffff); MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); - mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS; + mr->access_mode = access_mode; - err = mlx5_alloc_priv_descs(pd->device, mr, - ndescs, sizeof(struct mlx5_klm)); + err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size); if (err) goto err_free_in; - mr->desc_size = sizeof(struct mlx5_klm); + mr->desc_size = desc_size; mr->max_descs = ndescs; MLX5_SET(mkc, mkc, access_mode_1_0, mr->access_mode 
& 0x3); @@ -1786,12 +1790,22 @@ static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd, mr->sig->sig_err_exists = false; /* Next UMR, Arm SIGERR */ ++mr->sig->sigerr_count; - mr->pi_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, - max_num_meta_sg); - if (IS_ERR(mr->pi_mr)) { - err = PTR_ERR(mr->pi_mr); + mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, + max_num_meta_sg, + sizeof(struct mlx5_klm), + MLX5_MKC_ACCESS_MODE_KLMS); + if (IS_ERR(mr->klm_mr)) { + err = PTR_ERR(mr->klm_mr); goto err_destroy_psv; } + mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, + max_num_meta_sg, + sizeof(struct mlx5_mtt), + MLX5_MKC_ACCESS_MODE_MTT); + if (IS_ERR(mr->mtt_mr)) { + err = PTR_ERR(mr->mtt_mr); + goto err_free_klm_mr; + } } else { mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type); err = -EINVAL; @@ -1816,9 +1830,14 @@ static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd, return &mr->ibmr; err_free_pi_mr: - if (mr->pi_mr) { - dereg_mr(to_mdev(mr->pi_mr->ibmr.device), mr->pi_mr); - mr->pi_mr = NULL; + if (mr->mtt_mr) { + dereg_mr(to_mdev(mr->mtt_mr->ibmr.device), mr->mtt_mr); + mr->mtt_mr = NULL; + } +err_free_klm_mr: + if (mr->klm_mr) { + dereg_mr(to_mdev(mr->klm_mr->ibmr.device), mr->klm_mr); + mr->klm_mr = NULL; } err_destroy_psv: if (mr->sig) { @@ -2056,16 +2075,95 @@ static int mlx5_set_page(struct ib_mr *ibmr, u64 addr) return 0; } -int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, +static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + __be64 *descs; + + if (unlikely(mr->ndescs + mr->meta_ndescs == mr->max_descs)) + return -ENOMEM; + + descs = mr->descs; + descs[mr->ndescs + mr->meta_ndescs++] = + cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR); + + return 0; +} + +static int +mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, int data_sg_nents, unsigned int *data_sg_offset, struct scatterlist *meta_sg, int meta_sg_nents, unsigned int *meta_sg_offset) { struct mlx5_ib_mr *mr = to_mmr(ibmr); - struct mlx5_ib_mr *pi_mr = mr->pi_mr; + struct mlx5_ib_mr *pi_mr = mr->mtt_mr; int n; + u64 iova; - WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY); + pi_mr->ndescs = 0; + pi_mr->meta_ndescs = 0; + pi_mr->meta_length = 0; + + ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map, + pi_mr->desc_size * pi_mr->max_descs, + DMA_TO_DEVICE); + + pi_mr->ibmr.page_size = ibmr->page_size; + n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset, + mlx5_set_page); + if (n != data_sg_nents) + return n; + + iova = pi_mr->ibmr.iova; + pi_mr->data_length = pi_mr->ibmr.length; + pi_mr->ibmr.length = pi_mr->data_length; + ibmr->length = pi_mr->data_length; + + if (meta_sg_nents) { + u64 page_mask = ~((u64)ibmr->page_size - 1); + + n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents, + meta_sg_offset, mlx5_set_page_pi); + + pi_mr->meta_length = pi_mr->ibmr.length; + /* + * PI address for the HW is the offset of the metadata address + * relative to the first data page address. + * It equals to first data page address + size of data pages + + * metadata offset at the first metadata page + */ + pi_mr->pi_iova = (iova & page_mask) + + pi_mr->ndescs * ibmr->page_size + + (pi_mr->ibmr.iova & ~page_mask); + /* + * In order to use one MTT MR for data and metadata, we register + * also the gaps between the end of the data and the start of + * the metadata (the sig MR will verify that the HW will access + * to right addresses). This mapping is safe because we use + * internal mkey for the registration. 
+ */ + pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova; + pi_mr->ibmr.iova = iova; + ibmr->length += pi_mr->meta_length; + } + + ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map, + pi_mr->desc_size * pi_mr->max_descs, + DMA_TO_DEVICE); + + return n; +} + +static int +mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, + int data_sg_nents, unsigned int *data_sg_offset, + struct scatterlist *meta_sg, int meta_sg_nents, + unsigned int *meta_sg_offset) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + struct mlx5_ib_mr *pi_mr = mr->klm_mr; + int n; pi_mr->ndescs = 0; pi_mr->meta_ndescs = 0; @@ -2078,19 +2176,56 @@ int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset, meta_sg, meta_sg_nents, meta_sg_offset); - /* This is zero-based memory region */ - pi_mr->ibmr.iova = 0; - ibmr->length = pi_mr->ibmr.length; - ibmr->iova = pi_mr->ibmr.iova; - ibmr->sig_attrs->meta_length = pi_mr->meta_length; - ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map, pi_mr->desc_size * pi_mr->max_descs, DMA_TO_DEVICE); + /* This is zero-based memory region */ + pi_mr->ibmr.iova = 0; + pi_mr->pi_iova = pi_mr->data_length; + ibmr->length = pi_mr->ibmr.length; + + return n; +} + +int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, + int data_sg_nents, unsigned int *data_sg_offset, + struct scatterlist *meta_sg, int meta_sg_nents, + unsigned int *meta_sg_offset) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + struct mlx5_ib_mr *pi_mr = mr->mtt_mr; + int n; + + WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY); + + /* + * As a performance optimization, if possible, there is no need to map + * the sg lists to KLM descriptors. First try to map the sg lists to MTT + * descriptors and fallback to KLM only in case of a failure. + * It's more efficient for the HW to work with MTT descriptors + * (especially in high load). + * Use KLM (indirect access) only if it's mandatory. + */ + n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents, + data_sg_offset, meta_sg, meta_sg_nents, + meta_sg_offset); + if (n == data_sg_nents + meta_sg_nents) + goto out; + + pi_mr = mr->klm_mr; + n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents, + data_sg_offset, meta_sg, meta_sg_nents, + meta_sg_offset); if (unlikely(n != data_sg_nents + meta_sg_nents)) return -ENOMEM; +out: + /* This is zero-based memory region */ + ibmr->iova = 0; + mr->pi_mr = pi_mr; + ibmr->sig_attrs->meta_length = pi_mr->meta_length; + return 0; } diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 60536c9c008f..f0962be5b11c 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4566,7 +4566,7 @@ static int set_sig_data_segment(const struct ib_send_wr *send_wr, if (pi_mr->meta_ndescs) { prot_len = pi_mr->meta_length; prot_key = pi_mr->ibmr.lkey; - prot_va = pi_mr->ibmr.iova + data_len; + prot_va = pi_mr->pi_iova; prot = true; } From 2563e2f30acb4c914fc475331e476fa920eb4245 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 11 Jun 2019 18:52:56 +0300 Subject: [PATCH 099/194] RDMA/mlx5: Use PA mapping for PI handover If possible, avoid doing a UMR operation to register data and protection buffers (via MTT/KLM mkeys). Instead, use the local DMA key and map the SG lists using PA access. This is safe, since the internal key for data and protection is never exposed to the remote server (only the signature key might be exposed).
If PA mappings are not possible, perform mapping using MTT/KLM descriptors. The setup of the tested benchmark (using iSER ULP): - 2 servers with 24 cores (1 initiator and 1 target) - ConnectX-4/ConnectX-5 adapters - 24 target sessions with 1 LUN each - ramdisk backstore - PI active Performance results running fio (24 jobs, 128 iodepth) using write_generate=1 and read_verify=1 (w/w.o patch): bs IOPS(read) IOPS(write) ---- ---------- ---------- 512 1266.4K/1262.4K 1720.1K/1732.1K 4k 793139/570902 1129.6K/773982 32k 72660/72086 97229/96164 Using write_generate=0 and read_verify=0 (w/w.o patch): bs IOPS(read) IOPS(write) ---- ---------- ---------- 512 1590.2K/1600.1K 1828.2K/1830.3K 4k 1078.1K/937272 1142.1K/815304 32k 77012/77369 98125/97435 Signed-off-by: Max Gurtovoy Signed-off-by: Israel Rukshin Suggested-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 + drivers/infiniband/hw/mlx5/mr.c | 63 +++++++++++++++++++++-- drivers/infiniband/hw/mlx5/qp.c | 76 +++++++++++++++++++--------- 3 files changed, 112 insertions(+), 28 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 405059521321..bdb83fc85f94 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -609,6 +609,7 @@ struct mlx5_ib_mr { struct mlx5_ib_mr *pi_mr; struct mlx5_ib_mr *klm_mr; struct mlx5_ib_mr *mtt_mr; + u64 data_iova; u64 pi_iova; atomic_t num_leaf_free; diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 8b40abd0070b..f2ef89e48afa 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -2001,6 +2001,40 @@ done: return ret; } +static int +mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, + int data_sg_nents, unsigned int *data_sg_offset, + struct scatterlist *meta_sg, int meta_sg_nents, + unsigned int *meta_sg_offset) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + unsigned int sg_offset = 0; + int n = 0; + + mr->meta_length = 0; + if (data_sg_nents == 1) { + n++; + mr->ndescs = 1; + if (data_sg_offset) + sg_offset = *data_sg_offset; + mr->data_length = sg_dma_len(data_sg) - sg_offset; + mr->data_iova = sg_dma_address(data_sg) + sg_offset; + if (meta_sg_nents == 1) { + n++; + mr->meta_ndescs = 1; + if (meta_sg_offset) + sg_offset = *meta_sg_offset; + else + sg_offset = 0; + mr->meta_length = sg_dma_len(meta_sg) - sg_offset; + mr->pi_iova = sg_dma_address(meta_sg) + sg_offset; + } + ibmr->length = mr->data_length + mr->meta_length; + } + + return n; +} + static int mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr, struct scatterlist *sgl, @@ -2099,7 +2133,6 @@ mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, struct mlx5_ib_mr *mr = to_mmr(ibmr); struct mlx5_ib_mr *pi_mr = mr->mtt_mr; int n; - u64 iova; pi_mr->ndescs = 0; pi_mr->meta_ndescs = 0; @@ -2115,13 +2148,14 @@ mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, if (n != data_sg_nents) return n; - iova = pi_mr->ibmr.iova; + pi_mr->data_iova = pi_mr->ibmr.iova; pi_mr->data_length = pi_mr->ibmr.length; pi_mr->ibmr.length = pi_mr->data_length; ibmr->length = pi_mr->data_length; if (meta_sg_nents) { u64 page_mask = ~((u64)ibmr->page_size - 1); + u64 iova = pi_mr->data_iova; n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents, meta_sg_offset, mlx5_set_page_pi); @@ -2181,6 +2215,7 @@ mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, DMA_TO_DEVICE); /* This is zero-based memory region */ + 
pi_mr->data_iova = 0; pi_mr->ibmr.iova = 0; pi_mr->pi_iova = pi_mr->data_length; ibmr->length = pi_mr->ibmr.length; @@ -2194,11 +2229,27 @@ int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, unsigned int *meta_sg_offset) { struct mlx5_ib_mr *mr = to_mmr(ibmr); - struct mlx5_ib_mr *pi_mr = mr->mtt_mr; + struct mlx5_ib_mr *pi_mr = NULL; int n; WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY); + mr->ndescs = 0; + mr->data_length = 0; + mr->data_iova = 0; + mr->meta_ndescs = 0; + mr->pi_iova = 0; + /* + * As a performance optimization, if possible, there is no need to + * perform UMR operation to register the data/metadata buffers. + * First try to map the sg lists to PA descriptors with local_dma_lkey. + * Fallback to UMR only in case of a failure. + */ + n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents, + data_sg_offset, meta_sg, meta_sg_nents, + meta_sg_offset); + if (n == data_sg_nents + meta_sg_nents) + goto out; /* * As a performance optimization, if possible, there is no need to map * the sg lists to KLM descriptors. First try to map the sg lists to MTT @@ -2207,6 +2258,7 @@ int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, * (especially in high load). * Use KLM (indirect access) only if it's mandatory. */ + pi_mr = mr->mtt_mr; n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents, data_sg_offset, meta_sg, meta_sg_nents, meta_sg_offset); @@ -2224,7 +2276,10 @@ out: /* This is zero-based memory region */ ibmr->iova = 0; mr->pi_mr = pi_mr; - ibmr->sig_attrs->meta_length = pi_mr->meta_length; + if (pi_mr) + ibmr->sig_attrs->meta_length = pi_mr->meta_length; + else + ibmr->sig_attrs->meta_length = mr->meta_length; return 0; } diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index f0962be5b11c..4fbf60fed374 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4562,7 +4562,7 @@ static int set_sig_data_segment(const struct ib_send_wr *send_wr, data_len = pi_mr->data_length; data_key = pi_mr->ibmr.lkey; - data_va = pi_mr->ibmr.iova; + data_va = pi_mr->data_iova; if (pi_mr->meta_ndescs) { prot_len = pi_mr->meta_length; prot_key = pi_mr->ibmr.lkey; @@ -4912,6 +4912,7 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, struct mlx5_ib_qp *qp; struct mlx5_ib_mr *mr; struct mlx5_ib_mr *pi_mr; + struct mlx5_ib_mr pa_pi_mr; struct ib_sig_attrs *sig_attrs; struct mlx5_wqe_xrc_seg *xrc; struct mlx5_bf *bf; @@ -5026,35 +5027,62 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, break; case IB_WR_REG_MR_INTEGRITY: - memset(®_pi_wr, 0, sizeof(struct ib_reg_wr)); + qp->sq.wr_data[idx] = IB_WR_REG_MR_INTEGRITY; mr = to_mmr(reg_wr(wr)->mr); pi_mr = mr->pi_mr; - reg_pi_wr.mr = &pi_mr->ibmr; - reg_pi_wr.access = reg_wr(wr)->access; - reg_pi_wr.key = pi_mr->ibmr.rkey; + if (pi_mr) { + memset(®_pi_wr, 0, + sizeof(struct ib_reg_wr)); - qp->sq.wr_data[idx] = IB_WR_REG_MR_INTEGRITY; - ctrl->imm = cpu_to_be32(reg_pi_wr.key); - /* UMR for data + protection registration */ - err = set_reg_wr(qp, ®_pi_wr, &seg, &size, - &cur_edge, false); - if (err) { - *bad_wr = wr; - goto out; - } - finish_wqe(qp, ctrl, seg, size, cur_edge, idx, - wr->wr_id, nreq, fence, - MLX5_OPCODE_UMR); + reg_pi_wr.mr = &pi_mr->ibmr; + reg_pi_wr.access = reg_wr(wr)->access; + reg_pi_wr.key = pi_mr->ibmr.rkey; - err = begin_wqe(qp, &seg, &ctrl, wr, &idx, - &size, &cur_edge, nreq); - if (err) { - mlx5_ib_warn(dev, "\n"); - err = -ENOMEM; - *bad_wr = wr; - goto out; + 
ctrl->imm = cpu_to_be32(reg_pi_wr.key); + /* UMR for data + prot registration */ + err = set_reg_wr(qp, ®_pi_wr, &seg, + &size, &cur_edge, + false); + if (err) { + *bad_wr = wr; + goto out; + } + finish_wqe(qp, ctrl, seg, size, + cur_edge, idx, wr->wr_id, + nreq, fence, + MLX5_OPCODE_UMR); + + err = begin_wqe(qp, &seg, &ctrl, wr, + &idx, &size, &cur_edge, + nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + } else { + memset(&pa_pi_mr, 0, + sizeof(struct mlx5_ib_mr)); + /* No UMR, use local_dma_lkey */ + pa_pi_mr.ibmr.lkey = + mr->ibmr.pd->local_dma_lkey; + + pa_pi_mr.ndescs = mr->ndescs; + pa_pi_mr.data_length = mr->data_length; + pa_pi_mr.data_iova = mr->data_iova; + if (mr->meta_ndescs) { + pa_pi_mr.meta_ndescs = + mr->meta_ndescs; + pa_pi_mr.meta_length = + mr->meta_length; + pa_pi_mr.pi_iova = mr->pi_iova; + } + + pa_pi_mr.ibmr.length = mr->ibmr.length; + mr->pi_mr = &pa_pi_mr; } ctrl->imm = cpu_to_be32(mr->ibmr.rkey); /* UMR for sig MR */ From 7796d2a3bb4037f9c51b8d91d059f1d690ed301f Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 11 Jun 2019 18:52:57 +0300 Subject: [PATCH 100/194] RDMA/mlx5: Refactor MR descriptors allocation Improve code readability using static helpers for each memory region type. Re-use the common logic to get smaller functions that are easy to maintain and reduce code duplication. Signed-off-by: Max Gurtovoy Signed-off-by: Israel Rukshin Reviewed-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mr.c | 290 +++++++++++++++++--------------- 1 file changed, 157 insertions(+), 133 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index f2ef89e48afa..6ac77e09a34a 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1651,15 +1651,63 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) return 0; } +static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs, + int access_mode, int page_shift) +{ + void *mkc; + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + MLX5_SET(mkc, mkc, translations_octword_size, ndescs); + MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7); + MLX5_SET(mkc, mkc, umr_en, 1); + MLX5_SET(mkc, mkc, log_page_size, page_shift); +} + +static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, + int ndescs, int desc_size, int page_shift, + int access_mode, u32 *in, int inlen) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int err; + + mr->access_mode = access_mode; + mr->desc_size = desc_size; + mr->max_descs = ndescs; + + err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size); + if (err) + return err; + + mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift); + + err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen); + if (err) + goto err_free_descs; + + mr->mmkey.type = MLX5_MKEY_MR; + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; + + return 0; + +err_free_descs: + mlx5_free_priv_descs(mr); + return err; +} + static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd, u32 max_num_sg, u32 max_num_meta_sg, int desc_size, int access_mode) { - struct mlx5_ib_dev *dev = to_mdev(pd->device); int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4); + int page_shift = 
0; struct mlx5_ib_mr *mr; - void *mkc; u32 *in; int err; @@ -1667,48 +1715,28 @@ static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd, if (!mr) return ERR_PTR(-ENOMEM); + mr->ibmr.pd = pd; + mr->ibmr.device = pd->device; + in = kzalloc(inlen, GFP_KERNEL); if (!in) { err = -ENOMEM; goto err_free; } - mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); - MLX5_SET(mkc, mkc, free, 1); - MLX5_SET(mkc, mkc, translations_octword_size, ndescs); if (access_mode == MLX5_MKC_ACCESS_MODE_MTT) - MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); - MLX5_SET(mkc, mkc, qpn, 0xffffff); - MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + page_shift = PAGE_SHIFT; - mr->access_mode = access_mode; - - err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size); + err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift, + access_mode, in, inlen); if (err) goto err_free_in; - mr->desc_size = desc_size; - mr->max_descs = ndescs; - MLX5_SET(mkc, mkc, access_mode_1_0, mr->access_mode & 0x3); - MLX5_SET(mkc, mkc, access_mode_4_2, (mr->access_mode >> 2) & 0x7); - MLX5_SET(mkc, mkc, umr_en, 1); - - mr->ibmr.pd = pd; - mr->ibmr.device = pd->device; - err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen); - if (err) - goto err_priv_descs; - - mr->mmkey.type = MLX5_MKEY_MR; - mr->ibmr.lkey = mr->mmkey.key; - mr->ibmr.rkey = mr->mmkey.key; mr->umem = NULL; kfree(in); return mr; -err_priv_descs: - mlx5_free_priv_descs(mr); err_free_in: kfree(in); err_free: @@ -1716,6 +1744,92 @@ err_free: return ERR_PTR(err); } +static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, + int ndescs, u32 *in, int inlen) +{ + return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt), + PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in, + inlen); +} + +static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, + int ndescs, u32 *in, int inlen) +{ + return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm), + 0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen); +} + +static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, + int max_num_sg, int max_num_meta_sg, + u32 *in, int inlen) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + u32 psv_index[2]; + void *mkc; + int err; + + mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); + if (!mr->sig) + return -ENOMEM; + + /* create mem & wire PSVs */ + err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index); + if (err) + goto err_free_sig; + + mr->sig->psv_memory.psv_idx = psv_index[0]; + mr->sig->psv_wire.psv_idx = psv_index[1]; + + mr->sig->sig_status_checked = true; + mr->sig->sig_err_exists = false; + /* Next UMR, Arm SIGERR */ + ++mr->sig->sigerr_count; + mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg, + sizeof(struct mlx5_klm), + MLX5_MKC_ACCESS_MODE_KLMS); + if (IS_ERR(mr->klm_mr)) { + err = PTR_ERR(mr->klm_mr); + goto err_destroy_psv; + } + mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg, + sizeof(struct mlx5_mtt), + MLX5_MKC_ACCESS_MODE_MTT); + if (IS_ERR(mr->mtt_mr)) { + err = PTR_ERR(mr->mtt_mr); + goto err_free_klm_mr; + } + + /* Set bsf descriptors for mkey */ + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, bsf_en, 1); + MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE); + + err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0, + MLX5_MKC_ACCESS_MODE_KLMS, in, inlen); + if (err) + goto err_free_mtt_mr; + + return 0; + +err_free_mtt_mr: + dereg_mr(to_mdev(mr->mtt_mr->ibmr.device), 
mr->mtt_mr); + mr->mtt_mr = NULL; +err_free_klm_mr: + dereg_mr(to_mdev(mr->klm_mr->ibmr.device), mr->klm_mr); + mr->klm_mr = NULL; +err_destroy_psv: + if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", + mr->sig->psv_memory.psv_idx); + if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", + mr->sig->psv_wire.psv_idx); +err_free_sig: + kfree(mr->sig); + + return err; +} + static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg, u32 max_num_meta_sg) @@ -1724,7 +1838,6 @@ static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd, int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); int ndescs = ALIGN(max_num_sg, 4); struct mlx5_ib_mr *mr; - void *mkc; u32 *in; int err; @@ -1738,121 +1851,32 @@ static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd, goto err_free; } - mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); - MLX5_SET(mkc, mkc, free, 1); - MLX5_SET(mkc, mkc, qpn, 0xffffff); - MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + mr->ibmr.device = pd->device; + mr->umem = NULL; - if (mr_type == IB_MR_TYPE_MEM_REG) { - mr->access_mode = MLX5_MKC_ACCESS_MODE_MTT; - MLX5_SET(mkc, mkc, translations_octword_size, ndescs); - MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); - err = mlx5_alloc_priv_descs(pd->device, mr, - ndescs, sizeof(struct mlx5_mtt)); - if (err) - goto err_free_in; - - mr->desc_size = sizeof(struct mlx5_mtt); - mr->max_descs = ndescs; - } else if (mr_type == IB_MR_TYPE_SG_GAPS) { - mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS; - MLX5_SET(mkc, mkc, translations_octword_size, ndescs); - - err = mlx5_alloc_priv_descs(pd->device, mr, - ndescs, sizeof(struct mlx5_klm)); - if (err) - goto err_free_in; - mr->desc_size = sizeof(struct mlx5_klm); - mr->max_descs = ndescs; - } else if (mr_type == IB_MR_TYPE_INTEGRITY) { - u32 psv_index[2]; - - MLX5_SET(mkc, mkc, bsf_en, 1); - MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE); - MLX5_SET(mkc, mkc, translations_octword_size, 4); - mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); - if (!mr->sig) { - err = -ENOMEM; - goto err_free_in; - } - - /* create mem & wire PSVs */ - err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, - 2, psv_index); - if (err) - goto err_free_sig; - - mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS; - mr->sig->psv_memory.psv_idx = psv_index[0]; - mr->sig->psv_wire.psv_idx = psv_index[1]; - - mr->sig->sig_status_checked = true; - mr->sig->sig_err_exists = false; - /* Next UMR, Arm SIGERR */ - ++mr->sig->sigerr_count; - mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, - max_num_meta_sg, - sizeof(struct mlx5_klm), - MLX5_MKC_ACCESS_MODE_KLMS); - if (IS_ERR(mr->klm_mr)) { - err = PTR_ERR(mr->klm_mr); - goto err_destroy_psv; - } - mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, - max_num_meta_sg, - sizeof(struct mlx5_mtt), - MLX5_MKC_ACCESS_MODE_MTT); - if (IS_ERR(mr->mtt_mr)) { - err = PTR_ERR(mr->mtt_mr); - goto err_free_klm_mr; - } - } else { + switch (mr_type) { + case IB_MR_TYPE_MEM_REG: + err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen); + break; + case IB_MR_TYPE_SG_GAPS: + err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen); + break; + case IB_MR_TYPE_INTEGRITY: + err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg, + max_num_meta_sg, in, inlen); + break; + default: mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type); err = -EINVAL; - goto err_free_in; } - MLX5_SET(mkc, mkc, access_mode_1_0, mr->access_mode & 0x3); 
- MLX5_SET(mkc, mkc, access_mode_4_2, (mr->access_mode >> 2) & 0x7); - MLX5_SET(mkc, mkc, umr_en, 1); - - mr->ibmr.device = pd->device; - err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen); if (err) - goto err_free_pi_mr; + goto err_free_in; - mr->mmkey.type = MLX5_MKEY_MR; - mr->ibmr.lkey = mr->mmkey.key; - mr->ibmr.rkey = mr->mmkey.key; - mr->umem = NULL; kfree(in); return &mr->ibmr; -err_free_pi_mr: - if (mr->mtt_mr) { - dereg_mr(to_mdev(mr->mtt_mr->ibmr.device), mr->mtt_mr); - mr->mtt_mr = NULL; - } -err_free_klm_mr: - if (mr->klm_mr) { - dereg_mr(to_mdev(mr->klm_mr->ibmr.device), mr->klm_mr); - mr->klm_mr = NULL; - } -err_destroy_psv: - if (mr->sig) { - if (mlx5_core_destroy_psv(dev->mdev, - mr->sig->psv_memory.psv_idx)) - mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", - mr->sig->psv_memory.psv_idx); - if (mlx5_core_destroy_psv(dev->mdev, - mr->sig->psv_wire.psv_idx)) - mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", - mr->sig->psv_wire.psv_idx); - } - mlx5_free_priv_descs(mr); -err_free_sig: - kfree(mr->sig); err_free_in: kfree(in); err_free: From 7ef7587541d49c6de1c9650c3c819a7fdd7ade66 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 24 Jun 2019 22:46:08 +0100 Subject: [PATCH 101/194] RDMA/hns: fix potential integer overflow on left shift There is a potential integer overflow when int i is left shifted as this is evaluated using 32 bit arithmetic but is being used in a context that expects an expression of type dma_addr_t. Fix this by casting integer i to dma_addr_t before shifting to avoid the overflow. Addresses-Coverity: ("Unintentional integer overflow") Fixes: 2ac0bc5e725e ("RDMA/hns: Add a group interfaces for optimizing buffers getting flow") Signed-off-by: Colin Ian King Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c index 14fcc359599c..2c8defa94107 100644 --- a/drivers/infiniband/hw/hns/hns_roce_alloc.c +++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c @@ -257,7 +257,7 @@ int hns_roce_get_kmem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, for (i = start; i < end; i++) if (buf->nbufs == 1) bufs[total++] = buf->direct.map + - (i << buf->page_shift); + ((dma_addr_t)i << buf->page_shift); else bufs[total++] = buf->page_list[i].map; From b417c0879db72f810ca81d88b719e70d20566857 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 8 Jun 2019 12:27:14 +0300 Subject: [PATCH 102/194] RDMA/hns: Fix an error code in hns_roce_set_user_sq_size() This function is supposed to return negative kernel error codes but here it returns CMD_RST_PRC_EBUSY (2). 
The error code eventually gets passed to IS_ERR() and since it's not an error pointer it leads to an Oops in hns_roce_v1_rsv_lp_qp(). Signed-off-by: Dan Carpenter Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index edd62b4dc0a0..7f2da5e10e67 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1098,7 +1098,7 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, if (ret == CMD_RST_PRC_SUCCESS) return 0; if (ret == CMD_RST_PRC_EBUSY) - return ret; + return -EBUSY; ret = __hns_roce_cmq_send(hr_dev, desc, num); if (ret) { @@ -1106,7 +1106,7 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, if (retval == CMD_RST_PRC_SUCCESS) return 0; else if (retval == CMD_RST_PRC_EBUSY) - return retval; + return -EBUSY; } return ret; From 97162a1ee8a1735fc7a7159fe08de966d88354ce Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sat, 8 Jun 2019 23:27:03 -0300 Subject: [PATCH 103/194] docs: infiniband: convert docs to ReST and rename to *.rst The InfiniBand docs are plain text with no markup. So, all we needed to do was add the title markup and some markup sequences in order to properly parse tables, lists and literal blocks. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jason Gunthorpe --- .../{core_locking.txt => core_locking.rst} | 64 ++++++----- Documentation/infiniband/index.rst | 23 ++++ .../infiniband/{ipoib.txt => ipoib.rst} | 24 ++-- .../infiniband/{opa_vnic.txt => opa_vnic.rst} | 108 +++++++++--------- .../infiniband/{sysfs.txt => sysfs.rst} | 4 +- .../{tag_matching.txt => tag_matching.rst} | 5 + .../infiniband/{user_mad.txt => user_mad.rst} | 33 ++++-- .../{user_verbs.txt => user_verbs.rst} | 12 +- drivers/infiniband/core/user_mad.c | 2 +- drivers/infiniband/ulp/ipoib/Kconfig | 2 +- 10 files changed, 174 insertions(+), 103 deletions(-) rename Documentation/infiniband/{core_locking.txt => core_locking.rst} (78%) create mode 100644 Documentation/infiniband/index.rst rename Documentation/infiniband/{ipoib.txt => ipoib.rst} (90%) rename Documentation/infiniband/{opa_vnic.txt => opa_vnic.rst} (63%) rename Documentation/infiniband/{sysfs.txt => sysfs.rst} (69%) rename Documentation/infiniband/{tag_matching.txt => tag_matching.rst} (98%) rename Documentation/infiniband/{user_mad.txt => user_mad.rst} (90%) rename Documentation/infiniband/{user_verbs.txt => user_verbs.rst} (93%) diff --git a/Documentation/infiniband/core_locking.txt b/Documentation/infiniband/core_locking.rst similarity index 78% rename from Documentation/infiniband/core_locking.txt rename to Documentation/infiniband/core_locking.rst index 4b1f36b6ada0..f34669beb4fe 100644 --- a/Documentation/infiniband/core_locking.txt +++ b/Documentation/infiniband/core_locking.rst @@ -1,4 +1,6 @@ -INFINIBAND MIDLAYER LOCKING +=========================== +InfiniBand Midlayer Locking +=========================== This guide is an attempt to make explicit the locking assumptions made by the InfiniBand midlayer. It describes the requirements on @@ -6,45 +8,47 @@ INFINIBAND MIDLAYER LOCKING protocols that use the midlayer.
Sleeping and interrupt context +============================== With the following exceptions, a low-level driver implementation of all of the methods in struct ib_device may sleep. The exceptions are any methods from the list: - create_ah - modify_ah - query_ah - destroy_ah - post_send - post_recv - poll_cq - req_notify_cq - map_phys_fmr + - create_ah + - modify_ah + - query_ah + - destroy_ah + - post_send + - post_recv + - poll_cq + - req_notify_cq + - map_phys_fmr which may not sleep and must be callable from any context. The corresponding functions exported to upper level protocol consumers: - ib_create_ah - ib_modify_ah - ib_query_ah - ib_destroy_ah - ib_post_send - ib_post_recv - ib_req_notify_cq - ib_map_phys_fmr + - ib_create_ah + - ib_modify_ah + - ib_query_ah + - ib_destroy_ah + - ib_post_send + - ib_post_recv + - ib_req_notify_cq + - ib_map_phys_fmr are therefore safe to call from any context. In addition, the function - ib_dispatch_event + - ib_dispatch_event used by low-level drivers to dispatch asynchronous events through the midlayer is also safe to call from any context. Reentrancy +---------- All of the methods in struct ib_device exported by a low-level driver must be fully reentrant. The low-level driver is required to @@ -62,6 +66,7 @@ Reentrancy information between different calls of ib_poll_cq() is not defined. Callbacks +--------- A low-level driver must not perform a callback directly from the same callchain as an ib_device method call. For example, it is not @@ -74,18 +79,18 @@ Callbacks completion event handlers for the same CQ are not called simultaneously. The driver must guarantee that only one CQ event handler for a given CQ is running at a time. In other words, the - following situation is not allowed: + following situation is not allowed:: - CPU1 CPU2 + CPU1 CPU2 - low-level driver -> - consumer CQ event callback: - /* ... */ - ib_req_notify_cq(cq, ...); - low-level driver -> - /* ... */ consumer CQ event callback: - /* ... */ - return from CQ event handler + low-level driver -> + consumer CQ event callback: + /* ... */ + ib_req_notify_cq(cq, ...); + low-level driver -> + /* ... */ consumer CQ event callback: + /* ... */ + return from CQ event handler The context in which completion event and asynchronous event callbacks run is not defined. Depending on the low-level driver, it @@ -93,6 +98,7 @@ Callbacks Upper level protocol consumers may not sleep in a callback. Hot-plug +-------- A low-level driver announces that a device is ready for use by consumers when it calls ib_register_device(), all initialization diff --git a/Documentation/infiniband/index.rst b/Documentation/infiniband/index.rst new file mode 100644 index 000000000000..22eea64de722 --- /dev/null +++ b/Documentation/infiniband/index.rst @@ -0,0 +1,23 @@ +:orphan: + +========== +InfiniBand +========== + +.. toctree:: + :maxdepth: 1 + + core_locking + ipoib + opa_vnic + sysfs + tag_matching + user_mad + user_verbs + +.. 
only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/infiniband/ipoib.txt b/Documentation/infiniband/ipoib.rst similarity index 90% rename from Documentation/infiniband/ipoib.txt rename to Documentation/infiniband/ipoib.rst index 47c1dd9818f2..0dd36154c0c9 100644 --- a/Documentation/infiniband/ipoib.txt +++ b/Documentation/infiniband/ipoib.rst @@ -1,4 +1,6 @@ -IP OVER INFINIBAND +================== +IP over InfiniBand +================== The ib_ipoib driver is an implementation of the IP over InfiniBand protocol as specified by RFC 4391 and 4392, issued by the IETF ipoib @@ -8,16 +10,17 @@ IP OVER INFINIBAND masqueraded to the kernel as ethernet interfaces). Partitions and P_Keys +===================== When the IPoIB driver is loaded, it creates one interface for each port using the P_Key at index 0. To create an interface with a different P_Key, write the desired P_Key into the main interface's - /sys/class/net//create_child file. For example: + /sys/class/net//create_child file. For example:: echo 0x8001 > /sys/class/net/ib0/create_child This will create an interface named ib0.8001 with P_Key 0x8001. To - remove a subinterface, use the "delete_child" file: + remove a subinterface, use the "delete_child" file:: echo 0x8001 > /sys/class/net/ib0/delete_child @@ -28,6 +31,7 @@ Partitions and P_Keys rtnl_link_ops, where children created using either way behave the same. Datagram vs Connected modes +=========================== The IPoIB driver supports two modes of operation: datagram and connected. The mode is set and read through an interface's @@ -51,6 +55,7 @@ Datagram vs Connected modes networking stack to use the smaller UD MTU for these neighbours. Stateless offloads +================== If the IB HW supports IPoIB stateless offloads, IPoIB advertises TCP/IP checksum and/or Large Send (LSO) offloading capability to the @@ -60,9 +65,10 @@ Stateless offloads on/off using ethtool calls. Currently LRO is supported only for checksum offload capable devices. - Stateless offloads are supported only in datagram mode. + Stateless offloads are supported only in datagram mode. Interrupt moderation +==================== If the underlying IB device supports CQ event moderation, one can use ethtool to set interrupt mitigation parameters and thus reduce @@ -71,6 +77,7 @@ Interrupt moderation moderation is supported. Debugging Information +===================== By compiling the IPoIB driver with CONFIG_INFINIBAND_IPOIB_DEBUG set to 'y', tracing messages are compiled into the driver. They are @@ -79,7 +86,7 @@ Debugging Information runtime through files in /sys/module/ib_ipoib/. CONFIG_INFINIBAND_IPOIB_DEBUG also enables files in the debugfs - virtual filesystem. By mounting this filesystem, for example with + virtual filesystem. By mounting this filesystem, for example with:: mount -t debugfs none /sys/kernel/debug @@ -96,10 +103,13 @@ Debugging Information performance, because it adds tests to the fast path. 
References +========== Transmission of IP over InfiniBand (IPoIB) (RFC 4391) - http://ietf.org/rfc/rfc4391.txt + http://ietf.org/rfc/rfc4391.txt + IP over InfiniBand (IPoIB) Architecture (RFC 4392) - http://ietf.org/rfc/rfc4392.txt + http://ietf.org/rfc/rfc4392.txt + IP over InfiniBand: Connected Mode (RFC 4755) http://ietf.org/rfc/rfc4755.txt diff --git a/Documentation/infiniband/opa_vnic.txt b/Documentation/infiniband/opa_vnic.rst similarity index 63% rename from Documentation/infiniband/opa_vnic.txt rename to Documentation/infiniband/opa_vnic.rst index 282e17be798a..2f888d9ffec0 100644 --- a/Documentation/infiniband/opa_vnic.txt +++ b/Documentation/infiniband/opa_vnic.rst @@ -1,3 +1,7 @@ +================================================================= +Intel Omni-Path (OPA) Virtual Network Interface Controller (VNIC) +================================================================= + Intel Omni-Path (OPA) Virtual Network Interface Controller (VNIC) feature supports Ethernet functionality over Omni-Path fabric by encapsulating the Ethernet packets between HFI nodes. @@ -17,70 +21,72 @@ an independent Ethernet network. The configuration is performed by an Ethernet Manager (EM) which is part of the trusted Fabric Manager (FM) application. HFI nodes can have multiple VNICs each connected to a different virtual Ethernet switch. The below diagram presents a case -of two virtual Ethernet switches with two HFI nodes. +of two virtual Ethernet switches with two HFI nodes:: - +-------------------+ - | Subnet/ | - | Ethernet | - | Manager | - +-------------------+ - / / - / / - / / - / / -+-----------------------------+ +------------------------------+ -| Virtual Ethernet Switch | | Virtual Ethernet Switch | -| +---------+ +---------+ | | +---------+ +---------+ | -| | VPORT | | VPORT | | | | VPORT | | VPORT | | -+--+---------+----+---------+-+ +-+---------+----+---------+---+ - | \ / | - | \ / | - | \/ | - | / \ | - | / \ | - +-----------+------------+ +-----------+------------+ - | VNIC | VNIC | | VNIC | VNIC | - +-----------+------------+ +-----------+------------+ - | HFI | | HFI | - +------------------------+ +------------------------+ + +-------------------+ + | Subnet/ | + | Ethernet | + | Manager | + +-------------------+ + / / + / / + / / + / / + +-----------------------------+ +------------------------------+ + | Virtual Ethernet Switch | | Virtual Ethernet Switch | + | +---------+ +---------+ | | +---------+ +---------+ | + | | VPORT | | VPORT | | | | VPORT | | VPORT | | + +--+---------+----+---------+-+ +-+---------+----+---------+---+ + | \ / | + | \ / | + | \/ | + | / \ | + | / \ | + +-----------+------------+ +-----------+------------+ + | VNIC | VNIC | | VNIC | VNIC | + +-----------+------------+ +-----------+------------+ + | HFI | | HFI | + +------------------------+ +------------------------+ The Omni-Path encapsulated Ethernet packet format is as described below. 
-Bits Field ------------------------------------- +==================== ================================ +Bits Field +==================== ================================ Quad Word 0: -0-19 SLID (lower 20 bits) -20-30 Length (in Quad Words) -31 BECN bit -32-51 DLID (lower 20 bits) -52-56 SC (Service Class) -57-59 RC (Routing Control) -60 FECN bit -61-62 L2 (=10, 16B format) -63 LT (=1, Link Transfer Head Flit) +0-19 SLID (lower 20 bits) +20-30 Length (in Quad Words) +31 BECN bit +32-51 DLID (lower 20 bits) +52-56 SC (Service Class) +57-59 RC (Routing Control) +60 FECN bit +61-62 L2 (=10, 16B format) +63 LT (=1, Link Transfer Head Flit) Quad Word 1: -0-7 L4 type (=0x78 ETHERNET) -8-11 SLID[23:20] -12-15 DLID[23:20] -16-31 PKEY -32-47 Entropy -48-63 Reserved +0-7 L4 type (=0x78 ETHERNET) +8-11 SLID[23:20] +12-15 DLID[23:20] +16-31 PKEY +32-47 Entropy +48-63 Reserved Quad Word 2: -0-15 Reserved -16-31 L4 header -32-63 Ethernet Packet +0-15 Reserved +16-31 L4 header +32-63 Ethernet Packet Quad Words 3 to N-1: -0-63 Ethernet packet (pad extended) +0-63 Ethernet packet (pad extended) Quad Word N (last): -0-23 Ethernet packet (pad extended) -24-55 ICRC -56-61 Tail -62-63 LT (=01, Link Transfer Tail Flit) +0-23 Ethernet packet (pad extended) +24-55 ICRC +56-61 Tail +62-63 LT (=01, Link Transfer Tail Flit) +==================== ================================ Ethernet packet is padded on the transmit side to ensure that the VNIC OPA packet is quad word aligned. The 'Tail' field contains the number of bytes @@ -123,7 +129,7 @@ operation. It also handles the encapsulation of Ethernet packets with an Omni-Path header in the transmit path. For each VNIC interface, the information required for encapsulation is configured by the EM via VEMA MAD interface. It also passes any control information to the HW dependent driver -by invoking the RDMA netdev control operations. +by invoking the RDMA netdev control operations:: +-------------------+ +----------------------+ | | | Linux | diff --git a/Documentation/infiniband/sysfs.txt b/Documentation/infiniband/sysfs.rst similarity index 69% rename from Documentation/infiniband/sysfs.txt rename to Documentation/infiniband/sysfs.rst index 9fab5062f84b..f0abd6fa48f4 100644 --- a/Documentation/infiniband/sysfs.txt +++ b/Documentation/infiniband/sysfs.rst @@ -1,4 +1,6 @@ -SYSFS FILES +=========== +Sysfs files +=========== The sysfs interface has moved to Documentation/ABI/stable/sysfs-class-infiniband. diff --git a/Documentation/infiniband/tag_matching.txt b/Documentation/infiniband/tag_matching.rst similarity index 98% rename from Documentation/infiniband/tag_matching.txt rename to Documentation/infiniband/tag_matching.rst index d2a3bf819226..ef56ea585f92 100644 --- a/Documentation/infiniband/tag_matching.txt +++ b/Documentation/infiniband/tag_matching.rst @@ -1,12 +1,16 @@ +================== Tag matching logic +================== The MPI standard defines a set of rules, known as tag-matching, for matching source send operations to destination receives. 
The following parameters must match the following source and destination parameters: + * Communicator * User tag - wild card may be specified by the receiver * Source rank – wild car may be specified by the receiver * Destination rank – wild + The ordering rules require that when more than one pair of send and receive message envelopes may match, the pair that includes the earliest posted-send and the earliest posted-receive is the pair that must be used to satisfy the @@ -35,6 +39,7 @@ the header to initiate an RDMA READ operation directly to the matching buffer. A fin message needs to be received in order for the buffer to be reused. Tag matching implementation +=========================== There are two types of matching objects used, the posted receive list and the unexpected message list. The application posts receive buffers through calls diff --git a/Documentation/infiniband/user_mad.txt b/Documentation/infiniband/user_mad.rst similarity index 90% rename from Documentation/infiniband/user_mad.txt rename to Documentation/infiniband/user_mad.rst index 7aca13a54a3a..d88abfc0e370 100644 --- a/Documentation/infiniband/user_mad.txt +++ b/Documentation/infiniband/user_mad.rst @@ -1,6 +1,9 @@ -USERSPACE MAD ACCESS +==================== +Userspace MAD access +==================== Device files +============ Each port of each InfiniBand device has a "umad" device and an "issm" device attached. For example, a two-port HCA will have two @@ -8,12 +11,13 @@ Device files device of each type (for switch port 0). Creating MAD agents +=================== A MAD agent can be created by filling in a struct ib_user_mad_reg_req and then calling the IB_USER_MAD_REGISTER_AGENT ioctl on a file descriptor for the appropriate device file. If the registration request succeeds, a 32-bit id will be returned in the structure. - For example: + For example:: struct ib_user_mad_reg_req req = { /* ... */ }; ret = ioctl(fd, IB_USER_MAD_REGISTER_AGENT, (char *) &req); @@ -26,12 +30,14 @@ Creating MAD agents ioctl. Also, all agents registered through a file descriptor will be unregistered when the descriptor is closed. - 2014 -- a new registration ioctl is now provided which allows additional + 2014 + a new registration ioctl is now provided which allows additional fields to be provided during registration. Users of this registration call are implicitly setting the use of pkey_index (see below). Receiving MADs +============== MADs are received using read(). The receive side now supports RMPP. The buffer passed to read() must be at least one @@ -41,7 +47,8 @@ Receiving MADs MAD (RMPP), the errno is set to ENOSPC and the length of the buffer needed is set in mad.length. - Example for normal MAD (non RMPP) reads: + Example for normal MAD (non RMPP) reads:: + struct ib_user_mad *mad; mad = malloc(sizeof *mad + 256); ret = read(fd, mad, sizeof *mad + 256); @@ -50,7 +57,8 @@ Receiving MADs free(mad); } - Example for RMPP reads: + Example for RMPP reads:: + struct ib_user_mad *mad; mad = malloc(sizeof *mad + 256); ret = read(fd, mad, sizeof *mad + 256); @@ -76,11 +84,12 @@ Receiving MADs poll()/select() may be used to wait until a MAD can be read. Sending MADs +============ MADs are sent using write(). The agent ID for sending should be filled into the id field of the MAD, the destination LID should be filled into the lid field, and so on. The send side does support - RMPP so arbitrary length MAD can be sent. For example: + RMPP so arbitrary length MAD can be sent. 
For example:: struct ib_user_mad *mad; @@ -97,6 +106,7 @@ Sending MADs perror("write"); Transaction IDs +=============== Users of the umad devices can use the lower 32 bits of the transaction ID field (that is, the least significant half of the @@ -105,6 +115,7 @@ Transaction IDs the kernel and will be overwritten before a MAD is sent. P_Key Index Handling +==================== The old ib_umad interface did not allow setting the P_Key index for MADs that are sent and did not provide a way for obtaining the P_Key @@ -119,6 +130,7 @@ P_Key Index Handling default, and the IB_USER_MAD_ENABLE_PKEY ioctl will be removed. Setting IsSM Capability Bit +=========================== To set the IsSM capability bit for a port, simply open the corresponding issm device file. If the IsSM bit is already set, @@ -129,25 +141,26 @@ Setting IsSM Capability Bit the issm file. /dev files +========== To create the appropriate character device files automatically with - udev, a rule like + udev, a rule like:: KERNEL=="umad*", NAME="infiniband/%k" KERNEL=="issm*", NAME="infiniband/%k" - can be used. This will create device nodes named + can be used. This will create device nodes named:: /dev/infiniband/umad0 /dev/infiniband/issm0 for the first port, and so on. The InfiniBand device and port - associated with these devices can be determined from the files + associated with these devices can be determined from the files:: /sys/class/infiniband_mad/umad0/ibdev /sys/class/infiniband_mad/umad0/port - and + and:: /sys/class/infiniband_mad/issm0/ibdev /sys/class/infiniband_mad/issm0/port diff --git a/Documentation/infiniband/user_verbs.txt b/Documentation/infiniband/user_verbs.rst similarity index 93% rename from Documentation/infiniband/user_verbs.txt rename to Documentation/infiniband/user_verbs.rst index 47ebf2f80b2b..8ddc4b1cfef2 100644 --- a/Documentation/infiniband/user_verbs.txt +++ b/Documentation/infiniband/user_verbs.rst @@ -1,4 +1,6 @@ -USERSPACE VERBS ACCESS +====================== +Userspace verbs access +====================== The ib_uverbs module, built by enabling CONFIG_INFINIBAND_USER_VERBS, enables direct userspace access to IB hardware via "verbs," as @@ -13,6 +15,7 @@ USERSPACE VERBS ACCESS libmthca userspace driver be installed. User-kernel communication +========================= Userspace communicates with the kernel for slow path, resource management operations via the /dev/infiniband/uverbsN character @@ -28,6 +31,7 @@ User-kernel communication system call. Resource management +=================== Since creation and destruction of all IB resources is done by commands passed through a file descriptor, the kernel can keep track @@ -41,6 +45,7 @@ Resource management prevent one process from touching another process's resources. Memory pinning +============== Direct userspace I/O requires that memory regions that are potential I/O targets be kept resident at the same physical address. The @@ -54,13 +59,14 @@ Memory pinning number of pages pinned by a process. /dev files +========== To create the appropriate character device files automatically with - udev, a rule like + udev, a rule like:: KERNEL=="uverbs*", NAME="infiniband/%k" - can be used. This will create device nodes named + can be used. 
This will create device nodes named:: /dev/infiniband/uverbs0 diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 547090b41cfb..9f8a48016b41 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -745,7 +745,7 @@ found: "process %s did not enable P_Key index support.\n", current->comm); dev_warn(&file->port->dev, - " Documentation/infiniband/user_mad.txt has info on the new ABI.\n"); + " Documentation/infiniband/user_mad.rst has info on the new ABI.\n"); } } diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig index cda8eac55fff..569d614d0c41 100644 --- a/drivers/infiniband/ulp/ipoib/Kconfig +++ b/drivers/infiniband/ulp/ipoib/Kconfig @@ -6,7 +6,7 @@ config INFINIBAND_IPOIB transports IP packets over InfiniBand so you can use your IB device as a fancy NIC. - See Documentation/infiniband/ipoib.txt for more information + See Documentation/infiniband/ipoib.rst for more information config INFINIBAND_IPOIB_CM bool "IP-over-InfiniBand Connected Mode support" From e9816ddf2a33f3afdf3dfc35c21aafad389ee482 Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Wed, 19 Jun 2019 15:00:47 +0800 Subject: [PATCH 104/194] RDMA/hns: Cleanup unnecessary exported symbols This patch removes hns-roce.ko in order to clean up all the exported symbols in the common part. Signed-off-by: Xi Wang Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/Kconfig | 15 +++++++-------- drivers/infiniband/hw/hns/Makefile | 13 +++++++++---- drivers/infiniband/hw/hns/hns_roce_alloc.c | 2 -- drivers/infiniband/hw/hns/hns_roce_cmd.c | 4 ---- drivers/infiniband/hw/hns/hns_roce_cq.c | 5 ----- drivers/infiniband/hw/hns/hns_roce_db.c | 4 ---- drivers/infiniband/hw/hns/hns_roce_hem.c | 5 ----- drivers/infiniband/hw/hns/hns_roce_main.c | 3 --- drivers/infiniband/hw/hns/hns_roce_mr.c | 5 ----- drivers/infiniband/hw/hns/hns_roce_pd.c | 2 -- drivers/infiniband/hw/hns/hns_roce_qp.c | 13 ------------- drivers/infiniband/hw/hns/hns_roce_srq.c | 1 - 12 files changed, 16 insertions(+), 56 deletions(-) diff --git a/drivers/infiniband/hw/hns/Kconfig b/drivers/infiniband/hw/hns/Kconfig index fddb5fdf92de..b548c003009b 100644 --- a/drivers/infiniband/hw/hns/Kconfig +++ b/drivers/infiniband/hw/hns/Kconfig @@ -7,25 +7,24 @@ config INFINIBAND_HNS is used in Hisilicon Hip06 and more further ICT SoC based on platform device. - To compile this driver as a module, choose M here: the module - will be called hns-roce. + To compile HIP06 or HIP08 driver as module, choose M here. config INFINIBAND_HNS_HIP06 - tristate "Hisilicon Hip06 Family RoCE support" + bool "Hisilicon Hip06 Family RoCE support" depends on INFINIBAND_HNS && HNS && HNS_DSAF && HNS_ENET ---help--- RoCE driver support for Hisilicon RoCE engine in Hisilicon Hip06 and Hip07 SoC. These RoCE engines are platform devices. - To compile this driver as a module, choose M here: the module - will be called hns-roce-hw-v1. + To compile this driver, choose Y here: if INFINIBAND_HNS is m, this + module will be called hns-roce-hw-v1 config INFINIBAND_HNS_HIP08 - tristate "Hisilicon Hip08 Family RoCE support" + bool "Hisilicon Hip08 Family RoCE support" depends on INFINIBAND_HNS && PCI && HNS3 ---help--- RoCE driver support for Hisilicon RoCE engine in Hisilicon Hip08 SoC. The RoCE engine is a PCI device. - To compile this driver as a module, choose M here: the module - will be called hns-roce-hw-v2.
+ To compile this driver, choose Y here: if INFINIBAND_HNS is m, this + module will be called hns-roce-hw-v2. diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile index eee5205f936f..b956cf430bdb 100644 --- a/drivers/infiniband/hw/hns/Makefile +++ b/drivers/infiniband/hw/hns/Makefile @@ -4,11 +4,16 @@ ccflags-y := -I $(srctree)/drivers/net/ethernet/hisilicon/hns3 -obj-$(CONFIG_INFINIBAND_HNS) += hns-roce.o hns-roce-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \ hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \ hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o hns_roce_srq.o hns_roce_restrack.o -obj-$(CONFIG_INFINIBAND_HNS_HIP06) += hns-roce-hw-v1.o + +ifdef CONFIG_INFINIBAND_HNS_HIP06 hns-roce-hw-v1-objs := hns_roce_hw_v1.o -obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns-roce-hw-v2.o -hns-roce-hw-v2-objs := hns_roce_hw_v2.o hns_roce_hw_v2_dfx.o +obj-$(CONFIG_INFINIBAND_HNS) += hns-roce-hw-v1.o $(hns-roce-objs) +endif + +ifdef CONFIG_INFINIBAND_HNS_HIP08 +hns-roce-hw-v2-objs := hns_roce_hw_v2.o hns_roce_hw_v2_dfx.o $(hns-roce-objs) +obj-$(CONFIG_INFINIBAND_HNS) += hns-roce-hw-v2.o +endif diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c index 2c8defa94107..8c063c598d2a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_alloc.c +++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c @@ -68,7 +68,6 @@ void hns_roce_bitmap_free(struct hns_roce_bitmap *bitmap, unsigned long obj, { hns_roce_bitmap_free_range(bitmap, obj, 1, rr); } -EXPORT_SYMBOL_GPL(hns_roce_bitmap_free); int hns_roce_bitmap_alloc_range(struct hns_roce_bitmap *bitmap, int cnt, int align, unsigned long *obj) @@ -175,7 +174,6 @@ void hns_roce_buf_free(struct hns_roce_dev *hr_dev, u32 size, kfree(buf->page_list); } } -EXPORT_SYMBOL_GPL(hns_roce_buf_free); int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct, struct hns_roce_buf *buf, u32 page_shift) diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.c b/drivers/infiniband/hw/hns/hns_roce_cmd.c index 2acf946d02e5..b83d5bd92329 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cmd.c +++ b/drivers/infiniband/hw/hns/hns_roce_cmd.c @@ -103,7 +103,6 @@ void hns_roce_cmd_event(struct hns_roce_dev *hr_dev, u16 token, u8 status, context->out_param = out_param; complete(&context->done); } -EXPORT_SYMBOL_GPL(hns_roce_cmd_event); /* this should be called with "use_events" */ static int __hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param, @@ -204,7 +203,6 @@ int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, return ret; } -EXPORT_SYMBOL_GPL(hns_roce_cmd_mbox); int hns_roce_cmd_init(struct hns_roce_dev *hr_dev) { @@ -291,7 +289,6 @@ struct hns_roce_cmd_mailbox return mailbox; } -EXPORT_SYMBOL_GPL(hns_roce_alloc_cmd_mailbox); void hns_roce_free_cmd_mailbox(struct hns_roce_dev *hr_dev, struct hns_roce_cmd_mailbox *mailbox) @@ -302,4 +299,3 @@ void hns_roce_free_cmd_mailbox(struct hns_roce_dev *hr_dev, dma_pool_free(hr_dev->cmd.pool, mailbox->buf, mailbox->dma); kfree(mailbox); } -EXPORT_SYMBOL_GPL(hns_roce_free_cmd_mailbox); diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 6b4d8e50aabe..4e50c22a2da4 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -205,7 +205,6 @@ void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) hns_roce_table_put(hr_dev, &cq_table->table, hr_cq->cqn); hns_roce_bitmap_free(&cq_table->bitmap, hr_cq->cqn, 
BITMAP_NO_RR); } -EXPORT_SYMBOL_GPL(hns_roce_free_cq); static int hns_roce_ib_get_cq_umem(struct hns_roce_dev *hr_dev, struct ib_udata *udata, @@ -435,7 +434,6 @@ err_db: err_cq: return ret; } -EXPORT_SYMBOL_GPL(hns_roce_ib_create_cq); void hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) { @@ -465,7 +463,6 @@ void hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) hns_roce_free_db(hr_dev, &hr_cq->db); } } -EXPORT_SYMBOL_GPL(hns_roce_ib_destroy_cq); void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn) { @@ -481,7 +478,6 @@ void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn) ++cq->arm_sn; cq->comp(cq); } -EXPORT_SYMBOL_GPL(hns_roce_cq_completion); void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type) { @@ -503,7 +499,6 @@ void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type) if (atomic_dec_and_test(&cq->refcount)) complete(&cq->free); } -EXPORT_SYMBOL_GPL(hns_roce_cq_event); int hns_roce_init_cq_table(struct hns_roce_dev *hr_dev) { diff --git a/drivers/infiniband/hw/hns/hns_roce_db.c b/drivers/infiniband/hw/hns/hns_roce_db.c index 3a040a9390d8..627aa46ef683 100644 --- a/drivers/infiniband/hw/hns/hns_roce_db.c +++ b/drivers/infiniband/hw/hns/hns_roce_db.c @@ -51,7 +51,6 @@ out: return ret; } -EXPORT_SYMBOL(hns_roce_db_map_user); void hns_roce_db_unmap_user(struct hns_roce_ucontext *context, struct hns_roce_db *db) @@ -67,7 +66,6 @@ void hns_roce_db_unmap_user(struct hns_roce_ucontext *context, mutex_unlock(&context->page_mutex); } -EXPORT_SYMBOL(hns_roce_db_unmap_user); static struct hns_roce_db_pgdir *hns_roce_alloc_db_pgdir( struct device *dma_device) @@ -151,7 +149,6 @@ out: return ret; } -EXPORT_SYMBOL_GPL(hns_roce_alloc_db); void hns_roce_free_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db) { @@ -181,4 +178,3 @@ void hns_roce_free_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db) mutex_unlock(&hr_dev->pgdir_mutex); } -EXPORT_SYMBOL_GPL(hns_roce_free_db); diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index d145e3ed21d4..f4da5bd2884f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -56,7 +56,6 @@ bool hns_roce_check_whether_mhop(struct hns_roce_dev *hr_dev, u32 type) return false; } -EXPORT_SYMBOL_GPL(hns_roce_check_whether_mhop); static bool hns_roce_check_hem_null(struct hns_roce_hem **hem, u64 start_idx, u32 bt_chunk_num) @@ -234,7 +233,6 @@ int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev, return 0; } -EXPORT_SYMBOL_GPL(hns_roce_calc_hem_mhop); static struct hns_roce_hem *hns_roce_alloc_hem(struct hns_roce_dev *hr_dev, int npages, @@ -621,7 +619,6 @@ out: mutex_unlock(&table->mutex); return ret; } -EXPORT_SYMBOL_GPL(hns_roce_table_get); static void hns_roce_table_mhop_put(struct hns_roce_dev *hr_dev, struct hns_roce_hem_table *table, @@ -764,7 +761,6 @@ void hns_roce_table_put(struct hns_roce_dev *hr_dev, mutex_unlock(&table->mutex); } -EXPORT_SYMBOL_GPL(hns_roce_table_put); void *hns_roce_table_find(struct hns_roce_dev *hr_dev, struct hns_roce_hem_table *table, @@ -837,7 +833,6 @@ out: mutex_unlock(&table->mutex); return addr; } -EXPORT_SYMBOL_GPL(hns_roce_table_find); int hns_roce_table_get_range(struct hns_roce_dev *hr_dev, struct hns_roce_hem_table *table, diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index c0e819ed8c9b..9f83acec6001 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ 
b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -57,7 +57,6 @@ int hns_get_gid_index(struct hns_roce_dev *hr_dev, u8 port, int gid_index) { return gid_index * hr_dev->caps.num_ports + port; } -EXPORT_SYMBOL_GPL(hns_get_gid_index); static int hns_roce_set_mac(struct hns_roce_dev *hr_dev, u8 port, u8 *addr) { @@ -974,7 +973,6 @@ error_failed_cmq_init: return ret; } -EXPORT_SYMBOL_GPL(hns_roce_init); void hns_roce_exit(struct hns_roce_dev *hr_dev) { @@ -995,7 +993,6 @@ void hns_roce_exit(struct hns_roce_dev *hr_dev) if (hr_dev->hw->reset) hr_dev->hw->reset(hr_dev, false); } -EXPORT_SYMBOL_GPL(hns_roce_exit); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Wei Hu "); diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index adf075183dfb..549e1a38dfe0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -47,7 +47,6 @@ unsigned long key_to_hw_index(u32 key) { return (key << 24) | (key >> 8); } -EXPORT_SYMBOL_GPL(key_to_hw_index); static int hns_roce_sw2hw_mpt(struct hns_roce_dev *hr_dev, struct hns_roce_cmd_mailbox *mailbox, @@ -66,7 +65,6 @@ int hns_roce_hw2sw_mpt(struct hns_roce_dev *hr_dev, mpt_index, !mailbox, HNS_ROCE_CMD_HW2SW_MPT, HNS_ROCE_CMD_TIMEOUT_MSECS); } -EXPORT_SYMBOL_GPL(hns_roce_hw2sw_mpt); static int hns_roce_buddy_alloc(struct hns_roce_buddy *buddy, int order, unsigned long *seg) @@ -293,7 +291,6 @@ void hns_roce_mtt_cleanup(struct hns_roce_dev *hr_dev, struct hns_roce_mtt *mtt) break; } } -EXPORT_SYMBOL_GPL(hns_roce_mtt_cleanup); static void hns_roce_loop_free(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr, int err_loop_index, @@ -1507,7 +1504,6 @@ void hns_roce_mtr_cleanup(struct hns_roce_dev *hr_dev, { hns_roce_hem_list_release(hr_dev, &mtr->hem_list); } -EXPORT_SYMBOL_GPL(hns_roce_mtr_cleanup); static int hns_roce_write_mtr(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, dma_addr_t *bufs, @@ -1611,4 +1607,3 @@ done: return total; } -EXPORT_SYMBOL_GPL(hns_roce_mtr_find); diff --git a/drivers/infiniband/hw/hns/hns_roce_pd.c b/drivers/infiniband/hw/hns/hns_roce_pd.c index 813401384d78..920ca76b5db1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_pd.c +++ b/drivers/infiniband/hw/hns/hns_roce_pd.c @@ -83,13 +83,11 @@ int hns_roce_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) return 0; } -EXPORT_SYMBOL_GPL(hns_roce_alloc_pd); void hns_roce_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata) { hns_roce_pd_free(to_hr_dev(pd->device), to_hr_pd(pd)->pdn); } -EXPORT_SYMBOL_GPL(hns_roce_dealloc_pd); int hns_roce_uar_alloc(struct hns_roce_dev *hr_dev, struct hns_roce_uar *uar) { diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 7e9db8236072..ee7e1fef31e7 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -64,7 +64,6 @@ void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type) if (atomic_dec_and_test(&qp->refcount)) complete(&qp->free); } -EXPORT_SYMBOL_GPL(hns_roce_qp_event); static void hns_roce_ib_qp_event(struct hns_roce_qp *hr_qp, enum hns_roce_event type) @@ -139,7 +138,6 @@ enum hns_roce_qp_state to_hns_roce_state(enum ib_qp_state state) return HNS_ROCE_QP_NUM_STATE; } } -EXPORT_SYMBOL_GPL(to_hns_roce_state); static int hns_roce_gsi_qp_alloc(struct hns_roce_dev *hr_dev, unsigned long qpn, struct hns_roce_qp *hr_qp) @@ -242,7 +240,6 @@ void hns_roce_qp_remove(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) __xa_erase(xa, hr_qp->qpn & 
(hr_dev->caps.num_qps - 1)); xa_unlock_irqrestore(xa, flags); } -EXPORT_SYMBOL_GPL(hns_roce_qp_remove); void hns_roce_qp_free(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) { @@ -260,7 +257,6 @@ void hns_roce_qp_free(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) hns_roce_table_put(hr_dev, &qp_table->qp_table, hr_qp->qpn); } } -EXPORT_SYMBOL_GPL(hns_roce_qp_free); void hns_roce_release_range_qp(struct hns_roce_dev *hr_dev, int base_qpn, int cnt) @@ -272,7 +268,6 @@ void hns_roce_release_range_qp(struct hns_roce_dev *hr_dev, int base_qpn, hns_roce_bitmap_free_range(&qp_table->bitmap, base_qpn, cnt, BITMAP_RR); } -EXPORT_SYMBOL_GPL(hns_roce_release_range_qp); static int hns_roce_set_rq_size(struct hns_roce_dev *hr_dev, struct ib_qp_cap *cap, bool is_user, int has_rq, @@ -1027,7 +1022,6 @@ struct ib_qp *hns_roce_create_qp(struct ib_pd *pd, return &hr_qp->ibqp; } -EXPORT_SYMBOL_GPL(hns_roce_create_qp); int to_hr_qp_type(int qp_type) { @@ -1046,7 +1040,6 @@ int to_hr_qp_type(int qp_type) return transport_type; } -EXPORT_SYMBOL_GPL(to_hr_qp_type); int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) @@ -1166,7 +1159,6 @@ void hns_roce_lock_cqs(struct hns_roce_cq *send_cq, struct hns_roce_cq *recv_cq) spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); } } -EXPORT_SYMBOL_GPL(hns_roce_lock_cqs); void hns_roce_unlock_cqs(struct hns_roce_cq *send_cq, struct hns_roce_cq *recv_cq) __releases(&send_cq->lock) @@ -1183,7 +1175,6 @@ void hns_roce_unlock_cqs(struct hns_roce_cq *send_cq, spin_unlock_irq(&recv_cq->lock); } } -EXPORT_SYMBOL_GPL(hns_roce_unlock_cqs); static void *get_wqe(struct hns_roce_qp *hr_qp, int offset) { @@ -1195,20 +1186,17 @@ void *get_recv_wqe(struct hns_roce_qp *hr_qp, int n) { return get_wqe(hr_qp, hr_qp->rq.offset + (n << hr_qp->rq.wqe_shift)); } -EXPORT_SYMBOL_GPL(get_recv_wqe); void *get_send_wqe(struct hns_roce_qp *hr_qp, int n) { return get_wqe(hr_qp, hr_qp->sq.offset + (n << hr_qp->sq.wqe_shift)); } -EXPORT_SYMBOL_GPL(get_send_wqe); void *get_send_extend_sge(struct hns_roce_qp *hr_qp, int n) { return hns_roce_buf_offset(&hr_qp->hr_buf, hr_qp->sge.offset + (n << hr_qp->sge.sge_shift)); } -EXPORT_SYMBOL_GPL(get_send_extend_sge); bool hns_roce_wq_overflow(struct hns_roce_wq *hr_wq, int nreq, struct ib_cq *ib_cq) @@ -1227,7 +1215,6 @@ bool hns_roce_wq_overflow(struct hns_roce_wq *hr_wq, int nreq, return cur + nreq >= hr_wq->max_post; } -EXPORT_SYMBOL_GPL(hns_roce_wq_overflow); int hns_roce_init_qp_table(struct hns_roce_dev *hr_dev) { diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index de645be8aa48..38bb548eaa6d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -30,7 +30,6 @@ void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type) if (atomic_dec_and_test(&srq->refcount)) complete(&srq->free); } -EXPORT_SYMBOL_GPL(hns_roce_srq_event); static void hns_roce_ib_srq_event(struct hns_roce_srq *srq, enum hns_roce_event event_type) From 34d65cd837d0c77fac0c0da632c616030b2927e3 Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Fri, 21 Jun 2019 17:00:44 -0400 Subject: [PATCH 105/194] RDMA/netlink: Audit policy settings for netlink attributes For all string attributes for which we don't currently accept the element as input, we only use it as output, set the string length to RDMA_NLDEV_ATTR_EMPTY_STRING which is defined as 1. That way we will only accept a null string for that element. 
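As an illustration only (this sketch is not part of the patch, though the entry names and sizes mirror the policy table changed below): an output-only string attribute is pinned to RDMA_NLDEV_ATTR_EMPTY_STRING, while an attribute that is genuinely accepted as input gets a real upper bound sized to the buffer it is later copied into.

	/* Sketch of the resulting policy shape; not a complete table. */
	static const struct nla_policy example_policy[RDMA_NLDEV_ATTR_MAX] = {
		[RDMA_NLDEV_ATTR_DEV_PROTOCOL] = { .type = NLA_NUL_STRING,
						   .len = RDMA_NLDEV_ATTR_EMPTY_STRING },
		[RDMA_NLDEV_ATTR_CHARDEV_TYPE] = { .type = NLA_NUL_STRING,
						   .len = RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE },
	};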
This will prevent someone from writing a new input routine that uses the element without also updating the policy to have a valid value. Also while there, make sure the existing entries that are valid have the correct policy, if not, correct the policy. Remove unnecessary checks for nla_strlcpy() overflow once the policy has been set correctly. Signed-off-by: Doug Ledford Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 25 ++++++++++++------------- include/rdma/rdma_netlink.h | 6 ++++++ include/uapi/rdma/rdma_netlink.h | 4 ---- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 6006d23d0410..5499f5629dc2 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -49,29 +49,29 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_CHARDEV] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_CHARDEV_ABI] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_CHARDEV_NAME] = { .type = NLA_NUL_STRING, - .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_CHARDEV_TYPE] = { .type = NLA_NUL_STRING, - .len = 128 }, + .len = RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE }, [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, - .len = IB_DEVICE_NAME_MAX - 1}, + .len = IB_DEVICE_NAME_MAX }, [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DEV_PROTOCOL] = { .type = NLA_NUL_STRING, - .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_DRIVER] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_DRIVER_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DRIVER_STRING] = { .type = NLA_NUL_STRING, - .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_DRIVER_S32] = { .type = NLA_S32 }, [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 }, [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = NLA_NUL_STRING, - .len = IB_FW_VERSION_NAME_MAX - 1}, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_LID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING, - .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, + .len = IFNAMSIZ }, [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_NDEV_INDEX] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_NDEV_NAME] = { .type = NLA_NUL_STRING, @@ -92,7 +92,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { .len = sizeof(struct __kernel_sockaddr_storage) }, [RDMA_NLDEV_ATTR_RES_IOVA] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = NLA_NUL_STRING, - .len = TASK_COMM_LEN }, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_RES_LKEY] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_LQPN] = { .type = NLA_U32 }, @@ -120,7 +120,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR]= { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME]= { .type = NLA_NUL_STRING, - .len = 16 }, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_RES_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY]= { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_USECNT] = { .type = 
NLA_U64 }, @@ -1361,7 +1361,7 @@ static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; - char client_name[IB_DEVICE_NAME_MAX]; + char client_name[RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE]; struct ib_client_nl_info data = {}; struct ib_device *ibdev = NULL; struct sk_buff *msg; @@ -1373,9 +1373,8 @@ static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh, if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE]) return -EINVAL; - if (nla_strlcpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE], - sizeof(client_name)) >= sizeof(client_name)) - return -EINVAL; + nla_strlcpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE], + sizeof(client_name)); if (tb[RDMA_NLDEV_ATTR_DEV_INDEX]) { index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index c7acbe083428..6631624e4d7c 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -6,6 +6,12 @@ #include #include +enum { + RDMA_NLDEV_ATTR_EMPTY_STRING = 1, + RDMA_NLDEV_ATTR_ENTRY_STRLEN = 16, + RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE = 32, +}; + struct rdma_nl_cbs { int (*doit)(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index b27c02185dcc..650cee8c4bf1 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -284,10 +284,6 @@ enum rdma_nldev_command { RDMA_NLDEV_NUM_OPS }; -enum { - RDMA_NLDEV_ATTR_ENTRY_STRLEN = 16, -}; - enum rdma_nldev_print_type { RDMA_NLDEV_PRINT_TYPE_UNSPEC, RDMA_NLDEV_PRINT_TYPE_HEX, From 10dcc7448e9ea49488a38bca7551de1a9da06ad9 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 24 Jun 2019 13:16:49 +0100 Subject: [PATCH 106/194] RDMA/hns: fix spelling mistake "attatch" -> "attach" There is a spelling mistake in a dev_err message. Fix it. Signed-off-by: Colin Ian King Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_qp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index ee7e1fef31e7..4f693cded74e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -853,7 +853,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, ret = hns_roce_mtr_attach(hr_dev, &hr_qp->mtr, buf_list, hr_qp->regions, hr_qp->region_cnt); if (ret) { - dev_err(dev, "mtr attatch error for create qp\n"); + dev_err(dev, "mtr attach error for create qp\n"); goto err_mtr; } From 239b0e52d8aa64d2559c672fd8c29cf1fffc3ec7 Mon Sep 17 00:00:00 2001 From: Kamenee Arumugam Date: Fri, 28 Jun 2019 14:04:17 -0400 Subject: [PATCH 107/194] IB/hfi1: Move rvt_cq_wc struct into uapi directory The rvt_cq_wc struct elements are shared between rdmavt and the providers but are not in the uapi directory. As per the comment in https://marc.info/?l=linux-rdma&m=152296522708522&w=2, the hfi1 driver and the rdma core driver are not using shared structures in the uapi directory. In that case, move rvt_cq_wc struct into the rvt-abi.h header file and create a rvt_k_cq_wc for the kernel completion queue.
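Purely as an illustration (not part of the patch): the user-mapped variant of the completion ring is the reason for the RDMA_ATOMIC_UAPI()/RDMA_READ_UAPI_ATOMIC()/RDMA_WRITE_UAPI_ATOMIC() helpers added below; the producer has to publish a completion entry before releasing the head index, and readers have to acquire the indices before trusting the slots. A minimal sketch of that pattern, assuming the helpers and struct layout introduced by this patch:

	/* Producer side: fill the slot, then release the head index. */
	static void cq_publish(struct rvt_cq_wc *u_wc, u32 head, u32 next,
			       const struct ib_uverbs_wc *entry)
	{
		u_wc->uqueue[head] = *entry;		  /* slot contents first */
		RDMA_WRITE_UAPI_ATOMIC(u_wc->head, next); /* smp_store_release() */
	}

	/* Reader side: acquire both indices before looking at any slot. */
	static bool cq_is_empty(struct rvt_cq_wc *u_wc)
	{
		return RDMA_READ_UAPI_ATOMIC(u_wc->head) ==
		       RDMA_READ_UAPI_ATOMIC(u_wc->tail);
	}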
Signed-off-by: Kamenee Arumugam Reviewed-by: Mike Marciniszyn Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/qp.c | 4 +- drivers/infiniband/sw/rdmavt/cq.c | 192 +++++++++++++++++++----------- include/rdma/rdmavt_cq.h | 22 +++- include/rdma/rdmavt_qp.h | 32 +++++ include/uapi/rdma/rvt-abi.h | 32 +++++ 5 files changed, 205 insertions(+), 77 deletions(-) create mode 100644 include/uapi/rdma/rvt-abi.h diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 4e0e9fc0a777..41261e72c429 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -702,8 +702,8 @@ void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter) sde ? sde->this_idx : 0, send_context, send_context ? send_context->sw_index : 0, - ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->head, - ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->tail, + ib_cq_head(qp->ibqp.send_cq), + ib_cq_tail(qp->ibqp.send_cq), qp->pid, qp->s_state, qp->s_ack_state, diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index b46714a92b7a..2602ad8b8cb0 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -63,19 +63,33 @@ static struct workqueue_struct *comp_vector_wq; */ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) { - struct rvt_cq_wc *wc; + struct ib_uverbs_wc *uqueue = NULL; + struct ib_wc *kqueue = NULL; + struct rvt_cq_wc *u_wc = NULL; + struct rvt_k_cq_wc *k_wc = NULL; unsigned long flags; u32 head; u32 next; + u32 tail; spin_lock_irqsave(&cq->lock, flags); + if (cq->ip) { + u_wc = cq->queue; + uqueue = &u_wc->uqueue[0]; + head = RDMA_READ_UAPI_ATOMIC(u_wc->head); + tail = RDMA_READ_UAPI_ATOMIC(u_wc->tail); + } else { + k_wc = cq->kqueue; + kqueue = &k_wc->kqueue[0]; + head = k_wc->head; + tail = k_wc->tail; + } + /* - * Note that the head pointer might be writable by user processes. - * Take care to verify it is a sane value. + * Note that the head pointer might be writable by + * user processes.Take care to verify it is a sane value. 
*/ - wc = cq->queue; - head = wc->head; if (head >= (unsigned)cq->ibcq.cqe) { head = cq->ibcq.cqe; next = 0; @@ -83,7 +97,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) next = head + 1; } - if (unlikely(next == wc->tail)) { + if (unlikely(next == tail)) { spin_unlock_irqrestore(&cq->lock, flags); if (cq->ibcq.event_handler) { struct ib_event ev; @@ -96,27 +110,27 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) return; } trace_rvt_cq_enter(cq, entry, head); - if (cq->ip) { - wc->uqueue[head].wr_id = entry->wr_id; - wc->uqueue[head].status = entry->status; - wc->uqueue[head].opcode = entry->opcode; - wc->uqueue[head].vendor_err = entry->vendor_err; - wc->uqueue[head].byte_len = entry->byte_len; - wc->uqueue[head].ex.imm_data = entry->ex.imm_data; - wc->uqueue[head].qp_num = entry->qp->qp_num; - wc->uqueue[head].src_qp = entry->src_qp; - wc->uqueue[head].wc_flags = entry->wc_flags; - wc->uqueue[head].pkey_index = entry->pkey_index; - wc->uqueue[head].slid = ib_lid_cpu16(entry->slid); - wc->uqueue[head].sl = entry->sl; - wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits; - wc->uqueue[head].port_num = entry->port_num; + if (uqueue) { + uqueue[head].wr_id = entry->wr_id; + uqueue[head].status = entry->status; + uqueue[head].opcode = entry->opcode; + uqueue[head].vendor_err = entry->vendor_err; + uqueue[head].byte_len = entry->byte_len; + uqueue[head].ex.imm_data = entry->ex.imm_data; + uqueue[head].qp_num = entry->qp->qp_num; + uqueue[head].src_qp = entry->src_qp; + uqueue[head].wc_flags = entry->wc_flags; + uqueue[head].pkey_index = entry->pkey_index; + uqueue[head].slid = ib_lid_cpu16(entry->slid); + uqueue[head].sl = entry->sl; + uqueue[head].dlid_path_bits = entry->dlid_path_bits; + uqueue[head].port_num = entry->port_num; /* Make sure entry is written before the head index. */ - smp_wmb(); + RDMA_WRITE_UAPI_ATOMIC(u_wc->head, next); } else { - wc->kqueue[head] = *entry; + kqueue[head] = *entry; + k_wc->head = next; } - wc->head = next; if (cq->notify == IB_CQ_NEXT_COMP || (cq->notify == IB_CQ_SOLICITED && @@ -179,8 +193,9 @@ int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, { struct ib_device *ibdev = ibcq->device; struct rvt_dev_info *rdi = ib_to_rvt(ibdev); - struct rvt_cq *cq = container_of(ibcq, struct rvt_cq, ibcq); - struct rvt_cq_wc *wc; + struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); + struct rvt_cq_wc *u_wc = NULL; + struct rvt_k_cq_wc *k_wc = NULL; u32 sz; unsigned int entries = attr->cqe; int comp_vector = attr->comp_vector; @@ -204,22 +219,28 @@ int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, * We need to use vmalloc() in order to support mmap and large * numbers of entries. */ - sz = sizeof(*wc); - if (udata && udata->outlen >= sizeof(__u64)) - sz += sizeof(struct ib_uverbs_wc) * (entries + 1); - else - sz += sizeof(struct ib_wc) * (entries + 1); - wc = udata ? - vmalloc_user(sz) : - vzalloc_node(sz, rdi->dparms.node); - if (!wc) - return -ENOMEM; + if (udata && udata->outlen >= sizeof(__u64)) { + sz = sizeof(struct ib_uverbs_wc) * (entries + 1); + sz += sizeof(*u_wc); + u_wc = vmalloc_user(sz); + if (!u_wc) + return -ENOMEM; + } else { + sz = sizeof(struct ib_wc) * (entries + 1); + sz += sizeof(*k_wc); + k_wc = vzalloc_node(sz, rdi->dparms.node); + if (!k_wc) + return -ENOMEM; + } + /* * Return the address of the WC as the offset to mmap. * See rvt_mmap() for details. 
*/ if (udata && udata->outlen >= sizeof(__u64)) { - cq->ip = rvt_create_mmap_info(rdi, sz, udata, wc); + int err; + + cq->ip = rvt_create_mmap_info(rdi, sz, udata, u_wc); if (!cq->ip) { err = -ENOMEM; goto bail_wc; @@ -264,7 +285,10 @@ int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, cq->notify = RVT_CQ_NONE; spin_lock_init(&cq->lock); INIT_WORK(&cq->comptask, send_complete); - cq->queue = wc; + if (u_wc) + cq->queue = u_wc; + else + cq->kqueue = k_wc; trace_rvt_create_cq(cq, attr); return 0; @@ -272,7 +296,8 @@ int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, bail_ip: kfree(cq->ip); bail_wc: - vfree(wc); + vfree(u_wc); + vfree(k_wc); return err; } @@ -322,9 +347,16 @@ int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) if (cq->notify != IB_CQ_NEXT_COMP) cq->notify = notify_flags & IB_CQ_SOLICITED_MASK; - if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) && - cq->queue->head != cq->queue->tail) - ret = 1; + if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) { + if (cq->queue) { + if (RDMA_READ_UAPI_ATOMIC(cq->queue->head) != + RDMA_READ_UAPI_ATOMIC(cq->queue->tail)) + ret = 1; + } else { + if (cq->kqueue->head != cq->kqueue->tail) + ret = 1; + } + } spin_unlock_irqrestore(&cq->lock, flags); @@ -340,12 +372,14 @@ int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) { struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); - struct rvt_cq_wc *old_wc; - struct rvt_cq_wc *wc; u32 head, tail, n; int ret; u32 sz; struct rvt_dev_info *rdi = cq->rdi; + struct rvt_cq_wc *u_wc = NULL; + struct rvt_cq_wc *old_u_wc = NULL; + struct rvt_k_cq_wc *k_wc = NULL; + struct rvt_k_cq_wc *old_k_wc = NULL; if (cqe < 1 || cqe > rdi->dparms.props.max_cqe) return -EINVAL; @@ -353,17 +387,19 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) /* * Need to use vmalloc() if we want to support large #s of entries. */ - sz = sizeof(*wc); - if (udata && udata->outlen >= sizeof(__u64)) - sz += sizeof(struct ib_uverbs_wc) * (cqe + 1); - else - sz += sizeof(struct ib_wc) * (cqe + 1); - wc = udata ? - vmalloc_user(sz) : - vzalloc_node(sz, rdi->dparms.node); - if (!wc) - return -ENOMEM; - + if (udata && udata->outlen >= sizeof(__u64)) { + sz = sizeof(struct ib_uverbs_wc) * (cqe + 1); + sz += sizeof(*u_wc); + u_wc = vmalloc_user(sz); + if (!u_wc) + return -ENOMEM; + } else { + sz = sizeof(struct ib_wc) * (cqe + 1); + sz += sizeof(*k_wc); + k_wc = vzalloc_node(sz, rdi->dparms.node); + if (!k_wc) + return -ENOMEM; + } /* Check that we can write the offset to mmap. */ if (udata && udata->outlen >= sizeof(__u64)) { __u64 offset = 0; @@ -378,11 +414,18 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) * Make sure head and tail are sane since they * might be user writable. 
*/ - old_wc = cq->queue; - head = old_wc->head; + if (u_wc) { + old_u_wc = cq->queue; + head = RDMA_READ_UAPI_ATOMIC(old_u_wc->head); + tail = RDMA_READ_UAPI_ATOMIC(old_u_wc->tail); + } else { + old_k_wc = cq->kqueue; + head = old_k_wc->head; + tail = old_k_wc->tail; + } + if (head > (u32)cq->ibcq.cqe) head = (u32)cq->ibcq.cqe; - tail = old_wc->tail; if (tail > (u32)cq->ibcq.cqe) tail = (u32)cq->ibcq.cqe; if (head < tail) @@ -394,27 +437,36 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) goto bail_unlock; } for (n = 0; tail != head; n++) { - if (cq->ip) - wc->uqueue[n] = old_wc->uqueue[tail]; + if (u_wc) + u_wc->uqueue[n] = old_u_wc->uqueue[tail]; else - wc->kqueue[n] = old_wc->kqueue[tail]; + k_wc->kqueue[n] = old_k_wc->kqueue[tail]; if (tail == (u32)cq->ibcq.cqe) tail = 0; else tail++; } cq->ibcq.cqe = cqe; - wc->head = n; - wc->tail = 0; - cq->queue = wc; + if (u_wc) { + RDMA_WRITE_UAPI_ATOMIC(u_wc->head, n); + RDMA_WRITE_UAPI_ATOMIC(u_wc->tail, 0); + cq->queue = u_wc; + } else { + k_wc->head = n; + k_wc->tail = 0; + cq->kqueue = k_wc; + } spin_unlock_irq(&cq->lock); - vfree(old_wc); + if (u_wc) + vfree(old_u_wc); + else + vfree(old_k_wc); if (cq->ip) { struct rvt_mmap_info *ip = cq->ip; - rvt_update_mmap_info(rdi, ip, sz, wc); + rvt_update_mmap_info(rdi, ip, sz, u_wc); /* * Return the offset to mmap. @@ -438,7 +490,9 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) bail_unlock: spin_unlock_irq(&cq->lock); bail_free: - vfree(wc); + vfree(u_wc); + vfree(k_wc); + return ret; } @@ -456,7 +510,7 @@ bail_free: int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) { struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); - struct rvt_cq_wc *wc; + struct rvt_k_cq_wc *wc; unsigned long flags; int npolled; u32 tail; @@ -467,7 +521,7 @@ int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) spin_lock_irqsave(&cq->lock, flags); - wc = cq->queue; + wc = cq->kqueue; tail = wc->tail; if (tail > (u32)cq->ibcq.cqe) tail = (u32)cq->ibcq.cqe; diff --git a/include/rdma/rdmavt_cq.h b/include/rdma/rdmavt_cq.h index 75dc65c0bfb8..ab22860a63e2 100644 --- a/include/rdma/rdmavt_cq.h +++ b/include/rdma/rdmavt_cq.h @@ -60,19 +60,28 @@ */ #define RVT_CQ_NONE (IB_CQ_NEXT_COMP + 1) +/* + * Define a read macro that applies an smp_load_acquire memory barrier + * when reading an index of a circular buffer that is mmapped to user space. + */ +#define RDMA_READ_UAPI_ATOMIC(member) smp_load_acquire(&(member).val) + +/* + * Define a write macro that uses an smp_store_release memory barrier + * when writing an index of a circular buffer that is mmapped to user space. + */ +#define RDMA_WRITE_UAPI_ATOMIC(member, x) smp_store_release(&(member).val, x) +#include + /* * This structure is used to contain the head pointer, tail pointer, * and completion queue entries as a single memory allocation so * it can be mmap'ed into user space.
*/ -struct rvt_cq_wc { +struct rvt_k_cq_wc { u32 head; /* index of next entry to fill */ u32 tail; /* index of next ib_poll_cq() entry */ - union { - /* these are actually size ibcq.cqe + 1 */ - struct ib_uverbs_wc uqueue[0]; - struct ib_wc kqueue[0]; - }; + struct ib_wc kqueue[]; }; /* @@ -88,6 +97,7 @@ struct rvt_cq { struct rvt_dev_info *rdi; struct rvt_cq_wc *queue; struct rvt_mmap_info *ip; + struct rvt_k_cq_wc *kqueue; }; static inline struct rvt_cq *ibcq_to_rvtcq(struct ib_cq *ibcq) diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 84d0f36afc2f..7fcd687af278 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -820,6 +820,38 @@ struct rvt_qp_iter { int n; }; +/** + * ib_cq_tail - Return tail index of cq buffer + * @send_cq - The cq for send + * + * This is called in qp_iter_print to get tail + * of cq buffer. + */ +static inline u32 ib_cq_tail(struct ib_cq *send_cq) +{ + struct rvt_cq *cq = ibcq_to_rvtcq(send_cq); + + return ibcq_to_rvtcq(send_cq)->ip ? + RDMA_READ_UAPI_ATOMIC(cq->queue->tail) : + ibcq_to_rvtcq(send_cq)->kqueue->tail; +} + +/** + * ib_cq_head - Return head index of cq buffer + * @send_cq - The cq for send + * + * This is called in qp_iter_print to get head + * of cq buffer. + */ +static inline u32 ib_cq_head(struct ib_cq *send_cq) +{ + struct rvt_cq *cq = ibcq_to_rvtcq(send_cq); + + return ibcq_to_rvtcq(send_cq)->ip ? + RDMA_READ_UAPI_ATOMIC(cq->queue->head) : + ibcq_to_rvtcq(send_cq)->kqueue->head; +} + struct rvt_qp_iter *rvt_qp_iter_init(struct rvt_dev_info *rdi, u64 v, void (*cb)(struct rvt_qp *qp, u64 v)); diff --git a/include/uapi/rdma/rvt-abi.h b/include/uapi/rdma/rvt-abi.h new file mode 100644 index 000000000000..8e5f7e0c15fe --- /dev/null +++ b/include/uapi/rdma/rvt-abi.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ + +/* + * This file contains defines, structures, etc. that are used + * to communicate between kernel and user code. + */ + +#ifndef RVT_ABI_USER_H +#define RVT_ABI_USER_H + +#include +#include +#ifndef RDMA_ATOMIC_UAPI +#define RDMA_ATOMIC_UAPI(_type, _name) struct{ _type val; } _name +#endif + +/* + * This structure is used to contain the head pointer, tail pointer, + * and completion queue entries as a single memory allocation so + * it can be mmap'ed into user space. + */ +struct rvt_cq_wc { + /* index of next entry to fill */ + RDMA_ATOMIC_UAPI(__u32, head); + /* index of next ib_poll_cq() entry */ + RDMA_ATOMIC_UAPI(__u32, tail); + + /* these are actually size ibcq.cqe + 1 */ + struct ib_uverbs_wc uqueue[]; +}; + +#endif /* RVT_ABI_USER_H */ From dabac6e460ce8473f1e685432a8ab7818d81a1f1 Mon Sep 17 00:00:00 2001 From: Kamenee Arumugam Date: Fri, 28 Jun 2019 14:04:24 -0400 Subject: [PATCH 108/194] IB/hfi1: Move receive work queue struct into uapi directory The rvt_rwqe and rvt_rwq struct elements are shared between rdmavt and the providers but are not in uapi directory. As per the comment in https://marc.info/?l=linux-rdma&m=152296522708522&w=2, The hfi1 driver and the rdma core driver are not using shared structures in the uapi directory. Move rvt_rwqe and rvt_rwq struct into rvt-abi.h header in uapi directory. Reviewed-by: Mike Marciniszyn Reviewed-by: Michael J. 
Ruhl Signed-off-by: Kamenee Arumugam Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/qp.c | 152 +++++++++++++++++++++-------- drivers/infiniband/sw/rdmavt/qp.h | 2 + drivers/infiniband/sw/rdmavt/rc.c | 10 +- drivers/infiniband/sw/rdmavt/srq.c | 59 ++++++----- include/rdma/rdmavt_qp.h | 52 ++++++---- include/uapi/rdma/rvt-abi.h | 29 ++++++ 6 files changed, 212 insertions(+), 92 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 0d804a58f954..1384060f175d 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -802,6 +802,46 @@ static void rvt_remove_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp) } } +/** + * rvt_alloc_rq - allocate memory for user or kernel buffer + * @rq: receive queue data structure + * @size: number of request queue entries + * @node: The NUMA node + * @udata: true if user data is available, false otherwise + * + * Return: 0 on success, or -ENOMEM if memory allocation failed + * This function is used by both shared receive + * queues and non-shared receive queues to allocate + * memory. + */ +int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node, + struct ib_udata *udata) +{ + if (udata) { + rq->wq = vmalloc_user(sizeof(struct rvt_rwq) + size); + if (!rq->wq) + goto bail; + /* need kwq with no buffers */ + rq->kwq = kzalloc_node(sizeof(*rq->kwq), GFP_KERNEL, node); + if (!rq->kwq) + goto bail; + rq->kwq->curr_wq = rq->wq->wq; + } else { + /* need kwq with buffers */ + rq->kwq = + vzalloc_node(sizeof(struct rvt_krwq) + size, node); + if (!rq->kwq) + goto bail; + rq->kwq->curr_wq = rq->kwq->wq; + } + + spin_lock_init(&rq->lock); + return 0; +bail: + rvt_free_rq(rq); + return -ENOMEM; +} + /** * rvt_init_qp - initialize the QP state to the reset state * @qp: the QP to init or reinit @@ -852,10 +892,6 @@ static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, qp->s_tail_ack_queue = 0; qp->s_acked_ack_queue = 0; qp->s_num_rd_atomic = 0; - if (qp->r_rq.wq) { - qp->r_rq.wq->head = 0; - qp->r_rq.wq->tail = 0; - } qp->r_sge.num_sge = 0; atomic_set(&qp->s_reserved_used, 0); } @@ -1046,17 +1082,12 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, qp->r_rq.max_sge = init_attr->cap.max_recv_sge; sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) + sizeof(struct rvt_rwqe); - if (udata) - qp->r_rq.wq = vmalloc_user( - sizeof(struct rvt_rwq) + - qp->r_rq.size * sz); - else - qp->r_rq.wq = vzalloc_node( - sizeof(struct rvt_rwq) + - qp->r_rq.size * sz, - rdi->dparms.node); - if (!qp->r_rq.wq) + err = rvt_alloc_rq(&qp->r_rq, qp->r_rq.size * sz, + rdi->dparms.node, udata); + if (err) { + ret = ERR_PTR(err); goto bail_driver_priv; + } } /* @@ -1202,8 +1233,7 @@ bail_qpn: rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num); bail_rq_wq: - if (!qp->ip) - vfree(qp->r_rq.wq); + rvt_free_rq(&qp->r_rq); bail_driver_priv: rdi->driver_f.qp_priv_free(rdi, qp); @@ -1269,19 +1299,26 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err) } wc.status = IB_WC_WR_FLUSH_ERR; - if (qp->r_rq.wq) { - struct rvt_rwq *wq; + if (qp->r_rq.kwq) { u32 head; u32 tail; + struct rvt_rwq *wq = NULL; + struct rvt_krwq *kwq = NULL; spin_lock(&qp->r_rq.lock); - + /* qp->ip used to validate if there is a user buffer mmaped */ + if (qp->ip) { + wq = qp->r_rq.wq; + head = RDMA_READ_UAPI_ATOMIC(wq->head); + tail = RDMA_READ_UAPI_ATOMIC(wq->tail); + } else { + kwq = qp->r_rq.kwq; + head = kwq->head; + tail = kwq->tail; + } /* sanity check pointers before trusting them */ - wq = 
qp->r_rq.wq; - head = wq->head; if (head >= qp->r_rq.size) head = 0; - tail = wq->tail; if (tail >= qp->r_rq.size) tail = 0; while (tail != head) { @@ -1290,8 +1327,10 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err) tail = 0; rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1); } - wq->tail = tail; - + if (qp->ip) + RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail); + else + kwq->tail = tail; spin_unlock(&qp->r_rq.lock); } else if (qp->ibqp.event_handler) { ret = 1; @@ -1634,8 +1673,7 @@ int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) if (qp->ip) kref_put(&qp->ip->ref, rvt_release_mmap_info); - else - vfree(qp->r_rq.wq); + kvfree(qp->r_rq.kwq); rdi->driver_f.qp_priv_free(rdi, qp); kfree(qp->s_ack_queue); rdma_destroy_ah_attr(&qp->remote_ah_attr); @@ -1721,7 +1759,7 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr) { struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); - struct rvt_rwq *wq = qp->r_rq.wq; + struct rvt_krwq *wq = qp->r_rq.kwq; unsigned long flags; int qp_err_flush = (ib_rvt_state_ops[qp->state] & RVT_FLUSH_RECV) && !qp->ibqp.srq; @@ -1746,7 +1784,7 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, next = wq->head + 1; if (next >= qp->r_rq.size) next = 0; - if (next == wq->tail) { + if (next == READ_ONCE(wq->tail)) { spin_unlock_irqrestore(&qp->r_rq.lock, flags); *bad_wr = wr; return -ENOMEM; @@ -1770,8 +1808,7 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, * Make sure queue entry is written * before the head index. */ - smp_wmb(); - wq->head = next; + smp_store_release(&wq->head, next); } spin_unlock_irqrestore(&qp->r_rq.lock, flags); } @@ -2141,7 +2178,7 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr) { struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq); - struct rvt_rwq *wq; + struct rvt_krwq *wq; unsigned long flags; for (; wr; wr = wr->next) { @@ -2155,11 +2192,11 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, } spin_lock_irqsave(&srq->rq.lock, flags); - wq = srq->rq.wq; + wq = srq->rq.kwq; next = wq->head + 1; if (next >= srq->rq.size) next = 0; - if (next == wq->tail) { + if (next == READ_ONCE(wq->tail)) { spin_unlock_irqrestore(&srq->rq.lock, flags); *bad_wr = wr; return -ENOMEM; @@ -2171,8 +2208,7 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, for (i = 0; i < wr->num_sge; i++) wqe->sg_list[i] = wr->sg_list[i]; /* Make sure queue entry is written before the head index. 
*/ - smp_wmb(); - wq->head = next; + smp_store_release(&wq->head, next); spin_unlock_irqrestore(&srq->rq.lock, flags); } return 0; @@ -2229,6 +2265,25 @@ bad_lkey: return 0; } +/** + * get_rvt_head - get head indices of the circular buffer + * @rq: data structure for request queue entry + * @ip: the QP + * + * Return - head index value + */ +static inline u32 get_rvt_head(struct rvt_rq *rq, void *ip) +{ + u32 head; + + if (ip) + head = RDMA_READ_UAPI_ATOMIC(rq->wq->head); + else + head = rq->kwq->head; + + return head; +} + /** * rvt_get_rwqe - copy the next RWQE into the QP's RWQE * @qp: the QP @@ -2243,21 +2298,26 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) { unsigned long flags; struct rvt_rq *rq; + struct rvt_krwq *kwq; struct rvt_rwq *wq; struct rvt_srq *srq; struct rvt_rwqe *wqe; void (*handler)(struct ib_event *, void *); u32 tail; + u32 head; int ret; + void *ip = NULL; if (qp->ibqp.srq) { srq = ibsrq_to_rvtsrq(qp->ibqp.srq); handler = srq->ibsrq.event_handler; rq = &srq->rq; + ip = srq->ip; } else { srq = NULL; handler = NULL; rq = &qp->r_rq; + ip = qp->ip; } spin_lock_irqsave(&rq->lock, flags); @@ -2265,17 +2325,24 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) ret = 0; goto unlock; } + if (ip) { + wq = rq->wq; + tail = RDMA_READ_UAPI_ATOMIC(wq->tail); + } else { + kwq = rq->kwq; + tail = kwq->tail; + } - wq = rq->wq; - tail = wq->tail; /* Validate tail before using it since it is user writable. */ if (tail >= rq->size) tail = 0; - if (unlikely(tail == wq->head)) { + + head = get_rvt_head(rq, ip); + if (unlikely(tail == head)) { ret = 0; goto unlock; } - /* Make sure entry is read after head index is read. */ + /* Make sure entry is read after the count is read. */ smp_rmb(); wqe = rvt_get_rwqe_ptr(rq, tail); /* @@ -2285,7 +2352,10 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) */ if (++tail >= rq->size) tail = 0; - wq->tail = tail; + if (ip) + RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail); + else + kwq->tail = tail; if (!wr_id_only && !init_sge(qp, wqe)) { ret = -1; goto unlock; @@ -2301,7 +2371,7 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) * Validate head pointer value and compute * the number of remaining WQEs. 
*/ - n = wq->head; + n = get_rvt_head(rq, ip); if (n >= rq->size) n = 0; if (n < tail) diff --git a/drivers/infiniband/sw/rdmavt/qp.h b/drivers/infiniband/sw/rdmavt/qp.h index 6db1619389b0..2cdba1283bf6 100644 --- a/drivers/infiniband/sw/rdmavt/qp.h +++ b/drivers/infiniband/sw/rdmavt/qp.h @@ -68,4 +68,6 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr); int rvt_wss_init(struct rvt_dev_info *rdi); void rvt_wss_exit(struct rvt_dev_info *rdi); +int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node, + struct ib_udata *udata); #endif /* DEF_RVTQP_H */ diff --git a/drivers/infiniband/sw/rdmavt/rc.c b/drivers/infiniband/sw/rdmavt/rc.c index 09f0cf538be6..44cc7ee1b321 100644 --- a/drivers/infiniband/sw/rdmavt/rc.c +++ b/drivers/infiniband/sw/rdmavt/rc.c @@ -104,15 +104,19 @@ __be32 rvt_compute_aeth(struct rvt_qp *qp) } else { u32 min, max, x; u32 credits; - struct rvt_rwq *wq = qp->r_rq.wq; u32 head; u32 tail; /* sanity check pointers before trusting them */ - head = wq->head; + if (qp->ip) { + head = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->head); + tail = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->tail); + } else { + head = READ_ONCE(qp->r_rq.kwq->head); + tail = READ_ONCE(qp->r_rq.kwq->tail); + } if (head >= qp->r_rq.size) head = 0; - tail = wq->tail; if (tail >= qp->r_rq.size) tail = 0; /* diff --git a/drivers/infiniband/sw/rdmavt/srq.c b/drivers/infiniband/sw/rdmavt/srq.c index 8d6b3e764255..d306f6547cba 100644 --- a/drivers/infiniband/sw/rdmavt/srq.c +++ b/drivers/infiniband/sw/rdmavt/srq.c @@ -52,7 +52,7 @@ #include "srq.h" #include "vt.h" - +#include "qp.h" /** * rvt_driver_srq_init - init srq resources on a per driver basis * @rdi: rvt dev structure @@ -97,11 +97,8 @@ int rvt_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *srq_init_attr, srq->rq.max_sge = srq_init_attr->attr.max_sge; sz = sizeof(struct ib_sge) * srq->rq.max_sge + sizeof(struct rvt_rwqe); - srq->rq.wq = udata ? - vmalloc_user(sizeof(struct rvt_rwq) + srq->rq.size * sz) : - vzalloc_node(sizeof(struct rvt_rwq) + srq->rq.size * sz, - dev->dparms.node); - if (!srq->rq.wq) { + if (rvt_alloc_rq(&srq->rq, srq->rq.size * sz, + dev->dparms.node, udata)) { ret = -ENOMEM; goto bail_srq; } @@ -152,7 +149,7 @@ int rvt_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *srq_init_attr, bail_ip: kfree(srq->ip); bail_wq: - vfree(srq->rq.wq); + rvt_free_rq(&srq->rq); bail_srq: return ret; } @@ -172,11 +169,12 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, { struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq); struct rvt_dev_info *dev = ib_to_rvt(ibsrq->device); - struct rvt_rwq *wq; + struct rvt_rq tmp_rq = {}; int ret = 0; if (attr_mask & IB_SRQ_MAX_WR) { - struct rvt_rwq *owq; + struct rvt_krwq *okwq = NULL; + struct rvt_rwq *owq = NULL; struct rvt_rwqe *p; u32 sz, size, n, head, tail; @@ -185,17 +183,12 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, ((attr_mask & IB_SRQ_LIMIT) ? attr->srq_limit : srq->limit) > attr->max_wr) return -EINVAL; - sz = sizeof(struct rvt_rwqe) + srq->rq.max_sge * sizeof(struct ib_sge); size = attr->max_wr + 1; - wq = udata ? - vmalloc_user(sizeof(struct rvt_rwq) + size * sz) : - vzalloc_node(sizeof(struct rvt_rwq) + size * sz, - dev->dparms.node); - if (!wq) + if (rvt_alloc_rq(&tmp_rq, size * sz, dev->dparms.node, + udata)) return -ENOMEM; - /* Check that we can write the offset to mmap. 
*/ if (udata && udata->inlen >= sizeof(__u64)) { __u64 offset_addr; @@ -218,9 +211,15 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, * validate head and tail pointer values and compute * the number of remaining WQEs. */ - owq = srq->rq.wq; - head = owq->head; - tail = owq->tail; + if (udata) { + owq = srq->rq.wq; + head = RDMA_READ_UAPI_ATOMIC(owq->head); + tail = RDMA_READ_UAPI_ATOMIC(owq->tail); + } else { + okwq = srq->rq.kwq; + head = okwq->head; + tail = okwq->tail; + } if (head >= srq->rq.size || tail >= srq->rq.size) { ret = -EINVAL; goto bail_unlock; } @@ -235,7 +234,7 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, goto bail_unlock; } n = 0; - p = wq->wq; + p = tmp_rq.kwq->curr_wq; while (tail != head) { struct rvt_rwqe *wqe; int i; @@ -250,22 +249,29 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, if (++tail >= srq->rq.size) tail = 0; } - srq->rq.wq = wq; + srq->rq.kwq = tmp_rq.kwq; + if (udata) { + srq->rq.wq = tmp_rq.wq; + RDMA_WRITE_UAPI_ATOMIC(tmp_rq.wq->head, n); + RDMA_WRITE_UAPI_ATOMIC(tmp_rq.wq->tail, 0); + } else { + tmp_rq.kwq->head = n; + tmp_rq.kwq->tail = 0; + } srq->rq.size = size; - wq->head = n; - wq->tail = 0; if (attr_mask & IB_SRQ_LIMIT) srq->limit = attr->srq_limit; spin_unlock_irq(&srq->rq.lock); vfree(owq); + kvfree(okwq); if (srq->ip) { struct rvt_mmap_info *ip = srq->ip; struct rvt_dev_info *dev = ib_to_rvt(srq->ibsrq.device); u32 s = sizeof(struct rvt_rwq) + size * sz; - rvt_update_mmap_info(dev, ip, s, wq); + rvt_update_mmap_info(dev, ip, s, tmp_rq.wq); /* * Return the offset to mmap. @@ -301,7 +307,7 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, bail_unlock: spin_unlock_irq(&srq->rq.lock); bail_free: - vfree(wq); + rvt_free_rq(&tmp_rq); return ret; } @@ -336,6 +342,5 @@ void rvt_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) spin_unlock(&dev->n_srqs_lock); if (srq->ip) kref_put(&srq->ip->ref, rvt_release_mmap_info); - else - vfree(srq->rq.wq); + kvfree(srq->rq.kwq); } diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 7fcd687af278..ee55fd04f6da 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -52,6 +52,7 @@ #include #include #include +#include /* * Atomic bit definitions for r_aflags. */ @@ -177,33 +178,27 @@ struct rvt_swqe { struct rvt_sge sg_list[0]; }; -/* - * Receive work request queue entry. - * The size of the sg_list is determined when the QP (or SRQ) is created - * and stored in qp->r_rq.max_sge (or srq->rq.max_sge). +/** + * struct rvt_krwq - kernel struct receive work request + * @head: index of next entry to fill + * @tail: index of next entry to pull + * @count: approximate count of the total receive entries posted + * @rvt_rwqe: struct of receive work request queue entry + * + * This structure is used to contain the head pointer, + * tail pointer and receive work queue entries for kernel + * mode user. */ -struct rvt_rwqe { - u64 wr_id; - u8 num_sge; - struct ib_sge sg_list[0]; -}; - -/* - * This structure is used to contain the head pointer, tail pointer, - * and receive work queue entries as a single memory allocation so - * it can be mmap'ed into user space. - * Note that the wq array elements are variable size so you can't - * just index into the array to get the N'th element; - * use get_rwqe_ptr() instead. - */ -struct rvt_rwq { +struct rvt_krwq { u32 head; /* new work requests posted to the head */ u32 tail; /* receives pull requests from here.
*/ - struct rvt_rwqe wq[0]; + struct rvt_rwqe *curr_wq; + struct rvt_rwqe wq[]; }; struct rvt_rq { struct rvt_rwq *wq; + struct rvt_krwq *kwq; u32 size; /* size of RWQE array */ u8 max_sge; /* protect changes in this struct */ @@ -472,7 +467,7 @@ static inline struct rvt_swqe *rvt_get_swqe_ptr(struct rvt_qp *qp, static inline struct rvt_rwqe *rvt_get_rwqe_ptr(struct rvt_rq *rq, unsigned n) { return (struct rvt_rwqe *) - ((char *)rq->wq->wq + + ((char *)rq->kwq->curr_wq + (sizeof(struct rvt_rwqe) + rq->max_sge * sizeof(struct ib_sge)) * n); } @@ -852,6 +847,21 @@ static inline u32 ib_cq_head(struct ib_cq *send_cq) ibcq_to_rvtcq(send_cq)->kqueue->head; } +/** + * rvt_free_rq - free memory allocated for rvt_rq struct + * @rvt_rq: request queue data structure + * + * This function should only be called if the rvt_mmap_info() + * has not succeeded. + */ +static inline void rvt_free_rq(struct rvt_rq *rq) +{ + kvfree(rq->kwq); + rq->kwq = NULL; + vfree(rq->wq); + rq->wq = NULL; +} + struct rvt_qp_iter *rvt_qp_iter_init(struct rvt_dev_info *rdi, u64 v, void (*cb)(struct rvt_qp *qp, u64 v)); diff --git a/include/uapi/rdma/rvt-abi.h b/include/uapi/rdma/rvt-abi.h index 8e5f7e0c15fe..d2e35d24f1a9 100644 --- a/include/uapi/rdma/rvt-abi.h +++ b/include/uapi/rdma/rvt-abi.h @@ -10,6 +10,7 @@ #include #include +#include #ifndef RDMA_ATOMIC_UAPI #define RDMA_ATOMIC_UAPI(_type, _name) struct{ _type val; } _name #endif @@ -29,4 +30,32 @@ struct rvt_cq_wc { struct ib_uverbs_wc uqueue[]; }; +/* + * Receive work request queue entry. + * The size of the sg_list is determined when the QP (or SRQ) is created + * and stored in qp->r_rq.max_sge (or srq->rq.max_sge). + */ +struct rvt_rwqe { + __u64 wr_id; + __u8 num_sge; + __u8 padding[7]; + struct ib_sge sg_list[]; +}; + +/* + * This structure is used to contain the head pointer, tail pointer, + * and receive work queue entries as a single memory allocation so + * it can be mmap'ed into user space. + * Note that the wq array elements are variable size so you can't + * just index into the array to get the N'th element; + * use get_rwqe_ptr() for user space and rvt_get_rwqe_ptr() + * for kernel space. + */ +struct rvt_rwq { + /* new work requests posted to the head */ + RDMA_ATOMIC_UAPI(__u32, head); + /* receives pull requests from here. */ + RDMA_ATOMIC_UAPI(__u32, tail); + struct rvt_rwqe wq[]; +}; #endif /* RVT_ABI_USER_H */ From f592ae3c999fbe4faeeb90dfde8ff7da49ee4ae6 Mon Sep 17 00:00:00 2001 From: Kamenee Arumugam Date: Fri, 28 Jun 2019 14:04:30 -0400 Subject: [PATCH 109/194] IB/rdmavt: Fracture single lock used for posting and processing RWQEs Usage of a single lock prevents posting and processing of receive work queue entries from progressing simultaneously and impacts overall performance. Fracture the single lock used for posting and processing Receive Work Queue Entries (RWQEs) to allow the circular buffer to be filled and emptied at the same time. Two new spinlocks are introduced - one for the producers and one for the consumers - so that RWQEs can be posted and processed simultaneously, and the two indices are defined on two different cache lines. A cached threshold count is used to avoid reading the other index, which lives in a different cache line, on every access.
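As a rough illustration of the scheme, here is a simplified sketch only, not the rdmavt code: the ring_buf, ring_post() and ring_consume() names are invented for this example, the cached count is refreshed only when it reaches zero rather than below a threshold, and the usual kernel headers such as <linux/spinlock.h> and <linux/types.h> are assumed.

struct ring_buf {
	spinlock_t p_lock;		/* serializes producers */
	u32 head;			/* next slot to fill */
	/* consumer index kept in its own cache line */
	spinlock_t c_lock ____cacheline_aligned_in_smp;
	u32 tail;			/* next slot to drain */
	u32 count;			/* consumer-side cache of filled slots */
	u32 size;			/* number of slots */
	void *slots[];
};

/* Producer side: takes only p_lock and peeks at tail with READ_ONCE(). */
static int ring_post(struct ring_buf *r, void *item)
{
	unsigned long flags;
	u32 next;

	spin_lock_irqsave(&r->p_lock, flags);
	next = r->head + 1;
	if (next >= r->size)
		next = 0;
	if (next == READ_ONCE(r->tail)) {
		spin_unlock_irqrestore(&r->p_lock, flags);
		return -ENOMEM;		/* ring is full */
	}
	r->slots[r->head] = item;
	/* publish the entry before advancing the head index */
	smp_store_release(&r->head, next);
	spin_unlock_irqrestore(&r->p_lock, flags);
	return 0;
}

/* Consumer side: takes only c_lock; reads head only to refill the cached count. */
static void *ring_consume(struct ring_buf *r)
{
	unsigned long flags;
	void *item = NULL;
	u32 head;

	spin_lock_irqsave(&r->c_lock, flags);
	if (!r->count) {
		head = smp_load_acquire(&r->head);
		r->count = head >= r->tail ? head - r->tail :
					     r->size - r->tail + head;
	}
	if (r->count) {
		item = r->slots[r->tail];
		if (++r->tail >= r->size)
			r->tail = 0;
		r->count--;
	}
	spin_unlock_irqrestore(&r->c_lock, flags);
	return item;
}

In the patch below the same shape maps onto struct rvt_krwq: p_lock and head are used by rvt_post_recv()/rvt_post_srq_recv(), while c_lock, tail and count are used by rvt_get_rwqe(), with the count refresh gated by RVT_RWQ_COUNT_THRESHOLD instead of zero.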
Signed-off-by: Harish Chegondi Signed-off-by: Kamenee Arumugam Reviewed-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/qp.c | 95 +++++++++++++++++++----------- drivers/infiniband/sw/rdmavt/rc.c | 43 +++++++------- drivers/infiniband/sw/rdmavt/srq.c | 10 ++-- include/rdma/rdmavt_qp.h | 7 +++ 4 files changed, 96 insertions(+), 59 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 1384060f175d..200b292be63e 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -58,6 +58,8 @@ #include "vt.h" #include "trace.h" +#define RVT_RWQ_COUNT_THRESHOLD 16 + static void rvt_rc_timeout(struct timer_list *t); /* @@ -835,7 +837,8 @@ int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node, rq->kwq->curr_wq = rq->kwq->wq; } - spin_lock_init(&rq->lock); + spin_lock_init(&rq->kwq->p_lock); + spin_lock_init(&rq->kwq->c_lock); return 0; bail: rvt_free_rq(rq); @@ -892,6 +895,8 @@ static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, qp->s_tail_ack_queue = 0; qp->s_acked_ack_queue = 0; qp->s_num_rd_atomic = 0; + if (qp->r_rq.kwq) + qp->r_rq.kwq->count = qp->r_rq.size; qp->r_sge.num_sge = 0; atomic_set(&qp->s_reserved_used, 0); } @@ -1097,7 +1102,6 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, spin_lock_init(&qp->r_lock); spin_lock_init(&qp->s_hlock); spin_lock_init(&qp->s_lock); - spin_lock_init(&qp->r_rq.lock); atomic_set(&qp->refcount, 0); atomic_set(&qp->local_ops_pending, 0); init_waitqueue_head(&qp->wait); @@ -1305,7 +1309,7 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err) struct rvt_rwq *wq = NULL; struct rvt_krwq *kwq = NULL; - spin_lock(&qp->r_rq.lock); + spin_lock(&qp->r_rq.kwq->c_lock); /* qp->ip used to validate if there is a user buffer mmaped */ if (qp->ip) { wq = qp->r_rq.wq; @@ -1331,7 +1335,7 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err) RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail); else kwq->tail = tail; - spin_unlock(&qp->r_rq.lock); + spin_unlock(&qp->r_rq.kwq->c_lock); } else if (qp->ibqp.event_handler) { ret = 1; } @@ -1780,12 +1784,12 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, return -EINVAL; } - spin_lock_irqsave(&qp->r_rq.lock, flags); + spin_lock_irqsave(&qp->r_rq.kwq->p_lock, flags); next = wq->head + 1; if (next >= qp->r_rq.size) next = 0; if (next == READ_ONCE(wq->tail)) { - spin_unlock_irqrestore(&qp->r_rq.lock, flags); + spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags); *bad_wr = wr; return -ENOMEM; } @@ -1810,7 +1814,7 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, */ smp_store_release(&wq->head, next); } - spin_unlock_irqrestore(&qp->r_rq.lock, flags); + spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags); } return 0; } @@ -2191,13 +2195,13 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, return -EINVAL; } - spin_lock_irqsave(&srq->rq.lock, flags); + spin_lock_irqsave(&srq->rq.kwq->p_lock, flags); wq = srq->rq.kwq; next = wq->head + 1; if (next >= srq->rq.size) next = 0; if (next == READ_ONCE(wq->tail)) { - spin_unlock_irqrestore(&srq->rq.lock, flags); + spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags); *bad_wr = wr; return -ENOMEM; } @@ -2209,7 +2213,7 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, wqe->sg_list[i] = wr->sg_list[i]; /* Make sure queue entry is written before the head index. 
*/ smp_store_release(&wq->head, next); - spin_unlock_irqrestore(&srq->rq.lock, flags); + spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags); } return 0; } @@ -2265,6 +2269,31 @@ bad_lkey: return 0; } +/** + * get_count - count numbers of request work queue entries + * in circular buffer + * @rq: data structure for request queue entry + * @tail: tail indices of the circular buffer + * @head: head indices of the circular buffer + * + * Return - total number of entries in the circular buffer + */ +static u32 get_count(struct rvt_rq *rq, u32 tail, u32 head) +{ + u32 count; + + count = head; + + if (count >= rq->size) + count = 0; + if (count < tail) + count += rq->size - tail; + else + count -= tail; + + return count; +} + /** * get_rvt_head - get head indices of the circular buffer * @rq: data structure for request queue entry @@ -2298,7 +2327,7 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) { unsigned long flags; struct rvt_rq *rq; - struct rvt_krwq *kwq; + struct rvt_krwq *kwq = NULL; struct rvt_rwq *wq; struct rvt_srq *srq; struct rvt_rwqe *wqe; @@ -2320,16 +2349,16 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) ip = qp->ip; } - spin_lock_irqsave(&rq->lock, flags); + spin_lock_irqsave(&rq->kwq->c_lock, flags); if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { ret = 0; goto unlock; } + kwq = rq->kwq; if (ip) { wq = rq->wq; tail = RDMA_READ_UAPI_ATOMIC(wq->tail); } else { - kwq = rq->kwq; tail = kwq->tail; } @@ -2337,8 +2366,11 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) if (tail >= rq->size) tail = 0; - head = get_rvt_head(rq, ip); - if (unlikely(tail == head)) { + if (kwq->count < RVT_RWQ_COUNT_THRESHOLD) { + head = get_rvt_head(rq, ip); + kwq->count = get_count(rq, tail, head); + } + if (unlikely(kwq->count == 0)) { ret = 0; goto unlock; } @@ -2362,36 +2394,31 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) } qp->r_wr_id = wqe->wr_id; + kwq->count--; ret = 1; set_bit(RVT_R_WRID_VALID, &qp->r_aflags); if (handler) { - u32 n; - /* * Validate head pointer value and compute * the number of remaining WQEs. 
*/ - n = get_rvt_head(rq, ip); - if (n >= rq->size) - n = 0; - if (n < tail) - n += rq->size - tail; - else - n -= tail; - if (n < srq->limit) { - struct ib_event ev; + if (kwq->count < srq->limit) { + kwq->count = get_count(rq, tail, get_rvt_head(rq, ip)); + if (kwq->count < srq->limit) { + struct ib_event ev; - srq->limit = 0; - spin_unlock_irqrestore(&rq->lock, flags); - ev.device = qp->ibqp.device; - ev.element.srq = qp->ibqp.srq; - ev.event = IB_EVENT_SRQ_LIMIT_REACHED; - handler(&ev, srq->ibsrq.srq_context); - goto bail; + srq->limit = 0; + spin_unlock_irqrestore(&rq->kwq->c_lock, flags); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + handler(&ev, srq->ibsrq.srq_context); + goto bail; + } } } unlock: - spin_unlock_irqrestore(&rq->lock, flags); + spin_unlock_irqrestore(&rq->kwq->c_lock, flags); bail: return ret; } diff --git a/drivers/infiniband/sw/rdmavt/rc.c b/drivers/infiniband/sw/rdmavt/rc.c index 44cc7ee1b321..890d7b760d2e 100644 --- a/drivers/infiniband/sw/rdmavt/rc.c +++ b/drivers/infiniband/sw/rdmavt/rc.c @@ -107,27 +107,30 @@ __be32 rvt_compute_aeth(struct rvt_qp *qp) u32 head; u32 tail; - /* sanity check pointers before trusting them */ - if (qp->ip) { - head = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->head); - tail = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->tail); - } else { - head = READ_ONCE(qp->r_rq.kwq->head); - tail = READ_ONCE(qp->r_rq.kwq->tail); + credits = READ_ONCE(qp->r_rq.kwq->count); + if (credits == 0) { + /* sanity check pointers before trusting them */ + if (qp->ip) { + head = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->head); + tail = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->tail); + } else { + head = READ_ONCE(qp->r_rq.kwq->head); + tail = READ_ONCE(qp->r_rq.kwq->tail); + } + if (head >= qp->r_rq.size) + head = 0; + if (tail >= qp->r_rq.size) + tail = 0; + /* + * Compute the number of credits available (RWQEs). + * There is a small chance that the pair of reads are + * not atomic, which is OK, since the fuzziness is + * resolved as further ACKs go out. + */ + credits = head - tail; + if ((int)credits < 0) + credits += qp->r_rq.size; } - if (head >= qp->r_rq.size) - head = 0; - if (tail >= qp->r_rq.size) - tail = 0; - /* - * Compute the number of credits available (RWQEs). - * There is a small chance that the pair of reads are - * not atomic, which is OK, since the fuzziness is - * resolved as further ACKs go out. - */ - credits = head - tail; - if ((int)credits < 0) - credits += qp->r_rq.size; /* * Binary search the credit table to find the code to * use. diff --git a/drivers/infiniband/sw/rdmavt/srq.c b/drivers/infiniband/sw/rdmavt/srq.c index d306f6547cba..24fef021d51d 100644 --- a/drivers/infiniband/sw/rdmavt/srq.c +++ b/drivers/infiniband/sw/rdmavt/srq.c @@ -206,7 +206,7 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, goto bail_free; } - spin_lock_irq(&srq->rq.lock); + spin_lock_irq(&srq->rq.kwq->c_lock); /* * validate head and tail pointer values and compute * the number of remaining WQEs. 
@@ -261,7 +261,7 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, srq->rq.size = size; if (attr_mask & IB_SRQ_LIMIT) srq->limit = attr->srq_limit; - spin_unlock_irq(&srq->rq.lock); + spin_unlock_irq(&srq->rq.kwq->c_lock); vfree(owq); kvfree(okwq); @@ -295,17 +295,17 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, spin_unlock_irq(&dev->pending_lock); } } else if (attr_mask & IB_SRQ_LIMIT) { - spin_lock_irq(&srq->rq.lock); + spin_lock_irq(&srq->rq.kwq->c_lock); if (attr->srq_limit >= srq->rq.size) ret = -EINVAL; else srq->limit = attr->srq_limit; - spin_unlock_irq(&srq->rq.lock); + spin_unlock_irq(&srq->rq.kwq->c_lock); } return ret; bail_unlock: - spin_unlock_irq(&srq->rq.lock); + spin_unlock_irq(&srq->rq.kwq->c_lock); bail_free: rvt_free_rq(&tmp_rq); return ret; diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index ee55fd04f6da..de5915b244be 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -180,7 +180,9 @@ struct rvt_swqe { /** * struct rvt_krwq - kernel struct receive work request + * @p_lock: lock to protect producer of the kernel buffer * @head: index of next entry to fill + * @c_lock: lock to protect consumer of the kernel buffer * @tail: index of next entry to pull * @count: approximate count of the total receive entries posted * @rvt_rwqe: struct of receive work request queue entry @@ -190,8 +192,13 @@ struct rvt_swqe { * mode user. */ struct rvt_krwq { + spinlock_t p_lock; /* protect producer */ u32 head; /* new work requests posted to the head */ + + /* protect consumer */ + spinlock_t c_lock ____cacheline_aligned_in_smp; u32 tail; /* receives pull requests from here. */ + u32 count; /* approx count of receive entries posted */ struct rvt_rwqe *curr_wq; struct rvt_rwqe wq[]; }; From 5136bfea7e79b333af77594fac5bc70282a95313 Mon Sep 17 00:00:00 2001 From: Kamenee Arumugam Date: Fri, 28 Jun 2019 14:21:52 -0400 Subject: [PATCH 110/194] IB/{hfi1, qib, rdmavt}: Put qp in error state when cq is full When a completion queue is full, the associated queue pairs are not put into the error state. According to the IBTA specification, this is a violation. Quote from IBTA spec: C9-218: A Requester Class F error occurs when the CQ is inaccessible or full and an attempt is made to complete a WQE. The Affected QP shall be moved to the error state and affiliated asynchronous errors generated as described in 11.6.3.1 Affiliated Asynchronous Events on page 678. The current WQE and any subsequent WQEs are left in an unknown state. C11-37: The CI shall generate a CQ Error when a CQ overrun is detected. This condition will result in an Affiliated Asynchronous Error for any associated Work Queues when they attempt to use that CQ. Completions can no longer be added to the CQ. It is not guaranteed that completions present in the CQ at the time the error occurred can be retrieved. Possible causes include a CQ overrun or a CQ protection error. Put the qp in the error state when the cq is full. Implement a cq "full" state so that subsequent attempts to use the full cq also move their associated QPs to the error state. Reviewed-by: Mike Marciniszyn Reviewed-by: Michael J.
Ruhl Signed-off-by: Kamenee Arumugam Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/rc.c | 3 +- drivers/infiniband/hw/hfi1/uc.c | 3 +- drivers/infiniband/hw/hfi1/ud.c | 5 ++-- drivers/infiniband/hw/qib/qib_rc.c | 3 +- drivers/infiniband/hw/qib/qib_uc.c | 3 +- drivers/infiniband/hw/qib/qib_ud.c | 6 ++-- drivers/infiniband/sw/rdmavt/cq.c | 15 ++++++++-- drivers/infiniband/sw/rdmavt/qp.c | 3 +- drivers/infiniband/sw/rdmavt/vt.h | 9 ++++++ include/rdma/rdmavt_cq.h | 3 +- include/rdma/rdmavt_qp.h | 47 +++++++++++++++++++++++++++--- 11 files changed, 75 insertions(+), 25 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 235bdbc706ac..0477c14633ab 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -3008,8 +3008,7 @@ send_last: wc.dlid_path_bits = 0; wc.port_num = 0; /* Signal completion event if the solicited bit is set. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - ib_bth_is_solicited(ohdr)); + rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr)); break; case OP(RDMA_WRITE_ONLY): diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index 4ed4fcfabd6c..0c77f18120ed 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -476,8 +476,7 @@ last_imm: wc.dlid_path_bits = 0; wc.port_num = 0; /* Signal completion event if the solicited bit is set. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - ib_bth_is_solicited(ohdr)); + rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr)); break; case OP(RDMA_WRITE_FIRST): diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index 4cb0fce5c096..e16d499cfd1e 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -255,8 +255,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) wc.dlid_path_bits = rdma_ah_get_dlid(ah_attr) & ((1 << ppd->lmc) - 1); wc.port_num = qp->port_num; /* Signal completion event if the solicited bit is set. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - swqe->wr.send_flags & IB_SEND_SOLICITED); + rvt_recv_cq(qp, &wc, swqe->wr.send_flags & IB_SEND_SOLICITED); ibp->rvp.n_loop_pkts++; bail_unlock: spin_unlock_irqrestore(&qp->r_lock, flags); @@ -1061,7 +1060,7 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1); wc.port_num = qp->port_num; /* Signal completion event if the solicited bit is set. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, solicited); + rvt_recv_cq(qp, &wc, solicited); return; drop: diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index 8d9a94d6f685..1d5e2d4ee257 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -1891,8 +1891,7 @@ send_last: wc.dlid_path_bits = 0; wc.port_num = 0; /* Signal completion event if the solicited bit is set. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - ib_bth_is_solicited(ohdr)); + rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr)); break; case OP(RDMA_WRITE_FIRST): diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c index 30c70ad0f4bf..e17b91e2c22a 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -400,8 +400,7 @@ last_imm: wc.dlid_path_bits = 0; wc.port_num = 0; /* Signal completion event if the solicited bit is set. 
*/ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - ib_bth_is_solicited(ohdr)); + rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr)); break; case OP(RDMA_WRITE_FIRST): diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index 5cdedba2d164..32ad0b635fc6 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -210,8 +210,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) wc.dlid_path_bits = rdma_ah_get_dlid(ah_attr) & ((1 << ppd->lmc) - 1); wc.port_num = qp->port_num; /* Signal completion event if the solicited bit is set. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - swqe->wr.send_flags & IB_SEND_SOLICITED); + rvt_recv_cq(qp, &wc, swqe->wr.send_flags & IB_SEND_SOLICITED); ibp->rvp.n_loop_pkts++; bail_unlock: spin_unlock_irqrestore(&qp->r_lock, flags); @@ -573,8 +572,7 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct ib_header *hdr, dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1); wc.port_num = qp->port_num; /* Signal completion event if the solicited bit is set. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - ib_bth_is_solicited(ohdr)); + rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr)); return; drop: diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 2602ad8b8cb0..fac87b13329d 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -60,8 +60,11 @@ static struct workqueue_struct *comp_vector_wq; * @solicited: true if @entry is solicited * * This may be called with qp->s_lock held. + * + * Return: return true on success, else return + * false if cq is full. */ -void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) +bool rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) { struct ib_uverbs_wc *uqueue = NULL; struct ib_wc *kqueue = NULL; @@ -97,7 +100,12 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) next = head + 1; } - if (unlikely(next == tail)) { + if (unlikely(next == tail || cq->cq_full)) { + struct rvt_dev_info *rdi = cq->rdi; + + if (!cq->cq_full) + rvt_pr_err_ratelimited(rdi, "CQ is full!\n"); + cq->cq_full = true; spin_unlock_irqrestore(&cq->lock, flags); if (cq->ibcq.event_handler) { struct ib_event ev; @@ -107,7 +115,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) ev.event = IB_EVENT_CQ_ERR; cq->ibcq.event_handler(&ev, cq->ibcq.cq_context); } - return; + return false; } trace_rvt_cq_enter(cq, entry, head); if (uqueue) { @@ -146,6 +154,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) } spin_unlock_irqrestore(&cq->lock, flags); + return true; } EXPORT_SYMBOL(rvt_cq_enter); diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 200b292be63e..17e192a2c8b6 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -3103,8 +3103,7 @@ do_write: wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); wc.port_num = 1; /* Signal completion event if the solicited bit is set. 
*/ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - wqe->wr.send_flags & IB_SEND_SOLICITED); + rvt_recv_cq(qp, &wc, wqe->wr.send_flags & IB_SEND_SOLICITED); send_comp: spin_unlock_irqrestore(&qp->r_lock, flags); diff --git a/drivers/infiniband/sw/rdmavt/vt.h b/drivers/infiniband/sw/rdmavt/vt.h index 0675ea6c3872..d19ff817c2c7 100644 --- a/drivers/infiniband/sw/rdmavt/vt.h +++ b/drivers/infiniband/sw/rdmavt/vt.h @@ -78,6 +78,12 @@ fmt, \ ##__VA_ARGS__) +#define rvt_pr_err_ratelimited(rdi, fmt, ...) \ + __rvt_pr_err_ratelimited((rdi)->driver_f.get_pci_dev(rdi), \ + rvt_get_ibdev_name(rdi), \ + fmt, \ + ##__VA_ARGS__) + #define __rvt_pr_info(pdev, name, fmt, ...) \ dev_info(&pdev->dev, "%s: " fmt, name, ##__VA_ARGS__) @@ -87,6 +93,9 @@ #define __rvt_pr_err(pdev, name, fmt, ...) \ dev_err(&pdev->dev, "%s: " fmt, name, ##__VA_ARGS__) +#define __rvt_pr_err_ratelimited(pdev, name, fmt, ...) \ + dev_err_ratelimited(&(pdev)->dev, "%s: " fmt, name, ##__VA_ARGS__) + static inline int ibport_num_to_idx(struct ib_device *ibdev, u8 port_num) { struct rvt_dev_info *rdi = ib_to_rvt(ibdev); diff --git a/include/rdma/rdmavt_cq.h b/include/rdma/rdmavt_cq.h index ab22860a63e2..04c519ef6d71 100644 --- a/include/rdma/rdmavt_cq.h +++ b/include/rdma/rdmavt_cq.h @@ -93,6 +93,7 @@ struct rvt_cq { spinlock_t lock; /* protect changes in this struct */ u8 notify; u8 triggered; + u8 cq_full; int comp_vector_cpu; struct rvt_dev_info *rdi; struct rvt_cq_wc *queue; @@ -105,6 +106,6 @@ static inline struct rvt_cq *ibcq_to_rvtcq(struct ib_cq *ibcq) return container_of(ibcq, struct rvt_cq, ibcq); } -void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited); +bool rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited); #endif /* DEF_RDMAVT_INCCQH */ diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index de5915b244be..e4be869c4f21 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -718,6 +718,48 @@ rvt_qp_swqe_incr(struct rvt_qp *qp, u32 val) return val; } +int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err); + +/** + * rvt_recv_cq - add a new entry to completion queue + * by receive queue + * @qp: receive queue + * @wc: work completion entry to add + * @solicited: true if @wc is solicited + * + * This is a wrapper for rvt_cq_enter(), called on the + * receive queue. If rvt_cq_enter() returns false, it means the cq is + * full and the qp is put into the error state. + */ +static inline void rvt_recv_cq(struct rvt_qp *qp, struct ib_wc *wc, + bool solicited) +{ + struct rvt_cq *cq = ibcq_to_rvtcq(qp->ibqp.recv_cq); + + if (unlikely(!rvt_cq_enter(cq, wc, solicited))) + rvt_error_qp(qp, IB_WC_LOC_QP_OP_ERR); +} + +/** + * rvt_send_cq - add a new entry to completion queue + * by send queue + * @qp: send queue + * @wc: work completion entry to add + * @solicited: true if @wc is solicited + * + * This is a wrapper for rvt_cq_enter(), called on the + * send queue. If rvt_cq_enter() returns false, it means the cq is + * full and the qp is put into the error state.
+ */ +static inline void rvt_send_cq(struct rvt_qp *qp, struct ib_wc *wc, + bool solicited) +{ + struct rvt_cq *cq = ibcq_to_rvtcq(qp->ibqp.send_cq); + + if (unlikely(!rvt_cq_enter(cq, wc, solicited))) + rvt_error_qp(qp, IB_WC_LOC_QP_OP_ERR); +} + /** * rvt_qp_complete_swqe - insert send completion * @qp - the qp @@ -768,9 +810,7 @@ rvt_qp_complete_swqe(struct rvt_qp *qp, .qp = &qp->ibqp, .byte_len = byte_len, }; - - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &w, - status != IB_WC_SUCCESS); + rvt_send_cq(qp, &w, status != IB_WC_SUCCESS); } return last; } @@ -780,7 +820,6 @@ extern const int ib_rvt_state_ops[]; struct rvt_dev_info; int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only); void rvt_comm_est(struct rvt_qp *qp); -int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err); void rvt_rc_error(struct rvt_qp *qp, enum ib_wc_status err); unsigned long rvt_rnr_tbl_to_usec(u32 index); enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t); From fe2ac04712cdc6e93d32e9c82c73bfb225554309 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Fri, 28 Jun 2019 14:21:58 -0400 Subject: [PATCH 111/194] IB/rdmavt: Set QP allowed opcodes after QP allocation Currently QP allowed_ops is set after the QP is completely initialized. This curtails the use of this optimization for any initialization before allowed_ops is set. Fix by adding a helper to determine the correct allowed_ops and moving the setting of the allowed_ops to just after QP allocation. Reviewed-by: Mike Marciniszyn Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/qp.c | 35 +++++++++++-------------------- 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 17e192a2c8b6..b9035d969057 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 - 2018 Intel Corporation. + * Copyright(c) 2016 - 2019 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -968,6 +968,16 @@ static void rvt_free_qpn(struct rvt_qpn_table *qpt, u32 qpn) clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page); } +/** + * get_allowed_ops - Given a QP type return the appropriate allowed OP + * @type: valid, supported, QP type + */ +static u8 get_allowed_ops(enum ib_qp_type type) +{ + return type == IB_QPT_RC ? IB_OPCODE_RC : type == IB_QPT_UC ? + IB_OPCODE_UC : IB_OPCODE_UD; +} + /** * rvt_create_qp - create a queue pair for a device * @ibpd: the protection domain who's device we create the queue pair for @@ -1050,6 +1060,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, rdi->dparms.node); if (!qp) goto bail_swq; + qp->allowed_ops = get_allowed_ops(init_attr->qp_type); RCU_INIT_POINTER(qp->next, NULL); if (init_attr->qp_type == IB_QPT_RC) { @@ -1205,28 +1216,6 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, ret = &qp->ibqp; - /* - * We have our QP and its good, now keep track of what types of opcodes - * can be processed on this QP. We do this by keeping track of what the - * 3 high order bits of the opcode are. 
- */ - switch (init_attr->qp_type) { - case IB_QPT_SMI: - case IB_QPT_GSI: - case IB_QPT_UD: - qp->allowed_ops = IB_OPCODE_UD; - break; - case IB_QPT_RC: - qp->allowed_ops = IB_OPCODE_RC; - break; - case IB_QPT_UC: - qp->allowed_ops = IB_OPCODE_UC; - break; - default: - ret = ERR_PTR(-EINVAL); - goto bail_ip; - } - return ret; bail_ip: From d310c4bf8aeacc0256091feb6a0337b8fef763ac Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Fri, 28 Jun 2019 14:22:04 -0400 Subject: [PATCH 112/194] IB/{rdmavt, hfi1, qib}: Remove AH refcount for UD QPs Historically rdmavt destroy_ah() has returned an -EBUSY when the AH has a non-zero reference count. IBTA 11.2.2 notes no such return value or error case: Output Modifiers: - Verb results: - Operation completed successfully. - Invalid HCA handle. - Invalid address handle. ULPs never test for this error and this will leak memory. The reference count exists to allow for driver independent progress mechanisms to process UD SWQEs in parallel with post sends. The SWQE will hold a reference count until the UD SWQE completes and then drops the reference. Fix by removing need to reference count the AH. Add a UD specific allocation to each SWQE entry to cache the necessary information for independent progress. Copy the information during the post send processing. Reviewed-by: Mike Marciniszyn Signed-off-by: Mike Marciniszyn Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/qp.c | 4 +-- drivers/infiniband/hw/hfi1/ud.c | 30 ++++++++-------- drivers/infiniband/hw/qib/qib_qp.c | 4 +-- drivers/infiniband/hw/qib/qib_ud.c | 21 +++++------ drivers/infiniband/sw/rdmavt/ah.c | 6 +--- drivers/infiniband/sw/rdmavt/qp.c | 58 ++++++++++++++++++++++++++++-- include/rdma/rdma_vt.h | 3 +- include/rdma/rdmavt_qp.h | 22 ++++++++++-- 8 files changed, 106 insertions(+), 42 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 41261e72c429..a84b44af7b97 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2018 Intel Corporation. + * Copyright(c) 2015 - 2019 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -348,7 +348,7 @@ int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send) break; case IB_QPT_GSI: case IB_QPT_UD: - ah = ibah_to_rvtah(wqe->ud_wr.ah); + ah = ibah_to_rvtah(wqe->ud_wr.wr.ah); if (wqe->length > (1 << ah->log_pmtu)) return -EINVAL; if (ibp->sl_to_sc[rdma_ah_get_sl(&ah->attr)] == 0xf) diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index e16d499cfd1e..f8e796e45517 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2018 Intel Corporation. + * Copyright(c) 2015 - 2019 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -87,7 +87,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) rcu_read_lock(); qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp, - swqe->ud_wr.remote_qpn); + swqe->ud_wr.wr.remote_qpn); if (!qp) { ibp->rvp.n_pkt_drops++; rcu_read_unlock(); @@ -105,7 +105,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) goto drop; } - ah_attr = &ibah_to_rvtah(swqe->ud_wr.ah)->attr; + ah_attr = swqe->ud_wr.attr; ppd = ppd_from_ibp(ibp); if (qp->ibqp.qp_num > 1) { @@ -135,8 +135,8 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (qp->ibqp.qp_num) { u32 qkey; - qkey = (int)swqe->ud_wr.remote_qkey < 0 ? - sqp->qkey : swqe->ud_wr.remote_qkey; + qkey = (int)swqe->ud_wr.wr.remote_qkey < 0 ? + sqp->qkey : swqe->ud_wr.wr.remote_qkey; if (unlikely(qkey != qp->qkey)) goto drop; /* silently drop per IBTA spec */ } @@ -240,7 +240,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) { if (sqp->ibqp.qp_type == IB_QPT_GSI || sqp->ibqp.qp_type == IB_QPT_SMI) - wc.pkey_index = swqe->ud_wr.pkey_index; + wc.pkey_index = swqe->ud_wr.wr.pkey_index; else wc.pkey_index = sqp->s_pkey_index; } else { @@ -282,20 +282,20 @@ static void hfi1_make_bth_deth(struct rvt_qp *qp, struct rvt_swqe *wqe, bth0 |= IB_BTH_SOLICITED; bth0 |= extra_bytes << 20; if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) - *pkey = hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index); + *pkey = hfi1_get_pkey(ibp, wqe->ud_wr.wr.pkey_index); else *pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); if (!bypass) bth0 |= *pkey; ohdr->bth[0] = cpu_to_be32(bth0); - ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.remote_qpn); + ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.wr.remote_qpn); ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn)); /* * Qkeys with the high order bit set mean use the * qkey from the QP context instead of the WR (see 10.2.5). */ - ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ? - qp->qkey : wqe->ud_wr.remote_qkey); + ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.wr.remote_qkey < 0 ? 
+ qp->qkey : wqe->ud_wr.wr.remote_qkey); ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); } @@ -315,7 +315,7 @@ void hfi1_make_ud_req_9B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, ibp = to_iport(qp->ibqp.device, qp->port_num); ppd = ppd_from_ibp(ibp); - ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; + ah_attr = wqe->ud_wr.attr; extra_bytes = -wqe->length & 3; nwords = ((wqe->length + extra_bytes) >> 2) + SIZE_OF_CRC; @@ -379,7 +379,7 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, struct hfi1_pportdata *ppd; struct hfi1_ibport *ibp; u32 dlid, slid, nwords, extra_bytes; - u32 dest_qp = wqe->ud_wr.remote_qpn; + u32 dest_qp = wqe->ud_wr.wr.remote_qpn; u32 src_qp = qp->ibqp.qp_num; u16 len, pkey; u8 l4, sc5; @@ -387,7 +387,7 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, ibp = to_iport(qp->ibqp.device, qp->port_num); ppd = ppd_from_ibp(ibp); - ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; + ah_attr = wqe->ud_wr.attr; /* * Build 16B Management Packet if either the destination @@ -449,7 +449,7 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, if (is_mgmt) { l4 = OPA_16B_L4_FM; - pkey = hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index); + pkey = hfi1_get_pkey(ibp, wqe->ud_wr.wr.pkey_index); hfi1_16B_set_qpn(&ps->s_txreq->phdr.hdr.opah.u.mgmt, dest_qp, src_qp); } else { @@ -514,7 +514,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) /* Construct the header. */ ibp = to_iport(qp->ibqp.device, qp->port_num); ppd = ppd_from_ibp(ibp); - ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; + ah_attr = wqe->ud_wr.attr; priv->hdr_type = hfi1_get_hdr_type(ppd->lid, ah_attr); if ((!hfi1_check_mcast(rdma_ah_get_dlid(ah_attr))) || (rdma_ah_get_dlid(ah_attr) == be32_to_cpu(OPA_LID_PERMISSIVE))) { diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index a81905df2d0f..0e1d0d692891 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012 - 2017 Intel Corporation. All rights reserved. + * Copyright (c) 2012 - 2019 Intel Corporation. All rights reserved. * Copyright (c) 2006 - 2012 QLogic Corporation. * All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * @@ -398,7 +398,7 @@ int qib_check_send_wqe(struct rvt_qp *qp, case IB_QPT_SMI: case IB_QPT_GSI: case IB_QPT_UD: - ah = ibah_to_rvtah(wqe->ud_wr.ah); + ah = ibah_to_rvtah(wqe->ud_wr.wr.ah); if (wqe->length > (1 << ah->log_pmtu)) return -EINVAL; /* progress hint */ diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index 32ad0b635fc6..d8c2c968909f 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2012 - 2019 Intel Corporation. All rights reserved. * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. 
* @@ -63,7 +64,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) enum ib_qp_type sqptype, dqptype; rcu_read_lock(); - qp = rvt_lookup_qpn(rdi, &ibp->rvp, swqe->ud_wr.remote_qpn); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, swqe->ud_wr.wr.remote_qpn); if (!qp) { ibp->rvp.n_pkt_drops++; goto drop; @@ -80,7 +81,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) goto drop; } - ah_attr = &ibah_to_rvtah(swqe->ud_wr.ah)->attr; + ah_attr = swqe->ud_wr.attr; ppd = ppd_from_ibp(ibp); if (qp->ibqp.qp_num > 1) { @@ -110,8 +111,8 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (qp->ibqp.qp_num) { u32 qkey; - qkey = (int)swqe->ud_wr.remote_qkey < 0 ? - sqp->qkey : swqe->ud_wr.remote_qkey; + qkey = (int)swqe->ud_wr.wr.remote_qkey < 0 ? + sqp->qkey : swqe->ud_wr.wr.remote_qkey; if (unlikely(qkey != qp->qkey)) goto drop; } @@ -203,7 +204,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) wc.qp = &qp->ibqp; wc.src_qp = sqp->ibqp.qp_num; wc.pkey_index = qp->ibqp.qp_type == IB_QPT_GSI ? - swqe->ud_wr.pkey_index : 0; + swqe->ud_wr.wr.pkey_index : 0; wc.slid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) & ((1 << ppd->lmc) - 1)); wc.sl = rdma_ah_get_sl(ah_attr); @@ -270,7 +271,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) /* Construct the header. */ ibp = to_iport(qp->ibqp.device, qp->port_num); ppd = ppd_from_ibp(ibp); - ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; + ah_attr = wqe->ud_wr.attr; if (rdma_ah_get_dlid(ah_attr) >= be16_to_cpu(IB_MULTICAST_LID_BASE)) { if (rdma_ah_get_dlid(ah_attr) != be16_to_cpu(IB_LID_PERMISSIVE)) @@ -362,7 +363,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) bth0 |= extra_bytes << 20; bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? QIB_DEFAULT_P_KEY : qib_get_pkey(ibp, qp->ibqp.qp_type == IB_QPT_GSI ? - wqe->ud_wr.pkey_index : qp->s_pkey_index); + wqe->ud_wr.wr.pkey_index : qp->s_pkey_index); ohdr->bth[0] = cpu_to_be32(bth0); /* * Use the multicast QP if the destination LID is a multicast LID. @@ -371,14 +372,14 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) be16_to_cpu(IB_MULTICAST_LID_BASE) && rdma_ah_get_dlid(ah_attr) != be16_to_cpu(IB_LID_PERMISSIVE) ? cpu_to_be32(QIB_MULTICAST_QPN) : - cpu_to_be32(wqe->ud_wr.remote_qpn); + cpu_to_be32(wqe->ud_wr.wr.remote_qpn); ohdr->bth[2] = cpu_to_be32(wqe->psn & QIB_PSN_MASK); /* * Qkeys with the high order bit set mean use the * qkey from the QP context instead of the WR (see 10.2.5). */ - ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ? - qp->qkey : wqe->ud_wr.remote_qkey); + ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.wr.remote_qkey < 0 ? + qp->qkey : wqe->ud_wr.wr.remote_qkey); ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); done: diff --git a/drivers/infiniband/sw/rdmavt/ah.c b/drivers/infiniband/sw/rdmavt/ah.c index 0e147b32cbe9..fe99da0ff060 100644 --- a/drivers/infiniband/sw/rdmavt/ah.c +++ b/drivers/infiniband/sw/rdmavt/ah.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2019 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -119,8 +119,6 @@ int rvt_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr, rdma_copy_ah_attr(&ah->attr, ah_attr); - atomic_set(&ah->refcount, 0); - if (dev->driver_f.notify_new_ah) dev->driver_f.notify_new_ah(ibah->device, ah_attr, ah); @@ -141,8 +139,6 @@ void rvt_destroy_ah(struct ib_ah *ibah, u32 destroy_flags) struct rvt_ah *ah = ibah_to_rvtah(ibah); unsigned long flags; - WARN_ON_ONCE(atomic_read(&ah->refcount)); - spin_lock_irqsave(&dev->n_ahs_lock, flags); dev->n_ahs_allocated--; spin_unlock_irqrestore(&dev->n_ahs_lock, flags); diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index b9035d969057..de7d2edb9781 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -978,6 +978,51 @@ static u8 get_allowed_ops(enum ib_qp_type type) IB_OPCODE_UC : IB_OPCODE_UD; } +/** + * free_ud_wq_attr - Clean up AH attribute cache for UD QPs + * @qp: Valid QP with allowed_ops set + * + * The rvt_swqe data structure being used is a union, so this is + * only valid for UD QPs. + */ +static void free_ud_wq_attr(struct rvt_qp *qp) +{ + struct rvt_swqe *wqe; + int i; + + for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) { + wqe = rvt_get_swqe_ptr(qp, i); + kfree(wqe->ud_wr.attr); + wqe->ud_wr.attr = NULL; + } +} + +/** + * alloc_ud_wq_attr - AH attribute cache for UD QPs + * @qp: Valid QP with allowed_ops set + * @node: Numa node for allocation + * + * The rvt_swqe data structure being used is a union, so this is + * only valid for UD QPs. + */ +static int alloc_ud_wq_attr(struct rvt_qp *qp, int node) +{ + struct rvt_swqe *wqe; + int i; + + for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) { + wqe = rvt_get_swqe_ptr(qp, i); + wqe->ud_wr.attr = kzalloc_node(sizeof(*wqe->ud_wr.attr), + GFP_KERNEL, node); + if (!wqe->ud_wr.attr) { + free_ud_wq_attr(qp); + return -ENOMEM; + } + } + + return 0; +} + /** * rvt_create_qp - create a queue pair for a device * @ibpd: the protection domain who's device we create the queue pair for @@ -1124,6 +1169,11 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, qp->s_max_sge = init_attr->cap.max_send_sge; if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR) qp->s_flags = RVT_S_SIGNAL_REQ_WR; + err = alloc_ud_wq_attr(qp, rdi->dparms.node); + if (err) { + ret = (ERR_PTR(err)); + goto bail_driver_priv; + } err = alloc_qpn(rdi, &rdi->qp_dev->qpn_table, init_attr->qp_type, @@ -1227,6 +1277,7 @@ bail_qpn: bail_rq_wq: rvt_free_rq(&qp->r_rq); + free_ud_wq_attr(qp); bail_driver_priv: rdi->driver_f.qp_priv_free(rdi, qp); @@ -1671,6 +1722,7 @@ int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) kfree(qp->s_ack_queue); rdma_destroy_ah_attr(&qp->remote_ah_attr); rdma_destroy_ah_attr(&qp->alt_ah_attr); + free_ud_wq_attr(qp); vfree(qp->s_wq); kfree(qp); return 0; @@ -2037,10 +2089,10 @@ static int rvt_post_one_wr(struct rvt_qp *qp, */ log_pmtu = qp->log_pmtu; if (qp->allowed_ops == IB_OPCODE_UD) { - struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.ah); + struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.wr.ah); log_pmtu = ah->log_pmtu; - atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount); + rdma_copy_ah_attr(wqe->ud_wr.attr, &ah->attr); } if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) { @@ -2085,7 +2137,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp, bail_inval_free_ref: if (qp->allowed_ops == IB_OPCODE_UD) - atomic_dec(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount); + rdma_destroy_ah_attr(wqe->ud_wr.attr); bail_inval_free: /* release mr holds */ while (j) { diff --git 
a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 997f42678806..525848e227dc 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -2,7 +2,7 @@ #define DEF_RDMA_VT_H /* - * Copyright(c) 2016 - 2018 Intel Corporation. + * Copyright(c) 2016 - 2019 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -202,7 +202,6 @@ struct rvt_pd { struct rvt_ah { struct ib_ah ibah; struct rdma_ah_attr attr; - atomic_t refcount; u8 vl; u8 log_pmtu; }; diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index e4be869c4f21..9531de2fabe2 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -2,7 +2,7 @@ #define DEF_RDMAVT_INCQP_H /* - * Copyright(c) 2016 - 2018 Intel Corporation. + * Copyright(c) 2016 - 2019 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -157,6 +157,22 @@ #define RVT_SEND_RESERVE_USED IB_SEND_RESERVED_START #define RVT_SEND_COMPLETION_ONLY (IB_SEND_RESERVED_START << 1) +/** + * rvt_ud_wr - IB UD work plus AH cache + * @wr: valid IB work request + * @attr: pointer to an allocated AH attribute + * + * Special case the UD WR so we can keep track of the AH attributes. + * + * NOTE: This data structure is stricly ordered wr then attr. I.e the attr + * MUST come after wr. The ib_ud_wr is sized and copied in rvt_post_one_wr. + * The copy assumes that wr is first. + */ +struct rvt_ud_wr { + struct ib_ud_wr wr; + struct rdma_ah_attr *attr; +}; + /* * Send work request queue entry. * The size of the sg_list is determined when the QP is created and stored @@ -165,7 +181,7 @@ struct rvt_swqe { union { struct ib_send_wr wr; /* don't use wr.sg_list */ - struct ib_ud_wr ud_wr; + struct rvt_ud_wr ud_wr; struct ib_reg_wr reg_wr; struct ib_rdma_wr rdma_wr; struct ib_atomic_wr atomic_wr; @@ -700,7 +716,7 @@ static inline void rvt_put_qp_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe) { rvt_put_swqe(wqe); if (qp->allowed_ops == IB_OPCODE_UD) - atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount); + rdma_destroy_ah_attr(wqe->ud_wr.attr); } /** From 2b0ad2da8fd4c32f63d9142f2de43a4d34fdd679 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Fri, 28 Jun 2019 14:22:11 -0400 Subject: [PATCH 113/194] IB/{rdmavt, hfi1, qib}: Add helpers to hide SWQE WR details Add some helper functions to hide struct rvt_swqe details. Reviewed-by: Mike Marciniszyn Signed-off-by: Michael J. 
Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/qp.c | 2 +- drivers/infiniband/hw/hfi1/ud.c | 29 ++++++++--------- drivers/infiniband/hw/qib/qib_qp.c | 2 +- drivers/infiniband/hw/qib/qib_ud.c | 21 +++++++------ drivers/infiniband/sw/rdmavt/qp.c | 2 +- include/rdma/rdmavt_qp.h | 50 ++++++++++++++++++++++++++++++ 6 files changed, 79 insertions(+), 27 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index a84b44af7b97..f8e733aa3bb8 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -348,7 +348,7 @@ int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send) break; case IB_QPT_GSI: case IB_QPT_UD: - ah = ibah_to_rvtah(wqe->ud_wr.wr.ah); + ah = rvt_get_swqe_ah(wqe); if (wqe->length > (1 << ah->log_pmtu)) return -EINVAL; if (ibp->sl_to_sc[rdma_ah_get_sl(&ah->attr)] == 0xf) diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index f8e796e45517..e804af71b629 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -87,7 +87,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) rcu_read_lock(); qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp, - swqe->ud_wr.wr.remote_qpn); + rvt_get_swqe_remote_qpn(swqe)); if (!qp) { ibp->rvp.n_pkt_drops++; rcu_read_unlock(); @@ -105,7 +105,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) goto drop; } - ah_attr = swqe->ud_wr.attr; + ah_attr = rvt_get_swqe_ah_attr(swqe); ppd = ppd_from_ibp(ibp); if (qp->ibqp.qp_num > 1) { @@ -135,8 +135,8 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (qp->ibqp.qp_num) { u32 qkey; - qkey = (int)swqe->ud_wr.wr.remote_qkey < 0 ? - sqp->qkey : swqe->ud_wr.wr.remote_qkey; + qkey = (int)rvt_get_swqe_remote_qkey(swqe) < 0 ? + sqp->qkey : rvt_get_swqe_remote_qkey(swqe); if (unlikely(qkey != qp->qkey)) goto drop; /* silently drop per IBTA spec */ } @@ -240,7 +240,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) { if (sqp->ibqp.qp_type == IB_QPT_GSI || sqp->ibqp.qp_type == IB_QPT_SMI) - wc.pkey_index = swqe->ud_wr.wr.pkey_index; + wc.pkey_index = rvt_get_swqe_pkey_index(swqe); else wc.pkey_index = sqp->s_pkey_index; } else { @@ -282,20 +282,21 @@ static void hfi1_make_bth_deth(struct rvt_qp *qp, struct rvt_swqe *wqe, bth0 |= IB_BTH_SOLICITED; bth0 |= extra_bytes << 20; if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) - *pkey = hfi1_get_pkey(ibp, wqe->ud_wr.wr.pkey_index); + *pkey = hfi1_get_pkey(ibp, rvt_get_swqe_pkey_index(wqe)); else *pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); if (!bypass) bth0 |= *pkey; ohdr->bth[0] = cpu_to_be32(bth0); - ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.wr.remote_qpn); + ohdr->bth[1] = cpu_to_be32(rvt_get_swqe_remote_qpn(wqe)); ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn)); /* * Qkeys with the high order bit set mean use the * qkey from the QP context instead of the WR (see 10.2.5). */ - ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.wr.remote_qkey < 0 ? - qp->qkey : wqe->ud_wr.wr.remote_qkey); + ohdr->u.ud.deth[0] = + cpu_to_be32((int)rvt_get_swqe_remote_qkey(wqe) < 0 ? 
qp->qkey : + rvt_get_swqe_remote_qkey(wqe)); ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); } @@ -315,7 +316,7 @@ void hfi1_make_ud_req_9B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, ibp = to_iport(qp->ibqp.device, qp->port_num); ppd = ppd_from_ibp(ibp); - ah_attr = wqe->ud_wr.attr; + ah_attr = rvt_get_swqe_ah_attr(wqe); extra_bytes = -wqe->length & 3; nwords = ((wqe->length + extra_bytes) >> 2) + SIZE_OF_CRC; @@ -379,7 +380,7 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, struct hfi1_pportdata *ppd; struct hfi1_ibport *ibp; u32 dlid, slid, nwords, extra_bytes; - u32 dest_qp = wqe->ud_wr.wr.remote_qpn; + u32 dest_qp = rvt_get_swqe_remote_qpn(wqe); u32 src_qp = qp->ibqp.qp_num; u16 len, pkey; u8 l4, sc5; @@ -387,7 +388,7 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, ibp = to_iport(qp->ibqp.device, qp->port_num); ppd = ppd_from_ibp(ibp); - ah_attr = wqe->ud_wr.attr; + ah_attr = rvt_get_swqe_ah_attr(wqe); /* * Build 16B Management Packet if either the destination @@ -449,7 +450,7 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, if (is_mgmt) { l4 = OPA_16B_L4_FM; - pkey = hfi1_get_pkey(ibp, wqe->ud_wr.wr.pkey_index); + pkey = hfi1_get_pkey(ibp, rvt_get_swqe_pkey_index(wqe)); hfi1_16B_set_qpn(&ps->s_txreq->phdr.hdr.opah.u.mgmt, dest_qp, src_qp); } else { @@ -514,7 +515,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) /* Construct the header. */ ibp = to_iport(qp->ibqp.device, qp->port_num); ppd = ppd_from_ibp(ibp); - ah_attr = wqe->ud_wr.attr; + ah_attr = rvt_get_swqe_ah_attr(wqe); priv->hdr_type = hfi1_get_hdr_type(ppd->lid, ah_attr); if ((!hfi1_check_mcast(rdma_ah_get_dlid(ah_attr))) || (rdma_ah_get_dlid(ah_attr) == be32_to_cpu(OPA_LID_PERMISSIVE))) { diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index 0e1d0d692891..8d0563ef5be1 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -398,7 +398,7 @@ int qib_check_send_wqe(struct rvt_qp *qp, case IB_QPT_SMI: case IB_QPT_GSI: case IB_QPT_UD: - ah = ibah_to_rvtah(wqe->ud_wr.wr.ah); + ah = rvt_get_swqe_ah(wqe); if (wqe->length > (1 << ah->log_pmtu)) return -EINVAL; /* progress hint */ diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index d8c2c968909f..93ca21347959 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -64,7 +64,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) enum ib_qp_type sqptype, dqptype; rcu_read_lock(); - qp = rvt_lookup_qpn(rdi, &ibp->rvp, swqe->ud_wr.wr.remote_qpn); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, rvt_get_swqe_remote_qpn(swqe)); if (!qp) { ibp->rvp.n_pkt_drops++; goto drop; @@ -81,7 +81,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) goto drop; } - ah_attr = swqe->ud_wr.attr; + ah_attr = rvt_get_swqe_ah_attr(swqe); ppd = ppd_from_ibp(ibp); if (qp->ibqp.qp_num > 1) { @@ -111,8 +111,8 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (qp->ibqp.qp_num) { u32 qkey; - qkey = (int)swqe->ud_wr.wr.remote_qkey < 0 ? - sqp->qkey : swqe->ud_wr.wr.remote_qkey; + qkey = (int)rvt_get_swqe_remote_qkey(swqe) < 0 ? + sqp->qkey : rvt_get_swqe_remote_qkey(swqe); if (unlikely(qkey != qp->qkey)) goto drop; } @@ -204,7 +204,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) wc.qp = &qp->ibqp; wc.src_qp = sqp->ibqp.qp_num; wc.pkey_index = qp->ibqp.qp_type == IB_QPT_GSI ? 
- swqe->ud_wr.wr.pkey_index : 0; + rvt_get_swqe_pkey_index(swqe) : 0; wc.slid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) & ((1 << ppd->lmc) - 1)); wc.sl = rdma_ah_get_sl(ah_attr); @@ -271,7 +271,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) /* Construct the header. */ ibp = to_iport(qp->ibqp.device, qp->port_num); ppd = ppd_from_ibp(ibp); - ah_attr = wqe->ud_wr.attr; + ah_attr = rvt_get_swqe_ah_attr(wqe); if (rdma_ah_get_dlid(ah_attr) >= be16_to_cpu(IB_MULTICAST_LID_BASE)) { if (rdma_ah_get_dlid(ah_attr) != be16_to_cpu(IB_LID_PERMISSIVE)) @@ -363,7 +363,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) bth0 |= extra_bytes << 20; bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? QIB_DEFAULT_P_KEY : qib_get_pkey(ibp, qp->ibqp.qp_type == IB_QPT_GSI ? - wqe->ud_wr.wr.pkey_index : qp->s_pkey_index); + rvt_get_swqe_pkey_index(wqe) : qp->s_pkey_index); ohdr->bth[0] = cpu_to_be32(bth0); /* * Use the multicast QP if the destination LID is a multicast LID. @@ -372,14 +372,15 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) be16_to_cpu(IB_MULTICAST_LID_BASE) && rdma_ah_get_dlid(ah_attr) != be16_to_cpu(IB_LID_PERMISSIVE) ? cpu_to_be32(QIB_MULTICAST_QPN) : - cpu_to_be32(wqe->ud_wr.wr.remote_qpn); + cpu_to_be32(rvt_get_swqe_remote_qpn(wqe)); ohdr->bth[2] = cpu_to_be32(wqe->psn & QIB_PSN_MASK); /* * Qkeys with the high order bit set mean use the * qkey from the QP context instead of the WR (see 10.2.5). */ - ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.wr.remote_qkey < 0 ? - qp->qkey : wqe->ud_wr.wr.remote_qkey); + ohdr->u.ud.deth[0] = + cpu_to_be32((int)rvt_get_swqe_remote_qkey(wqe) < 0 ? qp->qkey : + rvt_get_swqe_remote_qkey(wqe)); ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); done: diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index de7d2edb9781..11b4d3c1efd4 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -2089,7 +2089,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp, */ log_pmtu = qp->log_pmtu; if (qp->allowed_ops == IB_OPCODE_UD) { - struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.wr.ah); + struct rvt_ah *ah = rvt_get_swqe_ah(wqe); log_pmtu = ah->log_pmtu; rdma_copy_ah_attr(wqe->ud_wr.attr, &ah->attr); diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 9531de2fabe2..0eeea520a853 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -219,6 +219,56 @@ struct rvt_krwq { struct rvt_rwqe wq[]; }; +/* + * rvt_get_swqe_ah - Return the pointer to the struct rvt_ah + * @swqe: valid Send WQE + * + */ +static inline struct rvt_ah *rvt_get_swqe_ah(struct rvt_swqe *swqe) +{ + return ibah_to_rvtah(swqe->ud_wr.wr.ah); +} + +/** + * rvt_get_swqe_ah_attr - Return the cached ah attribute information + * @swqe: valid Send WQE + * + */ +static inline struct rdma_ah_attr *rvt_get_swqe_ah_attr(struct rvt_swqe *swqe) +{ + return swqe->ud_wr.attr; +} + +/** + * rvt_get_swqe_remote_qpn - Access the remote QPN value + * @swqe: valid Send WQE + * + */ +static inline u32 rvt_get_swqe_remote_qpn(struct rvt_swqe *swqe) +{ + return swqe->ud_wr.wr.remote_qpn; +} + +/** + * rvt_get_swqe_remote_qkey - Acces the remote qkey value + * @swqe: valid Send WQE + * + */ +static inline u32 rvt_get_swqe_remote_qkey(struct rvt_swqe *swqe) +{ + return swqe->ud_wr.wr.remote_qkey; +} + +/** + * rvt_get_swqe_pkey_index - Access the pkey index + * @swqe: valid Send WQE + * + */ +static inline u16 rvt_get_swqe_pkey_index(struct rvt_swqe *swqe) +{ + return swqe->ud_wr.wr.pkey_index; 
+} + struct rvt_rq { struct rvt_rwq *wq; struct rvt_krwq *kwq; From bf3b1e0ce093ce31d4d91d613f9b09d80a4021cc Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Fri, 28 Jun 2019 14:22:17 -0400 Subject: [PATCH 114/194] IB/hfi1: Reduce excessive aspm inlines Uninline the aspm API since it increases code space for no reason. Move the aspm module param to the new aspm C file. Reviewed-by: Mike Marciniszyn Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/Makefile | 1 + drivers/infiniband/hw/hfi1/aspm.c | 270 ++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/aspm.h | 262 +-------------------------- drivers/infiniband/hw/hfi1/pcie.c | 6 +- 4 files changed, 280 insertions(+), 259 deletions(-) create mode 100644 drivers/infiniband/hw/hfi1/aspm.c diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile index 4044a8c8dbf4..0405d26d0833 100644 --- a/drivers/infiniband/hw/hfi1/Makefile +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o hfi1-y := \ affinity.o \ + aspm.o \ chip.o \ device.o \ driver.o \ diff --git a/drivers/infiniband/hw/hfi1/aspm.c b/drivers/infiniband/hw/hfi1/aspm.c new file mode 100644 index 000000000000..a3c53be4072c --- /dev/null +++ b/drivers/infiniband/hw/hfi1/aspm.c @@ -0,0 +1,270 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2019 Intel Corporation. + * + */ + +#include "aspm.h" + +/* Time after which the timer interrupt will re-enable ASPM */ +#define ASPM_TIMER_MS 1000 +/* Time for which interrupts are ignored after a timer has been scheduled */ +#define ASPM_RESCHED_TIMER_MS (ASPM_TIMER_MS / 2) +/* Two interrupts within this time trigger ASPM disable */ +#define ASPM_TRIGGER_MS 1 +#define ASPM_TRIGGER_NS (ASPM_TRIGGER_MS * 1000 * 1000ull) +#define ASPM_L1_SUPPORTED(reg) \ + ((((reg) & PCI_EXP_LNKCAP_ASPMS) >> 10) & 0x2) + +uint aspm_mode = ASPM_MODE_DISABLED; +module_param_named(aspm, aspm_mode, uint, 0444); +MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic"); + +static bool aspm_hw_l1_supported(struct hfi1_devdata *dd) +{ + struct pci_dev *parent = dd->pcidev->bus->self; + u32 up, dn; + + /* + * If the driver does not have access to the upstream component, + * it cannot support ASPM L1 at all. + */ + if (!parent) + return false; + + pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &dn); + dn = ASPM_L1_SUPPORTED(dn); + + pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &up); + up = ASPM_L1_SUPPORTED(up); + + /* ASPM works on A-step but is reported as not supported */ + return (!!dn || is_ax(dd)) && !!up; +} + +/* Set L1 entrance latency for slower entry to L1 */ +static void aspm_hw_set_l1_ent_latency(struct hfi1_devdata *dd) +{ + u32 l1_ent_lat = 0x4u; + u32 reg32; + + pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, ®32); + reg32 &= ~PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK; + reg32 |= l1_ent_lat << PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT; + pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, reg32); +} + +static void aspm_hw_enable_l1(struct hfi1_devdata *dd) +{ + struct pci_dev *parent = dd->pcidev->bus->self; + + /* + * If the driver does not have access to the upstream component, + * it cannot support ASPM L1 at all. 
+ */ + if (!parent) + return; + + /* Enable ASPM L1 first in upstream component and then downstream */ + pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, + PCI_EXP_LNKCTL_ASPM_L1); + pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, + PCI_EXP_LNKCTL_ASPM_L1); +} + +void aspm_hw_disable_l1(struct hfi1_devdata *dd) +{ + struct pci_dev *parent = dd->pcidev->bus->self; + + /* Disable ASPM L1 first in downstream component and then upstream */ + pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, 0x0); + if (parent) + pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, 0x0); +} + +static void aspm_enable(struct hfi1_devdata *dd) +{ + if (dd->aspm_enabled || aspm_mode == ASPM_MODE_DISABLED || + !dd->aspm_supported) + return; + + aspm_hw_enable_l1(dd); + dd->aspm_enabled = true; +} + +static void aspm_disable(struct hfi1_devdata *dd) +{ + if (!dd->aspm_enabled || aspm_mode == ASPM_MODE_ENABLED) + return; + + aspm_hw_disable_l1(dd); + dd->aspm_enabled = false; +} + +static void aspm_disable_inc(struct hfi1_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->aspm_lock, flags); + aspm_disable(dd); + atomic_inc(&dd->aspm_disabled_cnt); + spin_unlock_irqrestore(&dd->aspm_lock, flags); +} + +static void aspm_enable_dec(struct hfi1_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->aspm_lock, flags); + if (atomic_dec_and_test(&dd->aspm_disabled_cnt)) + aspm_enable(dd); + spin_unlock_irqrestore(&dd->aspm_lock, flags); +} + +/* ASPM processing for each receive context interrupt */ +void __aspm_ctx_disable(struct hfi1_ctxtdata *rcd) +{ + bool restart_timer; + bool close_interrupts; + unsigned long flags; + ktime_t now, prev; + + spin_lock_irqsave(&rcd->aspm_lock, flags); + /* PSM contexts are open */ + if (!rcd->aspm_intr_enable) + goto unlock; + + prev = rcd->aspm_ts_last_intr; + now = ktime_get(); + rcd->aspm_ts_last_intr = now; + + /* An interrupt pair close together in time */ + close_interrupts = ktime_to_ns(ktime_sub(now, prev)) < ASPM_TRIGGER_NS; + + /* Don't push out our timer till this much time has elapsed */ + restart_timer = ktime_to_ns(ktime_sub(now, rcd->aspm_ts_timer_sched)) > + ASPM_RESCHED_TIMER_MS * NSEC_PER_MSEC; + restart_timer = restart_timer && close_interrupts; + + /* Disable ASPM and schedule timer */ + if (rcd->aspm_enabled && close_interrupts) { + aspm_disable_inc(rcd->dd); + rcd->aspm_enabled = false; + restart_timer = true; + } + + if (restart_timer) { + mod_timer(&rcd->aspm_timer, + jiffies + msecs_to_jiffies(ASPM_TIMER_MS)); + rcd->aspm_ts_timer_sched = now; + } +unlock: + spin_unlock_irqrestore(&rcd->aspm_lock, flags); +} + +/* Timer function for re-enabling ASPM in the absence of interrupt activity */ +static void aspm_ctx_timer_function(struct timer_list *t) +{ + struct hfi1_ctxtdata *rcd = from_timer(rcd, t, aspm_timer); + unsigned long flags; + + spin_lock_irqsave(&rcd->aspm_lock, flags); + aspm_enable_dec(rcd->dd); + rcd->aspm_enabled = true; + spin_unlock_irqrestore(&rcd->aspm_lock, flags); +} + +/* + * Disable interrupt processing for verbs contexts when PSM or VNIC contexts + * are open. 
+ */ +void aspm_disable_all(struct hfi1_devdata *dd) +{ + struct hfi1_ctxtdata *rcd; + unsigned long flags; + u16 i; + + for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (rcd) { + del_timer_sync(&rcd->aspm_timer); + spin_lock_irqsave(&rcd->aspm_lock, flags); + rcd->aspm_intr_enable = false; + spin_unlock_irqrestore(&rcd->aspm_lock, flags); + hfi1_rcd_put(rcd); + } + } + + aspm_disable(dd); + atomic_set(&dd->aspm_disabled_cnt, 0); +} + +/* Re-enable interrupt processing for verbs contexts */ +void aspm_enable_all(struct hfi1_devdata *dd) +{ + struct hfi1_ctxtdata *rcd; + unsigned long flags; + u16 i; + + aspm_enable(dd); + + if (aspm_mode != ASPM_MODE_DYNAMIC) + return; + + for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (rcd) { + spin_lock_irqsave(&rcd->aspm_lock, flags); + rcd->aspm_intr_enable = true; + rcd->aspm_enabled = true; + spin_unlock_irqrestore(&rcd->aspm_lock, flags); + hfi1_rcd_put(rcd); + } + } +} + +static void aspm_ctx_init(struct hfi1_ctxtdata *rcd) +{ + spin_lock_init(&rcd->aspm_lock); + timer_setup(&rcd->aspm_timer, aspm_ctx_timer_function, 0); + rcd->aspm_intr_supported = rcd->dd->aspm_supported && + aspm_mode == ASPM_MODE_DYNAMIC && + rcd->ctxt < rcd->dd->first_dyn_alloc_ctxt; +} + +void aspm_init(struct hfi1_devdata *dd) +{ + struct hfi1_ctxtdata *rcd; + u16 i; + + spin_lock_init(&dd->aspm_lock); + dd->aspm_supported = aspm_hw_l1_supported(dd); + + for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (rcd) + aspm_ctx_init(rcd); + hfi1_rcd_put(rcd); + } + + /* Start with ASPM disabled */ + aspm_hw_set_l1_ent_latency(dd); + dd->aspm_enabled = false; + aspm_hw_disable_l1(dd); + + /* Now turn on ASPM if configured */ + aspm_enable_all(dd); +} + +void aspm_exit(struct hfi1_devdata *dd) +{ + aspm_disable_all(dd); + + /* Turn on ASPM on exit to conserve power */ + aspm_enable(dd); +} + diff --git a/drivers/infiniband/hw/hfi1/aspm.h b/drivers/infiniband/hw/hfi1/aspm.h index e8133870ee87..75d5d18da3da 100644 --- a/drivers/infiniband/hw/hfi1/aspm.h +++ b/drivers/infiniband/hw/hfi1/aspm.h @@ -57,266 +57,20 @@ enum aspm_mode { ASPM_MODE_DYNAMIC = 2, /* ASPM enabled/disabled dynamically */ }; -/* Time after which the timer interrupt will re-enable ASPM */ -#define ASPM_TIMER_MS 1000 -/* Time for which interrupts are ignored after a timer has been scheduled */ -#define ASPM_RESCHED_TIMER_MS (ASPM_TIMER_MS / 2) -/* Two interrupts within this time trigger ASPM disable */ -#define ASPM_TRIGGER_MS 1 -#define ASPM_TRIGGER_NS (ASPM_TRIGGER_MS * 1000 * 1000ull) -#define ASPM_L1_SUPPORTED(reg) \ - (((reg & PCI_EXP_LNKCAP_ASPMS) >> 10) & 0x2) +void aspm_init(struct hfi1_devdata *dd); +void aspm_exit(struct hfi1_devdata *dd); +void aspm_hw_disable_l1(struct hfi1_devdata *dd); +void __aspm_ctx_disable(struct hfi1_ctxtdata *rcd); +void aspm_disable_all(struct hfi1_devdata *dd); +void aspm_enable_all(struct hfi1_devdata *dd); -static inline bool aspm_hw_l1_supported(struct hfi1_devdata *dd) -{ - struct pci_dev *parent = dd->pcidev->bus->self; - u32 up, dn; - - /* - * If the driver does not have access to the upstream component, - * it cannot support ASPM L1 at all. 
- */ - if (!parent) - return false; - - pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &dn); - dn = ASPM_L1_SUPPORTED(dn); - - pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &up); - up = ASPM_L1_SUPPORTED(up); - - /* ASPM works on A-step but is reported as not supported */ - return (!!dn || is_ax(dd)) && !!up; -} - -/* Set L1 entrance latency for slower entry to L1 */ -static inline void aspm_hw_set_l1_ent_latency(struct hfi1_devdata *dd) -{ - u32 l1_ent_lat = 0x4u; - u32 reg32; - - pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, ®32); - reg32 &= ~PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK; - reg32 |= l1_ent_lat << PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT; - pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, reg32); -} - -static inline void aspm_hw_enable_l1(struct hfi1_devdata *dd) -{ - struct pci_dev *parent = dd->pcidev->bus->self; - - /* - * If the driver does not have access to the upstream component, - * it cannot support ASPM L1 at all. - */ - if (!parent) - return; - - /* Enable ASPM L1 first in upstream component and then downstream */ - pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL, - PCI_EXP_LNKCTL_ASPMC, - PCI_EXP_LNKCTL_ASPM_L1); - pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL, - PCI_EXP_LNKCTL_ASPMC, - PCI_EXP_LNKCTL_ASPM_L1); -} - -static inline void aspm_hw_disable_l1(struct hfi1_devdata *dd) -{ - struct pci_dev *parent = dd->pcidev->bus->self; - - /* Disable ASPM L1 first in downstream component and then upstream */ - pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL, - PCI_EXP_LNKCTL_ASPMC, 0x0); - if (parent) - pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL, - PCI_EXP_LNKCTL_ASPMC, 0x0); -} - -static inline void aspm_enable(struct hfi1_devdata *dd) -{ - if (dd->aspm_enabled || aspm_mode == ASPM_MODE_DISABLED || - !dd->aspm_supported) - return; - - aspm_hw_enable_l1(dd); - dd->aspm_enabled = true; -} - -static inline void aspm_disable(struct hfi1_devdata *dd) -{ - if (!dd->aspm_enabled || aspm_mode == ASPM_MODE_ENABLED) - return; - - aspm_hw_disable_l1(dd); - dd->aspm_enabled = false; -} - -static inline void aspm_disable_inc(struct hfi1_devdata *dd) -{ - unsigned long flags; - - spin_lock_irqsave(&dd->aspm_lock, flags); - aspm_disable(dd); - atomic_inc(&dd->aspm_disabled_cnt); - spin_unlock_irqrestore(&dd->aspm_lock, flags); -} - -static inline void aspm_enable_dec(struct hfi1_devdata *dd) -{ - unsigned long flags; - - spin_lock_irqsave(&dd->aspm_lock, flags); - if (atomic_dec_and_test(&dd->aspm_disabled_cnt)) - aspm_enable(dd); - spin_unlock_irqrestore(&dd->aspm_lock, flags); -} - -/* ASPM processing for each receive context interrupt */ static inline void aspm_ctx_disable(struct hfi1_ctxtdata *rcd) { - bool restart_timer; - bool close_interrupts; - unsigned long flags; - ktime_t now, prev; - /* Quickest exit for minimum impact */ - if (!rcd->aspm_intr_supported) + if (likely(!rcd->aspm_intr_supported)) return; - spin_lock_irqsave(&rcd->aspm_lock, flags); - /* PSM contexts are open */ - if (!rcd->aspm_intr_enable) - goto unlock; - - prev = rcd->aspm_ts_last_intr; - now = ktime_get(); - rcd->aspm_ts_last_intr = now; - - /* An interrupt pair close together in time */ - close_interrupts = ktime_to_ns(ktime_sub(now, prev)) < ASPM_TRIGGER_NS; - - /* Don't push out our timer till this much time has elapsed */ - restart_timer = ktime_to_ns(ktime_sub(now, rcd->aspm_ts_timer_sched)) > - ASPM_RESCHED_TIMER_MS * NSEC_PER_MSEC; - restart_timer = restart_timer && close_interrupts; - - /* Disable ASPM and schedule timer */ 
- if (rcd->aspm_enabled && close_interrupts) { - aspm_disable_inc(rcd->dd); - rcd->aspm_enabled = false; - restart_timer = true; - } - - if (restart_timer) { - mod_timer(&rcd->aspm_timer, - jiffies + msecs_to_jiffies(ASPM_TIMER_MS)); - rcd->aspm_ts_timer_sched = now; - } -unlock: - spin_unlock_irqrestore(&rcd->aspm_lock, flags); -} - -/* Timer function for re-enabling ASPM in the absence of interrupt activity */ -static inline void aspm_ctx_timer_function(struct timer_list *t) -{ - struct hfi1_ctxtdata *rcd = from_timer(rcd, t, aspm_timer); - unsigned long flags; - - spin_lock_irqsave(&rcd->aspm_lock, flags); - aspm_enable_dec(rcd->dd); - rcd->aspm_enabled = true; - spin_unlock_irqrestore(&rcd->aspm_lock, flags); -} - -/* - * Disable interrupt processing for verbs contexts when PSM or VNIC contexts - * are open. - */ -static inline void aspm_disable_all(struct hfi1_devdata *dd) -{ - struct hfi1_ctxtdata *rcd; - unsigned long flags; - u16 i; - - for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) { - rcd = hfi1_rcd_get_by_index(dd, i); - if (rcd) { - del_timer_sync(&rcd->aspm_timer); - spin_lock_irqsave(&rcd->aspm_lock, flags); - rcd->aspm_intr_enable = false; - spin_unlock_irqrestore(&rcd->aspm_lock, flags); - hfi1_rcd_put(rcd); - } - } - - aspm_disable(dd); - atomic_set(&dd->aspm_disabled_cnt, 0); -} - -/* Re-enable interrupt processing for verbs contexts */ -static inline void aspm_enable_all(struct hfi1_devdata *dd) -{ - struct hfi1_ctxtdata *rcd; - unsigned long flags; - u16 i; - - aspm_enable(dd); - - if (aspm_mode != ASPM_MODE_DYNAMIC) - return; - - for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) { - rcd = hfi1_rcd_get_by_index(dd, i); - if (rcd) { - spin_lock_irqsave(&rcd->aspm_lock, flags); - rcd->aspm_intr_enable = true; - rcd->aspm_enabled = true; - spin_unlock_irqrestore(&rcd->aspm_lock, flags); - hfi1_rcd_put(rcd); - } - } -} - -static inline void aspm_ctx_init(struct hfi1_ctxtdata *rcd) -{ - spin_lock_init(&rcd->aspm_lock); - timer_setup(&rcd->aspm_timer, aspm_ctx_timer_function, 0); - rcd->aspm_intr_supported = rcd->dd->aspm_supported && - aspm_mode == ASPM_MODE_DYNAMIC && - rcd->ctxt < rcd->dd->first_dyn_alloc_ctxt; -} - -static inline void aspm_init(struct hfi1_devdata *dd) -{ - struct hfi1_ctxtdata *rcd; - u16 i; - - spin_lock_init(&dd->aspm_lock); - dd->aspm_supported = aspm_hw_l1_supported(dd); - - for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) { - rcd = hfi1_rcd_get_by_index(dd, i); - if (rcd) - aspm_ctx_init(rcd); - hfi1_rcd_put(rcd); - } - - /* Start with ASPM disabled */ - aspm_hw_set_l1_ent_latency(dd); - dd->aspm_enabled = false; - aspm_hw_disable_l1(dd); - - /* Now turn on ASPM if configured */ - aspm_enable_all(dd); -} - -static inline void aspm_exit(struct hfi1_devdata *dd) -{ - aspm_disable_all(dd); - - /* Turn on ASPM on exit to conserve power */ - aspm_enable(dd); + __aspm_ctx_disable(rcd); } #endif /* _ASPM_H */ diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index c96d193bb236..61aa5504d7c3 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2018 Intel Corporation. + * Copyright(c) 2015 - 2019 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -450,10 +450,6 @@ static int hfi1_pcie_caps; module_param_named(pcie_caps, hfi1_pcie_caps, int, 0444); MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)"); -uint aspm_mode = ASPM_MODE_DISABLED; -module_param_named(aspm, aspm_mode, uint, 0444); -MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic"); - /** * tune_pcie_caps() - Code to adjust PCIe capabilities. * @dd: Valid device data structure From aa9b79ec37789d7cc1ca1339369b75dc5fec02dd Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 28 Jun 2019 14:22:23 -0400 Subject: [PATCH 115/194] IB/hfi1: Add missing INVALIDATE opcodes for trace This was missed in the original implementation of the memory management extensions. Fixes: 0db3dfa03c08 ("IB/hfi1: Work request processing for fast register mr and invalidate") Reviewed-by: Michael J. Ruhl Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/trace_ibhdrs.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h index d1372cc66de6..2f84290a88ca 100644 --- a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h +++ b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h @@ -79,6 +79,8 @@ __print_symbolic(opcode, \ ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \ ib_opcode_name(RC_COMPARE_SWAP), \ ib_opcode_name(RC_FETCH_ADD), \ + ib_opcode_name(RC_SEND_LAST_WITH_INVALIDATE), \ + ib_opcode_name(RC_SEND_ONLY_WITH_INVALIDATE), \ ib_opcode_name(TID_RDMA_WRITE_REQ), \ ib_opcode_name(TID_RDMA_WRITE_RESP), \ ib_opcode_name(TID_RDMA_WRITE_DATA), \ From 315aed110c16ac806f25fd85eff8b34579ed101d Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 28 Jun 2019 14:22:33 -0400 Subject: [PATCH 116/194] IB/rdmavt: Enhance trace information for FRWR debug This patch enhances the MR trace information to enable more focused debug of MR issues. 
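A note on the mr.c hunk that follows: besides widening the trace event, it moves trace_rvt_mr_page_seg() to after the mr->mr.length update, presumably so the length reported by the enhanced tracepoint already includes the segment being recorded. A minimal sketch of the resulting ordering in rvt_set_page(), using only the lines visible in the hunk (the derivation of m, n and ps and the bounds checking are elided and assumed from the surrounding code):

	/* sketch: segment bookkeeping in rvt_set_page() after this patch */
	mr->mr.map[m]->segs[n].vaddr = (void *)addr;
	mr->mr.map[m]->segs[n].length = ps;
	mr->mr.length += ps;
	/* mr.length now covers this segment when the tracepoint records it */
	trace_rvt_mr_page_seg(&mr->mr, m, n, (void *)addr, ps);
	return 0;
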
Reviewed-by: Kaike Wan Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/mr.c | 2 +- drivers/infiniband/sw/rdmavt/trace_mr.h | 20 +++++++++++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 0867a11d074e..23ddc63cae61 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -612,8 +612,8 @@ static int rvt_set_page(struct ib_mr *ibmr, u64 addr) n = mapped_segs % RVT_SEGSZ; mr->mr.map[m]->segs[n].vaddr = (void *)addr; mr->mr.map[m]->segs[n].length = ps; - trace_rvt_mr_page_seg(&mr->mr, m, n, (void *)addr, ps); mr->mr.length += ps; + trace_rvt_mr_page_seg(&mr->mr, m, n, (void *)addr, ps); return 0; } diff --git a/drivers/infiniband/sw/rdmavt/trace_mr.h b/drivers/infiniband/sw/rdmavt/trace_mr.h index 976e482930a3..f43e477a8c91 100644 --- a/drivers/infiniband/sw/rdmavt/trace_mr.h +++ b/drivers/infiniband/sw/rdmavt/trace_mr.h @@ -64,8 +64,12 @@ DECLARE_EVENT_CLASS( RDI_DEV_ENTRY(ib_to_rvt(mr->pd->device)) __field(void *, vaddr) __field(struct page *, page) + __field(u64, iova) + __field(u64, user_base) __field(size_t, len) + __field(size_t, length) __field(u32, lkey) + __field(u32, offset) __field(u16, m) __field(u16, n) ), @@ -73,18 +77,28 @@ DECLARE_EVENT_CLASS( RDI_DEV_ASSIGN(ib_to_rvt(mr->pd->device)); __entry->vaddr = v; __entry->page = virt_to_page(v); + __entry->iova = mr->iova; + __entry->user_base = mr->user_base; + __entry->lkey = mr->lkey; __entry->m = m; __entry->n = n; __entry->len = len; + __entry->length = mr->length; + __entry->offset = mr->offset; ), TP_printk( - "[%s] vaddr %p page %p m %u n %u len %ld", + "[%s] lkey %x iova %llx user_base %llx mr_len %lu vaddr %llx page %p m %u n %u len %lu off %u", __get_str(dev), - __entry->vaddr, + __entry->lkey, + __entry->iova, + __entry->user_base, + __entry->length, + (unsigned long long)__entry->vaddr, __entry->page, __entry->m, __entry->n, - __entry->len + __entry->len, + __entry->offset ) ); From 8bd516bd0d53f71594340dc644f6fbc4278a8ab1 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 28 Jun 2019 14:22:39 -0400 Subject: [PATCH 117/194] IB/rdmavt: Add trace for map_mr_sg Add trace to debug map_mr_sg handling. 
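The tracepoint added below fires at the end of rvt_map_mr_sg(), i.e. on the generic ib_map_mr_sg() fast-registration path used by upper-layer protocols. A hedged caller sketch for context (the function and variable names here are illustrative only, not part of this series; ib_map_mr_sg() is the core verbs entry point that dispatches to the driver's ->map_mr_sg()):

	/*
	 * Illustrative ULP-side mapping of an SG list onto an MR.  With this
	 * patch applied, rdmavt emits rvt_map_mr_sg with the MR's iova,
	 * user_base and length plus the sg_nents/sg_offset seen here.
	 */
	static int example_map_mr(struct ib_mr *mr, struct scatterlist *sgl,
				  int nents)
	{
		int n = ib_map_mr_sg(mr, sgl, nents, NULL, PAGE_SIZE);

		if (n != nents)
			return n < 0 ? n : -EINVAL;
		/* mr->iova and mr->length are now valid for a REG_MR WR */
		return 0;
	}
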
Reviewed-by: Kaike Wan Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/mr.c | 1 + drivers/infiniband/sw/rdmavt/trace_mr.h | 36 +++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 23ddc63cae61..a6a39f01dca3 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -642,6 +642,7 @@ int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, mr->mr.iova = ibmr->iova; mr->mr.offset = ibmr->iova - (u64)mr->mr.map[0]->segs[0].vaddr; mr->mr.length = (size_t)ibmr->length; + trace_rvt_map_mr_sg(ibmr, sg_nents, sg_offset); return ret; } diff --git a/drivers/infiniband/sw/rdmavt/trace_mr.h b/drivers/infiniband/sw/rdmavt/trace_mr.h index f43e477a8c91..95b8a0e3b8bd 100644 --- a/drivers/infiniband/sw/rdmavt/trace_mr.h +++ b/drivers/infiniband/sw/rdmavt/trace_mr.h @@ -54,6 +54,8 @@ #include #include +#include "mr.h" + #undef TRACE_SYSTEM #define TRACE_SYSTEM rvt_mr DECLARE_EVENT_CLASS( @@ -179,6 +181,40 @@ DEFINE_EVENT( TP_PROTO(struct rvt_sge *sge, struct ib_sge *isge), TP_ARGS(sge, isge)); +TRACE_EVENT( + rvt_map_mr_sg, + TP_PROTO(struct ib_mr *ibmr, int sg_nents, unsigned int *sg_offset), + TP_ARGS(ibmr, sg_nents, sg_offset), + TP_STRUCT__entry( + RDI_DEV_ENTRY(ib_to_rvt(to_imr(ibmr)->mr.pd->device)) + __field(u64, iova) + __field(u64, ibmr_iova) + __field(u64, user_base) + __field(u64, ibmr_length) + __field(int, sg_nents) + __field(uint, sg_offset) + ), + TP_fast_assign( + RDI_DEV_ASSIGN(ib_to_rvt(to_imr(ibmr)->mr.pd->device)) + __entry->ibmr_iova = ibmr->iova; + __entry->iova = to_imr(ibmr)->mr.iova; + __entry->user_base = to_imr(ibmr)->mr.user_base; + __entry->ibmr_length = to_imr(ibmr)->mr.length; + __entry->sg_nents = sg_nents; + __entry->sg_offset = sg_offset ? *sg_offset : 0; + ), + TP_printk( + "[%s] ibmr_iova %llx iova %llx user_base %llx length %llx sg_nents %d sg_offset %u", + __get_str(dev), + __entry->ibmr_iova, + __entry->iova, + __entry->user_base, + __entry->ibmr_length, + __entry->sg_nents, + __entry->sg_offset + ) +); + #endif /* __RVT_TRACE_MR_H */ #undef TRACE_INCLUDE_PATH From 09fbca8e6240e945c663af1ac2c5d5ef1456bad7 Mon Sep 17 00:00:00 2001 From: Dennis Dalessandro Date: Fri, 28 Jun 2019 14:22:46 -0400 Subject: [PATCH 118/194] IB/hfi1: No need to use try_module_get for debugfs The call in debugfs.c for try_module_get() is not needed. A reference to the module will be taken by the VFS layer as long as the owner field is set in the file ops struct. So set this as well as remove the call. 
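The rule relied on above is the usual VFS/module ownership contract: once .owner = THIS_MODULE is set in the file_operations, the VFS pins the module for the lifetime of the open file, so a manual try_module_get() in the open handler is redundant. A generic sketch of the pattern (this is not the hfi1 macro changed below; the names here are illustrative):

	#include <linux/fs.h>
	#include <linux/module.h>

	static const char example_msg[] = "hello\n";

	static ssize_t example_read(struct file *fp, char __user *buf,
				    size_t count, loff_t *ppos)
	{
		return simple_read_from_buffer(buf, count, ppos, example_msg,
					       sizeof(example_msg) - 1);
	}

	static const struct file_operations example_fops = {
		.owner	= THIS_MODULE,	/* VFS takes the module reference */
		.open	= simple_open,
		.read	= example_read,
		.llseek	= generic_file_llseek,
	};
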
Suggested-by: Jason Gunthorpe Reviewed-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/debugfs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c index 15efb4a380b2..d268bf9c42ee 100644 --- a/drivers/infiniband/hw/hfi1/debugfs.c +++ b/drivers/infiniband/hw/hfi1/debugfs.c @@ -987,9 +987,6 @@ static int __i2c_debugfs_open(struct inode *in, struct file *fp, u32 target) struct hfi1_pportdata *ppd; int ret; - if (!try_module_get(THIS_MODULE)) - return -ENODEV; - ppd = private2ppd(fp); ret = acquire_chip_resource(ppd->dd, i2c_target(target), 0); @@ -1155,6 +1152,7 @@ static int exprom_wp_debugfs_release(struct inode *in, struct file *fp) { \ .name = nm, \ .ops = { \ + .owner = THIS_MODULE, \ .read = readroutine, \ .write = writeroutine, \ .llseek = generic_file_llseek, \ @@ -1165,6 +1163,7 @@ static int exprom_wp_debugfs_release(struct inode *in, struct file *fp) { \ .name = nm, \ .ops = { \ + .owner = THIS_MODULE, \ .read = readf, \ .write = writef, \ .llseek = generic_file_llseek, \ From 0e935ae6afcdbe6f0c0aa457ae57feccc63bb9be Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 20 Jun 2019 18:21:23 +0200 Subject: [PATCH 119/194] rdma/siw: iWarp wire packet format Broken up commit to add the Soft iWarp RDMA driver. Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/iwarp.h | 380 ++++++++++++++++++++++++++++++ 1 file changed, 380 insertions(+) create mode 100644 drivers/infiniband/sw/siw/iwarp.h diff --git a/drivers/infiniband/sw/siw/iwarp.h b/drivers/infiniband/sw/siw/iwarp.h new file mode 100644 index 000000000000..e8a04d9c89cb --- /dev/null +++ b/drivers/infiniband/sw/siw/iwarp.h @@ -0,0 +1,380 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#ifndef _IWARP_H +#define _IWARP_H + +#include /* RDMA_MAX_PRIVATE_DATA */ +#include +#include + +#define RDMAP_VERSION 1 +#define DDP_VERSION 1 +#define MPA_REVISION_1 1 +#define MPA_REVISION_2 2 +#define MPA_MAX_PRIVDATA RDMA_MAX_PRIVATE_DATA +#define MPA_KEY_REQ "MPA ID Req Frame" +#define MPA_KEY_REP "MPA ID Rep Frame" +#define MPA_IRD_ORD_MASK 0x3fff + +struct mpa_rr_params { + __be16 bits; + __be16 pd_len; +}; + +/* + * MPA request/response header bits & fields + */ +enum { + MPA_RR_FLAG_MARKERS = cpu_to_be16(0x8000), + MPA_RR_FLAG_CRC = cpu_to_be16(0x4000), + MPA_RR_FLAG_REJECT = cpu_to_be16(0x2000), + MPA_RR_FLAG_ENHANCED = cpu_to_be16(0x1000), + MPA_RR_FLAG_GSO_EXP = cpu_to_be16(0x0800), + MPA_RR_MASK_REVISION = cpu_to_be16(0x00ff) +}; + +/* + * MPA request/reply header + */ +struct mpa_rr { + __u8 key[16]; + struct mpa_rr_params params; +}; + +static inline void __mpa_rr_set_revision(__be16 *bits, u8 rev) +{ + *bits = (*bits & ~MPA_RR_MASK_REVISION) | + (cpu_to_be16(rev) & MPA_RR_MASK_REVISION); +} + +static inline u8 __mpa_rr_revision(__be16 mpa_rr_bits) +{ + __be16 rev = mpa_rr_bits & MPA_RR_MASK_REVISION; + + return be16_to_cpu(rev); +} + +enum mpa_v2_ctrl { + MPA_V2_PEER_TO_PEER = cpu_to_be16(0x8000), + MPA_V2_ZERO_LENGTH_RTR = cpu_to_be16(0x4000), + MPA_V2_RDMA_WRITE_RTR = cpu_to_be16(0x8000), + MPA_V2_RDMA_READ_RTR = cpu_to_be16(0x4000), + MPA_V2_RDMA_NO_RTR = cpu_to_be16(0x0000), + MPA_V2_MASK_IRD_ORD = cpu_to_be16(0x3fff) +}; + +struct mpa_v2_data { + __be16 ird; + __be16 ord; +}; + +struct mpa_marker { + __be16 rsvd; + __be16 
fpdu_hmd; /* FPDU header-marker distance (= MPA's FPDUPTR) */ +}; + +/* + * maximum MPA trailer + */ +struct mpa_trailer { + __u8 pad[4]; + __be32 crc; +}; + +#define MPA_HDR_SIZE 2 +#define MPA_CRC_SIZE 4 + +/* + * Common portion of iWARP headers (MPA, DDP, RDMAP) + * for any FPDU + */ +struct iwarp_ctrl { + __be16 mpa_len; + __be16 ddp_rdmap_ctrl; +}; + +/* + * DDP/RDMAP Hdr bits & fields + */ +enum { + DDP_FLAG_TAGGED = cpu_to_be16(0x8000), + DDP_FLAG_LAST = cpu_to_be16(0x4000), + DDP_MASK_RESERVED = cpu_to_be16(0x3C00), + DDP_MASK_VERSION = cpu_to_be16(0x0300), + RDMAP_MASK_VERSION = cpu_to_be16(0x00C0), + RDMAP_MASK_RESERVED = cpu_to_be16(0x0030), + RDMAP_MASK_OPCODE = cpu_to_be16(0x000f) +}; + +static inline u8 __ddp_get_version(struct iwarp_ctrl *ctrl) +{ + return be16_to_cpu(ctrl->ddp_rdmap_ctrl & DDP_MASK_VERSION) >> 8; +} + +static inline void __ddp_set_version(struct iwarp_ctrl *ctrl, u8 version) +{ + ctrl->ddp_rdmap_ctrl = + (ctrl->ddp_rdmap_ctrl & ~DDP_MASK_VERSION) | + (cpu_to_be16((u16)version << 8) & DDP_MASK_VERSION); +} + +static inline u8 __rdmap_get_version(struct iwarp_ctrl *ctrl) +{ + __be16 ver = ctrl->ddp_rdmap_ctrl & RDMAP_MASK_VERSION; + + return be16_to_cpu(ver) >> 6; +} + +static inline void __rdmap_set_version(struct iwarp_ctrl *ctrl, u8 version) +{ + ctrl->ddp_rdmap_ctrl = (ctrl->ddp_rdmap_ctrl & ~RDMAP_MASK_VERSION) | + (cpu_to_be16(version << 6) & RDMAP_MASK_VERSION); +} + +static inline u8 __rdmap_get_opcode(struct iwarp_ctrl *ctrl) +{ + return be16_to_cpu(ctrl->ddp_rdmap_ctrl & RDMAP_MASK_OPCODE); +} + +static inline void __rdmap_set_opcode(struct iwarp_ctrl *ctrl, u8 opcode) +{ + ctrl->ddp_rdmap_ctrl = (ctrl->ddp_rdmap_ctrl & ~RDMAP_MASK_OPCODE) | + (cpu_to_be16(opcode) & RDMAP_MASK_OPCODE); +} + +struct iwarp_rdma_write { + struct iwarp_ctrl ctrl; + __be32 sink_stag; + __be64 sink_to; +}; + +struct iwarp_rdma_rreq { + struct iwarp_ctrl ctrl; + __be32 rsvd; + __be32 ddp_qn; + __be32 ddp_msn; + __be32 ddp_mo; + __be32 sink_stag; + __be64 sink_to; + __be32 read_size; + __be32 source_stag; + __be64 source_to; +}; + +struct iwarp_rdma_rresp { + struct iwarp_ctrl ctrl; + __be32 sink_stag; + __be64 sink_to; +}; + +struct iwarp_send { + struct iwarp_ctrl ctrl; + __be32 rsvd; + __be32 ddp_qn; + __be32 ddp_msn; + __be32 ddp_mo; +}; + +struct iwarp_send_inv { + struct iwarp_ctrl ctrl; + __be32 inval_stag; + __be32 ddp_qn; + __be32 ddp_msn; + __be32 ddp_mo; +}; + +struct iwarp_terminate { + struct iwarp_ctrl ctrl; + __be32 rsvd; + __be32 ddp_qn; + __be32 ddp_msn; + __be32 ddp_mo; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __be32 layer : 4; + __be32 etype : 4; + __be32 ecode : 8; + __be32 flag_m : 1; + __be32 flag_d : 1; + __be32 flag_r : 1; + __be32 reserved : 13; +#elif defined(__BIG_ENDIAN_BITFIELD) + __be32 reserved : 13; + __be32 flag_r : 1; + __be32 flag_d : 1; + __be32 flag_m : 1; + __be32 ecode : 8; + __be32 etype : 4; + __be32 layer : 4; +#else +#error "undefined byte order" +#endif +}; + +/* + * Terminate Hdr bits & fields + */ +enum { + TERM_MASK_LAYER = cpu_to_be32(0xf0000000), + TERM_MASK_ETYPE = cpu_to_be32(0x0f000000), + TERM_MASK_ECODE = cpu_to_be32(0x00ff0000), + TERM_FLAG_M = cpu_to_be32(0x00008000), + TERM_FLAG_D = cpu_to_be32(0x00004000), + TERM_FLAG_R = cpu_to_be32(0x00002000), + TERM_MASK_RESVD = cpu_to_be32(0x00001fff) +}; + +static inline u8 __rdmap_term_layer(struct iwarp_terminate *term) +{ + return term->layer; +} + +static inline void __rdmap_term_set_layer(struct iwarp_terminate *term, + u8 layer) +{ + term->layer = layer & 0xf; +} + 
+static inline u8 __rdmap_term_etype(struct iwarp_terminate *term) +{ + return term->etype; +} + +static inline void __rdmap_term_set_etype(struct iwarp_terminate *term, + u8 etype) +{ + term->etype = etype & 0xf; +} + +static inline u8 __rdmap_term_ecode(struct iwarp_terminate *term) +{ + return term->ecode; +} + +static inline void __rdmap_term_set_ecode(struct iwarp_terminate *term, + u8 ecode) +{ + term->ecode = ecode; +} + +/* + * Common portion of iWARP headers (MPA, DDP, RDMAP) + * for an FPDU carrying an untagged DDP segment + */ +struct iwarp_ctrl_untagged { + struct iwarp_ctrl ctrl; + __be32 rsvd; + __be32 ddp_qn; + __be32 ddp_msn; + __be32 ddp_mo; +}; + +/* + * Common portion of iWARP headers (MPA, DDP, RDMAP) + * for an FPDU carrying a tagged DDP segment + */ +struct iwarp_ctrl_tagged { + struct iwarp_ctrl ctrl; + __be32 ddp_stag; + __be64 ddp_to; +}; + +union iwarp_hdr { + struct iwarp_ctrl ctrl; + struct iwarp_ctrl_untagged c_untagged; + struct iwarp_ctrl_tagged c_tagged; + struct iwarp_rdma_write rwrite; + struct iwarp_rdma_rreq rreq; + struct iwarp_rdma_rresp rresp; + struct iwarp_terminate terminate; + struct iwarp_send send; + struct iwarp_send_inv send_inv; +}; + +enum term_elayer { + TERM_ERROR_LAYER_RDMAP = 0x00, + TERM_ERROR_LAYER_DDP = 0x01, + TERM_ERROR_LAYER_LLP = 0x02 /* eg., MPA */ +}; + +enum ddp_etype { + DDP_ETYPE_CATASTROPHIC = 0x0, + DDP_ETYPE_TAGGED_BUF = 0x1, + DDP_ETYPE_UNTAGGED_BUF = 0x2, + DDP_ETYPE_RSVD = 0x3 +}; + +enum ddp_ecode { + /* unspecified, set to zero */ + DDP_ECODE_CATASTROPHIC = 0x00, + /* Tagged Buffer Errors */ + DDP_ECODE_T_INVALID_STAG = 0x00, + DDP_ECODE_T_BASE_BOUNDS = 0x01, + DDP_ECODE_T_STAG_NOT_ASSOC = 0x02, + DDP_ECODE_T_TO_WRAP = 0x03, + DDP_ECODE_T_VERSION = 0x04, + /* Untagged Buffer Errors */ + DDP_ECODE_UT_INVALID_QN = 0x01, + DDP_ECODE_UT_INVALID_MSN_NOBUF = 0x02, + DDP_ECODE_UT_INVALID_MSN_RANGE = 0x03, + DDP_ECODE_UT_INVALID_MO = 0x04, + DDP_ECODE_UT_MSG_TOOLONG = 0x05, + DDP_ECODE_UT_VERSION = 0x06 +}; + +enum rdmap_untagged_qn { + RDMAP_UNTAGGED_QN_SEND = 0, + RDMAP_UNTAGGED_QN_RDMA_READ = 1, + RDMAP_UNTAGGED_QN_TERMINATE = 2, + RDMAP_UNTAGGED_QN_COUNT = 3 +}; + +enum rdmap_etype { + RDMAP_ETYPE_CATASTROPHIC = 0x0, + RDMAP_ETYPE_REMOTE_PROTECTION = 0x1, + RDMAP_ETYPE_REMOTE_OPERATION = 0x2 +}; + +enum rdmap_ecode { + RDMAP_ECODE_INVALID_STAG = 0x00, + RDMAP_ECODE_BASE_BOUNDS = 0x01, + RDMAP_ECODE_ACCESS_RIGHTS = 0x02, + RDMAP_ECODE_STAG_NOT_ASSOC = 0x03, + RDMAP_ECODE_TO_WRAP = 0x04, + RDMAP_ECODE_VERSION = 0x05, + RDMAP_ECODE_OPCODE = 0x06, + RDMAP_ECODE_CATASTROPHIC_STREAM = 0x07, + RDMAP_ECODE_CATASTROPHIC_GLOBAL = 0x08, + RDMAP_ECODE_CANNOT_INVALIDATE = 0x09, + RDMAP_ECODE_UNSPECIFIED = 0xff +}; + +enum llp_ecode { + LLP_ECODE_TCP_STREAM_LOST = 0x01, /* How to transfer this ?? 
*/ + LLP_ECODE_RECEIVED_CRC = 0x02, + LLP_ECODE_FPDU_START = 0x03, + LLP_ECODE_INVALID_REQ_RESP = 0x04, + + /* Errors for Enhanced Connection Establishment only */ + LLP_ECODE_LOCAL_CATASTROPHIC = 0x05, + LLP_ECODE_INSUFFICIENT_IRD = 0x06, + LLP_ECODE_NO_MATCHING_RTR = 0x07 +}; + +enum llp_etype { LLP_ETYPE_MPA = 0x00 }; + +enum rdma_opcode { + RDMAP_RDMA_WRITE = 0x0, + RDMAP_RDMA_READ_REQ = 0x1, + RDMAP_RDMA_READ_RESP = 0x2, + RDMAP_SEND = 0x3, + RDMAP_SEND_INVAL = 0x4, + RDMAP_SEND_SE = 0x5, + RDMAP_SEND_SE_INVAL = 0x6, + RDMAP_TERMINATE = 0x7, + RDMAP_NOT_SUPPORTED = RDMAP_TERMINATE + 1 +}; + +#endif From a531975279f3e5dd7323da09077ec848067bb313 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 20 Jun 2019 18:21:24 +0200 Subject: [PATCH 120/194] rdma/siw: main include file Broken up commit to add the Soft iWarp RDMA driver. Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw.h | 745 ++++++++++++++++++++++++++++++++ 1 file changed, 745 insertions(+) create mode 100644 drivers/infiniband/sw/siw/siw.h diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h new file mode 100644 index 000000000000..03fd7b2f595f --- /dev/null +++ b/drivers/infiniband/sw/siw/siw.h @@ -0,0 +1,745 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#ifndef _SIW_H +#define _SIW_H + +#include +#include +#include +#include +#include +#include + +#include +#include "iwarp.h" + +#define SIW_VENDOR_ID 0x626d74 /* ascii 'bmt' for now */ +#define SIW_VENDORT_PART_ID 0 +#define SIW_MAX_QP (1024 * 100) +#define SIW_MAX_QP_WR (1024 * 32) +#define SIW_MAX_ORD_QP 128 +#define SIW_MAX_IRD_QP 128 +#define SIW_MAX_SGE_PBL 256 /* max num sge's for PBL */ +#define SIW_MAX_SGE_RD 1 /* iwarp limitation. we could relax */ +#define SIW_MAX_CQ (1024 * 100) +#define SIW_MAX_CQE (SIW_MAX_QP_WR * 100) +#define SIW_MAX_MR (SIW_MAX_QP * 10) +#define SIW_MAX_PD SIW_MAX_QP +#define SIW_MAX_MW 0 /* to be set if MW's are supported */ +#define SIW_MAX_FMR SIW_MAX_MR +#define SIW_MAX_SRQ SIW_MAX_QP +#define SIW_MAX_SRQ_WR (SIW_MAX_QP_WR * 10) +#define SIW_MAX_CONTEXT SIW_MAX_PD + +/* Min number of bytes for using zero copy transmit */ +#define SENDPAGE_THRESH PAGE_SIZE + +/* Maximum number of frames which can be send in one SQ processing */ +#define SQ_USER_MAXBURST 100 + +/* Maximum number of consecutive IRQ elements which get served + * if SQ has pending work. Prevents starving local SQ processing + * by serving peer Read Requests. + */ +#define SIW_IRQ_MAXBURST_SQ_ACTIVE 4 + +struct siw_dev_cap { + int max_qp; + int max_qp_wr; + int max_ord; /* max. outbound read queue depth */ + int max_ird; /* max. 
inbound read queue depth */ + int max_sge; + int max_sge_rd; + int max_cq; + int max_cqe; + int max_mr; + int max_pd; + int max_mw; + int max_fmr; + int max_srq; + int max_srq_wr; + int max_srq_sge; +}; + +struct siw_pd { + struct ib_pd base_pd; +}; + +struct siw_device { + struct ib_device base_dev; + struct net_device *netdev; + struct siw_dev_cap attrs; + + u32 vendor_part_id; + int numa_node; + + /* physical port state (only one port per device) */ + enum ib_port_state state; + + spinlock_t lock; + + struct xarray qp_xa; + struct xarray mem_xa; + + struct list_head cep_list; + struct list_head qp_list; + + /* active objects statistics to enforce limits */ + atomic_t num_qp; + atomic_t num_cq; + atomic_t num_pd; + atomic_t num_mr; + atomic_t num_srq; + atomic_t num_ctx; + + struct work_struct netdev_down; +}; + +struct siw_uobj { + void *addr; + u32 size; +}; + +struct siw_ucontext { + struct ib_ucontext base_ucontext; + struct siw_device *sdev; + + /* xarray of user mappable objects */ + struct xarray xa; + u32 uobj_nextkey; +}; + +/* + * The RDMA core does not define LOCAL_READ access, which is always + * enabled implictely. + */ +#define IWARP_ACCESS_MASK \ + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | \ + IB_ACCESS_REMOTE_READ) + +/* + * siw presentation of user memory registered as source + * or target of RDMA operations. + */ + +struct siw_page_chunk { + struct page **plist; +}; + +struct siw_umem { + struct siw_page_chunk *page_chunk; + int num_pages; + bool writable; + u64 fp_addr; /* First page base address */ + struct mm_struct *owning_mm; +}; + +struct siw_pble { + u64 addr; /* Address of assigned user buffer */ + u64 size; /* Size of this entry */ + u64 pbl_off; /* Total offset from start of PBL */ +}; + +struct siw_pbl { + unsigned int num_buf; + unsigned int max_buf; + struct siw_pble pbe[1]; +}; + +struct siw_mr; + +/* + * Generic memory representation for registered siw memory. + * Memory lookup always via higher 24 bit of STag (STag index). 
+ */ +struct siw_mem { + struct siw_device *sdev; + struct kref ref; + u64 va; /* VA of memory */ + u64 len; /* length of the memory buffer in bytes */ + u32 stag; /* iWarp memory access steering tag */ + u8 stag_valid; /* VALID or INVALID */ + u8 is_pbl; /* PBL or user space mem */ + u8 is_mw; /* Memory Region or Memory Window */ + enum ib_access_flags perms; /* local/remote READ & WRITE */ + union { + struct siw_umem *umem; + struct siw_pbl *pbl; + void *mem_obj; + }; + struct ib_pd *pd; +}; + +struct siw_mr { + struct ib_mr base_mr; + struct siw_mem *mem; + struct rcu_head rcu; +}; + +/* + * Error codes for local or remote + * access to registered memory + */ +enum siw_access_state { + E_ACCESS_OK, + E_STAG_INVALID, + E_BASE_BOUNDS, + E_ACCESS_PERM, + E_PD_MISMATCH +}; + +enum siw_wr_state { + SIW_WR_IDLE, + SIW_WR_QUEUED, /* processing has not started yet */ + SIW_WR_INPROGRESS /* initiated processing of the WR */ +}; + +/* The WQE currently being processed (RX or TX) */ +struct siw_wqe { + /* Copy of applications SQE or RQE */ + union { + struct siw_sqe sqe; + struct siw_rqe rqe; + }; + struct siw_mem *mem[SIW_MAX_SGE]; /* per sge's resolved mem */ + enum siw_wr_state wr_status; + enum siw_wc_status wc_status; + u32 bytes; /* total bytes to process */ + u32 processed; /* bytes processed */ +}; + +struct siw_cq { + struct ib_cq base_cq; + spinlock_t lock; + u64 *notify; + struct siw_cqe *queue; + u32 cq_put; + u32 cq_get; + u32 num_cqe; + bool kernel_verbs; + u32 xa_cq_index; /* mmap information for CQE array */ + u32 id; /* For debugging only */ +}; + +enum siw_qp_state { + SIW_QP_STATE_IDLE, + SIW_QP_STATE_RTR, + SIW_QP_STATE_RTS, + SIW_QP_STATE_CLOSING, + SIW_QP_STATE_TERMINATE, + SIW_QP_STATE_ERROR, + SIW_QP_STATE_COUNT +}; + +enum siw_qp_flags { + SIW_RDMA_BIND_ENABLED = (1 << 0), + SIW_RDMA_WRITE_ENABLED = (1 << 1), + SIW_RDMA_READ_ENABLED = (1 << 2), + SIW_SIGNAL_ALL_WR = (1 << 3), + SIW_MPA_CRC = (1 << 4), + SIW_QP_IN_DESTROY = (1 << 5) +}; + +enum siw_qp_attr_mask { + SIW_QP_ATTR_STATE = (1 << 0), + SIW_QP_ATTR_ACCESS_FLAGS = (1 << 1), + SIW_QP_ATTR_LLP_HANDLE = (1 << 2), + SIW_QP_ATTR_ORD = (1 << 3), + SIW_QP_ATTR_IRD = (1 << 4), + SIW_QP_ATTR_SQ_SIZE = (1 << 5), + SIW_QP_ATTR_RQ_SIZE = (1 << 6), + SIW_QP_ATTR_MPA = (1 << 7) +}; + +struct siw_srq { + struct ib_srq base_srq; + spinlock_t lock; + u32 max_sge; + u32 limit; /* low watermark for async event */ + struct siw_rqe *recvq; + u32 rq_put; + u32 rq_get; + u32 num_rqe; /* max # of wqe's allowed */ + u32 xa_srq_index; /* mmap information for SRQ array */ + char armed; /* inform user if limit hit */ + char kernel_verbs; /* '1' if kernel client */ +}; + +struct siw_qp_attrs { + enum siw_qp_state state; + u32 sq_size; + u32 rq_size; + u32 orq_size; + u32 irq_size; + u32 sq_max_sges; + u32 rq_max_sges; + enum siw_qp_flags flags; + + struct socket *sk; +}; + +enum siw_tx_ctx { + SIW_SEND_HDR, /* start or continue sending HDR */ + SIW_SEND_DATA, /* start or continue sending DDP payload */ + SIW_SEND_TRAILER, /* start or continue sending TRAILER */ + SIW_SEND_SHORT_FPDU/* send whole FPDU hdr|data|trailer at once */ +}; + +enum siw_rx_state { + SIW_GET_HDR, /* await new hdr or within hdr */ + SIW_GET_DATA_START, /* start of inbound DDP payload */ + SIW_GET_DATA_MORE, /* continuation of (misaligned) DDP payload */ + SIW_GET_TRAILER/* await new trailer or within trailer */ +}; + +struct siw_rx_stream { + struct sk_buff *skb; + int skb_new; /* pending unread bytes in skb */ + int skb_offset; /* offset in skb */ + int skb_copied; /* 
processed bytes in skb */ + + union iwarp_hdr hdr; + struct mpa_trailer trailer; + + enum siw_rx_state state; + + /* + * For each FPDU, main RX loop runs through 3 stages: + * Receiving protocol headers, placing DDP payload and receiving + * trailer information (CRC + possibly padding). + * Next two variables keep state on receive status of the + * current FPDU part (hdr, data, trailer). + */ + int fpdu_part_rcvd; /* bytes in pkt part copied */ + int fpdu_part_rem; /* bytes in pkt part not seen */ + + /* + * Next expected DDP MSN for each QN + + * expected steering tag + + * expected DDP tagget offset (all HBO) + */ + u32 ddp_msn[RDMAP_UNTAGGED_QN_COUNT]; + u32 ddp_stag; + u64 ddp_to; + u32 inval_stag; /* Stag to be invalidated */ + + struct shash_desc *mpa_crc_hd; + u8 rx_suspend : 1; + u8 pad : 2; /* # of pad bytes expected */ + u8 rdmap_op : 4; /* opcode of current frame */ +}; + +struct siw_rx_fpdu { + /* + * Local destination memory of inbound RDMA operation. + * Valid, according to wqe->wr_status + */ + struct siw_wqe wqe_active; + + unsigned int pbl_idx; /* Index into current PBL */ + unsigned int sge_idx; /* current sge in rx */ + unsigned int sge_off; /* already rcvd in curr. sge */ + + char first_ddp_seg; /* this is the first DDP seg */ + char more_ddp_segs; /* more DDP segs expected */ + u8 prev_rdmap_op : 4; /* opcode of prev frame */ +}; + +/* + * Shorthands for short packets w/o payload + * to be transmitted more efficient. + */ +struct siw_send_pkt { + struct iwarp_send send; + __be32 crc; +}; + +struct siw_write_pkt { + struct iwarp_rdma_write write; + __be32 crc; +}; + +struct siw_rreq_pkt { + struct iwarp_rdma_rreq rreq; + __be32 crc; +}; + +struct siw_rresp_pkt { + struct iwarp_rdma_rresp rresp; + __be32 crc; +}; + +struct siw_iwarp_tx { + union { + union iwarp_hdr hdr; + + /* Generic part of FPDU header */ + struct iwarp_ctrl ctrl; + struct iwarp_ctrl_untagged c_untagged; + struct iwarp_ctrl_tagged c_tagged; + + /* FPDU headers */ + struct iwarp_rdma_write rwrite; + struct iwarp_rdma_rreq rreq; + struct iwarp_rdma_rresp rresp; + struct iwarp_terminate terminate; + struct iwarp_send send; + struct iwarp_send_inv send_inv; + + /* complete short FPDUs */ + struct siw_send_pkt send_pkt; + struct siw_write_pkt write_pkt; + struct siw_rreq_pkt rreq_pkt; + struct siw_rresp_pkt rresp_pkt; + } pkt; + + struct mpa_trailer trailer; + /* DDP MSN for untagged messages */ + u32 ddp_msn[RDMAP_UNTAGGED_QN_COUNT]; + + enum siw_tx_ctx state; + u16 ctrl_len; /* ddp+rdmap hdr */ + u16 ctrl_sent; + int burst; + int bytes_unsent; /* ddp payload bytes */ + + struct shash_desc *mpa_crc_hd; + + u8 do_crc : 1; /* do crc for segment */ + u8 use_sendpage : 1; /* send w/o copy */ + u8 tx_suspend : 1; /* stop sending DDP segs. */ + u8 pad : 2; /* # pad in current fpdu */ + u8 orq_fence : 1; /* ORQ full or Send fenced */ + u8 in_syscall : 1; /* TX out of user context */ + u8 zcopy_tx : 1; /* Use TCP_SENDPAGE if possible */ + u8 gso_seg_limit; /* Maximum segments for GSO, 0 = unbound */ + + u16 fpdu_len; /* len of FPDU to tx */ + unsigned int tcp_seglen; /* remaining tcp seg space */ + + struct siw_wqe wqe_active; + + int pbl_idx; /* Index into current PBL */ + int sge_idx; /* current sge in tx */ + u32 sge_off; /* already sent in curr. 
sge */ +}; + +struct siw_qp { + struct siw_device *sdev; + struct ib_qp *ib_qp; + struct kref ref; + u32 qp_num; + struct list_head devq; + int tx_cpu; + bool kernel_verbs; + struct siw_qp_attrs attrs; + + struct siw_cep *cep; + struct rw_semaphore state_lock; + + struct ib_pd *pd; + struct siw_cq *scq; + struct siw_cq *rcq; + struct siw_srq *srq; + + struct siw_iwarp_tx tx_ctx; /* Transmit context */ + spinlock_t sq_lock; + struct siw_sqe *sendq; /* send queue element array */ + uint32_t sq_get; /* consumer index into sq array */ + uint32_t sq_put; /* kernel prod. index into sq array */ + struct llist_node tx_list; + + struct siw_sqe *orq; /* outbound read queue element array */ + spinlock_t orq_lock; + uint32_t orq_get; /* consumer index into orq array */ + uint32_t orq_put; /* shared producer index for ORQ */ + + struct siw_rx_stream rx_stream; + struct siw_rx_fpdu *rx_fpdu; + struct siw_rx_fpdu rx_tagged; + struct siw_rx_fpdu rx_untagged; + spinlock_t rq_lock; + struct siw_rqe *recvq; /* recv queue element array */ + uint32_t rq_get; /* consumer index into rq array */ + uint32_t rq_put; /* kernel prod. index into rq array */ + + struct siw_sqe *irq; /* inbound read queue element array */ + uint32_t irq_get; /* consumer index into irq array */ + uint32_t irq_put; /* producer index into irq array */ + int irq_burst; + + struct { /* information to be carried in TERMINATE pkt, if valid */ + u8 valid; + u8 in_tx; + u8 layer : 4, etype : 4; + u8 ecode; + } term_info; + u32 xa_sq_index; /* mmap information for SQE array */ + u32 xa_rq_index; /* mmap information for RQE array */ + struct rcu_head rcu; +}; + +struct siw_base_qp { + struct ib_qp base_qp; + struct siw_qp *qp; +}; + +/* helper macros */ +#define rx_qp(rx) container_of(rx, struct siw_qp, rx_stream) +#define tx_qp(tx) container_of(tx, struct siw_qp, tx_ctx) +#define tx_wqe(qp) (&(qp)->tx_ctx.wqe_active) +#define rx_wqe(rctx) (&(rctx)->wqe_active) +#define rx_mem(rctx) ((rctx)->wqe_active.mem[0]) +#define tx_type(wqe) ((wqe)->sqe.opcode) +#define rx_type(wqe) ((wqe)->rqe.opcode) +#define tx_flags(wqe) ((wqe)->sqe.flags) + +struct iwarp_msg_info { + int hdr_len; + struct iwarp_ctrl ctrl; + int (*rx_data)(struct siw_qp *qp); +}; + +/* Global siw parameters. 
Currently set in siw_main.c */ +extern const bool zcopy_tx; +extern const bool try_gso; +extern const bool loopback_enabled; +extern const bool mpa_crc_required; +extern const bool mpa_crc_strict; +extern const bool siw_tcp_nagle; +extern u_char mpa_version; +extern const bool peer_to_peer; +extern struct task_struct *siw_tx_thread[]; + +extern struct crypto_shash *siw_crypto_shash; +extern struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1]; + +/* QP general functions */ +int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attr, + enum siw_qp_attr_mask mask); +int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl); +void siw_qp_llp_close(struct siw_qp *qp); +void siw_qp_cm_drop(struct siw_qp *qp, int schedule); +void siw_send_terminate(struct siw_qp *qp); + +void siw_qp_get_ref(struct ib_qp *qp); +void siw_qp_put_ref(struct ib_qp *qp); +int siw_qp_add(struct siw_device *sdev, struct siw_qp *qp); +void siw_free_qp(struct kref *ref); + +void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer, + u8 etype, u8 ecode, int in_tx); +enum ddp_ecode siw_tagged_error(enum siw_access_state state); +enum rdmap_ecode siw_rdmap_error(enum siw_access_state state); + +void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe); +int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes, + enum siw_wc_status status); +int siw_rqe_complete(struct siw_qp *qp, struct siw_rqe *rqe, u32 bytes, + u32 inval_stag, enum siw_wc_status status); +void siw_qp_llp_data_ready(struct sock *sk); +void siw_qp_llp_write_space(struct sock *sk); + +/* QP TX path functions */ +int siw_run_sq(void *arg); +int siw_qp_sq_process(struct siw_qp *qp); +int siw_sq_start(struct siw_qp *qp); +int siw_activate_tx(struct siw_qp *qp); +void siw_stop_tx_thread(int nr_cpu); +int siw_get_tx_cpu(struct siw_device *sdev); +void siw_put_tx_cpu(int cpu); + +/* QP RX path functions */ +int siw_proc_send(struct siw_qp *qp); +int siw_proc_rreq(struct siw_qp *qp); +int siw_proc_rresp(struct siw_qp *qp); +int siw_proc_write(struct siw_qp *qp); +int siw_proc_terminate(struct siw_qp *qp); + +int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb, + unsigned int off, size_t len); + +static inline void set_rx_fpdu_context(struct siw_qp *qp, u8 opcode) +{ + if (opcode == RDMAP_RDMA_WRITE || opcode == RDMAP_RDMA_READ_RESP) + qp->rx_fpdu = &qp->rx_tagged; + else + qp->rx_fpdu = &qp->rx_untagged; + + qp->rx_stream.rdmap_op = opcode; +} + +static inline struct siw_ucontext *to_siw_ctx(struct ib_ucontext *base_ctx) +{ + return container_of(base_ctx, struct siw_ucontext, base_ucontext); +} + +static inline struct siw_base_qp *to_siw_base_qp(struct ib_qp *base_qp) +{ + return container_of(base_qp, struct siw_base_qp, base_qp); +} + +static inline struct siw_qp *to_siw_qp(struct ib_qp *base_qp) +{ + return to_siw_base_qp(base_qp)->qp; +} + +static inline struct siw_cq *to_siw_cq(struct ib_cq *base_cq) +{ + return container_of(base_cq, struct siw_cq, base_cq); +} + +static inline struct siw_srq *to_siw_srq(struct ib_srq *base_srq) +{ + return container_of(base_srq, struct siw_srq, base_srq); +} + +static inline struct siw_device *to_siw_dev(struct ib_device *base_dev) +{ + return container_of(base_dev, struct siw_device, base_dev); +} + +static inline struct siw_mr *to_siw_mr(struct ib_mr *base_mr) +{ + return container_of(base_mr, struct siw_mr, base_mr); +} + +static inline struct siw_qp *siw_qp_id2obj(struct siw_device *sdev, int id) +{ + struct siw_qp *qp; + + rcu_read_lock(); + qp = xa_load(&sdev->qp_xa, 
id); + if (likely(qp && kref_get_unless_zero(&qp->ref))) { + rcu_read_unlock(); + return qp; + } + rcu_read_unlock(); + return NULL; +} + +static inline u32 qp_id(struct siw_qp *qp) +{ + return qp->qp_num; +} + +static inline void siw_qp_get(struct siw_qp *qp) +{ + kref_get(&qp->ref); +} + +static inline void siw_qp_put(struct siw_qp *qp) +{ + kref_put(&qp->ref, siw_free_qp); +} + +static inline int siw_sq_empty(struct siw_qp *qp) +{ + struct siw_sqe *sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size]; + + return READ_ONCE(sqe->flags) == 0; +} + +static inline struct siw_sqe *sq_get_next(struct siw_qp *qp) +{ + struct siw_sqe *sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size]; + + if (READ_ONCE(sqe->flags) & SIW_WQE_VALID) + return sqe; + + return NULL; +} + +static inline struct siw_sqe *orq_get_current(struct siw_qp *qp) +{ + return &qp->orq[qp->orq_get % qp->attrs.orq_size]; +} + +static inline struct siw_sqe *orq_get_tail(struct siw_qp *qp) +{ + return &qp->orq[qp->orq_put % qp->attrs.orq_size]; +} + +static inline struct siw_sqe *orq_get_free(struct siw_qp *qp) +{ + struct siw_sqe *orq_e = orq_get_tail(qp); + + if (orq_e && READ_ONCE(orq_e->flags) == 0) + return orq_e; + + return NULL; +} + +static inline int siw_orq_empty(struct siw_qp *qp) +{ + return qp->orq[qp->orq_get % qp->attrs.orq_size].flags == 0 ? 1 : 0; +} + +static inline struct siw_sqe *irq_alloc_free(struct siw_qp *qp) +{ + struct siw_sqe *irq_e = &qp->irq[qp->irq_put % qp->attrs.irq_size]; + + if (READ_ONCE(irq_e->flags) == 0) { + qp->irq_put++; + return irq_e; + } + return NULL; +} + +static inline __wsum siw_csum_update(const void *buff, int len, __wsum sum) +{ + return (__force __wsum)crc32c((__force __u32)sum, buff, len); +} + +static inline __wsum siw_csum_combine(__wsum csum, __wsum csum2, int offset, + int len) +{ + return (__force __wsum)__crc32c_le_combine((__force __u32)csum, + (__force __u32)csum2, len); +} + +static inline void siw_crc_skb(struct siw_rx_stream *srx, unsigned int len) +{ + const struct skb_checksum_ops siw_cs_ops = { + .update = siw_csum_update, + .combine = siw_csum_combine, + }; + __wsum crc = *(u32 *)shash_desc_ctx(srx->mpa_crc_hd); + + crc = __skb_checksum(srx->skb, srx->skb_offset, len, crc, + &siw_cs_ops); + *(u32 *)shash_desc_ctx(srx->mpa_crc_hd) = crc; +} + +#define siw_dbg(ibdev, fmt, ...) \ + ibdev_dbg(ibdev, "%s: " fmt, __func__, ##__VA_ARGS__) + +#define siw_dbg_qp(qp, fmt, ...) \ + ibdev_dbg(&qp->sdev->base_dev, "QP[%u] %s: " fmt, qp_id(qp), __func__, \ + ##__VA_ARGS__) + +#define siw_dbg_cq(cq, fmt, ...) \ + ibdev_dbg(cq->base_cq.device, "CQ[%u] %s: " fmt, cq->id, __func__, \ + ##__VA_ARGS__) + +#define siw_dbg_pd(pd, fmt, ...) \ + ibdev_dbg(pd->device, "PD[%u] %s: " fmt, pd->res.id, __func__, \ + ##__VA_ARGS__) + +#define siw_dbg_mem(mem, fmt, ...) \ + ibdev_dbg(&mem->sdev->base_dev, \ + "MEM[0x%08x] %s: " fmt, mem->stag, __func__, ##__VA_ARGS__) + +#define siw_dbg_cep(cep, fmt, ...) \ + ibdev_dbg(&cep->sdev->base_dev, "CEP[0x%p] %s: " fmt, \ + cep, __func__, ##__VA_ARGS__) + +void siw_cq_flush(struct siw_cq *cq); +void siw_sq_flush(struct siw_qp *qp); +void siw_rq_flush(struct siw_qp *qp); +int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc); + +#endif From bdcf26bf9b3acb03c8f90387cfc6474fc8ac5521 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 20 Jun 2019 18:21:25 +0200 Subject: [PATCH 121/194] rdma/siw: network and RDMA core interface Broken up commit to add the Soft iWarp RDMA driver. 
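This part adds siw_main.c, which implements device creation and
registration with the RDMA core, TX thread management, netdev event
handling and the "siw" rdma link type. A siw device is instantiated
per net_device, for example via the iproute2 rdma tool (device and
interface names below are examples only):

  rdma link add siw0 type siw netdev eth0
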
Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_main.c | 687 +++++++++++++++++++++++++++ 1 file changed, 687 insertions(+) create mode 100644 drivers/infiniband/sw/siw/siw_main.c diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c new file mode 100644 index 000000000000..3f5f3d27ebe5 --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -0,0 +1,687 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "siw.h" +#include "siw_verbs.h" + +MODULE_AUTHOR("Bernard Metzler"); +MODULE_DESCRIPTION("Software iWARP Driver"); +MODULE_LICENSE("Dual BSD/GPL"); + +/* transmit from user buffer, if possible */ +const bool zcopy_tx = true; + +/* Restrict usage of GSO, if hardware peer iwarp is unable to process + * large packets. try_gso = true lets siw try to use local GSO, + * if peer agrees. Not using GSO severly limits siw maximum tx bandwidth. + */ +const bool try_gso; + +/* Attach siw also with loopback devices */ +const bool loopback_enabled = true; + +/* We try to negotiate CRC on, if true */ +const bool mpa_crc_required; + +/* MPA CRC on/off enforced */ +const bool mpa_crc_strict; + +/* Control TCP_NODELAY socket option */ +const bool siw_tcp_nagle; + +/* Select MPA version to be used during connection setup */ +u_char mpa_version = MPA_REVISION_2; + +/* Selects MPA P2P mode (additional handshake during connection + * setup, if true. + */ +const bool peer_to_peer; + +struct task_struct *siw_tx_thread[NR_CPUS]; +struct crypto_shash *siw_crypto_shash; + +static int siw_device_register(struct siw_device *sdev, const char *name) +{ + struct ib_device *base_dev = &sdev->base_dev; + static int dev_id = 1; + int rv; + + rv = ib_register_device(base_dev, name); + if (rv) { + pr_warn("siw: device registration error %d\n", rv); + return rv; + } + sdev->vendor_part_id = dev_id++; + + siw_dbg(base_dev, "HWaddr=%pM\n", sdev->netdev->dev_addr); + + return 0; +} + +static void siw_device_cleanup(struct ib_device *base_dev) +{ + struct siw_device *sdev = to_siw_dev(base_dev); + + xa_destroy(&sdev->qp_xa); + xa_destroy(&sdev->mem_xa); +} + +static int siw_create_tx_threads(void) +{ + int cpu, rv, assigned = 0; + + for_each_online_cpu(cpu) { + /* Skip HT cores */ + if (cpu % cpumask_weight(topology_sibling_cpumask(cpu))) + continue; + + siw_tx_thread[cpu] = + kthread_create(siw_run_sq, (unsigned long *)(long)cpu, + "siw_tx/%d", cpu); + if (IS_ERR(siw_tx_thread[cpu])) { + rv = PTR_ERR(siw_tx_thread[cpu]); + siw_tx_thread[cpu] = NULL; + pr_info("Creating TX thread for CPU %d failed", cpu); + continue; + } + kthread_bind(siw_tx_thread[cpu], cpu); + + wake_up_process(siw_tx_thread[cpu]); + assigned++; + } + return assigned; +} + +static int siw_dev_qualified(struct net_device *netdev) +{ + /* + * Additional hardware support can be added here + * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see + * for type identifiers. 
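+ * Currently Ethernet and IEEE 802 devices qualify, plus loopback
+ * devices if loopback_enabled is set.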
+ */ + if (netdev->type == ARPHRD_ETHER || netdev->type == ARPHRD_IEEE802 || + (netdev->type == ARPHRD_LOOPBACK && loopback_enabled)) + return 1; + + return 0; +} + +static DEFINE_PER_CPU(atomic_t, use_cnt = ATOMIC_INIT(0)); + +static struct { + struct cpumask **tx_valid_cpus; + int num_nodes; +} siw_cpu_info; + +static int siw_init_cpulist(void) +{ + int i, num_nodes = num_possible_nodes(); + + memset(siw_tx_thread, 0, sizeof(siw_tx_thread)); + + siw_cpu_info.num_nodes = num_nodes; + + siw_cpu_info.tx_valid_cpus = + kcalloc(num_nodes, sizeof(struct cpumask *), GFP_KERNEL); + if (!siw_cpu_info.tx_valid_cpus) { + siw_cpu_info.num_nodes = 0; + return -ENOMEM; + } + for (i = 0; i < siw_cpu_info.num_nodes; i++) { + siw_cpu_info.tx_valid_cpus[i] = + kzalloc(sizeof(struct cpumask), GFP_KERNEL); + if (!siw_cpu_info.tx_valid_cpus[i]) + goto out_err; + + cpumask_clear(siw_cpu_info.tx_valid_cpus[i]); + } + for_each_possible_cpu(i) + cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]); + + return 0; + +out_err: + siw_cpu_info.num_nodes = 0; + while (i) { + kfree(siw_cpu_info.tx_valid_cpus[i]); + siw_cpu_info.tx_valid_cpus[i--] = NULL; + } + kfree(siw_cpu_info.tx_valid_cpus); + siw_cpu_info.tx_valid_cpus = NULL; + + return -ENOMEM; +} + +static void siw_destroy_cpulist(void) +{ + int i = 0; + + while (i < siw_cpu_info.num_nodes) + kfree(siw_cpu_info.tx_valid_cpus[i++]); + + kfree(siw_cpu_info.tx_valid_cpus); +} + +/* + * Choose CPU with least number of active QP's from NUMA node of + * TX interface. + */ +int siw_get_tx_cpu(struct siw_device *sdev) +{ + const struct cpumask *tx_cpumask; + int i, num_cpus, cpu, min_use, node = sdev->numa_node, tx_cpu = -1; + + if (node < 0) + tx_cpumask = cpu_online_mask; + else + tx_cpumask = siw_cpu_info.tx_valid_cpus[node]; + + num_cpus = cpumask_weight(tx_cpumask); + if (!num_cpus) { + /* no CPU on this NUMA node */ + tx_cpumask = cpu_online_mask; + num_cpus = cpumask_weight(tx_cpumask); + } + if (!num_cpus) + goto out; + + cpu = cpumask_first(tx_cpumask); + + for (i = 0, min_use = SIW_MAX_QP; i < num_cpus; + i++, cpu = cpumask_next(cpu, tx_cpumask)) { + int usage; + + /* Skip any cores which have no TX thread */ + if (!siw_tx_thread[cpu]) + continue; + + usage = atomic_read(&per_cpu(use_cnt, cpu)); + if (usage <= min_use) { + tx_cpu = cpu; + min_use = usage; + } + } + siw_dbg(&sdev->base_dev, + "tx cpu %d, node %d, %d qp's\n", tx_cpu, node, min_use); + +out: + if (tx_cpu >= 0) + atomic_inc(&per_cpu(use_cnt, tx_cpu)); + else + pr_warn("siw: no tx cpu found\n"); + + return tx_cpu; +} + +void siw_put_tx_cpu(int cpu) +{ + atomic_dec(&per_cpu(use_cnt, cpu)); +} + +static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id) +{ + struct siw_qp *qp = siw_qp_id2obj(to_siw_dev(base_dev), id); + + if (qp) { + /* + * siw_qp_id2obj() increments object reference count + */ + siw_qp_put(qp); + return qp->ib_qp; + } + return NULL; +} + +static void siw_verbs_sq_flush(struct ib_qp *base_qp) +{ + struct siw_qp *qp = to_siw_qp(base_qp); + + down_write(&qp->state_lock); + siw_sq_flush(qp); + up_write(&qp->state_lock); +} + +static void siw_verbs_rq_flush(struct ib_qp *base_qp) +{ + struct siw_qp *qp = to_siw_qp(base_qp); + + down_write(&qp->state_lock); + siw_rq_flush(qp); + up_write(&qp->state_lock); +} + +static const struct ib_device_ops siw_device_ops = { + .owner = THIS_MODULE, + .uverbs_abi_ver = SIW_ABI_VERSION, + .driver_id = RDMA_DRIVER_SIW, + + .alloc_mr = siw_alloc_mr, + .alloc_pd = siw_alloc_pd, + .alloc_ucontext = siw_alloc_ucontext, + .create_cq 
= siw_create_cq, + .create_qp = siw_create_qp, + .create_srq = siw_create_srq, + .dealloc_driver = siw_device_cleanup, + .dealloc_pd = siw_dealloc_pd, + .dealloc_ucontext = siw_dealloc_ucontext, + .dereg_mr = siw_dereg_mr, + .destroy_cq = siw_destroy_cq, + .destroy_qp = siw_destroy_qp, + .destroy_srq = siw_destroy_srq, + .drain_rq = siw_verbs_rq_flush, + .drain_sq = siw_verbs_sq_flush, + .get_dma_mr = siw_get_dma_mr, + .get_port_immutable = siw_get_port_immutable, + .iw_accept = siw_accept, + .iw_add_ref = siw_qp_get_ref, + .iw_connect = siw_connect, + .iw_create_listen = siw_create_listen, + .iw_destroy_listen = siw_destroy_listen, + .iw_get_qp = siw_get_base_qp, + .iw_reject = siw_reject, + .iw_rem_ref = siw_qp_put_ref, + .map_mr_sg = siw_map_mr_sg, + .mmap = siw_mmap, + .modify_qp = siw_verbs_modify_qp, + .modify_srq = siw_modify_srq, + .poll_cq = siw_poll_cq, + .post_recv = siw_post_receive, + .post_send = siw_post_send, + .post_srq_recv = siw_post_srq_recv, + .query_device = siw_query_device, + .query_gid = siw_query_gid, + .query_pkey = siw_query_pkey, + .query_port = siw_query_port, + .query_qp = siw_query_qp, + .query_srq = siw_query_srq, + .req_notify_cq = siw_req_notify_cq, + .reg_user_mr = siw_reg_user_mr, + + INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq), + INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd), + INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq), + INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext), +}; + +static struct siw_device *siw_device_create(struct net_device *netdev) +{ + struct siw_device *sdev = NULL; + struct ib_device *base_dev; + struct device *parent = netdev->dev.parent; + int rv; + + if (!parent) { + /* + * The loopback device has no parent device, + * so it appears as a top-level device. To support + * loopback device connectivity, take this device + * as the parent device. Skip all other devices + * w/o parent device. 
+ */ + if (netdev->type != ARPHRD_LOOPBACK) { + pr_warn("siw: device %s error: no parent device\n", + netdev->name); + return NULL; + } + parent = &netdev->dev; + } + sdev = ib_alloc_device(siw_device, base_dev); + if (!sdev) + return NULL; + + base_dev = &sdev->base_dev; + + sdev->netdev = netdev; + + if (netdev->type != ARPHRD_LOOPBACK) { + memcpy(&base_dev->node_guid, netdev->dev_addr, 6); + } else { + /* + * The loopback device does not have a HW address, + * but connection mangagement lib expects gid != 0 + */ + size_t gidlen = min_t(size_t, strlen(base_dev->name), 6); + + memcpy(&base_dev->node_guid, base_dev->name, gidlen); + } + base_dev->uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ); + + base_dev->node_type = RDMA_NODE_RNIC; + memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON, + sizeof(SIW_NODE_DESC_COMMON)); + + /* + * Current model (one-to-one device association): + * One Softiwarp device per net_device or, equivalently, + * per physical port. + */ + base_dev->phys_port_cnt = 1; + base_dev->dev.parent = parent; + base_dev->dev.dma_ops = &dma_virt_ops; + base_dev->num_comp_vectors = num_possible_cpus(); + + ib_set_device_ops(base_dev, &siw_device_ops); + rv = ib_device_set_netdev(base_dev, netdev, 1); + if (rv) + goto error; + + memcpy(base_dev->iw_ifname, netdev->name, + sizeof(base_dev->iw_ifname)); + + /* Disable TCP port mapping */ + base_dev->iw_driver_flags = IW_F_NO_PORT_MAP, + + sdev->attrs.max_qp = SIW_MAX_QP; + sdev->attrs.max_qp_wr = SIW_MAX_QP_WR; + sdev->attrs.max_ord = SIW_MAX_ORD_QP; + sdev->attrs.max_ird = SIW_MAX_IRD_QP; + sdev->attrs.max_sge = SIW_MAX_SGE; + sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD; + sdev->attrs.max_cq = SIW_MAX_CQ; + sdev->attrs.max_cqe = SIW_MAX_CQE; + sdev->attrs.max_mr = SIW_MAX_MR; + sdev->attrs.max_pd = SIW_MAX_PD; + sdev->attrs.max_mw = SIW_MAX_MW; + sdev->attrs.max_fmr = SIW_MAX_FMR; + sdev->attrs.max_srq = SIW_MAX_SRQ; + sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR; + sdev->attrs.max_srq_sge = SIW_MAX_SGE; + + xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1); + xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1); + + INIT_LIST_HEAD(&sdev->cep_list); + INIT_LIST_HEAD(&sdev->qp_list); + + atomic_set(&sdev->num_ctx, 0); + atomic_set(&sdev->num_srq, 0); + atomic_set(&sdev->num_qp, 0); + atomic_set(&sdev->num_cq, 0); + atomic_set(&sdev->num_mr, 0); + atomic_set(&sdev->num_pd, 0); + + sdev->numa_node = dev_to_node(parent); + spin_lock_init(&sdev->lock); + + return sdev; +error: + ib_dealloc_device(base_dev); + + return NULL; +} + +/* + * Network link becomes unavailable. Mark all + * affected QP's accordingly. 
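+ * Runs from the netdev_down work item scheduled by
+ * siw_device_goes_down() and moves all QPs of the device into
+ * ERROR state.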
+ */ +static void siw_netdev_down(struct work_struct *work) +{ + struct siw_device *sdev = + container_of(work, struct siw_device, netdev_down); + + struct siw_qp_attrs qp_attrs; + struct list_head *pos, *tmp; + + memset(&qp_attrs, 0, sizeof(qp_attrs)); + qp_attrs.state = SIW_QP_STATE_ERROR; + + list_for_each_safe(pos, tmp, &sdev->qp_list) { + struct siw_qp *qp = list_entry(pos, struct siw_qp, devq); + + down_write(&qp->state_lock); + WARN_ON(siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE)); + up_write(&qp->state_lock); + } + ib_device_put(&sdev->base_dev); +} + +static void siw_device_goes_down(struct siw_device *sdev) +{ + if (ib_device_try_get(&sdev->base_dev)) { + INIT_WORK(&sdev->netdev_down, siw_netdev_down); + schedule_work(&sdev->netdev_down); + } +} + +static int siw_netdev_event(struct notifier_block *nb, unsigned long event, + void *arg) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(arg); + struct ib_device *base_dev; + struct siw_device *sdev; + + dev_dbg(&netdev->dev, "siw: event %lu\n", event); + + if (dev_net(netdev) != &init_net) + return NOTIFY_OK; + + base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); + if (!base_dev) + return NOTIFY_OK; + + sdev = to_siw_dev(base_dev); + + switch (event) { + case NETDEV_UP: + sdev->state = IB_PORT_ACTIVE; + siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE); + break; + + case NETDEV_GOING_DOWN: + siw_device_goes_down(sdev); + break; + + case NETDEV_DOWN: + sdev->state = IB_PORT_DOWN; + siw_port_event(sdev, 1, IB_EVENT_PORT_ERR); + break; + + case NETDEV_REGISTER: + /* + * Device registration now handled only by + * rdma netlink commands. So it shall be impossible + * to end up here with a valid siw device. + */ + siw_dbg(base_dev, "unexpected NETDEV_REGISTER event\n"); + break; + + case NETDEV_UNREGISTER: + ib_unregister_device_queued(&sdev->base_dev); + break; + + case NETDEV_CHANGEADDR: + siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE); + break; + /* + * Todo: Below netdev events are currently not handled. + */ + case NETDEV_CHANGEMTU: + case NETDEV_CHANGE: + break; + + default: + break; + } + ib_device_put(&sdev->base_dev); + + return NOTIFY_OK; +} + +static struct notifier_block siw_netdev_nb = { + .notifier_call = siw_netdev_event, +}; + +static int siw_newlink(const char *basedev_name, struct net_device *netdev) +{ + struct ib_device *base_dev; + struct siw_device *sdev = NULL; + int rv = -ENOMEM; + + if (!siw_dev_qualified(netdev)) + return -EINVAL; + + base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); + if (base_dev) { + ib_device_put(base_dev); + return -EEXIST; + } + sdev = siw_device_create(netdev); + if (sdev) { + dev_dbg(&netdev->dev, "siw: new device\n"); + + if (netif_running(netdev) && netif_carrier_ok(netdev)) + sdev->state = IB_PORT_ACTIVE; + else + sdev->state = IB_PORT_DOWN; + + rv = siw_device_register(sdev, basedev_name); + if (rv) + ib_dealloc_device(&sdev->base_dev); + } + return rv; +} + +static struct rdma_link_ops siw_link_ops = { + .type = "siw", + .newlink = siw_newlink, +}; + +/* + * siw_init_module - Initialize Softiwarp module and register with netdev + * subsystem. 
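+ * Also starts the TX threads, initializes connection management,
+ * allocates the CRC32c transform and registers the "siw" rdma
+ * link type.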
+ */ +static __init int siw_init_module(void) +{ + int rv; + int nr_cpu; + + if (SENDPAGE_THRESH < SIW_MAX_INLINE) { + pr_info("siw: sendpage threshold too small: %u\n", + (int)SENDPAGE_THRESH); + rv = -EINVAL; + goto out_error; + } + rv = siw_init_cpulist(); + if (rv) + goto out_error; + + rv = siw_cm_init(); + if (rv) + goto out_error; + + if (!siw_create_tx_threads()) { + pr_info("siw: Could not start any TX thread\n"); + goto out_error; + } + /* + * Locate CRC32 algorithm. If unsuccessful, fail + * loading siw only, if CRC is required. + */ + siw_crypto_shash = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(siw_crypto_shash)) { + pr_info("siw: Loading CRC32c failed: %ld\n", + PTR_ERR(siw_crypto_shash)); + siw_crypto_shash = NULL; + if (mpa_crc_required) { + rv = -EOPNOTSUPP; + goto out_error; + } + } + rv = register_netdevice_notifier(&siw_netdev_nb); + if (rv) + goto out_error; + + rdma_link_register(&siw_link_ops); + + pr_info("SoftiWARP attached\n"); + return 0; + +out_error: + for (nr_cpu = 0; nr_cpu < nr_cpu_ids; nr_cpu++) { + if (siw_tx_thread[nr_cpu]) { + siw_stop_tx_thread(nr_cpu); + siw_tx_thread[nr_cpu] = NULL; + } + } + if (siw_crypto_shash) + crypto_free_shash(siw_crypto_shash); + + pr_info("SoftIWARP attach failed. Error: %d\n", rv); + + siw_cm_exit(); + siw_destroy_cpulist(); + + return rv; +} + +static void __exit siw_exit_module(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + if (siw_tx_thread[cpu]) { + siw_stop_tx_thread(cpu); + siw_tx_thread[cpu] = NULL; + } + } + unregister_netdevice_notifier(&siw_netdev_nb); + rdma_link_unregister(&siw_link_ops); + ib_unregister_driver(RDMA_DRIVER_SIW); + + siw_cm_exit(); + + siw_destroy_cpulist(); + + if (siw_crypto_shash) + crypto_free_shash(siw_crypto_shash); + + pr_info("SoftiWARP detached\n"); +} + +module_init(siw_init_module); +module_exit(siw_exit_module); + +MODULE_ALIAS_RDMA_LINK("siw"); From 6c52fdc244b5ccc468006fd65a504d4ee33743c7 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 20 Jun 2019 18:21:26 +0200 Subject: [PATCH 122/194] rdma/siw: connection management Broken up commit to add the Soft iWarp RDMA driver. Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_cm.c | 2072 ++++++++++++++++++++++++++++ drivers/infiniband/sw/siw/siw_cm.h | 133 ++ 2 files changed, 2205 insertions(+) create mode 100644 drivers/infiniband/sw/siw/siw_cm.c create mode 100644 drivers/infiniband/sw/siw/siw_cm.h diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c new file mode 100644 index 000000000000..8e618cb7261f --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -0,0 +1,2072 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler */ +/* Fredy Neeser */ +/* Greg Joyce */ +/* Copyright (c) 2008-2019, IBM Corporation */ +/* Copyright (c) 2017, Open Grid Computing, Inc. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "siw.h" +#include "siw_cm.h" + +/* + * Set to any combination of + * MPA_V2_RDMA_NO_RTR, MPA_V2_RDMA_READ_RTR, MPA_V2_RDMA_WRITE_RTR + */ +static __be16 rtr_type = MPA_V2_RDMA_READ_RTR | MPA_V2_RDMA_WRITE_RTR; +static const bool relaxed_ird_negotiation = 1; + +static void siw_cm_llp_state_change(struct sock *s); +static void siw_cm_llp_data_ready(struct sock *s); +static void siw_cm_llp_write_space(struct sock *s); +static void siw_cm_llp_error_report(struct sock *s); +static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason, + int status); + +static void siw_sk_assign_cm_upcalls(struct sock *sk) +{ + write_lock_bh(&sk->sk_callback_lock); + sk->sk_state_change = siw_cm_llp_state_change; + sk->sk_data_ready = siw_cm_llp_data_ready; + sk->sk_write_space = siw_cm_llp_write_space; + sk->sk_error_report = siw_cm_llp_error_report; + write_unlock_bh(&sk->sk_callback_lock); +} + +static void siw_sk_save_upcalls(struct sock *sk) +{ + struct siw_cep *cep = sk_to_cep(sk); + + write_lock_bh(&sk->sk_callback_lock); + cep->sk_state_change = sk->sk_state_change; + cep->sk_data_ready = sk->sk_data_ready; + cep->sk_write_space = sk->sk_write_space; + cep->sk_error_report = sk->sk_error_report; + write_unlock_bh(&sk->sk_callback_lock); +} + +static void siw_sk_restore_upcalls(struct sock *sk, struct siw_cep *cep) +{ + sk->sk_state_change = cep->sk_state_change; + sk->sk_data_ready = cep->sk_data_ready; + sk->sk_write_space = cep->sk_write_space; + sk->sk_error_report = cep->sk_error_report; + sk->sk_user_data = NULL; +} + +static void siw_qp_socket_assoc(struct siw_cep *cep, struct siw_qp *qp) +{ + struct socket *s = cep->sock; + struct sock *sk = s->sk; + + write_lock_bh(&sk->sk_callback_lock); + + qp->attrs.sk = s; + sk->sk_data_ready = siw_qp_llp_data_ready; + sk->sk_write_space = siw_qp_llp_write_space; + + write_unlock_bh(&sk->sk_callback_lock); +} + +static void siw_socket_disassoc(struct socket *s) +{ + struct sock *sk = s->sk; + struct siw_cep *cep; + + if (sk) { + write_lock_bh(&sk->sk_callback_lock); + cep = sk_to_cep(sk); + if (cep) { + siw_sk_restore_upcalls(sk, cep); + siw_cep_put(cep); + } else { + pr_warn("siw: cannot restore sk callbacks: no ep\n"); + } + write_unlock_bh(&sk->sk_callback_lock); + } else { + pr_warn("siw: cannot restore sk callbacks: no sk\n"); + } +} + +static void siw_rtr_data_ready(struct sock *sk) +{ + struct siw_cep *cep; + struct siw_qp *qp = NULL; + read_descriptor_t rd_desc; + + read_lock(&sk->sk_callback_lock); + + cep = sk_to_cep(sk); + if (!cep) { + WARN(1, "No connection endpoint\n"); + goto out; + } + qp = sk_to_qp(sk); + + memset(&rd_desc, 0, sizeof(rd_desc)); + rd_desc.arg.data = qp; + rd_desc.count = 1; + + tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data); + /* + * Check if first frame was successfully processed. + * Signal connection full establishment if yes. + * Failed data processing would have already scheduled + * connection drop. 
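+ * (rx_suspend is left set if the connection is being dropped,
+ * see siw_qp_cm_drop().)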
+ */ + if (!qp->rx_stream.rx_suspend) + siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0); +out: + read_unlock(&sk->sk_callback_lock); + if (qp) + siw_qp_socket_assoc(cep, qp); +} + +static void siw_sk_assign_rtr_upcalls(struct siw_cep *cep) +{ + struct sock *sk = cep->sock->sk; + + write_lock_bh(&sk->sk_callback_lock); + sk->sk_data_ready = siw_rtr_data_ready; + sk->sk_write_space = siw_qp_llp_write_space; + write_unlock_bh(&sk->sk_callback_lock); +} + +static void siw_cep_socket_assoc(struct siw_cep *cep, struct socket *s) +{ + cep->sock = s; + siw_cep_get(cep); + s->sk->sk_user_data = cep; + + siw_sk_save_upcalls(s->sk); + siw_sk_assign_cm_upcalls(s->sk); +} + +static struct siw_cep *siw_cep_alloc(struct siw_device *sdev) +{ + struct siw_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL); + unsigned long flags; + + if (!cep) + return NULL; + + INIT_LIST_HEAD(&cep->listenq); + INIT_LIST_HEAD(&cep->devq); + INIT_LIST_HEAD(&cep->work_freelist); + + kref_init(&cep->ref); + cep->state = SIW_EPSTATE_IDLE; + init_waitqueue_head(&cep->waitq); + spin_lock_init(&cep->lock); + cep->sdev = sdev; + cep->enhanced_rdma_conn_est = false; + + spin_lock_irqsave(&sdev->lock, flags); + list_add_tail(&cep->devq, &sdev->cep_list); + spin_unlock_irqrestore(&sdev->lock, flags); + + siw_dbg_cep(cep, "new endpoint\n"); + return cep; +} + +static void siw_cm_free_work(struct siw_cep *cep) +{ + struct list_head *w, *tmp; + struct siw_cm_work *work; + + list_for_each_safe(w, tmp, &cep->work_freelist) { + work = list_entry(w, struct siw_cm_work, list); + list_del(&work->list); + kfree(work); + } +} + +static void siw_cancel_mpatimer(struct siw_cep *cep) +{ + spin_lock_bh(&cep->lock); + if (cep->mpa_timer) { + if (cancel_delayed_work(&cep->mpa_timer->work)) { + siw_cep_put(cep); + kfree(cep->mpa_timer); /* not needed again */ + } + cep->mpa_timer = NULL; + } + spin_unlock_bh(&cep->lock); +} + +static void siw_put_work(struct siw_cm_work *work) +{ + INIT_LIST_HEAD(&work->list); + spin_lock_bh(&work->cep->lock); + list_add(&work->list, &work->cep->work_freelist); + spin_unlock_bh(&work->cep->lock); +} + +static void siw_cep_set_inuse(struct siw_cep *cep) +{ + unsigned long flags; + int rv; +retry: + spin_lock_irqsave(&cep->lock, flags); + + if (cep->in_use) { + spin_unlock_irqrestore(&cep->lock, flags); + rv = wait_event_interruptible(cep->waitq, !cep->in_use); + if (signal_pending(current)) + flush_signals(current); + goto retry; + } else { + cep->in_use = 1; + spin_unlock_irqrestore(&cep->lock, flags); + } +} + +static void siw_cep_set_free(struct siw_cep *cep) +{ + unsigned long flags; + + spin_lock_irqsave(&cep->lock, flags); + cep->in_use = 0; + spin_unlock_irqrestore(&cep->lock, flags); + + wake_up(&cep->waitq); +} + +static void __siw_cep_dealloc(struct kref *ref) +{ + struct siw_cep *cep = container_of(ref, struct siw_cep, ref); + struct siw_device *sdev = cep->sdev; + unsigned long flags; + + WARN_ON(cep->listen_cep); + + /* kfree(NULL) is safe */ + kfree(cep->mpa.pdata); + spin_lock_bh(&cep->lock); + if (!list_empty(&cep->work_freelist)) + siw_cm_free_work(cep); + spin_unlock_bh(&cep->lock); + + spin_lock_irqsave(&sdev->lock, flags); + list_del(&cep->devq); + spin_unlock_irqrestore(&sdev->lock, flags); + + siw_dbg_cep(cep, "free endpoint\n"); + kfree(cep); +} + +static struct siw_cm_work *siw_get_work(struct siw_cep *cep) +{ + struct siw_cm_work *work = NULL; + + spin_lock_bh(&cep->lock); + if (!list_empty(&cep->work_freelist)) { + work = list_entry(cep->work_freelist.next, struct siw_cm_work, + list); + 
list_del_init(&work->list); + } + spin_unlock_bh(&cep->lock); + return work; +} + +static int siw_cm_alloc_work(struct siw_cep *cep, int num) +{ + struct siw_cm_work *work; + + while (num--) { + work = kmalloc(sizeof(*work), GFP_KERNEL); + if (!work) { + if (!(list_empty(&cep->work_freelist))) + siw_cm_free_work(cep); + return -ENOMEM; + } + work->cep = cep; + INIT_LIST_HEAD(&work->list); + list_add(&work->list, &cep->work_freelist); + } + return 0; +} + +/* + * siw_cm_upcall() + * + * Upcall to IWCM to inform about async connection events + */ +static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason, + int status) +{ + struct iw_cm_event event; + struct iw_cm_id *id; + + memset(&event, 0, sizeof(event)); + event.status = status; + event.event = reason; + + if (reason == IW_CM_EVENT_CONNECT_REQUEST) { + event.provider_data = cep; + id = cep->listen_cep->cm_id; + } else { + id = cep->cm_id; + } + /* Signal IRD and ORD */ + if (reason == IW_CM_EVENT_ESTABLISHED || + reason == IW_CM_EVENT_CONNECT_REPLY) { + /* Signal negotiated IRD/ORD values we will use */ + event.ird = cep->ird; + event.ord = cep->ord; + } else if (reason == IW_CM_EVENT_CONNECT_REQUEST) { + event.ird = cep->ord; + event.ord = cep->ird; + } + /* Signal private data and address information */ + if (reason == IW_CM_EVENT_CONNECT_REQUEST || + reason == IW_CM_EVENT_CONNECT_REPLY) { + u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len); + + if (pd_len) { + /* + * hand over MPA private data + */ + event.private_data_len = pd_len; + event.private_data = cep->mpa.pdata; + + /* Hide MPA V2 IRD/ORD control */ + if (cep->enhanced_rdma_conn_est) { + event.private_data_len -= + sizeof(struct mpa_v2_data); + event.private_data += + sizeof(struct mpa_v2_data); + } + } + getname_local(cep->sock, &event.local_addr); + getname_peer(cep->sock, &event.remote_addr); + } + siw_dbg_cep(cep, "[QP %u]: id 0x%p, reason=%d, status=%d\n", + cep->qp ? qp_id(cep->qp) : -1, id, reason, status); + + return id->event_handler(id, &event); +} + +/* + * siw_qp_cm_drop() + * + * Drops established LLP connection if present and not already + * scheduled for dropping. Called from user context, SQ workqueue + * or receive IRQ. Caller signals if socket can be immediately + * closed (basically, if not in IRQ). 
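+ * With schedule set, closing is deferred to the CM work queue
+ * (SIW_CM_WORK_CLOSE_LLP); otherwise the socket is released
+ * immediately.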
+ */ +void siw_qp_cm_drop(struct siw_qp *qp, int schedule) +{ + struct siw_cep *cep = qp->cep; + + qp->rx_stream.rx_suspend = 1; + qp->tx_ctx.tx_suspend = 1; + + if (!qp->cep) + return; + + if (schedule) { + siw_cm_queue_work(cep, SIW_CM_WORK_CLOSE_LLP); + } else { + siw_cep_set_inuse(cep); + + if (cep->state == SIW_EPSTATE_CLOSED) { + siw_dbg_cep(cep, "already closed\n"); + goto out; + } + siw_dbg_cep(cep, "immediate close, state %d\n", cep->state); + + if (qp->term_info.valid) + siw_send_terminate(qp); + + if (cep->cm_id) { + switch (cep->state) { + case SIW_EPSTATE_AWAIT_MPAREP: + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -EINVAL); + break; + + case SIW_EPSTATE_RDMA_MODE: + siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); + break; + + case SIW_EPSTATE_IDLE: + case SIW_EPSTATE_LISTENING: + case SIW_EPSTATE_CONNECTING: + case SIW_EPSTATE_AWAIT_MPAREQ: + case SIW_EPSTATE_RECVD_MPAREQ: + case SIW_EPSTATE_CLOSED: + default: + break; + } + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + siw_cep_put(cep); + } + cep->state = SIW_EPSTATE_CLOSED; + + if (cep->sock) { + siw_socket_disassoc(cep->sock); + /* + * Immediately close socket + */ + sock_release(cep->sock); + cep->sock = NULL; + } + if (cep->qp) { + cep->qp = NULL; + siw_qp_put(qp); + } +out: + siw_cep_set_free(cep); + } +} + +void siw_cep_put(struct siw_cep *cep) +{ + WARN_ON(kref_read(&cep->ref) < 1); + kref_put(&cep->ref, __siw_cep_dealloc); +} + +void siw_cep_get(struct siw_cep *cep) +{ + kref_get(&cep->ref); +} + +/* + * Expects params->pd_len in host byte order + */ +static int siw_send_mpareqrep(struct siw_cep *cep, const void *pdata, u8 pd_len) +{ + struct socket *s = cep->sock; + struct mpa_rr *rr = &cep->mpa.hdr; + struct kvec iov[3]; + struct msghdr msg; + int rv; + int iovec_num = 0; + int mpa_len; + + memset(&msg, 0, sizeof(msg)); + + iov[iovec_num].iov_base = rr; + iov[iovec_num].iov_len = sizeof(*rr); + mpa_len = sizeof(*rr); + + if (cep->enhanced_rdma_conn_est) { + iovec_num++; + iov[iovec_num].iov_base = &cep->mpa.v2_ctrl; + iov[iovec_num].iov_len = sizeof(cep->mpa.v2_ctrl); + mpa_len += sizeof(cep->mpa.v2_ctrl); + } + if (pd_len) { + iovec_num++; + iov[iovec_num].iov_base = (char *)pdata; + iov[iovec_num].iov_len = pd_len; + mpa_len += pd_len; + } + if (cep->enhanced_rdma_conn_est) + pd_len += sizeof(cep->mpa.v2_ctrl); + + rr->params.pd_len = cpu_to_be16(pd_len); + + rv = kernel_sendmsg(s, &msg, iov, iovec_num + 1, mpa_len); + + return rv < 0 ? rv : 0; +} + +/* + * Receive MPA Request/Reply header. + * + * Returns 0 if complete MPA Request/Reply header including + * eventual private data was received. Returns -EAGAIN if + * header was partially received or negative error code otherwise. + * + * Context: May be called in process context only + */ +static int siw_recv_mpa_rr(struct siw_cep *cep) +{ + struct mpa_rr *hdr = &cep->mpa.hdr; + struct socket *s = cep->sock; + u16 pd_len; + int rcvd, to_rcv; + + if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) { + rcvd = ksock_recv(s, (char *)hdr + cep->mpa.bytes_rcvd, + sizeof(struct mpa_rr) - cep->mpa.bytes_rcvd, + 0); + if (rcvd <= 0) + return -ECONNABORTED; + + cep->mpa.bytes_rcvd += rcvd; + + if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) + return -EAGAIN; + + if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA) + return -EPROTO; + } + pd_len = be16_to_cpu(hdr->params.pd_len); + + /* + * At least the MPA Request/Reply header (frame not including + * private data) has been received. + * Receive (or continue receiving) any private data. 
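+ * A zero pd_len is verified below by probing the socket for
+ * unexpected extra bytes.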
+ */ + to_rcv = pd_len - (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr)); + + if (!to_rcv) { + /* + * We must have hdr->params.pd_len == 0 and thus received a + * complete MPA Request/Reply frame. + * Check against peer protocol violation. + */ + u32 word; + + rcvd = ksock_recv(s, (char *)&word, sizeof(word), MSG_DONTWAIT); + if (rcvd == -EAGAIN) + return 0; + + if (rcvd == 0) { + siw_dbg_cep(cep, "peer EOF\n"); + return -EPIPE; + } + if (rcvd < 0) { + siw_dbg_cep(cep, "error: %d\n", rcvd); + return rcvd; + } + siw_dbg_cep(cep, "peer sent extra data: %d\n", rcvd); + + return -EPROTO; + } + + /* + * At this point, we must have hdr->params.pd_len != 0. + * A private data buffer gets allocated if hdr->params.pd_len != 0. + */ + if (!cep->mpa.pdata) { + cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL); + if (!cep->mpa.pdata) + return -ENOMEM; + } + rcvd = ksock_recv( + s, cep->mpa.pdata + cep->mpa.bytes_rcvd - sizeof(struct mpa_rr), + to_rcv + 4, MSG_DONTWAIT); + + if (rcvd < 0) + return rcvd; + + if (rcvd > to_rcv) + return -EPROTO; + + cep->mpa.bytes_rcvd += rcvd; + + if (to_rcv == rcvd) { + siw_dbg_cep(cep, "%d bytes private data received\n", pd_len); + return 0; + } + return -EAGAIN; +} + +/* + * siw_proc_mpareq() + * + * Read MPA Request from socket and signal new connection to IWCM + * if success. Caller must hold lock on corresponding listening CEP. + */ +static int siw_proc_mpareq(struct siw_cep *cep) +{ + struct mpa_rr *req; + int version, rv; + u16 pd_len; + + rv = siw_recv_mpa_rr(cep); + if (rv) + return rv; + + req = &cep->mpa.hdr; + + version = __mpa_rr_revision(req->params.bits); + pd_len = be16_to_cpu(req->params.pd_len); + + if (version > MPA_REVISION_2) + /* allow for 0, 1, and 2 only */ + return -EPROTO; + + if (memcmp(req->key, MPA_KEY_REQ, 16)) + return -EPROTO; + + /* Prepare for sending MPA reply */ + memcpy(req->key, MPA_KEY_REP, 16); + + if (version == MPA_REVISION_2 && + (req->params.bits & MPA_RR_FLAG_ENHANCED)) { + /* + * MPA version 2 must signal IRD/ORD values and P2P mode + * in private data if header flag MPA_RR_FLAG_ENHANCED + * is set. + */ + if (pd_len < sizeof(struct mpa_v2_data)) + goto reject_conn; + + cep->enhanced_rdma_conn_est = true; + } + + /* MPA Markers: currently not supported. Marker TX to be added. */ + if (req->params.bits & MPA_RR_FLAG_MARKERS) + goto reject_conn; + + if (req->params.bits & MPA_RR_FLAG_CRC) { + /* + * RFC 5044, page 27: CRC MUST be used if peer requests it. + * siw specific: 'mpa_crc_strict' parameter to reject + * connection with CRC if local CRC off enforced by + * 'mpa_crc_strict' module parameter. + */ + if (!mpa_crc_required && mpa_crc_strict) + goto reject_conn; + + /* Enable CRC if requested by module parameter */ + if (mpa_crc_required) + req->params.bits |= MPA_RR_FLAG_CRC; + } + if (cep->enhanced_rdma_conn_est) { + struct mpa_v2_data *v2 = (struct mpa_v2_data *)cep->mpa.pdata; + + /* + * Peer requested ORD becomes requested local IRD, + * peer requested IRD becomes requested local ORD. + * IRD and ORD get limited by global maximum values. + */ + cep->ord = ntohs(v2->ird) & MPA_IRD_ORD_MASK; + cep->ord = min(cep->ord, SIW_MAX_ORD_QP); + cep->ird = ntohs(v2->ord) & MPA_IRD_ORD_MASK; + cep->ird = min(cep->ird, SIW_MAX_IRD_QP); + + /* May get overwritten by locally negotiated values */ + cep->mpa.v2_ctrl.ird = htons(cep->ird); + cep->mpa.v2_ctrl.ord = htons(cep->ord); + + /* + * Support for peer sent zero length Write or Read to + * let local side enter RTS. Writes are preferred. 
+ * Sends would require pre-posting a Receive and are + * not supported. + * Propose zero length Write if none of Read and Write + * is indicated. + */ + if (v2->ird & MPA_V2_PEER_TO_PEER) { + cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER; + + if (v2->ord & MPA_V2_RDMA_WRITE_RTR) + cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR; + else if (v2->ord & MPA_V2_RDMA_READ_RTR) + cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_READ_RTR; + else + cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR; + } + } + + cep->state = SIW_EPSTATE_RECVD_MPAREQ; + + /* Keep reference until IWCM accepts/rejects */ + siw_cep_get(cep); + rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0); + if (rv) + siw_cep_put(cep); + + return rv; + +reject_conn: + siw_dbg_cep(cep, "reject: crc %d:%d:%d, m %d:%d\n", + req->params.bits & MPA_RR_FLAG_CRC ? 1 : 0, + mpa_crc_required, mpa_crc_strict, + req->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0); + + req->params.bits &= ~MPA_RR_FLAG_MARKERS; + req->params.bits |= MPA_RR_FLAG_REJECT; + + if (!mpa_crc_required && mpa_crc_strict) + req->params.bits &= ~MPA_RR_FLAG_CRC; + + if (pd_len) + kfree(cep->mpa.pdata); + + cep->mpa.pdata = NULL; + + siw_send_mpareqrep(cep, NULL, 0); + + return -EOPNOTSUPP; +} + +static int siw_proc_mpareply(struct siw_cep *cep) +{ + struct siw_qp_attrs qp_attrs; + enum siw_qp_attr_mask qp_attr_mask; + struct siw_qp *qp = cep->qp; + struct mpa_rr *rep; + int rv; + u16 rep_ord; + u16 rep_ird; + bool ird_insufficient = false; + enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR; + + rv = siw_recv_mpa_rr(cep); + if (rv != -EAGAIN) + siw_cancel_mpatimer(cep); + if (rv) + goto out_err; + + rep = &cep->mpa.hdr; + + if (__mpa_rr_revision(rep->params.bits) > MPA_REVISION_2) { + /* allow for 0, 1, and 2 only */ + rv = -EPROTO; + goto out_err; + } + if (memcmp(rep->key, MPA_KEY_REP, 16)) { + siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA, + LLP_ECODE_INVALID_REQ_RESP, 0); + siw_send_terminate(qp); + rv = -EPROTO; + goto out_err; + } + if (rep->params.bits & MPA_RR_FLAG_REJECT) { + siw_dbg_cep(cep, "got mpa reject\n"); + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET); + + return -ECONNRESET; + } + if (try_gso && rep->params.bits & MPA_RR_FLAG_GSO_EXP) { + siw_dbg_cep(cep, "peer allows GSO on TX\n"); + qp->tx_ctx.gso_seg_limit = 0; + } + if ((rep->params.bits & MPA_RR_FLAG_MARKERS) || + (mpa_crc_required && !(rep->params.bits & MPA_RR_FLAG_CRC)) || + (mpa_crc_strict && !mpa_crc_required && + (rep->params.bits & MPA_RR_FLAG_CRC))) { + siw_dbg_cep(cep, "reply unsupp: crc %d:%d:%d, m %d:%d\n", + rep->params.bits & MPA_RR_FLAG_CRC ? 1 : 0, + mpa_crc_required, mpa_crc_strict, + rep->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0); + + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED); + + return -EINVAL; + } + if (cep->enhanced_rdma_conn_est) { + struct mpa_v2_data *v2; + + if (__mpa_rr_revision(rep->params.bits) < MPA_REVISION_2 || + !(rep->params.bits & MPA_RR_FLAG_ENHANCED)) { + /* + * Protocol failure: The responder MUST reply with + * MPA version 2 and MUST set MPA_RR_FLAG_ENHANCED. + */ + siw_dbg_cep(cep, "mpa reply error: vers %d, enhcd %d\n", + __mpa_rr_revision(rep->params.bits), + rep->params.bits & MPA_RR_FLAG_ENHANCED ? 
+ 1 : + 0); + + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -ECONNRESET); + return -EINVAL; + } + v2 = (struct mpa_v2_data *)cep->mpa.pdata; + rep_ird = ntohs(v2->ird) & MPA_IRD_ORD_MASK; + rep_ord = ntohs(v2->ord) & MPA_IRD_ORD_MASK; + + if (cep->ird < rep_ord && + (relaxed_ird_negotiation == false || + rep_ord > cep->sdev->attrs.max_ird)) { + siw_dbg_cep(cep, "ird %d, rep_ord %d, max_ord %d\n", + cep->ird, rep_ord, + cep->sdev->attrs.max_ord); + ird_insufficient = true; + } + if (cep->ord > rep_ird && relaxed_ird_negotiation == false) { + siw_dbg_cep(cep, "ord %d, rep_ird %d\n", cep->ord, + rep_ird); + ird_insufficient = true; + } + /* + * Always report negotiated peer values to user, + * even if IRD/ORD negotiation failed + */ + cep->ird = rep_ord; + cep->ord = rep_ird; + + if (ird_insufficient) { + /* + * If the initiator IRD is insuffient for the + * responder ORD, send a TERM. + */ + siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, + LLP_ETYPE_MPA, + LLP_ECODE_INSUFFICIENT_IRD, 0); + siw_send_terminate(qp); + rv = -ENOMEM; + goto out_err; + } + if (cep->mpa.v2_ctrl_req.ird & MPA_V2_PEER_TO_PEER) + mpa_p2p_mode = + cep->mpa.v2_ctrl_req.ord & + (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR); + + /* + * Check if we requested P2P mode, and if peer agrees + */ + if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) { + if ((mpa_p2p_mode & v2->ord) == 0) { + /* + * We requested RTR mode(s), but the peer + * did not pick any mode we support. + */ + siw_dbg_cep(cep, + "rtr mode: req %2x, got %2x\n", + mpa_p2p_mode, + v2->ord & (MPA_V2_RDMA_WRITE_RTR | + MPA_V2_RDMA_READ_RTR)); + + siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, + LLP_ETYPE_MPA, + LLP_ECODE_NO_MATCHING_RTR, + 0); + siw_send_terminate(qp); + rv = -EPROTO; + goto out_err; + } + mpa_p2p_mode = v2->ord & (MPA_V2_RDMA_WRITE_RTR | + MPA_V2_RDMA_READ_RTR); + } + } + memset(&qp_attrs, 0, sizeof(qp_attrs)); + + if (rep->params.bits & MPA_RR_FLAG_CRC) + qp_attrs.flags = SIW_MPA_CRC; + + qp_attrs.irq_size = cep->ird; + qp_attrs.orq_size = cep->ord; + qp_attrs.sk = cep->sock; + qp_attrs.state = SIW_QP_STATE_RTS; + + qp_attr_mask = SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE | + SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA; + + /* Move socket RX/TX under QP control */ + down_write(&qp->state_lock); + if (qp->attrs.state > SIW_QP_STATE_RTR) { + rv = -EINVAL; + up_write(&qp->state_lock); + goto out_err; + } + rv = siw_qp_modify(qp, &qp_attrs, qp_attr_mask); + + siw_qp_socket_assoc(cep, qp); + + up_write(&qp->state_lock); + + /* Send extra RDMA frame to trigger peer RTS if negotiated */ + if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) { + rv = siw_qp_mpa_rts(qp, mpa_p2p_mode); + if (rv) + goto out_err; + } + if (!rv) { + rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0); + if (!rv) + cep->state = SIW_EPSTATE_RDMA_MODE; + + return 0; + } + +out_err: + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); + + return rv; +} + +/* + * siw_accept_newconn - accept an incoming pending connection + * + */ +static void siw_accept_newconn(struct siw_cep *cep) +{ + struct socket *s = cep->sock; + struct socket *new_s = NULL; + struct siw_cep *new_cep = NULL; + int rv = 0; /* debug only. should disappear */ + + if (cep->state != SIW_EPSTATE_LISTENING) + goto error; + + new_cep = siw_cep_alloc(cep->sdev); + if (!new_cep) + goto error; + + /* + * 4: Allocate a sufficient number of work elements + * to allow concurrent handling of local + peer close + * events, MPA header processing + MPA timeout. 
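+ * The queued work items are consumed by siw_cm_work_handler().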
+ */ + if (siw_cm_alloc_work(new_cep, 4) != 0) + goto error; + + /* + * Copy saved socket callbacks from listening CEP + * and assign new socket with new CEP + */ + new_cep->sk_state_change = cep->sk_state_change; + new_cep->sk_data_ready = cep->sk_data_ready; + new_cep->sk_write_space = cep->sk_write_space; + new_cep->sk_error_report = cep->sk_error_report; + + rv = kernel_accept(s, &new_s, O_NONBLOCK); + if (rv != 0) { + /* + * Connection already aborted by peer..? + */ + siw_dbg_cep(cep, "kernel_accept() error: %d\n", rv); + goto error; + } + new_cep->sock = new_s; + siw_cep_get(new_cep); + new_s->sk->sk_user_data = new_cep; + + siw_dbg_cep(cep, "listen socket 0x%p, new 0x%p\n", s, new_s); + + if (siw_tcp_nagle == false) { + int val = 1; + + rv = kernel_setsockopt(new_s, SOL_TCP, TCP_NODELAY, + (char *)&val, sizeof(val)); + if (rv) { + siw_dbg_cep(cep, "setsockopt NODELAY error: %d\n", rv); + goto error; + } + } + new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ; + + rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT); + if (rv) + goto error; + /* + * See siw_proc_mpareq() etc. for the use of new_cep->listen_cep. + */ + new_cep->listen_cep = cep; + siw_cep_get(cep); + + if (atomic_read(&new_s->sk->sk_rmem_alloc)) { + /* + * MPA REQ already queued + */ + siw_dbg_cep(cep, "immediate mpa request\n"); + + siw_cep_set_inuse(new_cep); + rv = siw_proc_mpareq(new_cep); + siw_cep_set_free(new_cep); + + if (rv != -EAGAIN) { + siw_cep_put(cep); + new_cep->listen_cep = NULL; + if (rv) + goto error; + } + } + return; + +error: + if (new_cep) + siw_cep_put(new_cep); + + if (new_s) { + siw_socket_disassoc(new_s); + sock_release(new_s); + new_cep->sock = NULL; + } + siw_dbg_cep(cep, "error %d\n", rv); +} + +static void siw_cm_work_handler(struct work_struct *w) +{ + struct siw_cm_work *work; + struct siw_cep *cep; + int release_cep = 0, rv = 0; + + work = container_of(w, struct siw_cm_work, work.work); + cep = work->cep; + + siw_dbg_cep(cep, "[QP %u]: work type: %d, state %d\n", + cep->qp ? qp_id(cep->qp) : -1, work->type, cep->state); + + siw_cep_set_inuse(cep); + + switch (work->type) { + case SIW_CM_WORK_ACCEPT: + siw_accept_newconn(cep); + break; + + case SIW_CM_WORK_READ_MPAHDR: + if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { + if (cep->listen_cep) { + siw_cep_set_inuse(cep->listen_cep); + + if (cep->listen_cep->state == + SIW_EPSTATE_LISTENING) + rv = siw_proc_mpareq(cep); + else + rv = -EFAULT; + + siw_cep_set_free(cep->listen_cep); + + if (rv != -EAGAIN) { + siw_cep_put(cep->listen_cep); + cep->listen_cep = NULL; + if (rv) + siw_cep_put(cep); + } + } + } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { + rv = siw_proc_mpareply(cep); + } else { + /* + * CEP already moved out of MPA handshake. + * any connection management already done. + * silently ignore the mpa packet. 
+ */ + if (cep->state == SIW_EPSTATE_RDMA_MODE) { + cep->sock->sk->sk_data_ready(cep->sock->sk); + siw_dbg_cep(cep, "already in RDMA mode"); + } else { + siw_dbg_cep(cep, "out of state: %d\n", + cep->state); + } + } + if (rv && rv != EAGAIN) + release_cep = 1; + break; + + case SIW_CM_WORK_CLOSE_LLP: + /* + * QP scheduled LLP close + */ + if (cep->qp && cep->qp->term_info.valid) + siw_send_terminate(cep->qp); + + if (cep->cm_id) + siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); + + release_cep = 1; + break; + + case SIW_CM_WORK_PEER_CLOSE: + if (cep->cm_id) { + if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { + /* + * MPA reply not received, but connection drop + */ + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -ECONNRESET); + } else if (cep->state == SIW_EPSTATE_RDMA_MODE) { + /* + * NOTE: IW_CM_EVENT_DISCONNECT is given just + * to transition IWCM into CLOSING. + */ + siw_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0); + siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); + } + /* + * for other states there is no connection + * known to the IWCM. + */ + } else { + if (cep->state == SIW_EPSTATE_RECVD_MPAREQ) { + /* + * Wait for the ulp/CM to call accept/reject + */ + siw_dbg_cep(cep, + "mpa req recvd, wait for ULP\n"); + } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { + /* + * Socket close before MPA request received. + */ + siw_dbg_cep(cep, "no mpareq: drop listener\n"); + siw_cep_put(cep->listen_cep); + cep->listen_cep = NULL; + } + } + release_cep = 1; + break; + + case SIW_CM_WORK_MPATIMEOUT: + cep->mpa_timer = NULL; + + if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { + /* + * MPA request timed out: + * Hide any partially received private data and signal + * timeout + */ + cep->mpa.hdr.params.pd_len = 0; + + if (cep->cm_id) + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -ETIMEDOUT); + release_cep = 1; + + } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { + /* + * No MPA request received after peer TCP stream setup. + */ + if (cep->listen_cep) { + siw_cep_put(cep->listen_cep); + cep->listen_cep = NULL; + } + release_cep = 1; + } + break; + + default: + WARN(1, "Undefined CM work type: %d\n", work->type); + } + if (release_cep) { + siw_dbg_cep(cep, + "release: timer=%s, QP[%u], id 0x%p\n", + cep->mpa_timer ? "y" : "n", + cep->qp ? 
qp_id(cep->qp) : -1, cep->cm_id); + + siw_cancel_mpatimer(cep); + + cep->state = SIW_EPSTATE_CLOSED; + + if (cep->qp) { + struct siw_qp *qp = cep->qp; + /* + * Serialize a potential race with application + * closing the QP and calling siw_qp_cm_drop() + */ + siw_qp_get(qp); + siw_cep_set_free(cep); + + siw_qp_llp_close(qp); + siw_qp_put(qp); + + siw_cep_set_inuse(cep); + cep->qp = NULL; + siw_qp_put(qp); + } + if (cep->sock) { + siw_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + } + if (cep->cm_id) { + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + siw_cep_put(cep); + } + } + siw_cep_set_free(cep); + siw_put_work(work); + siw_cep_put(cep); +} + +static struct workqueue_struct *siw_cm_wq; + +int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type) +{ + struct siw_cm_work *work = siw_get_work(cep); + unsigned long delay = 0; + + if (!work) { + siw_dbg_cep(cep, "failed with no work available\n"); + return -ENOMEM; + } + work->type = type; + work->cep = cep; + + siw_cep_get(cep); + + INIT_DELAYED_WORK(&work->work, siw_cm_work_handler); + + if (type == SIW_CM_WORK_MPATIMEOUT) { + cep->mpa_timer = work; + + if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) + delay = MPAREQ_TIMEOUT; + else + delay = MPAREP_TIMEOUT; + } + siw_dbg_cep(cep, "[QP %u]: work type: %d, work 0x%p, timeout %lu\n", + cep->qp ? qp_id(cep->qp) : -1, type, work, delay); + + queue_delayed_work(siw_cm_wq, &work->work, delay); + + return 0; +} + +static void siw_cm_llp_data_ready(struct sock *sk) +{ + struct siw_cep *cep; + + read_lock(&sk->sk_callback_lock); + + cep = sk_to_cep(sk); + if (!cep) { + WARN_ON(1); + goto out; + } + siw_dbg_cep(cep, "state: %d\n", cep->state); + + switch (cep->state) { + case SIW_EPSTATE_RDMA_MODE: + /* fall through */ + case SIW_EPSTATE_LISTENING: + break; + + case SIW_EPSTATE_AWAIT_MPAREQ: + /* fall through */ + case SIW_EPSTATE_AWAIT_MPAREP: + siw_cm_queue_work(cep, SIW_CM_WORK_READ_MPAHDR); + break; + + default: + siw_dbg_cep(cep, "unexpected data, state %d\n", cep->state); + break; + } +out: + read_unlock(&sk->sk_callback_lock); +} + +static void siw_cm_llp_write_space(struct sock *sk) +{ + struct siw_cep *cep = sk_to_cep(sk); + + if (cep) + siw_dbg_cep(cep, "state: %d\n", cep->state); +} + +static void siw_cm_llp_error_report(struct sock *sk) +{ + struct siw_cep *cep = sk_to_cep(sk); + + if (cep) { + siw_dbg_cep(cep, "error %d, socket state: %d, cep state: %d\n", + sk->sk_err, sk->sk_state, cep->state); + cep->sk_error_report(sk); + } +} + +static void siw_cm_llp_state_change(struct sock *sk) +{ + struct siw_cep *cep; + struct socket *s; + void (*orig_state_change)(struct sock *s); + + read_lock(&sk->sk_callback_lock); + + cep = sk_to_cep(sk); + if (!cep) { + /* endpoint already disassociated */ + read_unlock(&sk->sk_callback_lock); + return; + } + orig_state_change = cep->sk_state_change; + + s = sk->sk_socket; + + siw_dbg_cep(cep, "state: %d\n", cep->state); + + switch (sk->sk_state) { + case TCP_ESTABLISHED: + /* + * handle accepting socket as special case where only + * new connection is possible + */ + siw_cm_queue_work(cep, SIW_CM_WORK_ACCEPT); + break; + + case TCP_CLOSE: + case TCP_CLOSE_WAIT: + if (cep->qp) + cep->qp->tx_ctx.tx_suspend = 1; + siw_cm_queue_work(cep, SIW_CM_WORK_PEER_CLOSE); + break; + + default: + siw_dbg_cep(cep, "unexpected socket state %d\n", sk->sk_state); + } + read_unlock(&sk->sk_callback_lock); + orig_state_change(sk); +} + +static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, + struct sockaddr 
*raddr) +{ + int rv, flags = 0, s_val = 1; + size_t size = laddr->sa_family == AF_INET ? + sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); + + /* + * Make address available again asap. + */ + rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val, + sizeof(s_val)); + if (rv < 0) + return rv; + + rv = s->ops->bind(s, laddr, size); + if (rv < 0) + return rv; + + rv = s->ops->connect(s, raddr, size, flags); + + return rv < 0 ? rv : 0; +} + +int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params) +{ + struct siw_device *sdev = to_siw_dev(id->device); + struct siw_qp *qp; + struct siw_cep *cep = NULL; + struct socket *s = NULL; + struct sockaddr *laddr = (struct sockaddr *)&id->local_addr, + *raddr = (struct sockaddr *)&id->remote_addr; + bool p2p_mode = peer_to_peer, v4 = true; + u16 pd_len = params->private_data_len; + int version = mpa_version, rv; + + if (pd_len > MPA_MAX_PRIVDATA) + return -EINVAL; + + if (params->ird > sdev->attrs.max_ird || + params->ord > sdev->attrs.max_ord) + return -ENOMEM; + + if (laddr->sa_family == AF_INET6) + v4 = false; + else if (laddr->sa_family != AF_INET) + return -EAFNOSUPPORT; + + /* + * Respect any iwarp port mapping: Use mapped remote address + * if valid. Local address must not be mapped, since siw + * uses kernel TCP stack. + */ + if ((v4 && to_sockaddr_in(id->remote_addr).sin_port != 0) || + to_sockaddr_in6(id->remote_addr).sin6_port != 0) + raddr = (struct sockaddr *)&id->m_remote_addr; + + qp = siw_qp_id2obj(sdev, params->qpn); + if (!qp) { + WARN(1, "[QP %u] does not exist\n", params->qpn); + rv = -EINVAL; + goto error; + } + if (v4) + siw_dbg_qp(qp, + "id 0x%p, pd_len %d, laddr %pI4 %d, raddr %pI4 %d\n", + id, pd_len, + &((struct sockaddr_in *)(laddr))->sin_addr, + ntohs(((struct sockaddr_in *)(laddr))->sin_port), + &((struct sockaddr_in *)(raddr))->sin_addr, + ntohs(((struct sockaddr_in *)(raddr))->sin_port)); + else + siw_dbg_qp(qp, + "id 0x%p, pd_len %d, laddr %pI6 %d, raddr %pI6 %d\n", + id, pd_len, + &((struct sockaddr_in6 *)(laddr))->sin6_addr, + ntohs(((struct sockaddr_in6 *)(laddr))->sin6_port), + &((struct sockaddr_in6 *)(raddr))->sin6_addr, + ntohs(((struct sockaddr_in6 *)(raddr))->sin6_port)); + + rv = sock_create(v4 ? AF_INET : AF_INET6, SOCK_STREAM, IPPROTO_TCP, &s); + if (rv < 0) + goto error; + + /* + * NOTE: For simplification, connect() is called in blocking + * mode. Might be reconsidered for async connection setup at + * TCP level. + */ + rv = kernel_bindconnect(s, laddr, raddr); + if (rv != 0) { + siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv); + goto error; + } + if (siw_tcp_nagle == false) { + int val = 1; + + rv = kernel_setsockopt(s, SOL_TCP, TCP_NODELAY, (char *)&val, + sizeof(val)); + if (rv) { + siw_dbg_qp(qp, "setsockopt NODELAY error: %d\n", rv); + goto error; + } + } + cep = siw_cep_alloc(sdev); + if (!cep) { + rv = -ENOMEM; + goto error; + } + siw_cep_set_inuse(cep); + + /* Associate QP with CEP */ + siw_cep_get(cep); + qp->cep = cep; + + /* siw_qp_get(qp) already done by QP lookup */ + cep->qp = qp; + + id->add_ref(id); + cep->cm_id = id; + + /* + * 4: Allocate a sufficient number of work elements + * to allow concurrent handling of local + peer close + * events, MPA header processing + MPA timeout. 
+ */ + rv = siw_cm_alloc_work(cep, 4); + if (rv != 0) { + rv = -ENOMEM; + goto error; + } + cep->ird = params->ird; + cep->ord = params->ord; + + if (p2p_mode && cep->ord == 0) + cep->ord = 1; + + cep->state = SIW_EPSTATE_CONNECTING; + + /* + * Associate CEP with socket + */ + siw_cep_socket_assoc(cep, s); + + cep->state = SIW_EPSTATE_AWAIT_MPAREP; + + /* + * Set MPA Request bits: CRC if required, no MPA Markers, + * MPA Rev. according to module parameter 'mpa_version', Key 'Request'. + */ + cep->mpa.hdr.params.bits = 0; + if (version > MPA_REVISION_2) { + pr_warn("Setting MPA version to %u\n", MPA_REVISION_2); + version = MPA_REVISION_2; + /* Adjust also module parameter */ + mpa_version = MPA_REVISION_2; + } + __mpa_rr_set_revision(&cep->mpa.hdr.params.bits, version); + + if (try_gso) + cep->mpa.hdr.params.bits |= MPA_RR_FLAG_GSO_EXP; + + if (mpa_crc_required) + cep->mpa.hdr.params.bits |= MPA_RR_FLAG_CRC; + + /* + * If MPA version == 2: + * o Include ORD and IRD. + * o Indicate peer-to-peer mode, if required by module + * parameter 'peer_to_peer'. + */ + if (version == MPA_REVISION_2) { + cep->enhanced_rdma_conn_est = true; + cep->mpa.hdr.params.bits |= MPA_RR_FLAG_ENHANCED; + + cep->mpa.v2_ctrl.ird = htons(cep->ird); + cep->mpa.v2_ctrl.ord = htons(cep->ord); + + if (p2p_mode) { + cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER; + cep->mpa.v2_ctrl.ord |= rtr_type; + } + /* Remember own P2P mode requested */ + cep->mpa.v2_ctrl_req.ird = cep->mpa.v2_ctrl.ird; + cep->mpa.v2_ctrl_req.ord = cep->mpa.v2_ctrl.ord; + } + memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, 16); + + rv = siw_send_mpareqrep(cep, params->private_data, pd_len); + /* + * Reset private data. + */ + cep->mpa.hdr.params.pd_len = 0; + + if (rv >= 0) { + rv = siw_cm_queue_work(cep, SIW_CM_WORK_MPATIMEOUT); + if (!rv) { + siw_dbg_cep(cep, "id 0x%p, [QP %u]: exit\n", id, + qp_id(qp)); + siw_cep_set_free(cep); + return 0; + } + } +error: + siw_dbg_qp(qp, "failed: %d\n", rv); + + if (cep) { + siw_socket_disassoc(s); + sock_release(s); + cep->sock = NULL; + + cep->qp = NULL; + + cep->cm_id = NULL; + id->rem_ref(id); + siw_cep_put(cep); + + qp->cep = NULL; + siw_cep_put(cep); + + cep->state = SIW_EPSTATE_CLOSED; + + siw_cep_set_free(cep); + + siw_cep_put(cep); + + } else if (s) { + sock_release(s); + } + siw_qp_put(qp); + + return rv; +} + +/* + * siw_accept - Let SoftiWARP accept an RDMA connection request + * + * @id: New connection management id to be used for accepted + * connection request + * @params: Connection parameters provided by ULP for accepting connection + * + * Transition QP to RTS state, associate new CM id @id with accepted CEP + * and get prepared for TCP input by installing socket callbacks. + * Then send MPA Reply and generate the "connection established" event. + * Socket callbacks must be installed before sending MPA Reply, because + * the latter may cause a first RDMA message to arrive from the RDMA Initiator + * side very quickly, at which time the socket callbacks must be ready. 
+ */ +int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) +{ + struct siw_device *sdev = to_siw_dev(id->device); + struct siw_cep *cep = (struct siw_cep *)id->provider_data; + struct siw_qp *qp; + struct siw_qp_attrs qp_attrs; + int rv, max_priv_data = MPA_MAX_PRIVDATA; + bool wait_for_peer_rts = false; + + siw_cep_set_inuse(cep); + siw_cep_put(cep); + + /* Free lingering inbound private data */ + if (cep->mpa.hdr.params.pd_len) { + cep->mpa.hdr.params.pd_len = 0; + kfree(cep->mpa.pdata); + cep->mpa.pdata = NULL; + } + siw_cancel_mpatimer(cep); + + if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) { + siw_dbg_cep(cep, "id 0x%p: out of state\n", id); + + siw_cep_set_free(cep); + siw_cep_put(cep); + + return -ECONNRESET; + } + qp = siw_qp_id2obj(sdev, params->qpn); + if (!qp) { + WARN(1, "[QP %d] does not exist\n", params->qpn); + siw_cep_set_free(cep); + siw_cep_put(cep); + + return -EINVAL; + } + down_write(&qp->state_lock); + if (qp->attrs.state > SIW_QP_STATE_RTR) { + rv = -EINVAL; + up_write(&qp->state_lock); + goto error; + } + siw_dbg_cep(cep, "id 0x%p\n", id); + + if (try_gso && cep->mpa.hdr.params.bits & MPA_RR_FLAG_GSO_EXP) { + siw_dbg_cep(cep, "peer allows GSO on TX\n"); + qp->tx_ctx.gso_seg_limit = 0; + } + if (params->ord > sdev->attrs.max_ord || + params->ird > sdev->attrs.max_ird) { + siw_dbg_cep( + cep, + "id 0x%p, [QP %u]: ord %d (max %d), ird %d (max %d)\n", + id, qp_id(qp), params->ord, sdev->attrs.max_ord, + params->ird, sdev->attrs.max_ird); + rv = -EINVAL; + up_write(&qp->state_lock); + goto error; + } + if (cep->enhanced_rdma_conn_est) + max_priv_data -= sizeof(struct mpa_v2_data); + + if (params->private_data_len > max_priv_data) { + siw_dbg_cep( + cep, + "id 0x%p, [QP %u]: private data length: %d (max %d)\n", + id, qp_id(qp), params->private_data_len, max_priv_data); + rv = -EINVAL; + up_write(&qp->state_lock); + goto error; + } + if (cep->enhanced_rdma_conn_est) { + if (params->ord > cep->ord) { + if (relaxed_ird_negotiation) { + params->ord = cep->ord; + } else { + cep->ird = params->ird; + cep->ord = params->ord; + rv = -EINVAL; + up_write(&qp->state_lock); + goto error; + } + } + if (params->ird < cep->ird) { + if (relaxed_ird_negotiation && + cep->ird <= sdev->attrs.max_ird) + params->ird = cep->ird; + else { + rv = -ENOMEM; + up_write(&qp->state_lock); + goto error; + } + } + if (cep->mpa.v2_ctrl.ord & + (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR)) + wait_for_peer_rts = true; + /* + * Signal back negotiated IRD and ORD values + */ + cep->mpa.v2_ctrl.ord = + htons(params->ord & MPA_IRD_ORD_MASK) | + (cep->mpa.v2_ctrl.ord & ~MPA_V2_MASK_IRD_ORD); + cep->mpa.v2_ctrl.ird = + htons(params->ird & MPA_IRD_ORD_MASK) | + (cep->mpa.v2_ctrl.ird & ~MPA_V2_MASK_IRD_ORD); + } + cep->ird = params->ird; + cep->ord = params->ord; + + cep->cm_id = id; + id->add_ref(id); + + memset(&qp_attrs, 0, sizeof(qp_attrs)); + qp_attrs.orq_size = cep->ord; + qp_attrs.irq_size = cep->ird; + qp_attrs.sk = cep->sock; + if (cep->mpa.hdr.params.bits & MPA_RR_FLAG_CRC) + qp_attrs.flags = SIW_MPA_CRC; + qp_attrs.state = SIW_QP_STATE_RTS; + + siw_dbg_cep(cep, "id 0x%p, [QP%u]: moving to rts\n", id, qp_id(qp)); + + /* Associate QP with CEP */ + siw_cep_get(cep); + qp->cep = cep; + + /* siw_qp_get(qp) already done by QP lookup */ + cep->qp = qp; + + cep->state = SIW_EPSTATE_RDMA_MODE; + + /* Move socket RX/TX under QP control */ + rv = siw_qp_modify(qp, &qp_attrs, + SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE | + SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | + SIW_QP_ATTR_MPA); + 
up_write(&qp->state_lock); + + if (rv) + goto error; + + siw_dbg_cep(cep, "id 0x%p, [QP %u]: send mpa reply, %d byte pdata\n", + id, qp_id(qp), params->private_data_len); + + rv = siw_send_mpareqrep(cep, params->private_data, + params->private_data_len); + if (rv != 0) + goto error; + + if (wait_for_peer_rts) { + siw_sk_assign_rtr_upcalls(cep); + } else { + siw_qp_socket_assoc(cep, qp); + rv = siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0); + if (rv) + goto error; + } + siw_cep_set_free(cep); + + return 0; +error: + siw_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + + cep->state = SIW_EPSTATE_CLOSED; + + if (cep->cm_id) { + cep->cm_id->rem_ref(id); + cep->cm_id = NULL; + } + if (qp->cep) { + siw_cep_put(cep); + qp->cep = NULL; + } + cep->qp = NULL; + siw_qp_put(qp); + + siw_cep_set_free(cep); + siw_cep_put(cep); + + return rv; +} + +/* + * siw_reject() + * + * Local connection reject case. Send private data back to peer, + * close connection and dereference connection id. + */ +int siw_reject(struct iw_cm_id *id, const void *pdata, u8 pd_len) +{ + struct siw_cep *cep = (struct siw_cep *)id->provider_data; + + siw_cep_set_inuse(cep); + siw_cep_put(cep); + + siw_cancel_mpatimer(cep); + + if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) { + siw_dbg_cep(cep, "id 0x%p: out of state\n", id); + + siw_cep_set_free(cep); + siw_cep_put(cep); /* put last reference */ + + return -ECONNRESET; + } + siw_dbg_cep(cep, "id 0x%p, cep->state %d, pd_len %d\n", id, cep->state, + pd_len); + + if (__mpa_rr_revision(cep->mpa.hdr.params.bits) >= MPA_REVISION_1) { + cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */ + siw_send_mpareqrep(cep, pdata, pd_len); + } + siw_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + + cep->state = SIW_EPSTATE_CLOSED; + + siw_cep_set_free(cep); + siw_cep_put(cep); + + return 0; +} + +static int siw_listen_address(struct iw_cm_id *id, int backlog, + struct sockaddr *laddr, int addr_family) +{ + struct socket *s; + struct siw_cep *cep = NULL; + struct siw_device *sdev = to_siw_dev(id->device); + int rv = 0, s_val; + + rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s); + if (rv < 0) + return rv; + + /* + * Allow binding local port when still in TIME_WAIT from last close. + */ + s_val = 1; + rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val, + sizeof(s_val)); + if (rv) { + siw_dbg(id->device, "id 0x%p: setsockopt error: %d\n", id, rv); + goto error; + } + rv = s->ops->bind(s, laddr, addr_family == AF_INET ? + sizeof(struct sockaddr_in) : + sizeof(struct sockaddr_in6)); + if (rv) { + siw_dbg(id->device, "id 0x%p: socket bind error: %d\n", id, rv); + goto error; + } + cep = siw_cep_alloc(sdev); + if (!cep) { + rv = -ENOMEM; + goto error; + } + siw_cep_socket_assoc(cep, s); + + rv = siw_cm_alloc_work(cep, backlog); + if (rv) { + siw_dbg(id->device, + "id 0x%p: alloc_work error %d, backlog %d\n", id, + rv, backlog); + goto error; + } + rv = s->ops->listen(s, backlog); + if (rv) { + siw_dbg(id->device, "id 0x%p: listen error %d\n", id, rv); + goto error; + } + cep->cm_id = id; + id->add_ref(id); + + /* + * In case of a wildcard rdma_listen on a multi-homed device, + * a listener's IWCM id is associated with more than one listening CEP. + * + * We currently use id->provider_data in three different ways: + * + * o For a listener's IWCM id, id->provider_data points to + * the list_head of the list of listening CEPs. 
+ * Uses: siw_create_listen(), siw_destroy_listen() + * + * o For each accepted passive-side IWCM id, id->provider_data + * points to the CEP itself. This is a consequence of + * - siw_cm_upcall() setting event.provider_data = cep and + * - the IWCM's cm_conn_req_handler() setting provider_data of the + * new passive-side IWCM id equal to event.provider_data + * Uses: siw_accept(), siw_reject() + * + * o For an active-side IWCM id, id->provider_data is not used at all. + * + */ + if (!id->provider_data) { + id->provider_data = + kmalloc(sizeof(struct list_head), GFP_KERNEL); + if (!id->provider_data) { + rv = -ENOMEM; + goto error; + } + INIT_LIST_HEAD((struct list_head *)id->provider_data); + } + list_add_tail(&cep->listenq, (struct list_head *)id->provider_data); + cep->state = SIW_EPSTATE_LISTENING; + + if (addr_family == AF_INET) + siw_dbg(id->device, "Listen at laddr %pI4 %u\n", + &(((struct sockaddr_in *)laddr)->sin_addr), + ((struct sockaddr_in *)laddr)->sin_port); + else + siw_dbg(id->device, "Listen at laddr %pI6 %u\n", + &(((struct sockaddr_in6 *)laddr)->sin6_addr), + ((struct sockaddr_in6 *)laddr)->sin6_port); + + return 0; + +error: + siw_dbg(id->device, "failed: %d\n", rv); + + if (cep) { + siw_cep_set_inuse(cep); + + if (cep->cm_id) { + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + } + cep->sock = NULL; + siw_socket_disassoc(s); + cep->state = SIW_EPSTATE_CLOSED; + + siw_cep_set_free(cep); + siw_cep_put(cep); + } + sock_release(s); + + return rv; +} + +static void siw_drop_listeners(struct iw_cm_id *id) +{ + struct list_head *p, *tmp; + + /* + * In case of a wildcard rdma_listen on a multi-homed device, + * a listener's IWCM id is associated with more than one listening CEP. + */ + list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) { + struct siw_cep *cep = list_entry(p, struct siw_cep, listenq); + + list_del(p); + + siw_dbg_cep(cep, "id 0x%p: drop cep, state %d\n", id, + cep->state); + + siw_cep_set_inuse(cep); + + if (cep->cm_id) { + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + } + if (cep->sock) { + siw_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + } + cep->state = SIW_EPSTATE_CLOSED; + siw_cep_set_free(cep); + siw_cep_put(cep); + } +} + +/* + * siw_create_listen - Create resources for a listener's IWCM ID @id + * + * Listens on the socket addresses id->local_addr and id->remote_addr. + * + * If the listener's @id provides a specific local IP address, at most one + * listening socket is created and associated with @id. + * + * If the listener's @id provides the wildcard (zero) local IP address, + * a separate listen is performed for each local IP address of the device + * by creating a listening socket and binding to that local IP address. + * + */ +int siw_create_listen(struct iw_cm_id *id, int backlog) +{ + struct net_device *dev = to_siw_dev(id->device)->netdev; + int rv = 0, listeners = 0; + + siw_dbg(id->device, "id 0x%p: backlog %d\n", id, backlog); + + /* + * For each attached address of the interface, create a + * listening socket, if id->local_addr is the wildcard + * IP address or matches the IP address. 
+ */ + if (id->local_addr.ss_family == AF_INET) { + struct in_device *in_dev = in_dev_get(dev); + struct sockaddr_in s_laddr, *s_raddr; + + memcpy(&s_laddr, &id->local_addr, sizeof(s_laddr)); + s_raddr = (struct sockaddr_in *)&id->remote_addr; + + siw_dbg(id->device, + "id 0x%p: laddr %pI4:%d, raddr %pI4:%d\n", + id, &s_laddr.sin_addr, ntohs(s_laddr.sin_port), + &s_raddr->sin_addr, ntohs(s_raddr->sin_port)); + + for_ifa(in_dev) + { + if (ipv4_is_zeronet(s_laddr.sin_addr.s_addr) || + s_laddr.sin_addr.s_addr == ifa->ifa_address) { + s_laddr.sin_addr.s_addr = ifa->ifa_address; + + rv = siw_listen_address(id, backlog, + (struct sockaddr *)&s_laddr, + AF_INET); + if (!rv) + listeners++; + } + } + endfor_ifa(in_dev); + in_dev_put(in_dev); + } else if (id->local_addr.ss_family == AF_INET6) { + struct inet6_dev *in6_dev = in6_dev_get(dev); + struct inet6_ifaddr *ifp; + struct sockaddr_in6 *s_laddr = &to_sockaddr_in6(id->local_addr), + *s_raddr = &to_sockaddr_in6(id->remote_addr); + + siw_dbg(id->device, + "id 0x%p: laddr %pI6:%d, raddr %pI6:%d\n", + id, &s_laddr->sin6_addr, ntohs(s_laddr->sin6_port), + &s_raddr->sin6_addr, ntohs(s_raddr->sin6_port)); + + read_lock_bh(&in6_dev->lock); + list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { + struct sockaddr_in6 bind_addr; + + if (ipv6_addr_any(&s_laddr->sin6_addr) || + ipv6_addr_equal(&s_laddr->sin6_addr, &ifp->addr)) { + bind_addr.sin6_family = AF_INET6; + bind_addr.sin6_port = s_laddr->sin6_port; + bind_addr.sin6_flowinfo = 0; + bind_addr.sin6_addr = ifp->addr; + bind_addr.sin6_scope_id = dev->ifindex; + + rv = siw_listen_address(id, backlog, + (struct sockaddr *)&bind_addr, + AF_INET6); + if (!rv) + listeners++; + } + } + read_unlock_bh(&in6_dev->lock); + + in6_dev_put(in6_dev); + } else { + return -EAFNOSUPPORT; + } + if (listeners) + rv = 0; + else if (!rv) + rv = -EINVAL; + + siw_dbg(id->device, "id 0x%p: %s\n", id, rv ? "FAIL" : "OK"); + + return rv; +} + +int siw_destroy_listen(struct iw_cm_id *id) +{ + siw_dbg(id->device, "id 0x%p\n", id); + + if (!id->provider_data) { + siw_dbg(id->device, "id 0x%p: no cep(s)\n", id); + return 0; + } + siw_drop_listeners(id); + kfree(id->provider_data); + id->provider_data = NULL; + + return 0; +} + +int siw_cm_init(void) +{ + /* + * create_single_workqueue for strict ordering + */ + siw_cm_wq = create_singlethread_workqueue("siw_cm_wq"); + if (!siw_cm_wq) + return -ENOMEM; + + return 0; +} + +void siw_cm_exit(void) +{ + if (siw_cm_wq) { + flush_workqueue(siw_cm_wq); + destroy_workqueue(siw_cm_wq); + } +} diff --git a/drivers/infiniband/sw/siw/siw_cm.h b/drivers/infiniband/sw/siw/siw_cm.h new file mode 100644 index 000000000000..8c59cb3e2868 --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_cm.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ + +/* Authors: Bernard Metzler */ +/* Greg Joyce */ +/* Copyright (c) 2008-2019, IBM Corporation */ +/* Copyright (c) 2017, Open Grid Computing, Inc. 
*/ + +#ifndef _SIW_CM_H +#define _SIW_CM_H + +#include +#include + +#include + +enum siw_cep_state { + SIW_EPSTATE_IDLE = 1, + SIW_EPSTATE_LISTENING, + SIW_EPSTATE_CONNECTING, + SIW_EPSTATE_AWAIT_MPAREQ, + SIW_EPSTATE_RECVD_MPAREQ, + SIW_EPSTATE_AWAIT_MPAREP, + SIW_EPSTATE_RDMA_MODE, + SIW_EPSTATE_CLOSED +}; + +struct siw_mpa_info { + struct mpa_rr hdr; /* peer mpa hdr in host byte order */ + struct mpa_v2_data v2_ctrl; + struct mpa_v2_data v2_ctrl_req; + char *pdata; + int bytes_rcvd; +}; + +struct siw_device; + +struct siw_cep { + struct iw_cm_id *cm_id; + struct siw_device *sdev; + struct list_head devq; + spinlock_t lock; + struct kref ref; + int in_use; + wait_queue_head_t waitq; + enum siw_cep_state state; + + struct list_head listenq; + struct siw_cep *listen_cep; + + struct siw_qp *qp; + struct socket *sock; + + struct siw_cm_work *mpa_timer; + struct list_head work_freelist; + + struct siw_mpa_info mpa; + int ord; + int ird; + bool enhanced_rdma_conn_est; + + /* Saved upcalls of socket */ + void (*sk_state_change)(struct sock *sk); + void (*sk_data_ready)(struct sock *sk); + void (*sk_write_space)(struct sock *sk); + void (*sk_error_report)(struct sock *sk); +}; + +/* + * Connection initiator waits 10 seconds to receive an + * MPA reply after sending out MPA request. Reponder waits for + * 5 seconds for MPA request to arrive if new TCP connection + * was set up. + */ +#define MPAREQ_TIMEOUT (HZ * 10) +#define MPAREP_TIMEOUT (HZ * 5) + +enum siw_work_type { + SIW_CM_WORK_ACCEPT = 1, + SIW_CM_WORK_READ_MPAHDR, + SIW_CM_WORK_CLOSE_LLP, /* close socket */ + SIW_CM_WORK_PEER_CLOSE, /* socket indicated peer close */ + SIW_CM_WORK_MPATIMEOUT +}; + +struct siw_cm_work { + struct delayed_work work; + struct list_head list; + enum siw_work_type type; + struct siw_cep *cep; +}; + +#define to_sockaddr_in(a) (*(struct sockaddr_in *)(&(a))) +#define to_sockaddr_in6(a) (*(struct sockaddr_in6 *)(&(a))) + +static inline int getname_peer(struct socket *s, struct sockaddr_storage *a) +{ + return s->ops->getname(s, (struct sockaddr *)a, 1); +} + +static inline int getname_local(struct socket *s, struct sockaddr_storage *a) +{ + return s->ops->getname(s, (struct sockaddr *)a, 0); +} + +static inline int ksock_recv(struct socket *sock, char *buf, size_t size, + int flags) +{ + struct kvec iov = { buf, size }; + struct msghdr msg = { .msg_name = NULL, .msg_flags = flags }; + + return kernel_recvmsg(sock, &msg, &iov, 1, size, flags); +} + +int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *parm); +int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *param); +int siw_reject(struct iw_cm_id *id, const void *data, u8 len); +int siw_create_listen(struct iw_cm_id *id, int backlog); +int siw_destroy_listen(struct iw_cm_id *id); + +void siw_cep_get(struct siw_cep *cep); +void siw_cep_put(struct siw_cep *cep); +int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type); + +int siw_cm_init(void); +void siw_cm_exit(void); + +/* + * TCP socket interface + */ +#define sk_to_qp(sk) (((struct siw_cep *)((sk)->sk_user_data))->qp) +#define sk_to_cep(sk) ((struct siw_cep *)((sk)->sk_user_data)) + +#endif From 303ae1cdfdf7280ff4cfbbe65563b5ff15bb025b Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 20 Jun 2019 18:21:27 +0200 Subject: [PATCH 123/194] rdma/siw: application interface Broken up commit to add the Soft iWarp RDMA driver. 
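For illustration only (not part of this patch): a minimal user-space sketch, assuming the matching rdma-core provider is available, showing how an application reaches the verbs added here through the standard libibverbs API. The function name, CQ depth and device selection are made up for the example, and error handling is simplified.

/* Illustrative sketch; not part of the kernel patch. */
#include <infiniband/verbs.h>

int siw_verbs_smoke_test(void)
{
	int num = 0;
	struct ibv_device **list = ibv_get_device_list(&num);
	struct ibv_context *ctx;
	struct ibv_pd *pd;
	struct ibv_cq *cq;

	if (!list || num == 0)
		return -1;

	/* Opening the device allocates a user context (siw_alloc_ucontext) */
	ctx = ibv_open_device(list[0]);
	if (!ctx)
		return -1;

	/* Protection domain and completion queue (siw_alloc_pd, siw_create_cq) */
	pd = ibv_alloc_pd(ctx);
	cq = ibv_create_cq(ctx, 16, NULL, NULL, 0);
	if (!pd || !cq)
		return -1;

	/* ... create a QP, register memory, post work requests ... */

	ibv_destroy_cq(cq);
	ibv_dealloc_pd(pd);
	ibv_close_device(ctx);
	ibv_free_device_list(list);
	return 0;
}
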
Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_verbs.c | 1760 ++++++++++++++++++++++ drivers/infiniband/sw/siw/siw_verbs.h | 91 ++ include/uapi/rdma/rdma_user_ioctl_cmds.h | 1 + include/uapi/rdma/siw-abi.h | 185 +++ 4 files changed, 2037 insertions(+) create mode 100644 drivers/infiniband/sw/siw/siw_verbs.c create mode 100644 drivers/infiniband/sw/siw/siw_verbs.h create mode 100644 include/uapi/rdma/siw-abi.h diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c new file mode 100644 index 000000000000..32dc79d0e898 --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -0,0 +1,1760 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "siw.h" +#include "siw_verbs.h" +#include "siw_mem.h" + +static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = SIW_QP_STATE_IDLE, + [IB_QPS_INIT] = SIW_QP_STATE_IDLE, + [IB_QPS_RTR] = SIW_QP_STATE_RTR, + [IB_QPS_RTS] = SIW_QP_STATE_RTS, + [IB_QPS_SQD] = SIW_QP_STATE_CLOSING, + [IB_QPS_SQE] = SIW_QP_STATE_TERMINATE, + [IB_QPS_ERR] = SIW_QP_STATE_ERROR +}; + +static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = { + [IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR", + [IB_QPS_RTS] = "RTS", [IB_QPS_SQD] = "SQD", [IB_QPS_SQE] = "SQE", + [IB_QPS_ERR] = "ERR" +}; + +static u32 siw_create_uobj(struct siw_ucontext *uctx, void *vaddr, u32 size) +{ + struct siw_uobj *uobj; + struct xa_limit limit = XA_LIMIT(0, SIW_UOBJ_MAX_KEY); + u32 key; + + uobj = kzalloc(sizeof(*uobj), GFP_KERNEL); + if (!uobj) + return SIW_INVAL_UOBJ_KEY; + + if (xa_alloc_cyclic(&uctx->xa, &key, uobj, limit, &uctx->uobj_nextkey, + GFP_KERNEL) < 0) { + kfree(uobj); + return SIW_INVAL_UOBJ_KEY; + } + uobj->size = PAGE_ALIGN(size); + uobj->addr = vaddr; + + return key; +} + +static struct siw_uobj *siw_get_uobj(struct siw_ucontext *uctx, + unsigned long off, u32 size) +{ + struct siw_uobj *uobj = xa_load(&uctx->xa, off); + + if (uobj && uobj->size == size) + return uobj; + + return NULL; +} + +int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma) +{ + struct siw_ucontext *uctx = to_siw_ctx(ctx); + struct siw_uobj *uobj; + unsigned long off = vma->vm_pgoff; + int size = vma->vm_end - vma->vm_start; + int rv = -EINVAL; + + /* + * Must be page aligned + */ + if (vma->vm_start & (PAGE_SIZE - 1)) { + pr_warn("siw: mmap not page aligned\n"); + goto out; + } + uobj = siw_get_uobj(uctx, off, size); + if (!uobj) { + siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %u\n", + off, size); + goto out; + } + rv = remap_vmalloc_range(vma, uobj->addr, 0); + if (rv) + pr_warn("remap_vmalloc_range failed: %lu, %u\n", off, size); +out: + return rv; +} + +int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(base_ctx->device); + struct siw_ucontext *ctx = to_siw_ctx(base_ctx); + struct siw_uresp_alloc_ctx uresp = {}; + int rv; + + if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) { + rv = -ENOMEM; + goto err_out; + } + xa_init_flags(&ctx->xa, XA_FLAGS_ALLOC); + ctx->uobj_nextkey = 0; + ctx->sdev = sdev; + + uresp.dev_id = sdev->vendor_part_id; + + if (udata->outlen < sizeof(uresp)) { + rv = -EINVAL; + goto err_out; + } + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rv) + goto 
err_out; + + siw_dbg(base_ctx->device, "success. now %d context(s)\n", + atomic_read(&sdev->num_ctx)); + + return 0; + +err_out: + atomic_dec(&sdev->num_ctx); + siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv, + atomic_read(&sdev->num_ctx)); + + return rv; +} + +void siw_dealloc_ucontext(struct ib_ucontext *base_ctx) +{ + struct siw_ucontext *uctx = to_siw_ctx(base_ctx); + void *entry; + unsigned long index; + + /* + * Make sure all user mmap objects are gone. Since QP, CQ + * and SRQ destroy routines destroy related objects, nothing + * should be found here. + */ + xa_for_each(&uctx->xa, index, entry) { + kfree(xa_erase(&uctx->xa, index)); + pr_warn("siw: dropping orphaned uobj at %lu\n", index); + } + xa_destroy(&uctx->xa); + atomic_dec(&uctx->sdev->num_ctx); +} + +int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr, + struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(base_dev); + + if (udata->inlen || udata->outlen) + return -EINVAL; + + memset(attr, 0, sizeof(*attr)); + + /* Revisit atomic caps if RFC 7306 gets supported */ + attr->atomic_cap = 0; + attr->device_cap_flags = + IB_DEVICE_MEM_MGT_EXTENSIONS | IB_DEVICE_ALLOW_USER_UNREG; + attr->max_cq = sdev->attrs.max_cq; + attr->max_cqe = sdev->attrs.max_cqe; + attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL; + attr->max_fmr = sdev->attrs.max_fmr; + attr->max_mr = sdev->attrs.max_mr; + attr->max_mw = sdev->attrs.max_mw; + attr->max_mr_size = ~0ull; + attr->max_pd = sdev->attrs.max_pd; + attr->max_qp = sdev->attrs.max_qp; + attr->max_qp_init_rd_atom = sdev->attrs.max_ird; + attr->max_qp_rd_atom = sdev->attrs.max_ord; + attr->max_qp_wr = sdev->attrs.max_qp_wr; + attr->max_recv_sge = sdev->attrs.max_sge; + attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird; + attr->max_send_sge = sdev->attrs.max_sge; + attr->max_sge_rd = sdev->attrs.max_sge_rd; + attr->max_srq = sdev->attrs.max_srq; + attr->max_srq_sge = sdev->attrs.max_srq_sge; + attr->max_srq_wr = sdev->attrs.max_srq_wr; + attr->page_size_cap = PAGE_SIZE; + attr->vendor_id = SIW_VENDOR_ID; + attr->vendor_part_id = sdev->vendor_part_id; + + memcpy(&attr->sys_image_guid, sdev->netdev->dev_addr, 6); + + return 0; +} + +int siw_query_port(struct ib_device *base_dev, u8 port, + struct ib_port_attr *attr) +{ + struct siw_device *sdev = to_siw_dev(base_dev); + + memset(attr, 0, sizeof(*attr)); + + attr->active_mtu = attr->max_mtu; + attr->active_speed = 2; + attr->active_width = 2; + attr->gid_tbl_len = 1; + attr->max_msg_sz = -1; + attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); + attr->phys_state = sdev->state == IB_PORT_ACTIVE ? 
5 : 3; + attr->pkey_tbl_len = 1; + attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP; + attr->state = sdev->state; + /* + * All zero + * + * attr->lid = 0; + * attr->bad_pkey_cntr = 0; + * attr->qkey_viol_cntr = 0; + * attr->sm_lid = 0; + * attr->lmc = 0; + * attr->max_vl_num = 0; + * attr->sm_sl = 0; + * attr->subnet_timeout = 0; + * attr->init_type_repy = 0; + */ + return 0; +} + +int siw_get_port_immutable(struct ib_device *base_dev, u8 port, + struct ib_port_immutable *port_immutable) +{ + struct ib_port_attr attr; + int rv = siw_query_port(base_dev, port, &attr); + + if (rv) + return rv; + + port_immutable->pkey_tbl_len = attr.pkey_tbl_len; + port_immutable->gid_tbl_len = attr.gid_tbl_len; + port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; + + return 0; +} + +int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey) +{ + /* Report the default pkey */ + *pkey = 0xffff; + return 0; +} + +int siw_query_gid(struct ib_device *base_dev, u8 port, int idx, + union ib_gid *gid) +{ + struct siw_device *sdev = to_siw_dev(base_dev); + + /* subnet_prefix == interface_id == 0; */ + memset(gid, 0, sizeof(*gid)); + memcpy(&gid->raw[0], sdev->netdev->dev_addr, 6); + + return 0; +} + +int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + + if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) { + atomic_dec(&sdev->num_pd); + return -ENOMEM; + } + siw_dbg_pd(pd, "now %d PD's(s)\n", atomic_read(&sdev->num_pd)); + + return 0; +} + +void siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + + siw_dbg_pd(pd, "free PD\n"); + atomic_dec(&sdev->num_pd); +} + +void siw_qp_get_ref(struct ib_qp *base_qp) +{ + siw_qp_get(to_siw_qp(base_qp)); +} + +void siw_qp_put_ref(struct ib_qp *base_qp) +{ + siw_qp_put(to_siw_qp(base_qp)); +} + +/* + * siw_create_qp() + * + * Create QP of requested size on given device. + * + * @pd: Protection Domain + * @attrs: Initial QP attributes. + * @udata: used to provide QP ID, SQ and RQ size back to user. 
+ */ + +struct ib_qp *siw_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *attrs, + struct ib_udata *udata) +{ + struct siw_qp *qp = NULL; + struct siw_base_qp *siw_base_qp = NULL; + struct ib_device *base_dev = pd->device; + struct siw_device *sdev = to_siw_dev(base_dev); + struct siw_ucontext *uctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + struct siw_cq *scq = NULL, *rcq = NULL; + unsigned long flags; + int num_sqe, num_rqe, rv = 0; + + siw_dbg(base_dev, "create new QP\n"); + + if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) { + siw_dbg(base_dev, "too many QP's\n"); + rv = -ENOMEM; + goto err_out; + } + if (attrs->qp_type != IB_QPT_RC) { + siw_dbg(base_dev, "only RC QP's supported\n"); + rv = -EINVAL; + goto err_out; + } + if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) || + (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) || + (attrs->cap.max_send_sge > SIW_MAX_SGE) || + (attrs->cap.max_recv_sge > SIW_MAX_SGE)) { + siw_dbg(base_dev, "QP size error\n"); + rv = -EINVAL; + goto err_out; + } + if (attrs->cap.max_inline_data > SIW_MAX_INLINE) { + siw_dbg(base_dev, "max inline send: %d > %d\n", + attrs->cap.max_inline_data, (int)SIW_MAX_INLINE); + rv = -EINVAL; + goto err_out; + } + /* + * NOTE: we allow for zero element SQ and RQ WQE's SGL's + * but not for a QP unable to hold any WQE (SQ + RQ) + */ + if (attrs->cap.max_send_wr + attrs->cap.max_recv_wr == 0) { + siw_dbg(base_dev, "QP must have send or receive queue\n"); + rv = -EINVAL; + goto err_out; + } + scq = to_siw_cq(attrs->send_cq); + rcq = to_siw_cq(attrs->recv_cq); + + if (!scq || (!rcq && !attrs->srq)) { + siw_dbg(base_dev, "send CQ or receive CQ invalid\n"); + rv = -EINVAL; + goto err_out; + } + siw_base_qp = kzalloc(sizeof(*siw_base_qp), GFP_KERNEL); + if (!siw_base_qp) { + rv = -ENOMEM; + goto err_out; + } + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) { + rv = -ENOMEM; + goto err_out; + } + siw_base_qp->qp = qp; + qp->ib_qp = &siw_base_qp->base_qp; + + init_rwsem(&qp->state_lock); + spin_lock_init(&qp->sq_lock); + spin_lock_init(&qp->rq_lock); + spin_lock_init(&qp->orq_lock); + + qp->kernel_verbs = !udata; + qp->xa_sq_index = SIW_INVAL_UOBJ_KEY; + qp->xa_rq_index = SIW_INVAL_UOBJ_KEY; + + rv = siw_qp_add(sdev, qp); + if (rv) + goto err_out; + + /* All queue indices are derived from modulo operations + * on a free running 'get' (consumer) and 'put' (producer) + * unsigned counter. Having queue sizes at power of two + * avoids handling counter wrap around. + */ + num_sqe = roundup_pow_of_two(attrs->cap.max_send_wr); + num_rqe = roundup_pow_of_two(attrs->cap.max_recv_wr); + + if (qp->kernel_verbs) + qp->sendq = vzalloc(num_sqe * sizeof(struct siw_sqe)); + else + qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe)); + + if (qp->sendq == NULL) { + siw_dbg(base_dev, "SQ size %d alloc failed\n", num_sqe); + rv = -ENOMEM; + goto err_out_xa; + } + if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) { + if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->attrs.flags |= SIW_SIGNAL_ALL_WR; + else { + rv = -EINVAL; + goto err_out_xa; + } + } + qp->pd = pd; + qp->scq = scq; + qp->rcq = rcq; + + if (attrs->srq) { + /* + * SRQ support. 
+ * Verbs 6.3.7: ignore RQ size, if SRQ present + * Verbs 6.3.5: do not check PD of SRQ against PD of QP + */ + qp->srq = to_siw_srq(attrs->srq); + qp->attrs.rq_size = 0; + siw_dbg(base_dev, "QP [%u]: [SRQ 0x%p] attached\n", + qp->qp_num, qp->srq); + } else if (num_rqe) { + if (qp->kernel_verbs) + qp->recvq = vzalloc(num_rqe * sizeof(struct siw_rqe)); + else + qp->recvq = + vmalloc_user(num_rqe * sizeof(struct siw_rqe)); + + if (qp->recvq == NULL) { + siw_dbg(base_dev, "RQ size %d alloc failed\n", num_rqe); + rv = -ENOMEM; + goto err_out_xa; + } + qp->attrs.rq_size = num_rqe; + } + qp->attrs.sq_size = num_sqe; + qp->attrs.sq_max_sges = attrs->cap.max_send_sge; + qp->attrs.rq_max_sges = attrs->cap.max_recv_sge; + + /* Make those two tunables fixed for now. */ + qp->tx_ctx.gso_seg_limit = 1; + qp->tx_ctx.zcopy_tx = zcopy_tx; + + qp->attrs.state = SIW_QP_STATE_IDLE; + + if (udata) { + struct siw_uresp_create_qp uresp = {}; + + uresp.num_sqe = num_sqe; + uresp.num_rqe = num_rqe; + uresp.qp_id = qp_id(qp); + + if (qp->sendq) { + qp->xa_sq_index = + siw_create_uobj(uctx, qp->sendq, + num_sqe * sizeof(struct siw_sqe)); + } + if (qp->recvq) { + qp->xa_rq_index = + siw_create_uobj(uctx, qp->recvq, + num_rqe * sizeof(struct siw_rqe)); + } + if (qp->xa_sq_index == SIW_INVAL_UOBJ_KEY || + qp->xa_rq_index == SIW_INVAL_UOBJ_KEY) { + rv = -ENOMEM; + goto err_out_xa; + } + uresp.sq_key = qp->xa_sq_index << PAGE_SHIFT; + uresp.rq_key = qp->xa_rq_index << PAGE_SHIFT; + + if (udata->outlen < sizeof(uresp)) { + rv = -EINVAL; + goto err_out_xa; + } + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rv) + goto err_out_xa; + } + qp->tx_cpu = siw_get_tx_cpu(sdev); + if (qp->tx_cpu < 0) { + rv = -EINVAL; + goto err_out_xa; + } + INIT_LIST_HEAD(&qp->devq); + spin_lock_irqsave(&sdev->lock, flags); + list_add_tail(&qp->devq, &sdev->qp_list); + spin_unlock_irqrestore(&sdev->lock, flags); + + return qp->ib_qp; + +err_out_xa: + xa_erase(&sdev->qp_xa, qp_id(qp)); +err_out: + kfree(siw_base_qp); + + if (qp) { + if (qp->xa_sq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&uctx->xa, qp->xa_sq_index)); + if (qp->xa_rq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&uctx->xa, qp->xa_rq_index)); + + vfree(qp->sendq); + vfree(qp->recvq); + kfree(qp); + } + atomic_dec(&sdev->num_qp); + + return ERR_PTR(rv); +} + +/* + * Minimum siw_query_qp() verb interface. 
+ * + * @qp_attr_mask is not used but all available information is provided + */ +int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + struct siw_qp *qp; + struct siw_device *sdev; + + if (base_qp && qp_attr && qp_init_attr) { + qp = to_siw_qp(base_qp); + sdev = to_siw_dev(base_qp->device); + } else { + return -EINVAL; + } + qp_attr->cap.max_inline_data = SIW_MAX_INLINE; + qp_attr->cap.max_send_wr = qp->attrs.sq_size; + qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges; + qp_attr->cap.max_recv_wr = qp->attrs.rq_size; + qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges; + qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); + qp_attr->max_rd_atomic = qp->attrs.irq_size; + qp_attr->max_dest_rd_atomic = qp->attrs.orq_size; + + qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ; + + qp_init_attr->qp_type = base_qp->qp_type; + qp_init_attr->send_cq = base_qp->send_cq; + qp_init_attr->recv_cq = base_qp->recv_cq; + qp_init_attr->srq = base_qp->srq; + + qp_init_attr->cap = qp_attr->cap; + + return 0; +} + +int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct siw_qp_attrs new_attrs; + enum siw_qp_attr_mask siw_attr_mask = 0; + struct siw_qp *qp = to_siw_qp(base_qp); + int rv = 0; + + if (!attr_mask) + return 0; + + memset(&new_attrs, 0, sizeof(new_attrs)); + + if (attr_mask & IB_QP_ACCESS_FLAGS) { + siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS; + + if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) + new_attrs.flags |= SIW_RDMA_READ_ENABLED; + if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) + new_attrs.flags |= SIW_RDMA_WRITE_ENABLED; + if (attr->qp_access_flags & IB_ACCESS_MW_BIND) + new_attrs.flags |= SIW_RDMA_BIND_ENABLED; + } + if (attr_mask & IB_QP_STATE) { + siw_dbg_qp(qp, "desired IB QP state: %s\n", + ib_qp_state_to_string[attr->qp_state]); + + new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state]; + + if (new_attrs.state > SIW_QP_STATE_RTS) + qp->tx_ctx.tx_suspend = 1; + + siw_attr_mask |= SIW_QP_ATTR_STATE; + } + if (!siw_attr_mask) + goto out; + + down_write(&qp->state_lock); + + rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask); + + up_write(&qp->state_lock); +out: + return rv; +} + +int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata) +{ + struct siw_qp *qp = to_siw_qp(base_qp); + struct siw_base_qp *siw_base_qp = to_siw_base_qp(base_qp); + struct siw_ucontext *uctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + struct siw_qp_attrs qp_attrs; + + siw_dbg_qp(qp, "state %d, cep 0x%p\n", qp->attrs.state, qp->cep); + + /* + * Mark QP as in process of destruction to prevent from + * any async callbacks to RDMA core + */ + qp->attrs.flags |= SIW_QP_IN_DESTROY; + qp->rx_stream.rx_suspend = 1; + + if (uctx && qp->xa_sq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&uctx->xa, qp->xa_sq_index)); + if (uctx && qp->xa_rq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&uctx->xa, qp->xa_rq_index)); + + down_write(&qp->state_lock); + + qp_attrs.state = SIW_QP_STATE_ERROR; + siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE); + + if (qp->cep) { + siw_cep_put(qp->cep); + qp->cep = NULL; + } + up_write(&qp->state_lock); + + kfree(qp->tx_ctx.mpa_crc_hd); + kfree(qp->rx_stream.mpa_crc_hd); + + qp->scq = qp->rcq = NULL; + + siw_qp_put(qp); + kfree(siw_base_qp); + + return 0; +} + +/* + * siw_copy_inline_sgl() + * + * Prepare sgl of inlined data for sending. 
For userland callers + * function checks if given buffer addresses and len's are within + * process context bounds. + * Data from all provided sge's are copied together into the wqe, + * referenced by a single sge. + */ +static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr, + struct siw_sqe *sqe) +{ + struct ib_sge *core_sge = core_wr->sg_list; + void *kbuf = &sqe->sge[1]; + int num_sge = core_wr->num_sge, bytes = 0; + + sqe->sge[0].laddr = (u64)kbuf; + sqe->sge[0].lkey = 0; + + while (num_sge--) { + if (!core_sge->length) { + core_sge++; + continue; + } + bytes += core_sge->length; + if (bytes > SIW_MAX_INLINE) { + bytes = -EINVAL; + break; + } + memcpy(kbuf, (void *)(uintptr_t)core_sge->addr, + core_sge->length); + + kbuf += core_sge->length; + core_sge++; + } + sqe->sge[0].length = bytes > 0 ? bytes : 0; + sqe->num_sge = bytes > 0 ? 1 : 0; + + return bytes; +} + +/* + * siw_post_send() + * + * Post a list of S-WR's to a SQ. + * + * @base_qp: Base QP contained in siw QP + * @wr: Null terminated list of user WR's + * @bad_wr: Points to failing WR in case of synchronous failure. + */ +int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +{ + struct siw_qp *qp = to_siw_qp(base_qp); + struct siw_wqe *wqe = tx_wqe(qp); + + unsigned long flags; + int rv = 0; + + /* + * Try to acquire QP state lock. Must be non-blocking + * to accommodate kernel clients needs. + */ + if (!down_read_trylock(&qp->state_lock)) { + *bad_wr = wr; + siw_dbg_qp(qp, "QP locked, state %d\n", qp->attrs.state); + return -ENOTCONN; + } + if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) { + up_read(&qp->state_lock); + *bad_wr = wr; + siw_dbg_qp(qp, "QP out of state %d\n", qp->attrs.state); + return -ENOTCONN; + } + if (wr && !qp->kernel_verbs) { + siw_dbg_qp(qp, "wr must be empty for user mapped sq\n"); + up_read(&qp->state_lock); + *bad_wr = wr; + return -EINVAL; + } + spin_lock_irqsave(&qp->sq_lock, flags); + + while (wr) { + u32 idx = qp->sq_put % qp->attrs.sq_size; + struct siw_sqe *sqe = &qp->sendq[idx]; + + if (sqe->flags) { + siw_dbg_qp(qp, "sq full\n"); + rv = -ENOMEM; + break; + } + if (wr->num_sge > qp->attrs.sq_max_sges) { + siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge); + rv = -EINVAL; + break; + } + sqe->id = wr->wr_id; + + if ((wr->send_flags & IB_SEND_SIGNALED) || + (qp->attrs.flags & SIW_SIGNAL_ALL_WR)) + sqe->flags |= SIW_WQE_SIGNALLED; + + if (wr->send_flags & IB_SEND_FENCE) + sqe->flags |= SIW_WQE_READ_FENCE; + + switch (wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_INV: + if (wr->send_flags & IB_SEND_SOLICITED) + sqe->flags |= SIW_WQE_SOLICITED; + + if (!(wr->send_flags & IB_SEND_INLINE)) { + siw_copy_sgl(wr->sg_list, sqe->sge, + wr->num_sge); + sqe->num_sge = wr->num_sge; + } else { + rv = siw_copy_inline_sgl(wr, sqe); + if (rv <= 0) { + rv = -EINVAL; + break; + } + sqe->flags |= SIW_WQE_INLINE; + sqe->num_sge = 1; + } + if (wr->opcode == IB_WR_SEND) + sqe->opcode = SIW_OP_SEND; + else { + sqe->opcode = SIW_OP_SEND_REMOTE_INV; + sqe->rkey = wr->ex.invalidate_rkey; + } + break; + + case IB_WR_RDMA_READ_WITH_INV: + case IB_WR_RDMA_READ: + /* + * iWarp restricts RREAD sink to SGL containing + * 1 SGE only. we could relax to SGL with multiple + * elements referring the SAME ltag or even sending + * a private per-rreq tag referring to a checked + * local sgl with MULTIPLE ltag's. 
+ */ + if (unlikely(wr->num_sge != 1)) { + rv = -EINVAL; + break; + } + siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1); + /* + * NOTE: zero length RREAD is allowed! + */ + sqe->raddr = rdma_wr(wr)->remote_addr; + sqe->rkey = rdma_wr(wr)->rkey; + sqe->num_sge = 1; + + if (wr->opcode == IB_WR_RDMA_READ) + sqe->opcode = SIW_OP_READ; + else + sqe->opcode = SIW_OP_READ_LOCAL_INV; + break; + + case IB_WR_RDMA_WRITE: + if (!(wr->send_flags & IB_SEND_INLINE)) { + siw_copy_sgl(wr->sg_list, &sqe->sge[0], + wr->num_sge); + sqe->num_sge = wr->num_sge; + } else { + rv = siw_copy_inline_sgl(wr, sqe); + if (unlikely(rv < 0)) { + rv = -EINVAL; + break; + } + sqe->flags |= SIW_WQE_INLINE; + sqe->num_sge = 1; + } + sqe->raddr = rdma_wr(wr)->remote_addr; + sqe->rkey = rdma_wr(wr)->rkey; + sqe->opcode = SIW_OP_WRITE; + break; + + case IB_WR_REG_MR: + sqe->base_mr = (uint64_t)reg_wr(wr)->mr; + sqe->rkey = reg_wr(wr)->key; + sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK; + sqe->opcode = SIW_OP_REG_MR; + break; + + case IB_WR_LOCAL_INV: + sqe->rkey = wr->ex.invalidate_rkey; + sqe->opcode = SIW_OP_INVAL_STAG; + break; + + default: + siw_dbg_qp(qp, "ib wr type %d unsupported\n", + wr->opcode); + rv = -EINVAL; + break; + } + siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%p\n", + sqe->opcode, sqe->flags, (void *)sqe->id); + + if (unlikely(rv < 0)) + break; + + /* make SQE only valid after completely written */ + smp_wmb(); + sqe->flags |= SIW_WQE_VALID; + + qp->sq_put++; + wr = wr->next; + } + + /* + * Send directly if SQ processing is not in progress. + * Eventual immediate errors (rv < 0) do not affect the involved + * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ + * processing, if new work is already pending. But rv must be passed + * to caller. + */ + if (wqe->wr_status != SIW_WR_IDLE) { + spin_unlock_irqrestore(&qp->sq_lock, flags); + goto skip_direct_sending; + } + rv = siw_activate_tx(qp); + spin_unlock_irqrestore(&qp->sq_lock, flags); + + if (rv <= 0) + goto skip_direct_sending; + + if (qp->kernel_verbs) { + rv = siw_sq_start(qp); + } else { + qp->tx_ctx.in_syscall = 1; + + if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend)) + siw_qp_cm_drop(qp, 0); + + qp->tx_ctx.in_syscall = 0; + } +skip_direct_sending: + + up_read(&qp->state_lock); + + if (rv >= 0) + return 0; + /* + * Immediate error + */ + siw_dbg_qp(qp, "error %d\n", rv); + + *bad_wr = wr; + return rv; +} + +/* + * siw_post_receive() + * + * Post a list of R-WR's to a RQ. + * + * @base_qp: Base QP contained in siw QP + * @wr: Null terminated list of user WR's + * @bad_wr: Points to failing WR in case of synchronous failure. + */ +int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + struct siw_qp *qp = to_siw_qp(base_qp); + unsigned long flags; + int rv = 0; + + if (qp->srq) { + *bad_wr = wr; + return -EOPNOTSUPP; /* what else from errno.h? */ + } + /* + * Try to acquire QP state lock. Must be non-blocking + * to accommodate kernel clients needs. + */ + if (!down_read_trylock(&qp->state_lock)) { + *bad_wr = wr; + return -ENOTCONN; + } + if (!qp->kernel_verbs) { + siw_dbg_qp(qp, "no kernel post_recv for user mapped sq\n"); + up_read(&qp->state_lock); + *bad_wr = wr; + return -EINVAL; + } + if (qp->attrs.state > SIW_QP_STATE_RTS) { + up_read(&qp->state_lock); + *bad_wr = wr; + return -EINVAL; + } + /* + * Serialize potentially multiple producers. + * Not needed for single threaded consumer side. 
+ */ + spin_lock_irqsave(&qp->rq_lock, flags); + + while (wr) { + u32 idx = qp->rq_put % qp->attrs.rq_size; + struct siw_rqe *rqe = &qp->recvq[idx]; + + if (rqe->flags) { + siw_dbg_qp(qp, "RQ full\n"); + rv = -ENOMEM; + break; + } + if (wr->num_sge > qp->attrs.rq_max_sges) { + siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge); + rv = -EINVAL; + break; + } + rqe->id = wr->wr_id; + rqe->num_sge = wr->num_sge; + siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge); + + /* make sure RQE is completely written before valid */ + smp_wmb(); + + rqe->flags = SIW_WQE_VALID; + + qp->rq_put++; + wr = wr->next; + } + spin_unlock_irqrestore(&qp->rq_lock, flags); + + up_read(&qp->state_lock); + + if (rv < 0) { + siw_dbg_qp(qp, "error %d\n", rv); + *bad_wr = wr; + } + return rv > 0 ? 0 : rv; +} + +void siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata) +{ + struct siw_cq *cq = to_siw_cq(base_cq); + struct siw_device *sdev = to_siw_dev(base_cq->device); + struct siw_ucontext *ctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + + siw_dbg_cq(cq, "free CQ resources\n"); + + siw_cq_flush(cq); + + if (ctx && cq->xa_cq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&ctx->xa, cq->xa_cq_index)); + + atomic_dec(&sdev->num_cq); + + vfree(cq->queue); +} + +/* + * siw_create_cq() + * + * Populate CQ of requested size + * + * @base_cq: CQ as allocated by RDMA midlayer + * @attr: Initial CQ attributes + * @udata: relates to user context + */ + +int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(base_cq->device); + struct siw_cq *cq = to_siw_cq(base_cq); + int rv, size = attr->cqe; + + if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) { + siw_dbg(base_cq->device, "too many CQ's\n"); + rv = -ENOMEM; + goto err_out; + } + if (size < 1 || size > sdev->attrs.max_cqe) { + siw_dbg(base_cq->device, "CQ size error: %d\n", size); + rv = -EINVAL; + goto err_out; + } + size = roundup_pow_of_two(size); + cq->base_cq.cqe = size; + cq->num_cqe = size; + cq->xa_cq_index = SIW_INVAL_UOBJ_KEY; + + if (!udata) { + cq->kernel_verbs = 1; + cq->queue = vzalloc(size * sizeof(struct siw_cqe) + + sizeof(struct siw_cq_ctrl)); + } else { + cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) + + sizeof(struct siw_cq_ctrl)); + } + if (cq->queue == NULL) { + rv = -ENOMEM; + goto err_out; + } + get_random_bytes(&cq->id, 4); + siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id); + + spin_lock_init(&cq->lock); + + cq->notify = &((struct siw_cq_ctrl *)&cq->queue[size])->notify; + + if (udata) { + struct siw_uresp_create_cq uresp = {}; + struct siw_ucontext *ctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + + cq->xa_cq_index = + siw_create_uobj(ctx, cq->queue, + size * sizeof(struct siw_cqe) + + sizeof(struct siw_cq_ctrl)); + if (cq->xa_cq_index == SIW_INVAL_UOBJ_KEY) { + rv = -ENOMEM; + goto err_out; + } + uresp.cq_key = cq->xa_cq_index << PAGE_SHIFT; + uresp.cq_id = cq->id; + uresp.num_cqe = size; + + if (udata->outlen < sizeof(uresp)) { + rv = -EINVAL; + goto err_out; + } + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rv) + goto err_out; + } + return 0; + +err_out: + siw_dbg(base_cq->device, "CQ creation failed: %d", rv); + + if (cq && cq->queue) { + struct siw_ucontext *ctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + if (cq->xa_cq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&ctx->xa, cq->xa_cq_index)); + vfree(cq->queue); + } + 
atomic_dec(&sdev->num_cq); + + return rv; +} + +/* + * siw_poll_cq() + * + * Reap CQ entries if available and copy work completion status into + * array of WC's provided by caller. Returns number of reaped CQE's. + * + * @base_cq: Base CQ contained in siw CQ. + * @num_cqe: Maximum number of CQE's to reap. + * @wc: Array of work completions to be filled by siw. + */ +int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc) +{ + struct siw_cq *cq = to_siw_cq(base_cq); + int i; + + for (i = 0; i < num_cqe; i++) { + if (!siw_reap_cqe(cq, wc)) + break; + wc++; + } + return i; +} + +/* + * siw_req_notify_cq() + * + * Request notification for new CQE's added to that CQ. + * Defined flags: + * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification + * event if a WQE with notification flag set enters the CQ + * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification + * event if a WQE enters the CQ. + * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the + * number of not reaped CQE's regardless of its notification + * type and current or new CQ notification settings. + * + * @base_cq: Base CQ contained in siw CQ. + * @flags: Requested notification flags. + */ +int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags) +{ + struct siw_cq *cq = to_siw_cq(base_cq); + + siw_dbg_cq(cq, "flags: 0x%02x\n", flags); + + if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) + /* CQ event for next solicited completion */ + smp_store_mb(*cq->notify, SIW_NOTIFY_SOLICITED); + else + /* CQ event for any signalled completion */ + smp_store_mb(*cq->notify, SIW_NOTIFY_ALL); + + if (flags & IB_CQ_REPORT_MISSED_EVENTS) + return cq->cq_put - cq->cq_get; + + return 0; +} + +/* + * siw_dereg_mr() + * + * Release Memory Region. + * + * @base_mr: Base MR contained in siw MR. + * @udata: points to user context, unused. + */ +int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata) +{ + struct siw_mr *mr = to_siw_mr(base_mr); + struct siw_device *sdev = to_siw_dev(base_mr->device); + + siw_dbg_mem(mr->mem, "deregister MR\n"); + + atomic_dec(&sdev->num_mr); + + siw_mr_drop_mem(mr); + kfree_rcu(mr, rcu); + + return 0; +} + +/* + * siw_reg_user_mr() + * + * Register Memory Region. + * + * @pd: Protection Domain + * @start: starting address of MR (virtual address) + * @len: len of MR + * @rnic_va: not used by siw + * @rights: MR access rights + * @udata: user buffer to communicate STag and Key. 
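+ *
+ * Note: user space passes an 8-bit STag key (struct siw_ureq_reg_mr)
+ * via @udata; it gets OR'ed into the low byte of the kernel assigned
+ * 24-bit STag index, and the resulting STag is returned in struct
+ * siw_uresp_reg_mr.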
+ */ +struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, + u64 rnic_va, int rights, struct ib_udata *udata) +{ + struct siw_mr *mr = NULL; + struct siw_umem *umem = NULL; + struct siw_ureq_reg_mr ureq; + struct siw_device *sdev = to_siw_dev(pd->device); + + unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK); + int rv; + + siw_dbg_pd(pd, "start: 0x%016llx, va: 0x%016llx, len: %llu\n", + (unsigned long long)start, (unsigned long long)rnic_va, + (unsigned long long)len); + + if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { + siw_dbg_pd(pd, "too many mr's\n"); + rv = -ENOMEM; + goto err_out; + } + if (!len) { + rv = -EINVAL; + goto err_out; + } + if (mem_limit != RLIM_INFINITY) { + unsigned long num_pages = + (PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT; + mem_limit >>= PAGE_SHIFT; + + if (num_pages > mem_limit - current->mm->locked_vm) { + siw_dbg_pd(pd, "pages req %lu, max %lu, lock %lu\n", + num_pages, mem_limit, + current->mm->locked_vm); + rv = -ENOMEM; + goto err_out; + } + } + umem = siw_umem_get(start, len, ib_access_writable(rights)); + if (IS_ERR(umem)) { + rv = PTR_ERR(umem); + siw_dbg_pd(pd, "getting user memory failed: %d\n", rv); + umem = NULL; + goto err_out; + } + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + rv = -ENOMEM; + goto err_out; + } + rv = siw_mr_add_mem(mr, pd, umem, start, len, rights); + if (rv) + goto err_out; + + if (udata) { + struct siw_uresp_reg_mr uresp = {}; + struct siw_mem *mem = mr->mem; + + if (udata->inlen < sizeof(ureq)) { + rv = -EINVAL; + goto err_out; + } + rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq)); + if (rv) + goto err_out; + + mr->base_mr.lkey |= ureq.stag_key; + mr->base_mr.rkey |= ureq.stag_key; + mem->stag |= ureq.stag_key; + uresp.stag = mem->stag; + + if (udata->outlen < sizeof(uresp)) { + rv = -EINVAL; + goto err_out; + } + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rv) + goto err_out; + } + mr->mem->stag_valid = 1; + + return &mr->base_mr; + +err_out: + atomic_dec(&sdev->num_mr); + if (mr) { + if (mr->mem) + siw_mr_drop_mem(mr); + kfree_rcu(mr, rcu); + } else { + if (umem) + siw_umem_release(umem, false); + } + return ERR_PTR(rv); +} + +struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, + u32 max_sge, struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + struct siw_mr *mr = NULL; + struct siw_pbl *pbl = NULL; + int rv; + + if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { + siw_dbg_pd(pd, "too many mr's\n"); + rv = -ENOMEM; + goto err_out; + } + if (mr_type != IB_MR_TYPE_MEM_REG) { + siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type); + rv = -EOPNOTSUPP; + goto err_out; + } + if (max_sge > SIW_MAX_SGE_PBL) { + siw_dbg_pd(pd, "too many sge's: %d\n", max_sge); + rv = -ENOMEM; + goto err_out; + } + pbl = siw_pbl_alloc(max_sge); + if (IS_ERR(pbl)) { + rv = PTR_ERR(pbl); + siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv); + pbl = NULL; + goto err_out; + } + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + rv = -ENOMEM; + goto err_out; + } + rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0); + if (rv) + goto err_out; + + mr->mem->is_pbl = 1; + + siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag); + + return &mr->base_mr; + +err_out: + atomic_dec(&sdev->num_mr); + + if (!mr) { + kfree(pbl); + } else { + if (mr->mem) + siw_mr_drop_mem(mr); + kfree_rcu(mr, rcu); + } + siw_dbg_pd(pd, "failed: %d\n", rv); + + return ERR_PTR(rv); +} + +/* Just used to count number of pages being mapped */ +static int siw_set_pbl_page(struct 
ib_mr *base_mr, u64 buf_addr) +{ + return 0; +} + +int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle, + unsigned int *sg_off) +{ + struct scatterlist *slp; + struct siw_mr *mr = to_siw_mr(base_mr); + struct siw_mem *mem = mr->mem; + struct siw_pbl *pbl = mem->pbl; + struct siw_pble *pble; + u64 pbl_size; + int i, rv; + + if (!pbl) { + siw_dbg_mem(mem, "no PBL allocated\n"); + return -EINVAL; + } + pble = pbl->pbe; + + if (pbl->max_buf < num_sle) { + siw_dbg_mem(mem, "too many SGE's: %d > %d\n", + mem->pbl->max_buf, num_sle); + return -ENOMEM; + } + for_each_sg(sl, slp, num_sle, i) { + if (sg_dma_len(slp) == 0) { + siw_dbg_mem(mem, "empty SGE\n"); + return -EINVAL; + } + if (i == 0) { + pble->addr = sg_dma_address(slp); + pble->size = sg_dma_len(slp); + pble->pbl_off = 0; + pbl_size = pble->size; + pbl->num_buf = 1; + } else { + /* Merge PBL entries if adjacent */ + if (pble->addr + pble->size == sg_dma_address(slp)) { + pble->size += sg_dma_len(slp); + } else { + pble++; + pbl->num_buf++; + pble->addr = sg_dma_address(slp); + pble->size = sg_dma_len(slp); + pble->pbl_off = pbl_size; + } + pbl_size += sg_dma_len(slp); + } + siw_dbg_mem(mem, + "sge[%d], size %llu, addr 0x%016llx, total %llu\n", + i, pble->size, pble->addr, pbl_size); + } + rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page); + if (rv > 0) { + mem->len = base_mr->length; + mem->va = base_mr->iova; + siw_dbg_mem(mem, + "%llu bytes, start 0x%016llx, %u SLE to %u entries\n", + mem->len, mem->va, num_sle, pbl->num_buf); + } + return rv; +} + +/* + * siw_get_dma_mr() + * + * Create a (empty) DMA memory region, where no umem is attached. + */ +struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + struct siw_mr *mr = NULL; + int rv; + + if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { + siw_dbg_pd(pd, "too many mr's\n"); + rv = -ENOMEM; + goto err_out; + } + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + rv = -ENOMEM; + goto err_out; + } + rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights); + if (rv) + goto err_out; + + mr->mem->stag_valid = 1; + + siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag); + + return &mr->base_mr; + +err_out: + if (rv) + kfree(mr); + + atomic_dec(&sdev->num_mr); + + return ERR_PTR(rv); +} + +/* + * siw_create_srq() + * + * Create Shared Receive Queue of attributes @init_attrs + * within protection domain given by @pd. + * + * @base_srq: Base SRQ contained in siw SRQ. + * @init_attrs: SRQ init attributes. 
+ * @udata: points to user context + */ +int siw_create_srq(struct ib_srq *base_srq, + struct ib_srq_init_attr *init_attrs, struct ib_udata *udata) +{ + struct siw_srq *srq = to_siw_srq(base_srq); + struct ib_srq_attr *attrs = &init_attrs->attr; + struct siw_device *sdev = to_siw_dev(base_srq->device); + struct siw_ucontext *ctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + int rv; + + if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) { + siw_dbg_pd(base_srq->pd, "too many SRQ's\n"); + rv = -ENOMEM; + goto err_out; + } + if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR || + attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) { + rv = -EINVAL; + goto err_out; + } + srq->max_sge = attrs->max_sge; + srq->num_rqe = roundup_pow_of_two(attrs->max_wr); + srq->xa_srq_index = SIW_INVAL_UOBJ_KEY; + srq->limit = attrs->srq_limit; + if (srq->limit) + srq->armed = 1; + + srq->kernel_verbs = !udata; + + if (udata) + srq->recvq = + vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe)); + else + srq->recvq = vzalloc(srq->num_rqe * sizeof(struct siw_rqe)); + + if (srq->recvq == NULL) { + rv = -ENOMEM; + goto err_out; + } + if (udata) { + struct siw_uresp_create_srq uresp = {}; + + srq->xa_srq_index = siw_create_uobj( + ctx, srq->recvq, srq->num_rqe * sizeof(struct siw_rqe)); + + if (srq->xa_srq_index == SIW_INVAL_UOBJ_KEY) { + rv = -ENOMEM; + goto err_out; + } + uresp.srq_key = srq->xa_srq_index; + uresp.num_rqe = srq->num_rqe; + + if (udata->outlen < sizeof(uresp)) { + rv = -EINVAL; + goto err_out; + } + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rv) + goto err_out; + } + spin_lock_init(&srq->lock); + + siw_dbg_pd(base_srq->pd, "[SRQ 0x%p]: success\n", srq); + + return 0; + +err_out: + if (srq->recvq) { + if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&ctx->xa, srq->xa_srq_index)); + vfree(srq->recvq); + } + atomic_dec(&sdev->num_srq); + + return rv; +} + +/* + * siw_modify_srq() + * + * Modify SRQ. The caller may resize SRQ and/or set/reset notification + * limit and (re)arm IB_EVENT_SRQ_LIMIT_REACHED notification. + * + * NOTE: it is unclear if RDMA core allows for changing the MAX_SGE + * parameter. siw_modify_srq() does not check the attrs->max_sge param. + */ +int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct siw_srq *srq = to_siw_srq(base_srq); + unsigned long flags; + int rv = 0; + + spin_lock_irqsave(&srq->lock, flags); + + if (attr_mask & IB_SRQ_MAX_WR) { + /* resize request not yet supported */ + rv = -EOPNOTSUPP; + goto out; + } + if (attr_mask & IB_SRQ_LIMIT) { + if (attrs->srq_limit) { + if (unlikely(attrs->srq_limit > srq->num_rqe)) { + rv = -EINVAL; + goto out; + } + srq->armed = 1; + } else { + srq->armed = 0; + } + srq->limit = attrs->srq_limit; + } +out: + spin_unlock_irqrestore(&srq->lock, flags); + + return rv; +} + +/* + * siw_query_srq() + * + * Query SRQ attributes. + */ +int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs) +{ + struct siw_srq *srq = to_siw_srq(base_srq); + unsigned long flags; + + spin_lock_irqsave(&srq->lock, flags); + + attrs->max_wr = srq->num_rqe; + attrs->max_sge = srq->max_sge; + attrs->srq_limit = srq->limit; + + spin_unlock_irqrestore(&srq->lock, flags); + + return 0; +} + +/* + * siw_destroy_srq() + * + * Destroy SRQ. 
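+ * Frees the SRQ receive queue memory, removes a possible user space
+ * mapping of it and releases the per device SRQ count.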
+ * It is assumed that the SRQ is not referenced by any + * QP anymore - the code trusts the RDMA core environment to keep track + * of QP references. + */ +void siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata) +{ + struct siw_srq *srq = to_siw_srq(base_srq); + struct siw_device *sdev = to_siw_dev(base_srq->device); + struct siw_ucontext *ctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + + if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&ctx->xa, srq->xa_srq_index)); + + vfree(srq->recvq); + atomic_dec(&sdev->num_srq); +} + +/* + * siw_post_srq_recv() + * + * Post a list of receive queue elements to SRQ. + * NOTE: The function does not check or lock a certain SRQ state + * during the post operation. The code simply trusts the + * RDMA core environment. + * + * @base_srq: Base SRQ contained in siw SRQ + * @wr: List of R-WR's + * @bad_wr: Updated to failing WR if posting fails. + */ +int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + struct siw_srq *srq = to_siw_srq(base_srq); + unsigned long flags; + int rv = 0; + + if (unlikely(!srq->kernel_verbs)) { + siw_dbg_pd(base_srq->pd, + "[SRQ 0x%p]: no kernel post_recv for mapped srq\n", + srq); + rv = -EINVAL; + goto out; + } + /* + * Serialize potentially multiple producers. + * Also needed to serialize potentially multiple + * consumers. + */ + spin_lock_irqsave(&srq->lock, flags); + + while (wr) { + u32 idx = srq->rq_put % srq->num_rqe; + struct siw_rqe *rqe = &srq->recvq[idx]; + + if (rqe->flags) { + siw_dbg_pd(base_srq->pd, "SRQ full\n"); + rv = -ENOMEM; + break; + } + if (unlikely(wr->num_sge > srq->max_sge)) { + siw_dbg_pd(base_srq->pd, + "[SRQ 0x%p]: too many sge's: %d\n", srq, + wr->num_sge); + rv = -EINVAL; + break; + } + rqe->id = wr->wr_id; + rqe->num_sge = wr->num_sge; + siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge); + + /* Make sure S-RQE is completely written before valid */ + smp_wmb(); + + rqe->flags = SIW_WQE_VALID; + + srq->rq_put++; + wr = wr->next; + } + spin_unlock_irqrestore(&srq->lock, flags); +out: + if (unlikely(rv < 0)) { + siw_dbg_pd(base_srq->pd, "[SRQ 0x%p]: error %d\n", srq, rv); + *bad_wr = wr; + } + return rv; +} + +void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype) +{ + struct ib_event event; + struct ib_qp *base_qp = qp->ib_qp; + + /* + * Do not report asynchronous errors on QP which gets + * destroyed via verbs interface (siw_destroy_qp()) + */ + if (qp->attrs.flags & SIW_QP_IN_DESTROY) + return; + + event.event = etype; + event.device = base_qp->device; + event.element.qp = base_qp; + + if (base_qp->event_handler) { + siw_dbg_qp(qp, "reporting event %d\n", etype); + base_qp->event_handler(&event, base_qp->qp_context); + } +} + +void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype) +{ + struct ib_event event; + struct ib_cq *base_cq = &cq->base_cq; + + event.event = etype; + event.device = base_cq->device; + event.element.cq = base_cq; + + if (base_cq->event_handler) { + siw_dbg_cq(cq, "reporting CQ event %d\n", etype); + base_cq->event_handler(&event, base_cq->cq_context); + } +} + +void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype) +{ + struct ib_event event; + struct ib_srq *base_srq = &srq->base_srq; + + event.event = etype; + event.device = base_srq->device; + event.element.srq = base_srq; + + if (base_srq->event_handler) { + siw_dbg_pd(srq->base_srq.pd, + "reporting SRQ event %d\n", etype); + base_srq->event_handler(&event, 
base_srq->srq_context); + } +} + +void siw_port_event(struct siw_device *sdev, u8 port, enum ib_event_type etype) +{ + struct ib_event event; + + event.event = etype; + event.device = &sdev->base_dev; + event.element.port_num = port; + + siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype); + + ib_dispatch_event(&event); +} diff --git a/drivers/infiniband/sw/siw/siw_verbs.h b/drivers/infiniband/sw/siw/siw_verbs.h new file mode 100644 index 000000000000..1910869281cb --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_verbs.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#ifndef _SIW_VERBS_H +#define _SIW_VERBS_H + +#include + +#include +#include +#include + +#include "siw.h" +#include "siw_cm.h" + +/* + * siw_copy_sgl() + * + * Copy SGL from RDMA core representation to local + * representation. + */ +static inline void siw_copy_sgl(struct ib_sge *sge, struct siw_sge *siw_sge, + int num_sge) +{ + while (num_sge--) { + siw_sge->laddr = sge->addr; + siw_sge->length = sge->length; + siw_sge->lkey = sge->lkey; + + siw_sge++; + sge++; + } +} + +int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata); +void siw_dealloc_ucontext(struct ib_ucontext *base_ctx); +int siw_query_port(struct ib_device *base_dev, u8 port, + struct ib_port_attr *attr); +int siw_get_port_immutable(struct ib_device *base_dev, u8 port, + struct ib_port_immutable *port_immutable); +int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr, + struct ib_udata *udata); +int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); +int siw_query_port(struct ib_device *base_dev, u8 port, + struct ib_port_attr *attr); +int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey); +int siw_query_gid(struct ib_device *base_dev, u8 port, int idx, + union ib_gid *gid); +int siw_alloc_pd(struct ib_pd *base_pd, struct ib_udata *udata); +void siw_dealloc_pd(struct ib_pd *base_pd, struct ib_udata *udata); +struct ib_qp *siw_create_qp(struct ib_pd *base_pd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata); +int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); +int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata); +int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +void siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata); +int siw_poll_cq(struct ib_cq *base_cq, int num_entries, struct ib_wc *wc); +int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags); +struct ib_mr *siw_reg_user_mr(struct ib_pd *base_pd, u64 start, u64 len, + u64 rnic_va, int rights, struct ib_udata *udata); +struct ib_mr *siw_alloc_mr(struct ib_pd *base_pd, enum ib_mr_type mr_type, + u32 max_sge, struct ib_udata *udata); +struct ib_mr *siw_get_dma_mr(struct ib_pd *base_pd, int rights); +int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle, + unsigned int *sg_off); +int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata); +int siw_create_srq(struct ib_srq *base_srq, struct ib_srq_init_attr *attr, + struct ib_udata *udata); +int 
siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask mask, struct ib_udata *udata); +int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attr); +void siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata); +int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma); +void siw_qp_event(struct siw_qp *qp, enum ib_event_type type); +void siw_cq_event(struct siw_cq *cq, enum ib_event_type type); +void siw_srq_event(struct siw_srq *srq, enum ib_event_type type); +void siw_port_event(struct siw_device *dev, u8 port, enum ib_event_type type); + +#endif diff --git a/include/uapi/rdma/rdma_user_ioctl_cmds.h b/include/uapi/rdma/rdma_user_ioctl_cmds.h index 26213f49f5c8..64c14cb0022f 100644 --- a/include/uapi/rdma/rdma_user_ioctl_cmds.h +++ b/include/uapi/rdma/rdma_user_ioctl_cmds.h @@ -103,6 +103,7 @@ enum rdma_driver_id { RDMA_DRIVER_HFI1, RDMA_DRIVER_QIB, RDMA_DRIVER_EFA, + RDMA_DRIVER_SIW, }; #endif diff --git a/include/uapi/rdma/siw-abi.h b/include/uapi/rdma/siw-abi.h new file mode 100644 index 000000000000..3dd8071ace7b --- /dev/null +++ b/include/uapi/rdma/siw-abi.h @@ -0,0 +1,185 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#ifndef _SIW_USER_H +#define _SIW_USER_H + +#include + +#define SIW_NODE_DESC_COMMON "Software iWARP stack" +#define SIW_ABI_VERSION 1 +#define SIW_MAX_SGE 6 +#define SIW_UOBJ_MAX_KEY 0x08FFFF +#define SIW_INVAL_UOBJ_KEY (SIW_UOBJ_MAX_KEY + 1) + +struct siw_uresp_create_cq { + __u32 cq_id; + __u32 num_cqe; + __aligned_u64 cq_key; +}; + +struct siw_uresp_create_qp { + __u32 qp_id; + __u32 num_sqe; + __u32 num_rqe; + __u32 pad; + __aligned_u64 sq_key; + __aligned_u64 rq_key; +}; + +struct siw_ureq_reg_mr { + __u8 stag_key; + __u8 reserved[3]; + __u32 pad; +}; + +struct siw_uresp_reg_mr { + __u32 stag; + __u32 pad; +}; + +struct siw_uresp_create_srq { + __u32 num_rqe; + __u32 pad; + __aligned_u64 srq_key; +}; + +struct siw_uresp_alloc_ctx { + __u32 dev_id; + __u32 pad; +}; + +enum siw_opcode { + SIW_OP_WRITE, + SIW_OP_READ, + SIW_OP_READ_LOCAL_INV, + SIW_OP_SEND, + SIW_OP_SEND_WITH_IMM, + SIW_OP_SEND_REMOTE_INV, + + /* Unsupported */ + SIW_OP_FETCH_AND_ADD, + SIW_OP_COMP_AND_SWAP, + + SIW_OP_RECEIVE, + /* provider internal SQE */ + SIW_OP_READ_RESPONSE, + /* + * below opcodes valid for + * in-kernel clients only + */ + SIW_OP_INVAL_STAG, + SIW_OP_REG_MR, + SIW_NUM_OPCODES +}; + +/* Keep it same as ibv_sge to allow for memcpy */ +struct siw_sge { + __aligned_u64 laddr; + __u32 length; + __u32 lkey; +}; + +/* + * Inline data are kept within the work request itself occupying + * the space of sge[1] .. sge[n]. Therefore, inline data cannot be + * supported if SIW_MAX_SGE is below 2 elements. 
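+ *
+ * Example: with SIW_MAX_SGE == 6 and sizeof(struct siw_sge) == 16
+ * (8 byte laddr + 4 byte length + 4 byte lkey), an SQE can carry up
+ * to 16 * (6 - 1) = 80 bytes of inline data.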
+ */ +#define SIW_MAX_INLINE (sizeof(struct siw_sge) * (SIW_MAX_SGE - 1)) + +#if SIW_MAX_SGE < 2 +#error "SIW_MAX_SGE must be at least 2" +#endif + +enum siw_wqe_flags { + SIW_WQE_VALID = 1, + SIW_WQE_INLINE = (1 << 1), + SIW_WQE_SIGNALLED = (1 << 2), + SIW_WQE_SOLICITED = (1 << 3), + SIW_WQE_READ_FENCE = (1 << 4), + SIW_WQE_REM_INVAL = (1 << 5), + SIW_WQE_COMPLETED = (1 << 6) +}; + +/* Send Queue Element */ +struct siw_sqe { + __aligned_u64 id; + __u16 flags; + __u8 num_sge; + /* Contains enum siw_opcode values */ + __u8 opcode; + __u32 rkey; + union { + __aligned_u64 raddr; + __aligned_u64 base_mr; + }; + union { + struct siw_sge sge[SIW_MAX_SGE]; + __aligned_u64 access; + }; +}; + +/* Receive Queue Element */ +struct siw_rqe { + __aligned_u64 id; + __u16 flags; + __u8 num_sge; + /* + * only used by kernel driver, + * ignored if set by user + */ + __u8 opcode; + __u32 unused; + struct siw_sge sge[SIW_MAX_SGE]; +}; + +enum siw_notify_flags { + SIW_NOTIFY_NOT = (0), + SIW_NOTIFY_SOLICITED = (1 << 0), + SIW_NOTIFY_NEXT_COMPLETION = (1 << 1), + SIW_NOTIFY_MISSED_EVENTS = (1 << 2), + SIW_NOTIFY_ALL = SIW_NOTIFY_SOLICITED | SIW_NOTIFY_NEXT_COMPLETION | + SIW_NOTIFY_MISSED_EVENTS +}; + +enum siw_wc_status { + SIW_WC_SUCCESS, + SIW_WC_LOC_LEN_ERR, + SIW_WC_LOC_PROT_ERR, + SIW_WC_LOC_QP_OP_ERR, + SIW_WC_WR_FLUSH_ERR, + SIW_WC_BAD_RESP_ERR, + SIW_WC_LOC_ACCESS_ERR, + SIW_WC_REM_ACCESS_ERR, + SIW_WC_REM_INV_REQ_ERR, + SIW_WC_GENERAL_ERR, + SIW_NUM_WC_STATUS +}; + +struct siw_cqe { + __aligned_u64 id; + __u8 flags; + __u8 opcode; + __u16 status; + __u32 bytes; + union { + __aligned_u64 imm_data; + __u32 inval_stag; + }; + /* QP number or QP pointer */ + union { + struct ib_qp *base_qp; + __aligned_u64 qp_id; + }; +}; + +/* + * Shared structure between user and kernel + * to control CQ arming. + */ +struct siw_cq_ctrl { + __aligned_u64 notify; +}; +#endif From 2251334dcac9eb337575d8767e2a6a7e81848f7f Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 20 Jun 2019 18:21:28 +0200 Subject: [PATCH 124/194] rdma/siw: application buffer management Broken up commit to add the Soft iWarp RDMA driver. Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_mem.c | 460 ++++++++++++++++++++++++++++ drivers/infiniband/sw/siw/siw_mem.h | 74 +++++ 2 files changed, 534 insertions(+) create mode 100644 drivers/infiniband/sw/siw/siw_mem.c create mode 100644 drivers/infiniband/sw/siw/siw_mem.h diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c new file mode 100644 index 000000000000..67171c82b0c4 --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_mem.c @@ -0,0 +1,460 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include +#include +#include +#include +#include +#include + +#include "siw.h" +#include "siw_mem.h" + +/* + * Stag lookup is based on its index part only (24 bits). + * The code avoids special Stag of zero and tries to randomize + * STag values between 1 and SIW_STAG_MAX_INDEX. 
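+ *
+ * The STag is laid out as <24-bit index><8-bit key>: the xarray id
+ * becomes the index (stag = id << 8), lookups shift it back out
+ * (stag >> 8), and user space may OR its own key into the low byte
+ * at MR registration time.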
+ */ +int siw_mem_add(struct siw_device *sdev, struct siw_mem *m) +{ + struct xa_limit limit = XA_LIMIT(1, 0x00ffffff); + u32 id, next; + + get_random_bytes(&next, 4); + next &= 0x00ffffff; + + if (xa_alloc_cyclic(&sdev->mem_xa, &id, m, limit, &next, + GFP_KERNEL) < 0) + return -ENOMEM; + + /* Set the STag index part */ + m->stag = id << 8; + + siw_dbg_mem(m, "new MEM object\n"); + + return 0; +} + +/* + * siw_mem_id2obj() + * + * resolves memory from stag given by id. might be called from: + * o process context before sending out of sgl, or + * o in softirq when resolving target memory + */ +struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index) +{ + struct siw_mem *mem; + + rcu_read_lock(); + mem = xa_load(&sdev->mem_xa, stag_index); + if (likely(mem && kref_get_unless_zero(&mem->ref))) { + rcu_read_unlock(); + return mem; + } + rcu_read_unlock(); + + return NULL; +} + +static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages, + bool dirty) +{ + struct page **p = chunk->plist; + + while (num_pages--) { + if (!PageDirty(*p) && dirty) + put_user_pages_dirty_lock(p, 1); + else + put_user_page(*p); + p++; + } +} + +void siw_umem_release(struct siw_umem *umem, bool dirty) +{ + struct mm_struct *mm_s = umem->owning_mm; + int i, num_pages = umem->num_pages; + + for (i = 0; num_pages; i++) { + int to_free = min_t(int, PAGES_PER_CHUNK, num_pages); + + siw_free_plist(&umem->page_chunk[i], to_free, + umem->writable && dirty); + kfree(umem->page_chunk[i].plist); + num_pages -= to_free; + } + atomic64_sub(umem->num_pages, &mm_s->pinned_vm); + + mmdrop(mm_s); + kfree(umem->page_chunk); + kfree(umem); +} + +int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj, + u64 start, u64 len, int rights) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + struct siw_mem *mem = kzalloc(sizeof(*mem), GFP_KERNEL); + struct xa_limit limit = XA_LIMIT(1, 0x00ffffff); + u32 id, next; + + if (!mem) + return -ENOMEM; + + mem->mem_obj = mem_obj; + mem->stag_valid = 0; + mem->sdev = sdev; + mem->va = start; + mem->len = len; + mem->pd = pd; + mem->perms = rights & IWARP_ACCESS_MASK; + kref_init(&mem->ref); + + mr->mem = mem; + + get_random_bytes(&next, 4); + next &= 0x00ffffff; + + if (xa_alloc_cyclic(&sdev->mem_xa, &id, mem, limit, &next, + GFP_KERNEL) < 0) { + kfree(mem); + return -ENOMEM; + } + /* Set the STag index part */ + mem->stag = id << 8; + mr->base_mr.lkey = mr->base_mr.rkey = mem->stag; + + return 0; +} + +void siw_mr_drop_mem(struct siw_mr *mr) +{ + struct siw_mem *mem = mr->mem, *found; + + mem->stag_valid = 0; + + /* make STag invalid visible asap */ + smp_mb(); + + found = xa_erase(&mem->sdev->mem_xa, mem->stag >> 8); + WARN_ON(found != mem); + siw_mem_put(mem); +} + +void siw_free_mem(struct kref *ref) +{ + struct siw_mem *mem = container_of(ref, struct siw_mem, ref); + + siw_dbg_mem(mem, "free mem, pbl: %s\n", mem->is_pbl ? "y" : "n"); + + if (!mem->is_mw && mem->mem_obj) { + if (mem->is_pbl == 0) + siw_umem_release(mem->umem, true); + else + kfree(mem->pbl); + } + kfree(mem); +} + +/* + * siw_check_mem() + * + * Check protection domain, STAG state, access permissions and + * address range for memory object. 
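+ * Returns E_ACCESS_OK (0) on success or a negative access error
+ * (-E_STAG_INVALID, -E_PD_MISMATCH, -E_ACCESS_PERM, -E_BASE_BOUNDS)
+ * which the protocol paths translate into DDP/RDMAP error codes via
+ * siw_tagged_error()/siw_rdmap_error().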
+ * + * @pd: Protection Domain memory should belong to + * @mem: memory to be checked + * @addr: starting addr of mem + * @perms: requested access permissions + * @len: len of memory interval to be checked + * + */ +int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr, + enum ib_access_flags perms, int len) +{ + if (!mem->stag_valid) { + siw_dbg_pd(pd, "STag 0x%08x invalid\n", mem->stag); + return -E_STAG_INVALID; + } + if (mem->pd != pd) { + siw_dbg_pd(pd, "STag 0x%08x: PD mismatch\n", mem->stag); + return -E_PD_MISMATCH; + } + /* + * check access permissions + */ + if ((mem->perms & perms) < perms) { + siw_dbg_pd(pd, "permissions 0x%08x < 0x%08x\n", + mem->perms, perms); + return -E_ACCESS_PERM; + } + /* + * Check if access falls into valid memory interval. + */ + if (addr < mem->va || addr + len > mem->va + mem->len) { + siw_dbg_pd(pd, "MEM interval len %d\n", len); + siw_dbg_pd(pd, "[0x%016llx, 0x%016llx] out of bounds\n", + (unsigned long long)addr, + (unsigned long long)(addr + len)); + siw_dbg_pd(pd, "[0x%016llx, 0x%016llx] STag=0x%08x\n", + (unsigned long long)mem->va, + (unsigned long long)(mem->va + mem->len), + mem->stag); + + return -E_BASE_BOUNDS; + } + return E_ACCESS_OK; +} + +/* + * siw_check_sge() + * + * Check SGE for access rights in given interval + * + * @pd: Protection Domain memory should belong to + * @sge: SGE to be checked + * @mem: location of memory reference within array + * @perms: requested access permissions + * @off: starting offset in SGE + * @len: len of memory interval to be checked + * + * NOTE: Function references SGE's memory object (mem->obj) + * if not yet done. New reference is kept if check went ok and + * released if check failed. If mem->obj is already valid, no new + * lookup is being done and mem is not released it check fails. 
+ */ +int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge, struct siw_mem *mem[], + enum ib_access_flags perms, u32 off, int len) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + struct siw_mem *new = NULL; + int rv = E_ACCESS_OK; + + if (len + off > sge->length) { + rv = -E_BASE_BOUNDS; + goto fail; + } + if (*mem == NULL) { + new = siw_mem_id2obj(sdev, sge->lkey >> 8); + if (unlikely(!new)) { + siw_dbg_pd(pd, "STag unknown: 0x%08x\n", sge->lkey); + rv = -E_STAG_INVALID; + goto fail; + } + *mem = new; + } + /* Check if user re-registered with different STag key */ + if (unlikely((*mem)->stag != sge->lkey)) { + siw_dbg_mem((*mem), "STag mismatch: 0x%08x\n", sge->lkey); + rv = -E_STAG_INVALID; + goto fail; + } + rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len); + if (unlikely(rv)) + goto fail; + + return 0; + +fail: + if (new) { + *mem = NULL; + siw_mem_put(new); + } + return rv; +} + +void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op) +{ + switch (op) { + case SIW_OP_SEND: + case SIW_OP_WRITE: + case SIW_OP_SEND_WITH_IMM: + case SIW_OP_SEND_REMOTE_INV: + case SIW_OP_READ: + case SIW_OP_READ_LOCAL_INV: + if (!(wqe->sqe.flags & SIW_WQE_INLINE)) + siw_unref_mem_sgl(wqe->mem, wqe->sqe.num_sge); + break; + + case SIW_OP_RECEIVE: + siw_unref_mem_sgl(wqe->mem, wqe->rqe.num_sge); + break; + + case SIW_OP_READ_RESPONSE: + siw_unref_mem_sgl(wqe->mem, 1); + break; + + default: + /* + * SIW_OP_INVAL_STAG and SIW_OP_REG_MR + * do not hold memory references + */ + break; + } +} + +int siw_invalidate_stag(struct ib_pd *pd, u32 stag) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + struct siw_mem *mem = siw_mem_id2obj(sdev, stag >> 8); + int rv = 0; + + if (unlikely(!mem)) { + siw_dbg_pd(pd, "STag 0x%08x unknown\n", stag); + return -EINVAL; + } + if (unlikely(mem->pd != pd)) { + siw_dbg_pd(pd, "PD mismatch for STag 0x%08x\n", stag); + rv = -EACCES; + goto out; + } + /* + * Per RDMA verbs definition, an STag may already be in invalid + * state if invalidation is requested. So no state check here. + */ + mem->stag_valid = 0; + + siw_dbg_pd(pd, "STag 0x%08x now invalid\n", stag); +out: + siw_mem_put(mem); + return rv; +} + +/* + * Gets physical address backed by PBL element. Address is referenced + * by linear byte offset into list of variably sized PB elements. + * Optionally, provides remaining len within current element, and + * current PBL index for later resume at same element. + */ +u64 siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx) +{ + int i = idx ? 
*idx : 0; + + while (i < pbl->num_buf) { + struct siw_pble *pble = &pbl->pbe[i]; + + if (pble->pbl_off + pble->size > off) { + u64 pble_off = off - pble->pbl_off; + + if (len) + *len = pble->size - pble_off; + if (idx) + *idx = i; + + return pble->addr + pble_off; + } + i++; + } + if (len) + *len = 0; + return 0; +} + +struct siw_pbl *siw_pbl_alloc(u32 num_buf) +{ + struct siw_pbl *pbl; + int buf_size = sizeof(*pbl); + + if (num_buf == 0) + return ERR_PTR(-EINVAL); + + buf_size += ((num_buf - 1) * sizeof(struct siw_pble)); + + pbl = kzalloc(buf_size, GFP_KERNEL); + if (!pbl) + return ERR_PTR(-ENOMEM); + + pbl->max_buf = num_buf; + + return pbl; +} + +struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable) +{ + struct siw_umem *umem; + struct mm_struct *mm_s; + u64 first_page_va; + unsigned long mlock_limit; + unsigned int foll_flags = FOLL_WRITE; + int num_pages, num_chunks, i, rv = 0; + + if (!can_do_mlock()) + return ERR_PTR(-EPERM); + + if (!len) + return ERR_PTR(-EINVAL); + + first_page_va = start & PAGE_MASK; + num_pages = PAGE_ALIGN(start + len - first_page_va) >> PAGE_SHIFT; + num_chunks = (num_pages >> CHUNK_SHIFT) + 1; + + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + + mm_s = current->mm; + umem->owning_mm = mm_s; + umem->writable = writable; + + mmgrab(mm_s); + + if (!writable) + foll_flags |= FOLL_FORCE; + + down_read(&mm_s->mmap_sem); + + mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if (num_pages + atomic64_read(&mm_s->pinned_vm) > mlock_limit) { + rv = -ENOMEM; + goto out_sem_up; + } + umem->fp_addr = first_page_va; + + umem->page_chunk = + kcalloc(num_chunks, sizeof(struct siw_page_chunk), GFP_KERNEL); + if (!umem->page_chunk) { + rv = -ENOMEM; + goto out_sem_up; + } + for (i = 0; num_pages; i++) { + int got, nents = min_t(int, num_pages, PAGES_PER_CHUNK); + + umem->page_chunk[i].plist = + kcalloc(nents, sizeof(struct page *), GFP_KERNEL); + if (!umem->page_chunk[i].plist) { + rv = -ENOMEM; + goto out_sem_up; + } + got = 0; + while (nents) { + struct page **plist = &umem->page_chunk[i].plist[got]; + + rv = get_user_pages(first_page_va, nents, + foll_flags | FOLL_LONGTERM, + plist, NULL); + if (rv < 0) + goto out_sem_up; + + umem->num_pages += rv; + atomic64_add(rv, &mm_s->pinned_vm); + first_page_va += rv * PAGE_SIZE; + nents -= rv; + got += rv; + } + num_pages -= got; + } +out_sem_up: + up_read(&mm_s->mmap_sem); + + if (rv > 0) + return umem; + + siw_umem_release(umem, false); + + return ERR_PTR(rv); +} diff --git a/drivers/infiniband/sw/siw/siw_mem.h b/drivers/infiniband/sw/siw/siw_mem.h new file mode 100644 index 000000000000..f43daf280891 --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_mem.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#ifndef _SIW_MEM_H +#define _SIW_MEM_H + +struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable); +void siw_umem_release(struct siw_umem *umem, bool dirty); +struct siw_pbl *siw_pbl_alloc(u32 num_buf); +u64 siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx); +struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index); +int siw_mem_add(struct siw_device *sdev, struct siw_mem *m); +int siw_invalidate_stag(struct ib_pd *pd, u32 stag); +int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr, + enum ib_access_flags perms, int len); +int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge, + struct siw_mem *mem[], enum 
ib_access_flags perms, + u32 off, int len); +void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op); +int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj, + u64 start, u64 len, int rights); +void siw_mr_drop_mem(struct siw_mr *mr); +void siw_free_mem(struct kref *ref); + +static inline void siw_mem_put(struct siw_mem *mem) +{ + kref_put(&mem->ref, siw_free_mem); +} + +static inline struct siw_mr *siw_mem2mr(struct siw_mem *m) +{ + return container_of(m, struct siw_mr, mem); +} + +static inline void siw_unref_mem_sgl(struct siw_mem **mem, unsigned int num_sge) +{ + while (num_sge) { + if (*mem == NULL) + break; + + siw_mem_put(*mem); + *mem = NULL; + mem++; + num_sge--; + } +} + +#define CHUNK_SHIFT 9 /* sets number of pages per chunk */ +#define PAGES_PER_CHUNK (_AC(1, UL) << CHUNK_SHIFT) +#define CHUNK_MASK (~(PAGES_PER_CHUNK - 1)) +#define PAGE_CHUNK_SIZE (PAGES_PER_CHUNK * sizeof(struct page *)) + +/* + * siw_get_upage() + * + * Get page pointer for address on given umem. + * + * @umem: two dimensional list of page pointers + * @addr: user virtual address + */ +static inline struct page *siw_get_upage(struct siw_umem *umem, u64 addr) +{ + unsigned int page_idx = (addr - umem->fp_addr) >> PAGE_SHIFT, + chunk_idx = page_idx >> CHUNK_SHIFT, + page_in_chunk = page_idx & ~CHUNK_MASK; + + if (likely(page_idx < umem->num_pages)) + return umem->page_chunk[chunk_idx].plist[page_in_chunk]; + + return NULL; +} +#endif From f29dd55b0236f7a26a4b9dd69186e3c04266797b Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 20 Jun 2019 18:21:29 +0200 Subject: [PATCH 125/194] rdma/siw: queue pair methods Broken up commit to add the Soft iWarp RDMA driver. Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_qp.c | 1322 ++++++++++++++++++++++++++++ 1 file changed, 1322 insertions(+) create mode 100644 drivers/infiniband/sw/siw/siw_qp.c diff --git a/drivers/infiniband/sw/siw/siw_qp.c b/drivers/infiniband/sw/siw/siw_qp.c new file mode 100644 index 000000000000..11383d9f95ef --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_qp.c @@ -0,0 +1,1322 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include +#include +#include +#include +#include +#include +#include + +#include "siw.h" +#include "siw_verbs.h" +#include "siw_mem.h" + +static char siw_qp_state_to_string[SIW_QP_STATE_COUNT][sizeof "TERMINATE"] = { + [SIW_QP_STATE_IDLE] = "IDLE", + [SIW_QP_STATE_RTR] = "RTR", + [SIW_QP_STATE_RTS] = "RTS", + [SIW_QP_STATE_CLOSING] = "CLOSING", + [SIW_QP_STATE_TERMINATE] = "TERMINATE", + [SIW_QP_STATE_ERROR] = "ERROR" +}; + +/* + * iWARP (RDMAP, DDP and MPA) parameters as well as Softiwarp settings on a + * per-RDMAP message basis. Please keep order of initializer. All MPA len + * is initialized to minimum packet size. 
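+ *
+ * Each entry pre-computes the on-the-wire control word: .mpa_len is
+ * the header size minus 2 (the MPA length field does not count
+ * itself), and .ddp_rdmap_ctrl combines the DDP/RDMAP version
+ * numbers, the opcode and the TAGGED/LAST flags. The TX path can
+ * then copy a ready made header template per message type, e.g.:
+ *
+ *	memcpy(&term->ctrl, &iwarp_pktinfo[RDMAP_TERMINATE].ctrl,
+ *	       sizeof(struct iwarp_ctrl));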
+ */ +struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1] = { + { /* RDMAP_RDMA_WRITE */ + .hdr_len = sizeof(struct iwarp_rdma_write), + .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_write) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST | + cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_RDMA_WRITE), + .rx_data = siw_proc_write }, + { /* RDMAP_RDMA_READ_REQ */ + .hdr_len = sizeof(struct iwarp_rdma_rreq), + .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rreq) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_RDMA_READ_REQ), + .rx_data = siw_proc_rreq }, + { /* RDMAP_RDMA_READ_RESP */ + .hdr_len = sizeof(struct iwarp_rdma_rresp), + .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rresp) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST | + cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_RDMA_READ_RESP), + .rx_data = siw_proc_rresp }, + { /* RDMAP_SEND */ + .hdr_len = sizeof(struct iwarp_send), + .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_SEND), + .rx_data = siw_proc_send }, + { /* RDMAP_SEND_INVAL */ + .hdr_len = sizeof(struct iwarp_send_inv), + .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_SEND_INVAL), + .rx_data = siw_proc_send }, + { /* RDMAP_SEND_SE */ + .hdr_len = sizeof(struct iwarp_send), + .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_SEND_SE), + .rx_data = siw_proc_send }, + { /* RDMAP_SEND_SE_INVAL */ + .hdr_len = sizeof(struct iwarp_send_inv), + .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_SEND_SE_INVAL), + .rx_data = siw_proc_send }, + { /* RDMAP_TERMINATE */ + .hdr_len = sizeof(struct iwarp_terminate), + .ctrl.mpa_len = htons(sizeof(struct iwarp_terminate) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_TERMINATE), + .rx_data = siw_proc_terminate } +}; + +void siw_qp_llp_data_ready(struct sock *sk) +{ + struct siw_qp *qp; + + read_lock(&sk->sk_callback_lock); + + if (unlikely(!sk->sk_user_data || !sk_to_qp(sk))) + goto done; + + qp = sk_to_qp(sk); + + if (likely(!qp->rx_stream.rx_suspend && + down_read_trylock(&qp->state_lock))) { + read_descriptor_t rd_desc = { .arg.data = qp, .count = 1 }; + + if (likely(qp->attrs.state == SIW_QP_STATE_RTS)) + /* + * Implements data receive operation during + * socket callback. TCP gracefully catches + * the case where there is nothing to receive + * (not calling siw_tcp_rx_data() then). 
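+ *
+ * tcp_read_sock() consumes whatever is queued on the
+ * socket and hands it to siw_tcp_rx_data() as its
+ * receive actor, so inbound protocol processing runs
+ * directly in this (softirq) data_ready callback.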
+ */ + tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data); + + up_read(&qp->state_lock); + } else { + siw_dbg_qp(qp, "unable to process RX, suspend: %d\n", + qp->rx_stream.rx_suspend); + } +done: + read_unlock(&sk->sk_callback_lock); +} + +void siw_qp_llp_close(struct siw_qp *qp) +{ + siw_dbg_qp(qp, "enter llp close, state = %s\n", + siw_qp_state_to_string[qp->attrs.state]); + + down_write(&qp->state_lock); + + qp->rx_stream.rx_suspend = 1; + qp->tx_ctx.tx_suspend = 1; + qp->attrs.sk = NULL; + + switch (qp->attrs.state) { + case SIW_QP_STATE_RTS: + case SIW_QP_STATE_RTR: + case SIW_QP_STATE_IDLE: + case SIW_QP_STATE_TERMINATE: + qp->attrs.state = SIW_QP_STATE_ERROR; + break; + /* + * SIW_QP_STATE_CLOSING: + * + * This is a forced close. shall the QP be moved to + * ERROR or IDLE ? + */ + case SIW_QP_STATE_CLOSING: + if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) + qp->attrs.state = SIW_QP_STATE_ERROR; + else + qp->attrs.state = SIW_QP_STATE_IDLE; + break; + + default: + siw_dbg_qp(qp, "llp close: no state transition needed: %s\n", + siw_qp_state_to_string[qp->attrs.state]); + break; + } + siw_sq_flush(qp); + siw_rq_flush(qp); + + /* + * Dereference closing CEP + */ + if (qp->cep) { + siw_cep_put(qp->cep); + qp->cep = NULL; + } + + up_write(&qp->state_lock); + + siw_dbg_qp(qp, "llp close exit: state %s\n", + siw_qp_state_to_string[qp->attrs.state]); +} + +/* + * socket callback routine informing about newly available send space. + * Function schedules SQ work for processing SQ items. + */ +void siw_qp_llp_write_space(struct sock *sk) +{ + struct siw_cep *cep = sk_to_cep(sk); + + cep->sk_write_space(sk); + + if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + (void)siw_sq_start(cep->qp); +} + +static int siw_qp_readq_init(struct siw_qp *qp, int irq_size, int orq_size) +{ + irq_size = roundup_pow_of_two(irq_size); + orq_size = roundup_pow_of_two(orq_size); + + qp->attrs.irq_size = irq_size; + qp->attrs.orq_size = orq_size; + + qp->irq = vzalloc(irq_size * sizeof(struct siw_sqe)); + if (!qp->irq) { + siw_dbg_qp(qp, "irq malloc for %d failed\n", irq_size); + qp->attrs.irq_size = 0; + return -ENOMEM; + } + qp->orq = vzalloc(orq_size * sizeof(struct siw_sqe)); + if (!qp->orq) { + siw_dbg_qp(qp, "orq malloc for %d failed\n", orq_size); + qp->attrs.orq_size = 0; + qp->attrs.irq_size = 0; + vfree(qp->irq); + return -ENOMEM; + } + siw_dbg_qp(qp, "ORD %d, IRD %d\n", orq_size, irq_size); + return 0; +} + +static int siw_qp_enable_crc(struct siw_qp *qp) +{ + struct siw_rx_stream *c_rx = &qp->rx_stream; + struct siw_iwarp_tx *c_tx = &qp->tx_ctx; + int size = crypto_shash_descsize(siw_crypto_shash) + + sizeof(struct shash_desc); + + if (siw_crypto_shash == NULL) + return -ENOENT; + + c_tx->mpa_crc_hd = kzalloc(size, GFP_KERNEL); + c_rx->mpa_crc_hd = kzalloc(size, GFP_KERNEL); + if (!c_tx->mpa_crc_hd || !c_rx->mpa_crc_hd) { + kfree(c_tx->mpa_crc_hd); + kfree(c_rx->mpa_crc_hd); + c_tx->mpa_crc_hd = NULL; + c_rx->mpa_crc_hd = NULL; + return -ENOMEM; + } + c_tx->mpa_crc_hd->tfm = siw_crypto_shash; + c_rx->mpa_crc_hd->tfm = siw_crypto_shash; + + return 0; +} + +/* + * Send a non signalled READ or WRITE to peer side as negotiated + * with MPAv2 P2P setup protocol. The work request is only created + * as a current active WR and does not consume Send Queue space. + * + * Caller must hold QP state lock. 
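+ *
+ * Depending on the negotiated mpa_v2_ctrl bits, a zero length WRITE
+ * or READ is placed directly into the active TX WQE (bypassing the
+ * SQ ring) and siw_sq_start() is kicked to transmit it.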
+ */ +int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl) +{ + struct siw_wqe *wqe = tx_wqe(qp); + unsigned long flags; + int rv = 0; + + spin_lock_irqsave(&qp->sq_lock, flags); + + if (unlikely(wqe->wr_status != SIW_WR_IDLE)) { + spin_unlock_irqrestore(&qp->sq_lock, flags); + return -EIO; + } + memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE); + + wqe->wr_status = SIW_WR_QUEUED; + wqe->sqe.flags = 0; + wqe->sqe.num_sge = 1; + wqe->sqe.sge[0].length = 0; + wqe->sqe.sge[0].laddr = 0; + wqe->sqe.sge[0].lkey = 0; + /* + * While it must not be checked for inbound zero length + * READ/WRITE, some HW may treat STag 0 special. + */ + wqe->sqe.rkey = 1; + wqe->sqe.raddr = 0; + wqe->processed = 0; + + if (ctrl & MPA_V2_RDMA_WRITE_RTR) + wqe->sqe.opcode = SIW_OP_WRITE; + else if (ctrl & MPA_V2_RDMA_READ_RTR) { + struct siw_sqe *rreq; + + wqe->sqe.opcode = SIW_OP_READ; + + spin_lock(&qp->orq_lock); + + rreq = orq_get_free(qp); + if (rreq) { + siw_read_to_orq(rreq, &wqe->sqe); + qp->orq_put++; + } else + rv = -EIO; + + spin_unlock(&qp->orq_lock); + } else + rv = -EINVAL; + + if (rv) + wqe->wr_status = SIW_WR_IDLE; + + spin_unlock_irqrestore(&qp->sq_lock, flags); + + if (!rv) + rv = siw_sq_start(qp); + + return rv; +} + +/* + * Map memory access error to DDP tagged error + */ +enum ddp_ecode siw_tagged_error(enum siw_access_state state) +{ + switch (state) { + case E_STAG_INVALID: + return DDP_ECODE_T_INVALID_STAG; + case E_BASE_BOUNDS: + return DDP_ECODE_T_BASE_BOUNDS; + case E_PD_MISMATCH: + return DDP_ECODE_T_STAG_NOT_ASSOC; + case E_ACCESS_PERM: + /* + * RFC 5041 (DDP) lacks an ecode for insufficient access + * permissions. 'Invalid STag' seem to be the closest + * match though. + */ + return DDP_ECODE_T_INVALID_STAG; + default: + WARN_ON(1); + return DDP_ECODE_T_INVALID_STAG; + } +} + +/* + * Map memory access error to RDMAP protection error + */ +enum rdmap_ecode siw_rdmap_error(enum siw_access_state state) +{ + switch (state) { + case E_STAG_INVALID: + return RDMAP_ECODE_INVALID_STAG; + case E_BASE_BOUNDS: + return RDMAP_ECODE_BASE_BOUNDS; + case E_PD_MISMATCH: + return RDMAP_ECODE_STAG_NOT_ASSOC; + case E_ACCESS_PERM: + return RDMAP_ECODE_ACCESS_RIGHTS; + default: + return RDMAP_ECODE_UNSPECIFIED; + } +} + +void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer, u8 etype, + u8 ecode, int in_tx) +{ + if (!qp->term_info.valid) { + memset(&qp->term_info, 0, sizeof(qp->term_info)); + qp->term_info.layer = layer; + qp->term_info.etype = etype; + qp->term_info.ecode = ecode; + qp->term_info.in_tx = in_tx; + qp->term_info.valid = 1; + } + siw_dbg_qp(qp, "init TERM: layer %d, type %d, code %d, in tx %s\n", + layer, etype, ecode, in_tx ? "yes" : "no"); +} + +/* + * Send a TERMINATE message, as defined in RFC's 5040/5041/5044/6581. + * Sending TERMINATE messages is best effort - such messages + * can only be send if the QP is still connected and it does + * not have another outbound message in-progress, i.e. the + * TERMINATE message must not interfer with an incomplete current + * transmit operation. 
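+ *
+ * The frame is assembled from up to three kvec's: the TERMINATE
+ * header itself, optionally a copy of the offending DDP/RDMAP
+ * header, and the trailing MPA CRC, and is pushed out in one
+ * kernel_sendmsg() call on the connection's socket.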
+ */ +void siw_send_terminate(struct siw_qp *qp) +{ + struct kvec iov[3]; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR }; + struct iwarp_terminate *term = NULL; + union iwarp_hdr *err_hdr = NULL; + struct socket *s = qp->attrs.sk; + struct siw_rx_stream *srx = &qp->rx_stream; + union iwarp_hdr *rx_hdr = &srx->hdr; + u32 crc = 0; + int num_frags, len_terminate, rv; + + if (!qp->term_info.valid) + return; + + qp->term_info.valid = 0; + + if (tx_wqe(qp)->wr_status == SIW_WR_INPROGRESS) { + siw_dbg_qp(qp, "cannot send TERMINATE: op %d in progress\n", + tx_type(tx_wqe(qp))); + return; + } + if (!s && qp->cep) + /* QP not yet in RTS. Take socket from connection end point */ + s = qp->cep->sock; + + if (!s) { + siw_dbg_qp(qp, "cannot send TERMINATE: not connected\n"); + return; + } + + term = kzalloc(sizeof(*term), GFP_KERNEL); + if (!term) + return; + + term->ddp_qn = cpu_to_be32(RDMAP_UNTAGGED_QN_TERMINATE); + term->ddp_mo = 0; + term->ddp_msn = cpu_to_be32(1); + + iov[0].iov_base = term; + iov[0].iov_len = sizeof(*term); + + if ((qp->term_info.layer == TERM_ERROR_LAYER_DDP) || + ((qp->term_info.layer == TERM_ERROR_LAYER_RDMAP) && + (qp->term_info.etype != RDMAP_ETYPE_CATASTROPHIC))) { + err_hdr = kzalloc(sizeof(*err_hdr), GFP_KERNEL); + if (!err_hdr) { + kfree(term); + return; + } + } + memcpy(&term->ctrl, &iwarp_pktinfo[RDMAP_TERMINATE].ctrl, + sizeof(struct iwarp_ctrl)); + + __rdmap_term_set_layer(term, qp->term_info.layer); + __rdmap_term_set_etype(term, qp->term_info.etype); + __rdmap_term_set_ecode(term, qp->term_info.ecode); + + switch (qp->term_info.layer) { + case TERM_ERROR_LAYER_RDMAP: + if (qp->term_info.etype == RDMAP_ETYPE_CATASTROPHIC) + /* No additional DDP/RDMAP header to be included */ + break; + + if (qp->term_info.etype == RDMAP_ETYPE_REMOTE_PROTECTION) { + /* + * Complete RDMAP frame will get attached, and + * DDP segment length is valid + */ + term->flag_m = 1; + term->flag_d = 1; + term->flag_r = 1; + + if (qp->term_info.in_tx) { + struct iwarp_rdma_rreq *rreq; + struct siw_wqe *wqe = tx_wqe(qp); + + /* Inbound RREQ error, detected during + * RRESP creation. Take state from + * current TX work queue element to + * reconstruct peers RREQ. + */ + rreq = (struct iwarp_rdma_rreq *)err_hdr; + + memcpy(&rreq->ctrl, + &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl, + sizeof(struct iwarp_ctrl)); + + rreq->rsvd = 0; + rreq->ddp_qn = + htonl(RDMAP_UNTAGGED_QN_RDMA_READ); + + /* Provide RREQ's MSN as kept aside */ + rreq->ddp_msn = htonl(wqe->sqe.sge[0].length); + + rreq->ddp_mo = htonl(wqe->processed); + rreq->sink_stag = htonl(wqe->sqe.rkey); + rreq->sink_to = cpu_to_be64(wqe->sqe.raddr); + rreq->read_size = htonl(wqe->sqe.sge[0].length); + rreq->source_stag = htonl(wqe->sqe.sge[0].lkey); + rreq->source_to = + cpu_to_be64(wqe->sqe.sge[0].laddr); + + iov[1].iov_base = rreq; + iov[1].iov_len = sizeof(*rreq); + + rx_hdr = (union iwarp_hdr *)rreq; + } else { + /* Take RDMAP/DDP information from + * current (failed) inbound frame. 
+ */ + iov[1].iov_base = rx_hdr; + + if (__rdmap_get_opcode(&rx_hdr->ctrl) == + RDMAP_RDMA_READ_REQ) + iov[1].iov_len = + sizeof(struct iwarp_rdma_rreq); + else /* SEND type */ + iov[1].iov_len = + sizeof(struct iwarp_send); + } + } else { + /* Do not report DDP hdr information if packet + * layout is unknown + */ + if ((qp->term_info.ecode == RDMAP_ECODE_VERSION) || + (qp->term_info.ecode == RDMAP_ECODE_OPCODE)) + break; + + iov[1].iov_base = rx_hdr; + + /* Only DDP frame will get attached */ + if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED) + iov[1].iov_len = + sizeof(struct iwarp_rdma_write); + else + iov[1].iov_len = sizeof(struct iwarp_send); + + term->flag_m = 1; + term->flag_d = 1; + } + term->ctrl.mpa_len = cpu_to_be16(iov[1].iov_len); + break; + + case TERM_ERROR_LAYER_DDP: + /* Report error encountered while DDP processing. + * This can only happen as a result of inbound + * DDP processing + */ + + /* Do not report DDP hdr information if packet + * layout is unknown + */ + if (((qp->term_info.etype == DDP_ETYPE_TAGGED_BUF) && + (qp->term_info.ecode == DDP_ECODE_T_VERSION)) || + ((qp->term_info.etype == DDP_ETYPE_UNTAGGED_BUF) && + (qp->term_info.ecode == DDP_ECODE_UT_VERSION))) + break; + + iov[1].iov_base = rx_hdr; + + if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED) + iov[1].iov_len = sizeof(struct iwarp_ctrl_tagged); + else + iov[1].iov_len = sizeof(struct iwarp_ctrl_untagged); + + term->flag_m = 1; + term->flag_d = 1; + break; + + default: + break; + } + if (term->flag_m || term->flag_d || term->flag_r) { + iov[2].iov_base = &crc; + iov[2].iov_len = sizeof(crc); + len_terminate = sizeof(*term) + iov[1].iov_len + MPA_CRC_SIZE; + num_frags = 3; + } else { + iov[1].iov_base = &crc; + iov[1].iov_len = sizeof(crc); + len_terminate = sizeof(*term) + MPA_CRC_SIZE; + num_frags = 2; + } + + /* Adjust DDP Segment Length parameter, if valid */ + if (term->flag_m) { + u32 real_ddp_len = be16_to_cpu(rx_hdr->ctrl.mpa_len); + enum rdma_opcode op = __rdmap_get_opcode(&rx_hdr->ctrl); + + real_ddp_len -= iwarp_pktinfo[op].hdr_len - MPA_HDR_SIZE; + rx_hdr->ctrl.mpa_len = cpu_to_be16(real_ddp_len); + } + + term->ctrl.mpa_len = + cpu_to_be16(len_terminate - (MPA_HDR_SIZE + MPA_CRC_SIZE)); + if (qp->tx_ctx.mpa_crc_hd) { + crypto_shash_init(qp->tx_ctx.mpa_crc_hd); + if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd, + (u8 *)iov[0].iov_base, + iov[0].iov_len)) + goto out; + + if (num_frags == 3) { + if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd, + (u8 *)iov[1].iov_base, + iov[1].iov_len)) + goto out; + } + crypto_shash_final(qp->tx_ctx.mpa_crc_hd, (u8 *)&crc); + } + + rv = kernel_sendmsg(s, &msg, iov, num_frags, len_terminate); + siw_dbg_qp(qp, "sent TERM: %s, layer %d, type %d, code %d (%d bytes)\n", + rv == len_terminate ? 
"success" : "failure", + __rdmap_term_layer(term), __rdmap_term_etype(term), + __rdmap_term_ecode(term), rv); +out: + kfree(term); + kfree(err_hdr); +} + +/* + * Handle all attrs other than state + */ +static void siw_qp_modify_nonstate(struct siw_qp *qp, + struct siw_qp_attrs *attrs, + enum siw_qp_attr_mask mask) +{ + if (mask & SIW_QP_ATTR_ACCESS_FLAGS) { + if (attrs->flags & SIW_RDMA_BIND_ENABLED) + qp->attrs.flags |= SIW_RDMA_BIND_ENABLED; + else + qp->attrs.flags &= ~SIW_RDMA_BIND_ENABLED; + + if (attrs->flags & SIW_RDMA_WRITE_ENABLED) + qp->attrs.flags |= SIW_RDMA_WRITE_ENABLED; + else + qp->attrs.flags &= ~SIW_RDMA_WRITE_ENABLED; + + if (attrs->flags & SIW_RDMA_READ_ENABLED) + qp->attrs.flags |= SIW_RDMA_READ_ENABLED; + else + qp->attrs.flags &= ~SIW_RDMA_READ_ENABLED; + } +} + +static int siw_qp_nextstate_from_idle(struct siw_qp *qp, + struct siw_qp_attrs *attrs, + enum siw_qp_attr_mask mask) +{ + int rv = 0; + + switch (attrs->state) { + case SIW_QP_STATE_RTS: + if (attrs->flags & SIW_MPA_CRC) { + rv = siw_qp_enable_crc(qp); + if (rv) + break; + } + if (!(mask & SIW_QP_ATTR_LLP_HANDLE)) { + siw_dbg_qp(qp, "no socket\n"); + rv = -EINVAL; + break; + } + if (!(mask & SIW_QP_ATTR_MPA)) { + siw_dbg_qp(qp, "no MPA\n"); + rv = -EINVAL; + break; + } + /* + * Initialize iWARP TX state + */ + qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 0; + qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 0; + qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 0; + + /* + * Initialize iWARP RX state + */ + qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 1; + qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 1; + qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 1; + + /* + * init IRD free queue, caller has already checked + * limits. + */ + rv = siw_qp_readq_init(qp, attrs->irq_size, + attrs->orq_size); + if (rv) + break; + + qp->attrs.sk = attrs->sk; + qp->attrs.state = SIW_QP_STATE_RTS; + + siw_dbg_qp(qp, "enter RTS: crc=%s, ord=%u, ird=%u\n", + attrs->flags & SIW_MPA_CRC ? "y" : "n", + qp->attrs.orq_size, qp->attrs.irq_size); + break; + + case SIW_QP_STATE_ERROR: + siw_rq_flush(qp); + qp->attrs.state = SIW_QP_STATE_ERROR; + if (qp->cep) { + siw_cep_put(qp->cep); + qp->cep = NULL; + } + break; + + default: + break; + } + return rv; +} + +static int siw_qp_nextstate_from_rts(struct siw_qp *qp, + struct siw_qp_attrs *attrs) +{ + int drop_conn = 0; + + switch (attrs->state) { + case SIW_QP_STATE_CLOSING: + /* + * Verbs: move to IDLE if SQ and ORQ are empty. + * Move to ERROR otherwise. But first of all we must + * close the connection. So we keep CLOSING or ERROR + * as a transient state, schedule connection drop work + * and wait for the socket state change upcall to + * come back closed. + */ + if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) { + qp->attrs.state = SIW_QP_STATE_CLOSING; + } else { + qp->attrs.state = SIW_QP_STATE_ERROR; + siw_sq_flush(qp); + } + siw_rq_flush(qp); + + drop_conn = 1; + break; + + case SIW_QP_STATE_TERMINATE: + qp->attrs.state = SIW_QP_STATE_TERMINATE; + + siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP, + RDMAP_ETYPE_CATASTROPHIC, + RDMAP_ECODE_UNSPECIFIED, 1); + drop_conn = 1; + break; + + case SIW_QP_STATE_ERROR: + /* + * This is an emergency close. + * + * Any in progress transmit operation will get + * cancelled. + * This will likely result in a protocol failure, + * if a TX operation is in transit. The caller + * could unconditional wait to give the current + * operation a chance to complete. + * Esp., how to handle the non-empty IRQ case? 
+ * The peer was asking for data transfer at a valid + * point in time. + */ + siw_sq_flush(qp); + siw_rq_flush(qp); + qp->attrs.state = SIW_QP_STATE_ERROR; + drop_conn = 1; + break; + + default: + break; + } + return drop_conn; +} + +static void siw_qp_nextstate_from_term(struct siw_qp *qp, + struct siw_qp_attrs *attrs) +{ + switch (attrs->state) { + case SIW_QP_STATE_ERROR: + siw_rq_flush(qp); + qp->attrs.state = SIW_QP_STATE_ERROR; + + if (tx_wqe(qp)->wr_status != SIW_WR_IDLE) + siw_sq_flush(qp); + break; + + default: + break; + } +} + +static int siw_qp_nextstate_from_close(struct siw_qp *qp, + struct siw_qp_attrs *attrs) +{ + int rv = 0; + + switch (attrs->state) { + case SIW_QP_STATE_IDLE: + WARN_ON(tx_wqe(qp)->wr_status != SIW_WR_IDLE); + qp->attrs.state = SIW_QP_STATE_IDLE; + break; + + case SIW_QP_STATE_CLOSING: + /* + * The LLP may already moved the QP to closing + * due to graceful peer close init + */ + break; + + case SIW_QP_STATE_ERROR: + /* + * QP was moved to CLOSING by LLP event + * not yet seen by user. + */ + qp->attrs.state = SIW_QP_STATE_ERROR; + + if (tx_wqe(qp)->wr_status != SIW_WR_IDLE) + siw_sq_flush(qp); + + siw_rq_flush(qp); + break; + + default: + siw_dbg_qp(qp, "state transition undefined: %s => %s\n", + siw_qp_state_to_string[qp->attrs.state], + siw_qp_state_to_string[attrs->state]); + + rv = -ECONNABORTED; + } + return rv; +} + +/* + * Caller must hold qp->state_lock + */ +int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attrs, + enum siw_qp_attr_mask mask) +{ + int drop_conn = 0, rv = 0; + + if (!mask) + return 0; + + siw_dbg_qp(qp, "state: %s => %s\n", + siw_qp_state_to_string[qp->attrs.state], + siw_qp_state_to_string[attrs->state]); + + if (mask != SIW_QP_ATTR_STATE) + siw_qp_modify_nonstate(qp, attrs, mask); + + if (!(mask & SIW_QP_ATTR_STATE)) + return 0; + + switch (qp->attrs.state) { + case SIW_QP_STATE_IDLE: + case SIW_QP_STATE_RTR: + rv = siw_qp_nextstate_from_idle(qp, attrs, mask); + break; + + case SIW_QP_STATE_RTS: + drop_conn = siw_qp_nextstate_from_rts(qp, attrs); + break; + + case SIW_QP_STATE_TERMINATE: + siw_qp_nextstate_from_term(qp, attrs); + break; + + case SIW_QP_STATE_CLOSING: + siw_qp_nextstate_from_close(qp, attrs); + break; + default: + break; + } + if (drop_conn) + siw_qp_cm_drop(qp, 0); + + return rv; +} + +void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe) +{ + rreq->id = sqe->id; + rreq->opcode = sqe->opcode; + rreq->sge[0].laddr = sqe->sge[0].laddr; + rreq->sge[0].length = sqe->sge[0].length; + rreq->sge[0].lkey = sqe->sge[0].lkey; + rreq->sge[1].lkey = sqe->sge[1].lkey; + rreq->flags = sqe->flags | SIW_WQE_VALID; + rreq->num_sge = 1; +} + +/* + * Must be called with SQ locked. + * To avoid complete SQ starvation by constant inbound READ requests, + * the active IRQ will not be served after qp->irq_burst, if the + * SQ has pending work. 
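The burst limit mentioned above amounts to a small arbitration rule: keep serving the inbound read queue (IRQ), but after a bounded run of consecutive IRQ entries yield to the send queue whenever it has pending work, which is what siw_activate_tx() below does with qp->irq_burst. A stand-alone sketch of that rule, where the two *_has_work flags and the burst limit of 8 are assumptions for illustration rather than the driver's actual interfaces:

/* Illustrative only: burst-limited IRQ-over-SQ arbitration. */
enum next_src { FROM_IRQ, FROM_SQ, FROM_NONE };

static enum next_src pick_next(unsigned int *burst, int irq_has_work, int sq_has_work)
{
	if (irq_has_work) {
		/* After 8 consecutive IRQ entries, yield once to the SQ if it
		 * has work, so local senders are not starved by a constant
		 * inbound READ request stream.
		 */
		if (sq_has_work && ++(*burst) >= 8) {
			*burst = 0;
			return FROM_SQ;
		}
		return FROM_IRQ;
	}
	return sq_has_work ? FROM_SQ : FROM_NONE;
}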
+ */ +int siw_activate_tx(struct siw_qp *qp) +{ + struct siw_sqe *irqe, *sqe; + struct siw_wqe *wqe = tx_wqe(qp); + int rv = 1; + + irqe = &qp->irq[qp->irq_get % qp->attrs.irq_size]; + + if (irqe->flags & SIW_WQE_VALID) { + sqe = sq_get_next(qp); + + /* + * Avoid local WQE processing starvation in case + * of constant inbound READ request stream + */ + if (sqe && ++qp->irq_burst >= SIW_IRQ_MAXBURST_SQ_ACTIVE) { + qp->irq_burst = 0; + goto skip_irq; + } + memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE); + wqe->wr_status = SIW_WR_QUEUED; + + /* start READ RESPONSE */ + wqe->sqe.opcode = SIW_OP_READ_RESPONSE; + wqe->sqe.flags = 0; + if (irqe->num_sge) { + wqe->sqe.num_sge = 1; + wqe->sqe.sge[0].length = irqe->sge[0].length; + wqe->sqe.sge[0].laddr = irqe->sge[0].laddr; + wqe->sqe.sge[0].lkey = irqe->sge[0].lkey; + } else { + wqe->sqe.num_sge = 0; + } + + /* Retain original RREQ's message sequence number for + * potential error reporting cases. + */ + wqe->sqe.sge[1].length = irqe->sge[1].length; + + wqe->sqe.rkey = irqe->rkey; + wqe->sqe.raddr = irqe->raddr; + + wqe->processed = 0; + qp->irq_get++; + + /* mark current IRQ entry free */ + smp_store_mb(irqe->flags, 0); + + goto out; + } + sqe = sq_get_next(qp); + if (sqe) { +skip_irq: + memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE); + wqe->wr_status = SIW_WR_QUEUED; + + /* First copy SQE to kernel private memory */ + memcpy(&wqe->sqe, sqe, sizeof(*sqe)); + + if (wqe->sqe.opcode >= SIW_NUM_OPCODES) { + rv = -EINVAL; + goto out; + } + if (wqe->sqe.flags & SIW_WQE_INLINE) { + if (wqe->sqe.opcode != SIW_OP_SEND && + wqe->sqe.opcode != SIW_OP_WRITE) { + rv = -EINVAL; + goto out; + } + if (wqe->sqe.sge[0].length > SIW_MAX_INLINE) { + rv = -EINVAL; + goto out; + } + wqe->sqe.sge[0].laddr = (u64)&wqe->sqe.sge[1]; + wqe->sqe.sge[0].lkey = 0; + wqe->sqe.num_sge = 1; + } + if (wqe->sqe.flags & SIW_WQE_READ_FENCE) { + /* A READ cannot be fenced */ + if (unlikely(wqe->sqe.opcode == SIW_OP_READ || + wqe->sqe.opcode == + SIW_OP_READ_LOCAL_INV)) { + siw_dbg_qp(qp, "cannot fence read\n"); + rv = -EINVAL; + goto out; + } + spin_lock(&qp->orq_lock); + + if (!siw_orq_empty(qp)) { + qp->tx_ctx.orq_fence = 1; + rv = 0; + } + spin_unlock(&qp->orq_lock); + + } else if (wqe->sqe.opcode == SIW_OP_READ || + wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV) { + struct siw_sqe *rreq; + + wqe->sqe.num_sge = 1; + + spin_lock(&qp->orq_lock); + + rreq = orq_get_free(qp); + if (rreq) { + /* + * Make an immediate copy in ORQ to be ready + * to process loopback READ reply + */ + siw_read_to_orq(rreq, &wqe->sqe); + qp->orq_put++; + } else { + qp->tx_ctx.orq_fence = 1; + rv = 0; + } + spin_unlock(&qp->orq_lock); + } + + /* Clear SQE, can be re-used by application */ + smp_store_mb(sqe->flags, 0); + qp->sq_get++; + } else { + rv = 0; + } +out: + if (unlikely(rv < 0)) { + siw_dbg_qp(qp, "error %d\n", rv); + wqe->wr_status = SIW_WR_IDLE; + } + return rv; +} + +/* + * Check if current CQ state qualifies for calling CQ completion + * handler. Must be called with CQ lock held. 
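The gate reduces to two arm bits that are consumed when they fire: a next-completion arm that matches any CQE, and a solicited arm that matches only solicited work; this is the check siw_cq_notify_now() below performs. A stripped-down sketch of that decision, where the flag values are invented for the illustration and the plain dis-arm write stands in for the driver's smp_store_mb():

/* Illustration only: CQ arming/notification decision. */
#define ARM_NONE       0u
#define ARM_SOLICITED  (1u << 0)
#define ARM_NEXT       (1u << 1)

static int should_call_comp_handler(unsigned int *armed, int wqe_solicited)
{
	if ((*armed & ARM_NEXT) ||
	    ((*armed & ARM_SOLICITED) && wqe_solicited)) {
		*armed = ARM_NONE;	/* consume the arm: one upcall per arm */
		return 1;
	}
	return 0;
}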
+ */ +static bool siw_cq_notify_now(struct siw_cq *cq, u32 flags) +{ + u64 cq_notify; + + if (!cq->base_cq.comp_handler) + return false; + + cq_notify = READ_ONCE(*cq->notify); + + if ((cq_notify & SIW_NOTIFY_NEXT_COMPLETION) || + ((cq_notify & SIW_NOTIFY_SOLICITED) && + (flags & SIW_WQE_SOLICITED))) { + /* dis-arm CQ */ + smp_store_mb(*cq->notify, SIW_NOTIFY_NOT); + + return true; + } + return false; +} + +int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes, + enum siw_wc_status status) +{ + struct siw_cq *cq = qp->scq; + int rv = 0; + + if (cq) { + u32 sqe_flags = sqe->flags; + struct siw_cqe *cqe; + u32 idx; + unsigned long flags; + + spin_lock_irqsave(&cq->lock, flags); + + idx = cq->cq_put % cq->num_cqe; + cqe = &cq->queue[idx]; + + if (!READ_ONCE(cqe->flags)) { + bool notify; + + cqe->id = sqe->id; + cqe->opcode = sqe->opcode; + cqe->status = status; + cqe->imm_data = 0; + cqe->bytes = bytes; + + if (cq->kernel_verbs) + cqe->base_qp = qp->ib_qp; + else + cqe->qp_id = qp_id(qp); + + /* mark CQE valid for application */ + WRITE_ONCE(cqe->flags, SIW_WQE_VALID); + /* recycle SQE */ + smp_store_mb(sqe->flags, 0); + + cq->cq_put++; + notify = siw_cq_notify_now(cq, sqe_flags); + + spin_unlock_irqrestore(&cq->lock, flags); + + if (notify) { + siw_dbg_cq(cq, "Call completion handler\n"); + cq->base_cq.comp_handler(&cq->base_cq, + cq->base_cq.cq_context); + } + } else { + spin_unlock_irqrestore(&cq->lock, flags); + rv = -ENOMEM; + siw_cq_event(cq, IB_EVENT_CQ_ERR); + } + } else { + /* recycle SQE */ + smp_store_mb(sqe->flags, 0); + } + return rv; +} + +int siw_rqe_complete(struct siw_qp *qp, struct siw_rqe *rqe, u32 bytes, + u32 inval_stag, enum siw_wc_status status) +{ + struct siw_cq *cq = qp->rcq; + int rv = 0; + + if (cq) { + struct siw_cqe *cqe; + u32 idx; + unsigned long flags; + + spin_lock_irqsave(&cq->lock, flags); + + idx = cq->cq_put % cq->num_cqe; + cqe = &cq->queue[idx]; + + if (!READ_ONCE(cqe->flags)) { + bool notify; + u8 cqe_flags = SIW_WQE_VALID; + + cqe->id = rqe->id; + cqe->opcode = SIW_OP_RECEIVE; + cqe->status = status; + cqe->imm_data = 0; + cqe->bytes = bytes; + + if (cq->kernel_verbs) { + cqe->base_qp = qp->ib_qp; + if (inval_stag) { + cqe_flags |= SIW_WQE_REM_INVAL; + cqe->inval_stag = inval_stag; + } + } else { + cqe->qp_id = qp_id(qp); + } + /* mark CQE valid for application */ + WRITE_ONCE(cqe->flags, cqe_flags); + /* recycle RQE */ + smp_store_mb(rqe->flags, 0); + + cq->cq_put++; + notify = siw_cq_notify_now(cq, SIW_WQE_SIGNALLED); + + spin_unlock_irqrestore(&cq->lock, flags); + + if (notify) { + siw_dbg_cq(cq, "Call completion handler\n"); + cq->base_cq.comp_handler(&cq->base_cq, + cq->base_cq.cq_context); + } + } else { + spin_unlock_irqrestore(&cq->lock, flags); + rv = -ENOMEM; + siw_cq_event(cq, IB_EVENT_CQ_ERR); + } + } else { + /* recycle RQE */ + smp_store_mb(rqe->flags, 0); + } + return rv; +} + +/* + * siw_sq_flush() + * + * Flush SQ and ORRQ entries to CQ. + * + * Must be called with QP state write lock held. + * Therefore, SQ and ORQ lock must not be taken. 
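The completion helpers above, siw_sqe_complete() and siw_rqe_complete(), hand CQEs to the consumer through a ring whose slot ownership travels in the flags word: a slot is filled only while its flags read zero and is published by writing the valid bit last. A minimal stand-alone sketch of that slot protocol; it leaves out the READ_ONCE()/WRITE_ONCE()/smp_store_mb() ordering the driver relies on and uses invented types:

/* Illustration only: CQE slot handoff via a flags word. */
#define SLOT_VALID 1u

struct demo_cqe { unsigned int flags; unsigned long long id; unsigned int bytes; };

static int post_cqe(struct demo_cqe *ring, unsigned int size, unsigned int *put,
		    unsigned long long id, unsigned int bytes)
{
	struct demo_cqe *slot = &ring[*put % size];

	if (slot->flags)		/* consumer has not drained this slot yet */
		return -1;		/* treat as CQ overflow */

	slot->id = id;
	slot->bytes = bytes;
	slot->flags = SLOT_VALID;	/* publish last */
	(*put)++;
	return 0;
}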
+ */ +void siw_sq_flush(struct siw_qp *qp) +{ + struct siw_sqe *sqe; + struct siw_wqe *wqe = tx_wqe(qp); + int async_event = 0; + + /* + * Start with completing any work currently on the ORQ + */ + while (qp->attrs.orq_size) { + sqe = &qp->orq[qp->orq_get % qp->attrs.orq_size]; + if (!READ_ONCE(sqe->flags)) + break; + + if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0) + break; + + WRITE_ONCE(sqe->flags, 0); + qp->orq_get++; + } + /* + * Flush an in-progress WQE if present + */ + if (wqe->wr_status != SIW_WR_IDLE) { + siw_dbg_qp(qp, "flush current SQE, type %d, status %d\n", + tx_type(wqe), wqe->wr_status); + + siw_wqe_put_mem(wqe, tx_type(wqe)); + + if (tx_type(wqe) != SIW_OP_READ_RESPONSE && + ((tx_type(wqe) != SIW_OP_READ && + tx_type(wqe) != SIW_OP_READ_LOCAL_INV) || + wqe->wr_status == SIW_WR_QUEUED)) + /* + * An in-progress Read Request is already in + * the ORQ + */ + siw_sqe_complete(qp, &wqe->sqe, wqe->bytes, + SIW_WC_WR_FLUSH_ERR); + + wqe->wr_status = SIW_WR_IDLE; + } + /* + * Flush the Send Queue + */ + while (qp->attrs.sq_size) { + sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size]; + if (!READ_ONCE(sqe->flags)) + break; + + async_event = 1; + if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0) + /* + * Shall IB_EVENT_SQ_DRAINED be supressed if work + * completion fails? + */ + break; + + WRITE_ONCE(sqe->flags, 0); + qp->sq_get++; + } + if (async_event) + siw_qp_event(qp, IB_EVENT_SQ_DRAINED); +} + +/* + * siw_rq_flush() + * + * Flush recv queue entries to CQ. Also + * takes care of pending active tagged and untagged + * inbound transfers, which have target memory + * referenced. + * + * Must be called with QP state write lock held. + * Therefore, RQ lock must not be taken. + */ +void siw_rq_flush(struct siw_qp *qp) +{ + struct siw_wqe *wqe = &qp->rx_untagged.wqe_active; + + /* + * Flush an in-progress untagged operation if present + */ + if (wqe->wr_status != SIW_WR_IDLE) { + siw_dbg_qp(qp, "flush current rqe, type %d, status %d\n", + rx_type(wqe), wqe->wr_status); + + siw_wqe_put_mem(wqe, rx_type(wqe)); + + if (rx_type(wqe) == SIW_OP_RECEIVE) { + siw_rqe_complete(qp, &wqe->rqe, wqe->bytes, + 0, SIW_WC_WR_FLUSH_ERR); + } else if (rx_type(wqe) != SIW_OP_READ && + rx_type(wqe) != SIW_OP_READ_RESPONSE && + rx_type(wqe) != SIW_OP_WRITE) { + siw_sqe_complete(qp, &wqe->sqe, 0, SIW_WC_WR_FLUSH_ERR); + } + wqe->wr_status = SIW_WR_IDLE; + } + wqe = &qp->rx_tagged.wqe_active; + + if (wqe->wr_status != SIW_WR_IDLE) { + siw_wqe_put_mem(wqe, rx_type(wqe)); + wqe->wr_status = SIW_WR_IDLE; + } + /* + * Flush the Receive Queue + */ + while (qp->attrs.rq_size) { + struct siw_rqe *rqe = + &qp->recvq[qp->rq_get % qp->attrs.rq_size]; + + if (!READ_ONCE(rqe->flags)) + break; + + if (siw_rqe_complete(qp, rqe, 0, 0, SIW_WC_WR_FLUSH_ERR) != 0) + break; + + WRITE_ONCE(rqe->flags, 0); + qp->rq_get++; + } +} + +int siw_qp_add(struct siw_device *sdev, struct siw_qp *qp) +{ + int rv = xa_alloc(&sdev->qp_xa, &qp->ib_qp->qp_num, qp, xa_limit_32b, + GFP_KERNEL); + + if (!rv) { + kref_init(&qp->ref); + qp->sdev = sdev; + qp->qp_num = qp->ib_qp->qp_num; + siw_dbg_qp(qp, "new QP\n"); + } + return rv; +} + +void siw_free_qp(struct kref *ref) +{ + struct siw_qp *found, *qp = container_of(ref, struct siw_qp, ref); + struct siw_device *sdev = qp->sdev; + unsigned long flags; + + if (qp->cep) + siw_cep_put(qp->cep); + + found = xa_erase(&sdev->qp_xa, qp_id(qp)); + WARN_ON(found != qp); + spin_lock_irqsave(&sdev->lock, flags); + list_del(&qp->devq); + spin_unlock_irqrestore(&sdev->lock, flags); + + 
vfree(qp->sendq); + vfree(qp->recvq); + vfree(qp->irq); + vfree(qp->orq); + + siw_put_tx_cpu(qp->tx_cpu); + + atomic_dec(&sdev->num_qp); + siw_dbg_qp(qp, "free QP\n"); + kfree_rcu(qp, rcu); +} From b9be6f18cf9ed04dd8087cb9d69de6e90d8ceb08 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 20 Jun 2019 18:21:30 +0200 Subject: [PATCH 126/194] rdma/siw: transmit path Broken up commit to add the Soft iWarp RDMA driver. Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_qp_tx.c | 1269 +++++++++++++++++++++++++ 1 file changed, 1269 insertions(+) create mode 100644 drivers/infiniband/sw/siw/siw_qp_tx.c diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c new file mode 100644 index 000000000000..5e926fac51db --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_qp_tx.c @@ -0,0 +1,1269 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "siw.h" +#include "siw_verbs.h" +#include "siw_mem.h" + +#define MAX_HDR_INLINE \ + (((uint32_t)(sizeof(struct siw_rreq_pkt) - \ + sizeof(struct iwarp_send))) & 0xF8) + +static struct page *siw_get_pblpage(struct siw_mem *mem, u64 addr, int *idx) +{ + struct siw_pbl *pbl = mem->pbl; + u64 offset = addr - mem->va; + u64 paddr = siw_pbl_get_buffer(pbl, offset, NULL, idx); + + if (paddr) + return virt_to_page(paddr); + + return NULL; +} + +/* + * Copy short payload at provided destination payload address + */ +static int siw_try_1seg(struct siw_iwarp_tx *c_tx, u64 paddr) +{ + struct siw_wqe *wqe = &c_tx->wqe_active; + struct siw_sge *sge = &wqe->sqe.sge[0]; + u32 bytes = sge->length; + + if (bytes > MAX_HDR_INLINE || wqe->sqe.num_sge != 1) + return MAX_HDR_INLINE + 1; + + if (!bytes) + return 0; + + if (tx_flags(wqe) & SIW_WQE_INLINE) { + memcpy((void *)paddr, &wqe->sqe.sge[1], bytes); + } else { + struct siw_mem *mem = wqe->mem[0]; + + if (!mem->mem_obj) { + /* Kernel client using kva */ + memcpy((void *)paddr, (void *)sge->laddr, bytes); + } else if (c_tx->in_syscall) { + if (copy_from_user((void *)paddr, + (const void __user *)sge->laddr, + bytes)) + return -EFAULT; + } else { + unsigned int off = sge->laddr & ~PAGE_MASK; + struct page *p; + char *buffer; + int pbl_idx = 0; + + if (!mem->is_pbl) + p = siw_get_upage(mem->umem, sge->laddr); + else + p = siw_get_pblpage(mem, sge->laddr, &pbl_idx); + + if (unlikely(!p)) + return -EFAULT; + + buffer = kmap_atomic(p); + + if (likely(PAGE_SIZE - off >= bytes)) { + memcpy((void *)paddr, buffer + off, bytes); + kunmap_atomic(buffer); + } else { + unsigned long part = bytes - (PAGE_SIZE - off); + + memcpy((void *)paddr, buffer + off, part); + kunmap_atomic(buffer); + + if (!mem->is_pbl) + p = siw_get_upage(mem->umem, + sge->laddr + part); + else + p = siw_get_pblpage(mem, + sge->laddr + part, + &pbl_idx); + if (unlikely(!p)) + return -EFAULT; + + buffer = kmap_atomic(p); + memcpy((void *)(paddr + part), buffer, + bytes - part); + kunmap_atomic(buffer); + } + } + } + return (int)bytes; +} + +#define PKT_FRAGMENTED 1 +#define PKT_COMPLETE 0 + +/* + * siw_qp_prepare_tx() + * + * Prepare tx state for sending out one fpdu. Builds complete pkt + * if no user data or only immediate data are present. + * + * returns PKT_COMPLETE if complete pkt built, PKT_FRAGMENTED otherwise. 
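The outcome hinges on siw_try_1seg() above: a message with a single SGE whose payload fits into the scratch space behind the largest header, MAX_HDR_INLINE, which is the size difference rounded down to an 8-byte multiple, can travel inside the header buffer and the FPDU built by siw_qp_prepare_tx() below is complete right away. A rough stand-alone illustration of that decision, leaving multi-SGE and zero-length corner cases aside; the 44-byte headroom is an assumed stand-in for the real sizeof() difference:

/* Illustration only; the driver derives the limit from
 * sizeof(struct siw_rreq_pkt) - sizeof(struct iwarp_send), masked with 0xF8.
 */
static int builds_complete_pkt(unsigned int payload_bytes, unsigned int num_sge)
{
	const unsigned int max_inline = 44u & 0xF8;	/* rounds down to 40 */

	return num_sge == 1 && payload_bytes <= max_inline;
}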
+ */ +static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx) +{ + struct siw_wqe *wqe = &c_tx->wqe_active; + char *crc = NULL; + int data = 0; + + switch (tx_type(wqe)) { + case SIW_OP_READ: + case SIW_OP_READ_LOCAL_INV: + memcpy(&c_tx->pkt.ctrl, + &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl, + sizeof(struct iwarp_ctrl)); + + c_tx->pkt.rreq.rsvd = 0; + c_tx->pkt.rreq.ddp_qn = htonl(RDMAP_UNTAGGED_QN_RDMA_READ); + c_tx->pkt.rreq.ddp_msn = + htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]); + c_tx->pkt.rreq.ddp_mo = 0; + c_tx->pkt.rreq.sink_stag = htonl(wqe->sqe.sge[0].lkey); + c_tx->pkt.rreq.sink_to = + cpu_to_be64(wqe->sqe.sge[0].laddr); + c_tx->pkt.rreq.source_stag = htonl(wqe->sqe.rkey); + c_tx->pkt.rreq.source_to = cpu_to_be64(wqe->sqe.raddr); + c_tx->pkt.rreq.read_size = htonl(wqe->sqe.sge[0].length); + + c_tx->ctrl_len = sizeof(struct iwarp_rdma_rreq); + crc = (char *)&c_tx->pkt.rreq_pkt.crc; + break; + + case SIW_OP_SEND: + if (tx_flags(wqe) & SIW_WQE_SOLICITED) + memcpy(&c_tx->pkt.ctrl, + &iwarp_pktinfo[RDMAP_SEND_SE].ctrl, + sizeof(struct iwarp_ctrl)); + else + memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_SEND].ctrl, + sizeof(struct iwarp_ctrl)); + + c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND; + c_tx->pkt.send.ddp_msn = + htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]); + c_tx->pkt.send.ddp_mo = 0; + + c_tx->pkt.send_inv.inval_stag = 0; + + c_tx->ctrl_len = sizeof(struct iwarp_send); + + crc = (char *)&c_tx->pkt.send_pkt.crc; + data = siw_try_1seg(c_tx, (u64)crc); + break; + + case SIW_OP_SEND_REMOTE_INV: + if (tx_flags(wqe) & SIW_WQE_SOLICITED) + memcpy(&c_tx->pkt.ctrl, + &iwarp_pktinfo[RDMAP_SEND_SE_INVAL].ctrl, + sizeof(struct iwarp_ctrl)); + else + memcpy(&c_tx->pkt.ctrl, + &iwarp_pktinfo[RDMAP_SEND_INVAL].ctrl, + sizeof(struct iwarp_ctrl)); + + c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND; + c_tx->pkt.send.ddp_msn = + htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]); + c_tx->pkt.send.ddp_mo = 0; + + c_tx->pkt.send_inv.inval_stag = cpu_to_be32(wqe->sqe.rkey); + + c_tx->ctrl_len = sizeof(struct iwarp_send_inv); + + crc = (char *)&c_tx->pkt.send_pkt.crc; + data = siw_try_1seg(c_tx, (u64)crc); + break; + + case SIW_OP_WRITE: + memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_RDMA_WRITE].ctrl, + sizeof(struct iwarp_ctrl)); + + c_tx->pkt.rwrite.sink_stag = htonl(wqe->sqe.rkey); + c_tx->pkt.rwrite.sink_to = cpu_to_be64(wqe->sqe.raddr); + c_tx->ctrl_len = sizeof(struct iwarp_rdma_write); + + crc = (char *)&c_tx->pkt.write_pkt.crc; + data = siw_try_1seg(c_tx, (u64)crc); + break; + + case SIW_OP_READ_RESPONSE: + memcpy(&c_tx->pkt.ctrl, + &iwarp_pktinfo[RDMAP_RDMA_READ_RESP].ctrl, + sizeof(struct iwarp_ctrl)); + + /* NBO */ + c_tx->pkt.rresp.sink_stag = cpu_to_be32(wqe->sqe.rkey); + c_tx->pkt.rresp.sink_to = cpu_to_be64(wqe->sqe.raddr); + + c_tx->ctrl_len = sizeof(struct iwarp_rdma_rresp); + + crc = (char *)&c_tx->pkt.write_pkt.crc; + data = siw_try_1seg(c_tx, (u64)crc); + break; + + default: + siw_dbg_qp(tx_qp(c_tx), "stale wqe type %d\n", tx_type(wqe)); + return -EOPNOTSUPP; + } + if (unlikely(data < 0)) + return data; + + c_tx->ctrl_sent = 0; + + if (data <= MAX_HDR_INLINE) { + if (data) { + wqe->processed = data; + + c_tx->pkt.ctrl.mpa_len = + htons(c_tx->ctrl_len + data - MPA_HDR_SIZE); + + /* Add pad, if needed */ + data += -(int)data & 0x3; + /* advance CRC location after payload */ + crc += data; + c_tx->ctrl_len += data; + + if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)) + c_tx->pkt.c_untagged.ddp_mo = 0; + else + c_tx->pkt.c_tagged.ddp_to = + 
cpu_to_be64(wqe->sqe.raddr); + } + + *(u32 *)crc = 0; + /* + * Do complete CRC if enabled and short packet + */ + if (c_tx->mpa_crc_hd) { + crypto_shash_init(c_tx->mpa_crc_hd); + if (crypto_shash_update(c_tx->mpa_crc_hd, + (u8 *)&c_tx->pkt, + c_tx->ctrl_len)) + return -EINVAL; + crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)crc); + } + c_tx->ctrl_len += MPA_CRC_SIZE; + + return PKT_COMPLETE; + } + c_tx->ctrl_len += MPA_CRC_SIZE; + c_tx->sge_idx = 0; + c_tx->sge_off = 0; + c_tx->pbl_idx = 0; + + /* + * Allow direct sending out of user buffer if WR is non signalled + * and payload is over threshold. + * Per RDMA verbs, the application should not change the send buffer + * until the work completed. In iWarp, work completion is only + * local delivery to TCP. TCP may reuse the buffer for + * retransmission. Changing unsent data also breaks the CRC, + * if applied. + */ + if (c_tx->zcopy_tx && wqe->bytes >= SENDPAGE_THRESH && + !(tx_flags(wqe) & SIW_WQE_SIGNALLED)) + c_tx->use_sendpage = 1; + else + c_tx->use_sendpage = 0; + + return PKT_FRAGMENTED; +} + +/* + * Send out one complete control type FPDU, or header of FPDU carrying + * data. Used for fixed sized packets like Read.Requests or zero length + * SENDs, WRITEs, READ.Responses, or header only. + */ +static int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s, + int flags) +{ + struct msghdr msg = { .msg_flags = flags }; + struct kvec iov = { .iov_base = + (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent, + .iov_len = c_tx->ctrl_len - c_tx->ctrl_sent }; + + int rv = kernel_sendmsg(s, &msg, &iov, 1, + c_tx->ctrl_len - c_tx->ctrl_sent); + + if (rv >= 0) { + c_tx->ctrl_sent += rv; + + if (c_tx->ctrl_sent == c_tx->ctrl_len) + rv = 0; + else + rv = -EAGAIN; + } + return rv; +} + +/* + * 0copy TCP transmit interface: Use do_tcp_sendpages. + * + * Using sendpage to push page by page appears to be less efficient + * than using sendmsg, even if data are copied. + * + * A general performance limitation might be the extra four bytes + * trailer checksum segment to be pushed after user data. + */ +static int siw_tcp_sendpages(struct socket *s, struct page **page, int offset, + size_t size) +{ + struct sock *sk = s->sk; + int i = 0, rv = 0, sent = 0, + flags = MSG_MORE | MSG_DONTWAIT | MSG_SENDPAGE_NOTLAST; + + while (size) { + size_t bytes = min_t(size_t, PAGE_SIZE - offset, size); + + if (size + offset <= PAGE_SIZE) + flags = MSG_MORE | MSG_DONTWAIT; + + tcp_rate_check_app_limited(sk); +try_page_again: + lock_sock(sk); + rv = do_tcp_sendpages(sk, page[i], offset, bytes, flags); + release_sock(sk); + + if (rv > 0) { + size -= rv; + sent += rv; + if (rv != bytes) { + offset += rv; + bytes -= rv; + goto try_page_again; + } + offset = 0; + } else { + if (rv == -EAGAIN || rv == 0) + break; + return rv; + } + i++; + } + return sent; +} + +/* + * siw_0copy_tx() + * + * Pushes list of pages to TCP socket. If pages from multiple + * SGE's, all referenced pages of each SGE are pushed in one + * shot. 
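How many page-array slots a single SGE occupies follows from its byte length plus its offset into the first page; the PAGE_ALIGN(sge_bytes + offset) >> PAGE_SHIFT advance in the function below depends on exactly that. A stand-alone worked example, assuming 4 KiB pages:

#include <stdio.h>

#define DEMO_PAGE_SIZE 4096u	/* assumed page size for this example */

/* Number of pages touched by 'len' bytes starting 'off' bytes into a page. */
static unsigned int pages_spanned(unsigned int off, unsigned int len)
{
	return (off + len + DEMO_PAGE_SIZE - 1) / DEMO_PAGE_SIZE;
}

int main(void)
{
	/* 6000 bytes starting at in-page offset 3000 cover bytes 3000..8999,
	 * i.e. three pages.
	 */
	printf("%u\n", pages_spanned(3000, 6000));
	return 0;
}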
+ */ +static int siw_0copy_tx(struct socket *s, struct page **page, + struct siw_sge *sge, unsigned int offset, + unsigned int size) +{ + int i = 0, sent = 0, rv; + int sge_bytes = min(sge->length - offset, size); + + offset = (sge->laddr + offset) & ~PAGE_MASK; + + while (sent != size) { + rv = siw_tcp_sendpages(s, &page[i], offset, sge_bytes); + if (rv >= 0) { + sent += rv; + if (size == sent || sge_bytes > rv) + break; + + i += PAGE_ALIGN(sge_bytes + offset) >> PAGE_SHIFT; + sge++; + sge_bytes = min(sge->length, size - sent); + offset = sge->laddr & ~PAGE_MASK; + } else { + sent = rv; + break; + } + } + return sent; +} + +#define MAX_TRAILER (MPA_CRC_SIZE + 4) + +static void siw_unmap_pages(struct page **pages, int hdr_len, int num_maps) +{ + if (hdr_len) { + ++pages; + --num_maps; + } + while (num_maps-- > 0) { + kunmap(*pages); + pages++; + } +} + +/* + * siw_tx_hdt() tries to push a complete packet to TCP where all + * packet fragments are referenced by the elements of one iovec. + * For the data portion, each involved page must be referenced by + * one extra element. All sge's data can be non-aligned to page + * boundaries. Two more elements are referencing iWARP header + * and trailer: + * MAX_ARRAY = 64KB/PAGE_SIZE + 1 + (2 * (SIW_MAX_SGE - 1) + HDR + TRL + */ +#define MAX_ARRAY ((0xffff / PAGE_SIZE) + 1 + (2 * (SIW_MAX_SGE - 1) + 2)) + +/* + * Write out iov referencing hdr, data and trailer of current FPDU. + * Update transmit state dependent on write return status + */ +static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s) +{ + struct siw_wqe *wqe = &c_tx->wqe_active; + struct siw_sge *sge = &wqe->sqe.sge[c_tx->sge_idx]; + struct kvec iov[MAX_ARRAY]; + struct page *page_array[MAX_ARRAY]; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR }; + + int seg = 0, do_crc = c_tx->do_crc, is_kva = 0, rv; + unsigned int data_len = c_tx->bytes_unsent, hdr_len = 0, trl_len = 0, + sge_off = c_tx->sge_off, sge_idx = c_tx->sge_idx, + pbl_idx = c_tx->pbl_idx; + + if (c_tx->state == SIW_SEND_HDR) { + if (c_tx->use_sendpage) { + rv = siw_tx_ctrl(c_tx, s, MSG_DONTWAIT | MSG_MORE); + if (rv) + goto done; + + c_tx->state = SIW_SEND_DATA; + } else { + iov[0].iov_base = + (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent; + iov[0].iov_len = hdr_len = + c_tx->ctrl_len - c_tx->ctrl_sent; + seg = 1; + } + } + + wqe->processed += data_len; + + while (data_len) { /* walk the list of SGE's */ + unsigned int sge_len = min(sge->length - sge_off, data_len); + unsigned int fp_off = (sge->laddr + sge_off) & ~PAGE_MASK; + struct siw_mem *mem; + + if (!(tx_flags(wqe) & SIW_WQE_INLINE)) { + mem = wqe->mem[sge_idx]; + if (!mem->mem_obj) + is_kva = 1; + } else { + is_kva = 1; + } + if (is_kva && !c_tx->use_sendpage) { + /* + * tx from kernel virtual address: either inline data + * or memory region with assigned kernel buffer + */ + iov[seg].iov_base = (void *)(sge->laddr + sge_off); + iov[seg].iov_len = sge_len; + + if (do_crc) + crypto_shash_update(c_tx->mpa_crc_hd, + iov[seg].iov_base, + sge_len); + sge_off += sge_len; + data_len -= sge_len; + seg++; + goto sge_done; + } + + while (sge_len) { + size_t plen = min((int)PAGE_SIZE - fp_off, sge_len); + + if (!is_kva) { + struct page *p; + + if (mem->is_pbl) + p = siw_get_pblpage( + mem, sge->laddr + sge_off, + &pbl_idx); + else + p = siw_get_upage(mem->umem, + sge->laddr + sge_off); + if (unlikely(!p)) { + if (hdr_len) + seg--; + if (!c_tx->use_sendpage && seg) { + siw_unmap_pages(page_array, + hdr_len, seg); + } + wqe->processed -= c_tx->bytes_unsent; + 
rv = -EFAULT; + goto done_crc; + } + page_array[seg] = p; + + if (!c_tx->use_sendpage) { + iov[seg].iov_base = kmap(p) + fp_off; + iov[seg].iov_len = plen; + if (do_crc) + crypto_shash_update( + c_tx->mpa_crc_hd, + iov[seg].iov_base, + plen); + } else if (do_crc) + crypto_shash_update( + c_tx->mpa_crc_hd, + page_address(p) + fp_off, + plen); + } else { + u64 pa = ((sge->laddr + sge_off) & PAGE_MASK); + + page_array[seg] = virt_to_page(pa); + if (do_crc) + crypto_shash_update( + c_tx->mpa_crc_hd, + (void *)(sge->laddr + sge_off), + plen); + } + + sge_len -= plen; + sge_off += plen; + data_len -= plen; + fp_off = 0; + + if (++seg > (int)MAX_ARRAY) { + siw_dbg_qp(tx_qp(c_tx), "to many fragments\n"); + if (!is_kva && !c_tx->use_sendpage) { + siw_unmap_pages(page_array, hdr_len, + seg - 1); + } + wqe->processed -= c_tx->bytes_unsent; + rv = -EMSGSIZE; + goto done_crc; + } + } +sge_done: + /* Update SGE variables at end of SGE */ + if (sge_off == sge->length && + (data_len != 0 || wqe->processed < wqe->bytes)) { + sge_idx++; + sge++; + sge_off = 0; + } + } + /* trailer */ + if (likely(c_tx->state != SIW_SEND_TRAILER)) { + iov[seg].iov_base = &c_tx->trailer.pad[4 - c_tx->pad]; + iov[seg].iov_len = trl_len = MAX_TRAILER - (4 - c_tx->pad); + } else { + iov[seg].iov_base = &c_tx->trailer.pad[c_tx->ctrl_sent]; + iov[seg].iov_len = trl_len = MAX_TRAILER - c_tx->ctrl_sent; + } + + if (c_tx->pad) { + *(u32 *)c_tx->trailer.pad = 0; + if (do_crc) + crypto_shash_update(c_tx->mpa_crc_hd, + (u8 *)&c_tx->trailer.crc - c_tx->pad, + c_tx->pad); + } + if (!c_tx->mpa_crc_hd) + c_tx->trailer.crc = 0; + else if (do_crc) + crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)&c_tx->trailer.crc); + + data_len = c_tx->bytes_unsent; + + if (c_tx->use_sendpage) { + rv = siw_0copy_tx(s, page_array, &wqe->sqe.sge[c_tx->sge_idx], + c_tx->sge_off, data_len); + if (rv == data_len) { + rv = kernel_sendmsg(s, &msg, &iov[seg], 1, trl_len); + if (rv > 0) + rv += data_len; + else + rv = data_len; + } + } else { + rv = kernel_sendmsg(s, &msg, iov, seg + 1, + hdr_len + data_len + trl_len); + if (!is_kva) + siw_unmap_pages(page_array, hdr_len, seg); + } + if (rv < (int)hdr_len) { + /* Not even complete hdr pushed or negative rv */ + wqe->processed -= data_len; + if (rv >= 0) { + c_tx->ctrl_sent += rv; + rv = -EAGAIN; + } + goto done_crc; + } + rv -= hdr_len; + + if (rv >= (int)data_len) { + /* all user data pushed to TCP or no data to push */ + if (data_len > 0 && wqe->processed < wqe->bytes) { + /* Save the current state for next tx */ + c_tx->sge_idx = sge_idx; + c_tx->sge_off = sge_off; + c_tx->pbl_idx = pbl_idx; + } + rv -= data_len; + + if (rv == trl_len) /* all pushed */ + rv = 0; + else { + c_tx->state = SIW_SEND_TRAILER; + c_tx->ctrl_len = MAX_TRAILER; + c_tx->ctrl_sent = rv + 4 - c_tx->pad; + c_tx->bytes_unsent = 0; + rv = -EAGAIN; + } + + } else if (data_len > 0) { + /* Maybe some user data pushed to TCP */ + c_tx->state = SIW_SEND_DATA; + wqe->processed -= data_len - rv; + + if (rv) { + /* + * Some bytes out. 
Recompute tx state based + * on old state and bytes pushed + */ + unsigned int sge_unsent; + + c_tx->bytes_unsent -= rv; + sge = &wqe->sqe.sge[c_tx->sge_idx]; + sge_unsent = sge->length - c_tx->sge_off; + + while (sge_unsent <= rv) { + rv -= sge_unsent; + c_tx->sge_idx++; + c_tx->sge_off = 0; + sge++; + sge_unsent = sge->length; + } + c_tx->sge_off += rv; + } + rv = -EAGAIN; + } +done_crc: + c_tx->do_crc = 0; +done: + return rv; +} + +static void siw_update_tcpseg(struct siw_iwarp_tx *c_tx, + struct socket *s) +{ + struct tcp_sock *tp = tcp_sk(s->sk); + + if (tp->gso_segs) { + if (c_tx->gso_seg_limit == 0) + c_tx->tcp_seglen = tp->mss_cache * tp->gso_segs; + else + c_tx->tcp_seglen = + tp->mss_cache * + min_t(u16, c_tx->gso_seg_limit, tp->gso_segs); + } else { + c_tx->tcp_seglen = tp->mss_cache; + } + /* Loopback may give odd numbers */ + c_tx->tcp_seglen &= 0xfffffff8; +} + +/* + * siw_prepare_fpdu() + * + * Prepares transmit context to send out one FPDU if FPDU will contain + * user data and user data are not immediate data. + * Computes maximum FPDU length to fill up TCP MSS if possible. + * + * @qp: QP from which to transmit + * @wqe: Current WQE causing transmission + * + * TODO: Take into account real available sendspace on socket + * to avoid header misalignment due to send pausing within + * fpdu transmission + */ +static void siw_prepare_fpdu(struct siw_qp *qp, struct siw_wqe *wqe) +{ + struct siw_iwarp_tx *c_tx = &qp->tx_ctx; + int data_len; + + c_tx->ctrl_len = + iwarp_pktinfo[__rdmap_get_opcode(&c_tx->pkt.ctrl)].hdr_len; + c_tx->ctrl_sent = 0; + + /* + * Update target buffer offset if any + */ + if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)) + /* Untagged message */ + c_tx->pkt.c_untagged.ddp_mo = cpu_to_be32(wqe->processed); + else /* Tagged message */ + c_tx->pkt.c_tagged.ddp_to = + cpu_to_be64(wqe->sqe.raddr + wqe->processed); + + data_len = wqe->bytes - wqe->processed; + if (data_len + c_tx->ctrl_len + MPA_CRC_SIZE > c_tx->tcp_seglen) { + /* Trim DDP payload to fit into current TCP segment */ + data_len = c_tx->tcp_seglen - (c_tx->ctrl_len + MPA_CRC_SIZE); + c_tx->pkt.ctrl.ddp_rdmap_ctrl &= ~DDP_FLAG_LAST; + c_tx->pad = 0; + } else { + c_tx->pkt.ctrl.ddp_rdmap_ctrl |= DDP_FLAG_LAST; + c_tx->pad = -data_len & 0x3; + } + c_tx->bytes_unsent = data_len; + + c_tx->pkt.ctrl.mpa_len = + htons(c_tx->ctrl_len + data_len - MPA_HDR_SIZE); + + /* + * Init MPA CRC computation + */ + if (c_tx->mpa_crc_hd) { + crypto_shash_init(c_tx->mpa_crc_hd); + crypto_shash_update(c_tx->mpa_crc_hd, (u8 *)&c_tx->pkt, + c_tx->ctrl_len); + c_tx->do_crc = 1; + } +} + +/* + * siw_check_sgl_tx() + * + * Check permissions for a list of SGE's (SGL). + * A successful check will have all memory referenced + * for transmission resolved and assigned to the WQE. 
+ * + * @pd: Protection Domain SGL should belong to + * @wqe: WQE to be checked + * @perms: requested access permissions + * + */ + +static int siw_check_sgl_tx(struct ib_pd *pd, struct siw_wqe *wqe, + enum ib_access_flags perms) +{ + struct siw_sge *sge = &wqe->sqe.sge[0]; + int i, len, num_sge = wqe->sqe.num_sge; + + if (unlikely(num_sge > SIW_MAX_SGE)) + return -EINVAL; + + for (i = 0, len = 0; num_sge; num_sge--, i++, sge++) { + /* + * rdma verbs: do not check stag for a zero length sge + */ + if (sge->length) { + int rv = siw_check_sge(pd, sge, &wqe->mem[i], perms, 0, + sge->length); + + if (unlikely(rv != E_ACCESS_OK)) + return rv; + } + len += sge->length; + } + return len; +} + +/* + * siw_qp_sq_proc_tx() + * + * Process one WQE which needs transmission on the wire. + */ +static int siw_qp_sq_proc_tx(struct siw_qp *qp, struct siw_wqe *wqe) +{ + struct siw_iwarp_tx *c_tx = &qp->tx_ctx; + struct socket *s = qp->attrs.sk; + int rv = 0, burst_len = qp->tx_ctx.burst; + enum rdmap_ecode ecode = RDMAP_ECODE_CATASTROPHIC_STREAM; + + if (unlikely(wqe->wr_status == SIW_WR_IDLE)) + return 0; + + if (!burst_len) + burst_len = SQ_USER_MAXBURST; + + if (wqe->wr_status == SIW_WR_QUEUED) { + if (!(wqe->sqe.flags & SIW_WQE_INLINE)) { + if (tx_type(wqe) == SIW_OP_READ_RESPONSE) + wqe->sqe.num_sge = 1; + + if (tx_type(wqe) != SIW_OP_READ && + tx_type(wqe) != SIW_OP_READ_LOCAL_INV) { + /* + * Reference memory to be tx'd w/o checking + * access for LOCAL_READ permission, since + * not defined in RDMA core. + */ + rv = siw_check_sgl_tx(qp->pd, wqe, 0); + if (rv < 0) { + if (tx_type(wqe) == + SIW_OP_READ_RESPONSE) + ecode = siw_rdmap_error(-rv); + rv = -EINVAL; + goto tx_error; + } + wqe->bytes = rv; + } else { + wqe->bytes = 0; + } + } else { + wqe->bytes = wqe->sqe.sge[0].length; + if (!qp->kernel_verbs) { + if (wqe->bytes > SIW_MAX_INLINE) { + rv = -EINVAL; + goto tx_error; + } + wqe->sqe.sge[0].laddr = (u64)&wqe->sqe.sge[1]; + } + } + wqe->wr_status = SIW_WR_INPROGRESS; + wqe->processed = 0; + + siw_update_tcpseg(c_tx, s); + + rv = siw_qp_prepare_tx(c_tx); + if (rv == PKT_FRAGMENTED) { + c_tx->state = SIW_SEND_HDR; + siw_prepare_fpdu(qp, wqe); + } else if (rv == PKT_COMPLETE) { + c_tx->state = SIW_SEND_SHORT_FPDU; + } else { + goto tx_error; + } + } + +next_segment: + siw_dbg_qp(qp, "wr type %d, state %d, data %u, sent %u, id %llx\n", + tx_type(wqe), wqe->wr_status, wqe->bytes, wqe->processed, + wqe->sqe.id); + + if (--burst_len == 0) { + rv = -EINPROGRESS; + goto tx_done; + } + if (c_tx->state == SIW_SEND_SHORT_FPDU) { + enum siw_opcode tx_type = tx_type(wqe); + unsigned int msg_flags; + + if (siw_sq_empty(qp) || !siw_tcp_nagle || burst_len == 1) + /* + * End current TCP segment, if SQ runs empty, + * or siw_tcp_nagle is not set, or we bail out + * soon due to no burst credit left. + */ + msg_flags = MSG_DONTWAIT; + else + msg_flags = MSG_DONTWAIT | MSG_MORE; + + rv = siw_tx_ctrl(c_tx, s, msg_flags); + + if (!rv && tx_type != SIW_OP_READ && + tx_type != SIW_OP_READ_LOCAL_INV) + wqe->processed = wqe->bytes; + + goto tx_done; + + } else { + rv = siw_tx_hdt(c_tx, s); + } + if (!rv) { + /* + * One segment sent. Processing completed if last + * segment, Do next segment otherwise. 
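Each further segment gets its size from siw_prepare_fpdu() above: the DDP payload is trimmed so that header, payload and MPA CRC fit the current TCP segment, non-final segments are cut to the 8-byte aligned segment budget and carry no pad, and the last segment is padded up to a 4-byte boundary. A stand-alone illustration of that arithmetic, with header length and segment size chosen arbitrarily:

#include <stdio.h>

int main(void)
{
	unsigned int tcp_seglen = 1448;	/* assumed segment budget          */
	unsigned int ctrl_len   = 28;	/* assumed DDP/RDMAP header length */
	unsigned int crc_len    = 4;	/* MPA CRC                         */
	unsigned int remaining  = 5000;	/* payload still to be sent        */

	unsigned int data_len = remaining;
	unsigned int pad = 0;

	if (data_len + ctrl_len + crc_len > tcp_seglen) {
		/* not the last segment: fill the TCP segment, no pad */
		data_len = tcp_seglen - (ctrl_len + crc_len);
	} else {
		/* last segment: pad payload to a 4-byte boundary */
		pad = -data_len & 0x3;
	}
	printf("payload %u, pad %u\n", data_len, pad);	/* payload 1416, pad 0 */
	return 0;
}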
+ */ + if (unlikely(c_tx->tx_suspend)) { + /* + * Verbs, 6.4.: Try stopping sending after a full + * DDP segment if the connection goes down + * (== peer halfclose) + */ + rv = -ECONNABORTED; + goto tx_done; + } + if (c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_LAST) { + siw_dbg_qp(qp, "WQE completed\n"); + goto tx_done; + } + c_tx->state = SIW_SEND_HDR; + + siw_update_tcpseg(c_tx, s); + + siw_prepare_fpdu(qp, wqe); + goto next_segment; + } +tx_done: + qp->tx_ctx.burst = burst_len; + return rv; + +tx_error: + if (ecode != RDMAP_ECODE_CATASTROPHIC_STREAM) + siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP, + RDMAP_ETYPE_REMOTE_PROTECTION, ecode, 1); + else + siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP, + RDMAP_ETYPE_CATASTROPHIC, + RDMAP_ECODE_UNSPECIFIED, 1); + return rv; +} + +static int siw_fastreg_mr(struct ib_pd *pd, struct siw_sqe *sqe) +{ + struct ib_mr *base_mr = (struct ib_mr *)sqe->base_mr; + struct siw_device *sdev = to_siw_dev(pd->device); + struct siw_mem *mem = siw_mem_id2obj(sdev, sqe->rkey >> 8); + int rv = 0; + + siw_dbg_pd(pd, "STag 0x%08x\n", sqe->rkey); + + if (unlikely(!mem || !base_mr)) { + pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe->rkey); + return -EINVAL; + } + if (unlikely(base_mr->rkey >> 8 != sqe->rkey >> 8)) { + pr_warn("siw: fastreg: STag 0x%08x: bad MR\n", sqe->rkey); + rv = -EINVAL; + goto out; + } + if (unlikely(mem->pd != pd)) { + pr_warn("siw: fastreg: PD mismatch\n"); + rv = -EINVAL; + goto out; + } + if (unlikely(mem->stag_valid)) { + pr_warn("siw: fastreg: STag 0x%08x already valid\n", sqe->rkey); + rv = -EINVAL; + goto out; + } + /* Refresh STag since user may have changed key part */ + mem->stag = sqe->rkey; + mem->perms = sqe->access; + + siw_dbg_mem(mem, "STag now valid, MR va: 0x%016llx -> 0x%016llx\n", + mem->va, base_mr->iova); + mem->va = base_mr->iova; + mem->stag_valid = 1; +out: + siw_mem_put(mem); + return rv; +} + +static int siw_qp_sq_proc_local(struct siw_qp *qp, struct siw_wqe *wqe) +{ + int rv; + + switch (tx_type(wqe)) { + case SIW_OP_REG_MR: + rv = siw_fastreg_mr(qp->pd, &wqe->sqe); + break; + + case SIW_OP_INVAL_STAG: + rv = siw_invalidate_stag(qp->pd, wqe->sqe.rkey); + break; + + default: + rv = -EINVAL; + } + return rv; +} + +/* + * siw_qp_sq_process() + * + * Core TX path routine for RDMAP/DDP/MPA using a TCP kernel socket. + * Sends RDMAP payload for the current SQ WR @wqe of @qp in one or more + * MPA FPDUs, each containing a DDP segment. + * + * SQ processing may occur in user context as a result of posting + * new WQE's or from siw_sq_work_handler() context. Processing in + * user context is limited to non-kernel verbs users. + * + * SQ processing may get paused anytime, possibly in the middle of a WR + * or FPDU, if insufficient send space is available. SQ processing + * gets resumed from siw_sq_work_handler(), if send space becomes + * available again. + * + * Must be called with the QP state read-locked. + * + * Note: + * An outbound RREQ can be satisfied by the corresponding RRESP + * _before_ it gets assigned to the ORQ. This happens regularly + * in RDMA READ via loopback case. Since both outbound RREQ and + * inbound RRESP can be handled by the same CPU, locking the ORQ + * is dead-lock prone and thus not an option. With that, the + * RREQ gets assigned to the ORQ _before_ being sent - see + * siw_activate_tx() - and pulled back in case of send failure. 
+ */ +int siw_qp_sq_process(struct siw_qp *qp) +{ + struct siw_wqe *wqe = tx_wqe(qp); + enum siw_opcode tx_type; + unsigned long flags; + int rv = 0; + + siw_dbg_qp(qp, "enter for type %d\n", tx_type(wqe)); + +next_wqe: + /* + * Stop QP processing if SQ state changed + */ + if (unlikely(qp->tx_ctx.tx_suspend)) { + siw_dbg_qp(qp, "tx suspended\n"); + goto done; + } + tx_type = tx_type(wqe); + + if (tx_type <= SIW_OP_READ_RESPONSE) + rv = siw_qp_sq_proc_tx(qp, wqe); + else + rv = siw_qp_sq_proc_local(qp, wqe); + + if (!rv) { + /* + * WQE processing done + */ + switch (tx_type) { + case SIW_OP_SEND: + case SIW_OP_SEND_REMOTE_INV: + case SIW_OP_WRITE: + siw_wqe_put_mem(wqe, tx_type); + case SIW_OP_INVAL_STAG: + case SIW_OP_REG_MR: + if (tx_flags(wqe) & SIW_WQE_SIGNALLED) + siw_sqe_complete(qp, &wqe->sqe, wqe->bytes, + SIW_WC_SUCCESS); + break; + + case SIW_OP_READ: + case SIW_OP_READ_LOCAL_INV: + /* + * already enqueued to ORQ queue + */ + break; + + case SIW_OP_READ_RESPONSE: + siw_wqe_put_mem(wqe, tx_type); + break; + + default: + WARN(1, "undefined WQE type %d\n", tx_type); + rv = -EINVAL; + goto done; + } + + spin_lock_irqsave(&qp->sq_lock, flags); + wqe->wr_status = SIW_WR_IDLE; + rv = siw_activate_tx(qp); + spin_unlock_irqrestore(&qp->sq_lock, flags); + + if (rv <= 0) + goto done; + + goto next_wqe; + + } else if (rv == -EAGAIN) { + siw_dbg_qp(qp, "sq paused: hd/tr %d of %d, data %d\n", + qp->tx_ctx.ctrl_sent, qp->tx_ctx.ctrl_len, + qp->tx_ctx.bytes_unsent); + rv = 0; + goto done; + } else if (rv == -EINPROGRESS) { + rv = siw_sq_start(qp); + goto done; + } else { + /* + * WQE processing failed. + * Verbs 8.3.2: + * o It turns any WQE into a signalled WQE. + * o Local catastrophic error must be surfaced + * o QP must be moved into Terminate state: done by code + * doing socket state change processing + * + * o TODO: Termination message must be sent. + * o TODO: Implement more precise work completion errors, + * see enum ib_wc_status in ib_verbs.h + */ + siw_dbg_qp(qp, "wqe type %d processing failed: %d\n", + tx_type(wqe), rv); + + spin_lock_irqsave(&qp->sq_lock, flags); + /* + * RREQ may have already been completed by inbound RRESP! + */ + if (tx_type == SIW_OP_READ || + tx_type == SIW_OP_READ_LOCAL_INV) { + /* Cleanup pending entry in ORQ */ + qp->orq_put--; + qp->orq[qp->orq_put % qp->attrs.orq_size].flags = 0; + } + spin_unlock_irqrestore(&qp->sq_lock, flags); + /* + * immediately suspends further TX processing + */ + if (!qp->tx_ctx.tx_suspend) + siw_qp_cm_drop(qp, 0); + + switch (tx_type) { + case SIW_OP_SEND: + case SIW_OP_SEND_REMOTE_INV: + case SIW_OP_SEND_WITH_IMM: + case SIW_OP_WRITE: + case SIW_OP_READ: + case SIW_OP_READ_LOCAL_INV: + siw_wqe_put_mem(wqe, tx_type); + case SIW_OP_INVAL_STAG: + case SIW_OP_REG_MR: + siw_sqe_complete(qp, &wqe->sqe, wqe->bytes, + SIW_WC_LOC_QP_OP_ERR); + + siw_qp_event(qp, IB_EVENT_QP_FATAL); + + break; + + case SIW_OP_READ_RESPONSE: + siw_dbg_qp(qp, "proc. 
read.response failed: %d\n", rv); + + siw_qp_event(qp, IB_EVENT_QP_REQ_ERR); + + siw_wqe_put_mem(wqe, SIW_OP_READ_RESPONSE); + + break; + + default: + WARN(1, "undefined WQE type %d\n", tx_type); + rv = -EINVAL; + } + wqe->wr_status = SIW_WR_IDLE; + } +done: + return rv; +} + +static void siw_sq_resume(struct siw_qp *qp) +{ + if (down_read_trylock(&qp->state_lock)) { + if (likely(qp->attrs.state == SIW_QP_STATE_RTS && + !qp->tx_ctx.tx_suspend)) { + int rv = siw_qp_sq_process(qp); + + up_read(&qp->state_lock); + + if (unlikely(rv < 0)) { + siw_dbg_qp(qp, "SQ task failed: err %d\n", rv); + + if (!qp->tx_ctx.tx_suspend) + siw_qp_cm_drop(qp, 0); + } + } else { + up_read(&qp->state_lock); + } + } else { + siw_dbg_qp(qp, "Resume SQ while QP locked\n"); + } + siw_qp_put(qp); +} + +struct tx_task_t { + struct llist_head active; + wait_queue_head_t waiting; +}; + +static DEFINE_PER_CPU(struct tx_task_t, tx_task_g); + +void siw_stop_tx_thread(int nr_cpu) +{ + kthread_stop(siw_tx_thread[nr_cpu]); + wake_up(&per_cpu(tx_task_g, nr_cpu).waiting); +} + +int siw_run_sq(void *data) +{ + const int nr_cpu = (unsigned int)(long)data; + struct llist_node *active; + struct siw_qp *qp; + struct tx_task_t *tx_task = &per_cpu(tx_task_g, nr_cpu); + + init_llist_head(&tx_task->active); + init_waitqueue_head(&tx_task->waiting); + + pr_info("Started siw TX thread on CPU %u\n", nr_cpu); + + while (1) { + struct llist_node *fifo_list = NULL; + + wait_event_interruptible(tx_task->waiting, + !llist_empty(&tx_task->active) || + kthread_should_stop()); + + if (kthread_should_stop()) + break; + + active = llist_del_all(&tx_task->active); + /* + * llist_del_all returns a list with newest entry first. + * Re-order list for fairness among QP's. + */ + while (active) { + struct llist_node *tmp = active; + + active = llist_next(active); + tmp->next = fifo_list; + fifo_list = tmp; + } + while (fifo_list) { + qp = container_of(fifo_list, struct siw_qp, tx_list); + fifo_list = llist_next(fifo_list); + qp->tx_list.next = NULL; + + siw_sq_resume(qp); + } + } + active = llist_del_all(&tx_task->active); + if (active) { + llist_for_each_entry(qp, active, tx_list) { + qp->tx_list.next = NULL; + siw_sq_resume(qp); + } + } + pr_info("Stopped siw TX thread on CPU %u\n", nr_cpu); + + return 0; +} + +int siw_sq_start(struct siw_qp *qp) +{ + if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) + return 0; + + if (unlikely(!cpu_online(qp->tx_cpu))) { + siw_put_tx_cpu(qp->tx_cpu); + qp->tx_cpu = siw_get_tx_cpu(qp->sdev); + if (qp->tx_cpu < 0) { + pr_warn("siw: no tx cpu available\n"); + + return -EIO; + } + } + siw_qp_get(qp); + + llist_add(&qp->tx_list, &per_cpu(tx_task_g, qp->tx_cpu).active); + + wake_up(&per_cpu(tx_task_g, qp->tx_cpu).waiting); + + return 0; +} From 8b6a361b8c482f22ac99c3273285ff16b23fba91 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 20 Jun 2019 18:21:31 +0200 Subject: [PATCH 127/194] rdma/siw: receive path Broken up commit to add the Soft iWarp RDMA driver. 
Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_qp_rx.c | 1456 +++++++++++++++++++++++++ 1 file changed, 1456 insertions(+) create mode 100644 drivers/infiniband/sw/siw/siw_qp_rx.c diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c new file mode 100644 index 000000000000..682a290bc11e --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_qp_rx.c @@ -0,0 +1,1456 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include +#include +#include +#include +#include + +#include +#include + +#include "siw.h" +#include "siw_verbs.h" +#include "siw_mem.h" + +/* + * siw_rx_umem() + * + * Receive data of @len into target referenced by @dest_addr. + * + * @srx: Receive Context + * @umem: siw representation of target memory + * @dest_addr: user virtual address + * @len: number of bytes to place + */ +static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem, + u64 dest_addr, int len) +{ + int copied = 0; + + while (len) { + struct page *p; + int pg_off, bytes, rv; + void *dest; + + p = siw_get_upage(umem, dest_addr); + if (unlikely(!p)) { + pr_warn("siw: %s: [QP %u]: bogus addr: %p, %p\n", + __func__, qp_id(rx_qp(srx)), + (void *)dest_addr, (void *)umem->fp_addr); + /* siw internal error */ + srx->skb_copied += copied; + srx->skb_new -= copied; + + return -EFAULT; + } + pg_off = dest_addr & ~PAGE_MASK; + bytes = min(len, (int)PAGE_SIZE - pg_off); + + siw_dbg_qp(rx_qp(srx), "page %p, bytes=%u\n", p, bytes); + + dest = kmap_atomic(p); + rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off, + bytes); + + if (unlikely(rv)) { + kunmap_atomic(dest); + srx->skb_copied += copied; + srx->skb_new -= copied; + + pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n", + qp_id(rx_qp(srx)), __func__, len, p, rv); + + return -EFAULT; + } + if (srx->mpa_crc_hd) { + if (rx_qp(srx)->kernel_verbs) { + crypto_shash_update(srx->mpa_crc_hd, + (u8 *)(dest + pg_off), bytes); + kunmap_atomic(dest); + } else { + kunmap_atomic(dest); + /* + * Do CRC on original, not target buffer. + * Some user land applications may + * concurrently write the target buffer, + * which would yield a broken CRC. + * Walking the skb twice is very ineffcient. + * Folding the CRC into skb_copy_bits() + * would be much better, but is currently + * not supported. 
+ */ + siw_crc_skb(srx, bytes); + } + } else { + kunmap_atomic(dest); + } + srx->skb_offset += bytes; + copied += bytes; + len -= bytes; + dest_addr += bytes; + pg_off = 0; + } + srx->skb_copied += copied; + srx->skb_new -= copied; + + return copied; +} + +static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len) +{ + int rv; + + siw_dbg_qp(rx_qp(srx), "kva: 0x%p, len: %u\n", kva, len); + + rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len); + if (unlikely(rv)) { + pr_warn("siw: [QP %u]: %s, len %d, kva 0x%p, rv %d\n", + qp_id(rx_qp(srx)), __func__, len, kva, rv); + + return rv; + } + if (srx->mpa_crc_hd) + crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len); + + srx->skb_offset += len; + srx->skb_copied += len; + srx->skb_new -= len; + + return len; +} + +static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx, + struct siw_mem *mem, u64 addr, int len) +{ + struct siw_pbl *pbl = mem->pbl; + u64 offset = addr - mem->va; + int copied = 0; + + while (len) { + int bytes; + u64 buf_addr = + siw_pbl_get_buffer(pbl, offset, &bytes, pbl_idx); + if (!buf_addr) + break; + + bytes = min(bytes, len); + if (siw_rx_kva(srx, (void *)buf_addr, bytes) == bytes) { + copied += bytes; + offset += bytes; + len -= bytes; + } else { + break; + } + } + return copied; +} + +/* + * siw_rresp_check_ntoh() + * + * Check incoming RRESP fragment header against expected + * header values and update expected values for potential next + * fragment. + * + * NOTE: This function must be called only if a RRESP DDP segment + * starts but not for fragmented consecutive pieces of an + * already started DDP segment. + */ +static int siw_rresp_check_ntoh(struct siw_rx_stream *srx, + struct siw_rx_fpdu *frx) +{ + struct iwarp_rdma_rresp *rresp = &srx->hdr.rresp; + struct siw_wqe *wqe = &frx->wqe_active; + enum ddp_ecode ecode; + + u32 sink_stag = be32_to_cpu(rresp->sink_stag); + u64 sink_to = be64_to_cpu(rresp->sink_to); + + if (frx->first_ddp_seg) { + srx->ddp_stag = wqe->sqe.sge[0].lkey; + srx->ddp_to = wqe->sqe.sge[0].laddr; + frx->pbl_idx = 0; + } + /* Below checks extend beyond the semantics of DDP, and + * into RDMAP: + * We check if the read response matches exactly the + * read request which was send to the remote peer to + * trigger this read response. RFC5040/5041 do not + * always have a proper error code for the detected + * error cases. We choose 'base or bounds error' for + * cases where the inbound STag is valid, but offset + * or length do not match our response receive state. 
+ */ + if (unlikely(srx->ddp_stag != sink_stag)) { + pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n", + qp_id(rx_qp(srx)), sink_stag, srx->ddp_stag); + ecode = DDP_ECODE_T_INVALID_STAG; + goto error; + } + if (unlikely(srx->ddp_to != sink_to)) { + pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n", + qp_id(rx_qp(srx)), (unsigned long long)sink_to, + (unsigned long long)srx->ddp_to); + ecode = DDP_ECODE_T_BASE_BOUNDS; + goto error; + } + if (unlikely(!frx->more_ddp_segs && + (wqe->processed + srx->fpdu_part_rem != wqe->bytes))) { + pr_warn("siw: [QP %u]: rresp len: %d != %d\n", + qp_id(rx_qp(srx)), + wqe->processed + srx->fpdu_part_rem, wqe->bytes); + ecode = DDP_ECODE_T_BASE_BOUNDS; + goto error; + } + return 0; +error: + siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP, + DDP_ETYPE_TAGGED_BUF, ecode, 0); + return -EINVAL; +} + +/* + * siw_write_check_ntoh() + * + * Check incoming WRITE fragment header against expected + * header values and update expected values for potential next + * fragment + * + * NOTE: This function must be called only if a WRITE DDP segment + * starts but not for fragmented consecutive pieces of an + * already started DDP segment. + */ +static int siw_write_check_ntoh(struct siw_rx_stream *srx, + struct siw_rx_fpdu *frx) +{ + struct iwarp_rdma_write *write = &srx->hdr.rwrite; + enum ddp_ecode ecode; + + u32 sink_stag = be32_to_cpu(write->sink_stag); + u64 sink_to = be64_to_cpu(write->sink_to); + + if (frx->first_ddp_seg) { + srx->ddp_stag = sink_stag; + srx->ddp_to = sink_to; + frx->pbl_idx = 0; + } else { + if (unlikely(srx->ddp_stag != sink_stag)) { + pr_warn("siw: [QP %u]: write stag: %08x != %08x\n", + qp_id(rx_qp(srx)), sink_stag, + srx->ddp_stag); + ecode = DDP_ECODE_T_INVALID_STAG; + goto error; + } + if (unlikely(srx->ddp_to != sink_to)) { + pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n", + qp_id(rx_qp(srx)), + (unsigned long long)sink_to, + (unsigned long long)srx->ddp_to); + ecode = DDP_ECODE_T_BASE_BOUNDS; + goto error; + } + } + return 0; +error: + siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP, + DDP_ETYPE_TAGGED_BUF, ecode, 0); + return -EINVAL; +} + +/* + * siw_send_check_ntoh() + * + * Check incoming SEND fragment header against expected + * header values and update expected MSN if no next + * fragment expected + * + * NOTE: This function must be called only if a SEND DDP segment + * starts but not for fragmented consecutive pieces of an + * already started DDP segment. 
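The untagged checks that follow lean on DDP's per-queue accounting: the MSN numbers the messages on the SEND queue and must advance by one per message, while the MO is the byte offset of this segment inside the current message and therefore has to equal what has already been placed. A stand-alone sketch of that bookkeeping; the structure and names are invented, and advancing the MSN once a message completes is left out:

/* Illustration only: minimal untagged MSN/MO receive-side bookkeeping. */
struct demo_untagged_rx {
	unsigned int expected_msn;	/* next message number on the SEND queue */
	unsigned int placed;		/* bytes of the current message placed   */
};

static int segment_acceptable(struct demo_untagged_rx *rx,
			      unsigned int msn, unsigned int mo, unsigned int len)
{
	if (msn != rx->expected_msn || mo != rx->placed)
		return 0;		/* out of sequence or misplaced segment */

	rx->placed += len;		/* MO of the next segment must match this */
	return 1;
}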
+ */ +static int siw_send_check_ntoh(struct siw_rx_stream *srx, + struct siw_rx_fpdu *frx) +{ + struct iwarp_send_inv *send = &srx->hdr.send_inv; + struct siw_wqe *wqe = &frx->wqe_active; + enum ddp_ecode ecode; + + u32 ddp_msn = be32_to_cpu(send->ddp_msn); + u32 ddp_mo = be32_to_cpu(send->ddp_mo); + u32 ddp_qn = be32_to_cpu(send->ddp_qn); + + if (unlikely(ddp_qn != RDMAP_UNTAGGED_QN_SEND)) { + pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n", + qp_id(rx_qp(srx)), ddp_qn); + ecode = DDP_ECODE_UT_INVALID_QN; + goto error; + } + if (unlikely(ddp_msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) { + pr_warn("siw: [QP %u]: send msn: %u != %u\n", + qp_id(rx_qp(srx)), ddp_msn, + srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]); + ecode = DDP_ECODE_UT_INVALID_MSN_RANGE; + goto error; + } + if (unlikely(ddp_mo != wqe->processed)) { + pr_warn("siw: [QP %u], send mo: %u != %u\n", + qp_id(rx_qp(srx)), ddp_mo, wqe->processed); + ecode = DDP_ECODE_UT_INVALID_MO; + goto error; + } + if (frx->first_ddp_seg) { + /* initialize user memory write position */ + frx->sge_idx = 0; + frx->sge_off = 0; + frx->pbl_idx = 0; + + /* only valid for SEND_INV and SEND_SE_INV operations */ + srx->inval_stag = be32_to_cpu(send->inval_stag); + } + if (unlikely(wqe->bytes < wqe->processed + srx->fpdu_part_rem)) { + siw_dbg_qp(rx_qp(srx), "receive space short: %d - %d < %d\n", + wqe->bytes, wqe->processed, srx->fpdu_part_rem); + wqe->wc_status = SIW_WC_LOC_LEN_ERR; + ecode = DDP_ECODE_UT_INVALID_MSN_NOBUF; + goto error; + } + return 0; +error: + siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP, + DDP_ETYPE_UNTAGGED_BUF, ecode, 0); + return -EINVAL; +} + +static struct siw_wqe *siw_rqe_get(struct siw_qp *qp) +{ + struct siw_rqe *rqe; + struct siw_srq *srq; + struct siw_wqe *wqe = NULL; + bool srq_event = false; + unsigned long flags; + + srq = qp->srq; + if (srq) { + spin_lock_irqsave(&srq->lock, flags); + if (unlikely(!srq->num_rqe)) + goto out; + + rqe = &srq->recvq[srq->rq_get % srq->num_rqe]; + } else { + if (unlikely(!qp->recvq)) + goto out; + + rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size]; + } + if (likely(rqe->flags == SIW_WQE_VALID)) { + int num_sge = rqe->num_sge; + + if (likely(num_sge <= SIW_MAX_SGE)) { + int i = 0; + + wqe = rx_wqe(&qp->rx_untagged); + rx_type(wqe) = SIW_OP_RECEIVE; + wqe->wr_status = SIW_WR_INPROGRESS; + wqe->bytes = 0; + wqe->processed = 0; + + wqe->rqe.id = rqe->id; + wqe->rqe.num_sge = num_sge; + + while (i < num_sge) { + wqe->rqe.sge[i].laddr = rqe->sge[i].laddr; + wqe->rqe.sge[i].lkey = rqe->sge[i].lkey; + wqe->rqe.sge[i].length = rqe->sge[i].length; + wqe->bytes += wqe->rqe.sge[i].length; + wqe->mem[i] = NULL; + i++; + } + /* can be re-used by appl */ + smp_store_mb(rqe->flags, 0); + } else { + siw_dbg_qp(qp, "too many sge's: %d\n", rqe->num_sge); + if (srq) + spin_unlock_irqrestore(&srq->lock, flags); + return NULL; + } + if (!srq) { + qp->rq_get++; + } else { + if (srq->armed) { + /* Test SRQ limit */ + u32 off = (srq->rq_get + srq->limit) % + srq->num_rqe; + struct siw_rqe *rqe2 = &srq->recvq[off]; + + if (!(rqe2->flags & SIW_WQE_VALID)) { + srq->armed = 0; + srq_event = true; + } + } + srq->rq_get++; + } + } +out: + if (srq) { + spin_unlock_irqrestore(&srq->lock, flags); + if (srq_event) + siw_srq_event(srq, IB_EVENT_SRQ_LIMIT_REACHED); + } + return wqe; +} + +/* + * siw_proc_send: + * + * Process one incoming SEND and place data into memory referenced by + * receive wqe. 
+ * + * Function supports partially received sends (suspending/resuming + * current receive wqe processing) + * + * return value: + * 0: reached the end of a DDP segment + * -EAGAIN: to be called again to finish the DDP segment + */ +int siw_proc_send(struct siw_qp *qp) +{ + struct siw_rx_stream *srx = &qp->rx_stream; + struct siw_rx_fpdu *frx = &qp->rx_untagged; + struct siw_wqe *wqe; + u32 data_bytes; /* all data bytes available */ + u32 rcvd_bytes; /* sum of data bytes rcvd */ + int rv = 0; + + if (frx->first_ddp_seg) { + wqe = siw_rqe_get(qp); + if (unlikely(!wqe)) { + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_UNTAGGED_BUF, + DDP_ECODE_UT_INVALID_MSN_NOBUF, 0); + return -ENOENT; + } + } else { + wqe = rx_wqe(frx); + } + if (srx->state == SIW_GET_DATA_START) { + rv = siw_send_check_ntoh(srx, frx); + if (unlikely(rv)) { + siw_qp_event(qp, IB_EVENT_QP_FATAL); + return rv; + } + if (!srx->fpdu_part_rem) /* zero length SEND */ + return 0; + } + data_bytes = min(srx->fpdu_part_rem, srx->skb_new); + rcvd_bytes = 0; + + /* A zero length SEND will skip below loop */ + while (data_bytes) { + struct ib_pd *pd; + struct siw_mem **mem, *mem_p; + struct siw_sge *sge; + u32 sge_bytes; /* data bytes avail for SGE */ + + sge = &wqe->rqe.sge[frx->sge_idx]; + + if (!sge->length) { + /* just skip empty sge's */ + frx->sge_idx++; + frx->sge_off = 0; + frx->pbl_idx = 0; + continue; + } + sge_bytes = min(data_bytes, sge->length - frx->sge_off); + mem = &wqe->mem[frx->sge_idx]; + + /* + * check with QP's PD if no SRQ present, SRQ's PD otherwise + */ + pd = qp->srq == NULL ? qp->pd : qp->srq->base_srq.pd; + + rv = siw_check_sge(pd, sge, mem, IB_ACCESS_LOCAL_WRITE, + frx->sge_off, sge_bytes); + if (unlikely(rv)) { + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_CATASTROPHIC, + DDP_ECODE_CATASTROPHIC, 0); + + siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR); + break; + } + mem_p = *mem; + if (mem_p->mem_obj == NULL) + rv = siw_rx_kva(srx, + (void *)(sge->laddr + frx->sge_off), + sge_bytes); + else if (!mem_p->is_pbl) + rv = siw_rx_umem(srx, mem_p->umem, + sge->laddr + frx->sge_off, sge_bytes); + else + rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p, + sge->laddr + frx->sge_off, sge_bytes); + + if (unlikely(rv != sge_bytes)) { + wqe->processed += rcvd_bytes; + + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_CATASTROPHIC, + DDP_ECODE_CATASTROPHIC, 0); + return -EINVAL; + } + frx->sge_off += rv; + + if (frx->sge_off == sge->length) { + frx->sge_idx++; + frx->sge_off = 0; + frx->pbl_idx = 0; + } + data_bytes -= rv; + rcvd_bytes += rv; + + srx->fpdu_part_rem -= rv; + srx->fpdu_part_rcvd += rv; + } + wqe->processed += rcvd_bytes; + + if (!srx->fpdu_part_rem) + return 0; + + return (rv < 0) ? 
rv : -EAGAIN; +} + +/* + * siw_proc_write: + * + * Place incoming WRITE after referencing and checking target buffer + + * Function supports partially received WRITEs (suspending/resuming + * current receive processing) + * + * return value: + * 0: reached the end of a DDP segment + * -EAGAIN: to be called again to finish the DDP segment + */ +int siw_proc_write(struct siw_qp *qp) +{ + struct siw_rx_stream *srx = &qp->rx_stream; + struct siw_rx_fpdu *frx = &qp->rx_tagged; + struct siw_mem *mem; + int bytes, rv; + + if (srx->state == SIW_GET_DATA_START) { + if (!srx->fpdu_part_rem) /* zero length WRITE */ + return 0; + + rv = siw_write_check_ntoh(srx, frx); + if (unlikely(rv)) { + siw_qp_event(qp, IB_EVENT_QP_FATAL); + return rv; + } + } + bytes = min(srx->fpdu_part_rem, srx->skb_new); + + if (frx->first_ddp_seg) { + struct siw_wqe *wqe = rx_wqe(frx); + + rx_mem(frx) = siw_mem_id2obj(qp->sdev, srx->ddp_stag >> 8); + if (unlikely(!rx_mem(frx))) { + siw_dbg_qp(qp, + "sink stag not found/invalid, stag 0x%08x\n", + srx->ddp_stag); + + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_TAGGED_BUF, + DDP_ECODE_T_INVALID_STAG, 0); + return -EINVAL; + } + wqe->rqe.num_sge = 1; + rx_type(wqe) = SIW_OP_WRITE; + wqe->wr_status = SIW_WR_INPROGRESS; + } + mem = rx_mem(frx); + + /* + * Check if application re-registered memory with different + * key field of STag. + */ + if (unlikely(mem->stag != srx->ddp_stag)) { + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_TAGGED_BUF, + DDP_ECODE_T_INVALID_STAG, 0); + return -EINVAL; + } + rv = siw_check_mem(qp->pd, mem, srx->ddp_to + srx->fpdu_part_rcvd, + IB_ACCESS_REMOTE_WRITE, bytes); + if (unlikely(rv)) { + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv), + 0); + + siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR); + + return -EINVAL; + } + + if (mem->mem_obj == NULL) + rv = siw_rx_kva(srx, + (void *)(srx->ddp_to + srx->fpdu_part_rcvd), + bytes); + else if (!mem->is_pbl) + rv = siw_rx_umem(srx, mem->umem, + srx->ddp_to + srx->fpdu_part_rcvd, bytes); + else + rv = siw_rx_pbl(srx, &frx->pbl_idx, mem, + srx->ddp_to + srx->fpdu_part_rcvd, bytes); + + if (unlikely(rv != bytes)) { + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_CATASTROPHIC, + DDP_ECODE_CATASTROPHIC, 0); + return -EINVAL; + } + srx->fpdu_part_rem -= rv; + srx->fpdu_part_rcvd += rv; + + if (!srx->fpdu_part_rem) { + srx->ddp_to += srx->fpdu_part_rcvd; + return 0; + } + return -EAGAIN; +} + +/* + * Inbound RREQ's cannot carry user data. + */ +int siw_proc_rreq(struct siw_qp *qp) +{ + struct siw_rx_stream *srx = &qp->rx_stream; + + if (!srx->fpdu_part_rem) + return 0; + + pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp), + be16_to_cpu(srx->hdr.ctrl.mpa_len)); + + return -EPROTO; +} + +/* + * siw_init_rresp: + * + * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE. + * Put it at the tail of the IRQ, if there is another WQE currently in + * transmit processing. If not, make it the current WQE to be processed + * and schedule transmit processing. + * + * Can be called from softirq context and from process + * context (RREAD socket loopback case!) 
+ * + * return value: + * 0: success, + * failure code otherwise + */ + +static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx) +{ + struct siw_wqe *tx_work = tx_wqe(qp); + struct siw_sqe *resp; + + uint64_t raddr = be64_to_cpu(srx->hdr.rreq.sink_to), + laddr = be64_to_cpu(srx->hdr.rreq.source_to); + uint32_t length = be32_to_cpu(srx->hdr.rreq.read_size), + lkey = be32_to_cpu(srx->hdr.rreq.source_stag), + rkey = be32_to_cpu(srx->hdr.rreq.sink_stag), + msn = be32_to_cpu(srx->hdr.rreq.ddp_msn); + + int run_sq = 1, rv = 0; + unsigned long flags; + + if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ])) { + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_UNTAGGED_BUF, + DDP_ECODE_UT_INVALID_MSN_RANGE, 0); + return -EPROTO; + } + spin_lock_irqsave(&qp->sq_lock, flags); + + if (tx_work->wr_status == SIW_WR_IDLE) { + /* + * immediately schedule READ response w/o + * consuming IRQ entry: IRQ must be empty. + */ + tx_work->processed = 0; + tx_work->mem[0] = NULL; + tx_work->wr_status = SIW_WR_QUEUED; + resp = &tx_work->sqe; + } else { + resp = irq_alloc_free(qp); + run_sq = 0; + } + if (likely(resp)) { + resp->opcode = SIW_OP_READ_RESPONSE; + + resp->sge[0].length = length; + resp->sge[0].laddr = laddr; + resp->sge[0].lkey = lkey; + + /* Keep aside message sequence number for potential + * error reporting during Read Response generation. + */ + resp->sge[1].length = msn; + + resp->raddr = raddr; + resp->rkey = rkey; + resp->num_sge = length ? 1 : 0; + + /* RRESP now valid as current TX wqe or placed into IRQ */ + smp_store_mb(resp->flags, SIW_WQE_VALID); + } else { + pr_warn("siw: [QP %u]: irq %d exceeded %d\n", qp_id(qp), + qp->irq_put % qp->attrs.irq_size, qp->attrs.irq_size); + + siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP, + RDMAP_ETYPE_REMOTE_OPERATION, + RDMAP_ECODE_CATASTROPHIC_STREAM, 0); + rv = -EPROTO; + } + + spin_unlock_irqrestore(&qp->sq_lock, flags); + + if (run_sq) + rv = siw_sq_start(qp); + + return rv; +} + +/* + * Only called at start of Read.Resonse processing. + * Transfer pending Read from tip of ORQ into currrent rx wqe, + * but keep ORQ entry valid until Read.Response processing done. + * No Queue locking needed. 
+ */ +static int siw_orqe_start_rx(struct siw_qp *qp) +{ + struct siw_sqe *orqe; + struct siw_wqe *wqe = NULL; + + /* make sure ORQ indices are current */ + smp_mb(); + + orqe = orq_get_current(qp); + if (READ_ONCE(orqe->flags) & SIW_WQE_VALID) { + /* RRESP is a TAGGED RDMAP operation */ + wqe = rx_wqe(&qp->rx_tagged); + wqe->sqe.id = orqe->id; + wqe->sqe.opcode = orqe->opcode; + wqe->sqe.sge[0].laddr = orqe->sge[0].laddr; + wqe->sqe.sge[0].lkey = orqe->sge[0].lkey; + wqe->sqe.sge[0].length = orqe->sge[0].length; + wqe->sqe.flags = orqe->flags; + wqe->sqe.num_sge = 1; + wqe->bytes = orqe->sge[0].length; + wqe->processed = 0; + wqe->mem[0] = NULL; + /* make sure WQE is completely written before valid */ + smp_wmb(); + wqe->wr_status = SIW_WR_INPROGRESS; + + return 0; + } + return -EPROTO; +} + +/* + * siw_proc_rresp: + * + * Place incoming RRESP data into memory referenced by RREQ WQE + * which is at the tip of the ORQ + * + * Function supports partially received RRESP's (suspending/resuming + * current receive processing) + */ +int siw_proc_rresp(struct siw_qp *qp) +{ + struct siw_rx_stream *srx = &qp->rx_stream; + struct siw_rx_fpdu *frx = &qp->rx_tagged; + struct siw_wqe *wqe = rx_wqe(frx); + struct siw_mem **mem, *mem_p; + struct siw_sge *sge; + int bytes, rv; + + if (frx->first_ddp_seg) { + if (unlikely(wqe->wr_status != SIW_WR_IDLE)) { + pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n", + qp_id(qp), wqe->wr_status, wqe->sqe.opcode); + rv = -EPROTO; + goto error_term; + } + /* + * fetch pending RREQ from orq + */ + rv = siw_orqe_start_rx(qp); + if (rv) { + pr_warn("siw: [QP %u]: ORQ empty at idx %d\n", + qp_id(qp), qp->orq_get % qp->attrs.orq_size); + goto error_term; + } + rv = siw_rresp_check_ntoh(srx, frx); + if (unlikely(rv)) { + siw_qp_event(qp, IB_EVENT_QP_FATAL); + return rv; + } + } else { + if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) { + pr_warn("siw: [QP %u]: resume RRESP: status %d\n", + qp_id(qp), wqe->wr_status); + rv = -EPROTO; + goto error_term; + } + } + if (!srx->fpdu_part_rem) /* zero length RRESPONSE */ + return 0; + + sge = wqe->sqe.sge; /* there is only one */ + mem = &wqe->mem[0]; + + if (!(*mem)) { + /* + * check target memory which resolves memory on first fragment + */ + rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0, + wqe->bytes); + if (unlikely(rv)) { + siw_dbg_qp(qp, "target mem check: %d\n", rv); + wqe->wc_status = SIW_WC_LOC_PROT_ERR; + + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_TAGGED_BUF, + siw_tagged_error(-rv), 0); + + siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR); + + return -EINVAL; + } + } + mem_p = *mem; + + bytes = min(srx->fpdu_part_rem, srx->skb_new); + + if (mem_p->mem_obj == NULL) + rv = siw_rx_kva(srx, (void *)(sge->laddr + wqe->processed), + bytes); + else if (!mem_p->is_pbl) + rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed, + bytes); + else + rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p, + sge->laddr + wqe->processed, bytes); + if (rv != bytes) { + wqe->wc_status = SIW_WC_GENERAL_ERR; + rv = -EINVAL; + goto error_term; + } + srx->fpdu_part_rem -= rv; + srx->fpdu_part_rcvd += rv; + wqe->processed += rv; + + if (!srx->fpdu_part_rem) { + srx->ddp_to += srx->fpdu_part_rcvd; + return 0; + } + return -EAGAIN; + +error_term: + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC, + DDP_ECODE_CATASTROPHIC, 0); + return rv; +} + +int siw_proc_terminate(struct siw_qp *qp) +{ + struct siw_rx_stream *srx = &qp->rx_stream; + struct sk_buff *skb = srx->skb; + struct iwarp_terminate *term 
= &srx->hdr.terminate; + union iwarp_hdr term_info; + u8 *infop = (u8 *)&term_info; + enum rdma_opcode op; + u16 to_copy = sizeof(struct iwarp_ctrl); + + pr_warn("siw: got TERMINATE. layer %d, type %d, code %d\n", + __rdmap_term_layer(term), __rdmap_term_etype(term), + __rdmap_term_ecode(term)); + + if (be32_to_cpu(term->ddp_qn) != RDMAP_UNTAGGED_QN_TERMINATE || + be32_to_cpu(term->ddp_msn) != + qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] || + be32_to_cpu(term->ddp_mo) != 0) { + pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n", + be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn), + be32_to_cpu(term->ddp_mo)); + return -ECONNRESET; + } + /* + * Receive remaining pieces of TERM if indicated + */ + if (!term->flag_m) + return -ECONNRESET; + + /* Do not take the effort to reassemble a network fragmented + * TERM message + */ + if (srx->skb_new < sizeof(struct iwarp_ctrl_tagged)) + return -ECONNRESET; + + memset(infop, 0, sizeof(term_info)); + + skb_copy_bits(skb, srx->skb_offset, infop, to_copy); + + op = __rdmap_get_opcode(&term_info.ctrl); + if (op >= RDMAP_TERMINATE) + goto out; + + infop += to_copy; + srx->skb_offset += to_copy; + srx->skb_new -= to_copy; + srx->skb_copied += to_copy; + srx->fpdu_part_rcvd += to_copy; + srx->fpdu_part_rem -= to_copy; + + to_copy = iwarp_pktinfo[op].hdr_len - to_copy; + + /* Again, no network fragmented TERM's */ + if (to_copy + MPA_CRC_SIZE > srx->skb_new) + return -ECONNRESET; + + skb_copy_bits(skb, srx->skb_offset, infop, to_copy); + + if (term->flag_r) { + siw_dbg_qp(qp, "TERM reports RDMAP hdr type %u, len %u (%s)\n", + op, be16_to_cpu(term_info.ctrl.mpa_len), + term->flag_m ? "valid" : "invalid"); + } else if (term->flag_d) { + siw_dbg_qp(qp, "TERM reports DDP hdr type %u, len %u (%s)\n", + op, be16_to_cpu(term_info.ctrl.mpa_len), + term->flag_m ? "valid" : "invalid"); + } +out: + srx->skb_new -= to_copy; + srx->skb_offset += to_copy; + srx->skb_copied += to_copy; + srx->fpdu_part_rcvd += to_copy; + srx->fpdu_part_rem -= to_copy; + + return -ECONNRESET; +} + +static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx) +{ + struct sk_buff *skb = srx->skb; + u8 *tbuf = (u8 *)&srx->trailer.crc - srx->pad; + __wsum crc_in, crc_own = 0; + + siw_dbg_qp(qp, "expected %d, available %d, pad %u\n", + srx->fpdu_part_rem, srx->skb_new, srx->pad); + + if (srx->skb_new < srx->fpdu_part_rem) + return -EAGAIN; + + skb_copy_bits(skb, srx->skb_offset, tbuf, srx->fpdu_part_rem); + + if (srx->mpa_crc_hd && srx->pad) + crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad); + + srx->skb_new -= srx->fpdu_part_rem; + srx->skb_offset += srx->fpdu_part_rem; + srx->skb_copied += srx->fpdu_part_rem; + + if (!srx->mpa_crc_hd) + return 0; + + /* + * CRC32 is computed, transmitted and received directly in NBO, + * so there's never a reason to convert byte order. + */ + crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own); + crc_in = (__force __wsum)srx->trailer.crc; + + if (unlikely(crc_in != crc_own)) { + pr_warn("siw: crc error. 
in: %08x, own %08x, op %u\n", + crc_in, crc_own, qp->rx_stream.rdmap_op); + + siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, + LLP_ETYPE_MPA, + LLP_ECODE_RECEIVED_CRC, 0); + return -EINVAL; + } + return 0; +} + +#define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged) + +static int siw_get_hdr(struct siw_rx_stream *srx) +{ + struct sk_buff *skb = srx->skb; + struct siw_qp *qp = rx_qp(srx); + struct iwarp_ctrl *c_hdr = &srx->hdr.ctrl; + struct siw_rx_fpdu *frx; + u8 opcode; + int bytes; + + if (srx->fpdu_part_rcvd < MIN_DDP_HDR) { + /* + * copy a mimimum sized (tagged) DDP frame control part + */ + bytes = min_t(int, srx->skb_new, + MIN_DDP_HDR - srx->fpdu_part_rcvd); + + skb_copy_bits(skb, srx->skb_offset, + (char *)c_hdr + srx->fpdu_part_rcvd, bytes); + + srx->fpdu_part_rcvd += bytes; + + srx->skb_new -= bytes; + srx->skb_offset += bytes; + srx->skb_copied += bytes; + + if (srx->fpdu_part_rcvd < MIN_DDP_HDR) + return -EAGAIN; + + if (unlikely(__ddp_get_version(c_hdr) != DDP_VERSION)) { + enum ddp_etype etype; + enum ddp_ecode ecode; + + pr_warn("siw: received ddp version unsupported %d\n", + __ddp_get_version(c_hdr)); + + if (c_hdr->ddp_rdmap_ctrl & DDP_FLAG_TAGGED) { + etype = DDP_ETYPE_TAGGED_BUF; + ecode = DDP_ECODE_T_VERSION; + } else { + etype = DDP_ETYPE_UNTAGGED_BUF; + ecode = DDP_ECODE_UT_VERSION; + } + siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP, + etype, ecode, 0); + return -EINVAL; + } + if (unlikely(__rdmap_get_version(c_hdr) != RDMAP_VERSION)) { + pr_warn("siw: received rdmap version unsupported %d\n", + __rdmap_get_version(c_hdr)); + + siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP, + RDMAP_ETYPE_REMOTE_OPERATION, + RDMAP_ECODE_VERSION, 0); + return -EINVAL; + } + opcode = __rdmap_get_opcode(c_hdr); + + if (opcode > RDMAP_TERMINATE) { + pr_warn("siw: received unknown packet type %u\n", + opcode); + + siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP, + RDMAP_ETYPE_REMOTE_OPERATION, + RDMAP_ECODE_OPCODE, 0); + return -EINVAL; + } + siw_dbg_qp(rx_qp(srx), "new header, opcode %u\n", opcode); + } else { + opcode = __rdmap_get_opcode(c_hdr); + } + set_rx_fpdu_context(qp, opcode); + frx = qp->rx_fpdu; + + /* + * Figure out len of current hdr: variable length of + * iwarp hdr may force us to copy hdr information in + * two steps. Only tagged DDP messages are already + * completely received. + */ + if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) { + bytes = iwarp_pktinfo[opcode].hdr_len - MIN_DDP_HDR; + + if (srx->skb_new < bytes) + return -EAGAIN; + + skb_copy_bits(skb, srx->skb_offset, + (char *)c_hdr + srx->fpdu_part_rcvd, bytes); + + srx->fpdu_part_rcvd += bytes; + + srx->skb_new -= bytes; + srx->skb_offset += bytes; + srx->skb_copied += bytes; + } + + /* + * DDP/RDMAP header receive completed. Check if the current + * DDP segment starts a new RDMAP message or continues a previously + * started RDMAP message. + * + * Alternating reception of DDP segments (or FPDUs) from incomplete + * tagged and untagged RDMAP messages is supported, as long as + * the current tagged or untagged message gets eventually completed + * w/o intersection from another message of the same type + * (tagged/untagged). E.g., a WRITE can get intersected by a SEND, + * but not by a READ RESPONSE etc. 
+ */ + if (srx->mpa_crc_hd) { + /* + * Restart CRC computation + */ + crypto_shash_init(srx->mpa_crc_hd); + crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr, + srx->fpdu_part_rcvd); + } + if (frx->more_ddp_segs) { + frx->first_ddp_seg = 0; + if (frx->prev_rdmap_op != opcode) { + pr_warn("siw: packet intersection: %u : %u\n", + frx->prev_rdmap_op, opcode); + /* + * The last inbound RDMA operation of same type + * (tagged or untagged) is left unfinished. + * To complete it in error, make it the current + * operation again, even with the header already + * overwritten. For error handling, only the opcode + * and current rx context are relevant. + */ + set_rx_fpdu_context(qp, frx->prev_rdmap_op); + __rdmap_set_opcode(c_hdr, frx->prev_rdmap_op); + return -EPROTO; + } + } else { + frx->prev_rdmap_op = opcode; + frx->first_ddp_seg = 1; + } + frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1; + + return 0; +} + +static int siw_check_tx_fence(struct siw_qp *qp) +{ + struct siw_wqe *tx_waiting = tx_wqe(qp); + struct siw_sqe *rreq; + int resume_tx = 0, rv = 0; + unsigned long flags; + + spin_lock_irqsave(&qp->orq_lock, flags); + + rreq = orq_get_current(qp); + + /* free current orq entry */ + WRITE_ONCE(rreq->flags, 0); + + if (qp->tx_ctx.orq_fence) { + if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) { + pr_warn("siw: [QP %u]: fence resume: bad status %d\n", + qp_id(qp), tx_waiting->wr_status); + rv = -EPROTO; + goto out; + } + /* resume SQ processing */ + if (tx_waiting->sqe.opcode == SIW_OP_READ || + tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) { + rreq = orq_get_tail(qp); + if (unlikely(!rreq)) { + pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp)); + rv = -EPROTO; + goto out; + } + siw_read_to_orq(rreq, &tx_waiting->sqe); + + qp->orq_put++; + qp->tx_ctx.orq_fence = 0; + resume_tx = 1; + + } else if (siw_orq_empty(qp)) { + qp->tx_ctx.orq_fence = 0; + resume_tx = 1; + } else { + pr_warn("siw: [QP %u]: fence resume: orq idx: %d:%d\n", + qp_id(qp), qp->orq_get, qp->orq_put); + rv = -EPROTO; + } + } + qp->orq_get++; +out: + spin_unlock_irqrestore(&qp->orq_lock, flags); + + if (resume_tx) + rv = siw_sq_start(qp); + + return rv; +} + +/* + * siw_rdmap_complete() + * + * Complete processing of an RDMA message after receiving all + * DDP segmens or ABort processing after encountering error case. + * + * o SENDs + RRESPs will need for completion, + * o RREQs need for READ RESPONSE initialization + * o WRITEs need memory dereferencing + * + * TODO: Failed WRITEs need local error to be surfaced. + */ +static int siw_rdmap_complete(struct siw_qp *qp, int error) +{ + struct siw_rx_stream *srx = &qp->rx_stream; + struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu); + enum siw_wc_status wc_status = wqe->wc_status; + u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl); + int rv = 0; + + switch (opcode) { + case RDMAP_SEND_SE: + case RDMAP_SEND_SE_INVAL: + wqe->rqe.flags |= SIW_WQE_SOLICITED; + case RDMAP_SEND: + case RDMAP_SEND_INVAL: + if (wqe->wr_status == SIW_WR_IDLE) + break; + + srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++; + + if (error != 0 && wc_status == SIW_WC_SUCCESS) + wc_status = SIW_WC_GENERAL_ERR; + /* + * Handle STag invalidation request + */ + if (wc_status == SIW_WC_SUCCESS && + (opcode == RDMAP_SEND_INVAL || + opcode == RDMAP_SEND_SE_INVAL)) { + rv = siw_invalidate_stag(qp->pd, srx->inval_stag); + if (rv) { + siw_init_terminate( + qp, TERM_ERROR_LAYER_RDMAP, + rv == -EACCES ? 
+ RDMAP_ETYPE_REMOTE_PROTECTION : + RDMAP_ETYPE_REMOTE_OPERATION, + RDMAP_ECODE_CANNOT_INVALIDATE, 0); + + wc_status = SIW_WC_REM_INV_REQ_ERR; + } + rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed, + rv ? 0 : srx->inval_stag, + wc_status); + } else { + rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed, + 0, wc_status); + } + siw_wqe_put_mem(wqe, SIW_OP_RECEIVE); + break; + + case RDMAP_RDMA_READ_RESP: + if (wqe->wr_status == SIW_WR_IDLE) + break; + + if (error != 0) { + if ((srx->state == SIW_GET_HDR && + qp->rx_fpdu->first_ddp_seg) || error == -ENODATA) + /* possible RREQ in ORQ left untouched */ + break; + + if (wc_status == SIW_WC_SUCCESS) + wc_status = SIW_WC_GENERAL_ERR; + } else if (qp->kernel_verbs && + rx_type(wqe) == SIW_OP_READ_LOCAL_INV) { + /* + * Handle any STag invalidation request + */ + rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey); + if (rv) { + siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP, + RDMAP_ETYPE_CATASTROPHIC, + RDMAP_ECODE_UNSPECIFIED, 0); + + if (wc_status == SIW_WC_SUCCESS) { + wc_status = SIW_WC_GENERAL_ERR; + error = rv; + } + } + } + /* + * All errors turn the wqe into signalled. + */ + if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0) + rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed, + wc_status); + siw_wqe_put_mem(wqe, SIW_OP_READ); + + if (!error) + rv = siw_check_tx_fence(qp); + else + /* Disable current ORQ eleement */ + WRITE_ONCE(orq_get_current(qp)->flags, 0); + break; + + case RDMAP_RDMA_READ_REQ: + if (!error) { + rv = siw_init_rresp(qp, srx); + srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++; + } + break; + + case RDMAP_RDMA_WRITE: + if (wqe->wr_status == SIW_WR_IDLE) + break; + + /* + * Free References from memory object if + * attached to receive context (inbound WRITE). + * While a zero-length WRITE is allowed, + * no memory reference got created. + */ + if (rx_mem(&qp->rx_tagged)) { + siw_mem_put(rx_mem(&qp->rx_tagged)); + rx_mem(&qp->rx_tagged) = NULL; + } + break; + + default: + break; + } + wqe->wr_status = SIW_WR_IDLE; + + return rv; +} + +/* + * siw_tcp_rx_data() + * + * Main routine to consume inbound TCP payload + * + * @rd_desc: read descriptor + * @skb: socket buffer + * @off: offset in skb + * @len: skb->len - offset : payload in skb + */ +int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb, + unsigned int off, size_t len) +{ + struct siw_qp *qp = rd_desc->arg.data; + struct siw_rx_stream *srx = &qp->rx_stream; + int rv; + + srx->skb = skb; + srx->skb_new = skb->len - off; + srx->skb_offset = off; + srx->skb_copied = 0; + + siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new); + + while (srx->skb_new) { + int run_completion = 1; + + if (unlikely(srx->rx_suspend)) { + /* Do not process any more data */ + srx->skb_copied += srx->skb_new; + break; + } + switch (srx->state) { + case SIW_GET_HDR: + rv = siw_get_hdr(srx); + if (!rv) { + srx->fpdu_part_rem = + be16_to_cpu(srx->hdr.ctrl.mpa_len) - + srx->fpdu_part_rcvd + MPA_HDR_SIZE; + + if (srx->fpdu_part_rem) + srx->pad = -srx->fpdu_part_rem & 0x3; + else + srx->pad = 0; + + srx->state = SIW_GET_DATA_START; + srx->fpdu_part_rcvd = 0; + } + break; + + case SIW_GET_DATA_MORE: + /* + * Another data fragment of the same DDP segment. + * Setting first_ddp_seg = 0 avoids repeating + * initializations that shall occur only once per + * DDP segment. + */ + qp->rx_fpdu->first_ddp_seg = 0; + /* Fall through */ + + case SIW_GET_DATA_START: + /* + * Headers will be checked by the opcode-specific + * data receive function below. 
+ */ + rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp); + if (!rv) { + int mpa_len = + be16_to_cpu(srx->hdr.ctrl.mpa_len) + + MPA_HDR_SIZE; + + srx->fpdu_part_rem = (-mpa_len & 0x3) + + MPA_CRC_SIZE; + srx->fpdu_part_rcvd = 0; + srx->state = SIW_GET_TRAILER; + } else { + if (unlikely(rv == -ECONNRESET)) + run_completion = 0; + else + srx->state = SIW_GET_DATA_MORE; + } + break; + + case SIW_GET_TRAILER: + /* + * read CRC + any padding + */ + rv = siw_get_trailer(qp, srx); + if (likely(!rv)) { + /* + * FPDU completed. + * complete RDMAP message if last fragment + */ + srx->state = SIW_GET_HDR; + srx->fpdu_part_rcvd = 0; + + if (!(srx->hdr.ctrl.ddp_rdmap_ctrl & + DDP_FLAG_LAST)) + /* more frags */ + break; + + rv = siw_rdmap_complete(qp, 0); + run_completion = 0; + } + break; + + default: + pr_warn("QP[%u]: RX out of state\n", qp_id(qp)); + rv = -EPROTO; + run_completion = 0; + } + if (unlikely(rv != 0 && rv != -EAGAIN)) { + if ((srx->state > SIW_GET_HDR || + qp->rx_fpdu->more_ddp_segs) && run_completion) + siw_rdmap_complete(qp, rv); + + siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv, + srx->state); + + siw_qp_cm_drop(qp, 1); + + break; + } + if (rv) { + siw_dbg_qp(qp, "fpdu fragment, state %d, missing %d\n", + srx->state, srx->fpdu_part_rem); + break; + } + } + return srx->skb_copied; +} From b0fff7317bb4325ace221a24c4bfa274f0046ee4 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 20 Jun 2019 18:21:32 +0200 Subject: [PATCH 128/194] rdma/siw: completion queue methods Broken up commit to add the Soft iWarp RDMA driver. Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_cq.c | 101 +++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 drivers/infiniband/sw/siw/siw_cq.c diff --git a/drivers/infiniband/sw/siw/siw_cq.c b/drivers/infiniband/sw/siw/siw_cq.c new file mode 100644 index 000000000000..e2a0ee40d5b5 --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_cq.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include +#include + +#include + +#include "siw.h" + +static int map_wc_opcode[SIW_NUM_OPCODES] = { + [SIW_OP_WRITE] = IB_WC_RDMA_WRITE, + [SIW_OP_SEND] = IB_WC_SEND, + [SIW_OP_SEND_WITH_IMM] = IB_WC_SEND, + [SIW_OP_READ] = IB_WC_RDMA_READ, + [SIW_OP_READ_LOCAL_INV] = IB_WC_RDMA_READ, + [SIW_OP_COMP_AND_SWAP] = IB_WC_COMP_SWAP, + [SIW_OP_FETCH_AND_ADD] = IB_WC_FETCH_ADD, + [SIW_OP_INVAL_STAG] = IB_WC_LOCAL_INV, + [SIW_OP_REG_MR] = IB_WC_REG_MR, + [SIW_OP_RECEIVE] = IB_WC_RECV, + [SIW_OP_READ_RESPONSE] = -1 /* not used */ +}; + +static struct { + enum siw_opcode siw; + enum ib_wc_status ib; +} map_cqe_status[SIW_NUM_WC_STATUS] = { + { SIW_WC_SUCCESS, IB_WC_SUCCESS }, + { SIW_WC_LOC_LEN_ERR, IB_WC_LOC_LEN_ERR }, + { SIW_WC_LOC_PROT_ERR, IB_WC_LOC_PROT_ERR }, + { SIW_WC_LOC_QP_OP_ERR, IB_WC_LOC_QP_OP_ERR }, + { SIW_WC_WR_FLUSH_ERR, IB_WC_WR_FLUSH_ERR }, + { SIW_WC_BAD_RESP_ERR, IB_WC_BAD_RESP_ERR }, + { SIW_WC_LOC_ACCESS_ERR, IB_WC_LOC_ACCESS_ERR }, + { SIW_WC_REM_ACCESS_ERR, IB_WC_REM_ACCESS_ERR }, + { SIW_WC_REM_INV_REQ_ERR, IB_WC_REM_INV_REQ_ERR }, + { SIW_WC_GENERAL_ERR, IB_WC_GENERAL_ERR } +}; + +/* + * Reap one CQE from the CQ. Only used by kernel clients + * during CQ normal operation. Might be called during CQ + * flush for user mapped CQE array as well. 
+ */ +int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc) +{ + struct siw_cqe *cqe; + unsigned long flags; + + spin_lock_irqsave(&cq->lock, flags); + + cqe = &cq->queue[cq->cq_get % cq->num_cqe]; + if (READ_ONCE(cqe->flags) & SIW_WQE_VALID) { + memset(wc, 0, sizeof(*wc)); + wc->wr_id = cqe->id; + wc->status = map_cqe_status[cqe->status].ib; + wc->opcode = map_wc_opcode[cqe->opcode]; + wc->byte_len = cqe->bytes; + + /* + * During CQ flush, also user land CQE's may get + * reaped here, which do not hold a QP reference + * and do not qualify for memory extension verbs. + */ + if (likely(cq->kernel_verbs)) { + if (cqe->flags & SIW_WQE_REM_INVAL) { + wc->ex.invalidate_rkey = cqe->inval_stag; + wc->wc_flags = IB_WC_WITH_INVALIDATE; + } + wc->qp = cqe->base_qp; + siw_dbg_cq(cq, "idx %u, type %d, flags %2x, id 0x%p\n", + cq->cq_get % cq->num_cqe, cqe->opcode, + cqe->flags, (void *)cqe->id); + } + WRITE_ONCE(cqe->flags, 0); + cq->cq_get++; + + spin_unlock_irqrestore(&cq->lock, flags); + + return 1; + } + spin_unlock_irqrestore(&cq->lock, flags); + + return 0; +} + +/* + * siw_cq_flush() + * + * Flush all CQ elements. + */ +void siw_cq_flush(struct siw_cq *cq) +{ + struct ib_wc wc; + + while (siw_reap_cqe(cq, &wc)) + ; +} From c0cf5bdde46c664d583518addc19d6dabb6a8ec9 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 20 Jun 2019 18:21:33 +0200 Subject: [PATCH 129/194] rdma/siw: addition to kernel build environment Broken up commit to add the Soft iWarp RDMA driver. Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- MAINTAINERS | 7 +++++++ drivers/infiniband/Kconfig | 1 + drivers/infiniband/sw/Makefile | 1 + drivers/infiniband/sw/siw/Kconfig | 17 +++++++++++++++++ drivers/infiniband/sw/siw/Makefile | 11 +++++++++++ 5 files changed, 37 insertions(+) create mode 100644 drivers/infiniband/sw/siw/Kconfig create mode 100644 drivers/infiniband/sw/siw/Makefile diff --git a/MAINTAINERS b/MAINTAINERS index 83a62b911692..6d2de0c1520e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14558,6 +14558,13 @@ M: Chris Boot S: Maintained F: drivers/leds/leds-net48xx.c +SOFT-IWARP DRIVER (siw) +M: Bernard Metzler +L: linux-rdma@vger.kernel.org +S: Supported +F: drivers/infiniband/sw/siw/ +F: include/uapi/rdma/siw-abi.h + SOFT-ROCE DRIVER (rxe) M: Moni Shoua L: linux-rdma@vger.kernel.org diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 42af4cd40ba2..f277cb7aea29 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -96,6 +96,7 @@ source "drivers/infiniband/hw/hfi1/Kconfig" source "drivers/infiniband/hw/qedr/Kconfig" source "drivers/infiniband/sw/rdmavt/Kconfig" source "drivers/infiniband/sw/rxe/Kconfig" +source "drivers/infiniband/sw/siw/Kconfig" endif source "drivers/infiniband/ulp/ipoib/Kconfig" diff --git a/drivers/infiniband/sw/Makefile b/drivers/infiniband/sw/Makefile index ab48a9b60844..68e0230f8f31 100644 --- a/drivers/infiniband/sw/Makefile +++ b/drivers/infiniband/sw/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_INFINIBAND_RDMAVT) += rdmavt/ obj-$(CONFIG_RDMA_RXE) += rxe/ +obj-$(CONFIG_RDMA_SIW) += siw/ diff --git a/drivers/infiniband/sw/siw/Kconfig b/drivers/infiniband/sw/siw/Kconfig new file mode 100644 index 000000000000..94f684174ce3 --- /dev/null +++ b/drivers/infiniband/sw/siw/Kconfig @@ -0,0 +1,17 @@ +config RDMA_SIW + tristate "Software RDMA over TCP/IP (iWARP) driver" + depends on INET && INFINIBAND && CRYPTO_CRC32 + help + This driver implements the iWARP RDMA transport over + the Linux TCP/IP network 
stack. It enables a system with a + standard Ethernet adapter to interoperate with a iWARP + adapter or with another system running the SIW driver. + (See also RXE which is a similar software driver for RoCE.) + + The driver interfaces with the Linux RDMA stack and + implements both a kernel and user space RDMA verbs API. + The user space verbs API requires a support + library named libsiw which is loaded by the generic user + space verbs API, libibverbs. To implement RDMA over + TCP/IP, the driver further interfaces with the Linux + in-kernel TCP socket layer. diff --git a/drivers/infiniband/sw/siw/Makefile b/drivers/infiniband/sw/siw/Makefile new file mode 100644 index 000000000000..f5f7e3867889 --- /dev/null +++ b/drivers/infiniband/sw/siw/Makefile @@ -0,0 +1,11 @@ +obj-$(CONFIG_RDMA_SIW) += siw.o + +siw-y := \ + siw_cm.o \ + siw_cq.o \ + siw_main.o \ + siw_mem.o \ + siw_qp.o \ + siw_qp_tx.o \ + siw_qp_rx.o \ + siw_verbs.o From 4c44d4634b5c90993fccca9f155347221df6f877 Mon Sep 17 00:00:00 2001 From: Fuqian Huang Date: Fri, 28 Jun 2019 10:47:19 +0800 Subject: [PATCH 130/194] IB: Remove unneeded memset In commit af7ddd8a627c ("Merge tag 'dma-mapping-4.21' of git://git.infradead.org/users/hch/dma-mapping"), dma_alloc_coherent/dmam_alloc_coherent always zeroed the returned memory. So the memset after a coherent allocation function is not needed. Signed-off-by: Fuqian Huang Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb3/cxio_hal.c | 3 --- drivers/infiniband/hw/cxgb4/cq.c | 1 - drivers/infiniband/hw/cxgb4/qp.c | 1 - drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 1 - drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 3 --- drivers/infiniband/hw/mthca/mthca_allocator.c | 2 -- drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 3 --- 7 files changed, 14 deletions(-) diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c index 37ee93824349..95b22a651673 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.c +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -174,7 +174,6 @@ int cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq, int kernel) return -ENOMEM; } dma_unmap_addr_set(cq, mapping, cq->dma_addr); - memset(cq->queue, 0, size); setup.id = cq->cqid; setup.base_addr = (u64) (cq->dma_addr); setup.size = 1UL << cq->size_log2; @@ -522,8 +521,6 @@ static int cxio_hal_init_ctrl_qp(struct cxio_rdev *rdev_p) dma_unmap_addr_set(&rdev_p->ctrl_qp, mapping, rdev_p->ctrl_qp.dma_addr); rdev_p->ctrl_qp.doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr; - memset(rdev_p->ctrl_qp.workq, 0, - (1 << T3_CTRL_QP_SIZE_LOG2) * sizeof(union t3_wr)); mutex_init(&rdev_p->ctrl_qp.lock); init_waitqueue_head(&rdev_p->ctrl_qp.waitq); diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index 3cc4d3331a3f..b1bb61c65f4f 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -102,7 +102,6 @@ static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, goto err3; } dma_unmap_addr_set(cq, mapping, cq->dma_addr); - memset(cq->queue, 0, cq->memsize); if (user && ucontext->is_32b_cqe) { cq->qp_errp = &((struct t4_status_page *) diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 9523986d62b1..eb9368be28c1 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -274,7 +274,6 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, (unsigned long long)virt_to_phys(wq->sq.queue), wq->rq.queue, (unsigned long 
long)virt_to_phys(wq->rq.queue)); - memset(wq->rq.queue, 0, wq->rq.memsize); dma_unmap_addr_set(&wq->rq, mapping, wq->rq.dma_addr); } diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index 056a6873df7a..998431c39b8d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -4265,7 +4265,6 @@ static int hns_roce_v1_create_eq(struct hns_roce_dev *hr_dev, } eq->buf_list[i].map = tmp_dma_addr; - memset(eq->buf_list[i].buf, 0, HNS_ROCE_BA_SIZE); } eq->cons_index = 0; roce_set_field(tmp, ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 7f2da5e10e67..107330df6ce8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1818,7 +1818,6 @@ static int hns_roce_init_link_table(struct hns_roce_dev *hr_dev, goto err_alloc_buf_failed; link_tbl->pg_list[i].map = t; - memset(link_tbl->pg_list[i].buf, 0, buf_chk_sz); entry[i].blk_ba0 = (t >> 12) & 0xffffffff; roce_set_field(entry[i].blk_ba1_nxt_ptr, @@ -5467,8 +5466,6 @@ static int hns_roce_mhop_alloc_eq(struct hns_roce_dev *hr_dev, eq->cur_eqe_ba = eq->l0_dma; eq->nxt_eqe_ba = 0; - memset(eq->bt_l0, 0, eq->entries * eq->eqe_size); - return 0; } diff --git a/drivers/infiniband/hw/mthca/mthca_allocator.c b/drivers/infiniband/hw/mthca/mthca_allocator.c index aaf10dd5364d..aef1d274a14e 100644 --- a/drivers/infiniband/hw/mthca/mthca_allocator.c +++ b/drivers/infiniband/hw/mthca/mthca_allocator.c @@ -214,8 +214,6 @@ int mthca_buf_alloc(struct mthca_dev *dev, int size, int max_direct, dma_unmap_addr_set(&buf->direct, mapping, t); - memset(buf->direct.buf, 0, size); - while (t & ((1 << shift) - 1)) { --shift; npages *= 2; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index b2dd4e0a4be2..d82d3ec3649e 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -1351,7 +1351,6 @@ static int ocrdma_mbx_get_ctrl_attribs(struct ocrdma_dev *dev) mqe->u.nonemb_req.sge[0].pa_hi = (u32) upper_32_bits(dma.pa); mqe->u.nonemb_req.sge[0].len = dma.size; - memset(dma.va, 0, dma.size); ocrdma_init_mch((struct ocrdma_mbx_hdr *)dma.va, OCRDMA_CMD_GET_CTRL_ATTRIBUTES, OCRDMA_SUBSYS_COMMON, @@ -1690,7 +1689,6 @@ static int ocrdma_mbx_create_ah_tbl(struct ocrdma_dev *dev) goto mem_err_ah; dev->av_tbl.pa = pa; dev->av_tbl.num_ah = max_ah; - memset(dev->av_tbl.va, 0, dev->av_tbl.size); pbes = (struct ocrdma_pbe *)dev->av_tbl.pbl.va; for (i = 0; i < dev->av_tbl.size / OCRDMA_MIN_Q_PAGE_SIZE; i++) { @@ -2903,7 +2901,6 @@ static int ocrdma_mbx_get_dcbx_config(struct ocrdma_dev *dev, u32 ptype, mqe_sge->pa_hi = (u32) upper_32_bits(pa); mqe_sge->len = cmd.hdr.pyld_len; - memset(req, 0, sizeof(struct ocrdma_get_dcbx_cfg_req)); ocrdma_init_mch(&req->hdr, OCRDMA_CMD_GET_DCBX_CONFIG, OCRDMA_SUBSYS_DCBX, cmd.hdr.pyld_len); req->param_type = ptype; From 5d7d78eaecc5c91c9b2001e544a0ae2788d40d1c Mon Sep 17 00:00:00 2001 From: Fuqian Huang Date: Fri, 28 Jun 2019 01:38:04 +0800 Subject: [PATCH 131/194] IB/ipoib: Remove memset after vzalloc in ipoib_cm.c vzalloc has already zeroed the memory. So a memset is unneeded. 
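To make the reasoning concrete, here is a minimal sketch of the pattern being removed; the helper and the ring type are hypothetical, not the ipoib code (needs <linux/vmalloc.h> and <linux/overflow.h>):

/* Hypothetical allocation helper, for illustration only. */
static void *alloc_tx_ring(size_t entries, size_t entry_size)
{
	void *ring = vzalloc(array_size(entries, entry_size));

	if (!ring)
		return NULL;
	/*
	 * A memset(ring, 0, entries * entry_size) here would be redundant:
	 * vzalloc() already returns zeroed memory, which is why the patch
	 * drops the memset that followed the allocation.
	 */
	return ring;
}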
Signed-off-by: Fuqian Huang Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe
--- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 1 - 1 file changed, 1 deletion(-)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index aa9dcfc36cd3..c59e00a0881f 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -1153,7 +1153,6 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, ret = -ENOMEM; goto err_tx; }
- memset(p->tx_ring, 0, ipoib_sendq_size * sizeof(*p->tx_ring));
 p->qp = ipoib_cm_create_tx_qp(p->dev, p); memalloc_noio_restore(noio_flag);
From cda8cf56d8e29c70dc8a3d989846c66ed1638e74 Mon Sep 17 00:00:00 2001
From: Fuqian Huang
Date: Thu, 4 Jul 2019 00:27:42 +0800
Subject: [PATCH 132/194] IB/i40iw: Use kmemdup rather than open coding
Use kmemdup() instead of kzalloc() + memcpy().
Signed-off-by: Fuqian Huang Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe
--- drivers/infiniband/hw/i40iw/i40iw_cm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index 8233f5a4e623..84b3ff2687fb 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_cm.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c
@@ -4276,11 +4276,11 @@ static void i40iw_qhash_ctrl(struct i40iw_device *iwdev, /* if not found then add a child listener if interface is going up */ if (!ifup) return;
- child_listen_node = kzalloc(sizeof(*child_listen_node), GFP_ATOMIC);
+ child_listen_node = kmemdup(parent_listen_node,
+ sizeof(*child_listen_node), GFP_ATOMIC);
 if (!child_listen_node) return;
 node_allocated = true;
- memcpy(child_listen_node, parent_listen_node, sizeof(*child_listen_node));
 memcpy(child_listen_node->loc_addr, ipaddr, ipv4 ? 4 : 16);
From 2e67e775845373905d2c2aecb9062c2c4352a535 Mon Sep 17 00:00:00 2001
From: "Liu, Changcheng"
Date: Fri, 28 Jun 2019 14:16:13 +0800
Subject: [PATCH 133/194] RDMA/i40iw: Set queue pair state when being queried
The API for ib_query_qp requires the driver to set qp_state and cur_qp_state on return; add the missing sets.
Fixes: d37498417947 ("i40iw: add files for iwarp interface")
Signed-off-by: Changcheng Liu Acked-by: Shiraz Saleem Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe
--- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 3c0c6aabc64e..d169a8031375 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -772,6 +772,8 @@ static int i40iw_query_qp(struct ib_qp *ibqp, struct i40iw_qp *iwqp = to_iwqp(ibqp); struct i40iw_sc_qp *qp = &iwqp->sc_qp;
+ attr->qp_state = iwqp->ibqp_state;
+ attr->cur_qp_state = attr->qp_state;
 attr->qp_access_flags = 0; attr->cap.max_send_wr = qp->qp_uk.sq_size; attr->cap.max_recv_wr = qp->qp_uk.rq_size;
From 6044414fa849e14fa0de60a75e3f85ea048c89db Mon Sep 17 00:00:00 2001
From: YueHaibing
Date: Wed, 3 Jul 2019 03:10:21 +0000
Subject: [PATCH 134/194] RDMA/hns: Remove set but not used variable 'fclr_write_fail_flag'
Fixes gcc '-Wunused-but-set-variable' warning:
drivers/infiniband/hw/hns/hns_roce_hw_v2.c: In function 'hns_roce_function_clear': drivers/infiniband/hw/hns/hns_roce_hw_v2.c:1135:7: warning: variable 'fclr_write_fail_flag' set but not used [-Wunused-but-set-variable]
It is never used, so can be removed.
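For reference, a minimal hypothetical function (not the hns code; the device type and helper below are made up) showing how this warning class arises:

static int example_clear(struct example_dev *dev)
{
	bool write_failed = false;	/* assigned on the error path ...   */
	int ret = send_clear_cmd(dev);	/* hypothetical helper              */

	if (ret)
		write_failed = true;	/* ... but never read afterwards, so
					 * gcc reports it as set but unused */
	return ret;
}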
Signed-off-by: YueHaibing Reviewed-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 107330df6ce8..20e6b5139ef4 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1132,7 +1132,6 @@ static int hns_roce_cmq_query_hw_info(struct hns_roce_dev *hr_dev) static void hns_roce_function_clear(struct hns_roce_dev *hr_dev) { - bool fclr_write_fail_flag = false; struct hns_roce_func_clear *resp; struct hns_roce_cmq_desc desc; unsigned long end; @@ -1143,7 +1142,6 @@ static void hns_roce_function_clear(struct hns_roce_dev *hr_dev) ret = hns_roce_cmq_send(hr_dev, &desc, 1); if (ret) { - fclr_write_fail_flag = true; dev_err(hr_dev->dev, "Func clear write failed, ret = %d.\n", ret); return; From 2f40cf30c8644360d37287861d5288f00eab35e5 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 30 Jun 2019 10:52:52 +0300 Subject: [PATCH 135/194] IB/mlx5: Fixed reporting counters on 2nd port for Dual port RoCE Currently during dual port IB device registration in below code flow, ib_register_device() ib_device_register_sysfs() ib_setup_port_attrs() add_port() get_counter_table() get_perf_mad() process_mad() mlx5_ib_process_mad() mlx5_ib_process_mad() fails on 2nd port when both the ports are not fully setup at the device level (because 2nd port is unaffiliated). As a result, get_perf_mad() registers different PMA counter group for 1st and 2nd port, namely pma_counter_ext and pma_counter. However both ports have the same capability and counter offsets. Due to this when counters are read by the user via sysfs in below code flow, counters are queried from wrong location from the device mainly from PPCNT instead of VPORT counters. show_pma_counter() get_perf_mad() process_mad() mlx5_ib_process_mad() process_pma_cmd() This shows all zero counters for 2nd port. To overcome this, process_pma_cmd() is invoked, and when unaffiliated port is not yet setup during device registration phase, make the query on the first port. while at it, only process_pma_cmd() needs to work on the native port number and underlying mdev, so shift the get, put calls to where its needed inside process_pma_cmd(). Fixes: 212f2a87b74f ("IB/mlx5: Route MADs for dual port RoCE") Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mad.c | 60 +++++++++++++++++++------------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index 6c529e6f3a01..348c1df69cdc 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -200,19 +200,33 @@ static void pma_cnt_assign(struct ib_pma_portcounters *pma_cnt, vl_15_dropped); } -static int process_pma_cmd(struct mlx5_core_dev *mdev, u8 port_num, +static int process_pma_cmd(struct mlx5_ib_dev *dev, u8 port_num, const struct ib_mad *in_mad, struct ib_mad *out_mad) { - int err; + struct mlx5_core_dev *mdev; + bool native_port = true; + u8 mdev_port_num; void *out_cnt; + int err; + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num); + if (!mdev) { + /* Fail to get the native port, likely due to 2nd port is still + * unaffiliated. In such case default to 1st port and attached + * PF device. 
+ */ + native_port = false; + mdev = dev->mdev; + mdev_port_num = 1; + } /* Declaring support of extended counters */ if (in_mad->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO) { struct ib_class_port_info cpi = {}; cpi.capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH; memcpy((out_mad->data + 40), &cpi, sizeof(cpi)); - return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + goto done; } if (in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT) { @@ -221,11 +235,13 @@ static int process_pma_cmd(struct mlx5_core_dev *mdev, u8 port_num, int sz = MLX5_ST_SZ_BYTES(query_vport_counter_out); out_cnt = kvzalloc(sz, GFP_KERNEL); - if (!out_cnt) - return IB_MAD_RESULT_FAILURE; + if (!out_cnt) { + err = IB_MAD_RESULT_FAILURE; + goto done; + } err = mlx5_core_query_vport_counter(mdev, 0, 0, - port_num, out_cnt, sz); + mdev_port_num, out_cnt, sz); if (!err) pma_cnt_ext_assign(pma_cnt_ext, out_cnt); } else { @@ -234,20 +250,23 @@ static int process_pma_cmd(struct mlx5_core_dev *mdev, u8 port_num, int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); out_cnt = kvzalloc(sz, GFP_KERNEL); - if (!out_cnt) - return IB_MAD_RESULT_FAILURE; + if (!out_cnt) { + err = IB_MAD_RESULT_FAILURE; + goto done; + } - err = mlx5_core_query_ib_ppcnt(mdev, port_num, + err = mlx5_core_query_ib_ppcnt(mdev, mdev_port_num, out_cnt, sz); if (!err) pma_cnt_assign(pma_cnt, out_cnt); - } - + } kvfree(out_cnt); - if (err) - return IB_MAD_RESULT_FAILURE; - - return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + err = err ? IB_MAD_RESULT_FAILURE : + IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +done: + if (native_port) + mlx5_ib_put_native_port_mdev(dev, port_num); + return err; } int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, @@ -259,8 +278,6 @@ int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, struct mlx5_ib_dev *dev = to_mdev(ibdev); const struct ib_mad *in_mad = (const struct ib_mad *)in; struct ib_mad *out_mad = (struct ib_mad *)out; - struct mlx5_core_dev *mdev; - u8 mdev_port_num; int ret; if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || @@ -269,19 +286,14 @@ int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, memset(out_mad->data, 0, sizeof(out_mad->data)); - mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num); - if (!mdev) - return IB_MAD_RESULT_FAILURE; - - if (MLX5_CAP_GEN(mdev, vport_counters) && + if (MLX5_CAP_GEN(dev->mdev, vport_counters) && in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT && in_mad->mad_hdr.method == IB_MGMT_METHOD_GET) { - ret = process_pma_cmd(mdev, mdev_port_num, in_mad, out_mad); + ret = process_pma_cmd(dev, port_num, in_mad, out_mad); } else { ret = process_mad(ibdev, mad_flags, port_num, in_wc, in_grh, in_mad, out_mad); } - mlx5_ib_put_native_port_mdev(dev, port_num); return ret; } From 2afc5e1b9c340ff20848c8dd8fb60342617bce52 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 30 Jun 2019 19:23:29 +0300 Subject: [PATCH 136/194] IB/mlx5: Introduce MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD Introduce MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD and its initial implementation. This object is from type class FD and will be used to read DEVX async events. 
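Such an FD is consumed like any other pollable file descriptor; a rough userspace-style sketch using only generic POSIX calls (not the mlx5 ABI or any rdma-core helper) could look like:

#include <poll.h>
#include <unistd.h>

/* 'fd' is assumed to be the async event FD handed back by the driver. */
static ssize_t wait_for_devx_event(int fd, void *buf, size_t len)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };

	if (poll(&pfd, 1, -1) < 0)	/* block until an event is queued */
		return -1;
	return read(fd, buf, len);	/* fetch one pending event report */
}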
Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 95 +++++++++++++++++++++++ include/uapi/rdma/mlx5_user_ioctl_cmds.h | 10 +++ include/uapi/rdma/mlx5_user_ioctl_verbs.h | 4 + 3 files changed, 109 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 931f587dfb8f..ed01523f0f02 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -33,6 +33,17 @@ struct devx_async_data { struct mlx5_ib_uapi_devx_async_cmd_hdr hdr; }; +struct devx_async_event_file { + struct ib_uobject uobj; + /* Head of events that are subscribed to this FD */ + struct list_head subscribed_events_list; + spinlock_t lock; + wait_queue_head_t poll_wait; + struct list_head event_list; + struct mlx5_ib_dev *dev; + u8 omit_data:1; +}; + #define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in) struct devx_obj { struct mlx5_core_dev *mdev; @@ -1365,6 +1376,37 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)( return 0; } +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_HANDLE); + struct devx_async_event_file *ev_file; + struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context( + &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext); + struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device); + u32 flags; + int err; + + err = uverbs_get_flags32(&flags, attrs, + MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_FLAGS, + MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA); + + if (err) + return err; + + ev_file = container_of(uobj, struct devx_async_event_file, + uobj); + spin_lock_init(&ev_file->lock); + INIT_LIST_HEAD(&ev_file->event_list); + init_waitqueue_head(&ev_file->poll_wait); + if (flags & MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA) + ev_file->omit_data = 1; + INIT_LIST_HEAD(&ev_file->subscribed_events_list); + ev_file->dev = dev; + return 0; +} + static void devx_query_callback(int status, struct mlx5_async_work *context) { struct devx_async_data *async_data = @@ -1719,6 +1761,32 @@ static const struct file_operations devx_async_cmd_event_fops = { .llseek = no_llseek, }; +static ssize_t devx_async_event_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + return -EINVAL; +} + +static __poll_t devx_async_event_poll(struct file *filp, + struct poll_table_struct *wait) +{ + return 0; +} + +static int devx_async_event_close(struct inode *inode, struct file *filp) +{ + uverbs_close_fd(filp); + return 0; +} + +static const struct file_operations devx_async_event_fops = { + .owner = THIS_MODULE, + .read = devx_async_event_read, + .poll = devx_async_event_poll, + .release = devx_async_event_close, + .llseek = no_llseek, +}; + static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj, enum rdma_remove_reason why) { @@ -1738,6 +1806,12 @@ static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj, return 0; }; +static int devx_hot_unplug_async_event_file(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + return 0; +}; + DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_DEVX_UMEM_REG, UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE, @@ -1903,6 +1977,24 @@ DECLARE_UVERBS_NAMED_OBJECT( O_RDONLY), &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)); +DECLARE_UVERBS_NAMED_METHOD( + 
MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC,
+ UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_HANDLE,
+ MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY),
+ UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_FLAGS,
+ enum mlx5_ib_uapi_devx_create_event_channel_flags,
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+ MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD,
+ UVERBS_TYPE_ALLOC_FD(sizeof(struct devx_async_event_file),
+ devx_hot_unplug_async_event_file,
+ &devx_async_event_fops, "[devx_async_event]",
+ O_RDONLY),
+ &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC));
+
 static bool devx_is_supported(struct ib_device *device) { struct mlx5_ib_dev *dev = to_mdev(device);
@@ -1923,5 +2015,8 @@ const struct uapi_definition mlx5_ib_devx_defs[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED( MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD, UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)),
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+ MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD,
+ UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)),
 {}, };
diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index d404c951954c..6ad8f4f11ddd 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
@@ -127,16 +127,26 @@ enum mlx5_ib_devx_async_cmd_fd_alloc_attrs { MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE = (1U << UVERBS_ID_NS_SHIFT), };
+enum mlx5_ib_devx_async_event_fd_alloc_attrs {
+ MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
+ MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_FLAGS,
+};
+
 enum mlx5_ib_devx_async_cmd_fd_methods { MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC = (1U << UVERBS_ID_NS_SHIFT), };
+enum mlx5_ib_devx_async_event_fd_methods {
+ MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC = (1U << UVERBS_ID_NS_SHIFT),
+};
+
 enum mlx5_ib_objects { MLX5_IB_OBJECT_DEVX = (1U << UVERBS_ID_NS_SHIFT), MLX5_IB_OBJECT_DEVX_OBJ, MLX5_IB_OBJECT_DEVX_UMEM, MLX5_IB_OBJECT_FLOW_MATCHER, MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
+ MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD,
 };
 enum mlx5_ib_flow_matcher_create_attrs {
diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h index a8f34c237458..b44691315d39 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h
@@ -63,5 +63,9 @@ enum mlx5_ib_uapi_dm_type { MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM, };
+enum mlx5_ib_uapi_devx_create_event_channel_flags {
+ MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA = 1 << 0,
+};
+
 #endif
From e337dd53ce4cc3db79e52704e554f648c46d5e91 Mon Sep 17 00:00:00 2001
From: Yishai Hadas
Date: Sun, 30 Jun 2019 19:23:30 +0300
Subject: [PATCH 137/194] IB/mlx5: Register DEVX with mlx5_core to get async events
Register DEVX with mlx5_core to get async events. This will enable dispatching the applicable events to their consumers in downstream patches.
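The registration follows the standard notifier-chain pattern; a simplified sketch of how such a handler typically filters and acknowledges events (a generic kernel idiom, not the final devx dispatcher) is:

static int example_event_notifier(struct notifier_block *nb,
				  unsigned long event_type, void *data)
{
	/* 'data' carries the firmware event; ignore types we do not track. */
	if (event_type != MLX5_EVENT_TYPE_CQ_ERROR)
		return NOTIFY_DONE;	/* not consumed, let others see it */

	/* ... dispatch to subscribers here (added by later patches) ... */
	return NOTIFY_OK;		/* event was handled by this block */
}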
Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 30 ++++++++++++++++++++++++++++ drivers/infiniband/hw/mlx5/main.c | 8 ++++++-- drivers/infiniband/hw/mlx5/mlx5_ib.h | 12 +++++++++++ 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index ed01523f0f02..a9affc905bfa 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -1663,6 +1663,36 @@ static int devx_umem_cleanup(struct ib_uobject *uobject, return 0; } +static int devx_event_notifier(struct notifier_block *nb, + unsigned long event_type, void *data) +{ + return NOTIFY_DONE; +} + +void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev) +{ + struct mlx5_devx_event_table *table = &dev->devx_event_table; + + xa_init(&table->event_xa); + mutex_init(&table->event_xa_lock); + MLX5_NB_INIT(&table->devx_nb, devx_event_notifier, NOTIFY_ANY); + mlx5_eq_notifier_register(dev->mdev, &table->devx_nb); +} + +void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev) +{ + struct mlx5_devx_event_table *table = &dev->devx_event_table; + void *entry; + unsigned long id; + + mlx5_eq_notifier_unregister(dev->mdev, &table->devx_nb); + + xa_for_each(&table->event_xa, id, entry) + kfree(entry); + + xa_destroy(&table->event_xa); +} + static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf, size_t count, loff_t *pos) { diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 9db8c06aa01e..692b60898ee2 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -6630,15 +6630,19 @@ static int mlx5_ib_stage_devx_init(struct mlx5_ib_dev *dev) int uid; uid = mlx5_ib_devx_create(dev, false); - if (uid > 0) + if (uid > 0) { dev->devx_whitelist_uid = uid; + mlx5_ib_devx_init_event_table(dev); + } return 0; } static void mlx5_ib_stage_devx_cleanup(struct mlx5_ib_dev *dev) { - if (dev->devx_whitelist_uid) + if (dev->devx_whitelist_uid) { + mlx5_ib_devx_cleanup_event_table(dev); mlx5_ib_devx_destroy(dev, dev->devx_whitelist_uid); + } } void __mlx5_ib_remove(struct mlx5_ib_dev *dev, diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 305d26cdf7f3..7373e9da0919 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -936,6 +936,13 @@ struct mlx5_ib_pf_eq { mempool_t *pool; }; +struct mlx5_devx_event_table { + struct mlx5_nb devx_nb; + /* serialize updating the event_xa */ + struct mutex event_xa_lock; + struct xarray event_xa; +}; + struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; @@ -985,6 +992,7 @@ struct mlx5_ib_dev { u16 devx_whitelist_uid; struct mlx5_srq_table srq_table; struct mlx5_async_ctx async_ctx; + struct mlx5_devx_event_table devx_event_table; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) @@ -1324,6 +1332,8 @@ void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *dev, #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user); void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid); +void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev); +void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev); const struct uverbs_object_tree_def *mlx5_ib_get_devx_tree(void); extern const struct uapi_definition mlx5_ib_devx_defs[]; extern const struct uapi_definition 
mlx5_ib_flow_defs[]; @@ -1341,6 +1351,8 @@ static inline int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user) { return -EOPNOTSUPP; } static inline void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid) {} +static inline void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev) {} +static inline void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev) {} static inline bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type) { From 7597385371425febdaa8c6a1da3625d4ffff16f5 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 30 Jun 2019 19:23:31 +0300 Subject: [PATCH 138/194] IB/mlx5: Enable subscription for device events over DEVX Enable subscription for device events over DEVX. Each subscription is added to the two level xarray data structure according to its event number and the DEVX object information in case was given with the given target fd. Those events will be reported over the given fd once will occur. Downstream patches will mange the dispatching to any subscription. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 560 ++++++++++++++++++++++- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 9 + 2 files changed, 562 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index a9affc905bfa..9c21cafc44a6 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -14,6 +14,7 @@ #include #include #include "mlx5_ib.h" +#include #define UVERBS_MODULE_NAME mlx5_ib #include @@ -33,6 +34,40 @@ struct devx_async_data { struct mlx5_ib_uapi_devx_async_cmd_hdr hdr; }; +/* first level XA value data structure */ +struct devx_event { + struct xarray object_ids; /* second XA level, Key = object id */ + struct list_head unaffiliated_list; +}; + +/* second level XA value data structure */ +struct devx_obj_event { + struct rcu_head rcu; + struct list_head obj_sub_list; +}; + +struct devx_event_subscription { + struct list_head file_list; /* headed in ev_file-> + * subscribed_events_list + */ + struct list_head xa_list; /* headed in devx_event->unaffiliated_list or + * devx_obj_event->obj_sub_list + */ + struct list_head obj_list; /* headed in devx_object */ + struct list_head event_list; /* headed in ev_file->event_list or in + * temp list via subscription + */ + + u8 is_cleaned:1; + u32 xa_key_level1; + u32 xa_key_level2; + struct rcu_head rcu; + u64 cookie; + struct devx_async_event_file *ev_file; + struct file *filp; /* Upon hot unplug we need a direct access to */ + struct eventfd_ctx *eventfd; +}; + struct devx_async_event_file { struct ib_uobject uobj; /* Head of events that are subscribed to this FD */ @@ -55,6 +90,7 @@ struct devx_obj { struct mlx5_ib_devx_mr devx_mr; struct mlx5_core_dct core_dct; }; + struct list_head event_sub; /* holds devx_event_subscription entries */ }; struct devx_umem { @@ -160,6 +196,104 @@ bool mlx5_ib_devx_is_flow_counter(void *obj, u32 *counter_id) return false; } +static bool is_legacy_unaffiliated_event_num(u16 event_num) +{ + switch (event_num) { + case MLX5_EVENT_TYPE_PORT_CHANGE: + return true; + default: + return false; + } +} + +static bool is_legacy_obj_event_num(u16 event_num) +{ + switch (event_num) { + case MLX5_EVENT_TYPE_PATH_MIG: + case MLX5_EVENT_TYPE_COMM_EST: + case MLX5_EVENT_TYPE_SQ_DRAINED: + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + case MLX5_EVENT_TYPE_CQ_ERROR: + case 
MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_DCT_DRAINED: + case MLX5_EVENT_TYPE_COMP: + return true; + default: + return false; + } +} + +static u16 get_legacy_obj_type(u16 opcode) +{ + switch (opcode) { + case MLX5_CMD_OP_CREATE_RQ: + return MLX5_EVENT_QUEUE_TYPE_RQ; + case MLX5_CMD_OP_CREATE_QP: + return MLX5_EVENT_QUEUE_TYPE_QP; + case MLX5_CMD_OP_CREATE_SQ: + return MLX5_EVENT_QUEUE_TYPE_SQ; + case MLX5_CMD_OP_CREATE_DCT: + return MLX5_EVENT_QUEUE_TYPE_DCT; + default: + return 0; + } +} + +static u16 get_dec_obj_type(struct devx_obj *obj, u16 event_num) +{ + u16 opcode; + + opcode = (obj->obj_id >> 32) & 0xffff; + + if (is_legacy_obj_event_num(event_num)) + return get_legacy_obj_type(opcode); + + switch (opcode) { + case MLX5_CMD_OP_CREATE_GENERAL_OBJECT: + return (obj->obj_id >> 48); + case MLX5_CMD_OP_CREATE_RQ: + return MLX5_OBJ_TYPE_RQ; + case MLX5_CMD_OP_CREATE_QP: + return MLX5_OBJ_TYPE_QP; + case MLX5_CMD_OP_CREATE_SQ: + return MLX5_OBJ_TYPE_SQ; + case MLX5_CMD_OP_CREATE_DCT: + return MLX5_OBJ_TYPE_DCT; + case MLX5_CMD_OP_CREATE_TIR: + return MLX5_OBJ_TYPE_TIR; + case MLX5_CMD_OP_CREATE_TIS: + return MLX5_OBJ_TYPE_TIS; + case MLX5_CMD_OP_CREATE_PSV: + return MLX5_OBJ_TYPE_PSV; + case MLX5_OBJ_TYPE_MKEY: + return MLX5_OBJ_TYPE_MKEY; + case MLX5_CMD_OP_CREATE_RMP: + return MLX5_OBJ_TYPE_RMP; + case MLX5_CMD_OP_CREATE_XRC_SRQ: + return MLX5_OBJ_TYPE_XRC_SRQ; + case MLX5_CMD_OP_CREATE_XRQ: + return MLX5_OBJ_TYPE_XRQ; + case MLX5_CMD_OP_CREATE_RQT: + return MLX5_OBJ_TYPE_RQT; + case MLX5_CMD_OP_ALLOC_FLOW_COUNTER: + return MLX5_OBJ_TYPE_FLOW_COUNTER; + case MLX5_CMD_OP_CREATE_CQ: + return MLX5_OBJ_TYPE_CQ; + default: + return 0; + } +} + +static u32 get_dec_obj_id(u64 obj_id) +{ + return (obj_id & 0xffffffff); +} + /* * As the obj_id in the firmware is not globally unique the object type * must be considered upon checking for a valid object id. 
@@ -1126,14 +1260,47 @@ static void devx_cleanup_mkey(struct devx_obj *obj) mlx5_base_mkey(obj->devx_mr.mmkey.key)); } +static void devx_cleanup_subscription(struct mlx5_ib_dev *dev, + struct devx_event_subscription *sub) +{ + struct devx_event *event; + struct devx_obj_event *xa_val_level2; + + if (sub->is_cleaned) + return; + + sub->is_cleaned = 1; + list_del_rcu(&sub->xa_list); + + if (list_empty(&sub->obj_list)) + return; + + list_del_rcu(&sub->obj_list); + /* check whether key level 1 for this obj_sub_list is empty */ + event = xa_load(&dev->devx_event_table.event_xa, + sub->xa_key_level1); + WARN_ON(!event); + + xa_val_level2 = xa_load(&event->object_ids, sub->xa_key_level2); + if (list_empty(&xa_val_level2->obj_sub_list)) { + xa_erase(&event->object_ids, + sub->xa_key_level2); + kfree_rcu(xa_val_level2, rcu); + } +} + static int devx_obj_cleanup(struct ib_uobject *uobject, enum rdma_remove_reason why, struct uverbs_attr_bundle *attrs) { u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + struct mlx5_devx_event_table *devx_event_table; struct devx_obj *obj = uobject->object; + struct devx_event_subscription *sub_entry, *tmp; + struct mlx5_ib_dev *dev; int ret; + dev = mlx5_udata_to_mdev(&attrs->driver_udata); if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) devx_cleanup_mkey(obj); @@ -1145,10 +1312,14 @@ static int devx_obj_cleanup(struct ib_uobject *uobject, if (ib_is_destroy_retryable(ret, why, uobject)) return ret; - if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) { - struct mlx5_ib_dev *dev = - mlx5_udata_to_mdev(&attrs->driver_udata); + devx_event_table = &dev->devx_event_table; + mutex_lock(&devx_event_table->event_xa_lock); + list_for_each_entry_safe(sub_entry, tmp, &obj->event_sub, obj_list) + devx_cleanup_subscription(dev, sub_entry); + mutex_unlock(&devx_event_table->event_xa_lock); + + if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) { call_srcu(&dev->mr_srcu, &obj->devx_mr.rcu, devx_free_indirect_mkey); return ret; @@ -1220,6 +1391,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( uobj->object = obj; obj->mdev = dev->mdev; + INIT_LIST_HEAD(&obj->event_sub); devx_obj_build_destroy_cmd(cmd_in, cmd_out, obj->dinbox, &obj->dinlen, &obj_id); WARN_ON(obj->dinlen > MLX5_MAX_DESTROY_INBOX_SIZE_DW * sizeof(u32)); @@ -1404,6 +1576,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC)( ev_file->omit_data = 1; INIT_LIST_HEAD(&ev_file->subscribed_events_list); ev_file->dev = dev; + get_device(&dev->ib_dev.dev); return 0; } @@ -1516,6 +1689,331 @@ sub_bytes: return err; } +static void +subscribe_event_xa_dealloc(struct mlx5_devx_event_table *devx_event_table, + u32 key_level1, + bool is_level2, + u32 key_level2) +{ + struct devx_event *event; + struct devx_obj_event *xa_val_level2; + + /* Level 1 is valid for future use, no need to free */ + if (!is_level2) + return; + + event = xa_load(&devx_event_table->event_xa, key_level1); + WARN_ON(!event); + + xa_val_level2 = xa_load(&event->object_ids, + key_level2); + if (list_empty(&xa_val_level2->obj_sub_list)) { + xa_erase(&event->object_ids, + key_level2); + kfree_rcu(xa_val_level2, rcu); + } +} + +static int +subscribe_event_xa_alloc(struct mlx5_devx_event_table *devx_event_table, + u32 key_level1, + bool is_level2, + u32 key_level2) +{ + struct devx_obj_event *obj_event; + struct devx_event *event; + int err; + + event = xa_load(&devx_event_table->event_xa, key_level1); + if (!event) { + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + INIT_LIST_HEAD(&event->unaffiliated_list); + 
xa_init(&event->object_ids); + + err = xa_insert(&devx_event_table->event_xa, + key_level1, + event, + GFP_KERNEL); + if (err) { + kfree(event); + return err; + } + } + + if (!is_level2) + return 0; + + obj_event = xa_load(&event->object_ids, key_level2); + if (!obj_event) { + obj_event = kzalloc(sizeof(*obj_event), GFP_KERNEL); + if (!obj_event) + /* Level1 is valid for future use, no need to free */ + return -ENOMEM; + + err = xa_insert(&event->object_ids, + key_level2, + obj_event, + GFP_KERNEL); + if (err) + return err; + INIT_LIST_HEAD(&obj_event->obj_sub_list); + } + + return 0; +} + +static bool is_valid_events_legacy(int num_events, u16 *event_type_num_list, + struct devx_obj *obj) +{ + int i; + + for (i = 0; i < num_events; i++) { + if (obj) { + if (!is_legacy_obj_event_num(event_type_num_list[i])) + return false; + } else if (!is_legacy_unaffiliated_event_num( + event_type_num_list[i])) { + return false; + } + } + + return true; +} + +#define MAX_SUPP_EVENT_NUM 255 +static bool is_valid_events(struct mlx5_core_dev *dev, + int num_events, u16 *event_type_num_list, + struct devx_obj *obj) +{ + __be64 *aff_events; + __be64 *unaff_events; + int mask_entry; + int mask_bit; + int i; + + if (MLX5_CAP_GEN(dev, event_cap)) { + aff_events = MLX5_CAP_DEV_EVENT(dev, + user_affiliated_events); + unaff_events = MLX5_CAP_DEV_EVENT(dev, + user_unaffiliated_events); + } else { + return is_valid_events_legacy(num_events, event_type_num_list, + obj); + } + + for (i = 0; i < num_events; i++) { + if (event_type_num_list[i] > MAX_SUPP_EVENT_NUM) + return false; + + mask_entry = event_type_num_list[i] / 64; + mask_bit = event_type_num_list[i] % 64; + + if (obj) { + /* CQ completion */ + if (event_type_num_list[i] == 0) + continue; + + if (!(be64_to_cpu(aff_events[mask_entry]) & + (1ull << mask_bit))) + return false; + + continue; + } + + if (!(be64_to_cpu(unaff_events[mask_entry]) & + (1ull << mask_bit))) + return false; + } + + return true; +} + +#define MAX_NUM_EVENTS 16 +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *devx_uobj = uverbs_attr_get_uobject( + attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_OBJ_HANDLE); + struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context( + &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext); + struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device); + struct ib_uobject *fd_uobj; + struct devx_obj *obj = NULL; + struct devx_async_event_file *ev_file; + struct mlx5_devx_event_table *devx_event_table = &dev->devx_event_table; + u16 *event_type_num_list; + struct devx_event_subscription *event_sub, *tmp_sub; + struct list_head sub_list; + int redirect_fd; + bool use_eventfd = false; + int num_events; + int num_alloc_xa_entries = 0; + u16 obj_type = 0; + u64 cookie = 0; + u32 obj_id = 0; + int err; + int i; + + if (!c->devx_uid) + return -EINVAL; + + if (!IS_ERR(devx_uobj)) { + obj = (struct devx_obj *)devx_uobj->object; + if (obj) + obj_id = get_dec_obj_id(obj->obj_id); + } + + fd_uobj = uverbs_attr_get_uobject(attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_HANDLE); + if (IS_ERR(fd_uobj)) + return PTR_ERR(fd_uobj); + + ev_file = container_of(fd_uobj, struct devx_async_event_file, + uobj); + + if (uverbs_attr_is_valid(attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM)) { + err = uverbs_copy_from(&redirect_fd, attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM); + if (err) + return err; + + use_eventfd = true; + } + + if (uverbs_attr_is_valid(attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE)) { 
+ if (use_eventfd) + return -EINVAL; + + err = uverbs_copy_from(&cookie, attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE); + if (err) + return err; + } + + num_events = uverbs_attr_ptr_get_array_size( + attrs, MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST, + sizeof(u16)); + + if (num_events < 0) + return num_events; + + if (num_events > MAX_NUM_EVENTS) + return -EINVAL; + + event_type_num_list = uverbs_attr_get_alloced_ptr(attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST); + + if (!is_valid_events(dev->mdev, num_events, event_type_num_list, obj)) + return -EINVAL; + + INIT_LIST_HEAD(&sub_list); + + /* Protect from concurrent subscriptions to same XA entries to allow + * both to succeed + */ + mutex_lock(&devx_event_table->event_xa_lock); + for (i = 0; i < num_events; i++) { + u32 key_level1; + + if (obj) + obj_type = get_dec_obj_type(obj, + event_type_num_list[i]); + key_level1 = event_type_num_list[i] | obj_type << 16; + + err = subscribe_event_xa_alloc(devx_event_table, + key_level1, + obj, + obj_id); + if (err) + goto err; + + num_alloc_xa_entries++; + event_sub = kzalloc(sizeof(*event_sub), GFP_KERNEL); + if (!event_sub) + goto err; + + list_add_tail(&event_sub->event_list, &sub_list); + if (use_eventfd) { + event_sub->eventfd = + eventfd_ctx_fdget(redirect_fd); + + if (IS_ERR(event_sub)) { + err = PTR_ERR(event_sub->eventfd); + event_sub->eventfd = NULL; + goto err; + } + } + + event_sub->cookie = cookie; + event_sub->ev_file = ev_file; + event_sub->filp = fd_uobj->object; + /* May be needed upon cleanup the devx object/subscription */ + event_sub->xa_key_level1 = key_level1; + event_sub->xa_key_level2 = obj_id; + INIT_LIST_HEAD(&event_sub->obj_list); + } + + /* Once all the allocations and the XA data insertions were done we + * can go ahead and add all the subscriptions to the relevant lists + * without concern of a failure. 
+ */ + list_for_each_entry_safe(event_sub, tmp_sub, &sub_list, event_list) { + struct devx_event *event; + struct devx_obj_event *obj_event; + + list_del_init(&event_sub->event_list); + + spin_lock_irq(&ev_file->lock); + list_add_tail_rcu(&event_sub->file_list, + &ev_file->subscribed_events_list); + spin_unlock_irq(&ev_file->lock); + + event = xa_load(&devx_event_table->event_xa, + event_sub->xa_key_level1); + WARN_ON(!event); + + if (!obj) { + list_add_tail_rcu(&event_sub->xa_list, + &event->unaffiliated_list); + continue; + } + + obj_event = xa_load(&event->object_ids, obj_id); + WARN_ON(!obj_event); + list_add_tail_rcu(&event_sub->xa_list, + &obj_event->obj_sub_list); + list_add_tail_rcu(&event_sub->obj_list, + &obj->event_sub); + } + + mutex_unlock(&devx_event_table->event_xa_lock); + return 0; + +err: + list_for_each_entry_safe(event_sub, tmp_sub, &sub_list, event_list) { + list_del(&event_sub->event_list); + + subscribe_event_xa_dealloc(devx_event_table, + event_sub->xa_key_level1, + obj, + obj_id); + + if (event_sub->eventfd) + eventfd_ctx_put(event_sub->eventfd); + + kfree(event_sub); + } + + mutex_unlock(&devx_event_table->event_xa_lock); + return err; +} + static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext, struct uverbs_attr_bundle *attrs, struct devx_umem *obj) @@ -1682,14 +2180,21 @@ void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev) void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev) { struct mlx5_devx_event_table *table = &dev->devx_event_table; + struct devx_event_subscription *sub, *tmp; + struct devx_event *event; void *entry; unsigned long id; mlx5_eq_notifier_unregister(dev->mdev, &table->devx_nb); - - xa_for_each(&table->event_xa, id, entry) + mutex_lock(&dev->devx_event_table.event_xa_lock); + xa_for_each(&table->event_xa, id, entry) { + event = entry; + list_for_each_entry_safe(sub, tmp, &event->unaffiliated_list, + xa_list) + devx_cleanup_subscription(dev, sub); kfree(entry); - + } + mutex_unlock(&dev->devx_event_table.event_xa_lock); xa_destroy(&table->event_xa); } @@ -1805,7 +2310,26 @@ static __poll_t devx_async_event_poll(struct file *filp, static int devx_async_event_close(struct inode *inode, struct file *filp) { + struct devx_async_event_file *ev_file = filp->private_data; + struct devx_event_subscription *event_sub, *event_sub_tmp; + + mutex_lock(&ev_file->dev->devx_event_table.event_xa_lock); + /* delete the subscriptions which are related to this FD */ + list_for_each_entry_safe(event_sub, event_sub_tmp, + &ev_file->subscribed_events_list, file_list) { + devx_cleanup_subscription(ev_file->dev, event_sub); + if (event_sub->eventfd) + eventfd_ctx_put(event_sub->eventfd); + + list_del_rcu(&event_sub->file_list); + /* subscription may not be used by the read API any more */ + kfree_rcu(event_sub, rcu); + } + + mutex_unlock(&ev_file->dev->devx_event_table.event_xa_lock); + uverbs_close_fd(filp); + put_device(&ev_file->dev->ib_dev.dev); return 0; } @@ -1973,10 +2497,32 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_TYPE(u64), UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT, + UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_HANDLE, + MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_OBJ_HANDLE, + MLX5_IB_OBJECT_DEVX_OBJ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST, + UVERBS_ATTR_MIN_SIZE(sizeof(u16)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + 
UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL)); + DECLARE_UVERBS_GLOBAL_METHODS(MLX5_IB_OBJECT_DEVX, &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OTHER), &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_UAR), - &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_EQN)); + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_EQN), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT)); DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_OBJ, UVERBS_TYPE_ALLOC_IDR(devx_obj_cleanup), diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 6ad8f4f11ddd..d0da070cf0ab 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -51,6 +51,7 @@ enum mlx5_ib_devx_methods { MLX5_IB_METHOD_DEVX_OTHER = (1U << UVERBS_ID_NS_SHIFT), MLX5_IB_METHOD_DEVX_QUERY_UAR, MLX5_IB_METHOD_DEVX_QUERY_EQN, + MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT, }; enum mlx5_ib_devx_other_attrs { @@ -93,6 +94,14 @@ enum mlx5_ib_devx_obj_query_async_attrs { MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN, }; +enum mlx5_ib_devx_subscribe_event_attrs { + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_OBJ_HANDLE, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE, +}; + enum mlx5_ib_devx_query_eqn_attrs { MLX5_IB_ATTR_DEVX_QUERY_EQN_USER_VEC = (1U << UVERBS_ID_NS_SHIFT), MLX5_IB_ATTR_DEVX_QUERY_EQN_DEV_EQN, From 5ec9d8ee87c627a2c981d871e41f6e2a942f53fd Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 30 Jun 2019 19:23:32 +0300 Subject: [PATCH 139/194] IB/mlx5: Implement DEVX dispatching event Implement DEVX dispatching event by looking up for the applicable subscriptions for the reported event and using their target fd to signal/set the event. 
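To make the read-side semantics concrete, a minimal userspace consumer of the event channel fd might look like the sketch below. It assumes a channel created without the OMIT_DATA flag, so each read() returns the mlx5_ib_uapi_devx_async_event_hdr cookie followed by the raw 64-byte EQE; how the fd itself is obtained (the MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC method, normally wrapped by rdma-core) is outside this sketch. A read() issued after an event was dropped fails with EOVERFLOW, matching the is_overflow_err handling added below.

  #include <stdint.h>
  #include <stdio.h>
  #include <unistd.h>

  /* One event as returned by read(): the async event header cookie
   * followed by the raw 64-byte EQE. */
  struct devx_event_buf {
          uint64_t cookie;
          uint8_t  eqe[64];
  };

  static int drain_devx_events(int fd)  /* 'fd': hypothetical event channel fd */
  {
          struct devx_event_buf ev;
          ssize_t n;

          while ((n = read(fd, &ev, sizeof(ev))) > 0)
                  printf("devx event: cookie=0x%llx, %zd bytes\n",
                         (unsigned long long)ev.cookie, n);

          return n < 0 ? -1 : 0;  /* errno is EOVERFLOW after a dropped event */
  }

With the OMIT_DATA channel flag the same loop would read back only the 8-byte cookie per event, since the omit_data path below carries no EQE payload.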
Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 303 +++++++++++++++++++++- include/uapi/rdma/mlx5_user_ioctl_verbs.h | 5 + 2 files changed, 305 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 9c21cafc44a6..867b9778c063 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -34,6 +34,11 @@ struct devx_async_data { struct mlx5_ib_uapi_devx_async_cmd_hdr hdr; }; +struct devx_async_event_data { + struct list_head list; /* headed in ev_file->event_list */ + struct mlx5_ib_uapi_devx_async_event_hdr hdr; +}; + /* first level XA value data structure */ struct devx_event { struct xarray object_ids; /* second XA level, Key = object id */ @@ -77,6 +82,8 @@ struct devx_async_event_file { struct list_head event_list; struct mlx5_ib_dev *dev; u8 omit_data:1; + u8 is_overflow_err:1; + u8 is_destroyed:1; }; #define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in) @@ -289,6 +296,29 @@ static u16 get_dec_obj_type(struct devx_obj *obj, u16 event_num) } } +static u16 get_event_obj_type(unsigned long event_type, struct mlx5_eqe *eqe) +{ + switch (event_type) { + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + case MLX5_EVENT_TYPE_PATH_MIG: + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + case MLX5_EVENT_TYPE_COMM_EST: + case MLX5_EVENT_TYPE_SQ_DRAINED: + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + return eqe->data.qp_srq.type; + case MLX5_EVENT_TYPE_CQ_ERROR: + return 0; + case MLX5_EVENT_TYPE_DCT_DRAINED: + return MLX5_EVENT_QUEUE_TYPE_DCT; + default: + return MLX5_GET(affiliated_event_header, &eqe->data, obj_type); + } +} + static u32 get_dec_obj_id(u64 obj_id) { return (obj_id & 0xffffffff); @@ -2161,10 +2191,170 @@ static int devx_umem_cleanup(struct ib_uobject *uobject, return 0; } +static bool is_unaffiliated_event(struct mlx5_core_dev *dev, + unsigned long event_type) +{ + __be64 *unaff_events; + int mask_entry; + int mask_bit; + + if (!MLX5_CAP_GEN(dev, event_cap)) + return is_legacy_unaffiliated_event_num(event_type); + + unaff_events = MLX5_CAP_DEV_EVENT(dev, + user_unaffiliated_events); + WARN_ON(event_type > MAX_SUPP_EVENT_NUM); + + mask_entry = event_type / 64; + mask_bit = event_type % 64; + + if (!(be64_to_cpu(unaff_events[mask_entry]) & (1ull << mask_bit))) + return false; + + return true; +} + +static u32 devx_get_obj_id_from_event(unsigned long event_type, void *data) +{ + struct mlx5_eqe *eqe = data; + u32 obj_id = 0; + + switch (event_type) { + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + case MLX5_EVENT_TYPE_PATH_MIG: + case MLX5_EVENT_TYPE_COMM_EST: + case MLX5_EVENT_TYPE_SQ_DRAINED: + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + obj_id = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; + break; + case MLX5_EVENT_TYPE_DCT_DRAINED: + obj_id = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff; + break; + case MLX5_EVENT_TYPE_CQ_ERROR: + obj_id = be32_to_cpu(eqe->data.cq_err.cqn) & 0xffffff; + break; + default: + obj_id = MLX5_GET(affiliated_event_header, &eqe->data, obj_id); + break; + } + + return obj_id; +} + +static int deliver_event(struct 
devx_event_subscription *event_sub, + const void *data) +{ + struct devx_async_event_file *ev_file; + struct devx_async_event_data *event_data; + unsigned long flags; + + ev_file = event_sub->ev_file; + + if (ev_file->omit_data) { + spin_lock_irqsave(&ev_file->lock, flags); + if (!list_empty(&event_sub->event_list)) { + spin_unlock_irqrestore(&ev_file->lock, flags); + return 0; + } + + list_add_tail(&event_sub->event_list, &ev_file->event_list); + spin_unlock_irqrestore(&ev_file->lock, flags); + wake_up_interruptible(&ev_file->poll_wait); + return 0; + } + + event_data = kzalloc(sizeof(*event_data) + sizeof(struct mlx5_eqe), + GFP_ATOMIC); + if (!event_data) { + spin_lock_irqsave(&ev_file->lock, flags); + ev_file->is_overflow_err = 1; + spin_unlock_irqrestore(&ev_file->lock, flags); + return -ENOMEM; + } + + event_data->hdr.cookie = event_sub->cookie; + memcpy(event_data->hdr.out_data, data, sizeof(struct mlx5_eqe)); + + spin_lock_irqsave(&ev_file->lock, flags); + list_add_tail(&event_data->list, &ev_file->event_list); + spin_unlock_irqrestore(&ev_file->lock, flags); + wake_up_interruptible(&ev_file->poll_wait); + + return 0; +} + +static void dispatch_event_fd(struct list_head *fd_list, + const void *data) +{ + struct devx_event_subscription *item; + + list_for_each_entry_rcu(item, fd_list, xa_list) { + if (!get_file_rcu(item->filp)) + continue; + + if (item->eventfd) { + eventfd_signal(item->eventfd, 1); + fput(item->filp); + continue; + } + + deliver_event(item, data); + fput(item->filp); + } +} + static int devx_event_notifier(struct notifier_block *nb, unsigned long event_type, void *data) { - return NOTIFY_DONE; + struct mlx5_devx_event_table *table; + struct mlx5_ib_dev *dev; + struct devx_event *event; + struct devx_obj_event *obj_event; + u16 obj_type = 0; + bool is_unaffiliated; + u32 obj_id; + + /* Explicit filtering to kernel events which may occur frequently */ + if (event_type == MLX5_EVENT_TYPE_CMD || + event_type == MLX5_EVENT_TYPE_PAGE_REQUEST) + return NOTIFY_OK; + + table = container_of(nb, struct mlx5_devx_event_table, devx_nb.nb); + dev = container_of(table, struct mlx5_ib_dev, devx_event_table); + is_unaffiliated = is_unaffiliated_event(dev->mdev, event_type); + + if (!is_unaffiliated) + obj_type = get_event_obj_type(event_type, data); + + rcu_read_lock(); + event = xa_load(&table->event_xa, event_type | (obj_type << 16)); + if (!event) { + rcu_read_unlock(); + return NOTIFY_DONE; + } + + if (is_unaffiliated) { + dispatch_event_fd(&event->unaffiliated_list, data); + rcu_read_unlock(); + return NOTIFY_OK; + } + + obj_id = devx_get_obj_id_from_event(event_type, data); + obj_event = xa_load(&event->object_ids, obj_id); + if (!obj_event) { + rcu_read_unlock(); + return NOTIFY_DONE; + } + + dispatch_event_fd(&obj_event->obj_sub_list, data); + + rcu_read_unlock(); + return NOTIFY_OK; } void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev) @@ -2299,19 +2489,108 @@ static const struct file_operations devx_async_cmd_event_fops = { static ssize_t devx_async_event_read(struct file *filp, char __user *buf, size_t count, loff_t *pos) { - return -EINVAL; + struct devx_async_event_file *ev_file = filp->private_data; + struct devx_event_subscription *event_sub; + struct devx_async_event_data *uninitialized_var(event); + int ret = 0; + size_t eventsz; + bool omit_data; + void *event_data; + + omit_data = ev_file->omit_data; + + spin_lock_irq(&ev_file->lock); + + if (ev_file->is_overflow_err) { + ev_file->is_overflow_err = 0; + spin_unlock_irq(&ev_file->lock); + return 
-EOVERFLOW; + } + + if (ev_file->is_destroyed) { + spin_unlock_irq(&ev_file->lock); + return -EIO; + } + + while (list_empty(&ev_file->event_list)) { + spin_unlock_irq(&ev_file->lock); + + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + if (wait_event_interruptible(ev_file->poll_wait, + (!list_empty(&ev_file->event_list) || + ev_file->is_destroyed))) { + return -ERESTARTSYS; + } + + spin_lock_irq(&ev_file->lock); + if (ev_file->is_destroyed) { + spin_unlock_irq(&ev_file->lock); + return -EIO; + } + } + + if (omit_data) { + event_sub = list_first_entry(&ev_file->event_list, + struct devx_event_subscription, + event_list); + eventsz = sizeof(event_sub->cookie); + event_data = &event_sub->cookie; + } else { + event = list_first_entry(&ev_file->event_list, + struct devx_async_event_data, list); + eventsz = sizeof(struct mlx5_eqe) + + sizeof(struct mlx5_ib_uapi_devx_async_event_hdr); + event_data = &event->hdr; + } + + if (eventsz > count) { + spin_unlock_irq(&ev_file->lock); + return -EINVAL; + } + + if (omit_data) + list_del_init(&event_sub->event_list); + else + list_del(&event->list); + + spin_unlock_irq(&ev_file->lock); + + if (copy_to_user(buf, event_data, eventsz)) + /* This points to an application issue, not a kernel concern */ + ret = -EFAULT; + else + ret = eventsz; + + if (!omit_data) + kfree(event); + return ret; } static __poll_t devx_async_event_poll(struct file *filp, struct poll_table_struct *wait) { - return 0; + struct devx_async_event_file *ev_file = filp->private_data; + __poll_t pollflags = 0; + + poll_wait(filp, &ev_file->poll_wait, wait); + + spin_lock_irq(&ev_file->lock); + if (ev_file->is_destroyed) + pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; + else if (!list_empty(&ev_file->event_list)) + pollflags = EPOLLIN | EPOLLRDNORM; + spin_unlock_irq(&ev_file->lock); + + return pollflags; } static int devx_async_event_close(struct inode *inode, struct file *filp) { struct devx_async_event_file *ev_file = filp->private_data; struct devx_event_subscription *event_sub, *event_sub_tmp; + struct devx_async_event_data *entry, *tmp; mutex_lock(&ev_file->dev->devx_event_table.event_xa_lock); /* delete the subscriptions which are related to this FD */ @@ -2328,6 +2607,15 @@ static int devx_async_event_close(struct inode *inode, struct file *filp) mutex_unlock(&ev_file->dev->devx_event_table.event_xa_lock); + /* free the pending events allocation */ + if (!ev_file->omit_data) { + spin_lock_irq(&ev_file->lock); + list_for_each_entry_safe(entry, tmp, + &ev_file->event_list, list) + kfree(entry); /* read can't come any more */ + spin_unlock_irq(&ev_file->lock); + } + uverbs_close_fd(filp); put_device(&ev_file->dev->ib_dev.dev); return 0; @@ -2363,6 +2651,15 @@ static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj, static int devx_hot_unplug_async_event_file(struct ib_uobject *uobj, enum rdma_remove_reason why) { + struct devx_async_event_file *ev_file = + container_of(uobj, struct devx_async_event_file, + uobj); + + spin_lock_irq(&ev_file->lock); + ev_file->is_destroyed = 1; + spin_unlock_irq(&ev_file->lock); + + wake_up_interruptible(&ev_file->poll_wait); return 0; }; diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h index b44691315d39..7e9900b0e746 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h +++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h @@ -67,5 +67,10 @@ enum mlx5_ib_uapi_devx_create_event_channel_flags { MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA = 1 << 0, }; +struct 
mlx5_ib_uapi_devx_async_event_hdr { + __aligned_u64 cookie; + __u8 out_data[]; +}; + #endif From ef1659ade3590e4a29a999c6f0cb2272857638a6 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 30 Jun 2019 19:23:33 +0300 Subject: [PATCH 140/194] IB/mlx5: Add DEVX support for CQ events Add DEVX support for CQ events by creating and destroying the CQ via mlx5_core and set an handler to manage its completions. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 39 +++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 867b9778c063..b6cae4ea7a37 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -19,9 +19,12 @@ #define UVERBS_MODULE_NAME mlx5_ib #include +static void dispatch_event_fd(struct list_head *fd_list, const void *data); + enum devx_obj_flags { DEVX_OBJ_FLAGS_INDIRECT_MKEY = 1 << 0, DEVX_OBJ_FLAGS_DCT = 1 << 1, + DEVX_OBJ_FLAGS_CQ = 1 << 2, }; struct devx_async_data { @@ -89,6 +92,7 @@ struct devx_async_event_file { #define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in) struct devx_obj { struct mlx5_core_dev *mdev; + struct mlx5_ib_dev *ib_dev; u64 obj_id; u32 dinlen; /* destroy inbox length */ u32 dinbox[MLX5_MAX_DESTROY_INBOX_SIZE_DW]; @@ -96,6 +100,7 @@ struct devx_obj { union { struct mlx5_ib_devx_mr devx_mr; struct mlx5_core_dct core_dct; + struct mlx5_core_cq core_cq; }; struct list_head event_sub; /* holds devx_event_subscription entries */ }; @@ -1336,6 +1341,8 @@ static int devx_obj_cleanup(struct ib_uobject *uobject, if (obj->flags & DEVX_OBJ_FLAGS_DCT) ret = mlx5_core_destroy_dct(obj->mdev, &obj->core_dct); + else if (obj->flags & DEVX_OBJ_FLAGS_CQ) + ret = mlx5_core_destroy_cq(obj->mdev, &obj->core_cq); else ret = mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out)); @@ -1359,6 +1366,29 @@ static int devx_obj_cleanup(struct ib_uobject *uobject, return ret; } +static void devx_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe) +{ + struct devx_obj *obj = container_of(mcq, struct devx_obj, core_cq); + struct mlx5_devx_event_table *table; + struct devx_event *event; + struct devx_obj_event *obj_event; + u32 obj_id = mcq->cqn; + + table = &obj->ib_dev->devx_event_table; + rcu_read_lock(); + event = xa_load(&table->event_xa, MLX5_EVENT_TYPE_COMP); + if (!event) + goto out; + + obj_event = xa_load(&event->object_ids, obj_id); + if (!obj_event) + goto out; + + dispatch_event_fd(&obj_event->obj_sub_list, eqe); +out: + rcu_read_unlock(); +} + static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( struct uverbs_attr_bundle *attrs) { @@ -1410,6 +1440,12 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( err = mlx5_core_create_dct(dev->mdev, &obj->core_dct, cmd_in, cmd_in_len, cmd_out, cmd_out_len); + } else if (opcode == MLX5_CMD_OP_CREATE_CQ) { + obj->flags |= DEVX_OBJ_FLAGS_CQ; + obj->core_cq.comp = devx_cq_comp; + err = mlx5_core_create_cq(dev->mdev, &obj->core_cq, + cmd_in, cmd_in_len, cmd_out, + cmd_out_len); } else { err = mlx5_cmd_exec(dev->mdev, cmd_in, cmd_in_len, @@ -1422,6 +1458,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( uobj->object = obj; obj->mdev = dev->mdev; INIT_LIST_HEAD(&obj->event_sub); + obj->ib_dev = dev; devx_obj_build_destroy_cmd(cmd_in, cmd_out, obj->dinbox, &obj->dinlen, &obj_id); WARN_ON(obj->dinlen > MLX5_MAX_DESTROY_INBOX_SIZE_DW * sizeof(u32)); 
@@ -1449,6 +1486,8 @@ err_copy: obj_destroy: if (obj->flags & DEVX_OBJ_FLAGS_DCT) mlx5_core_destroy_dct(obj->mdev, &obj->core_dct); + else if (obj->flags & DEVX_OBJ_FLAGS_CQ) + mlx5_core_destroy_cq(obj->mdev, &obj->core_cq); else mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out)); From 5832fdd35e61bf4793da0f0480fb873af645a7b4 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 30 Jun 2019 19:23:34 +0300 Subject: [PATCH 141/194] IB/mlx5: DEVX cleanup mdev No need any more to hold mlx5_core_dev on the devx_object, it can be accessed from ib_dev. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index b6cae4ea7a37..e69a8693f102 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -91,7 +91,6 @@ struct devx_async_event_file { #define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in) struct devx_obj { - struct mlx5_core_dev *mdev; struct mlx5_ib_dev *ib_dev; u64 obj_id; u32 dinlen; /* destroy inbox length */ @@ -1291,7 +1290,7 @@ static void devx_free_indirect_mkey(struct rcu_head *rcu) */ static void devx_cleanup_mkey(struct devx_obj *obj) { - xa_erase(&obj->mdev->priv.mkey_table, + xa_erase(&obj->ib_dev->mdev->priv.mkey_table, mlx5_base_mkey(obj->devx_mr.mmkey.key)); } @@ -1340,12 +1339,12 @@ static int devx_obj_cleanup(struct ib_uobject *uobject, devx_cleanup_mkey(obj); if (obj->flags & DEVX_OBJ_FLAGS_DCT) - ret = mlx5_core_destroy_dct(obj->mdev, &obj->core_dct); + ret = mlx5_core_destroy_dct(obj->ib_dev->mdev, &obj->core_dct); else if (obj->flags & DEVX_OBJ_FLAGS_CQ) - ret = mlx5_core_destroy_cq(obj->mdev, &obj->core_cq); + ret = mlx5_core_destroy_cq(obj->ib_dev->mdev, &obj->core_cq); else - ret = mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, - sizeof(out)); + ret = mlx5_cmd_exec(obj->ib_dev->mdev, obj->dinbox, + obj->dinlen, out, sizeof(out)); if (ib_is_destroy_retryable(ret, why, uobject)) return ret; @@ -1456,7 +1455,6 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( goto obj_free; uobj->object = obj; - obj->mdev = dev->mdev; INIT_LIST_HEAD(&obj->event_sub); obj->ib_dev = dev; devx_obj_build_destroy_cmd(cmd_in, cmd_out, obj->dinbox, &obj->dinlen, @@ -1485,11 +1483,11 @@ err_copy: devx_cleanup_mkey(obj); obj_destroy: if (obj->flags & DEVX_OBJ_FLAGS_DCT) - mlx5_core_destroy_dct(obj->mdev, &obj->core_dct); + mlx5_core_destroy_dct(obj->ib_dev->mdev, &obj->core_dct); else if (obj->flags & DEVX_OBJ_FLAGS_CQ) - mlx5_core_destroy_cq(obj->mdev, &obj->core_cq); + mlx5_core_destroy_cq(obj->ib_dev->mdev, &obj->core_cq); else - mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, + mlx5_cmd_exec(obj->ib_dev->mdev, obj->dinbox, obj->dinlen, out, sizeof(out)); obj_free: kfree(obj); From 9a601fc43ee0157a2575fe81beffe653c25edcd6 Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Thu, 4 Jul 2019 14:22:58 +0800 Subject: [PATCH 142/194] RDMA/hns: Fix building modular hns The patch below wasn't fully tested for all combinations of module and configs, and causes a compile failure: WARNING: modpost: missing MODULE_LICENSE() in drivers/infiniband/hw/hns/hns_roce_ah.o see include/linux/module.h for more information WARNING: modpost: missing MODULE_LICENSE() in drivers/infiniband/hw/hns/hns_roce_alloc.o see include/linux/module.h for more information WARNING: modpost: 
missing MODULE_LICENSE() in drivers/infiniband/hw/hns/hns_roce_cmd.o see include/linux/module.h for more information WARNING: modpost: missing MODULE_LICENSE() in drivers/infiniband/hw/hns/hns_roce_cq.o see include/linux/module.h for more information WARNING: modpost: missing MODULE_LICENSE() in drivers/infiniband/hw/hns/hns_roce_db.o see include/linux/module.h for more information WARNING: modpost: missing MODULE_LICENSE() in drivers/infiniband/hw/hns/hns_roce_hem.o see include/linux/module.h for more information WARNING: modpost: missing MODULE_LICENSE() in drivers/infiniband/hw/hns/hns_roce_mr.o see include/linux/module.h for more information WARNING: modpost: missing MODULE_LICENSE() in drivers/infiniband/hw/hns/hns_roce_pd.o see include/linux/module.h for more information WARNING: modpost: missing MODULE_LICENSE() in drivers/infiniband/hw/hns/hns_roce_qp.o see include/linux/module.h for more information WARNING: modpost: missing MODULE_LICENSE() in drivers/infiniband/hw/hns/hns_roce_restrack.o see include/linux/module.h for more information WARNING: modpost: missing MODULE_LICENSE() in drivers/infiniband/hw/hns/hns_roce_srq.o see include/linux/module.h for more information ERROR: "hns_roce_bitmap_cleanup" [drivers/infiniband/hw/hns/hns_roce_srq.ko] undefined! ERROR: "hns_roce_bitmap_init" [drivers/infiniband/hw/hns/hns_roce_srq.ko] undefined! ERROR: "hns_roce_free_cmd_mailbox" [drivers/infiniband/hw/hns/hns_roce_srq.ko] undefined! ERROR: "hns_roce_alloc_cmd_mailbox" [drivers/infiniband/hw/hns/hns_roce_srq.ko] undefined! ERROR: "hns_roce_table_get" [drivers/infiniband/hw/hns/hns_roce_srq.ko] undefined! ERROR: "hns_roce_bitmap_alloc" [drivers/infiniband/hw/hns/hns_roce_srq.ko] undefined! ERROR: "hns_roce_table_find" [drivers/infiniband/hw/hns/hns_roce_srq.ko] undefined! The fix is to put the module sub components in the right line. Fixes: e9816ddf2a33 ("RDMA/hns: Cleanup unnecessary exported symbols") Reported-by: Stephen Rothwell Signed-off-by: Xi Wang Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile index 1c5a20af6992..e105945b94a1 100644 --- a/drivers/infiniband/hw/hns/Makefile +++ b/drivers/infiniband/hw/hns/Makefile @@ -10,8 +10,8 @@ hns-roce-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \ hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o hns_roce_srq.o hns_roce_restrack.o ifdef CONFIG_INFINIBAND_HNS_HIP06 -hns-roce-hw-v1-objs := hns_roce_hw_v1.o -obj-$(CONFIG_INFINIBAND_HNS) += hns-roce-hw-v1.o $(hns-roce-objs) +hns-roce-hw-v1-objs := hns_roce_hw_v1.o $(hns-roce-objs) +obj-$(CONFIG_INFINIBAND_HNS) += hns-roce-hw-v1.o endif ifdef CONFIG_INFINIBAND_HNS_HIP08 From 0bddcff628db5f516b046019b31f2430e7a8eaba Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 4 Jul 2019 13:50:27 +0100 Subject: [PATCH 143/194] RDMA/uverbs: remove redundant assignment to variable ret The variable ret is being initialized with a value that is never read and it is being updated later with a new value. The initialization is redundant and can be removed. 
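The pattern being removed is the classic dead store that static analyzers flag: an initializer that every path overwrites before the first read. A minimal stand-alone illustration (not the uverbs code itself):

  #include <errno.h>

  static int do_request(void) { return 0; }  /* stand-ins for the real calls */
  static int do_detach(void)  { return 0; }

  static int example_detach(void)
  {
          int ret = -EINVAL;  /* dead store: overwritten before it is read */

          ret = do_request();
          if (ret)
                  return ret;

          return do_detach();
  }

Dropping the initializer also lets the compiler warn if a later refactor introduces a genuinely uninitialized use of ret.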
Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 750c4d484329..7ddd0e5bc6b3 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2548,7 +2548,7 @@ static int ib_uverbs_detach_mcast(struct uverbs_attr_bundle *attrs) struct ib_uqp_object *obj; struct ib_qp *qp; struct ib_uverbs_mcast_entry *mcast; - int ret = -EINVAL; + int ret; bool found = false; ret = uverbs_request(attrs, &cmd, sizeof(cmd)); From 50ba3c18a4e549ba6a5a4672dfb3eb30fcb7d570 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 30 Jun 2019 18:48:32 +0300 Subject: [PATCH 144/194] RDMA/mlx5: Use proper allocation API to get zeroed memory There is no need in custom memory zeroing, because it can be done by using kzalloc from the beginning. Signed-off-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 692b60898ee2..1af36497d54c 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4726,7 +4726,7 @@ static int __get_port_caps(struct mlx5_ib_dev *dev, u8 port) int err = -ENOMEM; struct ib_udata uhw = {.inlen = 0, .outlen = 0}; - pprops = kmalloc(sizeof(*pprops), GFP_KERNEL); + pprops = kzalloc(sizeof(*pprops), GFP_KERNEL); if (!pprops) goto out; @@ -4740,7 +4740,6 @@ static int __get_port_caps(struct mlx5_ib_dev *dev, u8 port) goto out; } - memset(pprops, 0, sizeof(*pprops)); err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); if (err) { mlx5_ib_warn(dev, "query_port %d failed %d\n", From 91b01061fef9c57d2f5b712a6322ef51061f4efd Mon Sep 17 00:00:00 2001 From: Valentine Fatiev Date: Sun, 30 Jun 2019 16:48:41 +0300 Subject: [PATCH 145/194] IB/ipoib: Add child to parent list only if device initialized Despite failure in ipoib_dev_init() we continue with initialization flow and creation of child device. It causes to the situation where this child device is added too early to parent device list. Change the logic, so in case of failure we properly return error from ipoib_dev_init() and add child only in success path. 
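The ordering the fix enforces boils down to a small pattern: publish the child on the parent's list only after the child's own init has succeeded, so a half-initialized device is never reachable from the parent. A schematic sketch with invented types (lock and list initialization omitted), not the ipoib code itself:

  #include <linux/list.h>
  #include <linux/rwsem.h>

  struct parent_priv {
          struct rw_semaphore vlan_rwsem;
          struct list_head child_intfs;
  };

  struct child_priv {
          struct list_head list;
          struct parent_priv *parent;
  };

  static int child_hw_init(struct child_priv *c) { return 0; } /* stands in for ipoib_dev_init() */

  static int child_ndo_init(struct child_priv *c)
  {
          int rc;

          rc = child_hw_init(c);
          if (rc)
                  return rc;  /* child was never published, nothing to unwind */

          down_write(&c->parent->vlan_rwsem);
          list_add_tail(&c->list, &c->parent->child_intfs);
          up_write(&c->parent->vlan_rwsem);
          return 0;
  }

The matching list_del() then belongs in ndo_uninit, mirroring the success path, which is what the hunks below do.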
Fixes: eaeb39842508 ("IB/ipoib: Move init code to ndo_init") Signed-off-by: Valentine Fatiev Reviewed-by: Feras Daoud Signed-off-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 34 +++++++++++++---------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 9b5e11d3fb85..bb904ec511be 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1893,12 +1893,6 @@ static void ipoib_child_init(struct net_device *ndev) struct ipoib_dev_priv *priv = ipoib_priv(ndev); struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); - dev_hold(priv->parent); - - down_write(&ppriv->vlan_rwsem); - list_add_tail(&priv->list, &ppriv->child_intfs); - up_write(&ppriv->vlan_rwsem); - priv->max_ib_mtu = ppriv->max_ib_mtu; set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN); @@ -1941,6 +1935,17 @@ static int ipoib_ndo_init(struct net_device *ndev) if (rc) { pr_warn("%s: failed to initialize device: %s port %d (ret = %d)\n", priv->ca->name, priv->dev->name, priv->port, rc); + return rc; + } + + if (priv->parent) { + struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); + + dev_hold(priv->parent); + + down_write(&ppriv->vlan_rwsem); + list_add_tail(&priv->list, &ppriv->child_intfs); + up_write(&ppriv->vlan_rwsem); } return 0; @@ -1958,6 +1963,14 @@ static void ipoib_ndo_uninit(struct net_device *dev) */ WARN_ON(!list_empty(&priv->child_intfs)); + if (priv->parent) { + struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); + + down_write(&ppriv->vlan_rwsem); + list_del(&priv->list); + up_write(&ppriv->vlan_rwsem); + } + ipoib_neigh_hash_uninit(dev); ipoib_ib_dev_cleanup(dev); @@ -1969,15 +1982,8 @@ static void ipoib_ndo_uninit(struct net_device *dev) priv->wq = NULL; } - if (priv->parent) { - struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); - - down_write(&ppriv->vlan_rwsem); - list_del(&priv->list); - up_write(&ppriv->vlan_rwsem); - + if (priv->parent) dev_put(priv->parent); - } } static int ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_state) From bcde9a83b13ede042fd76e4cf0b759b6d6c0abe9 Mon Sep 17 00:00:00 2001 From: Daniel Kranzdorf Date: Sun, 30 Jun 2019 17:53:02 +0300 Subject: [PATCH 146/194] RDMA/efa: Entropy in admin commands id Make admin commands id easier to distinguish by using relevant bits from the producer counter. This allows us to differentiate admin commands with the same producer index (happens after admin queue overlap), which is helpful when debugging. 
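The id scheme is easiest to see with concrete numbers. The sketch below uses illustrative values (a depth-64 queue and a stand-in command-id mask, not the EFA definitions): the low bits of cmd_id still index the completion context, while the remaining bits carry producer-counter entropy, so two commands landing on the same ring slot across a queue wrap get different ids.

  #include <assert.h>
  #include <stdint.h>

  #define QUEUE_DEPTH     64u     /* illustrative; must be a power of two */
  #define CMD_ID_MASK     0x7ffu  /* stand-in for the admin descriptor id mask */

  static uint16_t make_cmd_id(uint16_t ctx_id, uint16_t producer_counter)
  {
          uint16_t queue_size_mask = QUEUE_DEPTH - 1;
          uint16_t cmd_id;

          cmd_id  = ctx_id & queue_size_mask;             /* LSBs: context id */
          cmd_id |= producer_counter & ~queue_size_mask;  /* MSBs: entropy    */
          return cmd_id & CMD_ID_MASK;
  }

  static uint16_t cmd_id_to_ctx_id(uint16_t cmd_id)
  {
          return cmd_id & (QUEUE_DEPTH - 1);
  }

  int main(void)
  {
          /* Same ring slot (5) on consecutive queue wraps yields distinct
           * ids, yet both still map back to context 5. */
          assert(make_cmd_id(5, 5) != make_cmd_id(5, 5 + QUEUE_DEPTH));
          assert(cmd_id_to_ctx_id(make_cmd_id(5, 5 + QUEUE_DEPTH)) == 5);
          return 0;
  }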
Signed-off-by: Daniel Kranzdorf Reviewed-by: Firas JahJah Reviewed-by: Yossi Leybovich Signed-off-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_com.c | 44 +++++++++++++++-------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/hw/efa/efa_com.c b/drivers/infiniband/hw/efa/efa_com.c index ec04ced9fd2b..2cb42484b0f8 100644 --- a/drivers/infiniband/hw/efa/efa_com.c +++ b/drivers/infiniband/hw/efa/efa_com.c @@ -278,36 +278,34 @@ static void efa_com_dealloc_ctx_id(struct efa_com_admin_queue *aq, static inline void efa_com_put_comp_ctx(struct efa_com_admin_queue *aq, struct efa_comp_ctx *comp_ctx) { - u16 comp_id = comp_ctx->user_cqe->acq_common_descriptor.command & - EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK; + u16 cmd_id = comp_ctx->user_cqe->acq_common_descriptor.command & + EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK; + u16 ctx_id = cmd_id & (aq->depth - 1); - ibdev_dbg(aq->efa_dev, "Putting completion command_id %d\n", comp_id); + ibdev_dbg(aq->efa_dev, "Put completion command_id %#x\n", cmd_id); comp_ctx->occupied = 0; - efa_com_dealloc_ctx_id(aq, comp_id); + efa_com_dealloc_ctx_id(aq, ctx_id); } static struct efa_comp_ctx *efa_com_get_comp_ctx(struct efa_com_admin_queue *aq, - u16 command_id, bool capture) + u16 cmd_id, bool capture) { - if (command_id >= aq->depth) { - ibdev_err(aq->efa_dev, - "command id is larger than the queue size. cmd_id: %u queue size %d\n", - command_id, aq->depth); - return NULL; - } + u16 ctx_id = cmd_id & (aq->depth - 1); - if (aq->comp_ctx[command_id].occupied && capture) { - ibdev_err(aq->efa_dev, "Completion context is occupied\n"); + if (aq->comp_ctx[ctx_id].occupied && capture) { + ibdev_err(aq->efa_dev, + "Completion context for command_id %#x is occupied\n", + cmd_id); return NULL; } if (capture) { - aq->comp_ctx[command_id].occupied = 1; - ibdev_dbg(aq->efa_dev, "Taking completion ctxt command_id %d\n", - command_id); + aq->comp_ctx[ctx_id].occupied = 1; + ibdev_dbg(aq->efa_dev, + "Take completion ctxt for command_id %#x\n", cmd_id); } - return &aq->comp_ctx[command_id]; + return &aq->comp_ctx[ctx_id]; } static struct efa_comp_ctx *__efa_com_submit_admin_cmd(struct efa_com_admin_queue *aq, @@ -318,6 +316,7 @@ static struct efa_comp_ctx *__efa_com_submit_admin_cmd(struct efa_com_admin_queu { struct efa_comp_ctx *comp_ctx; u16 queue_size_mask; + u16 cmd_id; u16 ctx_id; u16 pi; @@ -326,13 +325,16 @@ static struct efa_comp_ctx *__efa_com_submit_admin_cmd(struct efa_com_admin_queu ctx_id = efa_com_alloc_ctx_id(aq); + /* cmd_id LSBs are the ctx_id and MSBs are entropy bits from pc */ + cmd_id = ctx_id & queue_size_mask; + cmd_id |= aq->sq.pc & ~queue_size_mask; + cmd_id &= EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK; + + cmd->aq_common_descriptor.command_id = cmd_id; cmd->aq_common_descriptor.flags |= aq->sq.phase & EFA_ADMIN_AQ_COMMON_DESC_PHASE_MASK; - cmd->aq_common_descriptor.command_id |= ctx_id & - EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK; - - comp_ctx = efa_com_get_comp_ctx(aq, ctx_id, true); + comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, true); if (!comp_ctx) { efa_com_dealloc_ctx_id(aq, ctx_id); return ERR_PTR(-EINVAL); From 7ade1ff96c7aa7e10445688a433d7ae39a13c6c9 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 2 Jul 2019 13:02:31 +0300 Subject: [PATCH 147/194] RDMA/restrack: Introduce statistic counter Introduce statistic counter as a new resource. It allows a user to monitor specific objects (e.g., QPs) by binding to a counter. 
In some cases a user counter resource is created with task other then "current", because its creation is done as part of rdmatool call. Signed-off-by: Mark Zhang Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/restrack.c | 22 +++++++++++++++++----- include/rdma/rdma_counter.h | 18 ++++++++++++++++++ include/rdma/restrack.h | 4 ++++ 3 files changed, 39 insertions(+), 5 deletions(-) create mode 100644 include/rdma/rdma_counter.h diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 3b5ff2f7b5f8..95573f292aae 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -45,6 +46,7 @@ static const char *type2str(enum rdma_restrack_type type) [RDMA_RESTRACK_CM_ID] = "CM_ID", [RDMA_RESTRACK_MR] = "MR", [RDMA_RESTRACK_CTX] = "CTX", + [RDMA_RESTRACK_COUNTER] = "COUNTER", }; return names[type]; @@ -169,6 +171,8 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) return container_of(res, struct ib_mr, res)->device; case RDMA_RESTRACK_CTX: return container_of(res, struct ib_ucontext, res)->device; + case RDMA_RESTRACK_COUNTER: + return container_of(res, struct rdma_counter, res)->device; default: WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type); return NULL; @@ -203,15 +207,22 @@ static void rdma_restrack_add(struct rdma_restrack_entry *res) kref_init(&res->kref); init_completion(&res->comp); - if (res->type != RDMA_RESTRACK_QP) - ret = xa_alloc_cyclic(&rt->xa, &res->id, res, xa_limit_32b, - &rt->next_id, GFP_KERNEL); - else { + if (res->type == RDMA_RESTRACK_QP) { /* Special case to ensure that LQPN points to right QP */ struct ib_qp *qp = container_of(res, struct ib_qp, res); ret = xa_insert(&rt->xa, qp->qp_num, res, GFP_KERNEL); res->id = ret ? 0 : qp->qp_num; + } else if (res->type == RDMA_RESTRACK_COUNTER) { + /* Special case to ensure that cntn points to right counter */ + struct rdma_counter *counter; + + counter = container_of(res, struct rdma_counter, res); + ret = xa_insert(&rt->xa, counter->id, res, GFP_KERNEL); + res->id = ret ? 0 : counter->id; + } else { + ret = xa_alloc_cyclic(&rt->xa, &res->id, res, xa_limit_32b, + &rt->next_id, GFP_KERNEL); } if (!ret) @@ -237,7 +248,8 @@ EXPORT_SYMBOL(rdma_restrack_kadd); */ void rdma_restrack_uadd(struct rdma_restrack_entry *res) { - if (res->type != RDMA_RESTRACK_CM_ID) + if ((res->type != RDMA_RESTRACK_CM_ID) && + (res->type != RDMA_RESTRACK_COUNTER)) res->task = NULL; if (!res->task) diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h new file mode 100644 index 000000000000..283ac1a0cdb7 --- /dev/null +++ b/include/rdma/rdma_counter.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2019 Mellanox Technologies. All rights reserved. 
+ */ + +#ifndef _RDMA_COUNTER_H_ +#define _RDMA_COUNTER_H_ + +#include +#include + +struct rdma_counter { + struct rdma_restrack_entry res; + struct ib_device *device; + uint32_t id; + u8 port; +}; +#endif /* _RDMA_COUNTER_H_ */ diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index ecf3c7702a4f..4041a4d96524 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -42,6 +42,10 @@ enum rdma_restrack_type { * @RDMA_RESTRACK_CTX: Verbs contexts (CTX) */ RDMA_RESTRACK_CTX, + /** + * @RDMA_RESTRACK_COUNTER: Statistic Counter + */ + RDMA_RESTRACK_COUNTER, /** * @RDMA_RESTRACK_MAX: Last entry, used for array dclarations */ From 699a9c540a04d05aa342cd84606bc8b1e8c05b7b Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 2 Jul 2019 13:02:32 +0300 Subject: [PATCH 148/194] RDMA/restrack: Add an API to attach a task to a resource Add rdma_restrack_attach_task() which is able to attach a task other then "current" to a resource. Signed-off-by: Mark Zhang Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/restrack.c | 14 ++++++++++++++ drivers/infiniband/core/restrack.h | 2 ++ 2 files changed, 16 insertions(+) diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 95573f292aae..3714634ae296 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -194,6 +194,20 @@ void rdma_restrack_set_task(struct rdma_restrack_entry *res, } EXPORT_SYMBOL(rdma_restrack_set_task); +/** + * rdma_restrack_attach_task() - attach the task onto this resource + * @res: resource entry + * @task: the task to attach, the current task will be used if it is NULL. + */ +void rdma_restrack_attach_task(struct rdma_restrack_entry *res, + struct task_struct *task) +{ + if (res->task) + put_task_struct(res->task); + get_task_struct(task); + res->task = task; +} + static void rdma_restrack_add(struct rdma_restrack_entry *res) { struct ib_device *dev = res_to_dev(res); diff --git a/drivers/infiniband/core/restrack.h b/drivers/infiniband/core/restrack.h index 09a1fbdf578e..d084e5f89849 100644 --- a/drivers/infiniband/core/restrack.h +++ b/drivers/infiniband/core/restrack.h @@ -25,4 +25,6 @@ struct rdma_restrack_root { int rdma_restrack_init(struct ib_device *dev); void rdma_restrack_clean(struct ib_device *dev); +void rdma_restrack_attach_task(struct rdma_restrack_entry *res, + struct task_struct *task); #endif /* _RDMA_CORE_RESTRACK_H_ */ From 6a6c306a09b5227d51fcc1643c888e316935dfa8 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 2 Jul 2019 13:02:33 +0300 Subject: [PATCH 149/194] RDMA/restrack: Make is_visible_in_pid_ns() as an API Remove is_visible_in_pid_ns() from nldev.c and make it as a restrack API, so that it can be taken advantage by other parts like counter. Signed-off-by: Mark Zhang Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 15 ++------------- drivers/infiniband/core/restrack.c | 13 +++++++++++++ drivers/infiniband/core/restrack.h | 1 + 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 5499f5629dc2..d9ebfb50962b 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -1002,17 +1002,6 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { }, }; -static bool is_visible_in_pid_ns(struct rdma_restrack_entry *res) -{ - /* - * 1. 
Kern resources should be visible in init name space only - * 2. Present only resources visible in the current namespace - */ - if (rdma_is_kernel_res(res)) - return task_active_pid_ns(current) == &init_pid_ns; - return task_active_pid_ns(current) == task_active_pid_ns(res->task); -} - static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, enum rdma_restrack_type res_type) @@ -1057,7 +1046,7 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, goto err; } - if (!is_visible_in_pid_ns(res)) { + if (!rdma_is_visible_in_pid_ns(res)) { ret = -ENOENT; goto err_get; } @@ -1169,7 +1158,7 @@ static int res_get_common_dumpit(struct sk_buff *skb, * objects. */ xa_for_each(&rt->xa, id, res) { - if (!is_visible_in_pid_ns(res)) + if (!rdma_is_visible_in_pid_ns(res)) continue; if (idx < start || !rdma_restrack_get(res)) diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 3714634ae296..bddff426ee0f 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -349,3 +349,16 @@ out: } } EXPORT_SYMBOL(rdma_restrack_del); + +bool rdma_is_visible_in_pid_ns(struct rdma_restrack_entry *res) +{ + /* + * 1. Kern resources should be visible in init + * namespace only + * 2. Present only resources visible in the current + * namespace + */ + if (rdma_is_kernel_res(res)) + return task_active_pid_ns(current) == &init_pid_ns; + return task_active_pid_ns(current) == task_active_pid_ns(res->task); +} diff --git a/drivers/infiniband/core/restrack.h b/drivers/infiniband/core/restrack.h index d084e5f89849..7bd177cc0a61 100644 --- a/drivers/infiniband/core/restrack.h +++ b/drivers/infiniband/core/restrack.h @@ -27,4 +27,5 @@ int rdma_restrack_init(struct ib_device *dev); void rdma_restrack_clean(struct ib_device *dev); void rdma_restrack_attach_task(struct rdma_restrack_entry *res, struct task_struct *task); +bool rdma_is_visible_in_pid_ns(struct rdma_restrack_entry *res); #endif /* _RDMA_CORE_RESTRACK_H_ */ From 413d3347503bc39e17577eaf16451fd492a68558 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 2 Jul 2019 13:02:34 +0300 Subject: [PATCH 150/194] RDMA/counter: Add set/clear per-port auto mode support Add an API to support set/clear per-port auto mode. 
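A hypothetical in-kernel caller of the new API would look like the sketch below; the intended user is the netlink/rdmatool configuration path added later in this series. Note that clearing auto mode while it is not currently enabled returns -EINVAL, per __counter_set_mode() below.

  #include <rdma/rdma_counter.h>

  static int example_toggle_auto_mode(struct ib_device *dev, u8 port)
  {
          int ret;

          /* Enable auto mode on the port, grouping QPs by QP type
           * (the actual QP binding arrives in the next patch). */
          ret = rdma_counter_set_auto_mode(dev, port, true,
                                           RDMA_COUNTER_MASK_QP_TYPE);
          if (ret)
                  return ret;

          /* ... counters are now managed automatically ... */

          return rdma_counter_set_auto_mode(dev, port, false, 0);
  }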
Signed-off-by: Mark Zhang Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/Makefile | 2 +- drivers/infiniband/core/counters.c | 74 ++++++++++++++++++++++++++++++ drivers/infiniband/core/device.c | 5 ++ include/rdma/ib_verbs.h | 2 + include/rdma/rdma_counter.h | 24 ++++++++++ include/uapi/rdma/rdma_netlink.h | 26 +++++++++++ 6 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 drivers/infiniband/core/counters.c diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 42f1b2a4f746..09881bd5f12d 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -11,7 +11,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ device.o fmr_pool.o cache.o netlink.o \ roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ multicast.o mad.o smi.o agent.o mad_rmpp.o \ - nldev.o restrack.o + nldev.o restrack.o counters.o ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c new file mode 100644 index 000000000000..6167914fba06 --- /dev/null +++ b/drivers/infiniband/core/counters.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2019 Mellanox Technologies. All rights reserved. + */ +#include +#include + +#include "core_priv.h" +#include "restrack.h" + +#define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE) + +static int __counter_set_mode(struct rdma_counter_mode *curr, + enum rdma_nl_counter_mode new_mode, + enum rdma_nl_counter_mask new_mask) +{ + if ((new_mode == RDMA_COUNTER_MODE_AUTO) && + ((new_mask & (~ALL_AUTO_MODE_MASKS)) || + (curr->mode != RDMA_COUNTER_MODE_NONE))) + return -EINVAL; + + curr->mode = new_mode; + curr->mask = new_mask; + return 0; +} + +/** + * rdma_counter_set_auto_mode() - Turn on/off per-port auto mode + * + * When @on is true, the @mask must be set + */ +int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port, + bool on, enum rdma_nl_counter_mask mask) +{ + struct rdma_port_counter *port_counter; + int ret; + + port_counter = &dev->port_data[port].port_counter; + mutex_lock(&port_counter->lock); + if (on) { + ret = __counter_set_mode(&port_counter->mode, + RDMA_COUNTER_MODE_AUTO, mask); + } else { + if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO) { + ret = -EINVAL; + goto out; + } + ret = __counter_set_mode(&port_counter->mode, + RDMA_COUNTER_MODE_NONE, 0); + } + +out: + mutex_unlock(&port_counter->lock); + return ret; +} + +void rdma_counter_init(struct ib_device *dev) +{ + struct rdma_port_counter *port_counter; + u32 port; + + if (!dev->ops.alloc_hw_stats || !dev->port_data) + return; + + rdma_for_each_port(dev, port) { + port_counter = &dev->port_data[port].port_counter; + port_counter->mode.mode = RDMA_COUNTER_MODE_NONE; + mutex_init(&port_counter->lock); + } +} + +void rdma_counter_release(struct ib_device *dev) +{ +} diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 8a6ccb936dfe..6579865e4866 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -46,6 +46,7 @@ #include #include #include +#include #include "core_priv.h" #include "restrack.h" @@ -492,10 +493,12 @@ static void ib_device_release(struct device *device) if (dev->port_data) { ib_cache_release_one(dev); ib_security_release_port_pkey_list(dev); + rdma_counter_release(dev); kfree_rcu(container_of(dev->port_data, struct 
ib_port_data_rcu, pdata[0]), rcu_head); } + xa_destroy(&dev->compat_devs); xa_destroy(&dev->client_data); kfree_rcu(dev, rcu_head); @@ -1316,6 +1319,8 @@ int ib_register_device(struct ib_device *device, const char *name) ib_device_register_rdmacg(device); + rdma_counter_init(device); + /* * Ensure that ADD uevent is not fired because it * is too early amd device is not initialized yet. diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 26e9c2594913..3d19c056fbc0 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include @@ -2119,6 +2120,7 @@ struct ib_port_data { spinlock_t netdev_lock; struct net_device __rcu *netdev; struct hlist_node ndev_hash_link; + struct rdma_port_counter port_counter; }; /* rdma netdev type - specifies protocol type */ diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h index 283ac1a0cdb7..8dd2619c015d 100644 --- a/include/rdma/rdma_counter.h +++ b/include/rdma/rdma_counter.h @@ -6,8 +6,26 @@ #ifndef _RDMA_COUNTER_H_ #define _RDMA_COUNTER_H_ +#include + #include #include +#include + +struct auto_mode_param { + int qp_type; +}; + +struct rdma_counter_mode { + enum rdma_nl_counter_mode mode; + enum rdma_nl_counter_mask mask; + struct auto_mode_param param; +}; + +struct rdma_port_counter { + struct rdma_counter_mode mode; + struct mutex lock; +}; struct rdma_counter { struct rdma_restrack_entry res; @@ -15,4 +33,10 @@ struct rdma_counter { uint32_t id; u8 port; }; + +void rdma_counter_init(struct ib_device *dev); +void rdma_counter_release(struct ib_device *dev); +int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port, + bool on, enum rdma_nl_counter_mask mask); + #endif /* _RDMA_COUNTER_H_ */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 650cee8c4bf1..e3cd912e9cef 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -507,4 +507,30 @@ enum rdma_nldev_attr { */ RDMA_NLDEV_ATTR_MAX }; + +/* + * Supported counter bind modes. All modes are mutual-exclusive. + */ +enum rdma_nl_counter_mode { + RDMA_COUNTER_MODE_NONE, + + /* + * A qp is bound with a counter automatically during initialization + * based on the auto mode (e.g., qp type, ...) + */ + RDMA_COUNTER_MODE_AUTO, + + /* + * Always the end + */ + RDMA_COUNTER_MODE_MAX, +}; + +/* + * Supported criteria in counter auto mode. + * Currently only "qp type" is supported + */ +enum rdma_nl_counter_mask { + RDMA_COUNTER_MASK_QP_TYPE = 1, +}; #endif /* _UAPI_RDMA_NETLINK_H */ From 99fa331dc8629be55ac7a0cca0dc56492070ddac Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 2 Jul 2019 13:02:35 +0300 Subject: [PATCH 151/194] RDMA/counter: Add "auto" configuration mode support In auto mode all QPs belonging to one category are bound automatically to a single counter set. Currently only "qp type" is supported. In this mode the QP counter is set during the RST2INIT modification, and when a QP is destroyed the counter is unbound.
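For orientation, a minimal sketch of the two call sites this patch adds in the core (simplified, with hypothetical wrapper names; the real hunks in verbs.c below carry the full conditions and error handling):

  #include <rdma/rdma_counter.h>

  /* On the RST2INIT transition with a port specified: try to bind the QP
   * to a matching auto-mode counter, allocating one if none matches. */
  static void example_on_rst2init(struct ib_qp *qp, u8 port)
  {
          rdma_counter_bind_qp_auto(qp, port);
  }

  /* On QP destroy: unbind and drop the counter reference even if the
   * driver unbind callback fails (force = true). */
  static void example_on_destroy(struct ib_qp *qp)
  {
          rdma_counter_unbind_qp(qp, true);
  }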
Signed-off-by: Mark Zhang Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/counters.c | 221 +++++++++++++++++++++++++++++ drivers/infiniband/core/device.c | 3 + drivers/infiniband/core/verbs.c | 9 ++ include/rdma/ib_verbs.h | 18 +++ include/rdma/rdma_counter.h | 8 ++ 5 files changed, 259 insertions(+) diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c index 6167914fba06..615ee731a1de 100644 --- a/drivers/infiniband/core/counters.c +++ b/drivers/infiniband/core/counters.c @@ -54,6 +54,227 @@ out: return ret; } +static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port, + enum rdma_nl_counter_mode mode) +{ + struct rdma_counter *counter; + + if (!dev->ops.counter_dealloc) + return NULL; + + counter = kzalloc(sizeof(*counter), GFP_KERNEL); + if (!counter) + return NULL; + + counter->device = dev; + counter->port = port; + counter->res.type = RDMA_RESTRACK_COUNTER; + counter->mode.mode = mode; + kref_init(&counter->kref); + mutex_init(&counter->lock); + + return counter; +} + +static void rdma_counter_free(struct rdma_counter *counter) +{ + rdma_restrack_del(&counter->res); + kfree(counter); +} + +static void auto_mode_init_counter(struct rdma_counter *counter, + const struct ib_qp *qp, + enum rdma_nl_counter_mask new_mask) +{ + struct auto_mode_param *param = &counter->mode.param; + + counter->mode.mode = RDMA_COUNTER_MODE_AUTO; + counter->mode.mask = new_mask; + + if (new_mask & RDMA_COUNTER_MASK_QP_TYPE) + param->qp_type = qp->qp_type; +} + +static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter, + enum rdma_nl_counter_mask auto_mask) +{ + struct auto_mode_param *param = &counter->mode.param; + bool match = true; + + if (rdma_is_kernel_res(&counter->res) != rdma_is_kernel_res(&qp->res)) + return false; + + /* Ensure that counter belong to right PID */ + if (!rdma_is_kernel_res(&counter->res) && + !rdma_is_kernel_res(&qp->res) && + (task_pid_vnr(counter->res.task) != current->pid)) + return false; + + if (auto_mask & RDMA_COUNTER_MASK_QP_TYPE) + match &= (param->qp_type == qp->qp_type); + + return match; +} + +static int __rdma_counter_bind_qp(struct rdma_counter *counter, + struct ib_qp *qp) +{ + int ret; + + if (qp->counter) + return -EINVAL; + + if (!qp->device->ops.counter_bind_qp) + return -EOPNOTSUPP; + + mutex_lock(&counter->lock); + ret = qp->device->ops.counter_bind_qp(counter, qp); + mutex_unlock(&counter->lock); + + return ret; +} + +static int __rdma_counter_unbind_qp(struct ib_qp *qp) +{ + struct rdma_counter *counter = qp->counter; + int ret; + + if (!qp->device->ops.counter_unbind_qp) + return -EOPNOTSUPP; + + mutex_lock(&counter->lock); + ret = qp->device->ops.counter_unbind_qp(qp); + mutex_unlock(&counter->lock); + + return ret; +} + +/** + * rdma_get_counter_auto_mode - Find the counter that @qp should be bound + * with in auto mode + * + * Return: The counter (with ref-count increased) if found + */ +static struct rdma_counter *rdma_get_counter_auto_mode(struct ib_qp *qp, + u8 port) +{ + struct rdma_port_counter *port_counter; + struct rdma_counter *counter = NULL; + struct ib_device *dev = qp->device; + struct rdma_restrack_entry *res; + struct rdma_restrack_root *rt; + unsigned long id = 0; + + port_counter = &dev->port_data[port].port_counter; + rt = &dev->res[RDMA_RESTRACK_COUNTER]; + xa_lock(&rt->xa); + xa_for_each(&rt->xa, id, res) { + if (!rdma_is_visible_in_pid_ns(res)) + continue; + + counter = container_of(res, struct 
rdma_counter, res); + if ((counter->device != qp->device) || (counter->port != port)) + goto next; + + if (auto_mode_match(qp, counter, port_counter->mode.mask)) + break; +next: + counter = NULL; + } + + if (counter && !kref_get_unless_zero(&counter->kref)) + counter = NULL; + + xa_unlock(&rt->xa); + return counter; +} + +static void rdma_counter_res_add(struct rdma_counter *counter, + struct ib_qp *qp) +{ + if (rdma_is_kernel_res(&qp->res)) { + rdma_restrack_set_task(&counter->res, qp->res.kern_name); + rdma_restrack_kadd(&counter->res); + } else { + rdma_restrack_attach_task(&counter->res, qp->res.task); + rdma_restrack_uadd(&counter->res); + } +} + +static void counter_release(struct kref *kref) +{ + struct rdma_counter *counter; + + counter = container_of(kref, struct rdma_counter, kref); + counter->device->ops.counter_dealloc(counter); + rdma_counter_free(counter); +} + +/** + * rdma_counter_bind_qp_auto - Check and bind the QP to a counter base on + * the auto-mode rule + */ +int rdma_counter_bind_qp_auto(struct ib_qp *qp, u8 port) +{ + struct rdma_port_counter *port_counter; + struct ib_device *dev = qp->device; + struct rdma_counter *counter; + int ret; + + if (!rdma_is_port_valid(dev, port)) + return -EINVAL; + + port_counter = &dev->port_data[port].port_counter; + if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO) + return 0; + + counter = rdma_get_counter_auto_mode(qp, port); + if (counter) { + ret = __rdma_counter_bind_qp(counter, qp); + if (ret) { + kref_put(&counter->kref, counter_release); + return ret; + } + } else { + counter = rdma_counter_alloc(dev, port, RDMA_COUNTER_MODE_AUTO); + if (!counter) + return -ENOMEM; + + auto_mode_init_counter(counter, qp, port_counter->mode.mask); + + ret = __rdma_counter_bind_qp(counter, qp); + if (ret) { + rdma_counter_free(counter); + return ret; + } + + rdma_counter_res_add(counter, qp); + } + + return 0; +} + +/** + * rdma_counter_unbind_qp - Unbind a qp from a counter + * @force: + * true - Decrease the counter ref-count anyway (e.g., qp destroy) + */ +int rdma_counter_unbind_qp(struct ib_qp *qp, bool force) +{ + struct rdma_counter *counter = qp->counter; + int ret; + + if (!counter) + return -EINVAL; + + ret = __rdma_counter_unbind_qp(qp); + if (ret && !force) + return ret; + + kref_put(&counter->kref, counter_release); + return 0; +} + void rdma_counter_init(struct ib_device *dev) { struct rdma_port_counter *port_counter; diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 6579865e4866..f3181b74c863 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2471,6 +2471,9 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, alloc_xrcd); SET_DEVICE_OP(dev_ops, attach_mcast); SET_DEVICE_OP(dev_ops, check_mr_status); + SET_DEVICE_OP(dev_ops, counter_bind_qp); + SET_DEVICE_OP(dev_ops, counter_dealloc); + SET_DEVICE_OP(dev_ops, counter_unbind_qp); SET_DEVICE_OP(dev_ops, create_ah); SET_DEVICE_OP(dev_ops, create_counters); SET_DEVICE_OP(dev_ops, create_cq); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 4a04e94a72db..92349bf37589 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1690,6 +1690,14 @@ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, } } + /* + * Bind this qp to a counter automatically based on the rdma counter + * rules. 
This only set in RST2INIT with port specified + */ + if (!qp->counter && (attr_mask & IB_QP_PORT) && + ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT)) + rdma_counter_bind_qp_auto(qp, attr->port_num); + ret = ib_security_modify_qp(qp, attr, attr_mask, udata); if (ret) goto out; @@ -1885,6 +1893,7 @@ int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata) if (!qp->uobject) rdma_rw_cleanup_mrs(qp); + rdma_counter_unbind_qp(qp, true); rdma_restrack_del(&qp->res); ret = qp->device->ops.destroy_qp(qp, udata); if (!ret) { diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 3d19c056fbc0..0205472eb73a 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1698,6 +1698,9 @@ struct ib_qp { * Implementation details of the RDMA core, don't use in drivers: */ struct rdma_restrack_entry res; + + /* The counter the qp is bind to */ + struct rdma_counter *counter; }; struct ib_dm { @@ -2485,6 +2488,21 @@ struct ib_device_ops { u8 pdata_len); int (*iw_create_listen)(struct iw_cm_id *cm_id, int backlog); int (*iw_destroy_listen)(struct iw_cm_id *cm_id); + /** + * counter_bind_qp - Bind a QP to a counter. + * @counter - The counter to be bound. If counter->id is zero then + * the driver needs to allocate a new counter and set counter->id + */ + int (*counter_bind_qp)(struct rdma_counter *counter, struct ib_qp *qp); + /** + * counter_unbind_qp - Unbind the qp from the dynamically-allocated + * counter and bind it onto the default one + */ + int (*counter_unbind_qp)(struct ib_qp *qp); + /** + * counter_dealloc -De-allocate the hw counter + */ + int (*counter_dealloc)(struct rdma_counter *counter); DECLARE_RDMA_OBJ_SIZE(ib_ah); DECLARE_RDMA_OBJ_SIZE(ib_cq); diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h index 8dd2619c015d..9f93a2403c9c 100644 --- a/include/rdma/rdma_counter.h +++ b/include/rdma/rdma_counter.h @@ -7,11 +7,14 @@ #define _RDMA_COUNTER_H_ #include +#include #include #include #include +struct ib_qp; + struct auto_mode_param { int qp_type; }; @@ -31,6 +34,9 @@ struct rdma_counter { struct rdma_restrack_entry res; struct ib_device *device; uint32_t id; + struct kref kref; + struct rdma_counter_mode mode; + struct mutex lock; u8 port; }; @@ -38,5 +44,7 @@ void rdma_counter_init(struct ib_device *dev); void rdma_counter_release(struct ib_device *dev); int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port, bool on, enum rdma_nl_counter_mask mask); +int rdma_counter_bind_qp_auto(struct ib_qp *qp, u8 port); +int rdma_counter_unbind_qp(struct ib_qp *qp, bool force); #endif /* _RDMA_COUNTER_H_ */ From d14133dd41614aaaac1fa0505c7dab01f4211d2c Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 2 Jul 2019 13:02:36 +0300 Subject: [PATCH 152/194] IB/mlx5: Support set qp counter Support binding a QP to a counter. If the counter is null then bind the QP to the default counter. Each QP state requires a different operation: - RESET: Set the counter field so that it takes effect during the RST2INIT change; - RTS: Issue an RTS2RTS change to update the QP counter; - Other: Set the counter field and mark the counter_pending flag; when the QP is moved to RTS state and this flag is set, issue an RTS2RTS modification to update the counter.
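Purely as an illustration of the RTS2RTS update this patch performs (the helper name below is hypothetical, not code from this patch): the counter set id shares the qp_counter_set_usr_page dword with the UAR page index, so the modify command preserves the low 24 bits and writes the set id into the top byte:

  #include <linux/types.h>
  #include <asm/byteorder.h>

  /* Sketch of how the set id is folded into the QP context dword */
  static __be32 example_pack_counter_set_id(__be32 usr_page, u32 set_id)
  {
          return (usr_page & cpu_to_be32(0xffffff)) |
                 cpu_to_be32(set_id << 24);
  }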
Signed-off-by: Mark Zhang Reviewed-by: Majd Dibbiny Acked-by: Saeed Mahameed Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 6 +++ drivers/infiniband/hw/mlx5/qp.c | 76 +++++++++++++++++++++++++++- include/linux/mlx5/qp.h | 1 + 3 files changed, 81 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 7373e9da0919..c482f19958b3 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -439,6 +439,10 @@ struct mlx5_ib_qp { u32 flags_en; /* storage for qp sub type when core qp type is IB_QPT_DRIVER */ enum ib_qp_type qp_sub_type; + /* A flag to indicate if there's a new counter is configured + * but not take effective + */ + u32 counter_pending; }; struct mlx5_ib_cq_buf { @@ -1468,4 +1472,6 @@ void mlx5_ib_put_xlt_emergency_page(void); int bfregn_to_uar_index(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, u32 bfregn, bool dyn_bfreg); + +int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter); #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 8b7a60ada92c..2a97619ed603 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include "mlx5_ib.h" #include "ib_rep.h" @@ -3380,6 +3381,35 @@ static unsigned int get_tx_affinity(struct mlx5_ib_dev *dev, return tx_port_affinity; } +static int __mlx5_ib_qp_set_counter(struct ib_qp *qp, + struct rdma_counter *counter) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_qp *mqp = to_mqp(qp); + struct mlx5_qp_context context = {}; + struct mlx5_ib_port *mibport = NULL; + struct mlx5_ib_qp_base *base; + u32 set_id; + + if (!MLX5_CAP_GEN(dev->mdev, rts2rts_qp_counters_set_id)) + return 0; + + if (counter) { + set_id = counter->id; + } else { + mibport = &dev->port[mqp->port - 1]; + set_id = mibport->cnts.set_id; + } + + base = &mqp->trans_qp.base; + context.qp_counter_set_usr_page &= cpu_to_be32(0xffffff); + context.qp_counter_set_usr_page |= cpu_to_be32(set_id << 24); + return mlx5_core_qp_modify(dev->mdev, + MLX5_CMD_OP_RTS2RTS_QP, + MLX5_QP_OPTPAR_COUNTER_SET_ID, + &context, &base->mqp); +} + static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, @@ -3433,6 +3463,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, struct mlx5_ib_port *mibport = NULL; enum mlx5_qp_state mlx5_cur, mlx5_new; enum mlx5_qp_optpar optpar; + u32 set_id = 0; int mlx5_st; int err; u16 op; @@ -3595,8 +3626,12 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, port_num = 0; mibport = &dev->port[port_num]; + if (ibqp->counter) + set_id = ibqp->counter->id; + else + set_id = mibport->cnts.set_id; context->qp_counter_set_usr_page |= - cpu_to_be32((u32)(mibport->cnts.set_id) << 24); + cpu_to_be32(set_id << 24); } if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) @@ -3624,7 +3659,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, raw_qp_param.operation = op; if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { - raw_qp_param.rq_q_ctr_id = mibport->cnts.set_id; + raw_qp_param.rq_q_ctr_id = set_id; raw_qp_param.set_mask |= MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID; } @@ -3701,6 +3736,12 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, qp->db.db[MLX5_SND_DBR] = 0; } + if ((new_state == IB_QPS_RTS) && qp->counter_pending) { + 
err = __mlx5_ib_qp_set_counter(ibqp, ibqp->counter); + if (!err) + qp->counter_pending = 0; + } + out: kfree(context); return err; @@ -6435,3 +6476,34 @@ void mlx5_ib_drain_rq(struct ib_qp *qp) handle_drain_completion(cq, &rdrain, dev); } + +/** + * Bind a qp to a counter. If @counter is NULL then bind the qp to + * the default counter + */ +int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter) +{ + struct mlx5_ib_qp *mqp = to_mqp(qp); + int err = 0; + + mutex_lock(&mqp->mutex); + if (mqp->state == IB_QPS_RESET) { + qp->counter = counter; + goto out; + } + + if (mqp->state == IB_QPS_RTS) { + err = __mlx5_ib_qp_set_counter(qp, counter); + if (!err) + qp->counter = counter; + + goto out; + } + + mqp->counter_pending = 1; + qp->counter = counter; + +out: + mutex_unlock(&mqp->mutex); + return err; +} diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 937041101504..ae63b1ae9004 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -71,6 +71,7 @@ enum mlx5_qp_optpar { MLX5_QP_OPTPAR_CQN_RCV = 1 << 19, MLX5_QP_OPTPAR_DC_HS = 1 << 20, MLX5_QP_OPTPAR_DC_KEY = 1 << 21, + MLX5_QP_OPTPAR_COUNTER_SET_ID = 1 << 25, }; enum mlx5_qp_state { From 318d535cefecf2227ab20ec6e2961f9830c96e9b Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 2 Jul 2019 13:02:37 +0300 Subject: [PATCH 153/194] IB/mlx5: Add counter set id as a parameter for mlx5_ib_query_q_counters() Add counter set id as a parameter so that this API can be used for querying any q counter. Signed-off-by: Mark Zhang Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 1af36497d54c..ac09706b9407 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -5477,7 +5477,8 @@ static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev, static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev, struct mlx5_ib_port *port, - struct rdma_hw_stats *stats) + struct rdma_hw_stats *stats, + u16 set_id) { int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out); void *out; @@ -5488,9 +5489,7 @@ static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev, if (!out) return -ENOMEM; - ret = mlx5_core_query_q_counter(mdev, - port->cnts.set_id, 0, - out, outlen); + ret = mlx5_core_query_q_counter(mdev, set_id, 0, out, outlen); if (ret) goto free; @@ -5550,7 +5549,8 @@ static int mlx5_ib_get_hw_stats(struct ib_device *ibdev, port->cnts.num_ext_ppcnt_counters; /* q_counters are per IB device, query the master mdev */ - ret = mlx5_ib_query_q_counters(dev->mdev, port, stats); + ret = mlx5_ib_query_q_counters(dev->mdev, port, stats, + port->cnts.set_id); if (ret) return ret; From 45842fc627c7f0e55ad41613b64055266c710ac7 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 2 Jul 2019 13:02:38 +0300 Subject: [PATCH 154/194] IB/mlx5: Support statistic q counter configuration Add support for ib callbacks counter_bind_qp(), counter_unbind_qp() and counter_dealloc(). 
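For context, the contract behind these callbacks, shown as a hedged sketch for a hypothetical driver "foo" (not code from this patch): counter_bind_qp() allocates a hardware counter on first use (counter->id == 0) and attaches the QP to it, counter_unbind_qp() re-attaches the QP to the port's default counter, and counter_dealloc() frees the hardware counter once it is unreferenced.

  #include <rdma/ib_verbs.h>
  #include <rdma/rdma_counter.h>

  static int foo_counter_bind_qp(struct rdma_counter *counter, struct ib_qp *qp)
  {
          /* allocate a hw counter if counter->id == 0, then attach the QP */
          return 0;
  }

  static int foo_counter_unbind_qp(struct ib_qp *qp)
  {
          /* point the QP back at the port's default counter */
          return 0;
  }

  static int foo_counter_dealloc(struct rdma_counter *counter)
  {
          /* release the hw counter identified by counter->id */
          return 0;
  }

  static const struct ib_device_ops foo_stats_ops = {
          .counter_bind_qp = foo_counter_bind_qp,
          .counter_unbind_qp = foo_counter_unbind_qp,
          .counter_dealloc = foo_counter_dealloc,
  };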
Signed-off-by: Mark Zhang Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 44 +++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index ac09706b9407..2914eab7c480 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -5586,6 +5586,47 @@ done: return num_counters; } +static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, + struct ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + u16 cnt_set_id = 0; + int err; + + if (!counter->id) { + err = mlx5_cmd_alloc_q_counter(dev->mdev, + &cnt_set_id, + MLX5_SHARED_RESOURCE_UID); + if (err) + return err; + counter->id = cnt_set_id; + } + + err = mlx5_ib_qp_set_counter(qp, counter); + if (err) + goto fail_set_counter; + + return 0; + +fail_set_counter: + mlx5_core_dealloc_q_counter(dev->mdev, cnt_set_id); + counter->id = 0; + + return err; +} + +static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp) +{ + return mlx5_ib_qp_set_counter(qp, NULL); +} + +static int mlx5_ib_counter_dealloc(struct rdma_counter *counter) +{ + struct mlx5_ib_dev *dev = to_mdev(counter->device); + + return mlx5_core_dealloc_q_counter(dev->mdev, counter->id); +} + static int mlx5_ib_rn_get_params(struct ib_device *device, u8 port_num, enum rdma_netdev_t type, struct rdma_netdev_alloc_params *params) @@ -6509,6 +6550,9 @@ static void mlx5_ib_stage_odp_cleanup(struct mlx5_ib_dev *dev) static const struct ib_device_ops mlx5_ib_dev_hw_stats_ops = { .alloc_hw_stats = mlx5_ib_alloc_hw_stats, .get_hw_stats = mlx5_ib_get_hw_stats, + .counter_bind_qp = mlx5_ib_counter_bind_qp, + .counter_unbind_qp = mlx5_ib_counter_unbind_qp, + .counter_dealloc = mlx5_ib_counter_dealloc, }; static int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev) From b47ae6f803b727952dfb37afd83e51c465147b85 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 2 Jul 2019 13:02:39 +0300 Subject: [PATCH 155/194] RDMA/nldev: Allow counter auto mode configuration through RDMA netlink Provide an option to enable/disable per-port counter auto mode through RDMA netlink. Limit it to users with ADMIN capability only.
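To make the wire format concrete, here is a hedged userspace sketch (not part of this patch) that sends the new RDMA_NLDEV_CMD_STAT_SET request. It assumes libmnl for message construction; the device index 0 and port 1 are placeholders, and reading the kernel's ack is omitted. Command and attribute names come from the uapi header extended by this patch:

  #include <libmnl/libmnl.h>
  #include <linux/netlink.h>
  #include <rdma/rdma_netlink.h>

  static int example_enable_auto_mode(void)
  {
          char buf[MNL_SOCKET_BUFFER_SIZE];
          struct mnl_socket *nl;
          struct nlmsghdr *nlh;
          int ret = 0;

          nlh = mnl_nlmsg_put_header(buf);
          nlh->nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
                                             RDMA_NLDEV_CMD_STAT_SET);
          nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;

          mnl_attr_put_u32(nlh, RDMA_NLDEV_ATTR_DEV_INDEX, 0);
          mnl_attr_put_u32(nlh, RDMA_NLDEV_ATTR_PORT_INDEX, 1);
          mnl_attr_put_u32(nlh, RDMA_NLDEV_ATTR_STAT_RES, RDMA_NLDEV_ATTR_RES_QP);
          mnl_attr_put_u32(nlh, RDMA_NLDEV_ATTR_STAT_MODE, RDMA_COUNTER_MODE_AUTO);
          mnl_attr_put_u32(nlh, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK,
                           RDMA_COUNTER_MASK_QP_TYPE);

          nl = mnl_socket_open(NETLINK_RDMA);
          if (!nl)
                  return -1;
          if (mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID) < 0 ||
              mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0)
                  ret = -1;
          mnl_socket_close(nl);
          return ret;
  }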
Signed-off-by: Mark Zhang Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 78 ++++++++++++++++++++++++++++++++ include/uapi/rdma/rdma_netlink.h | 8 ++++ 2 files changed, 86 insertions(+) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index d9ebfb50962b..9a4cf285f447 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -126,6 +126,9 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_RES_USECNT] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_MODE] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_RES] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID] = { .type = NLA_U32 }, [RDMA_NLDEV_NET_NS_FD] = { .type = NLA_U32 }, @@ -1482,6 +1485,78 @@ static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, return err; } +static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + u32 index, port, mode, mask = 0; + struct ib_device *device; + struct sk_buff *msg; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + /* Currently only counter for QP is supported */ + if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] || + !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || + !tb[RDMA_NLDEV_ATTR_PORT_INDEX] || !tb[RDMA_NLDEV_ATTR_STAT_MODE]) + return -EINVAL; + + if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_STAT_SET), + 0, 0); + + mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]); + if (mode != RDMA_COUNTER_MODE_AUTO) { + ret = -EMSGSIZE; + goto err_msg; + } + + if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]) + mask = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]); + + ret = rdma_counter_set_auto_mode(device, port, + mask ? 
true : false, mask); + if (ret) + goto err_msg; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask)) { + ret = -EMSGSIZE; + goto err_msg; + } + + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + +err_msg: + nlmsg_free(msg); +err: + ib_device_put(device); + return ret; +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -1535,6 +1610,9 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { }, [RDMA_NLDEV_CMD_SYS_SET] = { .doit = nldev_set_sys_set_doit, + }, + [RDMA_NLDEV_CMD_STAT_SET] = { + .doit = nldev_stat_set_doit, .flags = RDMA_NL_ADMIN_PERM, }, }; diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index e3cd912e9cef..0cb47d23fd86 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -281,6 +281,8 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_GET_CHARDEV, + RDMA_NLDEV_CMD_STAT_SET, + RDMA_NLDEV_NUM_OPS }; @@ -488,6 +490,12 @@ enum rdma_nldev_attr { * File descriptor handle of the net namespace object */ RDMA_NLDEV_NET_NS_FD, /* u32 */ + /* + * Counter-specific attributes. + */ + RDMA_NLDEV_ATTR_STAT_MODE, /* u32 */ + RDMA_NLDEV_ATTR_STAT_RES, /* u32 */ + RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, /* u32 */ /* * Information about a chardev. From c4ffee7c9bdba7b189df3251e375c4c7e93a91ac Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 2 Jul 2019 13:02:40 +0300 Subject: [PATCH 156/194] RDMA/netlink: Implement counter dumpit callback This patch adds the ability to return all available counters together with their properties and hwstats. Signed-off-by: Mark Zhang Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/counters.c | 26 +++- drivers/infiniband/core/device.c | 2 + drivers/infiniband/core/nldev.c | 213 +++++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 10 ++ include/rdma/rdma_counter.h | 3 + include/uapi/rdma/rdma_netlink.h | 22 ++- 6 files changed, 268 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c index 615ee731a1de..3741b9e5126a 100644 --- a/drivers/infiniband/core/counters.c +++ b/drivers/infiniband/core/counters.c @@ -59,7 +59,7 @@ static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port, { struct rdma_counter *counter; - if (!dev->ops.counter_dealloc) + if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats) return NULL; counter = kzalloc(sizeof(*counter), GFP_KERNEL); @@ -69,16 +69,25 @@ static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port, counter->device = dev; counter->port = port; counter->res.type = RDMA_RESTRACK_COUNTER; + counter->stats = dev->ops.counter_alloc_stats(counter); + if (!counter->stats) + goto err_stats; + counter->mode.mode = mode; kref_init(&counter->kref); mutex_init(&counter->lock); return counter; + +err_stats: + kfree(counter); + return NULL; } static void rdma_counter_free(struct rdma_counter *counter) { rdma_restrack_del(&counter->res); + kfree(counter->stats); kfree(counter); } @@ -275,6 +284,21 @@ int rdma_counter_unbind_qp(struct ib_qp *qp, bool force) return 0; } +int rdma_counter_query_stats(struct rdma_counter *counter) +{ + struct ib_device *dev = counter->device; + int ret; + + if (!dev->ops.counter_update_stats) + return -EINVAL; + + mutex_lock(&counter->lock); + ret =
dev->ops.counter_update_stats(counter); + mutex_unlock(&counter->lock); + + return ret; +} + void rdma_counter_init(struct ib_device *dev) { struct rdma_port_counter *port_counter; diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index f3181b74c863..bdf61499e6d5 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2471,9 +2471,11 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, alloc_xrcd); SET_DEVICE_OP(dev_ops, attach_mcast); SET_DEVICE_OP(dev_ops, check_mr_status); + SET_DEVICE_OP(dev_ops, counter_alloc_stats); SET_DEVICE_OP(dev_ops, counter_bind_qp); SET_DEVICE_OP(dev_ops, counter_dealloc); SET_DEVICE_OP(dev_ops, counter_unbind_qp); + SET_DEVICE_OP(dev_ops, counter_update_stats); SET_DEVICE_OP(dev_ops, create_ah); SET_DEVICE_OP(dev_ops, create_counters); SET_DEVICE_OP(dev_ops, create_cq); diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 9a4cf285f447..cebc15b23b15 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -129,6 +129,13 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_STAT_MODE] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_STAT_RES] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_COUNTER] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_STAT_COUNTER_ID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTERS] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME] = { .type = NLA_NUL_STRING }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID] = { .type = NLA_U32 }, [RDMA_NLDEV_NET_NS_FD] = { .type = NLA_U32 }, @@ -636,6 +643,152 @@ static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin, err: return -EMSGSIZE; } +static int fill_stat_counter_mode(struct sk_buff *msg, + struct rdma_counter *counter) +{ + struct rdma_counter_mode *m = &counter->mode; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, m->mode)) + return -EMSGSIZE; + + if (m->mode == RDMA_COUNTER_MODE_AUTO) + if ((m->mask & RDMA_COUNTER_MASK_QP_TYPE) && + nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, m->param.qp_type)) + return -EMSGSIZE; + + return 0; +} + +static int fill_stat_counter_qp_entry(struct sk_buff *msg, u32 qpn) +{ + struct nlattr *entry_attr; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY); + if (!entry_attr) + return -EMSGSIZE; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) + goto err; + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); + return -EMSGSIZE; +} + +static int fill_stat_counter_qps(struct sk_buff *msg, + struct rdma_counter *counter) +{ + struct rdma_restrack_entry *res; + struct rdma_restrack_root *rt; + struct nlattr *table_attr; + struct ib_qp *qp = NULL; + unsigned long id = 0; + int ret = 0; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP); + + rt = &counter->device->res[RDMA_RESTRACK_QP]; + xa_lock(&rt->xa); + xa_for_each(&rt->xa, id, res) { + if (!rdma_is_visible_in_pid_ns(res)) + continue; + + qp = container_of(res, struct ib_qp, res); + if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW)) + continue; + + if (!qp->counter || (qp->counter->id 
!= counter->id)) + continue; + + ret = fill_stat_counter_qp_entry(msg, qp->qp_num); + if (ret) + goto err; + } + + xa_unlock(&rt->xa); + nla_nest_end(msg, table_attr); + return 0; + +err: + xa_unlock(&rt->xa); + nla_nest_cancel(msg, table_attr); + return ret; +} + +static int fill_stat_hwcounter_entry(struct sk_buff *msg, + const char *name, u64 value) +{ + struct nlattr *entry_attr; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY); + if (!entry_attr) + return -EMSGSIZE; + + if (nla_put_string(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME, + name)) + goto err; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE, + value, RDMA_NLDEV_ATTR_PAD)) + goto err; + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); + return -EMSGSIZE; +} + +static int fill_stat_counter_hwcounters(struct sk_buff *msg, + struct rdma_counter *counter) +{ + struct rdma_hw_stats *st = counter->stats; + struct nlattr *table_attr; + int i; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS); + if (!table_attr) + return -EMSGSIZE; + + for (i = 0; i < st->num_counters; i++) + if (fill_stat_hwcounter_entry(msg, st->names[i], st->value[i])) + goto err; + + nla_nest_end(msg, table_attr); + return 0; + +err: + nla_nest_cancel(msg, table_attr); + return -EMSGSIZE; +} + +static int fill_res_counter_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, + uint32_t port) +{ + struct rdma_counter *counter = + container_of(res, struct rdma_counter, res); + + if (port && port != counter->port) + return 0; + + /* Dump it even query failed */ + rdma_counter_query_stats(counter); + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, counter->port) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, counter->id) || + fill_res_name_pid(msg, &counter->res) || + fill_stat_counter_mode(msg, counter) || + fill_stat_counter_qps(msg, counter) || + fill_stat_counter_hwcounters(msg, counter)) + return -EMSGSIZE; + + return 0; +} + static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -1003,6 +1156,13 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { .entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY, .id = RDMA_NLDEV_ATTR_RES_PDN, }, + [RDMA_RESTRACK_COUNTER] = { + .fill_res_func = fill_res_counter_entry, + .nldev_cmd = RDMA_NLDEV_CMD_STAT_GET, + .nldev_attr = RDMA_NLDEV_ATTR_STAT_COUNTER, + .entry = RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY, + .id = RDMA_NLDEV_ATTR_STAT_COUNTER_ID, + }, }; static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -1239,6 +1399,7 @@ RES_GET_FUNCS(cm_id, RDMA_RESTRACK_CM_ID); RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ); RES_GET_FUNCS(pd, RDMA_RESTRACK_PD); RES_GET_FUNCS(mr, RDMA_RESTRACK_MR); +RES_GET_FUNCS(counter, RDMA_RESTRACK_COUNTER); static LIST_HEAD(link_ops); static DECLARE_RWSEM(link_ops_rwsem); @@ -1557,6 +1718,54 @@ err: return ret; } +static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES]) + return -EINVAL; + + switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) { + case RDMA_NLDEV_ATTR_RES_QP: + ret = nldev_res_get_counter_doit(skb, nlh, extack); + break; + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int nldev_stat_get_dumpit(struct sk_buff 
*skb, + struct netlink_callback *cb) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + int ret; + + ret = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NULL); + if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES]) + return -EINVAL; + + switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) { + case RDMA_NLDEV_ATTR_RES_QP: + ret = nldev_res_get_counter_dumpit(skb, cb); + break; + + default: + ret = -EINVAL; + break; + } + + return ret; +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -1615,6 +1824,10 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_stat_set_doit, .flags = RDMA_NL_ADMIN_PERM, }, + [RDMA_NLDEV_CMD_STAT_GET] = { + .doit = nldev_stat_get_doit, + .dump = nldev_stat_get_dumpit, + }, }; void __init nldev_init(void) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 0205472eb73a..0c5151a12ae4 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2503,6 +2503,16 @@ struct ib_device_ops { * counter_dealloc -De-allocate the hw counter */ int (*counter_dealloc)(struct rdma_counter *counter); + /** + * counter_alloc_stats - Allocate a struct rdma_hw_stats and fill in + * the driver initialized data. + */ + struct rdma_hw_stats *(*counter_alloc_stats)( + struct rdma_counter *counter); + /** + * counter_update_stats - Query the stats value of this counter + */ + int (*counter_update_stats)(struct rdma_counter *counter); DECLARE_RDMA_OBJ_SIZE(ib_ah); DECLARE_RDMA_OBJ_SIZE(ib_cq); diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h index 9f93a2403c9c..f2a5c8efc404 100644 --- a/include/rdma/rdma_counter.h +++ b/include/rdma/rdma_counter.h @@ -37,6 +37,7 @@ struct rdma_counter { struct kref kref; struct rdma_counter_mode mode; struct mutex lock; + struct rdma_hw_stats *stats; u8 port; }; @@ -47,4 +48,6 @@ int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port, int rdma_counter_bind_qp_auto(struct ib_qp *qp, u8 port); int rdma_counter_unbind_qp(struct ib_qp *qp, bool force); +int rdma_counter_query_stats(struct rdma_counter *counter); + #endif /* _RDMA_COUNTER_H_ */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 0cb47d23fd86..18dd88c0add8 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -283,6 +283,8 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_STAT_SET, + RDMA_NLDEV_CMD_STAT_GET, /* can dump */ + RDMA_NLDEV_NUM_OPS }; @@ -490,13 +492,6 @@ enum rdma_nldev_attr { * File descriptor handle of the net namespace object */ RDMA_NLDEV_NET_NS_FD, /* u32 */ - /* - * Counter-specific attributes. - */ - RDMA_NLDEV_ATTR_STAT_MODE, /* u32 */ - RDMA_NLDEV_ATTR_STAT_RES, /* u32 */ - RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, /* u32 */ - /* * Information about a chardev. * CHARDEV_TYPE is the name of the chardev ABI (ie uverbs, umad, etc) @@ -509,6 +504,19 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_CHARDEV_ABI, /* u64 */ RDMA_NLDEV_ATTR_CHARDEV, /* u64 */ RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID, /* u64 */ + /* + * Counter-specific attributes. 
+ */ + RDMA_NLDEV_ATTR_STAT_MODE, /* u32 */ + RDMA_NLDEV_ATTR_STAT_RES, /* u32 */ + RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, /* u32 */ + RDMA_NLDEV_ATTR_STAT_COUNTER, /* nested table */ + RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY, /* nested table */ + RDMA_NLDEV_ATTR_STAT_COUNTER_ID, /* u32 */ + RDMA_NLDEV_ATTR_STAT_HWCOUNTERS, /* nested table */ + RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY, /* nested table */ + RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME, /* string */ + RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE, /* u64 */ /* * Always the end From 18d422ce8ccf47c65b98c2ce9e1758d84c8434eb Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 2 Jul 2019 13:02:41 +0300 Subject: [PATCH 157/194] IB/mlx5: Add counter_alloc_stats() and counter_update_stats() support Add support for ib callback counter_alloc_stats() and counter_update_stats(). Signed-off-by: Mark Zhang Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 2914eab7c480..7581571bd9cd 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -5586,6 +5586,27 @@ done: return num_counters; } +static struct rdma_hw_stats * +mlx5_ib_counter_alloc_stats(struct rdma_counter *counter) +{ + struct mlx5_ib_dev *dev = to_mdev(counter->device); + struct mlx5_ib_port *port = &dev->port[counter->port - 1]; + + /* Q counters are in the beginning of all counters */ + return rdma_alloc_hw_stats_struct(port->cnts.names, + port->cnts.num_q_counters, + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +static int mlx5_ib_counter_update_stats(struct rdma_counter *counter) +{ + struct mlx5_ib_dev *dev = to_mdev(counter->device); + struct mlx5_ib_port *port = &dev->port[counter->port - 1]; + + return mlx5_ib_query_q_counters(dev->mdev, port, + counter->stats, counter->id); +} + static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, struct ib_qp *qp) { @@ -6553,6 +6574,8 @@ static const struct ib_device_ops mlx5_ib_dev_hw_stats_ops = { .counter_bind_qp = mlx5_ib_counter_bind_qp, .counter_unbind_qp = mlx5_ib_counter_unbind_qp, .counter_dealloc = mlx5_ib_counter_dealloc, + .counter_alloc_stats = mlx5_ib_counter_alloc_stats, + .counter_update_stats = mlx5_ib_counter_update_stats, }; static int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev) From f34a55e497e81347ffbdc6e828f123520d33ce5d Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 2 Jul 2019 13:02:42 +0300 Subject: [PATCH 158/194] RDMA/core: Get sum value of all counters when perform a sysfs stat read Since a QP can only be bound to one counter, then if it is bound to a separate counter, for backward compatibility purpose, the statistic value must be: * stat of default counter + stat of all running allocated counters + stat of all deallocated counters (history stats) Signed-off-by: Mark Zhang Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/counters.c | 89 ++++++++++++++++++++++++++++++ drivers/infiniband/core/sysfs.c | 10 +++- include/rdma/rdma_counter.h | 2 + 3 files changed, 98 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c index 3741b9e5126a..8810a8a8d1f5 100644 --- a/drivers/infiniband/core/counters.c +++ b/drivers/infiniband/core/counters.c @@ -158,6 +158,20 @@ static int __rdma_counter_unbind_qp(struct ib_qp *qp) return ret; } 
+static void counter_history_stat_update(const struct rdma_counter *counter) +{ + struct ib_device *dev = counter->device; + struct rdma_port_counter *port_counter; + int i; + + port_counter = &dev->port_data[counter->port].port_counter; + if (!port_counter->hstats) + return; + + for (i = 0; i < counter->stats->num_counters; i++) + port_counter->hstats->value[i] += counter->stats->value[i]; +} + /** * rdma_get_counter_auto_mode - Find the counter that @qp should be bound * with in auto mode @@ -215,6 +229,7 @@ static void counter_release(struct kref *kref) struct rdma_counter *counter; counter = container_of(kref, struct rdma_counter, kref); + counter_history_stat_update(counter); counter->device->ops.counter_dealloc(counter); rdma_counter_free(counter); } @@ -299,6 +314,55 @@ int rdma_counter_query_stats(struct rdma_counter *counter) return ret; } +static u64 get_running_counters_hwstat_sum(struct ib_device *dev, + u8 port, u32 index) +{ + struct rdma_restrack_entry *res; + struct rdma_restrack_root *rt; + struct rdma_counter *counter; + unsigned long id = 0; + u64 sum = 0; + + rt = &dev->res[RDMA_RESTRACK_COUNTER]; + xa_lock(&rt->xa); + xa_for_each(&rt->xa, id, res) { + if (!rdma_restrack_get(res)) + continue; + + xa_unlock(&rt->xa); + + counter = container_of(res, struct rdma_counter, res); + if ((counter->device != dev) || (counter->port != port) || + rdma_counter_query_stats(counter)) + goto next; + + sum += counter->stats->value[index]; + +next: + xa_lock(&rt->xa); + rdma_restrack_put(res); + } + + xa_unlock(&rt->xa); + return sum; +} + +/** + * rdma_counter_get_hwstat_value() - Get the sum value of all counters on a + * specific port, including the running ones and history data + */ +u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index) +{ + struct rdma_port_counter *port_counter; + u64 sum; + + port_counter = &dev->port_data[port].port_counter; + sum = get_running_counters_hwstat_sum(dev, port, index); + sum += port_counter->hstats->value[index]; + + return sum; +} + void rdma_counter_init(struct ib_device *dev) { struct rdma_port_counter *port_counter; @@ -311,9 +375,34 @@ void rdma_counter_init(struct ib_device *dev) port_counter = &dev->port_data[port].port_counter; port_counter->mode.mode = RDMA_COUNTER_MODE_NONE; mutex_init(&port_counter->lock); + + port_counter->hstats = dev->ops.alloc_hw_stats(dev, port); + if (!port_counter->hstats) + goto fail; } + + return; + +fail: + rdma_for_each_port(dev, port) { + port_counter = &dev->port_data[port].port_counter; + kfree(port_counter->hstats); + port_counter->hstats = NULL; + } + + return; } void rdma_counter_release(struct ib_device *dev) { + struct rdma_port_counter *port_counter; + u32 port; + + if (!dev->ops.alloc_hw_stats) + return; + + rdma_for_each_port(dev, port) { + port_counter = &dev->port_data[port].port_counter; + kfree(port_counter->hstats); + } } diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index c78d0c9646ae..c59b80e0a740 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -43,6 +43,7 @@ #include #include #include +#include struct ib_port; @@ -800,9 +801,12 @@ static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats, return 0; } -static ssize_t print_hw_stat(struct rdma_hw_stats *stats, int index, char *buf) +static ssize_t print_hw_stat(struct ib_device *dev, int port_num, + struct rdma_hw_stats *stats, int index, char *buf) { - return sprintf(buf, "%llu\n", stats->value[index]); + u64 v = 
rdma_counter_get_hwstat_value(dev, port_num, index); + + return sprintf(buf, "%llu\n", stats->value[index] + v); } static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr, @@ -828,7 +832,7 @@ static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr, ret = update_hw_stats(dev, stats, hsa->port_num, hsa->index); if (ret) goto unlock; - ret = print_hw_stat(stats, hsa->index, buf); + ret = print_hw_stat(dev, hsa->port_num, stats, hsa->index, buf); unlock: mutex_unlock(&stats->lock); diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h index f2a5c8efc404..bf2c3578768f 100644 --- a/include/rdma/rdma_counter.h +++ b/include/rdma/rdma_counter.h @@ -27,6 +27,7 @@ struct rdma_counter_mode { struct rdma_port_counter { struct rdma_counter_mode mode; + struct rdma_hw_stats *hstats; struct mutex lock; }; @@ -49,5 +50,6 @@ int rdma_counter_bind_qp_auto(struct ib_qp *qp, u8 port); int rdma_counter_unbind_qp(struct ib_qp *qp, bool force); int rdma_counter_query_stats(struct rdma_counter *counter); +u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index); #endif /* _RDMA_COUNTER_H_ */ From 1bd8e0a9d0fd1be03d2833a0c15ac676bdf275d8 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 2 Jul 2019 13:02:43 +0300 Subject: [PATCH 159/194] RDMA/counter: Allow manual mode configuration support In manual mode a QP is bound to a counter manually. If counter is not specified then a new one will be allocated. Manual mode is enabled when user binds a QP, and disabled when the last manually bound QP is unbound. When auto-mode is turned off and there are counters left, manual mode is enabled so that the user is able to access these counters. Signed-off-by: Mark Zhang Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/counters.c | 219 ++++++++++++++++++++++++++++- include/rdma/rdma_counter.h | 7 + include/uapi/rdma/rdma_netlink.h | 6 + 3 files changed, 229 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c index 8810a8a8d1f5..0ebe36e9fa7b 100644 --- a/drivers/infiniband/core/counters.c +++ b/drivers/infiniband/core/counters.c @@ -27,7 +27,9 @@ static int __counter_set_mode(struct rdma_counter_mode *curr, /** * rdma_counter_set_auto_mode() - Turn on/off per-port auto mode * - * When @on is true, the @mask must be set + * When @on is true, the @mask must be set; When @on is false, it goes + * into manual mode if there's any counter, so that the user is able to + * manually access them. 
*/ int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port, bool on, enum rdma_nl_counter_mask mask) @@ -45,8 +47,13 @@ int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port, ret = -EINVAL; goto out; } - ret = __counter_set_mode(&port_counter->mode, - RDMA_COUNTER_MODE_NONE, 0); + + if (port_counter->num_counters) + ret = __counter_set_mode(&port_counter->mode, + RDMA_COUNTER_MODE_MANUAL, 0); + else + ret = __counter_set_mode(&port_counter->mode, + RDMA_COUNTER_MODE_NONE, 0); } out: @@ -57,7 +64,9 @@ out: static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port, enum rdma_nl_counter_mode mode) { + struct rdma_port_counter *port_counter; struct rdma_counter *counter; + int ret; if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats) return NULL; @@ -73,12 +82,27 @@ static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port, if (!counter->stats) goto err_stats; + port_counter = &dev->port_data[port].port_counter; + mutex_lock(&port_counter->lock); + if (mode == RDMA_COUNTER_MODE_MANUAL) { + ret = __counter_set_mode(&port_counter->mode, + RDMA_COUNTER_MODE_MANUAL, 0); + if (ret) + goto err_mode; + } + + port_counter->num_counters++; + mutex_unlock(&port_counter->lock); + counter->mode.mode = mode; kref_init(&counter->kref); mutex_init(&counter->lock); return counter; +err_mode: + mutex_unlock(&port_counter->lock); + kfree(counter->stats); err_stats: kfree(counter); return NULL; @@ -86,6 +110,18 @@ err_stats: static void rdma_counter_free(struct rdma_counter *counter) { + struct rdma_port_counter *port_counter; + + port_counter = &counter->device->port_data[counter->port].port_counter; + mutex_lock(&port_counter->lock); + port_counter->num_counters--; + if (!port_counter->num_counters && + (port_counter->mode.mode == RDMA_COUNTER_MODE_MANUAL)) + __counter_set_mode(&port_counter->mode, RDMA_COUNTER_MODE_NONE, + 0); + + mutex_unlock(&port_counter->lock); + rdma_restrack_del(&counter->res); kfree(counter->stats); kfree(counter); @@ -363,6 +399,183 @@ u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index) return sum; } +static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 qp_num) +{ + struct rdma_restrack_entry *res = NULL; + struct ib_qp *qp = NULL; + + res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_QP, qp_num); + if (IS_ERR(res)) + return NULL; + + if (!rdma_is_visible_in_pid_ns(res)) + goto err; + + qp = container_of(res, struct ib_qp, res); + if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW)) + goto err; + + return qp; + +err: + rdma_restrack_put(&qp->res); + return NULL; +} + +static int rdma_counter_bind_qp_manual(struct rdma_counter *counter, + struct ib_qp *qp) +{ + if ((counter->device != qp->device) || (counter->port != qp->port)) + return -EINVAL; + + return __rdma_counter_bind_qp(counter, qp); +} + +static struct rdma_counter *rdma_get_counter_by_id(struct ib_device *dev, + u32 counter_id) +{ + struct rdma_restrack_entry *res; + struct rdma_counter *counter; + + res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_COUNTER, counter_id); + if (IS_ERR(res)) + return NULL; + + if (!rdma_is_visible_in_pid_ns(res)) { + rdma_restrack_put(res); + return NULL; + } + + counter = container_of(res, struct rdma_counter, res); + kref_get(&counter->kref); + rdma_restrack_put(res); + + return counter; +} + +/** + * rdma_counter_bind_qpn() - Bind QP @qp_num to counter @counter_id + */ +int rdma_counter_bind_qpn(struct ib_device *dev, u8 port, + u32 qp_num, u32 counter_id) +{ + struct rdma_counter 
*counter; + struct ib_qp *qp; + int ret; + + qp = rdma_counter_get_qp(dev, qp_num); + if (!qp) + return -ENOENT; + + counter = rdma_get_counter_by_id(dev, counter_id); + if (!counter) { + ret = -ENOENT; + goto err; + } + + if (counter->res.task != qp->res.task) { + ret = -EINVAL; + goto err_task; + } + + ret = rdma_counter_bind_qp_manual(counter, qp); + if (ret) + goto err_task; + + rdma_restrack_put(&qp->res); + return 0; + +err_task: + kref_put(&counter->kref, counter_release); +err: + rdma_restrack_put(&qp->res); + return ret; +} + +/** + * rdma_counter_bind_qpn_alloc() - Alloc a counter and bind QP @qp_num to it + * The id of new counter is returned in @counter_id + */ +int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port, + u32 qp_num, u32 *counter_id) +{ + struct rdma_counter *counter; + struct ib_qp *qp; + int ret; + + if (!rdma_is_port_valid(dev, port)) + return -EINVAL; + + qp = rdma_counter_get_qp(dev, qp_num); + if (!qp) + return -ENOENT; + + if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) { + ret = -EINVAL; + goto err; + } + + counter = rdma_counter_alloc(dev, port, RDMA_COUNTER_MODE_MANUAL); + if (!counter) { + ret = -ENOMEM; + goto err; + } + + ret = rdma_counter_bind_qp_manual(counter, qp); + if (ret) + goto err_bind; + + if (counter_id) + *counter_id = counter->id; + + rdma_counter_res_add(counter, qp); + + rdma_restrack_put(&qp->res); + return ret; + +err_bind: + rdma_counter_free(counter); +err: + rdma_restrack_put(&qp->res); + return ret; +} + +/** + * rdma_counter_unbind_qpn() - Unbind QP @qp_num from a counter + */ +int rdma_counter_unbind_qpn(struct ib_device *dev, u8 port, + u32 qp_num, u32 counter_id) +{ + struct rdma_port_counter *port_counter; + struct ib_qp *qp; + int ret; + + if (!rdma_is_port_valid(dev, port)) + return -EINVAL; + + qp = rdma_counter_get_qp(dev, qp_num); + if (!qp) + return -ENOENT; + + if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) { + ret = -EINVAL; + goto out; + } + + port_counter = &dev->port_data[port].port_counter; + if (!qp->counter || qp->counter->id != counter_id || + port_counter->mode.mode != RDMA_COUNTER_MODE_MANUAL) { + ret = -EINVAL; + goto out; + } + + ret = rdma_counter_unbind_qp(qp, false); + +out: + rdma_restrack_put(&qp->res); + return ret; +} + void rdma_counter_init(struct ib_device *dev) { struct rdma_port_counter *port_counter; diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h index bf2c3578768f..6603e10eb352 100644 --- a/include/rdma/rdma_counter.h +++ b/include/rdma/rdma_counter.h @@ -28,6 +28,7 @@ struct rdma_counter_mode { struct rdma_port_counter { struct rdma_counter_mode mode; struct rdma_hw_stats *hstats; + unsigned int num_counters; struct mutex lock; }; @@ -51,5 +52,11 @@ int rdma_counter_unbind_qp(struct ib_qp *qp, bool force); int rdma_counter_query_stats(struct rdma_counter *counter); u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index); +int rdma_counter_bind_qpn(struct ib_device *dev, u8 port, + u32 qp_num, u32 counter_id); +int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port, + u32 qp_num, u32 *counter_id); +int rdma_counter_unbind_qpn(struct ib_device *dev, u8 port, + u32 qp_num, u32 counter_id); #endif /* _RDMA_COUNTER_H_ */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 18dd88c0add8..ec86fab3d040 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -536,6 +536,12 @@ enum rdma_nl_counter_mode { */ RDMA_COUNTER_MODE_AUTO, + /* + * Which qp 
are bound with which counter is explicitly specified + * by the user + */ + RDMA_COUNTER_MODE_MANUAL, + /* * Always the end */ From b389327df90530d47931d0f5616b5cd6abb96c96 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 2 Jul 2019 13:02:44 +0300 Subject: [PATCH 160/194] RDMA/nldev: Allow counter manual mode configuration through RDMA netlink Provide an option to allow users to manually bind a QP to a counter through RDMA netlink. Limit it to users with ADMIN capability only. Signed-off-by: Mark Zhang Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 117 ++++++++++++++++++++++++++----- include/rdma/rdma_counter.h | 3 + include/uapi/rdma/rdma_netlink.h | 2 + 3 files changed, 106 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index cebc15b23b15..3d750eca53d5 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -1649,8 +1649,8 @@ static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + u32 index, port, mode, mask = 0, qpn, cntn = 0; struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; - u32 index, port, mode, mask = 0; struct ib_device *device; struct sk_buff *msg; int ret; @@ -1688,29 +1688,42 @@ static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, 0, 0); mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]); - if (mode != RDMA_COUNTER_MODE_AUTO) { - ret = -EMSGSIZE; - goto err_msg; - } + if (mode == RDMA_COUNTER_MODE_AUTO) { + if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]) + mask = nla_get_u32( + tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]); - if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]) + mask = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]); + ret = rdma_counter_set_auto_mode(device, port, + mask ? true : false, mask); + if (ret) + goto err_msg; + } else { + qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]); + if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) { + cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]); + ret = rdma_counter_bind_qpn(device, port, qpn, cntn); + } else { + ret = rdma_counter_bind_qpn_alloc(device, port, + qpn, &cntn); + } + if (ret) + goto err_msg; - ret = rdma_counter_set_auto_mode(device, port, - mask ?
true : false, mask); - if (ret) - goto err_msg; - - if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode) || - nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask)) { - ret = -EMSGSIZE; - goto err_msg; + if (fill_nldev_handle(msg, device) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) { + ret = -EMSGSIZE; + goto err_fill; + } } nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); +err_fill: + rdma_counter_unbind_qpn(device, port, qpn, cntn); err_msg: nlmsg_free(msg); err: @@ -1718,6 +1731,74 @@ err: return ret; } +static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + u32 index, port, qpn, cntn; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] || + !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX] || + !tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID] || + !tb[RDMA_NLDEV_ATTR_RES_LQPN]) + return -EINVAL; + + if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_STAT_SET), + 0, 0); + + cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]); + qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]); + ret = rdma_counter_unbind_qpn(device, port, qpn, cntn); + if (ret) + goto err_unbind; + + if (fill_nldev_handle(msg, device) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) { + ret = -EMSGSIZE; + goto err_fill; + } + + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + +err_fill: + rdma_counter_bind_qpn(device, port, qpn, cntn); +err_unbind: + nlmsg_free(msg); +err: + ib_device_put(device); + return ret; +} + static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -1828,6 +1909,10 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_stat_get_doit, .dump = nldev_stat_get_dumpit, }, + [RDMA_NLDEV_CMD_STAT_DEL] = { + .doit = nldev_stat_del_doit, + .flags = RDMA_NL_ADMIN_PERM, + }, }; void __init nldev_init(void) diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h index 6603e10eb352..68827700ba95 100644 --- a/include/rdma/rdma_counter.h +++ b/include/rdma/rdma_counter.h @@ -58,5 +58,8 @@ int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port, u32 qp_num, u32 *counter_id); int rdma_counter_unbind_qpn(struct ib_device *dev, u8 port, u32 qp_num, u32 counter_id); +int rdma_counter_get_mode(struct ib_device *dev, u8 port, + enum rdma_nl_counter_mode *mode, + enum rdma_nl_counter_mask *mask); #endif /* _RDMA_COUNTER_H_ */ diff --git a/include/uapi/rdma/rdma_netlink.h 
b/include/uapi/rdma/rdma_netlink.h index ec86fab3d040..ce6fd66e7aa3 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -285,6 +285,8 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_STAT_GET, /* can dump */ + RDMA_NLDEV_CMD_STAT_DEL, + RDMA_NLDEV_NUM_OPS }; From 83c2c1fcbd08ec3eb69a7f381c4e453d1a45ade3 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 2 Jul 2019 13:02:45 +0300 Subject: [PATCH 161/194] RDMA/nldev: Allow get counter mode through RDMA netlink Provide an option to get current counter mode through RDMA netlink. Signed-off-by: Mark Zhang Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/counters.c | 13 ++++++ drivers/infiniband/core/nldev.c | 66 +++++++++++++++++++++++++++++- 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c index 0ebe36e9fa7b..01faef7bc061 100644 --- a/drivers/infiniband/core/counters.c +++ b/drivers/infiniband/core/counters.c @@ -576,6 +576,19 @@ out: return ret; } +int rdma_counter_get_mode(struct ib_device *dev, u8 port, + enum rdma_nl_counter_mode *mode, + enum rdma_nl_counter_mask *mask) +{ + struct rdma_port_counter *port_counter; + + port_counter = &dev->port_data[port].port_counter; + *mode = port_counter->mode.mode; + *mask = port_counter->mode.mask; + + return 0; +} + void rdma_counter_init(struct ib_device *dev) { struct rdma_port_counter *port_counter; diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 3d750eca53d5..4993f47b0731 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -1799,6 +1799,70 @@ err: return ret; } +static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, struct nlattr *tb[]) + +{ + static enum rdma_nl_counter_mode mode; + static enum rdma_nl_counter_mask mask; + struct ib_device *device; + struct sk_buff *msg; + u32 index, port; + int ret; + + if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) + return nldev_res_get_counter_doit(skb, nlh, extack); + + if (!tb[RDMA_NLDEV_ATTR_STAT_MODE] || + !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_STAT_GET), + 0, 0); + + ret = rdma_counter_get_mode(device, port, &mode, &mask); + if (ret) + goto err_msg; + + if (fill_nldev_handle(msg, device) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode)) + goto err_msg; + + if ((mode == RDMA_COUNTER_MODE_AUTO) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask)) + goto err_msg; + + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + +err_msg: + nlmsg_free(msg); +err: + ib_device_put(device); + return ret; +} + static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -1812,7 +1876,7 @@ static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, switch 
(nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) { case RDMA_NLDEV_ATTR_RES_QP: - ret = nldev_res_get_counter_doit(skb, nlh, extack); + ret = stat_get_doit_qp(skb, nlh, extack, tb); break; default: From 6e7be47a53459ba3d288c3240ccd948fc699c377 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 2 Jul 2019 13:02:46 +0300 Subject: [PATCH 162/194] RDMA/nldev: Allow get default counter statistics through RDMA netlink This patch adds the ability to return the hwstats of per-port default counters (which can also be queried through sysfs nodes). Signed-off-by: Mark Zhang Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 98 ++++++++++++++++++++++++++++++++- drivers/infiniband/core/sysfs.c | 6 ++ include/rdma/ib_verbs.h | 1 + 3 files changed, 104 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 4993f47b0731..a4431ed566b6 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -1799,6 +1799,99 @@ err: return ret; } +static int stat_get_doit_default_counter(struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + struct nlattr *tb[]) +{ + struct rdma_hw_stats *stats; + struct nlattr *table_attr; + struct ib_device *device; + int ret, num_cnts, i; + struct sk_buff *msg; + u32 index, port; + u64 v; + + if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + if (!device->ops.alloc_hw_stats || !device->ops.get_hw_stats) { + ret = -EINVAL; + goto err; + } + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_STAT_GET), + 0, 0); + + if (fill_nldev_handle(msg, device) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) { + ret = -EMSGSIZE; + goto err_msg; + } + + stats = device->port_data ? 
device->port_data[port].hw_stats : NULL; + if (stats == NULL) { + ret = -EINVAL; + goto err_msg; + } + mutex_lock(&stats->lock); + + num_cnts = device->ops.get_hw_stats(device, stats, port, 0); + if (num_cnts < 0) { + ret = -EINVAL; + goto err_stats; + } + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS); + if (!table_attr) { + ret = -EMSGSIZE; + goto err_stats; + } + for (i = 0; i < num_cnts; i++) { + v = stats->value[i] + + rdma_counter_get_hwstat_value(device, port, i); + if (fill_stat_hwcounter_entry(msg, stats->names[i], v)) { + ret = -EMSGSIZE; + goto err_table; + } + } + nla_nest_end(msg, table_attr); + + mutex_unlock(&stats->lock); + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + +err_table: + nla_nest_cancel(msg, table_attr); +err_stats: + mutex_unlock(&stats->lock); +err_msg: + nlmsg_free(msg); +err: + ib_device_put(device); + return ret; +} + static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, struct nlattr *tb[]) @@ -1871,9 +1964,12 @@ static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); - if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES]) + if (ret) return -EINVAL; + if (!tb[RDMA_NLDEV_ATTR_STAT_RES]) + return stat_get_doit_default_counter(skb, nlh, extack, tb); + switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) { case RDMA_NLDEV_ATTR_RES_QP: ret = stat_get_doit_qp(skb, nlh, extack, tb); diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index c59b80e0a740..b477295a96c2 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1003,6 +1003,8 @@ static void setup_hw_stats(struct ib_device *device, struct ib_port *port, goto err; port->hw_stats_ag = hsag; port->hw_stats = stats; + if (device->port_data) + device->port_data[port_num].hw_stats = stats; } else { struct kobject *kobj = &device->dev.kobj; ret = sysfs_create_group(kobj, hsag); @@ -1293,6 +1295,8 @@ const struct attribute_group ib_dev_attr_group = { void ib_free_port_attrs(struct ib_core_device *coredev) { + struct ib_device *device = rdma_device_to_ibdev(&coredev->dev); + bool is_full_dev = &device->coredev == coredev; struct kobject *p, *t; list_for_each_entry_safe(p, t, &coredev->port_list, entry) { @@ -1302,6 +1306,8 @@ void ib_free_port_attrs(struct ib_core_device *coredev) if (port->hw_stats_ag) free_hsag(&port->kobj, port->hw_stats_ag); kfree(port->hw_stats); + if (device->port_data && is_full_dev) + device->port_data[port->port_num].hw_stats = NULL; if (port->pma_table) sysfs_remove_group(p, port->pma_table); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 0c5151a12ae4..50806bef9f20 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2124,6 +2124,7 @@ struct ib_port_data { struct net_device __rcu *netdev; struct hlist_node ndev_hash_link; struct rdma_port_counter port_counter; + struct rdma_hw_stats *hw_stats; }; /* rdma netdev type - specifies protocol type */ From 6fafe560ee4ddb79b1c9d6c168d4f2e7eef2a037 Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Mon, 24 Jun 2019 19:47:45 +0800 Subject: [PATCH 163/194] RDMA/hns: Bugfix for cleaning mtr It uses hns_roce_mtr_init in hns_roce_create_qp_common function. As a result, it should use hns_roce_mtr_cleanup function for cleaning mtr when destroying qp. 
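For illustration only (a sketch, not part of the patch): the mapping that hns_roce_create_qp_common() builds with hns_roce_mtr_init() has to be torn down with the matching call on destroy,

	hns_roce_mtr_cleanup(hr_dev, &hr_qp->mtr);	/* pairs with hns_roce_mtr_init() */

whereas the mtt variant the old code called frees a different mapping and leaves this one behind.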
Fixes: 8d18ad83f19b ("RDMA/hns: Fix bug when wqe num is larger than 16K") Signed-off-by: Xi Wang Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 20e6b5139ef4..4ae547d96cd4 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -4565,7 +4565,7 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, (hr_qp->ibqp.qp_type == IB_QPT_UD)) hns_roce_release_range_qp(hr_dev, hr_qp->qpn, 1); - hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt); + hns_roce_mtr_cleanup(hr_dev, &hr_qp->mtr); if (udata) { struct hns_roce_ucontext *context = From 21b97f538765996ac76b3a99adf371d0da5450f9 Mon Sep 17 00:00:00 2001 From: chenglang Date: Mon, 24 Jun 2019 19:47:46 +0800 Subject: [PATCH 164/194] RDMA/hns: Fixup qp release bug Hip06 reserve 12 qps, Hip08 reserve 8 qps. When the QP is released, the chip model is not judged, and the Hip08 cannot release the qpn 8~12 Signed-off-by: Lang Cheng Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 1 + drivers/infiniband/hw/hns/hns_roce_qp.c | 8 ++------ 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index 998431c39b8d..2189e528d32d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -1559,6 +1559,7 @@ static int hns_roce_v1_profile(struct hns_roce_dev *hr_dev) caps->reserved_mrws = 1; caps->reserved_uars = 0; caps->reserved_cqs = 0; + caps->reserved_qps = 12; /* 2 SQP per port, six ports total 12 */ caps->chunk_sz = HNS_ROCE_V1_TABLE_CHUNK_SIZE; for (i = 0; i < caps->num_ports; i++) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 4f693cded74e..e7bbd6d967cb 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -263,7 +263,7 @@ void hns_roce_release_range_qp(struct hns_roce_dev *hr_dev, int base_qpn, { struct hns_roce_qp_table *qp_table = &hr_dev->qp_table; - if (base_qpn < SQP_NUM) + if (base_qpn < hr_dev->caps.reserved_qps) return; hns_roce_bitmap_free_range(&qp_table->bitmap, base_qpn, cnt, BITMAP_RR); @@ -1226,11 +1226,7 @@ int hns_roce_init_qp_table(struct hns_roce_dev *hr_dev) mutex_init(&qp_table->scc_mutex); xa_init(&hr_dev->qp_table_xa); - /* In hw v1, a port include two SQP, six ports total 12 */ - if (hr_dev->caps.max_sq_sg <= 2) - reserved_from_bot = SQP_NUM; - else - reserved_from_bot = hr_dev->caps.reserved_qps; + reserved_from_bot = hr_dev->caps.reserved_qps; ret = hns_roce_bitmap_init(&qp_table->bitmap, hr_dev->caps.num_qps, hr_dev->caps.num_qps - 1, reserved_from_bot, From f5662b4ddc9db20033f310694edbd3fc7041f6f2 Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Mon, 24 Jun 2019 19:47:47 +0800 Subject: [PATCH 165/194] RDMA/hns: Modify ba page size for cqe Currently, the depth of cq only supports 64K. According to the UM, the depth of cq is up to 4M, Therefore the ba page size of cqe was modified to support the maximum specification of cq depth. 
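A rough sanity check of the new value (not part of the patch; assumes the 32-byte hip08 CQE, 4KB buffer pages from cqe_buf_pg_sz = 0, one level of base-address (BA) table, and 8-byte BA entries):

	cqe_ba_pg_sz = 0: BA page = 4KB        -> 4KB/8B   = 512 entries
	                  512 * (4KB / 32B)    = 64K CQEs addressable
	cqe_ba_pg_sz = 6: BA page = 4KB << 6   = 256KB -> 32768 entries
	                  32768 * (4KB / 32B)  = 4M CQEs addressable

which matches the 64K limit described above and the 4M maximum given in the UM.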
Signed-off-by: Yangyang Li Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 4ae547d96cd4..3559c0cf112d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1616,7 +1616,7 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) caps->wqe_sq_hop_num = 2; caps->wqe_sge_hop_num = 1; caps->wqe_rq_hop_num = 2; - caps->cqe_ba_pg_sz = 0; + caps->cqe_ba_pg_sz = 6; caps->cqe_buf_pg_sz = 0; caps->cqe_hop_num = HNS_ROCE_CQE_HOP_NUM; caps->srqwqe_ba_pg_sz = 0; From 726be12f5ca0a9b464e7d91add512071e4c224f6 Mon Sep 17 00:00:00 2001 From: Lang Cheng Date: Mon, 24 Jun 2019 19:47:48 +0800 Subject: [PATCH 166/194] RDMA/hns: Set reset flag when hw resetting When hw resetting, there is no response from hw when driver sending cmdq. If driver still send cmdq to hw, the reset process may be blocked. So reset flag should be set to intercept the cmdq command when driver receiving "notify down" signal. Signed-off-by: Lang Cheng Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 3559c0cf112d..7676ea07fdec 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -6384,6 +6384,7 @@ static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle) if (!hr_dev) return 0; + hr_dev->is_reset = true; hr_dev->active = false; hr_dev->dis_db = true; From 1dd7382b1bb608e7ccae3672621eaceca355ae8b Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 1 Jul 2019 21:14:01 +0300 Subject: [PATCH 167/194] net/mlx5: Introduce VHCA tunnel device capability When using the device emulation feature (introduced in Bluefield-1 SOC), a privileged function (the device emulation manager) will be able to create a channel to execute commands on behalf of the emulated function. This channel will be a general object of type VHCA_TUNNEL that will have a unique ID for each emulated function. This ID will be passed in each cmd that will be issued by the emulation SW in a well known offset in the command header. This channel is needed since the emulated function doesn't have a normal command interface to the HCA HW, but some basic configuration for that function is needed (e.g. initialize and enable the HCA). For that matter, a specific command-set was defined and only those commands will be issued by the HCA. 
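For illustration only (a sketch, not part of this patch; mdev, in and tunnel_id are placeholders), a consumer of the new capability bits and header field looks roughly like:

	/* refuse tunneled commands if the device does not advertise any */
	if (!MLX5_CAP_GEN_64(mdev, vhca_tunnel_commands))
		return -EOPNOTSUPP;

	/* the tunnel id is carried at the well-known command header offset */
	MLX5_SET(general_obj_in_cmd_hdr, in, vhca_tunnel_id, tunnel_id);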
Signed-off-by: Max Gurtovoy Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 06881b79167e..ba60bd17a92a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1390,7 +1390,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_6c8[0x28]; u8 sf_base_id[0x10]; - u8 reserved_at_700[0x100]; + u8 reserved_at_700[0x80]; + u8 vhca_tunnel_commands[0x40]; + u8 reserved_at_7c0[0x40]; }; enum mlx5_flow_destination_type { @@ -9694,7 +9696,7 @@ struct mlx5_ifc_general_obj_in_cmd_hdr_bits { u8 opcode[0x10]; u8 uid[0x10]; - u8 reserved_at_20[0x10]; + u8 vhca_tunnel_id[0x10]; u8 obj_type[0x10]; u8 obj_id[0x20]; From e0222d18d628ce96f0def790ea26b6b7b3c18f78 Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Mon, 24 Jun 2019 19:47:49 +0800 Subject: [PATCH 168/194] RDMA/hns: Bugfix for calculating qp buffer size The buffer size of qp which used to allocate qp buffer space for storing sqwqe and rqwqe will be the length of buffer space. The kernel driver will use the buffer address and the same size to get the user memory. The same size named buff_size of qp. According the algorithm of calculating, The size of the two is not equal when users set the max sge of sq. Fixes: b28ca7cceff8 ("RDMA/hns: Limit extend sq sge num") Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_qp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index e7bbd6d967cb..c10960267f00 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -387,8 +387,8 @@ static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev, hr_qp->sq.wqe_shift), PAGE_SIZE); } else { page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT); - hr_qp->sge.sge_cnt = - max(page_size / (1 << hr_qp->sge.sge_shift), ex_sge_num); + hr_qp->sge.sge_cnt = ex_sge_num ? + max(page_size / (1 << hr_qp->sge.sge_shift), ex_sge_num) : 0; hr_qp->buff_size = HNS_ROCE_ALOGN_UP((hr_qp->rq.wqe_cnt << hr_qp->rq.wqe_shift), page_size) + HNS_ROCE_ALOGN_UP((hr_qp->sge.sge_cnt << From fd7dd8bc8784bc96213781f1bfb454ae1c13e79f Mon Sep 17 00:00:00 2001 From: Lang Cheng Date: Mon, 24 Jun 2019 19:47:50 +0800 Subject: [PATCH 169/194] RDMA/hns: Use %pK format pointer print The format specifier \"%p\" can leak kernel addresses. Use \"%pK\" instead. 
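A minimal before/after sketch (not part of the patch):

	dev_dbg(dev, "aeqe = %p\n", aeqe);	/* old: can expose a raw kernel address */
	dev_dbg(dev, "aeqe = %pK\n", aeqe);	/* new: honours kptr_restrict, so the value
						 * is hidden from unprivileged readers */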
Signed-off-by: Lang Cheng Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index 2189e528d32d..1a2c7dad2a0d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -3903,7 +3903,8 @@ static int hns_roce_v1_aeq_int(struct hns_roce_dev *hr_dev, */ dma_rmb(); - dev_dbg(dev, "aeqe = %p, aeqe->asyn.event_type = 0x%lx\n", aeqe, + dev_dbg(dev, "aeqe = %pK, aeqe->asyn.event_type = 0x%lx\n", + aeqe, roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M, HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S)); From ec5bc2cc69b4fc494e04d10fc5226f6f9cf67c56 Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Mon, 24 Jun 2019 19:47:51 +0800 Subject: [PATCH 170/194] RDMA/hns: Fixs hw access invalid dma memory error When smmu is enable, if execute the perftest command and then use 'kill -9' to exit, follow this operation repeatedly, the kernel will have a high probability to print the following smmu event: arm-smmu-v3 arm-smmu-v3.1.auto: event 0x10 received: arm-smmu-v3 arm-smmu-v3.1.auto: 0x00007d0000000010 arm-smmu-v3 arm-smmu-v3.1.auto: 0x0000020900000080 arm-smmu-v3 arm-smmu-v3.1.auto: 0x00000000f47cf000 arm-smmu-v3 arm-smmu-v3.1.auto: 0x00000000f47cf000 This is because the hw will periodically refresh the qpc cache until the next reset. This patch fixed it by removing the action that release qpc memory in the 'hns_roce_qp_free' function. Fixes: 9a4435375cd1 ("IB/hns: Add driver files for hns RoCE driver") Signed-off-by: Xi Wang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_qp.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index c10960267f00..e0424029b058 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -254,7 +254,6 @@ void hns_roce_qp_free(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) hns_roce_table_put(hr_dev, &qp_table->trrl_table, hr_qp->qpn); hns_roce_table_put(hr_dev, &qp_table->irrl_table, hr_qp->qpn); - hns_roce_table_put(hr_dev, &qp_table->qp_table, hr_qp->qpn); } } From 617cf24f9f9efcd5e2f0258e7da10b8a2447662b Mon Sep 17 00:00:00 2001 From: Lang Cheng Date: Mon, 24 Jun 2019 19:47:52 +0800 Subject: [PATCH 171/194] RDMA/hns: Clean up unnecessary variable initialization Here Clean up unnecessary initial value for some variable. 
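The rationale, sketched with a hypothetical helper (cond and do_work() are placeholders, not part of the patch): a spurious "= 0" hides exactly the class of bug the compiler could otherwise catch.

	int ret = 0;		/* initializer masks the warning below */

	if (cond)
		ret = do_work();
	return ret;		/* the !cond path silently returns 0; with a bare
				 * 'int ret;' the compiler flags the missing assignment */

In the functions touched here the value is always assigned before it is read, so the initializer is simply dead code.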
Signed-off-by: Lang Cheng Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_cmd.c | 2 +- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4 ++-- drivers/infiniband/hw/hns/hns_roce_main.c | 2 +- drivers/infiniband/hw/hns/hns_roce_pd.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.c b/drivers/infiniband/hw/hns/hns_roce_cmd.c index b83d5bd92329..0cd09bf4d7ea 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cmd.c +++ b/drivers/infiniband/hw/hns/hns_roce_cmd.c @@ -161,7 +161,7 @@ static int hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, unsigned long in_modifier, u8 op_modifier, u16 op, unsigned long timeout) { - int ret = 0; + int ret; down(&hr_dev->cmd.event_sem); ret = __hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 7676ea07fdec..b76e3beeafb8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2018,7 +2018,7 @@ static int hns_roce_v2_chk_mbox(struct hns_roce_dev *hr_dev, unsigned long timeout) { struct device *dev = hr_dev->dev; - unsigned long end = 0; + unsigned long end; u32 status; end = msecs_to_jiffies(timeout) + jiffies; @@ -3016,7 +3016,7 @@ static int hns_roce_v2_clear_hem(struct hns_roce_dev *hr_dev, { struct device *dev = hr_dev->dev; struct hns_roce_cmd_mailbox *mailbox; - int ret = 0; + int ret; u16 op = 0xff; if (!hns_roce_check_whether_mhop(hr_dev, table->type)) diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 9f83acec6001..1e4ba48f5613 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -309,7 +309,7 @@ static int hns_roce_modify_port(struct ib_device *ib_dev, u8 port_num, int mask, static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) { - int ret = 0; + int ret; struct hns_roce_ucontext *context = to_hr_ucontext(uctx); struct hns_roce_ib_alloc_ucontext_resp resp = {}; struct hns_roce_dev *hr_dev = to_hr_dev(uctx->device); diff --git a/drivers/infiniband/hw/hns/hns_roce_pd.c b/drivers/infiniband/hw/hns/hns_roce_pd.c index 920ca76b5db1..912b89b4da34 100644 --- a/drivers/infiniband/hw/hns/hns_roce_pd.c +++ b/drivers/infiniband/hw/hns/hns_roce_pd.c @@ -92,7 +92,7 @@ void hns_roce_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata) int hns_roce_uar_alloc(struct hns_roce_dev *hr_dev, struct hns_roce_uar *uar) { struct resource *res; - int ret = 0; + int ret; /* Using bitmap to manager UAR index */ ret = hns_roce_bitmap_alloc(&hr_dev->uar_table.bitmap, &uar->logic_idx); From d3e5397169175628696e92191dfc0e86d8e48db9 Mon Sep 17 00:00:00 2001 From: Maksym Planeta Date: Tue, 2 Jul 2019 15:49:28 +0200 Subject: [PATCH 172/194] ibverbs/rxe: Remove variable self-initialization In some cases (not in this particular one) variable self-initialization can lead to undefined behavior. In this case, it is just obscure code. 
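For reference, the pattern being removed, sketched (not part of the patch):

	struct rxe_send_wqe *wqe = wqe;	/* self-init: the initializer reads the still
					 * indeterminate 'wqe'; it exists only to quiet
					 * -Wuninitialized and invites real trouble if a
					 * later path ever consumes that value */

Initializing to NULL keeps the warning suppressed while giving the variable a defined value.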
Signed-off-by: Maksym Planeta Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 00eb99d3df86..116cafc9afcf 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -558,7 +558,7 @@ int rxe_completer(void *arg) { struct rxe_qp *qp = (struct rxe_qp *)arg; struct rxe_dev *rxe = to_rdev(qp->ibqp.device); - struct rxe_send_wqe *wqe = wqe; + struct rxe_send_wqe *wqe = NULL; struct sk_buff *skb = NULL; struct rxe_pkt_info *pkt = NULL; enum comp_state state; From 4c7d6dcd364843e408a60952ba914bb72bafc6cc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 8 Jul 2019 11:36:32 -0300 Subject: [PATCH 173/194] RDMA/siw: Fix DEFINE_PER_CPU compilation when ARCH_NEEDS_WEAK_PER_CPU The initializer for the variable cannot be inside the macro (and zero initialization isn't needed anyhow). include/linux/percpu-defs.h:92:33: warning: '__pcpu_unique_use_cnt' initialized and declared 'extern' extern __PCPU_DUMMY_ATTRS char __pcpu_unique_##name; \ ^~~~~~~~~~~~~~ include/linux/percpu-defs.h:115:2: note: in expansion of macro 'DEFINE_PER_CPU_SECTION' DEFINE_PER_CPU_SECTION(type, name, "") ^~~~~~~~~~~~~~~~~~~~~~ drivers/infiniband/sw/siw/siw_main.c:129:8: note: in expansion of macro 'DEFINE_PER_CPU' static DEFINE_PER_CPU(atomic_t, use_cnt = ATOMIC_INIT(0)); ^~~~~~~~~~~~~~ Also the rules for PER_CPU require the variable names to be globally unique, so prefix them with siw_ Fixes: b9be6f18cf9e ("rdma/siw: transmit path") Fixes: bdcf26bf9b3a ("rdma/siw: network and RDMA core interface") Reported-by: Stephen Rothwell Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_main.c | 8 ++++---- drivers/infiniband/sw/siw/siw_qp_tx.c | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c index 3f5f3d27ebe5..fd2552a9091d 100644 --- a/drivers/infiniband/sw/siw/siw_main.c +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -126,7 +126,7 @@ static int siw_dev_qualified(struct net_device *netdev) return 0; } -static DEFINE_PER_CPU(atomic_t, use_cnt = ATOMIC_INIT(0)); +static DEFINE_PER_CPU(atomic_t, siw_use_cnt); static struct { struct cpumask **tx_valid_cpus; @@ -215,7 +215,7 @@ int siw_get_tx_cpu(struct siw_device *sdev) if (!siw_tx_thread[cpu]) continue; - usage = atomic_read(&per_cpu(use_cnt, cpu)); + usage = atomic_read(&per_cpu(siw_use_cnt, cpu)); if (usage <= min_use) { tx_cpu = cpu; min_use = usage; @@ -226,7 +226,7 @@ int siw_get_tx_cpu(struct siw_device *sdev) out: if (tx_cpu >= 0) - atomic_inc(&per_cpu(use_cnt, tx_cpu)); + atomic_inc(&per_cpu(siw_use_cnt, tx_cpu)); else pr_warn("siw: no tx cpu found\n"); @@ -235,7 +235,7 @@ out: void siw_put_tx_cpu(int cpu) { - atomic_dec(&per_cpu(use_cnt, cpu)); + atomic_dec(&per_cpu(siw_use_cnt, cpu)); } static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id) diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c index 5e926fac51db..1c9fa8fa96e5 100644 --- a/drivers/infiniband/sw/siw/siw_qp_tx.c +++ b/drivers/infiniband/sw/siw/siw_qp_tx.c @@ -1183,12 +1183,12 @@ struct tx_task_t { wait_queue_head_t waiting; }; -static DEFINE_PER_CPU(struct tx_task_t, tx_task_g); +static DEFINE_PER_CPU(struct tx_task_t, siw_tx_task_g); void siw_stop_tx_thread(int nr_cpu) { kthread_stop(siw_tx_thread[nr_cpu]); - wake_up(&per_cpu(tx_task_g, 
nr_cpu).waiting); + wake_up(&per_cpu(siw_tx_task_g, nr_cpu).waiting); } int siw_run_sq(void *data) @@ -1196,7 +1196,7 @@ int siw_run_sq(void *data) const int nr_cpu = (unsigned int)(long)data; struct llist_node *active; struct siw_qp *qp; - struct tx_task_t *tx_task = &per_cpu(tx_task_g, nr_cpu); + struct tx_task_t *tx_task = &per_cpu(siw_tx_task_g, nr_cpu); init_llist_head(&tx_task->active); init_waitqueue_head(&tx_task->waiting); @@ -1261,9 +1261,9 @@ int siw_sq_start(struct siw_qp *qp) } siw_qp_get(qp); - llist_add(&qp->tx_list, &per_cpu(tx_task_g, qp->tx_cpu).active); + llist_add(&qp->tx_list, &per_cpu(siw_tx_task_g, qp->tx_cpu).active); - wake_up(&per_cpu(tx_task_g, qp->tx_cpu).waiting); + wake_up(&per_cpu(siw_tx_task_g, qp->tx_cpu).waiting); return 0; } From f10ff380fd7dfba4a36d40f8dd00fe17da8a1a10 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 8 Jul 2019 12:17:48 -0300 Subject: [PATCH 174/194] RDMA/rvt: Do not use a kernel header in the ABI rvt was using ib_sge as part of it's ABI, which is not allowed. Introduce a new struct with the same layout and use it instead. Fixes: dabac6e460ce ("IB/hfi1: Move receive work queue struct into uapi directory") Reported-by: Stephen Rothwell Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/qp.c | 32 ++++++++++++++++++++++++++----- include/uapi/rdma/rvt-abi.h | 9 +++++++-- 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 11b4d3c1efd4..0b0a241c57ff 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1847,8 +1847,11 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, wqe = rvt_get_rwqe_ptr(&qp->r_rq, wq->head); wqe->wr_id = wr->wr_id; wqe->num_sge = wr->num_sge; - for (i = 0; i < wr->num_sge; i++) - wqe->sg_list[i] = wr->sg_list[i]; + for (i = 0; i < wr->num_sge; i++) { + wqe->sg_list[i].addr = wr->sg_list[i].addr; + wqe->sg_list[i].length = wr->sg_list[i].length; + wqe->sg_list[i].lkey = wr->sg_list[i].lkey; + } /* * Make sure queue entry is written * before the head index. @@ -2250,8 +2253,11 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, wqe = rvt_get_rwqe_ptr(&srq->rq, wq->head); wqe->wr_id = wr->wr_id; wqe->num_sge = wr->num_sge; - for (i = 0; i < wr->num_sge; i++) - wqe->sg_list[i] = wr->sg_list[i]; + for (i = 0; i < wr->num_sge; i++) { + wqe->sg_list[i].addr = wr->sg_list[i].addr; + wqe->sg_list[i].length = wr->sg_list[i].length; + wqe->sg_list[i].lkey = wr->sg_list[i].lkey; + } /* Make sure queue entry is written before the head index. */ smp_store_release(&wq->head, next); spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags); @@ -2259,6 +2265,22 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, return 0; } +/* + * rvt used the internal kernel struct as part of its ABI, for now make sure + * the kernel struct does not change layout. FIXME: rvt should never cast the + * user struct to a kernel struct. + */ +static struct ib_sge *rvt_cast_sge(struct rvt_wqe_sge *sge) +{ + BUILD_BUG_ON(offsetof(struct ib_sge, addr) != + offsetof(struct rvt_wqe_sge, addr)); + BUILD_BUG_ON(offsetof(struct ib_sge, length) != + offsetof(struct rvt_wqe_sge, length)); + BUILD_BUG_ON(offsetof(struct ib_sge, lkey) != + offsetof(struct rvt_wqe_sge, lkey)); + return (struct ib_sge *)sge; +} + /* * Validate a RWQE and fill in the SGE state. * Return 1 if OK. 
@@ -2282,7 +2304,7 @@ static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) continue; /* Check LKEY */ ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, - NULL, &wqe->sg_list[i], + NULL, rvt_cast_sge(&wqe->sg_list[i]), IB_ACCESS_LOCAL_WRITE); if (unlikely(ret <= 0)) goto bad_lkey; diff --git a/include/uapi/rdma/rvt-abi.h b/include/uapi/rdma/rvt-abi.h index d2e35d24f1a9..7328293c715c 100644 --- a/include/uapi/rdma/rvt-abi.h +++ b/include/uapi/rdma/rvt-abi.h @@ -10,11 +10,16 @@ #include #include -#include #ifndef RDMA_ATOMIC_UAPI #define RDMA_ATOMIC_UAPI(_type, _name) struct{ _type val; } _name #endif +struct rvt_wqe_sge { + __aligned_u64 addr; + __u32 length; + __u32 lkey; +}; + /* * This structure is used to contain the head pointer, tail pointer, * and completion queue entries as a single memory allocation so @@ -39,7 +44,7 @@ struct rvt_rwqe { __u64 wr_id; __u8 num_sge; __u8 padding[7]; - struct ib_sge sg_list[]; + struct rvt_wqe_sge sg_list[]; }; /* From b6142608e8069dda26398e65b0a14eda6ca4282d Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 1 Jul 2019 21:14:02 +0300 Subject: [PATCH 175/194] IB/mlx5: Implement VHCA tunnel mechanism in DEVX This mechanism will allow function-A to perform operations "on behalf" of function-B via tunnel object. Function-A will have privileges for creating and using this tunnel object. For example, in the device emulation feature presented in Bluefield-1 SoC, using device emulation capability, one can present NVMe function to the host OS. Since the NVMe function doesn't have a normal command interface to the HCA HW, here is a need to create a channel that will be able to issue commands "on behalf" of this function. This channel is the VHCA_TUNNEL general object. The emulation software will create this tunnel for every managed function and issue commands via devx general cmd interface using the appropriate tunnel ID. When devX context will receive a command with non-zero vhca_tunnel_id, it will pass the command as-is down to the HCA. All the validation, security and resource tracking of the commands and the created tunneled objects is in the responsibility of the HCA FW. When a VHCA_TUNNEL object destroyed, the device will issue an internal FLR (function level reset) to the emulated function associated with this tunnel. This will destroy all the created resources using the tunnel mechanism. 
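For illustration (a sketch, not part of this patch; the opcode and tunnel_id are placeholders), the emulation manager issues a command on behalf of an emulated function by stamping the tunnel id into the ordinary command header and submitting it through the DEVX general-command path:

	u32 in[MLX5_ST_SZ_DW(enable_hca_in)] = {};
	u32 out[MLX5_ST_SZ_DW(enable_hca_out)] = {};

	MLX5_SET(enable_hca_in, in, opcode, MLX5_CMD_OP_ENABLE_HCA);
	/* tunnel id lives at a fixed offset common to all command headers */
	MLX5_SET(general_obj_in_cmd_hdr, in, vhca_tunnel_id, tunnel_id);
	/* submit in/out via MLX5_IB_METHOD_DEVX_OTHER; devx_is_general_cmd()
	 * lets any tunneled command through and FW does the tracking */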
Signed-off-by: Max Gurtovoy Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 931f587dfb8f..2aa833417492 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -715,12 +715,16 @@ static int devx_get_uid(struct mlx5_ib_ucontext *c, void *cmd_in) return c->devx_uid; } -static bool devx_is_general_cmd(void *in) + +static bool devx_is_general_cmd(void *in, struct mlx5_ib_dev *dev) { u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); - if (opcode >= MLX5_CMD_OP_GENERAL_START && - opcode < MLX5_CMD_OP_GENERAL_END) + /* Pass all cmds for vhca_tunnel as general, tracking is done in FW */ + if ((MLX5_CAP_GEN_64(dev->mdev, vhca_tunnel_commands) && + MLX5_GET(general_obj_in_cmd_hdr, in, vhca_tunnel_id)) || + (opcode >= MLX5_CMD_OP_GENERAL_START && + opcode < MLX5_CMD_OP_GENERAL_END)) return true; switch (opcode) { @@ -846,7 +850,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OTHER)( return uid; /* Only white list of some general HCA commands are allowed for this method. */ - if (!devx_is_general_cmd(cmd_in)) + if (!devx_is_general_cmd(cmd_in, dev)) return -EINVAL; cmd_out = uverbs_zalloc(attrs, cmd_out_len); @@ -1169,6 +1173,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( u32 obj_id; u16 opcode; + if (MLX5_GET(general_obj_in_cmd_hdr, cmd_in, vhca_tunnel_id)) + return -EINVAL; + uid = devx_get_uid(c, cmd_in); if (uid < 0) return uid; @@ -1259,6 +1266,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_MODIFY)( int err; int uid; + if (MLX5_GET(general_obj_in_cmd_hdr, cmd_in, vhca_tunnel_id)) + return -EINVAL; + uid = devx_get_uid(c, cmd_in); if (uid < 0) return uid; @@ -1301,6 +1311,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_QUERY)( int uid; struct mlx5_ib_dev *mdev = to_mdev(c->ibucontext.device); + if (MLX5_GET(general_obj_in_cmd_hdr, cmd_in, vhca_tunnel_id)) + return -EINVAL; + uid = devx_get_uid(c, cmd_in); if (uid < 0) return uid; @@ -1406,6 +1419,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY)( struct devx_async_cmd_event_file *ev_file; struct devx_async_data *async_data; + if (MLX5_GET(general_obj_in_cmd_hdr, cmd_in, vhca_tunnel_id)) + return -EINVAL; + uid = devx_get_uid(c, cmd_in); if (uid < 0) return uid; From a3a400da206bd0cf426571633da51547d44f4f42 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 28 Jun 2019 09:30:28 -0300 Subject: [PATCH 176/194] docs: infiniband: add it to the driver-api bookset While this contains some uAPI stuff, it was intended to be read by a kernel doc. So, let's not move it to a different dir, but, instead, just add it to the driver-api bookset. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jason Gunthorpe --- Documentation/index.rst | 1 + Documentation/infiniband/index.rst | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/index.rst b/Documentation/index.rst index a7566ef62411..869616b57aa8 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -90,6 +90,7 @@ needed). 
driver-api/index core-api/index + infiniband/index media/index networking/index input/index diff --git a/Documentation/infiniband/index.rst b/Documentation/infiniband/index.rst index 22eea64de722..9cd7615438b9 100644 --- a/Documentation/infiniband/index.rst +++ b/Documentation/infiniband/index.rst @@ -1,4 +1,4 @@ -:orphan: +.. SPDX-License-Identifier: GPL-2.0 ========== InfiniBand From 89705e92700170888236555fe91b45e4c1bb0985 Mon Sep 17 00:00:00 2001 From: Danit Goldberg Date: Fri, 5 Jul 2019 19:21:57 +0300 Subject: [PATCH 177/194] IB/mlx5: Report correctly tag matching rendezvous capability Userspace expects the IB_TM_CAP_RC bit to indicate that the device supports RC transport tag matching with rendezvous offload. However the firmware splits this into two capabilities for eager and rendezvous tag matching. Only if the FW supports both modes should userspace be told the tag matching capability is available. Cc: # 4.13 Fixes: eb761894351d ("IB/mlx5: Fill XRQ capabilities") Signed-off-by: Danit Goldberg Reviewed-by: Yishai Hadas Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 8 ++++++-- include/rdma/ib_verbs.h | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 7581571bd9cd..56d4b1e9dd23 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1046,15 +1046,19 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } if (MLX5_CAP_GEN(mdev, tag_matching)) { - props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE; props->tm_caps.max_num_tags = (1 << MLX5_CAP_GEN(mdev, log_tag_matching_list_sz)) - 1; - props->tm_caps.flags = IB_TM_CAP_RC; props->tm_caps.max_ops = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); props->tm_caps.max_sge = MLX5_TM_MAX_SGE; } + if (MLX5_CAP_GEN(mdev, tag_matching) && + MLX5_CAP_GEN(mdev, rndv_offload_rc)) { + props->tm_caps.flags = IB_TM_CAP_RNDV_RC; + props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE; + } + if (MLX5_CAP_GEN(dev->mdev, cq_moderation)) { props->cq_caps.max_cq_moderation_count = MLX5_MAX_CQ_COUNT; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 50806bef9f20..4053be51b7fa 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -307,8 +307,8 @@ struct ib_rss_caps { }; enum ib_tm_cap_flags { - /* Support tag matching on RC transport */ - IB_TM_CAP_RC = 1 << 0, + /* Support tag matching with rendezvous offload for RC transport */ + IB_TM_CAP_RNDV_RC = 1 << 0, }; struct ib_tm_caps { From f4915455dcf07c4f237d6160a4b6adb0575d2909 Mon Sep 17 00:00:00 2001 From: Yamin Friedman Date: Mon, 8 Jul 2019 13:59:02 +0300 Subject: [PATCH 178/194] linux/dim: Implement RDMA adaptive moderation (DIM) RDMA DIM implements a different algorithm from net DIM and is based on completions which is how we can implement interrupt moderation in RDMA. The algorithm optimizes for number of completions and ratio between completions and events. In order to avoid long latencies, the implementation performs fast reduction of moderation level when the traffic changes. 
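Usage, sketched (the in-kernel consumer is added in a follow-up patch; poll_my_cq() is a placeholder): each poll round counts as one event, and the number of completions reaped in that round is handed to the library, which schedules dim->work once it decides to switch moderation profiles.

	completed = poll_my_cq(cq, budget);
	if (cq->dim)
		rdma_dim(cq->dim, completed);	/* one event, 'completed' completions */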
Signed-off-by: Yamin Friedman Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/dim.h | 36 +++++++++++++++ lib/dim/Makefile | 6 +-- lib/dim/rdma_dim.c | 108 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 146 insertions(+), 4 deletions(-) create mode 100644 lib/dim/rdma_dim.c diff --git a/include/linux/dim.h b/include/linux/dim.h index aa9bdd47a648..aa69730c3b8d 100644 --- a/include/linux/dim.h +++ b/include/linux/dim.h @@ -82,6 +82,7 @@ struct dim_stats { * @prev_stats: Measured rates from previous iteration (for comparison) * @start_sample: Sampled data at start of current iteration * @work: Work to perform on action required + * @priv: A pointer to the struct that points to dim * @profile_ix: Current moderation profile * @mode: CQ period count mode * @tune_state: Algorithm tuning state (see below) @@ -95,6 +96,7 @@ struct dim { struct dim_sample start_sample; struct dim_sample measuring_sample; struct work_struct work; + void *priv; u8 profile_ix; u8 mode; u8 tune_state; @@ -363,4 +365,38 @@ struct dim_cq_moder net_dim_get_def_tx_moderation(u8 cq_period_mode); */ void net_dim(struct dim *dim, struct dim_sample end_sample); +/* RDMA DIM */ + +/* + * RDMA DIM profile: + * profile size must be of RDMA_DIM_PARAMS_NUM_PROFILES. + */ +#define RDMA_DIM_PARAMS_NUM_PROFILES 9 +#define RDMA_DIM_START_PROFILE 0 + +static const struct dim_cq_moder +rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = { + {1, 0, 1, 0}, + {1, 0, 4, 0}, + {2, 0, 4, 0}, + {2, 0, 8, 0}, + {4, 0, 8, 0}, + {16, 0, 8, 0}, + {16, 0, 16, 0}, + {32, 0, 16, 0}, + {32, 0, 32, 0}, +}; + +/** + * rdma_dim - Runs the adaptive moderation. + * @dim: The moderation struct. + * @completions: The number of completions collected in this round. + * + * Each call to rdma_dim takes the latest amount of completions that + * have been collected and counts them as a new event. + * Once enough events have been collected the algorithm decides a new + * moderation level. + */ +void rdma_dim(struct dim *dim, u64 completions); + #endif /* DIM_H */ diff --git a/lib/dim/Makefile b/lib/dim/Makefile index 160afe288df0..1d6858a108cb 100644 --- a/lib/dim/Makefile +++ b/lib/dim/Makefile @@ -2,8 +2,6 @@ # DIM Dynamic Interrupt Moderation library # -obj-$(CONFIG_DIMLIB) = net_dim.o +obj-$(CONFIG_DIMLIB) += dim.o -net_dim-y = \ - dim.o \ - net_dim.o +dim-y := dim.o net_dim.o rdma_dim.o diff --git a/lib/dim/rdma_dim.c b/lib/dim/rdma_dim.c new file mode 100644 index 000000000000..f7e26c7b4749 --- /dev/null +++ b/lib/dim/rdma_dim.c @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved. + */ + +#include + +static int rdma_dim_step(struct dim *dim) +{ + if (dim->tune_state == DIM_GOING_RIGHT) { + if (dim->profile_ix == (RDMA_DIM_PARAMS_NUM_PROFILES - 1)) + return DIM_ON_EDGE; + dim->profile_ix++; + dim->steps_right++; + } + if (dim->tune_state == DIM_GOING_LEFT) { + if (dim->profile_ix == 0) + return DIM_ON_EDGE; + dim->profile_ix--; + dim->steps_left++; + } + + return DIM_STEPPED; +} + +static int rdma_dim_stats_compare(struct dim_stats *curr, + struct dim_stats *prev) +{ + /* first stat */ + if (!prev->cpms) + return DIM_STATS_SAME; + + if (IS_SIGNIFICANT_DIFF(curr->cpms, prev->cpms)) + return (curr->cpms > prev->cpms) ? 
DIM_STATS_BETTER : + DIM_STATS_WORSE; + + if (IS_SIGNIFICANT_DIFF(curr->cpe_ratio, prev->cpe_ratio)) + return (curr->cpe_ratio > prev->cpe_ratio) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + return DIM_STATS_SAME; +} + +static bool rdma_dim_decision(struct dim_stats *curr_stats, struct dim *dim) +{ + int prev_ix = dim->profile_ix; + u8 state = dim->tune_state; + int stats_res; + int step_res; + + if (state != DIM_PARKING_ON_TOP && state != DIM_PARKING_TIRED) { + stats_res = rdma_dim_stats_compare(curr_stats, + &dim->prev_stats); + + switch (stats_res) { + case DIM_STATS_SAME: + if (curr_stats->cpe_ratio <= 50 * prev_ix) + dim->profile_ix = 0; + break; + case DIM_STATS_WORSE: + dim_turn(dim); + /* fall through */ + case DIM_STATS_BETTER: + step_res = rdma_dim_step(dim); + if (step_res == DIM_ON_EDGE) + dim_turn(dim); + break; + } + } + + dim->prev_stats = *curr_stats; + + return dim->profile_ix != prev_ix; +} + +void rdma_dim(struct dim *dim, u64 completions) +{ + struct dim_sample *curr_sample = &dim->measuring_sample; + struct dim_stats curr_stats; + u32 nevents; + + dim_update_sample_with_comps(curr_sample->event_ctr + 1, 0, 0, + curr_sample->comp_ctr + completions, + &dim->measuring_sample); + + switch (dim->state) { + case DIM_MEASURE_IN_PROGRESS: + nevents = curr_sample->event_ctr - dim->start_sample.event_ctr; + if (nevents < DIM_NEVENTS) + break; + dim_calc_stats(&dim->start_sample, curr_sample, &curr_stats); + if (rdma_dim_decision(&curr_stats, dim)) { + dim->state = DIM_APPLY_NEW_PROFILE; + schedule_work(&dim->work); + break; + } + /* fall through */ + case DIM_START_MEASURE: + dim->state = DIM_MEASURE_IN_PROGRESS; + dim_update_sample_with_comps(curr_sample->event_ctr, 0, 0, + curr_sample->comp_ctr, + &dim->start_sample); + break; + case DIM_APPLY_NEW_PROFILE: + break; + } +} +EXPORT_SYMBOL(rdma_dim); From da6629793aa6944db6c8a908ca1a52d87f1489aa Mon Sep 17 00:00:00 2001 From: Yamin Friedman Date: Mon, 8 Jul 2019 13:59:03 +0300 Subject: [PATCH 179/194] RDMA/core: Provide RDMA DIM support for ULPs Added the interface in the infiniband driver that applies the rdma_dim adaptive moderation. There is now a special function for allocating an ib_cq that uses rdma_dim. Performance improvement (ConnectX-5 100GbE, x86) running FIO benchmark over NVMf between two equal end-hosts with 56 cores across a Mellanox switch using null_blk device: READS without DIM: blk size | BW | IOPS | 99th percentile latency | 99.99th latency 512B | 3.8GiB/s | 7.7M | 1401 usec | 2442 usec 4k | 7.0GiB/s | 1.8M | 4817 usec | 6587 usec 64k | 10.7GiB/s| 175k | 9896 usec | 10028 usec IO WRITES without DIM: blk size | BW | IOPS | 99th percentile latency | 99.99th latency 512B | 3.6GiB/s | 7.5M | 1434 usec | 2474 usec 4k | 6.3GiB/s | 1.6M | 938 usec | 1221 usec 64k | 10.7GiB/s| 175k | 8979 usec | 12780 usec IO READS with DIM: blk size | BW | IOPS | 99th percentile latency | 99.99th latency 512B | 4GiB/s | 8.2M | 816 usec | 889 usec 4k | 10.1GiB/s| 2.65M| 3359 usec | 5080 usec 64k | 10.7GiB/s| 175k | 9896 usec | 10028 usec IO WRITES with DIM: blk size | BW | IOPS | 99th percentile latency | 99.99th latency 512B | 3.9GiB/s | 8.1M | 799 usec | 922 usec 4k | 9.6GiB/s | 2.5M | 717 usec | 1004 usec 64k | 10.7GiB/s| 176k | 8586 usec | 12256 usec The rdma_dim algorithm was designed to measure the effectiveness of moderation on the flow in a general way and thus should be appropriate for all RDMA storage protocols. rdma_dim is configured to be the default option based on performance improvement seen after extensive tests. 
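The opt-in from the driver/ULP side is deliberately small (a sketch, not part of this patch): the provider sets one flag before registration, and any CQ subsequently allocated with a non-DIRECT poll context picks up adaptive moderation automatically.

	/* provider init path */
	ibdev->use_cq_dim = true;

	/* ULP allocates its CQ exactly as before; no API change is needed */
	cq = ib_alloc_cq(ibdev, priv, nr_cqe, comp_vector, IB_POLL_WORKQUEUE);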
Signed-off-by: Yamin Friedman Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cq.c | 45 ++++++++++++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 4 ++++ 2 files changed, 49 insertions(+) diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index 00d70f166209..ffd6e24109d5 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -18,6 +18,40 @@ #define IB_POLL_FLAGS \ (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) +static void ib_cq_rdma_dim_work(struct work_struct *w) +{ + struct dim *dim = container_of(w, struct dim, work); + struct ib_cq *cq = dim->priv; + + u16 usec = rdma_dim_prof[dim->profile_ix].usec; + u16 comps = rdma_dim_prof[dim->profile_ix].comps; + + dim->state = DIM_START_MEASURE; + + cq->device->ops.modify_cq(cq, comps, usec); +} + +static void rdma_dim_init(struct ib_cq *cq) +{ + struct dim *dim; + + if (!cq->device->ops.modify_cq || !cq->device->use_cq_dim || + cq->poll_ctx == IB_POLL_DIRECT) + return; + + dim = kzalloc(sizeof(struct dim), GFP_KERNEL); + if (!dim) + return; + + dim->state = DIM_START_MEASURE; + dim->tune_state = DIM_GOING_RIGHT; + dim->profile_ix = RDMA_DIM_START_PROFILE; + dim->priv = cq; + cq->dim = dim; + + INIT_WORK(&dim->work, ib_cq_rdma_dim_work); +} + static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs, int batch) { @@ -78,6 +112,7 @@ static void ib_cq_completion_direct(struct ib_cq *cq, void *private) static int ib_poll_handler(struct irq_poll *iop, int budget) { struct ib_cq *cq = container_of(iop, struct ib_cq, iop); + struct dim *dim = cq->dim; int completed; completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH); @@ -87,6 +122,9 @@ static int ib_poll_handler(struct irq_poll *iop, int budget) irq_poll_sched(&cq->iop); } + if (dim) + rdma_dim(dim, completed); + return completed; } @@ -105,6 +143,8 @@ static void ib_cq_poll_work(struct work_struct *work) if (completed >= IB_POLL_BUDGET_WORKQUEUE || ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) queue_work(cq->comp_wq, &cq->work); + else if (cq->dim) + rdma_dim(cq->dim, completed); } static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) @@ -161,6 +201,8 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private, rdma_restrack_kadd(&cq->res); + rdma_dim_init(cq); + switch (cq->poll_ctx) { case IB_POLL_DIRECT: cq->comp_handler = ib_cq_completion_direct; @@ -223,6 +265,9 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata) rdma_restrack_del(&cq->res); cq->device->ops.destroy_cq(cq, udata); + if (cq->dim) + cancel_work_sync(&cq->dim->work); + kfree(cq->dim); kfree(cq->wc); kfree(cq); } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 4053be51b7fa..c5f8a9f17063 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -1509,6 +1510,7 @@ struct ib_cq { struct work_struct work; }; struct workqueue_struct *comp_wq; + struct dim *dim; /* * Implementation details of the RDMA core, don't use in drivers: */ @@ -2576,6 +2578,8 @@ struct ib_device { u16 is_switch:1; /* Indicates kernel verbs support, should not be used in drivers */ u16 kverbs_provider:1; + /* CQ adaptive moderation (RDMA DIM) */ + u16 use_cq_dim:1; u8 node_type; u8 phys_port_cnt; struct ib_device_attr attrs; From f8fc8cd9c612c31f92b19b72f619fa043ec76e5e Mon Sep 17 00:00:00 2001 From: Yamin 
Friedman Date: Mon, 8 Jul 2019 13:59:04 +0300 Subject: [PATCH 180/194] RDMA/nldev: Added configuration of RDMA dynamic interrupt moderation to netlink Added parameter in ib_device for enabling dynamic interrupt moderation so that it can be configured in userspace using rdma tool. In order to set adaptive-moderation for an ib device the command is: rdma dev set [DEV] adaptive-moderation [on|off] Please set on/off. rdma dev show 0: mlx5_0: node_type ca fw 16.26.0055 node_guid 248a:0703:00a5:29d0 sys_image_guid 248a:0703:00a5:29d0 adaptive-moderation on rdma resource show cq dev mlx5_0 cqn 0 cqe 1023 users 4 poll-ctx UNBOUND_WORKQUEUE adaptive-moderation off comm [ib_core] Signed-off-by: Yamin Friedman Reviewed-by: Sagi Grimberg Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/Kconfig | 1 + drivers/infiniband/core/core_priv.h | 1 + drivers/infiniband/core/device.c | 9 +++++++++ drivers/infiniband/core/nldev.c | 14 ++++++++++++++ include/uapi/rdma/rdma_netlink.h | 5 +++++ 5 files changed, 30 insertions(+) diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index f277cb7aea29..85e103b147cc 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -7,6 +7,7 @@ menuconfig INFINIBAND depends on m || IPV6 != m depends on !ALPHA select IRQ_POLL + select DIMLIB ---help--- Core support for InfiniBand (IB). Make sure to also select any protocols you wish to use as well as drivers for your diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index a953c2fa2e78..888d89ce81df 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -60,6 +60,7 @@ extern bool ib_devices_shared_netns; int ib_device_register_sysfs(struct ib_device *device); void ib_device_unregister_sysfs(struct ib_device *device); int ib_device_rename(struct ib_device *ibdev, const char *name); +int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim); typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port, struct net_device *idev, void *cookie); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index bdf61499e6d5..7f4affe8a10d 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -448,6 +448,15 @@ int ib_device_rename(struct ib_device *ibdev, const char *name) return 0; } +int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim) +{ + if (use_dim > 1) + return -EINVAL; + ibdev->use_cq_dim = use_dim; + + return 0; +} + static int alloc_name(struct ib_device *ibdev, const char *name) { struct ib_device *device; diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index a4431ed566b6..d9f2a30e6467 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -52,6 +52,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_CHARDEV_TYPE] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE }, + [RDMA_NLDEV_ATTR_DEV_DIM] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, .len = IB_DEVICE_NAME_MAX }, @@ -252,6 +253,8 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type)) return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, device->use_cq_dim)) + return -EMSGSIZE; /* * Link type is determined 
on first port and mlx4 device @@ -552,6 +555,9 @@ static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin, nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx)) goto err; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, (cq->dim != NULL))) + goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id)) goto err; if (!rdma_is_kernel_res(res) && @@ -870,6 +876,14 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, goto put_done; } + if (tb[RDMA_NLDEV_ATTR_DEV_DIM]) { + u8 use_dim; + + use_dim = nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_DIM]); + err = ib_device_set_dim(device, use_dim); + goto done; + } + done: ib_device_put(device); put_done: diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index ce6fd66e7aa3..8e277783fa96 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -520,6 +520,11 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME, /* string */ RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE, /* u64 */ + /* + * CQ adaptive moderatio (DIM) + */ + RDMA_NLDEV_ATTR_DEV_DIM, /* u8 */ + /* * Always the end */ From 96e2fd733b9af304edcf30ef922dcaa4d3e1bfa6 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 8 Jul 2019 13:59:05 +0300 Subject: [PATCH 181/194] RDMA/mlx5: Set RDMA DIM to be enabled by default Enable RDMA DIM by default for better user experience. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 56d4b1e9dd23..c2a5780cb394 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -6428,6 +6428,8 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) mutex_init(&dev->lb.mutex); + dev->ib_dev.use_cq_dim = true; + return 0; } From bdce1290493caa3f8119f24b5dacc3fb7ca27389 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Thu, 27 Jun 2019 16:06:43 +0200 Subject: [PATCH 182/194] RDMA/rxe: Fill in wc byte_len with IB_WC_RECV_RDMA_WITH_IMM Calculate the correct byte_len on the receiving side when a work completion is generated with IB_WC_RECV_RDMA_WITH_IMM opcode. According to the IBA byte_len must indicate the number of written bytes, whereas it was always equal to zero for the IB_WC_RECV_RDMA_WITH_IMM opcode, even though data was transferred. Fixes: 8700e3e7c485 ("Soft RoCE driver") Signed-off-by: Konstantin Taranov Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_resp.c | 5 ++++- drivers/infiniband/sw/rxe/rxe_verbs.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index aca9f60f9b21..1cbfbd98eb22 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -431,6 +431,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp, qp->resp.va = reth_va(pkt); qp->resp.rkey = reth_rkey(pkt); qp->resp.resid = reth_len(pkt); + qp->resp.length = reth_len(pkt); } access = (pkt->mask & RXE_READ_MASK) ? IB_ACCESS_REMOTE_READ : IB_ACCESS_REMOTE_WRITE; @@ -856,7 +857,9 @@ static enum resp_states do_complete(struct rxe_qp *qp, pkt->mask & RXE_WRITE_MASK) ? IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV; wc->vendor_err = 0; - wc->byte_len = wqe->dma.length - wqe->dma.resid; + wc->byte_len = (pkt->mask & RXE_IMMDT_MASK && + pkt->mask & RXE_WRITE_MASK) ? 
+ qp->resp.length : wqe->dma.length - wqe->dma.resid; /* fields after byte_len are different between kernel and user * space diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 6c997d39a418..5c4b2239129c 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -213,6 +213,7 @@ struct rxe_resp_info { struct rxe_mem *mr; u32 resid; u32 rkey; + u32 length; u64 atomic_orig; /* SRQ only */ From 7a54f78d9387b75d8e64bac18e84cd65f639121f Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 4 Jul 2019 16:04:01 +0300 Subject: [PATCH 183/194] IB/core: Work on the caller socket net namespace in nldev_newlink() While creating new RDMA devices based on netdevice name, consider the net namespace of the caller skb's socket similar to rest of the doit() callbacks and nldev_dellink() which deletes the RDMA device created using nldev_newlink(). Fixes: 3856ec4b93c94 ("RDMA/core: Add RDMA_NLDEV_CMD_NEWLINK/DELLINK support") Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index d9f2a30e6467..783e465e7c41 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -1476,7 +1476,7 @@ static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME], sizeof(ndev_name)); - ndev = dev_get_by_name(&init_net, ndev_name); + ndev = dev_get_by_name(sock_net(skb->sk), ndev_name); if (!ndev) return -ENODEV; From 390d57728d8e6f7283030cb20d3b5459771a32f1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 9 Jul 2019 09:44:47 -0300 Subject: [PATCH 184/194] RDMA/core: Make rdma_counter.h compile stand alone 5.4-rc1 will have new compile time debugging to test that headers can be compiled stand alone. Many rdma headers are already broken and excluded from the mechanism, however to avoid compile failures during the merge window fix enough so that the newly added header compiles clean. 
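As an illustration of the technique used by the change above (a minimal editorial sketch; the file and function names below are invented, not part of the patch): a header becomes compilable stand alone by forward-declaring any struct type it only handles through pointers, instead of including the headers that define those types.

/* example_header.h - sketch only; all names here are illustrative */
#ifndef EXAMPLE_HEADER_H
#define EXAMPLE_HEADER_H

/*
 * Forward declarations are sufficient because the prototype below only
 * passes pointers around; nothing in this header needs the full
 * definition of struct ib_device or struct sk_buff, so the header
 * compiles on its own without pulling in additional includes.
 */
struct ib_device;
struct sk_buff;

int example_fill_dev_info(struct sk_buff *msg, struct ib_device *dev);

#endif /* EXAMPLE_HEADER_H */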
Fixes: 413d3347503b ("RDMA/counter: Add set/clear per-port auto mode support") Reported-by: Stephen Rothwell Signed-off-by: Jason Gunthorpe Signed-off-by: Mark Zhang --- include/rdma/rdma_counter.h | 2 +- include/rdma/restrack.h | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h index 68827700ba95..eb99856e8b30 100644 --- a/include/rdma/rdma_counter.h +++ b/include/rdma/rdma_counter.h @@ -9,10 +9,10 @@ #include #include -#include #include #include +struct ib_device; struct ib_qp; struct auto_mode_param { diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 4041a4d96524..b0fc6b26bdf5 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -14,6 +14,9 @@ #include #include +struct ib_device; +struct sk_buff; + /** * enum rdma_restrack_type - HW objects to track */ @@ -52,8 +55,6 @@ enum rdma_restrack_type { RDMA_RESTRACK_MAX }; -struct ib_device; - /** * struct rdma_restrack_entry - metadata per-entry */ From d8d9ec7dc5abbb3f11d866e983c4984f5c2de9d6 Mon Sep 17 00:00:00 2001 From: Dag Moxnes Date: Tue, 9 Jul 2019 13:50:26 +0200 Subject: [PATCH 185/194] RDMA/core: Fix race when resolving IP address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the neighbour lock when copying the MAC address from the neighbour data struct in dst_fetch_ha. When not using the lock, it is possible for the function to race with neigh_update(), causing it to copy a torn MAC address: rdma_resolve_addr() rdma_resolve_ip() addr_resolve() addr_resolve_neigh() fetch_ha() dst_fetch_ha() memcpy(dev_addr->dst_dev_addr, n->ha, MAX_ADDR_LEN) and net_ioctl() arp_ioctl() arp_rec_delete() arp_invalidate() neigh_update() __neigh_update() memcpy(&neigh->ha, lladdr, dev->addr_len) It is possible to provoke this error by calling rdma_resolve_addr() in a tight loop, while deleting the corresponding ARP entry in another tight loop.
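A rough sketch of that reproduction recipe (illustrative only: it uses the userspace librdmacm API, and the address handling, timeout and loop structure are assumptions rather than anything taken from the patch; the matching ARP deletion loop would run separately in a shell, for example by repeatedly issuing "ip neigh del <peer> dev <ifname>"):

/* resolve_loop.c - illustrative reproducer sketch, not from the patch */
#include <stdio.h>
#include <sys/socket.h>
#include <netdb.h>
#include <rdma/rdma_cma.h>

int main(int argc, char **argv)
{
	struct rdma_event_channel *ch;
	struct rdma_cm_id *id;
	struct addrinfo *res;

	if (argc != 2 || getaddrinfo(argv[1], NULL, NULL, &res)) {
		fprintf(stderr, "usage: %s <peer-ip>\n", argv[0]);
		return 1;
	}

	ch = rdma_create_event_channel();
	if (!ch)
		return 1;

	for (;;) {
		if (rdma_create_id(ch, &id, NULL, RDMA_PS_TCP))
			break;
		/*
		 * Each iteration exercises the kernel addr_resolve() path
		 * and, before the fix, the unlocked memcpy() of n->ha shown
		 * in the call chain above.
		 */
		rdma_resolve_addr(id, NULL, res->ai_addr, 2000);
		rdma_destroy_id(id);
	}

	rdma_destroy_event_channel(ch);
	return 0;
}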
Fixes: 51d45974515c ("infiniband: addr: Consolidate code to fetch neighbour hardware address from dst.") Signed-off-by: Dag Moxnes Signed-off-by: Håkon Bugge Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 2f7d14159841..9b76a8fcdd24 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -337,7 +337,7 @@ static int dst_fetch_ha(const struct dst_entry *dst, neigh_event_send(n, NULL); ret = -ENODATA; } else { - memcpy(dev_addr->dst_dev_addr, n->ha, MAX_ADDR_LEN); + neigh_ha_snapshot(dev_addr->dst_dev_addr, n, dst->dev); } neigh_release(n); From 4d2b8517ba1f3aba9a952ebf153ec972a127c80c Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 9 Jul 2019 16:05:53 -0700 Subject: [PATCH 186/194] IB/rdmavt: Fix variable shadowing issue in rvt_create_cq clang warns: drivers/infiniband/sw/rdmavt/cq.c:260:7: warning: variable 'err' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized] if (err) ^~~ drivers/infiniband/sw/rdmavt/cq.c:310:9: note: uninitialized use occurs here return err; ^~~ drivers/infiniband/sw/rdmavt/cq.c:260:3: note: remove the 'if' if its condition is always false if (err) ^~~~~~~~ drivers/infiniband/sw/rdmavt/cq.c:253:7: warning: variable 'err' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized] if (!cq->ip) { ^~~~~~~ drivers/infiniband/sw/rdmavt/cq.c:310:9: note: uninitialized use occurs here return err; ^~~ drivers/infiniband/sw/rdmavt/cq.c:253:3: note: remove the 'if' if its condition is always false if (!cq->ip) { ^~~~~~~~~~~~~~ drivers/infiniband/sw/rdmavt/cq.c:211:9: note: initialize the variable 'err' to silence this warning int err; ^ = 0 2 warnings generated. The function scoped err variable is uninitialized when the flow jumps into the if statement. The if scoped err variable shadows the function scoped err variable, preventing the err assignments within the if statement to be reflected at the function level, which will cause uninitialized use when the goto statements are taken. Just remove the if scoped err declaration so that there is only one copy of the err variable for this function. Fixes: 239b0e52d8aa ("IB/hfi1: Move rvt_cq_wc struct into uapi directory") Link: https://github.com/ClangBuiltLinux/linux/issues/594 Reviewed-by: Nick Desaulniers Signed-off-by: Nathan Chancellor Acked-by: Mike Marciniszyn Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/cq.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index fac87b13329d..a85571a4cf57 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -247,8 +247,6 @@ int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, * See rvt_mmap() for details. */ if (udata && udata->outlen >= sizeof(__u64)) { - int err; - cq->ip = rvt_create_mmap_info(rdi, sz, udata, u_wc); if (!cq->ip) { err = -ENOMEM; From 85de5d53366f02dd0f81c9be8f435b27fb82b1f7 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Wed, 10 Jul 2019 08:38:00 +0000 Subject: [PATCH 187/194] RDMA/siw: Remove unnecessary kthread create/destroy printouts There is already a warning if we cannot start any thread, and stopping those threads is not worth spamming the console. 
This also corrects a warning from gcc: drivers/infiniband/sw/siw/siw_main.c: In function 'siw_create_tx_threads': drivers/infiniband/sw/siw/siw_main.c:91:11: warning: variable 'rv' set but not used [-Wunused-but-set-variable] Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_main.c | 4 +--- drivers/infiniband/sw/siw/siw_qp_tx.c | 4 ---- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c index fd2552a9091d..f55c4e80aea4 100644 --- a/drivers/infiniband/sw/siw/siw_main.c +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -88,7 +88,7 @@ static void siw_device_cleanup(struct ib_device *base_dev) static int siw_create_tx_threads(void) { - int cpu, rv, assigned = 0; + int cpu, assigned = 0; for_each_online_cpu(cpu) { /* Skip HT cores */ @@ -99,9 +99,7 @@ static int siw_create_tx_threads(void) kthread_create(siw_run_sq, (unsigned long *)(long)cpu, "siw_tx/%d", cpu); if (IS_ERR(siw_tx_thread[cpu])) { - rv = PTR_ERR(siw_tx_thread[cpu]); siw_tx_thread[cpu] = NULL; - pr_info("Creating TX thread for CPU %d failed", cpu); continue; } kthread_bind(siw_tx_thread[cpu], cpu); diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c index 1c9fa8fa96e5..f0d949e2e318 100644 --- a/drivers/infiniband/sw/siw/siw_qp_tx.c +++ b/drivers/infiniband/sw/siw/siw_qp_tx.c @@ -1201,8 +1201,6 @@ int siw_run_sq(void *data) init_llist_head(&tx_task->active); init_waitqueue_head(&tx_task->waiting); - pr_info("Started siw TX thread on CPU %u\n", nr_cpu); - while (1) { struct llist_node *fifo_list = NULL; @@ -1240,8 +1238,6 @@ int siw_run_sq(void *data) siw_sq_resume(qp); } } - pr_info("Stopped siw TX thread on CPU %u\n", nr_cpu); - return 0; } From 775a41e281cf08b1d9e0f2dd89c062430b289a10 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 10 Jul 2019 10:48:00 -0700 Subject: [PATCH 188/194] rdma/siw: Use proper enumerated type in map_cqe_status clang warns several times: drivers/infiniband/sw/siw/siw_cq.c:31:4: warning: implicit conversion from enumeration type 'enum siw_wc_status' to different enumeration type 'enum siw_opcode' [-Wenum-conversion] { SIW_WC_SUCCESS, IB_WC_SUCCESS }, ~ ^~~~~~~~~~~~~~ Fixes: b0fff7317bb4 ("rdma/siw: completion queue methods") Link: https://github.com/ClangBuiltLinux/linux/issues/596 Signed-off-by: Nathan Chancellor Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_cq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/siw/siw_cq.c b/drivers/infiniband/sw/siw/siw_cq.c index e2a0ee40d5b5..e381ae9b7d62 100644 --- a/drivers/infiniband/sw/siw/siw_cq.c +++ b/drivers/infiniband/sw/siw/siw_cq.c @@ -25,7 +25,7 @@ static int map_wc_opcode[SIW_NUM_OPCODES] = { }; static struct { - enum siw_opcode siw; + enum siw_wc_status siw; enum ib_wc_status ib; } map_cqe_status[SIW_NUM_WC_STATUS] = { { SIW_WC_SUCCESS, IB_WC_SUCCESS }, From c421651fa2295d1219c36674c7eb8c574542ceea Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 11 Jul 2019 11:29:42 -0300 Subject: [PATCH 189/194] RDMA/siw: Add missing rtnl_lock around access to ifa ifa is protected by rcu or rtnl, add the missing locking. In this case we have to use rtnl since siw_listen_address() is sleeping. 
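To make the locking rule explicit (an editorial sketch with made-up names, not the driver's code): the IPv4 address list hanging off an in_device may only be walked under RCU or RTNL, and because the loop body here may sleep, RTNL is the variant that applies.

#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

/* Sketch: walk every IPv4 address of a netdev from sleepable context. */
static int example_walk_ipv4_addrs(struct net_device *ndev)
{
	struct in_device *in_dev = in_dev_get(ndev);

	if (!in_dev)
		return -ENODEV;

	rtnl_lock();	/* ifa_list is stable under RTNL; sleeping is allowed */
	for_ifa(in_dev) {
		/* a sleeping operation per address is safe at this point */
		pr_debug("address %pI4\n", &ifa->ifa_address);
	}
	endfor_ifa(in_dev);
	rtnl_unlock();

	in_dev_put(in_dev);
	return 0;
}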
Fixes: 6c52fdc244b5 ("rdma/siw: connection management") Reviewed-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_cm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index 8e618cb7261f..c25be723c15b 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -1975,6 +1975,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) id, &s_laddr.sin_addr, ntohs(s_laddr.sin_port), &s_raddr->sin_addr, ntohs(s_raddr->sin_port)); + rtnl_lock(); for_ifa(in_dev) { if (ipv4_is_zeronet(s_laddr.sin_addr.s_addr) || @@ -1989,6 +1990,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) } } endfor_ifa(in_dev); + rtnl_unlock(); in_dev_put(in_dev); } else if (id->local_addr.ss_family == AF_INET6) { struct inet6_dev *in6_dev = in6_dev_get(dev); From b45305d777f2f9209dae5a3b8249ca03166a4df3 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 10 Jul 2019 15:39:30 +0200 Subject: [PATCH 190/194] rdma/siw: Add missing dependencies on LIBCRC32C and DMA_VIRT_OPS If LIBCRC32C and DMA_VIRT_OPS are not enabled: drivers/infiniband/sw/siw/siw_main.o: In function `siw_newlink': siw_main.c:(.text+0x35c): undefined reference to `dma_virt_ops' drivers/infiniband/sw/siw/siw_qp_rx.o: In function `siw_csum_update': siw_qp_rx.c:(.text+0x16): undefined reference to `crc32c' Fix the first issue by adding a select of DMA_VIRT_OPS. Fix the second issue by replacing the unneeded dependency on CRYPTO_CRC32 by a dependency on LIBCRC32C. Reported-by: noreply@ellerman.id.au (first issue) Fixes: c0cf5bdde46c ("rdma/siw: addition to kernel build environment") Signed-off-by: Geert Uytterhoeven Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/siw/Kconfig b/drivers/infiniband/sw/siw/Kconfig index 94f684174ce3..b622fc62f2cd 100644 --- a/drivers/infiniband/sw/siw/Kconfig +++ b/drivers/infiniband/sw/siw/Kconfig @@ -1,6 +1,7 @@ config RDMA_SIW tristate "Software RDMA over TCP/IP (iWARP) driver" - depends on INET && INFINIBAND && CRYPTO_CRC32 + depends on INET && INFINIBAND && LIBCRC32C + select DMA_VIRT_OPS help This driver implements the iWARP RDMA transport over the Linux TCP/IP network stack. 
It enables a system with a From 855085d9686e107b77a1bdb935224a238a9fd8b9 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 11 Jul 2019 15:12:13 +0800 Subject: [PATCH 191/194] rdma/siw: Remove set but not used variable 's' Fixes gcc '-Wunused-but-set-variable' warning: drivers/infiniband/sw/siw/siw_cm.c: In function siw_cm_llp_state_change: drivers/infiniband/sw/siw/siw_cm.c:1278:17: warning: variable s set but not used [-Wunused-but-set-variable] Fixes: 6c52fdc244b5 ("rdma/siw: connection management") Reported-by: Hulk Robot Signed-off-by: YueHaibing Reviewed-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_cm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index c25be723c15b..43f7f12e5f7f 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -1275,7 +1275,6 @@ static void siw_cm_llp_error_report(struct sock *sk) static void siw_cm_llp_state_change(struct sock *sk) { struct siw_cep *cep; - struct socket *s; void (*orig_state_change)(struct sock *s); read_lock(&sk->sk_callback_lock); @@ -1288,8 +1287,6 @@ static void siw_cm_llp_state_change(struct sock *sk) } orig_state_change = cep->sk_state_change; - s = sk->sk_socket; - siw_dbg_cep(cep, "state: %d\n", cep->state); switch (sk->sk_state) { From bedc0fd0f9b517698193d644f914b33951856fd2 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 11 Jul 2019 09:55:56 -0400 Subject: [PATCH 192/194] RDMA/core: Fix -Wunused-const-variable warnings The commit below introduced a few compilation warnings. In file included from ./include/rdma/ib_verbs.h:64, from ./include/linux/mlx5/device.h:37, from ./include/linux/mlx5/driver.h:51, from drivers/net/ethernet/mellanox/mlx5/core/uar.c:36: ./include/linux/dim.h:378:1: warning: 'rdma_dim_prof' defined but not used [-Wunused-const-variable=] rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = { ^~~~~~~~~~~~~ In file included from ./include/rdma/ib_verbs.h:64, from ./include/linux/mlx5/device.h:37, from ./include/linux/mlx5/driver.h:51, from drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c:37: ./include/linux/dim.h:378:1: warning: 'rdma_dim_prof' defined but not used [-Wunused-const-variable=] rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = { ^~~~~~~~~~~~~ Since only ib_cq_rdma_dim_work() in drivers/infiniband/core/cq.c uses it, just move the definition over there. 
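A compact illustration of why the warning fires (invented names, not kernel code): a static const object defined in a shared header gives every translation unit that includes the header its own private copy, and every unit that never reads it emits -Wunused-const-variable. Keeping the table in the one .c file that consumes it, as the patch below does, sidesteps that.

/* lookup.c - illustrative only; the table lives beside its single user */
static const int example_table[3] = { 1, 2, 3 };

int example_lookup(int i)
{
	return example_table[i];
}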
Fixes: f4915455dcf0 ("linux/dim: Implement RDMA adaptive moderation (DIM)") Signed-off-by: Qian Cai Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cq.c | 13 +++++++++++++ include/linux/dim.h | 13 ------------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index ffd6e24109d5..7c599878ccf7 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -18,6 +18,19 @@ #define IB_POLL_FLAGS \ (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) +static const struct dim_cq_moder +rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = { + {1, 0, 1, 0}, + {1, 0, 4, 0}, + {2, 0, 4, 0}, + {2, 0, 8, 0}, + {4, 0, 8, 0}, + {16, 0, 8, 0}, + {16, 0, 16, 0}, + {32, 0, 16, 0}, + {32, 0, 32, 0}, +}; + static void ib_cq_rdma_dim_work(struct work_struct *w) { struct dim *dim = container_of(w, struct dim, work); diff --git a/include/linux/dim.h b/include/linux/dim.h index aa69730c3b8d..d3a0fbfff2bb 100644 --- a/include/linux/dim.h +++ b/include/linux/dim.h @@ -374,19 +374,6 @@ void net_dim(struct dim *dim, struct dim_sample end_sample); #define RDMA_DIM_PARAMS_NUM_PROFILES 9 #define RDMA_DIM_START_PROFILE 0 -static const struct dim_cq_moder -rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = { - {1, 0, 1, 0}, - {1, 0, 4, 0}, - {2, 0, 4, 0}, - {2, 0, 8, 0}, - {4, 0, 8, 0}, - {16, 0, 8, 0}, - {16, 0, 16, 0}, - {32, 0, 16, 0}, - {32, 0, 32, 0}, -}; - /** * rdma_dim - Runs the adaptive moderation. * @dim: The moderation struct. From cea743f2ea814d3d54dfab667b68271d4f4e5fdf Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 11 Jul 2019 11:12:18 -0500 Subject: [PATCH 193/194] RDMA/siw: Mark expected switch fall-throughs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. This patch fixes the following warnings: drivers/infiniband/sw/siw/siw_qp_rx.c: In function ‘siw_rdmap_complete’: drivers/infiniband/sw/siw/siw_qp_rx.c:1214:18: warning: this statement may fall through [-Wimplicit-fallthrough=] wqe->rqe.flags |= SIW_WQE_SOLICITED; ~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~ drivers/infiniband/sw/siw/siw_qp_rx.c:1215:2: note: here case RDMAP_SEND: ^~~~ drivers/infiniband/sw/siw/siw_qp_tx.c: In function ‘siw_qp_sq_process’: drivers/infiniband/sw/siw/siw_qp_tx.c:1044:4: warning: this statement may fall through [-Wimplicit-fallthrough=] siw_wqe_put_mem(wqe, tx_type); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/infiniband/sw/siw/siw_qp_tx.c:1045:3: note: here case SIW_OP_INVAL_STAG: ^~~~ drivers/infiniband/sw/siw/siw_qp_tx.c:1128:4: warning: this statement may fall through [-Wimplicit-fallthrough=] siw_wqe_put_mem(wqe, tx_type); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/infiniband/sw/siw/siw_qp_tx.c:1129:3: note: here case SIW_OP_INVAL_STAG: ^~~~ Warning level 3 was used: -Wimplicit-fallthrough=3 This patch is part of the ongoing efforts to enable -Wimplicit-fallthrough. Signed-off-by: Gustavo A. R. 
Silva Reviewed-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_qp_rx.c | 2 ++ drivers/infiniband/sw/siw/siw_qp_tx.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c index 682a290bc11e..f87657a11657 100644 --- a/drivers/infiniband/sw/siw/siw_qp_rx.c +++ b/drivers/infiniband/sw/siw/siw_qp_rx.c @@ -1212,6 +1212,8 @@ static int siw_rdmap_complete(struct siw_qp *qp, int error) case RDMAP_SEND_SE: case RDMAP_SEND_SE_INVAL: wqe->rqe.flags |= SIW_WQE_SOLICITED; + /* Fall through */ + case RDMAP_SEND: case RDMAP_SEND_INVAL: if (wqe->wr_status == SIW_WR_IDLE) diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c index f0d949e2e318..43020d2040fc 100644 --- a/drivers/infiniband/sw/siw/siw_qp_tx.c +++ b/drivers/infiniband/sw/siw/siw_qp_tx.c @@ -1042,6 +1042,8 @@ next_wqe: case SIW_OP_SEND_REMOTE_INV: case SIW_OP_WRITE: siw_wqe_put_mem(wqe, tx_type); + /* Fall through */ + case SIW_OP_INVAL_STAG: case SIW_OP_REG_MR: if (tx_flags(wqe) & SIW_WQE_SIGNALLED) @@ -1126,6 +1128,8 @@ next_wqe: case SIW_OP_READ: case SIW_OP_READ_LOCAL_INV: siw_wqe_put_mem(wqe, tx_type); + /* Fall through */ + case SIW_OP_INVAL_STAG: case SIW_OP_REG_MR: siw_sqe_complete(qp, &wqe->sqe, wqe->bytes, From 0b043644c0ca601cb19943a81aa1f1455dbe9461 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 12 Jul 2019 12:12:06 -0300 Subject: [PATCH 194/194] RMDA/siw: Require a 64 bit arch The new siw driver fails to build on i386 with drivers/infiniband/sw/siw/siw_qp.c:1025:3: error: invalid output size for constraint '+q' smp_store_mb(*cq->notify, SIW_NOTIFY_NOT); As it is using 64 bit values with the smp_store_mb. Since the entire scheme here seems questionable, and we are in the merge window, fix the compile failures by disabling 32 bit support on this driver. A proper fix will be reviewed post merge window. Fixes: c0cf5bdde46c ("rdma/siw: addition to kernel build environment") Reported-by: Arnd Bergmann Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/siw/Kconfig b/drivers/infiniband/sw/siw/Kconfig index b622fc62f2cd..dace276aea14 100644 --- a/drivers/infiniband/sw/siw/Kconfig +++ b/drivers/infiniband/sw/siw/Kconfig @@ -1,6 +1,6 @@ config RDMA_SIW tristate "Software RDMA over TCP/IP (iWARP) driver" - depends on INET && INFINIBAND && LIBCRC32C + depends on INET && INFINIBAND && LIBCRC32C && 64BIT select DMA_VIRT_OPS help This driver implements the iWARP RDMA transport over