{net,IB}/mlx5: Move Page fault EQ and ODP logic to RDMA

Use the new generic EQ API to move all ODP RDMA data structures and logic
form mlx5 core driver into mlx5_ib driver.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Acked-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
This commit is contained in:
Saeed Mahameed 2018-11-19 10:52:41 -08:00 committed by Leon Romanovsky
parent 7701707cb9
commit d5d284b829
10 changed files with 308 additions and 381 deletions

View File

@ -6040,6 +6040,11 @@ static int mlx5_ib_stage_odp_init(struct mlx5_ib_dev *dev)
return mlx5_ib_odp_init_one(dev); return mlx5_ib_odp_init_one(dev);
} }
void mlx5_ib_stage_odp_cleanup(struct mlx5_ib_dev *dev)
{
mlx5_ib_odp_cleanup_one(dev);
}
int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev) int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev)
{ {
if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) { if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
@ -6225,7 +6230,7 @@ static const struct mlx5_ib_profile pf_profile = {
mlx5_ib_stage_dev_res_cleanup), mlx5_ib_stage_dev_res_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_ODP, STAGE_CREATE(MLX5_IB_STAGE_ODP,
mlx5_ib_stage_odp_init, mlx5_ib_stage_odp_init,
NULL), mlx5_ib_stage_odp_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_COUNTERS, STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
mlx5_ib_stage_counters_init, mlx5_ib_stage_counters_init,
mlx5_ib_stage_counters_cleanup), mlx5_ib_stage_counters_cleanup),
@ -6395,9 +6400,6 @@ static struct mlx5_interface mlx5_ib_interface = {
.add = mlx5_ib_add, .add = mlx5_ib_add,
.remove = mlx5_ib_remove, .remove = mlx5_ib_remove,
.event = mlx5_ib_event, .event = mlx5_ib_event,
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
.pfault = mlx5_ib_pfault,
#endif
.protocol = MLX5_INTERFACE_PROTOCOL_IB, .protocol = MLX5_INTERFACE_PROTOCOL_IB,
}; };

View File

@ -880,6 +880,15 @@ struct mlx5_ib_lb_state {
bool enabled; bool enabled;
}; };
struct mlx5_ib_pf_eq {
struct mlx5_ib_dev *dev;
struct mlx5_eq *core;
struct work_struct work;
spinlock_t lock; /* Pagefaults spinlock */
struct workqueue_struct *wq;
mempool_t *pool;
};
struct mlx5_ib_dev { struct mlx5_ib_dev {
struct ib_device ib_dev; struct ib_device ib_dev;
const struct uverbs_object_tree_def *driver_trees[7]; const struct uverbs_object_tree_def *driver_trees[7];
@ -902,6 +911,8 @@ struct mlx5_ib_dev {
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
struct ib_odp_caps odp_caps; struct ib_odp_caps odp_caps;
u64 odp_max_size; u64 odp_max_size;
struct mlx5_ib_pf_eq odp_pf_eq;
/* /*
* Sleepable RCU that prevents destruction of MRs while they are still * Sleepable RCU that prevents destruction of MRs while they are still
* being used by a page fault handler. * being used by a page fault handler.
@ -1158,9 +1169,8 @@ struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev); void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev);
void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
struct mlx5_pagefault *pfault);
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
int __init mlx5_ib_odp_init(void); int __init mlx5_ib_odp_init(void);
void mlx5_ib_odp_cleanup(void); void mlx5_ib_odp_cleanup(void);
void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
@ -1175,6 +1185,7 @@ static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
} }
static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; } static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; }
static inline void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev) {}
static inline int mlx5_ib_odp_init(void) { return 0; } static inline int mlx5_ib_odp_init(void) { return 0; }
static inline void mlx5_ib_odp_cleanup(void) {} static inline void mlx5_ib_odp_cleanup(void) {}
static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {} static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {}

View File

@ -37,6 +37,46 @@
#include "mlx5_ib.h" #include "mlx5_ib.h"
#include "cmd.h" #include "cmd.h"
#include <linux/mlx5/eq.h>
/* Contains the details of a pagefault. */
struct mlx5_pagefault {
u32 bytes_committed;
u32 token;
u8 event_subtype;
u8 type;
union {
/* Initiator or send message responder pagefault details. */
struct {
/* Received packet size, only valid for responders. */
u32 packet_size;
/*
* Number of resource holding WQE, depends on type.
*/
u32 wq_num;
/*
* WQE index. Refers to either the send queue or
* receive queue, according to event_subtype.
*/
u16 wqe_index;
} wqe;
/* RDMA responder pagefault details */
struct {
u32 r_key;
/*
* Received packet size, minimal size page fault
* resolution required for forward progress.
*/
u32 packet_size;
u32 rdma_op_len;
u64 rdma_va;
} rdma;
};
struct mlx5_ib_pf_eq *eq;
struct work_struct work;
};
#define MAX_PREFETCH_LEN (4*1024*1024U) #define MAX_PREFETCH_LEN (4*1024*1024U)
/* Timeout in ms to wait for an active mmu notifier to complete when handling /* Timeout in ms to wait for an active mmu notifier to complete when handling
@ -304,14 +344,20 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
{ {
int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ? int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
pfault->wqe.wq_num : pfault->token; pfault->wqe.wq_num : pfault->token;
int ret = mlx5_core_page_fault_resume(dev->mdev, u32 out[MLX5_ST_SZ_DW(page_fault_resume_out)] = { };
pfault->token, u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = { };
wq_num, int err;
pfault->type,
error); MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME);
if (ret) MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type);
mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n", MLX5_SET(page_fault_resume_in, in, token, pfault->token);
wq_num); MLX5_SET(page_fault_resume_in, in, wq_number, wq_num);
MLX5_SET(page_fault_resume_in, in, error, !!error);
err = mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
if (err)
mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n",
wq_num, err);
} }
static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
@ -1196,10 +1242,8 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
} }
} }
void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context, static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault)
struct mlx5_pagefault *pfault)
{ {
struct mlx5_ib_dev *dev = context;
u8 event_subtype = pfault->event_subtype; u8 event_subtype = pfault->event_subtype;
switch (event_subtype) { switch (event_subtype) {
@ -1216,6 +1260,203 @@ void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
} }
} }
static void mlx5_ib_eqe_pf_action(struct work_struct *work)
{
struct mlx5_pagefault *pfault = container_of(work,
struct mlx5_pagefault,
work);
struct mlx5_ib_pf_eq *eq = pfault->eq;
mlx5_ib_pfault(eq->dev, pfault);
mempool_free(pfault, eq->pool);
}
static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
{
struct mlx5_eqe_page_fault *pf_eqe;
struct mlx5_pagefault *pfault;
struct mlx5_eqe *eqe;
int cc = 0;
while ((eqe = mlx5_eq_get_eqe(eq->core, cc))) {
pfault = mempool_alloc(eq->pool, GFP_ATOMIC);
if (!pfault) {
schedule_work(&eq->work);
break;
}
pf_eqe = &eqe->data.page_fault;
pfault->event_subtype = eqe->sub_type;
pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed);
mlx5_ib_dbg(eq->dev,
"PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n",
eqe->sub_type, pfault->bytes_committed);
switch (eqe->sub_type) {
case MLX5_PFAULT_SUBTYPE_RDMA:
/* RDMA based event */
pfault->type =
be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24;
pfault->token =
be32_to_cpu(pf_eqe->rdma.pftype_token) &
MLX5_24BIT_MASK;
pfault->rdma.r_key =
be32_to_cpu(pf_eqe->rdma.r_key);
pfault->rdma.packet_size =
be16_to_cpu(pf_eqe->rdma.packet_length);
pfault->rdma.rdma_op_len =
be32_to_cpu(pf_eqe->rdma.rdma_op_len);
pfault->rdma.rdma_va =
be64_to_cpu(pf_eqe->rdma.rdma_va);
mlx5_ib_dbg(eq->dev,
"PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n",
pfault->type, pfault->token,
pfault->rdma.r_key);
mlx5_ib_dbg(eq->dev,
"PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n",
pfault->rdma.rdma_op_len,
pfault->rdma.rdma_va);
break;
case MLX5_PFAULT_SUBTYPE_WQE:
/* WQE based event */
pfault->type =
(be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7;
pfault->token =
be32_to_cpu(pf_eqe->wqe.token);
pfault->wqe.wq_num =
be32_to_cpu(pf_eqe->wqe.pftype_wq) &
MLX5_24BIT_MASK;
pfault->wqe.wqe_index =
be16_to_cpu(pf_eqe->wqe.wqe_index);
pfault->wqe.packet_size =
be16_to_cpu(pf_eqe->wqe.packet_length);
mlx5_ib_dbg(eq->dev,
"PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n",
pfault->type, pfault->token,
pfault->wqe.wq_num,
pfault->wqe.wqe_index);
break;
default:
mlx5_ib_warn(eq->dev,
"Unsupported page fault event sub-type: 0x%02hhx\n",
eqe->sub_type);
/* Unsupported page faults should still be
* resolved by the page fault handler
*/
}
pfault->eq = eq;
INIT_WORK(&pfault->work, mlx5_ib_eqe_pf_action);
queue_work(eq->wq, &pfault->work);
cc = mlx5_eq_update_cc(eq->core, ++cc);
}
mlx5_eq_update_ci(eq->core, cc, 1);
}
static irqreturn_t mlx5_ib_eq_pf_int(int irq, void *eq_ptr)
{
struct mlx5_ib_pf_eq *eq = eq_ptr;
unsigned long flags;
if (spin_trylock_irqsave(&eq->lock, flags)) {
mlx5_ib_eq_pf_process(eq);
spin_unlock_irqrestore(&eq->lock, flags);
} else {
schedule_work(&eq->work);
}
return IRQ_HANDLED;
}
/* mempool_refill() was proposed but unfortunately wasn't accepted
* http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html
* Cheap workaround.
*/
static void mempool_refill(mempool_t *pool)
{
while (pool->curr_nr < pool->min_nr)
mempool_free(mempool_alloc(pool, GFP_KERNEL), pool);
}
static void mlx5_ib_eq_pf_action(struct work_struct *work)
{
struct mlx5_ib_pf_eq *eq =
container_of(work, struct mlx5_ib_pf_eq, work);
mempool_refill(eq->pool);
spin_lock_irq(&eq->lock);
mlx5_ib_eq_pf_process(eq);
spin_unlock_irq(&eq->lock);
}
enum {
MLX5_IB_NUM_PF_EQE = 0x1000,
MLX5_IB_NUM_PF_DRAIN = 64,
};
static int
mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
{
struct mlx5_eq_param param = {};
int err;
INIT_WORK(&eq->work, mlx5_ib_eq_pf_action);
spin_lock_init(&eq->lock);
eq->dev = dev;
eq->pool = mempool_create_kmalloc_pool(MLX5_IB_NUM_PF_DRAIN,
sizeof(struct mlx5_pagefault));
if (!eq->pool)
return -ENOMEM;
eq->wq = alloc_workqueue("mlx5_ib_page_fault",
WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM,
MLX5_NUM_CMD_EQE);
if (!eq->wq) {
err = -ENOMEM;
goto err_mempool;
}
param = (struct mlx5_eq_param) {
.index = MLX5_EQ_PFAULT_IDX,
.mask = 1 << MLX5_EVENT_TYPE_PAGE_FAULT,
.nent = MLX5_IB_NUM_PF_EQE,
.context = eq,
.handler = mlx5_ib_eq_pf_int
};
eq->core = mlx5_eq_create_generic(dev->mdev, "mlx5_ib_page_fault_eq", &param);
if (IS_ERR(eq->core)) {
err = PTR_ERR(eq->core);
goto err_wq;
}
return 0;
err_wq:
destroy_workqueue(eq->wq);
err_mempool:
mempool_destroy(eq->pool);
return err;
}
static int
mlx5_ib_destroy_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
{
int err;
err = mlx5_eq_destroy_generic(dev->mdev, eq->core);
cancel_work_sync(&eq->work);
destroy_workqueue(eq->wq);
mempool_destroy(eq->pool);
return err;
}
void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
{ {
if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
@ -1244,7 +1485,7 @@ void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
{ {
int ret; int ret = 0;
if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) { if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey); ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
@ -1254,7 +1495,20 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
} }
} }
return 0; if (!MLX5_CAP_GEN(dev->mdev, pg))
return ret;
ret = mlx5_ib_create_pf_eq(dev, &dev->odp_pf_eq);
return ret;
}
void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
{
if (!MLX5_CAP_GEN(dev->mdev, pg))
return;
mlx5_ib_destroy_pf_eq(dev, &dev->odp_pf_eq);
} }
int mlx5_ib_odp_init(void) int mlx5_ib_odp_init(void)
@ -1264,4 +1518,3 @@ int mlx5_ib_odp_init(void)
return 0; return 0;
} }

View File

@ -139,17 +139,6 @@ void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv)
spin_lock_irq(&priv->ctx_lock); spin_lock_irq(&priv->ctx_lock);
list_add_tail(&dev_ctx->list, &priv->ctx_list); list_add_tail(&dev_ctx->list, &priv->ctx_list);
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
if (dev_ctx->intf->pfault) {
if (priv->pfault) {
mlx5_core_err(dev, "multiple page fault handlers not supported");
} else {
priv->pfault_ctx = dev_ctx->context;
priv->pfault = dev_ctx->intf->pfault;
}
}
#endif
spin_unlock_irq(&priv->ctx_lock); spin_unlock_irq(&priv->ctx_lock);
} }
@ -179,15 +168,6 @@ void mlx5_remove_device(struct mlx5_interface *intf, struct mlx5_priv *priv)
if (!dev_ctx) if (!dev_ctx)
return; return;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
spin_lock_irq(&priv->ctx_lock);
if (priv->pfault == dev_ctx->intf->pfault)
priv->pfault = NULL;
spin_unlock_irq(&priv->ctx_lock);
synchronize_srcu(&priv->pfault_srcu);
#endif
spin_lock_irq(&priv->ctx_lock); spin_lock_irq(&priv->ctx_lock);
list_del(&dev_ctx->list); list_del(&dev_ctx->list);
spin_unlock_irq(&priv->ctx_lock); spin_unlock_irq(&priv->ctx_lock);
@ -447,20 +427,6 @@ void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event,
spin_unlock_irqrestore(&priv->ctx_lock, flags); spin_unlock_irqrestore(&priv->ctx_lock, flags);
} }
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
void mlx5_core_page_fault(struct mlx5_core_dev *dev,
struct mlx5_pagefault *pfault)
{
struct mlx5_priv *priv = &dev->priv;
int srcu_idx;
srcu_idx = srcu_read_lock(&priv->pfault_srcu);
if (priv->pfault)
priv->pfault(dev, priv->pfault_ctx, pfault);
srcu_read_unlock(&priv->pfault_srcu, srcu_idx);
}
#endif
void mlx5_dev_list_lock(void) void mlx5_dev_list_lock(void)
{ {
mutex_lock(&mlx5_intf_mutex); mutex_lock(&mlx5_intf_mutex);

View File

@ -56,13 +56,6 @@ enum {
MLX5_EQ_STATE_ALWAYS_ARMED = 0xb, MLX5_EQ_STATE_ALWAYS_ARMED = 0xb,
}; };
enum {
MLX5_NUM_SPARE_EQE = 0x80,
MLX5_NUM_ASYNC_EQE = 0x1000,
MLX5_NUM_CMD_EQE = 32,
MLX5_NUM_PF_DRAIN = 64,
};
enum { enum {
MLX5_EQ_DOORBEL_OFFSET = 0x40, MLX5_EQ_DOORBEL_OFFSET = 0x40,
}; };
@ -79,9 +72,6 @@ struct mlx5_eq_table {
struct mlx5_eq async_eq; struct mlx5_eq async_eq;
struct mlx5_eq cmd_eq; struct mlx5_eq cmd_eq;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
struct mlx5_eq_pagefault pfault_eq;
#endif
struct mutex lock; /* sync async eqs creations */ struct mutex lock; /* sync async eqs creations */
int num_comp_vectors; int num_comp_vectors;
struct mlx5_irq_info *irq_info; struct mlx5_irq_info *irq_info;
@ -222,224 +212,6 @@ static void eq_update_ci(struct mlx5_eq *eq, int arm)
mb(); mb();
} }
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
static void eqe_pf_action(struct work_struct *work)
{
struct mlx5_pagefault *pfault = container_of(work,
struct mlx5_pagefault,
work);
struct mlx5_eq_pagefault *eq = pfault->eq;
mlx5_core_page_fault(eq->core->dev, pfault);
mempool_free(pfault, eq->pool);
}
static void eq_pf_process(struct mlx5_eq_pagefault *eq)
{
struct mlx5_core_dev *dev = eq->core->dev;
struct mlx5_eqe_page_fault *pf_eqe;
struct mlx5_pagefault *pfault;
struct mlx5_eqe *eqe;
int set_ci = 0;
while ((eqe = next_eqe_sw(eq->core))) {
pfault = mempool_alloc(eq->pool, GFP_ATOMIC);
if (!pfault) {
schedule_work(&eq->work);
break;
}
dma_rmb();
pf_eqe = &eqe->data.page_fault;
pfault->event_subtype = eqe->sub_type;
pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed);
mlx5_core_dbg(dev,
"PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n",
eqe->sub_type, pfault->bytes_committed);
switch (eqe->sub_type) {
case MLX5_PFAULT_SUBTYPE_RDMA:
/* RDMA based event */
pfault->type =
be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24;
pfault->token =
be32_to_cpu(pf_eqe->rdma.pftype_token) &
MLX5_24BIT_MASK;
pfault->rdma.r_key =
be32_to_cpu(pf_eqe->rdma.r_key);
pfault->rdma.packet_size =
be16_to_cpu(pf_eqe->rdma.packet_length);
pfault->rdma.rdma_op_len =
be32_to_cpu(pf_eqe->rdma.rdma_op_len);
pfault->rdma.rdma_va =
be64_to_cpu(pf_eqe->rdma.rdma_va);
mlx5_core_dbg(dev,
"PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n",
pfault->type, pfault->token,
pfault->rdma.r_key);
mlx5_core_dbg(dev,
"PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n",
pfault->rdma.rdma_op_len,
pfault->rdma.rdma_va);
break;
case MLX5_PFAULT_SUBTYPE_WQE:
/* WQE based event */
pfault->type =
(be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7;
pfault->token =
be32_to_cpu(pf_eqe->wqe.token);
pfault->wqe.wq_num =
be32_to_cpu(pf_eqe->wqe.pftype_wq) &
MLX5_24BIT_MASK;
pfault->wqe.wqe_index =
be16_to_cpu(pf_eqe->wqe.wqe_index);
pfault->wqe.packet_size =
be16_to_cpu(pf_eqe->wqe.packet_length);
mlx5_core_dbg(dev,
"PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n",
pfault->type, pfault->token,
pfault->wqe.wq_num,
pfault->wqe.wqe_index);
break;
default:
mlx5_core_warn(dev,
"Unsupported page fault event sub-type: 0x%02hhx\n",
eqe->sub_type);
/* Unsupported page faults should still be
* resolved by the page fault handler
*/
}
pfault->eq = eq;
INIT_WORK(&pfault->work, eqe_pf_action);
queue_work(eq->wq, &pfault->work);
++eq->core->cons_index;
++set_ci;
if (unlikely(set_ci >= MLX5_NUM_SPARE_EQE)) {
eq_update_ci(eq->core, 0);
set_ci = 0;
}
}
eq_update_ci(eq->core, 1);
}
static irqreturn_t mlx5_eq_pf_int(int irq, void *eq_ptr)
{
struct mlx5_eq_pagefault *eq = eq_ptr;
unsigned long flags;
if (spin_trylock_irqsave(&eq->lock, flags)) {
eq_pf_process(eq);
spin_unlock_irqrestore(&eq->lock, flags);
} else {
schedule_work(&eq->work);
}
return IRQ_HANDLED;
}
/* mempool_refill() was proposed but unfortunately wasn't accepted
* http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html
* Chip workaround.
*/
static void mempool_refill(mempool_t *pool)
{
while (pool->curr_nr < pool->min_nr)
mempool_free(mempool_alloc(pool, GFP_KERNEL), pool);
}
static void eq_pf_action(struct work_struct *work)
{
struct mlx5_eq_pagefault *eq =
container_of(work, struct mlx5_eq_pagefault, work);
mempool_refill(eq->pool);
spin_lock_irq(&eq->lock);
eq_pf_process(eq);
spin_unlock_irq(&eq->lock);
}
static int
create_pf_eq(struct mlx5_core_dev *dev, struct mlx5_eq_pagefault *eq)
{
struct mlx5_eq_param param = {};
int err;
spin_lock_init(&eq->lock);
INIT_WORK(&eq->work, eq_pf_action);
eq->pool = mempool_create_kmalloc_pool(MLX5_NUM_PF_DRAIN,
sizeof(struct mlx5_pagefault));
if (!eq->pool)
return -ENOMEM;
eq->wq = alloc_workqueue("mlx5_page_fault",
WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM,
MLX5_NUM_CMD_EQE);
if (!eq->wq) {
err = -ENOMEM;
goto err_mempool;
}
param = (struct mlx5_eq_param) {
.index = MLX5_EQ_PFAULT_IDX,
.mask = 1 << MLX5_EVENT_TYPE_PAGE_FAULT,
.nent = MLX5_NUM_ASYNC_EQE,
.context = eq,
.handler = mlx5_eq_pf_int
};
eq->core = mlx5_eq_create_generic(dev, "mlx5_page_fault_eq", &param);
if (IS_ERR(eq->core)) {
err = PTR_ERR(eq->core);
goto err_wq;
}
return 0;
err_wq:
destroy_workqueue(eq->wq);
err_mempool:
mempool_destroy(eq->pool);
return err;
}
static int destroy_pf_eq(struct mlx5_core_dev *dev, struct mlx5_eq_pagefault *eq)
{
int err;
err = mlx5_eq_destroy_generic(dev, eq->core);
cancel_work_sync(&eq->work);
destroy_workqueue(eq->wq);
mempool_destroy(eq->pool);
return err;
}
int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 token,
u32 wq_num, u8 type, int error)
{
u32 out[MLX5_ST_SZ_DW(page_fault_resume_out)] = {0};
u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {0};
MLX5_SET(page_fault_resume_in, in, opcode,
MLX5_CMD_OP_PAGE_FAULT_RESUME);
MLX5_SET(page_fault_resume_in, in, error, !!error);
MLX5_SET(page_fault_resume_in, in, page_fault_type, type);
MLX5_SET(page_fault_resume_in, in, wq_number, wq_num);
MLX5_SET(page_fault_resume_in, in, token, token);
return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
}
EXPORT_SYMBOL_GPL(mlx5_core_page_fault_resume);
#endif
static void general_event_handler(struct mlx5_core_dev *dev, static void general_event_handler(struct mlx5_core_dev *dev,
struct mlx5_eqe *eqe) struct mlx5_eqe *eqe)
{ {
@ -1016,22 +788,7 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
goto err2; goto err2;
} }
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
if (MLX5_CAP_GEN(dev, pg)) {
err = create_pf_eq(dev, &table->pfault_eq);
if (err) {
mlx5_core_warn(dev, "failed to create page fault EQ %d\n",
err);
goto err3;
}
}
return err; return err;
err3:
destroy_async_eq(dev, &table->pages_eq);
#else
return err;
#endif
err2: err2:
destroy_async_eq(dev, &table->async_eq); destroy_async_eq(dev, &table->async_eq);
@ -1047,15 +804,6 @@ static void destroy_async_eqs(struct mlx5_core_dev *dev)
struct mlx5_eq_table *table = dev->priv.eq_table; struct mlx5_eq_table *table = dev->priv.eq_table;
int err; int err;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
if (MLX5_CAP_GEN(dev, pg)) {
err = destroy_pf_eq(dev, &table->pfault_eq);
if (err)
mlx5_core_err(dev, "failed to destroy page fault eq, err(%d)\n",
err);
}
#endif
err = destroy_async_eq(dev, &table->pages_eq); err = destroy_async_eq(dev, &table->pages_eq);
if (err) if (err)
mlx5_core_err(dev, "failed to destroy pages eq, err(%d)\n", mlx5_core_err(dev, "failed to destroy pages eq, err(%d)\n",

View File

@ -39,14 +39,6 @@ struct mlx5_eq_comp {
struct list_head list; struct list_head list;
}; };
struct mlx5_eq_pagefault {
struct mlx5_eq *core;
struct work_struct work;
spinlock_t lock; /* Pagefaults spinlock */
struct workqueue_struct *wq;
mempool_t *pool;
};
int mlx5_eq_table_init(struct mlx5_core_dev *dev); int mlx5_eq_table_init(struct mlx5_core_dev *dev);
void mlx5_eq_table_cleanup(struct mlx5_core_dev *dev); void mlx5_eq_table_cleanup(struct mlx5_core_dev *dev);
int mlx5_eq_table_create(struct mlx5_core_dev *dev); int mlx5_eq_table_create(struct mlx5_core_dev *dev);

View File

@ -1169,14 +1169,6 @@ static int init_one(struct pci_dev *pdev,
INIT_LIST_HEAD(&priv->waiting_events_list); INIT_LIST_HEAD(&priv->waiting_events_list);
priv->is_accum_events = false; priv->is_accum_events = false;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
err = init_srcu_struct(&priv->pfault_srcu);
if (err) {
dev_err(&pdev->dev, "init_srcu_struct failed with error code %d\n",
err);
goto clean_dev;
}
#endif
mutex_init(&priv->bfregs.reg_head.lock); mutex_init(&priv->bfregs.reg_head.lock);
mutex_init(&priv->bfregs.wc_head.lock); mutex_init(&priv->bfregs.wc_head.lock);
INIT_LIST_HEAD(&priv->bfregs.reg_head.list); INIT_LIST_HEAD(&priv->bfregs.reg_head.list);
@ -1185,7 +1177,7 @@ static int init_one(struct pci_dev *pdev,
err = mlx5_pci_init(dev, priv); err = mlx5_pci_init(dev, priv);
if (err) { if (err) {
dev_err(&pdev->dev, "mlx5_pci_init failed with error code %d\n", err); dev_err(&pdev->dev, "mlx5_pci_init failed with error code %d\n", err);
goto clean_srcu; goto clean_dev;
} }
err = mlx5_health_init(dev); err = mlx5_health_init(dev);
@ -1218,11 +1210,7 @@ clean_health:
mlx5_health_cleanup(dev); mlx5_health_cleanup(dev);
close_pci: close_pci:
mlx5_pci_close(dev, priv); mlx5_pci_close(dev, priv);
clean_srcu:
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
cleanup_srcu_struct(&priv->pfault_srcu);
clean_dev: clean_dev:
#endif
devlink_free(devlink); devlink_free(devlink);
return err; return err;
@ -1246,9 +1234,6 @@ static void remove_one(struct pci_dev *pdev)
mlx5_pagealloc_cleanup(dev); mlx5_pagealloc_cleanup(dev);
mlx5_health_cleanup(dev); mlx5_health_cleanup(dev);
mlx5_pci_close(dev, priv); mlx5_pci_close(dev, priv);
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
cleanup_srcu_struct(&priv->pfault_srcu);
#endif
devlink_free(devlink); devlink_free(devlink);
} }

View File

@ -100,8 +100,6 @@ int mlx5_cmd_fast_teardown_hca(struct mlx5_core_dev *dev);
void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event, void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event,
unsigned long param); unsigned long param);
void mlx5_core_page_fault(struct mlx5_core_dev *dev,
struct mlx5_pagefault *pfault);
void mlx5_port_module_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe); void mlx5_port_module_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe);
void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force); void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force);
void mlx5_disable_device(struct mlx5_core_dev *dev); void mlx5_disable_device(struct mlx5_core_dev *dev);

View File

@ -510,7 +510,6 @@ struct mlx5_fc_stats {
struct mlx5_mpfs; struct mlx5_mpfs;
struct mlx5_eswitch; struct mlx5_eswitch;
struct mlx5_lag; struct mlx5_lag;
struct mlx5_pagefault;
struct mlx5_eq_table; struct mlx5_eq_table;
struct mlx5_rate_limit { struct mlx5_rate_limit {
@ -619,13 +618,6 @@ struct mlx5_priv {
struct mlx5_port_module_event_stats pme_stats; struct mlx5_port_module_event_stats pme_stats;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
void (*pfault)(struct mlx5_core_dev *dev,
void *context,
struct mlx5_pagefault *pfault);
void *pfault_ctx;
struct srcu_struct pfault_srcu;
#endif
struct mlx5_bfreg_data bfregs; struct mlx5_bfreg_data bfregs;
struct mlx5_uars_page *uar; struct mlx5_uars_page *uar;
}; };
@ -650,44 +642,6 @@ enum mlx5_pagefault_type_flags {
MLX5_PFAULT_RDMA = 1 << 2, MLX5_PFAULT_RDMA = 1 << 2,
}; };
/* Contains the details of a pagefault. */
struct mlx5_pagefault {
u32 bytes_committed;
u32 token;
u8 event_subtype;
u8 type;
union {
/* Initiator or send message responder pagefault details. */
struct {
/* Received packet size, only valid for responders. */
u32 packet_size;
/*
* Number of resource holding WQE, depends on type.
*/
u32 wq_num;
/*
* WQE index. Refers to either the send queue or
* receive queue, according to event_subtype.
*/
u16 wqe_index;
} wqe;
/* RDMA responder pagefault details */
struct {
u32 r_key;
/*
* Received packet size, minimal size page fault
* resolution required for forward progress.
*/
u32 packet_size;
u32 rdma_op_len;
u64 rdma_va;
} rdma;
};
struct mlx5_eq_pagefault *eq;
struct work_struct work;
};
struct mlx5_td { struct mlx5_td {
struct list_head tirs_list; struct list_head tirs_list;
u32 tdn; u32 tdn;
@ -1118,9 +1072,6 @@ struct mlx5_interface {
void (*detach)(struct mlx5_core_dev *dev, void *context); void (*detach)(struct mlx5_core_dev *dev, void *context);
void (*event)(struct mlx5_core_dev *dev, void *context, void (*event)(struct mlx5_core_dev *dev, void *context,
enum mlx5_dev_event event, unsigned long param); enum mlx5_dev_event event, unsigned long param);
void (*pfault)(struct mlx5_core_dev *dev,
void *context,
struct mlx5_pagefault *pfault);
void * (*get_dev)(void *context); void * (*get_dev)(void *context);
int protocol; int protocol;
struct list_head list; struct list_head list;

View File

@ -17,6 +17,10 @@ enum {
MLX5_EQ_VEC_COMP_BASE = MLX5_EQ_MAX_ASYNC_EQS, MLX5_EQ_VEC_COMP_BASE = MLX5_EQ_MAX_ASYNC_EQS,
}; };
#define MLX5_NUM_CMD_EQE (32)
#define MLX5_NUM_ASYNC_EQE (0x1000)
#define MLX5_NUM_SPARE_EQE (0x80)
struct mlx5_eq; struct mlx5_eq;
struct mlx5_eq_param { struct mlx5_eq_param {
@ -36,4 +40,21 @@ mlx5_eq_destroy_generic(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
struct mlx5_eqe *mlx5_eq_get_eqe(struct mlx5_eq *eq, u32 cc); struct mlx5_eqe *mlx5_eq_get_eqe(struct mlx5_eq *eq, u32 cc);
void mlx5_eq_update_ci(struct mlx5_eq *eq, u32 cc, bool arm); void mlx5_eq_update_ci(struct mlx5_eq *eq, u32 cc, bool arm);
/* The HCA will think the queue has overflowed if we
* don't tell it we've been processing events. We
* create EQs with MLX5_NUM_SPARE_EQE extra entries,
* so we must update our consumer index at
* least that often.
*
* mlx5_eq_update_cc must be called on every EQE @EQ irq handler
*/
static inline u32 mlx5_eq_update_cc(struct mlx5_eq *eq, u32 cc)
{
if (unlikely(cc >= MLX5_NUM_SPARE_EQE)) {
mlx5_eq_update_ci(eq, cc, 0);
cc = 0;
}
return cc;
}
#endif /* MLX5_CORE_EQ_H */ #endif /* MLX5_CORE_EQ_H */