IB/mlx5: Add advise_mr() support
The verb advise_mr() is used to give advice to the kernel about an address range that belongs to an MR. Implement the verb and register it on the device. The current implementation supports the only advice known to date: prefetch.

Signed-off-by: Moni Shoua <monis@mellanox.com>
Reviewed-by: Guy Levi <guyle@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
This commit is contained in:
parent
ad8a449675
commit
813e90b1ae
|
@ -5712,6 +5712,8 @@ void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
|
|||
mlx5_ib_cleanup_multiport_master(dev);
|
||||
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
|
||||
cleanup_srcu_struct(&dev->mr_srcu);
|
||||
drain_workqueue(dev->advise_mr_wq);
|
||||
destroy_workqueue(dev->advise_mr_wq);
|
||||
#endif
|
||||
kfree(dev->port);
|
||||
}
|
||||
|
@ -5766,6 +5768,12 @@ int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
|
|||
dev->memic.dev = mdev;
|
||||
|
||||
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
|
||||
dev->advise_mr_wq = alloc_ordered_workqueue("mlx5_ib_advise_mr_wq", 0);
|
||||
if (!dev->advise_mr_wq) {
|
||||
err = -ENOMEM;
|
||||
goto err_free_port;
|
||||
}
|
||||
|
||||
err = init_srcu_struct(&dev->mr_srcu);
|
||||
if (err)
|
||||
goto err_free_port;
|
||||
|
|
|
@ -923,6 +923,7 @@ struct mlx5_ib_dev {
|
|||
*/
|
||||
struct srcu_struct mr_srcu;
|
||||
u32 null_mkey;
|
||||
struct workqueue_struct *advise_mr_wq;
|
||||
#endif
|
||||
struct mlx5_ib_flow_db *flow_db;
|
||||
/* protect resources needed as part of reset flow */
|
||||
|
@ -1085,6 +1086,12 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc);
|
|||
struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
|
||||
u64 virt_addr, int access_flags,
|
||||
struct ib_udata *udata);
|
||||
int mlx5_ib_advise_mr(struct ib_pd *pd,
|
||||
enum ib_uverbs_advise_mr_advice advice,
|
||||
u32 flags,
|
||||
struct ib_sge *sg_list,
|
||||
u32 num_sge,
|
||||
struct uverbs_attr_bundle *attrs);
|
||||
struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
|
||||
struct ib_udata *udata);
|
||||
int mlx5_ib_dealloc_mw(struct ib_mw *mw);
|
||||
|
@ -1182,6 +1189,10 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
|
|||
void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
|
||||
void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
|
||||
size_t nentries, struct mlx5_ib_mr *mr, int flags);
|
||||
|
||||
int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
|
||||
enum ib_uverbs_advise_mr_advice advice,
|
||||
u32 flags, struct ib_sge *sg_list, u32 num_sge);
|
||||
#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
|
||||
static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
|
||||
{
|
||||
|
@ -1197,6 +1208,13 @@ static inline void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
|
|||
size_t nentries, struct mlx5_ib_mr *mr,
|
||||
int flags) {}
|
||||
|
||||
static int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
|
||||
enum ib_uverbs_advise_mr_advice advice,
|
||||
u32 flags, struct ib_sge *sg_list,
|
||||
u32 num_sge)
|
||||
{
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
|
||||
|
||||
/* Needed for rep profile */
|
||||
|
|
|
@ -1280,6 +1280,21 @@ err_free:
|
|||
return ERR_PTR(err);
|
||||
}
|
||||
|
||||
int mlx5_ib_advise_mr(struct ib_pd *pd,
|
||||
enum ib_uverbs_advise_mr_advice advice,
|
||||
u32 flags,
|
||||
struct ib_sge *sg_list,
|
||||
u32 num_sge,
|
||||
struct uverbs_attr_bundle *attrs)
|
||||
{
|
||||
if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
|
||||
advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
|
||||
sg_list, num_sge);
|
||||
}
|
||||
|
||||
struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
|
||||
struct ib_dm_mr_attr *attr,
|
||||
struct uverbs_attr_bundle *attrs)
|
||||
|
|
|
@ -549,10 +549,15 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
|
|||
wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
|
||||
}
|
||||
|
||||
#define MLX5_PF_FLAGS_PREFETCH BIT(0)
|
||||
#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
|
||||
static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
|
||||
u64 io_virt, size_t bcnt, u32 *bytes_mapped)
|
||||
u64 io_virt, size_t bcnt, u32 *bytes_mapped,
|
||||
u32 flags)
|
||||
{
|
||||
struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
|
||||
bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
|
||||
bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
|
||||
u64 access_mask = ODP_READ_ALLOWED_BIT;
|
||||
int npages = 0, page_shift, np;
|
||||
u64 start_idx, page_mask;
|
||||
|
@ -579,7 +584,15 @@ next_mr:
|
|||
page_mask = ~(BIT(page_shift) - 1);
|
||||
start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
|
||||
|
||||
if (mr->umem->writable)
|
||||
if (prefetch && !downgrade && !mr->umem->writable) {
|
||||
/* prefetch with write-access must
|
||||
* be supported by the MR
|
||||
*/
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (mr->umem->writable && !downgrade)
|
||||
access_mask |= ODP_WRITE_ALLOWED_BIT;
|
||||
|
||||
current_seq = READ_ONCE(odp->notifiers_seq);
|
||||
|
@ -684,12 +697,13 @@ struct pf_frame {
|
|||
* -EFAULT when there's an error mapping the requested pages. The caller will
|
||||
* abort the page fault handling.
|
||||
*/
|
||||
static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
|
||||
u32 key, u64 io_virt, size_t bcnt,
|
||||
static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, u32 key,
|
||||
u64 io_virt, size_t bcnt,
|
||||
u32 *bytes_committed,
|
||||
u32 *bytes_mapped)
|
||||
u32 *bytes_mapped, u32 flags)
|
||||
{
|
||||
int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
|
||||
bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
|
||||
struct pf_frame *head = NULL, *frame;
|
||||
struct mlx5_core_mkey *mmkey;
|
||||
struct mlx5_ib_mw *mw;
|
||||
|
@ -711,6 +725,12 @@ next_mr:
|
|||
goto srcu_unlock;
|
||||
}
|
||||
|
||||
if (prefetch && mmkey->type != MLX5_MKEY_MR) {
|
||||
mlx5_ib_dbg(dev, "prefetch is allowed only for MR\n");
|
||||
ret = -EINVAL;
|
||||
goto srcu_unlock;
|
||||
}
|
||||
|
||||
switch (mmkey->type) {
|
||||
case MLX5_MKEY_MR:
|
||||
mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
|
||||
|
@ -720,6 +740,11 @@ next_mr:
|
|||
goto srcu_unlock;
|
||||
}
|
||||
|
||||
if (prefetch && !mr->umem->is_odp) {
|
||||
ret = -EINVAL;
|
||||
goto srcu_unlock;
|
||||
}
|
||||
|
||||
if (!mr->umem->is_odp) {
|
||||
mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
|
||||
key);
|
||||
|
@ -729,7 +754,7 @@ next_mr:
|
|||
goto srcu_unlock;
|
||||
}
|
||||
|
||||
ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped);
|
||||
ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped, flags);
|
||||
if (ret < 0)
|
||||
goto srcu_unlock;
|
||||
|
||||
|
@ -906,7 +931,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev,
|
|||
|
||||
ret = pagefault_single_data_segment(dev, key, io_virt, bcnt,
|
||||
&pfault->bytes_committed,
|
||||
bytes_mapped);
|
||||
bytes_mapped, 0);
|
||||
if (ret < 0)
|
||||
break;
|
||||
npages += ret;
|
||||
|
@ -1217,7 +1242,8 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
|
|||
}
|
||||
|
||||
ret = pagefault_single_data_segment(dev, rkey, address, length,
|
||||
&pfault->bytes_committed, NULL);
|
||||
&pfault->bytes_committed, NULL,
|
||||
0);
|
||||
if (ret == -EAGAIN) {
|
||||
/* We're racing with an invalidation, don't prefetch */
|
||||
prefetch_activated = 0;
|
||||
|
@ -1244,7 +1270,8 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
|
|||
|
||||
ret = pagefault_single_data_segment(dev, rkey, address,
|
||||
prefetch_len,
|
||||
&bytes_committed, NULL);
|
||||
&bytes_committed, NULL,
|
||||
0);
|
||||
if (ret < 0 && ret != -EAGAIN) {
|
||||
mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
|
||||
ret, pfault->token, address, prefetch_len);
|
||||
|
@ -1493,10 +1520,17 @@ void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
|
|||
}
|
||||
}
|
||||
|
||||
/*
 * Device ops that mlx5_ib_odp_init_one() installs only when the device
 * reports IB_ODP_SUPPORT; currently this is just the advise_mr verb.
 */
static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
|
||||
.advise_mr = mlx5_ib_advise_mr,
|
||||
};
|
||||
|
||||
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
|
||||
ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);
|
||||
|
||||
if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
|
||||
ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
|
||||
if (ret) {
|
||||
|
@ -1528,3 +1562,76 @@ int mlx5_ib_odp_init(void)
|
|||
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct prefetch_mr_work {
|
||||
struct work_struct work;
|
||||
struct mlx5_ib_dev *dev;
|
||||
u32 pf_flags;
|
||||
u32 num_sge;
|
||||
struct ib_sge sg_list[0];
|
||||
};
|
||||
|
||||
/*
 * Fault in every range described by @sg_list.
 *
 * Each entry is handed to pagefault_single_data_segment() with a fresh,
 * zeroed bytes_committed counter and no bytes_mapped output. Processing
 * stops at the first entry that fails.
 *
 * Return: 0 when all @num_sge entries were processed, otherwise the negative
 * error returned for the first failing entry.
 */
static int mlx5_ib_prefetch_sg_list(struct mlx5_ib_dev *dev, u32 pf_flags,
				    struct ib_sge *sg_list, u32 num_sge)
{
	u32 i;	/* same type as num_sge: no signed/unsigned comparison */

	for (i = 0; i < num_sge; ++i) {
		struct ib_sge *sg = &sg_list[i];
		/* pagefault_single_data_segment() takes a u32 * here */
		u32 bytes_committed = 0;
		int ret;

		ret = pagefault_single_data_segment(dev, sg->lkey, sg->addr,
						    sg->length,
						    &bytes_committed, NULL,
						    pf_flags);
		if (ret < 0)
			return ret;
	}

	return 0;
}
|
||||
|
||||
static void mlx5_ib_prefetch_mr_work(struct work_struct *work)
|
||||
{
|
||||
struct prefetch_mr_work *w =
|
||||
container_of(work, struct prefetch_mr_work, work);
|
||||
|
||||
if (w->dev->ib_dev.reg_state == IB_DEV_REGISTERED)
|
||||
mlx5_ib_prefetch_sg_list(w->dev, w->pf_flags, w->sg_list,
|
||||
w->num_sge);
|
||||
|
||||
kfree(w);
|
||||
}
|
||||
|
||||
/*
 * Prefetch the ranges described by @sg_list.
 *
 * IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH sets MLX5_PF_FLAGS_DOWNGRADE in
 * addition to MLX5_PF_FLAGS_PREFETCH, so the fault path does not request
 * write access; any other advice reaching here is prefetched without the
 * downgrade.
 *
 * With IB_UVERBS_ADVISE_MR_FLAG_FLUSH the prefetch runs synchronously and
 * its result is returned. Otherwise the SGE list is copied into a work item
 * that is queued for later execution, and 0 only means it was queued.
 *
 * Return: 0 on success, -ENODEV if the device is not registered (async path
 * only), -ENOMEM if the work item cannot be allocated, or the error from the
 * synchronous prefetch.
 */
int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
			       enum ib_uverbs_advise_mr_advice advice,
			       u32 flags, struct ib_sge *sg_list, u32 num_sge)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	u32 pf_flags = MLX5_PF_FLAGS_PREFETCH;
	struct prefetch_mr_work *work;

	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
		pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;

	if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
		return mlx5_ib_prefetch_sg_list(dev, pf_flags, sg_list,
						num_sge);

	if (dev->ib_dev.reg_state != IB_DEV_REGISTERED)
		return -ENODEV;

	/* struct_size() guards the header + num_sge entries size computation */
	work = kvzalloc(struct_size(work, sg_list, num_sge), GFP_KERNEL);
	if (!work)
		return -ENOMEM;

	/* The work item carries its own copy of the SGE list */
	memcpy(work->sg_list, sg_list, num_sge * sizeof(struct ib_sge));

	work->dev = dev;
	work->pf_flags = pf_flags;
	work->num_sge = num_sge;

	INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);

	/*
	 * Queue on the device's own advise_mr workqueue rather than the
	 * system one: that is the queue the device cleanup path drains and
	 * destroys, so pending prefetch work is waited for at teardown.
	 */
	queue_work(dev->advise_mr_wq, &work->work);
	return 0;
}
|
||||
|
|
Loading…
Reference in New Issue