RDMA/mlx5: Replace cache list with Xarray

The Xarray allows us to store the cached mkeys in a memory-efficient way.

Entries are reserved in the Xarray using xa_cmpxchg before the
asynchronous mkey creation callbacks run, so that no allocation is
needed in interrupt context. xa_cmpxchg can sleep when using
GFP_KERNEL, so we call it in a loop to ensure one reserved entry for
each process trying to reserve.

Link: https://lore.kernel.org/r/20220726071911.122765-3-michaelgur@nvidia.com
Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
commit 86457a92df
parent 17ae355926
Author:    Aharon Landau <aharonl@nvidia.com>
Date:      2022-07-26 10:19:08 +03:00
Committer: Jason Gunthorpe

 2 files changed, 152 insertions(+), 88 deletions(-)
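
As a rough, self-contained sketch of the reservation pattern described
above (not the driver's code; toy_cache and toy_reserve are made-up
names, assuming only the stock <linux/xarray.h> API): attempt the store
under xa_lock_irq, and when the XArray needs memory, drop the lock, let
xas_nomem() allocate with GFP_KERNEL, and retry. push_mkey() in the
mr.c hunk below has the same shape, additionally storing a real mkey
when one is handed back to the cache.

#include <linux/errno.h>
#include <linux/xarray.h>

struct toy_cache {
        struct xarray entries;          /* initialised with XA_FLAGS_LOCK_IRQ */
        unsigned long reserved;         /* next index to reserve */
};

static int toy_reserve(struct toy_cache *c)
{
        XA_STATE(xas, &c->entries, 0);
        void *curr;

        xa_lock_irq(&c->entries);
        while (1) {
                /* Re-aim the xa_state at the current reservation index. */
                xas_set(&xas, c->reserved);
                curr = xas_load(&xas);
                if (!curr) {
                        /* Placeholder: occupies the slot, reads back as NULL. */
                        xas_store(&xas, XA_ZERO_ENTRY);
                        if (xas_valid(&xas))
                                c->reserved++;
                }
                xa_unlock_irq(&c->entries);
                /*
                 * xas_nomem() must always be called: outside the lock it
                 * frees or allocates the cached node with GFP_KERNEL and
                 * returns true when the failed store should be retried.
                 */
                if (!xas_nomem(&xas, GFP_KERNEL))
                        break;
                xa_lock_irq(&c->entries);
        }
        if (xas_error(&xas))
                return xas_error(&xas);
        if (WARN_ON(curr))
                return -EINVAL;
        return 0;
}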

drivers/infiniband/hw/mlx5/mlx5_ib.h

@@ -651,8 +651,6 @@ struct mlx5_ib_mr {
struct {
u32 out[MLX5_ST_SZ_DW(create_mkey_out)];
struct mlx5_async_work cb_work;
/* Cache list element */
struct list_head list;
};
/* Used only by kernel MRs (umem == NULL) */
@@ -744,7 +742,8 @@ struct umr_common {
struct mlx5_cache_ent {
struct xarray mkeys;
struct list_head head;
unsigned long stored;
unsigned long reserved;
char name[4];
u32 order;
@@ -756,18 +755,13 @@ struct mlx5_cache_ent {
u8 fill_to_high_water:1;
/*
* - available_mrs is the length of list head, ie the number of MRs
* available for immediate allocation.
* - total_mrs is available_mrs plus all in use MRs that could be
* - total_mrs is stored mkeys plus all in use MRs that could be
* returned to the cache.
* - limit is the low water mark for available_mrs, 2* limit is the
* - limit is the low water mark for stored mkeys, 2* limit is the
* upper water mark.
* - pending is the number of MRs currently being created
*/
u32 total_mrs;
u32 available_mrs;
u32 limit;
u32 pending;
/* Statistics */
u32 miss;
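
After this hunk, a single per-entry XArray carries what the list and
counters used to track: indices 0..stored-1 hold mkeys ready for
immediate reuse, while indices stored..reserved-1 hold XA_ZERO_ENTRY
placeholders for creations still in flight. Under that reading the
removed counters map onto the new fields as the hypothetical helpers
below spell out (they are not part of the patch and assume the
mlx5_ib.h definitions above), which is also why the
available_mrs + pending expressions in mr.c become ent->reserved in
the hunks that follow.

#include "mlx5_ib.h"    /* struct mlx5_cache_ent, as modified above */

/* Hypothetical helpers restating the removed counters. */
static inline unsigned long ent_available(struct mlx5_cache_ent *ent)
{
        return ent->stored;                     /* old available_mrs */
}

static inline unsigned long ent_pending(struct mlx5_cache_ent *ent)
{
        return ent->reserved - ent->stored;     /* old pending */
}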

drivers/infiniband/hw/mlx5/mr.c

@@ -142,6 +142,104 @@ static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
}
static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings,
void *to_store)
{
XA_STATE(xas, &ent->mkeys, 0);
void *curr;
xa_lock_irq(&ent->mkeys);
if (limit_pendings &&
(ent->reserved - ent->stored) > MAX_PENDING_REG_MR) {
xa_unlock_irq(&ent->mkeys);
return -EAGAIN;
}
while (1) {
/*
* This is cmpxchg (NULL, XA_ZERO_ENTRY) however this version
* doesn't transparently unlock. Instead we set the xas index to
* the current value of reserved every iteration.
*/
xas_set(&xas, ent->reserved);
curr = xas_load(&xas);
if (!curr) {
if (to_store && ent->stored == ent->reserved)
xas_store(&xas, to_store);
else
xas_store(&xas, XA_ZERO_ENTRY);
if (xas_valid(&xas)) {
ent->reserved++;
if (to_store) {
if (ent->stored != ent->reserved)
__xa_store(&ent->mkeys,
ent->stored,
to_store,
GFP_KERNEL);
ent->stored++;
queue_adjust_cache_locked(ent);
WRITE_ONCE(ent->dev->cache.last_add,
jiffies);
}
}
}
xa_unlock_irq(&ent->mkeys);
/*
* Notice xas_nomem() must always be called as it cleans
* up any cached allocation.
*/
if (!xas_nomem(&xas, GFP_KERNEL))
break;
xa_lock_irq(&ent->mkeys);
}
if (xas_error(&xas))
return xas_error(&xas);
if (WARN_ON(curr))
return -EINVAL;
return 0;
}
static void undo_push_reserve_mkey(struct mlx5_cache_ent *ent)
{
void *old;
ent->reserved--;
old = __xa_erase(&ent->mkeys, ent->reserved);
WARN_ON(old);
}
static void push_to_reserved(struct mlx5_cache_ent *ent, struct mlx5_ib_mr *mr)
{
void *old;
old = __xa_store(&ent->mkeys, ent->stored, mr, 0);
WARN_ON(old);
ent->stored++;
}
static struct mlx5_ib_mr *pop_stored_mkey(struct mlx5_cache_ent *ent)
{
struct mlx5_ib_mr *mr;
void *old;
ent->stored--;
ent->reserved--;
if (ent->stored == ent->reserved) {
mr = __xa_erase(&ent->mkeys, ent->stored);
WARN_ON(!mr);
return mr;
}
mr = __xa_store(&ent->mkeys, ent->stored, XA_ZERO_ENTRY,
GFP_KERNEL);
WARN_ON(!mr || xa_is_err(mr));
old = __xa_erase(&ent->mkeys, ent->reserved);
WARN_ON(old);
return mr;
}
static void create_mkey_callback(int status, struct mlx5_async_work *context)
{
struct mlx5_ib_mr *mr =
@@ -154,7 +252,7 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
create_mkey_warn(dev, status, mr->out);
kfree(mr);
xa_lock_irqsave(&ent->mkeys, flags);
ent->pending--;
undo_push_reserve_mkey(ent);
WRITE_ONCE(dev->fill_delay, 1);
xa_unlock_irqrestore(&ent->mkeys, flags);
mod_timer(&dev->delay_timer, jiffies + HZ);
@@ -169,12 +267,10 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
WRITE_ONCE(dev->cache.last_add, jiffies);
xa_lock_irqsave(&ent->mkeys, flags);
list_add_tail(&mr->list, &ent->head);
ent->available_mrs++;
push_to_reserved(ent, mr);
ent->total_mrs++;
/* If we are doing fill_to_high_water then keep going. */
queue_adjust_cache_locked(ent);
ent->pending--;
xa_unlock_irqrestore(&ent->mkeys, flags);
}
@@ -237,31 +333,33 @@ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
mr = alloc_cache_mr(ent, mkc);
if (!mr) {
err = -ENOMEM;
break;
goto free_in;
}
xa_lock_irq(&ent->mkeys);
if (ent->pending >= MAX_PENDING_REG_MR) {
err = -EAGAIN;
xa_unlock_irq(&ent->mkeys);
kfree(mr);
break;
}
ent->pending++;
xa_unlock_irq(&ent->mkeys);
err = push_mkey(ent, true, NULL);
if (err)
goto free_mr;
err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey,
&ent->dev->async_ctx, in, inlen,
mr->out, sizeof(mr->out),
&mr->cb_work);
if (err) {
xa_lock_irq(&ent->mkeys);
ent->pending--;
xa_unlock_irq(&ent->mkeys);
mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
kfree(mr);
break;
goto err_undo_reserve;
}
}
kfree(in);
return 0;
err_undo_reserve:
xa_lock_irq(&ent->mkeys);
undo_push_reserve_mkey(ent);
xa_unlock_irq(&ent->mkeys);
free_mr:
kfree(mr);
free_in:
kfree(in);
return err;
}
@@ -310,11 +408,9 @@ static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
struct mlx5_ib_mr *mr;
lockdep_assert_held(&ent->mkeys.xa_lock);
if (list_empty(&ent->head))
if (!ent->stored)
return;
mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
list_del(&mr->list);
ent->available_mrs--;
mr = pop_stored_mkey(ent);
ent->total_mrs--;
xa_unlock_irq(&ent->mkeys);
mlx5_core_destroy_mkey(ent->dev->mdev, mr->mmkey.key);
@@ -324,6 +420,7 @@ static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
bool limit_fill)
__acquires(&ent->mkeys) __releases(&ent->mkeys)
{
int err;
@@ -332,10 +429,10 @@ static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
while (true) {
if (limit_fill)
target = ent->limit * 2;
if (target == ent->available_mrs + ent->pending)
if (target == ent->reserved)
return 0;
if (target > ent->available_mrs + ent->pending) {
u32 todo = target - (ent->available_mrs + ent->pending);
if (target > ent->reserved) {
u32 todo = target - ent->reserved;
xa_unlock_irq(&ent->mkeys);
err = add_keys(ent, todo);
@@ -366,15 +463,15 @@ static ssize_t size_write(struct file *filp, const char __user *buf,
/*
* Target is the new value of total_mrs the user requests, however we
* cannot free MRs that are in use. Compute the target value for
* available_mrs.
* cannot free MRs that are in use. Compute the target value for stored
* mkeys.
*/
xa_lock_irq(&ent->mkeys);
if (target < ent->total_mrs - ent->available_mrs) {
if (target < ent->total_mrs - ent->stored) {
err = -EINVAL;
goto err_unlock;
}
target = target - (ent->total_mrs - ent->available_mrs);
target = target - (ent->total_mrs - ent->stored);
if (target < ent->limit || target > ent->limit*2) {
err = -EINVAL;
goto err_unlock;
@@ -466,7 +563,7 @@ static bool someone_adding(struct mlx5_mr_cache *cache)
bool ret;
xa_lock_irq(&ent->mkeys);
ret = ent->available_mrs < ent->limit;
ret = ent->stored < ent->limit;
xa_unlock_irq(&ent->mkeys);
if (ret)
return true;
@@ -485,22 +582,22 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
return;
if (ent->available_mrs < ent->limit) {
if (ent->stored < ent->limit) {
ent->fill_to_high_water = true;
mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
} else if (ent->fill_to_high_water &&
ent->available_mrs + ent->pending < 2 * ent->limit) {
ent->reserved < 2 * ent->limit) {
/*
* Once we start populating due to hitting a low water mark
* continue until we pass the high water mark.
*/
mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
} else if (ent->available_mrs == 2 * ent->limit) {
} else if (ent->stored == 2 * ent->limit) {
ent->fill_to_high_water = false;
} else if (ent->available_mrs > 2 * ent->limit) {
} else if (ent->stored > 2 * ent->limit) {
/* Queue deletion of excess entries */
ent->fill_to_high_water = false;
if (ent->pending)
if (ent->stored != ent->reserved)
queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
msecs_to_jiffies(1000));
else
@@ -518,8 +615,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
if (ent->disabled)
goto out;
if (ent->fill_to_high_water &&
ent->available_mrs + ent->pending < 2 * ent->limit &&
if (ent->fill_to_high_water && ent->reserved < 2 * ent->limit &&
!READ_ONCE(dev->fill_delay)) {
xa_unlock_irq(&ent->mkeys);
err = add_keys(ent, 1);
@@ -528,8 +624,8 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
goto out;
if (err) {
/*
* EAGAIN only happens if pending is positive, so we
* will be rescheduled from reg_mr_callback(). The only
* EAGAIN only happens if there are pending MRs, so we
* will be rescheduled when storing them. The only
* failure path here is ENOMEM.
*/
if (err != -EAGAIN) {
@@ -541,7 +637,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
msecs_to_jiffies(1000));
}
}
} else if (ent->available_mrs > 2 * ent->limit) {
} else if (ent->stored > 2 * ent->limit) {
bool need_delay;
/*
@@ -593,7 +689,7 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
return ERR_PTR(-EOPNOTSUPP);
xa_lock_irq(&ent->mkeys);
if (list_empty(&ent->head)) {
if (!ent->stored) {
queue_adjust_cache_locked(ent);
ent->miss++;
xa_unlock_irq(&ent->mkeys);
@@ -601,9 +697,7 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
if (IS_ERR(mr))
return mr;
} else {
mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
list_del(&mr->list);
ent->available_mrs--;
mr = pop_stored_mkey(ent);
queue_adjust_cache_locked(ent);
xa_unlock_irq(&ent->mkeys);
@@ -612,45 +706,23 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
return mr;
}
static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
struct mlx5_cache_ent *ent = mr->cache_ent;
WRITE_ONCE(dev->cache.last_add, jiffies);
xa_lock_irq(&ent->mkeys);
list_add_tail(&mr->list, &ent->head);
ent->available_mrs++;
queue_adjust_cache_locked(ent);
xa_unlock_irq(&ent->mkeys);
}
static void clean_keys(struct mlx5_ib_dev *dev, int c)
{
struct mlx5_mr_cache *cache = &dev->cache;
struct mlx5_cache_ent *ent = &cache->ent[c];
struct mlx5_ib_mr *tmp_mr;
struct mlx5_ib_mr *mr;
LIST_HEAD(del_list);
cancel_delayed_work(&ent->dwork);
while (1) {
xa_lock_irq(&ent->mkeys);
if (list_empty(&ent->head)) {
xa_unlock_irq(&ent->mkeys);
break;
}
mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
list_move(&mr->list, &del_list);
ent->available_mrs--;
xa_lock_irq(&ent->mkeys);
while (ent->stored) {
mr = pop_stored_mkey(ent);
ent->total_mrs--;
xa_unlock_irq(&ent->mkeys);
mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
}
list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
list_del(&mr->list);
kfree(mr);
xa_lock_irq(&ent->mkeys);
}
xa_unlock_irq(&ent->mkeys);
}
static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
@@ -680,7 +752,7 @@ static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
dir = debugfs_create_dir(ent->name, cache->root);
debugfs_create_file("size", 0600, dir, ent, &size_fops);
debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
debugfs_create_u32("cur", 0400, dir, &ent->available_mrs);
debugfs_create_ulong("cur", 0400, dir, &ent->stored);
debugfs_create_u32("miss", 0600, dir, &ent->miss);
}
}
@@ -709,7 +781,6 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
timer_setup(&dev->delay_timer, delay_time_func, 0);
for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
ent = &cache->ent[i];
INIT_LIST_HEAD(&ent->head);
xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
ent->order = i + 2;
ent->dev = dev;
@@ -1571,7 +1642,8 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
/* Stop DMA */
if (mr->cache_ent) {
if (mlx5r_umr_revoke_mr(mr)) {
if (mlx5r_umr_revoke_mr(mr) ||
push_mkey(mr->cache_ent, false, mr)) {
xa_lock_irq(&mr->cache_ent->mkeys);
mr->cache_ent->total_mrs--;
xa_unlock_irq(&mr->cache_ent->mkeys);
@@ -1595,9 +1667,7 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
mlx5_ib_free_odp_mr(mr);
}
if (mr->cache_ent) {
mlx5_mr_cache_free(dev, mr);
} else {
if (!mr->cache_ent) {
mlx5_free_priv_descs(mr);
kfree(mr);
}