RDMA/umem: Get rid of per_mm->notifier_count
This is intrinsically racy and the scheme is simply unnecessary. New MR
registration can wait for any ongoing invalidation to fully complete.

        CPU0                                    CPU1
                                         if (atomic_read())
  if (atomic_dec_and_test() &&
      !list_empty())
    { /* not taken */ }
                                         list_add()

This puts the new UMEM into a kind of purgatory until another invalidate
rolls through.

Instead, hold the read side of the umem_rwsem across the paired start/end
and get rid of the racy 'deferred add' approach.

Since all umems in the rbtree are always ready to go, also get rid of the
mn_counters_active machinery.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
parent f27a0d50a4
commit ca748c39ea
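Before the diff itself, here is a condensed sketch of the scheme the patch moves to. It paraphrases the hunks below rather than quoting them: the sketch_* wrappers are illustrative, error handling and the trampoline bodies are omitted, and only the identifiers that actually appear in the patch (umem_rwsem, rbt_ib_umem_for_each_in_range, rbt_ib_umem_insert, the trampolines) are real. Invalidation takes the read side of umem_rwsem in range_start and releases it only in range_end, while registration takes the write side and therefore simply waits out any in-flight invalidation:

/* Invalidation: acquire the read side in range_start and keep it held. */
static int sketch_invalidate_range_start(struct ib_ucontext_per_mm *per_mm,
                                         u64 start, u64 end, bool blockable)
{
        if (blockable)
                down_read(&per_mm->umem_rwsem);
        else if (!down_read_trylock(&per_mm->umem_rwsem))
                return -EAGAIN;

        return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end,
                                             invalidate_range_start_trampoline,
                                             blockable, NULL);
}

/* ... and drop it only when the paired range_end arrives. */
static void sketch_invalidate_range_end(struct ib_ucontext_per_mm *per_mm,
                                        u64 start, u64 end)
{
        rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end,
                                      invalidate_range_end_trampoline, true, NULL);
        up_read(&per_mm->umem_rwsem);
}

/*
 * Registration takes the write side, so a new umem blocks until any
 * in-flight start/end pair completes; no deferred-add list is needed.
 */
static void sketch_add_umem(struct ib_ucontext_per_mm *per_mm,
                            struct ib_umem_odp *umem_odp)
{
        down_write(&per_mm->umem_rwsem);
        rbt_ib_umem_insert(&umem_odp->interval_tree, &per_mm->umem_tree);
        up_write(&per_mm->umem_rwsem);
}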
@@ -80,83 +80,29 @@ INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
 static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp)
 {
         mutex_lock(&umem_odp->umem_mutex);
-
-        /* Only update private counters for this umem if it has them.
-         * Otherwise skip it. All page faults will be delayed for this umem. */
-        if (umem_odp->mn_counters_active) {
-                int notifiers_count = umem_odp->notifiers_count++;
-
-                if (notifiers_count == 0)
-                        /* Initialize the completion object for waiting on
-                         * notifiers. Since notifier_count is zero, no one
-                         * should be waiting right now. */
-                        reinit_completion(&umem_odp->notifier_completion);
-        }
+        if (umem_odp->notifiers_count++ == 0)
+                /*
+                 * Initialize the completion object for waiting on
+                 * notifiers. Since notifier_count is zero, no one should be
+                 * waiting right now.
+                 */
+                reinit_completion(&umem_odp->notifier_completion);
         mutex_unlock(&umem_odp->umem_mutex);
 }

 static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp)
 {
         mutex_lock(&umem_odp->umem_mutex);
-
-        /* Only update private counters for this umem if it has them.
-         * Otherwise skip it. All page faults will be delayed for this umem. */
-        if (umem_odp->mn_counters_active) {
-                /*
-                 * This sequence increase will notify the QP page fault that
-                 * the page that is going to be mapped in the spte could have
-                 * been freed.
-                 */
-                ++umem_odp->notifiers_seq;
-                if (--umem_odp->notifiers_count == 0)
-                        complete_all(&umem_odp->notifier_completion);
-        }
+        /*
+         * This sequence increase will notify the QP page fault that the page
+         * that is going to be mapped in the spte could have been freed.
+         */
+        ++umem_odp->notifiers_seq;
+        if (--umem_odp->notifiers_count == 0)
+                complete_all(&umem_odp->notifier_completion);
         mutex_unlock(&umem_odp->umem_mutex);
 }

-/* Account for a new mmu notifier in an ib_ucontext. */
-static void
-ib_ucontext_notifier_start_account(struct ib_ucontext_per_mm *per_mm)
-{
-        atomic_inc(&per_mm->notifier_count);
-}
-
-/* Account for a terminating mmu notifier in an ib_ucontext.
- *
- * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since
- * the function takes the semaphore itself. */
-static void ib_ucontext_notifier_end_account(struct ib_ucontext_per_mm *per_mm)
-{
-        int zero_notifiers = atomic_dec_and_test(&per_mm->notifier_count);
-
-        if (zero_notifiers &&
-            !list_empty(&per_mm->no_private_counters)) {
-                /* No currently running mmu notifiers. Now is the chance to
-                 * add private accounting to all previously added umems. */
-                struct ib_umem_odp *odp_data, *next;
-
-                /* Prevent concurrent mmu notifiers from working on the
-                 * no_private_counters list. */
-                down_write(&per_mm->umem_rwsem);
-
-                /* Read the notifier_count again, with the umem_rwsem
-                 * semaphore taken for write. */
-                if (!atomic_read(&per_mm->notifier_count)) {
-                        list_for_each_entry_safe(odp_data, next,
-                                                 &per_mm->no_private_counters,
-                                                 no_private_counters) {
-                                mutex_lock(&odp_data->umem_mutex);
-                                odp_data->mn_counters_active = true;
-                                list_del(&odp_data->no_private_counters);
-                                complete_all(&odp_data->notifier_completion);
-                                mutex_unlock(&odp_data->umem_mutex);
-                        }
-                }
-
-                up_write(&per_mm->umem_rwsem);
-        }
-}
-
 static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp,
                                                u64 start, u64 end, void *cookie)
 {
@@ -186,7 +132,6 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn,
         if (!per_mm->context->invalidate_range)
                 return;

-        ib_ucontext_notifier_start_account(per_mm);
         down_read(&per_mm->umem_rwsem);
         rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0,
                                       ULLONG_MAX,
@@ -231,14 +176,9 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
         else if (!down_read_trylock(&per_mm->umem_rwsem))
                 return -EAGAIN;

-        ib_ucontext_notifier_start_account(per_mm);
-        ret = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start,
-                                            end,
-                                            invalidate_range_start_trampoline,
-                                            blockable, NULL);
-        up_read(&per_mm->umem_rwsem);
-
-        return ret;
+        return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end,
+                                             invalidate_range_start_trampoline,
+                                             blockable, NULL);
 }

 static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
@@ -259,17 +199,10 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
         if (!per_mm->context->invalidate_range)
                 return;

-        /*
-         * TODO: we currently bail out if there is any sleepable work to be done
-         * in ib_umem_notifier_invalidate_range_start so we shouldn't really block
-         * here. But this is ugly and fragile.
-         */
-        down_read(&per_mm->umem_rwsem);
         rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start,
                                       end,
                                       invalidate_range_end_trampoline, true, NULL);
         up_read(&per_mm->umem_rwsem);
-        ib_ucontext_notifier_end_account(per_mm);
 }

 static const struct mmu_notifier_ops ib_umem_notifiers = {
@@ -287,12 +220,6 @@ static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp)
         if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
                 rbt_ib_umem_insert(&umem_odp->interval_tree,
                                    &per_mm->umem_tree);
-
-        if (likely(!atomic_read(&per_mm->notifier_count)))
-                umem_odp->mn_counters_active = true;
-        else
-                list_add(&umem_odp->no_private_counters,
-                         &per_mm->no_private_counters);
         up_write(&per_mm->umem_rwsem);
 }

@@ -305,10 +232,7 @@ static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp)
         if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
                 rbt_ib_umem_remove(&umem_odp->interval_tree,
                                    &per_mm->umem_tree);
-        if (!umem_odp->mn_counters_active) {
-                list_del(&umem_odp->no_private_counters);
-                complete_all(&umem_odp->notifier_completion);
-        }
+        complete_all(&umem_odp->notifier_completion);

         up_write(&per_mm->umem_rwsem);
 }
@@ -327,7 +251,6 @@ static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx,
         per_mm->mm = mm;
         per_mm->umem_tree = RB_ROOT_CACHED;
         init_rwsem(&per_mm->umem_rwsem);
-        INIT_LIST_HEAD(&per_mm->no_private_counters);

         rcu_read_lock();
         per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
@@ -67,15 +67,9 @@ struct ib_umem_odp {
         struct mutex umem_mutex;
         void *private; /* for the HW driver to use. */

-        /* When false, use the notifier counter in the ucontext struct. */
-        bool mn_counters_active;
         int notifiers_seq;
         int notifiers_count;
-
-        /* A linked list of umems that don't have private mmu notifier
-         * counters yet. */
-        struct list_head no_private_counters;

         /* Tree tracking */
         struct umem_odp_node interval_tree;
@@ -99,11 +93,8 @@ struct ib_ucontext_per_mm {
         struct rb_root_cached umem_tree;
         /* Protects umem_tree */
         struct rw_semaphore umem_rwsem;
-        atomic_t notifier_count;

         struct mmu_notifier mn;
-        /* A list of umems that don't have private mmu notifier counters yet. */
-        struct list_head no_private_counters;
         unsigned int odp_mrs_count;

         struct list_head ucontext_list;
@@ -162,12 +153,6 @@ static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp,
         * and the ucontext umem_mutex semaphore locked for read).
         */

-        /* Do not allow page faults while the new ib_umem hasn't seen a state
-         * with zero notifiers yet, and doesn't have its own valid set of
-         * private counters. */
-        if (!umem_odp->mn_counters_active)
-                return 1;
-
         if (unlikely(umem_odp->notifiers_count))
                 return 1;
         if (umem_odp->notifiers_seq != mmu_seq)
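The last hunk is cut off inside ib_umem_mmu_notifier_retry(). For context, here is a hedged sketch of the usual seq/count retry loop a driver page-fault path is expected to run against the fields that remain after this patch; fault_and_map_pages() and install_mapping() are hypothetical placeholders for driver-specific work, not functions from this commit.

/* Hypothetical fault path: sample the sequence, fault pages, then recheck. */
static int sketch_odp_page_fault(struct ib_umem_odp *umem_odp,
                                 u64 user_virt, u64 bcnt)
{
        unsigned long current_seq;
        int ret;

again:
        current_seq = umem_odp->notifiers_seq;  /* sample before faulting */

        ret = fault_and_map_pages(umem_odp, user_virt, bcnt);  /* placeholder */
        if (ret < 0)
                return ret;

        mutex_lock(&umem_odp->umem_mutex);
        if (ib_umem_mmu_notifier_retry(umem_odp, current_seq)) {
                /* an invalidation ran concurrently; discard and retry */
                mutex_unlock(&umem_odp->umem_mutex);
                goto again;
        }
        install_mapping(umem_odp, user_virt, bcnt);             /* placeholder */
        mutex_unlock(&umem_odp->umem_mutex);
        return 0;
}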