bpf: Introduce bpf_mem_free_rcu() similar to kfree_rcu().
Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu(). Unlike
bpf_mem_[cache_]free(), which links objects into the per-cpu free list for
immediate reuse, the _rcu() flavor waits for an RCU grace period and then
moves objects onto the free_by_rcu_ttrace list, where they wait for an RCU
tasks trace grace period before being freed into slab.

The life cycle of objects:

alloc: dequeue free_llist
free: enqueue free_llist
free_rcu: enqueue free_by_rcu -> waiting_for_gp
free_llist above high watermark -> free_by_rcu_ttrace
after RCU GP waiting_for_gp -> free_by_rcu_ttrace
free_by_rcu_ttrace -> waiting_for_gp_ttrace -> slab

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/bpf/20230706033447.54696-13-alexei.starovoitov@gmail.com
commit 5af6807bdb
parent f76faa65c9
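A minimal usage sketch (not part of the patch) of the kmalloc-style flavor, assuming a hypothetical kernel-side consumer: struct node_data, node_ma, new_node(), delete_node() and read_key() are illustrative names only. Because bpf_mem_free_rcu() does not recycle the object into the per-cpu free list right away, a reader holding rcu_read_lock() may keep dereferencing an element that a concurrent writer has just freed:

/* Hypothetical consumer; only bpf_mem_alloc()/bpf_mem_free_rcu() come from
 * the API introduced/extended by this patch.
 */
#include <linux/bpf_mem_alloc.h>
#include <linux/rcupdate.h>

struct node_data {
        struct node_data __rcu *next;
        u64 key;
};

static struct bpf_mem_alloc node_ma;    /* assumed to be initialized elsewhere */

static struct node_data *new_node(void)
{
        return bpf_mem_alloc(&node_ma, sizeof(struct node_data));
}

static void delete_node(struct node_data *n)
{
        /* Not reused immediately: queued behind a regular RCU GP first,
         * then an RCU tasks trace GP, before going back to slab.
         */
        bpf_mem_free_rcu(&node_ma, n);
}

static u64 read_key(struct node_data __rcu **slot)
{
        struct node_data *n;
        u64 key = 0;

        rcu_read_lock();
        n = rcu_dereference(*slot);
        if (n)
                key = n->key;   /* safe even if delete_node() ran concurrently */
        rcu_read_unlock();
        return key;
}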
@@ -27,10 +27,12 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
 /* kmalloc/kfree equivalent: */
 void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size);
 void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr);
+void bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
 
 /* kmem_cache_alloc/free equivalent: */
 void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma);
 void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr);
+void bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
 void bpf_mem_cache_raw_free(void *ptr);
 void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags);
 
@@ -101,6 +101,15 @@ struct bpf_mem_cache {
         bool draining;
         struct bpf_mem_cache *tgt;
 
+        /* list of objects to be freed after RCU GP */
+        struct llist_head free_by_rcu;
+        struct llist_node *free_by_rcu_tail;
+        struct llist_head waiting_for_gp;
+        struct llist_node *waiting_for_gp_tail;
+        struct rcu_head rcu;
+        atomic_t call_rcu_in_progress;
+        struct llist_head free_llist_extra_rcu;
+
         /* list of objects to be freed after RCU tasks trace GP */
         struct llist_head free_by_rcu_ttrace;
         struct llist_head waiting_for_gp_ttrace;
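The free_by_rcu/free_by_rcu_tail and waiting_for_gp/waiting_for_gp_tail pairs rely on a common llist idiom: __llist_add() reports whether the list was empty, so the first node pushed can be remembered as the tail, which later lets llist_add_batch() splice the whole list onto another head in O(1). A standalone sketch of the idiom, with illustrative names (pending, pending_push, pending_flush), not code from this patch:

#include <linux/llist.h>

struct pending {
        struct llist_head head;
        struct llist_node *tail;        /* node pushed while head was empty */
};

/* Single-producer push; the caller provides mutual exclusion (the patch
 * uses the per-cpu ->active counter with IRQs disabled for this).
 */
static void pending_push(struct pending *p, struct llist_node *n)
{
        if (__llist_add(n, &p->head))
                p->tail = n;    /* list was empty, so 'n' is also the tail */
}

/* Move everything accumulated so far onto 'dst' in one O(1) splice. */
static void pending_flush(struct pending *p, struct llist_head *dst)
{
        struct llist_node *first = __llist_del_all(&p->head);

        if (first)
                llist_add_batch(first, p->tail, dst);
}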
@@ -346,6 +355,69 @@ static void free_bulk(struct bpf_mem_cache *c)
         do_call_rcu_ttrace(tgt);
 }
 
+static void __free_by_rcu(struct rcu_head *head)
+{
+        struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
+        struct bpf_mem_cache *tgt = c->tgt;
+        struct llist_node *llnode;
+
+        llnode = llist_del_all(&c->waiting_for_gp);
+        if (!llnode)
+                goto out;
+
+        llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);
+
+        /* Objects went through regular RCU GP. Send them to RCU tasks trace */
+        do_call_rcu_ttrace(tgt);
+out:
+        atomic_set(&c->call_rcu_in_progress, 0);
+}
+
+static void check_free_by_rcu(struct bpf_mem_cache *c)
+{
+        struct llist_node *llnode, *t;
+        unsigned long flags;
+
+        /* drain free_llist_extra_rcu */
+        if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) {
+                inc_active(c, &flags);
+                llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
+                        if (__llist_add(llnode, &c->free_by_rcu))
+                                c->free_by_rcu_tail = llnode;
+                dec_active(c, flags);
+        }
+
+        if (llist_empty(&c->free_by_rcu))
+                return;
+
+        if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
+                /*
+                 * Instead of kmalloc-ing new rcu_head and triggering 10k
+                 * call_rcu() to hit rcutree.qhimark and force RCU to notice
+                 * the overload just ask RCU to hurry up. There could be many
+                 * objects in free_by_rcu list.
+                 * This hint reduces memory consumption for an artificial
+                 * benchmark from 2 Gbyte to 150 Mbyte.
+                 */
+                rcu_request_urgent_qs_task(current);
+                return;
+        }
+
+        WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
+
+        inc_active(c, &flags);
+        WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
+        c->waiting_for_gp_tail = c->free_by_rcu_tail;
+        dec_active(c, flags);
+
+        if (unlikely(READ_ONCE(c->draining))) {
+                free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
+                atomic_set(&c->call_rcu_in_progress, 0);
+        } else {
+                call_rcu_hurry(&c->rcu, __free_by_rcu);
+        }
+}
+
 static void bpf_mem_refill(struct irq_work *work)
 {
         struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
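Taken together, check_free_by_rcu() and __free_by_rcu() chain two grace periods: objects first sit out a regular RCU GP, and only then are handed to the RCU tasks trace machinery (via do_call_rcu_ttrace()) before reaching slab. A simplified sketch of that chaining for a single hypothetical object; the real code batches whole llists and reuses one rcu_head per bpf_mem_cache rather than embedding one per object:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct obj {
        struct rcu_head rcu;
        /* payload ... */
};

static void obj_free_after_ttrace_gp(struct rcu_head *head)
{
        /* Both grace periods have elapsed; memory may go back to slab. */
        kfree(container_of(head, struct obj, rcu));
}

static void obj_free_after_rcu_gp(struct rcu_head *head)
{
        /* Regular RCU GP is done; the rcu_head may be reused for the
         * tasks trace callback.
         */
        call_rcu_tasks_trace(head, obj_free_after_ttrace_gp);
}

static void obj_free_rcu(struct obj *o)
{
        call_rcu(&o->rcu, obj_free_after_rcu_gp);
}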
@@ -360,6 +432,8 @@ static void bpf_mem_refill(struct irq_work *work)
                 alloc_bulk(c, c->batch, NUMA_NO_NODE);
         else if (cnt > c->high_watermark)
                 free_bulk(c);
+
+        check_free_by_rcu(c);
 }
 
 static void notrace irq_work_raise(struct bpf_mem_cache *c)
@@ -488,6 +562,9 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
         free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
         free_all(__llist_del_all(&c->free_llist), percpu);
         free_all(__llist_del_all(&c->free_llist_extra), percpu);
+        free_all(__llist_del_all(&c->free_by_rcu), percpu);
+        free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
+        free_all(llist_del_all(&c->waiting_for_gp), percpu);
 }
 
 static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
@@ -500,8 +577,8 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
 
 static void free_mem_alloc(struct bpf_mem_alloc *ma)
 {
-        /* waiting_for_gp_ttrace lists was drained, but __free_rcu might
-         * still execute. Wait for it now before we freeing percpu caches.
+        /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
+         * might still execute. Wait for them.
          *
          * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
          * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
@@ -510,7 +587,8 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma)
          * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
          * using rcu_trace_implies_rcu_gp() as well.
          */
-        rcu_barrier_tasks_trace();
+        rcu_barrier(); /* wait for __free_by_rcu */
+        rcu_barrier_tasks_trace(); /* wait for __free_rcu */
         if (!rcu_trace_implies_rcu_gp())
                 rcu_barrier();
         free_mem_alloc_no_barrier(ma);
@@ -563,6 +641,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
                         irq_work_sync(&c->refill_work);
                         drain_mem_cache(c);
                         rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
+                        rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
                 }
                 /* objcg is the same across cpus */
                 if (c->objcg)
@@ -579,6 +658,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
                                 irq_work_sync(&c->refill_work);
                                 drain_mem_cache(c);
                                 rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
+                                rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
                         }
                 }
                 if (c->objcg)
@@ -663,6 +743,27 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
         irq_work_raise(c);
 }
 
+static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
+{
+        struct llist_node *llnode = ptr - LLIST_NODE_SZ;
+        unsigned long flags;
+
+        c->tgt = *(struct bpf_mem_cache **)llnode;
+
+        local_irq_save(flags);
+        if (local_inc_return(&c->active) == 1) {
+                if (__llist_add(llnode, &c->free_by_rcu))
+                        c->free_by_rcu_tail = llnode;
+        } else {
+                llist_add(llnode, &c->free_llist_extra_rcu);
+        }
+        local_dec(&c->active);
+        local_irq_restore(flags);
+
+        if (!atomic_read(&c->call_rcu_in_progress))
+                irq_work_raise(c);
+}
+
 /* Called from BPF program or from sys_bpf syscall.
  * In both cases migration is disabled.
  */
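unit_free_rcu() reuses the allocator's reentrancy guard: bumping the per-cpu ->active counter with IRQs disabled claims the non-atomic free_by_rcu list, while a nested free (e.g. from NMI context) falls back to the lock-free free_llist_extra_rcu, which check_free_by_rcu() drains later. A stripped-down sketch of that guard pattern, with illustrative names (percpu_freelist, freelist_push), not the kernel's types:

#include <asm/local.h>
#include <linux/irqflags.h>
#include <linux/llist.h>

struct percpu_freelist {
        local_t active;                 /* non-zero: fast list is owned */
        struct llist_head fast;         /* manipulated with __llist_add() */
        struct llist_head overflow;     /* lock-free fallback */
};

static void freelist_push(struct percpu_freelist *f, struct llist_node *n)
{
        unsigned long flags;

        local_irq_save(flags);
        if (local_inc_return(&f->active) == 1) {
                /* Sole owner on this CPU: the cheaper non-atomic push is safe. */
                __llist_add(n, &f->fast);
        } else {
                /* Re-entered (e.g. from NMI): use the atomic fallback list. */
                llist_add(n, &f->overflow);
        }
        local_dec(&f->active);
        local_irq_restore(flags);
}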
@@ -696,6 +797,20 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
         unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
 }
 
+void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
+{
+        int idx;
+
+        if (!ptr)
+                return;
+
+        idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+        if (idx < 0)
+                return;
+
+        unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
+}
+
 void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
 {
         void *ret;
@@ -712,6 +827,14 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
         unit_free(this_cpu_ptr(ma->cache), ptr);
 }
 
+void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
+{
+        if (!ptr)
+                return;
+
+        unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
+}
+
 /* Directly does a kfree() without putting 'ptr' back to the free_llist
  * for reuse and without waiting for a rcu_tasks_trace gp.
  * The caller must first go through the rcu_tasks_trace gp for 'ptr'
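For completeness, a hedged end-to-end sketch of the fixed-size (kmem_cache-style) flavor, assuming the pre-existing bpf_mem_alloc_init(ma, size, percpu) and bpf_mem_alloc_destroy(ma) entry points; struct session and the surrounding functions are illustrative only:

#include <linux/bpf_mem_alloc.h>

struct session {
        u64 id;
        /* ... */
};

static struct bpf_mem_alloc session_ma;

static int session_pool_init(void)
{
        /* One fixed object size, non-percpu objects. */
        return bpf_mem_alloc_init(&session_ma, sizeof(struct session), false);
}

static struct session *session_new(void)
{
        return bpf_mem_cache_alloc(&session_ma);
}

static void session_delete(struct session *s)
{
        /* Deferred free: no reuse or kfree until the RCU grace periods pass. */
        bpf_mem_cache_free_rcu(&session_ma, s);
}

static void session_pool_exit(void)
{
        bpf_mem_alloc_destroy(&session_ma);
}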