Merge branch 'Support stashing local kptrs with bpf_kptr_xchg'
Dave Marchevsky says: ==================== Local kptrs are kptrs allocated via bpf_obj_new with a type specified in program BTF. A BPF program which creates a local kptr has exclusive control of the lifetime of the kptr, and, prior to terminating, must: * free the kptr via bpf_obj_drop * If the kptr is a {list,rbtree} node, add the node to a {list, rbtree}, thereby passing control of the lifetime to the collection This series adds a third option: * stash the kptr in a map value using bpf_kptr_xchg As indicated by the use of "stash" to describe this behavior, the intended use of this feature is temporary storage of local kptrs. For example, a sched_ext ([0]) scheduler may want to create an rbtree node for each new cgroup on cgroup init, but to add that node to the rbtree as part of a separate program which runs on enqueue. Stashing the node in a map_value allows its lifetime to outlive the execution of the cgroup_init program. Behavior: There is no semantic difference between adding a kptr to a graph collection and "stashing" it in a map. In both cases exclusive ownership of the kptr's lifetime is passed to some containing data structure, which is responsible for bpf_obj_drop'ing it when the container goes away. Since graph collections also expect exclusive ownership of the nodes they contain, graph nodes cannot be both stashed in a map_value and contained by their corresponding collection. Implementation: Two observations simplify the verifier changes for this feature. First, kptrs ("referenced kptrs" until a recent renaming) require registration of a dtor function as part of their acquire/release semantics, so that a referenced kptr which is placed in a map_value is properly released when the map goes away. We want this exact behavior for local kptrs, but with bpf_obj_drop as the dtor instead of a per-btf_id dtor. The second observation is that, in terms of identification, "referenced kptr" and "local kptr" already don't interfere with one another. Consider the following example: struct node_data { long key; long data; struct bpf_rb_node node; }; struct map_value { struct node_data __kptr *node; }; struct { __uint(type, BPF_MAP_TYPE_ARRAY); __type(key, int); __type(value, struct map_value); __uint(max_entries, 1); } some_nodes SEC(".maps"); struct map_value *mapval; struct node_data *res; int key = 0; res = bpf_obj_new(typeof(*res)); if (!res) { /* err handling */ } mapval = bpf_map_lookup_elem(&some_nodes, &key); if (!mapval) { /* err handling */ } res = bpf_kptr_xchg(&mapval->node, res); if (res) bpf_obj_drop(res); The __kptr tag identifies map_value's node as a referenced kptr, while the PTR_TO_BTF_ID which bpf_obj_new returns - a type in some non-vmlinux, non-module BTF - identifies res as a local kptr. Type tag on the pointer indicates referenced kptr, while the type of the pointee indicates local kptr. So using existing facilities we can tell the verifier about a "referenced kptr" pointer to a "local kptr" pointee. When kptr_xchg'ing a kptr into a map_value, the verifier can recognize local kptr types and treat them like referenced kptrs with a properly-typed bpf_obj_drop as a dtor. Other implementation notes: * We don't need to do anything special to enforce "graph nodes cannot be both stashed in a map_value and contained by their corresponding collection" * bpf_kptr_xchg both returns and takes as input a (possibly-null) owning reference. It does not accept non-owning references as input by virtue of requiring a ref_obj_id. By definition, if a program has an owning ref to a node, the node isn't in a collection, so it's safe to pass ownership via bpf_kptr_xchg. Summary of patches: * Patch 1 modifies BTF plumbing to support using bpf_obj_drop as a dtor * Patch 2 adds verifier plumbing to support MEM_ALLOC-flagged param for bpf_kptr_xchg * Patch 3 adds selftests exercising the new behavior Changelog: v1 -> v2: https://lore.kernel.org/bpf/20230309180111.1618459-1-davemarchevsky@fb.com/ Patch #s used below refer to the patch's position in v1 unless otherwise specified. Patches 1-3 were applied and are not included in v2. Rebase onto latest bpf-next: "libbpf: Revert poisoning of strlcpy" Patch 4: "bpf: Support __kptr to local kptrs" * Remove !btf_is_kernel(btf) check, WARN_ON_ONCE instead (Alexei) Patch 6: "selftests/bpf: Add local kptr stashing test" * Add test which stashes 2 nodes and later unstashes one of them using a separate BPF program (Alexei) * Fix incorrect runner subtest name for original test (was "rbtree_add_nodes") ==================== Signed-off-by: Alexei Starovoitov <ast@kernel.org>
This commit is contained in:
commit
49b5300f1f
|
@ -189,10 +189,19 @@ enum btf_field_type {
|
|||
BPF_RB_NODE | BPF_RB_ROOT,
|
||||
};
|
||||
|
||||
typedef void (*btf_dtor_kfunc_t)(void *);
|
||||
typedef void (*btf_dtor_obj_drop)(void *, const struct btf_record *);
|
||||
|
||||
struct btf_field_kptr {
|
||||
struct btf *btf;
|
||||
struct module *module;
|
||||
btf_dtor_kfunc_t dtor;
|
||||
union {
|
||||
/* dtor used if btf_is_kernel(btf), otherwise the type
|
||||
* is program-allocated and obj_drop is used
|
||||
*/
|
||||
btf_dtor_kfunc_t dtor;
|
||||
btf_dtor_obj_drop obj_drop;
|
||||
};
|
||||
u32 btf_id;
|
||||
};
|
||||
|
||||
|
|
|
@ -121,8 +121,6 @@ struct btf_struct_metas {
|
|||
struct btf_struct_meta types[];
|
||||
};
|
||||
|
||||
typedef void (*btf_dtor_kfunc_t)(void *);
|
||||
|
||||
extern const struct file_operations btf_fops;
|
||||
|
||||
void btf_get(struct btf *btf);
|
||||
|
|
|
@ -3551,12 +3551,17 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
|
|||
return -EINVAL;
|
||||
}
|
||||
|
||||
extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
|
||||
|
||||
static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
|
||||
struct btf_field_info *info)
|
||||
{
|
||||
struct module *mod = NULL;
|
||||
const struct btf_type *t;
|
||||
struct btf *kernel_btf;
|
||||
/* If a matching btf type is found in kernel or module BTFs, kptr_ref
|
||||
* is that BTF, otherwise it's program BTF
|
||||
*/
|
||||
struct btf *kptr_btf;
|
||||
int ret;
|
||||
s32 id;
|
||||
|
||||
|
@ -3565,7 +3570,20 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
|
|||
*/
|
||||
t = btf_type_by_id(btf, info->kptr.type_id);
|
||||
id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info),
|
||||
&kernel_btf);
|
||||
&kptr_btf);
|
||||
if (id == -ENOENT) {
|
||||
/* btf_parse_kptr should only be called w/ btf = program BTF */
|
||||
WARN_ON_ONCE(btf_is_kernel(btf));
|
||||
|
||||
/* Type exists only in program BTF. Assume that it's a MEM_ALLOC
|
||||
* kptr allocated via bpf_obj_new
|
||||
*/
|
||||
field->kptr.dtor = (void *)&__bpf_obj_drop_impl;
|
||||
id = info->kptr.type_id;
|
||||
kptr_btf = (struct btf *)btf;
|
||||
btf_get(kptr_btf);
|
||||
goto found_dtor;
|
||||
}
|
||||
if (id < 0)
|
||||
return id;
|
||||
|
||||
|
@ -3582,20 +3600,20 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
|
|||
* can be used as a referenced pointer and be stored in a map at
|
||||
* the same time.
|
||||
*/
|
||||
dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id);
|
||||
dtor_btf_id = btf_find_dtor_kfunc(kptr_btf, id);
|
||||
if (dtor_btf_id < 0) {
|
||||
ret = dtor_btf_id;
|
||||
goto end_btf;
|
||||
}
|
||||
|
||||
dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id);
|
||||
dtor_func = btf_type_by_id(kptr_btf, dtor_btf_id);
|
||||
if (!dtor_func) {
|
||||
ret = -ENOENT;
|
||||
goto end_btf;
|
||||
}
|
||||
|
||||
if (btf_is_module(kernel_btf)) {
|
||||
mod = btf_try_get_module(kernel_btf);
|
||||
if (btf_is_module(kptr_btf)) {
|
||||
mod = btf_try_get_module(kptr_btf);
|
||||
if (!mod) {
|
||||
ret = -ENXIO;
|
||||
goto end_btf;
|
||||
|
@ -3605,7 +3623,7 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
|
|||
/* We already verified dtor_func to be btf_type_is_func
|
||||
* in register_btf_id_dtor_kfuncs.
|
||||
*/
|
||||
dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off);
|
||||
dtor_func_name = __btf_name_by_offset(kptr_btf, dtor_func->name_off);
|
||||
addr = kallsyms_lookup_name(dtor_func_name);
|
||||
if (!addr) {
|
||||
ret = -EINVAL;
|
||||
|
@ -3614,14 +3632,15 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
|
|||
field->kptr.dtor = (void *)addr;
|
||||
}
|
||||
|
||||
found_dtor:
|
||||
field->kptr.btf_id = id;
|
||||
field->kptr.btf = kernel_btf;
|
||||
field->kptr.btf = kptr_btf;
|
||||
field->kptr.module = mod;
|
||||
return 0;
|
||||
end_mod:
|
||||
module_put(mod);
|
||||
end_btf:
|
||||
btf_put(kernel_btf);
|
||||
btf_put(kptr_btf);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
|
|
@ -1896,14 +1896,19 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
|
|||
return p;
|
||||
}
|
||||
|
||||
void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
|
||||
{
|
||||
if (rec)
|
||||
bpf_obj_free_fields(rec, p);
|
||||
bpf_mem_free(&bpf_global_ma, p);
|
||||
}
|
||||
|
||||
__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
|
||||
{
|
||||
struct btf_struct_meta *meta = meta__ign;
|
||||
void *p = p__alloc;
|
||||
|
||||
if (meta)
|
||||
bpf_obj_free_fields(meta->record, p);
|
||||
bpf_mem_free(&bpf_global_ma, p);
|
||||
__bpf_obj_drop_impl(p, meta ? meta->record : NULL);
|
||||
}
|
||||
|
||||
static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, bool tail)
|
||||
|
|
|
@ -659,8 +659,10 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
|
|||
return;
|
||||
fields = rec->fields;
|
||||
for (i = 0; i < rec->cnt; i++) {
|
||||
struct btf_struct_meta *pointee_struct_meta;
|
||||
const struct btf_field *field = &fields[i];
|
||||
void *field_ptr = obj + field->offset;
|
||||
void *xchgd_field;
|
||||
|
||||
switch (fields[i].type) {
|
||||
case BPF_SPIN_LOCK:
|
||||
|
@ -672,7 +674,17 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
|
|||
WRITE_ONCE(*(u64 *)field_ptr, 0);
|
||||
break;
|
||||
case BPF_KPTR_REF:
|
||||
field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0));
|
||||
xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
|
||||
if (!btf_is_kernel(field->kptr.btf)) {
|
||||
pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
|
||||
field->kptr.btf_id);
|
||||
WARN_ON_ONCE(!pointee_struct_meta);
|
||||
field->kptr.obj_drop(xchgd_field, pointee_struct_meta ?
|
||||
pointee_struct_meta->record :
|
||||
NULL);
|
||||
} else {
|
||||
field->kptr.dtor(xchgd_field);
|
||||
}
|
||||
break;
|
||||
case BPF_LIST_HEAD:
|
||||
if (WARN_ON_ONCE(rec->spin_lock_off < 0))
|
||||
|
|
|
@ -7123,6 +7123,9 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
|
|||
if (arg_type & PTR_MAYBE_NULL)
|
||||
type &= ~PTR_MAYBE_NULL;
|
||||
|
||||
if (meta->func_id == BPF_FUNC_kptr_xchg && type & MEM_ALLOC)
|
||||
type &= ~MEM_ALLOC;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
|
||||
expected = compatible->types[i];
|
||||
if (expected == NOT_INIT)
|
||||
|
@ -7185,7 +7188,8 @@ found:
|
|||
break;
|
||||
}
|
||||
case PTR_TO_BTF_ID | MEM_ALLOC:
|
||||
if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock) {
|
||||
if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
|
||||
meta->func_id != BPF_FUNC_kptr_xchg) {
|
||||
verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
|
||||
return -EFAULT;
|
||||
}
|
||||
|
@ -9151,6 +9155,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
|
|||
if (func_id == BPF_FUNC_kptr_xchg) {
|
||||
ret_btf = meta.kptr_field->kptr.btf;
|
||||
ret_btf_id = meta.kptr_field->kptr.btf_id;
|
||||
if (!btf_is_kernel(ret_btf))
|
||||
regs[BPF_REG_0].type |= MEM_ALLOC;
|
||||
} else {
|
||||
if (fn->ret_btf_id == BPF_PTR_POISON) {
|
||||
verbose(env, "verifier internal error:");
|
||||
|
|
|
@ -0,0 +1,60 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
|
||||
|
||||
#include <test_progs.h>
|
||||
#include <network_helpers.h>
|
||||
|
||||
#include "local_kptr_stash.skel.h"
|
||||
static void test_local_kptr_stash_simple(void)
|
||||
{
|
||||
LIBBPF_OPTS(bpf_test_run_opts, opts,
|
||||
.data_in = &pkt_v4,
|
||||
.data_size_in = sizeof(pkt_v4),
|
||||
.repeat = 1,
|
||||
);
|
||||
struct local_kptr_stash *skel;
|
||||
int ret;
|
||||
|
||||
skel = local_kptr_stash__open_and_load();
|
||||
if (!ASSERT_OK_PTR(skel, "local_kptr_stash__open_and_load"))
|
||||
return;
|
||||
|
||||
ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.stash_rb_nodes), &opts);
|
||||
ASSERT_OK(ret, "local_kptr_stash_add_nodes run");
|
||||
ASSERT_OK(opts.retval, "local_kptr_stash_add_nodes retval");
|
||||
|
||||
local_kptr_stash__destroy(skel);
|
||||
}
|
||||
|
||||
static void test_local_kptr_stash_unstash(void)
|
||||
{
|
||||
LIBBPF_OPTS(bpf_test_run_opts, opts,
|
||||
.data_in = &pkt_v4,
|
||||
.data_size_in = sizeof(pkt_v4),
|
||||
.repeat = 1,
|
||||
);
|
||||
struct local_kptr_stash *skel;
|
||||
int ret;
|
||||
|
||||
skel = local_kptr_stash__open_and_load();
|
||||
if (!ASSERT_OK_PTR(skel, "local_kptr_stash__open_and_load"))
|
||||
return;
|
||||
|
||||
ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.stash_rb_nodes), &opts);
|
||||
ASSERT_OK(ret, "local_kptr_stash_add_nodes run");
|
||||
ASSERT_OK(opts.retval, "local_kptr_stash_add_nodes retval");
|
||||
|
||||
ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.unstash_rb_node), &opts);
|
||||
ASSERT_OK(ret, "local_kptr_stash_add_nodes run");
|
||||
ASSERT_EQ(opts.retval, 42, "local_kptr_stash_add_nodes retval");
|
||||
|
||||
local_kptr_stash__destroy(skel);
|
||||
}
|
||||
|
||||
void test_local_kptr_stash_success(void)
|
||||
{
|
||||
if (test__start_subtest("local_kptr_stash_simple"))
|
||||
test_local_kptr_stash_simple();
|
||||
if (test__start_subtest("local_kptr_stash_unstash"))
|
||||
test_local_kptr_stash_unstash();
|
||||
}
|
|
@ -0,0 +1,108 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
|
||||
|
||||
#include <vmlinux.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_core_read.h>
|
||||
#include "bpf_experimental.h"
|
||||
|
||||
struct node_data {
|
||||
long key;
|
||||
long data;
|
||||
struct bpf_rb_node node;
|
||||
};
|
||||
|
||||
struct map_value {
|
||||
struct prog_test_ref_kfunc *not_kptr;
|
||||
struct prog_test_ref_kfunc __kptr *val;
|
||||
struct node_data __kptr *node;
|
||||
};
|
||||
|
||||
/* This is necessary so that LLVM generates BTF for node_data struct
|
||||
* If it's not included, a fwd reference for node_data will be generated but
|
||||
* no struct. Example BTF of "node" field in map_value when not included:
|
||||
*
|
||||
* [10] PTR '(anon)' type_id=35
|
||||
* [34] FWD 'node_data' fwd_kind=struct
|
||||
* [35] TYPE_TAG 'kptr_ref' type_id=34
|
||||
*
|
||||
* (with no node_data struct defined)
|
||||
* Had to do the same w/ bpf_kfunc_call_test_release below
|
||||
*/
|
||||
struct node_data *just_here_because_btf_bug;
|
||||
|
||||
extern void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) __ksym;
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_ARRAY);
|
||||
__type(key, int);
|
||||
__type(value, struct map_value);
|
||||
__uint(max_entries, 2);
|
||||
} some_nodes SEC(".maps");
|
||||
|
||||
static int create_and_stash(int idx, int val)
|
||||
{
|
||||
struct map_value *mapval;
|
||||
struct node_data *res;
|
||||
|
||||
mapval = bpf_map_lookup_elem(&some_nodes, &idx);
|
||||
if (!mapval)
|
||||
return 1;
|
||||
|
||||
res = bpf_obj_new(typeof(*res));
|
||||
if (!res)
|
||||
return 1;
|
||||
res->key = val;
|
||||
|
||||
res = bpf_kptr_xchg(&mapval->node, res);
|
||||
if (res)
|
||||
bpf_obj_drop(res);
|
||||
return 0;
|
||||
}
|
||||
|
||||
SEC("tc")
|
||||
long stash_rb_nodes(void *ctx)
|
||||
{
|
||||
return create_and_stash(0, 41) ?: create_and_stash(1, 42);
|
||||
}
|
||||
|
||||
SEC("tc")
|
||||
long unstash_rb_node(void *ctx)
|
||||
{
|
||||
struct map_value *mapval;
|
||||
struct node_data *res;
|
||||
long retval;
|
||||
int key = 1;
|
||||
|
||||
mapval = bpf_map_lookup_elem(&some_nodes, &key);
|
||||
if (!mapval)
|
||||
return 1;
|
||||
|
||||
res = bpf_kptr_xchg(&mapval->node, NULL);
|
||||
if (res) {
|
||||
retval = res->key;
|
||||
bpf_obj_drop(res);
|
||||
return retval;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
SEC("tc")
|
||||
long stash_test_ref_kfunc(void *ctx)
|
||||
{
|
||||
struct prog_test_ref_kfunc *res;
|
||||
struct map_value *mapval;
|
||||
int key = 0;
|
||||
|
||||
mapval = bpf_map_lookup_elem(&some_nodes, &key);
|
||||
if (!mapval)
|
||||
return 1;
|
||||
|
||||
res = bpf_kptr_xchg(&mapval->val, NULL);
|
||||
if (res)
|
||||
bpf_kfunc_call_test_release(res);
|
||||
return 0;
|
||||
}
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
Loading…
Reference in New Issue