OpenCloudOS-Kernel/kernel/bpf/mprog.c

448 lines
12 KiB
C
Raw Normal View History

bpf: Add generic attach/detach/query API for multi-progs This adds a generic layer called bpf_mprog which can be reused by different attachment layers to enable multi-program attachment and dependency resolution. In-kernel users of the bpf_mprog don't need to care about the dependency resolution internals, they can just consume it with few API calls. The initial idea of having a generic API sparked out of discussion [0] from an earlier revision of this work where tc's priority was reused and exposed via BPF uapi as a way to coordinate dependencies among tc BPF programs, similar as-is for classic tc BPF. The feedback was that priority provides a bad user experience and is hard to use [1], e.g.: I cannot help but feel that priority logic copy-paste from old tc, netfilter and friends is done because "that's how things were done in the past". [...] Priority gets exposed everywhere in uapi all the way to bpftool when it's right there for users to understand. And that's the main problem with it. The user don't want to and don't need to be aware of it, but uapi forces them to pick the priority. [...] Your cover letter [0] example proves that in real life different service pick the same priority. They simply don't know any better. Priority is an unnecessary magic that apps _have_ to pick, so they just copy-paste and everyone ends up using the same. The course of the discussion showed more and more the need for a generic, reusable API where the "same look and feel" can be applied for various other program types beyond just tc BPF, for example XDP today does not have multi- program support in kernel, but also there was interest around this API for improving management of cgroup program types. Such common multi-program management concept is useful for BPF management daemons or user space BPF applications coordinating internally about their attachments. 
Both from Cilium and Meta side [2], we've collected the following requirements for a generic attach/detach/query API for multi-progs which has been implemented as part of this work:

  - Support prog-based attach/detach and link API
  - Dependency directives (can also be combined):
    - BPF_F_{BEFORE,AFTER} with relative_{fd,id} which can be {prog,link,none}
      - BPF_F_ID flag as {fd,id} toggle; the rationale for id is so that user space application does not need CAP_SYS_ADMIN to retrieve foreign fds via bpf_*_get_fd_by_id()
      - BPF_F_LINK flag as {prog,link} toggle
      - If relative_{fd,id} is none, then BPF_F_BEFORE will just prepend, and BPF_F_AFTER will just append for attaching
      - Enforced only at attach time
    - BPF_F_REPLACE with replace_bpf_fd which can be prog, links have their own infra for replacing their internal prog
    - If no flags are set, then it's default append behavior for attaching
  - Internal revision counter and optionally being able to pass expected_revision
    - User space application can query current state with revision, and pass it along for attachment to assert current state before doing updates
  - Query also gets extension for link_ids array and link_attach_flags:
    - prog_ids are always filled with program IDs
    - link_ids are filled with link IDs when link was used, otherwise 0
    - {prog,link}_attach_flags for holding {prog,link}-specific flags
  - Must be easy to integrate/reuse for in-kernel users

The uapi-side changes needed for supporting bpf_mprog are rather minimal, consisting of the additions of the attachment flags, revision counter, and expanding existing union with relative_{fd,id} member. The bpf_mprog framework consists of a bpf_mprog_entry object which holds an array of bpf_mprog_fp (fast-path structure). The bpf_mprog_cp (control-path structure) is part of bpf_mprog_bundle. Both have been separated, so that fast-path gets efficient packing of bpf_prog pointers for maximum cache efficiency.
Also, array has been chosen instead of linked list or other structures to remove unnecessary indirections for a fast point-to-entry in tc for BPF. The bpf_mprog_entry comes as a pair via bpf_mprog_bundle so that in case of updates the peer bpf_mprog_entry is populated and then just swapped which avoids additional allocations that could otherwise fail, for example, in detach case. bpf_mprog_{fp,cp} arrays are currently static, but they could be converted to dynamic allocation if necessary at a point in future. Locking is deferred to the in-kernel user of bpf_mprog, for example, in case of tcx which uses this API in the next patch, it piggybacks on rtnl. An extensive test suite for checking all aspects of this API for prog-based attach/detach and link API comes as BPF selftests in this series. Thanks also to Andrii Nakryiko for early API discussions wrt Meta's BPF prog management. [0] https://lore.kernel.org/bpf/20221004231143.19190-1-daniel@iogearbox.net [1] https://lore.kernel.org/bpf/CAADnVQ+gEY3FjCR=+DmjDR4gp5bOYZUFJQXj4agKFHT9CQPZBw@mail.gmail.com [2] http://vger.kernel.org/bpfconf2023_material/tcx_meta_netdev_borkmann.pdf Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Link: https://lore.kernel.org/r/20230719140858.13224-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2023-07-19 22:08:51 +08:00
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2023 Isovalent */
#include <linux/bpf.h>
#include <linux/bpf_mprog.h>
/* Look up a BPF link and record it, together with the program it carries,
 * in @tuple.
 *
 * @id_or_fd is passed to bpf_link_by_id() when BPF_F_ID is set in @flags,
 * otherwise to bpf_link_get_from_fd() provided it is non-zero.
 *
 * Returns 0 on success. Returns -EINVAL when neither BPF_F_ID nor a
 * non-zero @id_or_fd was given, or when @type is non-zero and differs
 * from the type of the link's program (the link is released again via
 * bpf_link_put() in that case). Lookup failures are passed through.
 */
static int bpf_mprog_link(struct bpf_tuple *tuple,
			  u32 id_or_fd, u32 flags,
			  enum bpf_prog_type type)
{
	struct bpf_link *lnk;

	if (flags & BPF_F_ID)
		lnk = bpf_link_by_id(id_or_fd);
	else if (id_or_fd)
		lnk = bpf_link_get_from_fd(id_or_fd);
	else
		return -EINVAL;

	if (IS_ERR(lnk))
		return PTR_ERR(lnk);

	if (type && lnk->prog->type != type) {
		bpf_link_put(lnk);
		return -EINVAL;
	}

	tuple->prog = lnk->prog;
	tuple->link = lnk;
	return 0;
}
/* Look up a BPF program and record it in @tuple; the tuple's link is set
 * to NULL.
 *
 * @id_or_fd is passed to bpf_prog_by_id() when BPF_F_ID is set in @flags,
 * otherwise to bpf_prog_get() provided it is non-zero.
 *
 * Returns 0 on success. Returns -EINVAL when neither BPF_F_ID nor a
 * non-zero @id_or_fd was given, or when @type is non-zero and differs
 * from the program's type (the program is released again via
 * bpf_prog_put() in that case). Lookup failures are passed through.
 */
static int bpf_mprog_prog(struct bpf_tuple *tuple,
			  u32 id_or_fd, u32 flags,
			  enum bpf_prog_type type)
{
	struct bpf_prog *p;

	if (flags & BPF_F_ID)
		p = bpf_prog_by_id(id_or_fd);
	else if (id_or_fd)
		p = bpf_prog_get(id_or_fd);
	else
		return -EINVAL;

	if (IS_ERR(p))
		return PTR_ERR(p);

	if (type && p->type != type) {
		bpf_prog_put(p);
		return -EINVAL;
	}

	tuple->prog = p;
	tuple->link = NULL;
	return 0;
}
/* Resolve the "relative" program or link a request refers to.
 *
 * @tuple is zeroed first. With BPF_F_LINK set the lookup is delegated to
 * bpf_mprog_link(), otherwise to bpf_mprog_prog(). Returns 0 or the error
 * of the delegated lookup.
 */
static int bpf_mprog_tuple_relative(struct bpf_tuple *tuple,
				    u32 id_or_fd, u32 flags,
				    enum bpf_prog_type type)
{
	memset(tuple, 0, sizeof(*tuple));

	if (flags & BPF_F_LINK)
		return bpf_mprog_link(tuple, id_or_fd, flags, type);

	/* Only look up a program if BPF_F_ID or an id_or_fd was passed.
	 * Otherwise the tuple stays zeroed, which is the case when before/
	 * after selects first/last position without passing fd.
	 */
	if ((flags & BPF_F_ID) || id_or_fd)
		return bpf_mprog_prog(tuple, id_or_fd, flags, type);

	return 0;
}
/* Release whatever @tuple holds: the link if one is set, otherwise the
 * program if one is set. An empty tuple is a no-op.
 */
static void bpf_mprog_tuple_put(struct bpf_tuple *tuple)
{
	if (tuple->link) {
		bpf_link_put(tuple->link);
		return;
	}

	if (tuple->prog)
		bpf_prog_put(tuple->prog);
}
/* The bpf_mprog_{replace,delete}() operate on exact idx position with the
 * one exception that for deletion we support delete from front/back. In
 * case of front idx is -1, in case of back idx is bpf_mprog_total(entry).
 * Adjustment to first and last entry is trivial. For bpf_mprog_insert()
 * we have to deal with the following cases:
 *
 * idx + before:
 *
 * Insert P4 before P3: idx for old array is 1, idx for new array is 2,
 * hence we adjust target idx for the new array, so that memmove copies
 * P1 and P2 to the new entry, and we insert P4 into idx 2. Inserting
 * before P1 would have old idx -1 and new idx 0.
 *
 * +--+--+--+     +--+--+--+--+     +--+--+--+--+
 * |P1|P2|P3| ==> |P1|P2|  |P3| ==> |P1|P2|P4|P3|
 * +--+--+--+     +--+--+--+--+     +--+--+--+--+
 *
 * idx + after:
 *
 * Insert P4 after P2: idx for old array is 2, idx for new array is 2.
 * Again, memmove copies P1 and P2 to the new entry, and we insert P4
 * into idx 2. Inserting after P3 would have both old/new idx at 3 aka
 * bpf_mprog_total(entry).
 *
 * +--+--+--+     +--+--+--+--+     +--+--+--+--+
 * |P1|P2|P3| ==> |P1|P2|  |P3| ==> |P1|P2|P4|P3|
 * +--+--+--+     +--+--+--+--+     +--+--+--+--+
 */
/* Replace the element at @idx of @entry in place with @ntuple.
 *
 * The old program pointer is sampled before the slot is overwritten. If
 * the new tuple has no link, the old program is released with
 * bpf_prog_put(), and a link still present in the control-path slot
 * triggers a one-time warning. The same @entry is handed back through
 * @entry_new. Always returns 0.
 */
static int bpf_mprog_replace(struct bpf_mprog_entry *entry,
			     struct bpf_mprog_entry **entry_new,
			     struct bpf_tuple *ntuple, int idx)
{
	struct bpf_mprog_fp *slot_fp;
	struct bpf_mprog_cp *slot_cp;
	struct bpf_prog *prev;

	bpf_mprog_read(entry, idx, &slot_fp, &slot_cp);

	/* Must be read before bpf_mprog_write() overwrites the slot. */
	prev = READ_ONCE(slot_fp->prog);
	bpf_mprog_write(slot_fp, slot_cp, ntuple);

	if (!ntuple->link) {
		WARN_ON_ONCE(slot_cp->link);
		bpf_prog_put(prev);
	}

	*entry_new = entry;
	return 0;
}
/* Insert @ntuple at position @idx, building the result in the peer of
 * @entry.
 *
 * The peer is populated as a copy of @entry. Unless @idx denotes the tail
 * (idx == bpf_mprog_total(entry)), @idx is bumped by one for BPF_F_BEFORE
 * and a slot is opened at the resulting position with
 * bpf_mprog_entry_grow(). The tuple is then written there, the peer's
 * count is incremented and the peer is returned through @entry_new.
 * Always returns 0.
 */
static int bpf_mprog_insert(struct bpf_mprog_entry *entry,
			    struct bpf_mprog_entry **entry_new,
			    struct bpf_tuple *ntuple, int idx, u32 flags)
{
	int total = bpf_mprog_total(entry);
	struct bpf_mprog_entry *peer = bpf_mprog_peer(entry);
	struct bpf_mprog_fp *slot_fp;
	struct bpf_mprog_cp *slot_cp;

	bpf_mprog_entry_copy(peer, entry);

	/* Appending at the tail needs no shifting of existing elements. */
	if (idx != total) {
		if (flags & BPF_F_BEFORE)
			idx++;
		bpf_mprog_entry_grow(peer, idx);
	}

	bpf_mprog_read(peer, idx, &slot_fp, &slot_cp);
	bpf_mprog_write(slot_fp, slot_cp, ntuple);
	bpf_mprog_inc(peer);

	*entry_new = peer;
	return 0;
}
/* Remove the element at @idx, building the result in the peer of @entry.
 *
 * @idx == -1 selects the first element and @idx == bpf_mprog_total(entry)
 * selects the last one. The peer is populated as a copy of @entry, the
 * slot is closed with bpf_mprog_entry_shrink(), the peer's count is
 * decremented and @dtuple is marked for release on the peer. The peer is
 * returned through @entry_new. Always returns 0.
 */
static int bpf_mprog_delete(struct bpf_mprog_entry *entry,
			    struct bpf_mprog_entry **entry_new,
			    struct bpf_tuple *dtuple, int idx)
{
	int total = bpf_mprog_total(entry);
	struct bpf_mprog_entry *peer = bpf_mprog_peer(entry);

	bpf_mprog_entry_copy(peer, entry);

	/* Map the front/back markers onto real array positions. */
	idx = (idx == -1) ? 0 : (idx == total) ? total - 1 : idx;

	bpf_mprog_entry_shrink(peer, idx);
	bpf_mprog_dec(peer);
	bpf_mprog_mark_for_release(peer, dtuple);

	*entry_new = peer;
	return 0;
}
/* In bpf_mprog_pos_*() we evaluate the target position for the BPF
* program/link that needs to be replaced, inserted or deleted for
* each "rule" independently. If all rules agree on that position
* or existing element, then enact replacement, addition or deletion.
* If this is not the case, then the request cannot be satisfied and
* we bail out with an error.
*/
/* Find the index of the element whose program equals @tuple->prog.
 *
 * Returns the index if the element's link also equals @tuple->link,
 * -EBUSY if the program matches but the link does not, and -ENOENT if
 * no element carries that program.
 */
static int bpf_mprog_pos_exact(struct bpf_mprog_entry *entry,
			       struct bpf_tuple *tuple)
{
	struct bpf_mprog_fp *fp;
	struct bpf_mprog_cp *cp;
	int pos;

	for (pos = 0; pos < bpf_mprog_total(entry); pos++) {
		bpf_mprog_read(entry, pos, &fp, &cp);
		if (READ_ONCE(fp->prog) != tuple->prog)
			continue;
		/* First program match decides the outcome. */
		if (tuple->link == cp->link)
			return pos;
		return -EBUSY;
	}

	return -ENOENT;
}
/* Compute the position "before" the element described by @tuple.
 *
 * An element matches when its program equals @tuple->prog and, if
 * @tuple->link is set, its link equals @tuple->link. For the first match
 * at index i the result is i - 1. Without a match the result is -ENOENT
 * if @tuple->prog is set, otherwise -1 (front of the array).
 */
static int bpf_mprog_pos_before(struct bpf_mprog_entry *entry,
				struct bpf_tuple *tuple)
{
	struct bpf_mprog_fp *fp;
	struct bpf_mprog_cp *cp;
	int pos;

	for (pos = 0; pos < bpf_mprog_total(entry); pos++) {
		bpf_mprog_read(entry, pos, &fp, &cp);
		if (READ_ONCE(fp->prog) != tuple->prog)
			continue;
		if (tuple->link && tuple->link != cp->link)
			continue;
		return pos - 1;
	}

	if (tuple->prog)
		return -ENOENT;
	return -1;
}
/* Compute the position "after" the element described by @tuple.
 *
 * An element matches when its program equals @tuple->prog and, if
 * @tuple->link is set, its link equals @tuple->link. For the first match
 * at index i the result is i + 1. Without a match the result is -ENOENT
 * if @tuple->prog is set, otherwise bpf_mprog_total(entry) (back of the
 * array).
 */
static int bpf_mprog_pos_after(struct bpf_mprog_entry *entry,
			       struct bpf_tuple *tuple)
{
	struct bpf_mprog_fp *fp;
	struct bpf_mprog_cp *cp;
	int pos;

	for (pos = 0; pos < bpf_mprog_total(entry); pos++) {
		bpf_mprog_read(entry, pos, &fp, &cp);
		if (READ_ONCE(fp->prog) != tuple->prog)
			continue;
		if (tuple->link && tuple->link != cp->link)
			continue;
		return pos + 1;
	}

	if (tuple->prog)
		return -ENOENT;
	return bpf_mprog_total(entry);
}
/* Attach @prog_new to @entry, either by replacing @prog_old in place
 * (BPF_F_REPLACE) or by inserting it at a position derived from @flags.
 *
 * @entry:     current multi-prog entry
 * @entry_new: set on success to the entry holding the result; this is
 *             @entry itself for a replace and the peer for an insert
 * @prog_new:  program to attach
 * @link:      link owning @prog_new, or NULL
 * @prog_old:  program to be replaced, only looked at with BPF_F_REPLACE
 * @flags:     BPF_F_{REPLACE,BEFORE,AFTER,ID,LINK}
 * @id_or_fd:  id or fd of the relative program/link, 0 for none
 * @revision:  expected revision of @entry, 0 to skip the check
 *
 * Each of REPLACE/BEFORE/AFTER yields a candidate index; all requested
 * rules must agree on it. With no flags and no relative program the
 * program is appended.
 *
 * Returns 0 on success, -ESTALE on revision mismatch, -EEXIST if
 * @prog_new is already part of @entry, -ERANGE if the rules disagree or
 * there is no room for another program, -EINVAL if flags were given that
 * yield no position, or the error of a failed relative/position lookup.
 */
int bpf_mprog_attach(struct bpf_mprog_entry *entry,
		     struct bpf_mprog_entry **entry_new,
		     struct bpf_prog *prog_new, struct bpf_link *link,
		     struct bpf_prog *prog_old,
		     u32 flags, u32 id_or_fd, u64 revision)
{
	struct bpf_tuple rtuple, ntuple = {
		.prog = prog_new,
		.link = link,
	}, otuple = {
		.prog = prog_old,
		.link = link,
	};
	int ret, idx = -ERANGE, tidx;

	if (revision && revision != bpf_mprog_revision(entry))
		return -ESTALE;
	if (bpf_mprog_exists(entry, prog_new))
		return -EEXIST;

	/* BPF_F_REPLACE is masked out as it is not a relative directive. */
	ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd,
				       flags & ~BPF_F_REPLACE,
				       prog_new->type);
	if (ret)
		return ret;

	if (flags & BPF_F_REPLACE) {
		tidx = bpf_mprog_pos_exact(entry, &otuple);
		if (tidx < 0) {
			ret = tidx;
			goto out;
		}
		idx = tidx;
	} else if (bpf_mprog_total(entry) >= bpf_mprog_max()) {
		/* Every non-replace attach goes through bpf_mprog_insert(),
		 * which grows the array by one element. The idx bound check
		 * further below only rejects an append at the tail, so an
		 * insert at a lower idx into a full array must be refused
		 * here.
		 */
		ret = -ERANGE;
		goto out;
	}

	if (flags & BPF_F_BEFORE) {
		tidx = bpf_mprog_pos_before(entry, &rtuple);
		/* idx >= -1 means an earlier rule already picked a spot. */
		if (tidx < -1 || (idx >= -1 && tidx != idx)) {
			ret = tidx < -1 ? tidx : -ERANGE;
			goto out;
		}
		idx = tidx;
	}

	if (flags & BPF_F_AFTER) {
		tidx = bpf_mprog_pos_after(entry, &rtuple);
		if (tidx < -1 || (idx >= -1 && tidx != idx)) {
			ret = tidx < 0 ? tidx : -ERANGE;
			goto out;
		}
		idx = tidx;
	}

	if (idx < -1) {
		/* No rule produced a position. That is only acceptable for
		 * a plain request without flags and relative program, which
		 * defaults to append.
		 */
		if (rtuple.prog || flags) {
			ret = -EINVAL;
			goto out;
		}
		idx = bpf_mprog_total(entry);
		flags = BPF_F_AFTER;
	}

	if (idx >= bpf_mprog_max()) {
		ret = -ERANGE;
		goto out;
	}

	if (flags & BPF_F_REPLACE)
		ret = bpf_mprog_replace(entry, entry_new, &ntuple, idx);
	else
		ret = bpf_mprog_insert(entry, entry_new, &ntuple, idx, flags);
out:
	bpf_mprog_tuple_put(&rtuple);
	return ret;
}
/* Read the program/link stored at @idx of @entry into @tuple ahead of a
 * deletion.
 *
 * @idx == -1 selects the first element and @idx == bpf_mprog_total(entry)
 * selects the last one. Returns -EBUSY if the slot holds a link but
 * @tuple carries none, otherwise 0 with @tuple filled from the slot.
 */
static int bpf_mprog_fetch(struct bpf_mprog_entry *entry,
			   struct bpf_tuple *tuple, int idx)
{
	int total = bpf_mprog_total(entry);
	struct bpf_mprog_fp *fp;
	struct bpf_mprog_cp *cp;
	struct bpf_prog *cur_prog;
	struct bpf_link *cur_link;

	if (idx == -1) {
		idx = 0;
	} else if (idx == total) {
		idx = total - 1;
	}

	bpf_mprog_read(entry, idx, &fp, &cp);
	cur_prog = READ_ONCE(fp->prog);
	cur_link = cp->link;

	/* A deletion request may come with an empty tuple, which is then
	 * populated from the slot at idx, or with a filled tuple, in which
	 * case only the consistency asserts below have an effect. A BPF
	 * link found at the given index must not be removed from the opts
	 * path, i.e. by a request that did not pass a link.
	 */
	if (cur_link && !tuple->link)
		return -EBUSY;

	WARN_ON_ONCE(tuple->prog && tuple->prog != cur_prog);
	WARN_ON_ONCE(tuple->link && tuple->link != cur_link);

	tuple->prog = cur_prog;
	tuple->link = cur_link;
	return 0;
}
/* Detach a program from @entry, selected either directly via @prog/@link
 * or by position via BPF_F_{BEFORE,AFTER} and an optional relative
 * program/link.
 *
 * @entry:     current multi-prog entry
 * @entry_new: set on success to the peer entry holding the result
 * @prog:      program to detach, or NULL to select purely by position
 * @link:      link owning @prog, or NULL
 * @flags:     BPF_F_{BEFORE,AFTER,ID,LINK}; BPF_F_REPLACE is rejected
 * @id_or_fd:  id or fd of the relative program/link, 0 for none
 * @revision:  expected revision of @entry, 0 to skip the check
 *
 * Each rule yields a candidate index; all requested rules must agree on
 * it. With no program, no flags and no relative program the last element
 * is removed.
 *
 * Returns 0 on success, -EINVAL for BPF_F_REPLACE or flags that yield no
 * position, -ESTALE on revision mismatch, -ENOENT if @entry is empty,
 * -ERANGE if the rules disagree or the index is out of bounds, -EBUSY if
 * the selected slot holds a link but none was passed, or the error of a
 * failed relative/position lookup.
 */
int bpf_mprog_detach(struct bpf_mprog_entry *entry,
		     struct bpf_mprog_entry **entry_new,
		     struct bpf_prog *prog, struct bpf_link *link,
		     u32 flags, u32 id_or_fd, u64 revision)
{
	struct bpf_tuple rtuple, dtuple = {
		.prog = prog,
		.link = link,
	};
	int ret, idx = -ERANGE, tidx;

	if (flags & BPF_F_REPLACE)
		return -EINVAL;
	if (revision && revision != bpf_mprog_revision(entry))
		return -ESTALE;
	if (!bpf_mprog_total(entry))
		return -ENOENT;

	/* Without a program to detach there is no type to check the
	 * relative program against.
	 */
	ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd, flags,
				       prog ? prog->type :
				       BPF_PROG_TYPE_UNSPEC);
	if (ret)
		return ret;

	if (dtuple.prog) {
		tidx = bpf_mprog_pos_exact(entry, &dtuple);
		if (tidx < 0) {
			ret = tidx;
			goto out;
		}
		idx = tidx;
	}

	if (flags & BPF_F_BEFORE) {
		tidx = bpf_mprog_pos_before(entry, &rtuple);
		/* idx >= -1 means an earlier rule already picked a spot. */
		if (tidx < -1 || (idx >= -1 && tidx != idx)) {
			ret = tidx < -1 ? tidx : -ERANGE;
			goto out;
		}
		idx = tidx;
	}

	if (flags & BPF_F_AFTER) {
		tidx = bpf_mprog_pos_after(entry, &rtuple);
		if (tidx < -1 || (idx >= -1 && tidx != idx)) {
			ret = tidx < 0 ? tidx : -ERANGE;
			goto out;
		}
		idx = tidx;
	}

	if (idx < -1) {
		/* No rule produced a position. That is only acceptable for
		 * a plain request without flags and relative program, which
		 * defaults to removing from the back.
		 */
		if (rtuple.prog || flags) {
			ret = -EINVAL;
			goto out;
		}
		idx = bpf_mprog_total(entry);
		flags = BPF_F_AFTER;
	}

	if (idx >= bpf_mprog_max()) {
		ret = -ERANGE;
		goto out;
	}

	ret = bpf_mprog_fetch(entry, &dtuple, idx);
	if (ret)
		goto out;
	ret = bpf_mprog_delete(entry, entry_new, &dtuple, idx);
out:
	bpf_mprog_tuple_put(&rtuple);
	return ret;
}
/* Report the programs attached to @entry to user space.
 *
 * @attr:  kernel copy of the query attributes
 * @uattr: user space attributes that receive the results
 * @entry: multi-prog entry to report on
 *
 * attach_flags (always 0), the current revision and the total number of
 * programs are written to @uattr first. If the caller passed a count of
 * 0, no prog_ids buffer, or the entry is empty, nothing more is done.
 * Otherwise up to query.count elements are copied out: the program id to
 * prog_ids, the link id (0 if the slot has no link) to link_ids, and 0
 * to prog_attach_flags/link_attach_flags; the latter three arrays are
 * optional.
 *
 * Returns 0 on success, -EINVAL if query_flags or attach_flags are set
 * in @attr, -EFAULT if a copy to user space fails, and -ENOSPC if the
 * caller's count is smaller than the number of programs (the truncated
 * set is still copied out).
 */
int bpf_mprog_query(const union bpf_attr *attr, union bpf_attr __user *uattr,
		    struct bpf_mprog_entry *entry)
{
	u32 __user *uprog_id, *uprog_flags;
	u32 __user *ulink_id, *ulink_flags;
	struct bpf_mprog_fp *fp;
	struct bpf_mprog_cp *cp;
	struct bpf_prog *prog;
	const u32 no_flags = 0;
	u32 val, nprogs;
	int pos, err = 0;
	u64 rev;

	if (attr->query.query_flags || attr->query.attach_flags)
		return -EINVAL;

	rev = bpf_mprog_revision(entry);
	nprogs = bpf_mprog_total(entry);

	/* Header fields are reported even if no arrays get filled. */
	if (copy_to_user(&uattr->query.attach_flags, &no_flags,
			 sizeof(no_flags)) ||
	    copy_to_user(&uattr->query.revision, &rev, sizeof(rev)) ||
	    copy_to_user(&uattr->query.count, &nprogs, sizeof(nprogs)))
		return -EFAULT;

	uprog_id = u64_to_user_ptr(attr->query.prog_ids);
	uprog_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
	ulink_id = u64_to_user_ptr(attr->query.link_ids);
	ulink_flags = u64_to_user_ptr(attr->query.link_attach_flags);

	if (!attr->query.count || !uprog_id || !nprogs)
		return 0;

	/* Caller's buffers are too small: fill what fits, flag the rest. */
	if (attr->query.count < nprogs) {
		nprogs = attr->query.count;
		err = -ENOSPC;
	}

	for (pos = 0; pos < bpf_mprog_max(); pos++) {
		bpf_mprog_read(entry, pos, &fp, &cp);
		prog = READ_ONCE(fp->prog);
		if (!prog)
			break;

		val = prog->aux->id;
		if (copy_to_user(uprog_id + pos, &val, sizeof(val)))
			return -EFAULT;
		if (uprog_flags &&
		    copy_to_user(uprog_flags + pos, &no_flags,
				 sizeof(no_flags)))
			return -EFAULT;

		val = cp->link ? cp->link->id : 0;
		if (ulink_id &&
		    copy_to_user(ulink_id + pos, &val, sizeof(val)))
			return -EFAULT;
		if (ulink_flags &&
		    copy_to_user(ulink_flags + pos, &no_flags,
				 sizeof(no_flags)))
			return -EFAULT;

		if (pos + 1 == nprogs)
			break;
	}

	return err;
}