2018-05-02 19:01:28 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
/* XSKMAP used for AF_XDP sockets
|
|
|
|
* Copyright(c) 2018 Intel Corporation.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/bpf.h>
|
2021-12-29 08:49:13 +08:00
|
|
|
#include <linux/filter.h>
|
2018-05-02 19:01:28 +08:00
|
|
|
#include <net/xdp_sock.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/sched.h>
|
2022-04-25 21:32:47 +08:00
|
|
|
#include <linux/btf_ids.h>
|
2018-05-02 19:01:28 +08:00
|
|
|
|
2020-05-21 03:20:50 +08:00
|
|
|
#include "xsk.h"
|
|
|
|
|
2019-08-15 17:30:13 +08:00
|
|
|
static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
|
xdp: Add proper __rcu annotations to redirect map entries
XDP_REDIRECT works by a three-step process: the bpf_redirect() and
bpf_redirect_map() helpers will lookup the target of the redirect and store
it (along with some other metadata) in a per-CPU struct bpf_redirect_info.
Next, when the program returns the XDP_REDIRECT return code, the driver
will call xdp_do_redirect() which will use the information thus stored to
actually enqueue the frame into a bulk queue structure (that differs
slightly by map type, but shares the same principle). Finally, before
exiting its NAPI poll loop, the driver will call xdp_do_flush(), which will
flush all the different bulk queues, thus completing the redirect.
Pointers to the map entries will be kept around for this whole sequence of
steps, protected by RCU. However, there is no top-level rcu_read_lock() in
the core code; instead drivers add their own rcu_read_lock() around the XDP
portions of the code, but somewhat inconsistently as Martin discovered[0].
However, things still work because everything happens inside a single NAPI
poll sequence, which means it's between a pair of calls to
local_bh_disable()/local_bh_enable(). So Paul suggested[1] that we could
document this intention by using rcu_dereference_check() with
rcu_read_lock_bh_held() as a second parameter, thus allowing sparse and
lockdep to verify that everything is done correctly.
This patch does just that: we add an __rcu annotation to the map entry
pointers and remove the various comments explaining the NAPI poll assurance
strewn through devmap.c in favour of a longer explanation in filter.c. The
goal is to have one coherent documentation of the entire flow, and rely on
the RCU annotations as a "standard" way of communicating the flow in the
map code (which can additionally be understood by sparse and lockdep).
The RCU annotation replacements result in a fairly straight-forward
replacement where READ_ONCE() becomes rcu_dereference_check(), WRITE_ONCE()
becomes rcu_assign_pointer() and xchg() and cmpxchg() gets wrapped in the
proper constructs to cast the pointer back and forth between __rcu and
__kernel address space (for the benefit of sparse). The one complication is
that xskmap has a few constructions where double-pointers are passed back
and forth; these simply all gain __rcu annotations, and only the final
reference/dereference to the inner-most pointer gets changed.
With this, everything can be run through sparse without eliciting
complaints, and lockdep can verify correctness even without the use of
rcu_read_lock() in the drivers. Subsequent patches will clean these up from
the drivers.
[0] https://lore.kernel.org/bpf/20210415173551.7ma4slcbqeyiba2r@kafai-mbp.dhcp.thefacebook.com/
[1] https://lore.kernel.org/bpf/20210419165837.GA975577@paulmck-ThinkPad-P17-Gen-1/
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210624160609.292325-6-toke@redhat.com
2021-06-25 00:05:55 +08:00
|
|
|
struct xdp_sock __rcu **map_entry)
|
2019-08-15 17:30:13 +08:00
|
|
|
{
|
|
|
|
struct xsk_map_node *node;
|
|
|
|
|
2020-12-02 05:58:43 +08:00
|
|
|
node = bpf_map_kzalloc(&map->map, sizeof(*node),
|
|
|
|
GFP_ATOMIC | __GFP_NOWARN);
|
2019-08-15 17:30:13 +08:00
|
|
|
if (!node)
|
2019-09-25 00:25:21 +08:00
|
|
|
return ERR_PTR(-ENOMEM);
|
2019-08-15 17:30:13 +08:00
|
|
|
|
2020-11-26 23:03:18 +08:00
|
|
|
bpf_map_inc(&map->map);
|
2023-03-05 20:46:13 +08:00
|
|
|
atomic_inc(&map->count);
|
2019-08-15 17:30:13 +08:00
|
|
|
|
|
|
|
node->map = map;
|
|
|
|
node->map_entry = map_entry;
|
|
|
|
return node;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void xsk_map_node_free(struct xsk_map_node *node)
|
|
|
|
{
|
2023-03-05 20:46:13 +08:00
|
|
|
struct xsk_map *map = node->map;
|
|
|
|
|
2020-11-26 23:03:18 +08:00
|
|
|
bpf_map_put(&node->map->map);
|
2019-08-15 17:30:13 +08:00
|
|
|
kfree(node);
|
2023-03-05 20:46:13 +08:00
|
|
|
atomic_dec(&map->count);
|
2019-08-15 17:30:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node)
|
|
|
|
{
|
|
|
|
spin_lock_bh(&xs->map_list_lock);
|
|
|
|
list_add_tail(&node->node, &xs->map_list);
|
|
|
|
spin_unlock_bh(&xs->map_list_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void xsk_map_sock_delete(struct xdp_sock *xs,
|
xdp: Add proper __rcu annotations to redirect map entries
XDP_REDIRECT works by a three-step process: the bpf_redirect() and
bpf_redirect_map() helpers will lookup the target of the redirect and store
it (along with some other metadata) in a per-CPU struct bpf_redirect_info.
Next, when the program returns the XDP_REDIRECT return code, the driver
will call xdp_do_redirect() which will use the information thus stored to
actually enqueue the frame into a bulk queue structure (that differs
slightly by map type, but shares the same principle). Finally, before
exiting its NAPI poll loop, the driver will call xdp_do_flush(), which will
flush all the different bulk queues, thus completing the redirect.
Pointers to the map entries will be kept around for this whole sequence of
steps, protected by RCU. However, there is no top-level rcu_read_lock() in
the core code; instead drivers add their own rcu_read_lock() around the XDP
portions of the code, but somewhat inconsistently as Martin discovered[0].
However, things still work because everything happens inside a single NAPI
poll sequence, which means it's between a pair of calls to
local_bh_disable()/local_bh_enable(). So Paul suggested[1] that we could
document this intention by using rcu_dereference_check() with
rcu_read_lock_bh_held() as a second parameter, thus allowing sparse and
lockdep to verify that everything is done correctly.
This patch does just that: we add an __rcu annotation to the map entry
pointers and remove the various comments explaining the NAPI poll assurance
strewn through devmap.c in favour of a longer explanation in filter.c. The
goal is to have one coherent documentation of the entire flow, and rely on
the RCU annotations as a "standard" way of communicating the flow in the
map code (which can additionally be understood by sparse and lockdep).
The RCU annotation replacements result in a fairly straight-forward
replacement where READ_ONCE() becomes rcu_dereference_check(), WRITE_ONCE()
becomes rcu_assign_pointer() and xchg() and cmpxchg() gets wrapped in the
proper constructs to cast the pointer back and forth between __rcu and
__kernel address space (for the benefit of sparse). The one complication is
that xskmap has a few constructions where double-pointers are passed back
and forth; these simply all gain __rcu annotations, and only the final
reference/dereference to the inner-most pointer gets changed.
With this, everything can be run through sparse without eliciting
complaints, and lockdep can verify correctness even without the use of
rcu_read_lock() in the drivers. Subsequent patches will clean these up from
the drivers.
[0] https://lore.kernel.org/bpf/20210415173551.7ma4slcbqeyiba2r@kafai-mbp.dhcp.thefacebook.com/
[1] https://lore.kernel.org/bpf/20210419165837.GA975577@paulmck-ThinkPad-P17-Gen-1/
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210624160609.292325-6-toke@redhat.com
2021-06-25 00:05:55 +08:00
|
|
|
struct xdp_sock __rcu **map_entry)
|
2019-08-15 17:30:13 +08:00
|
|
|
{
|
|
|
|
struct xsk_map_node *n, *tmp;
|
|
|
|
|
|
|
|
spin_lock_bh(&xs->map_list_lock);
|
|
|
|
list_for_each_entry_safe(n, tmp, &xs->map_list, node) {
|
|
|
|
if (map_entry == n->map_entry) {
|
|
|
|
list_del(&n->node);
|
|
|
|
xsk_map_node_free(n);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
spin_unlock_bh(&xs->map_list_lock);
|
|
|
|
}
|
|
|
|
|
2018-05-02 19:01:28 +08:00
|
|
|
static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
|
|
|
|
{
|
|
|
|
struct xsk_map *m;
|
2020-12-02 05:58:56 +08:00
|
|
|
int numa_node;
|
2019-12-19 14:10:02 +08:00
|
|
|
u64 size;
|
2018-05-02 19:01:28 +08:00
|
|
|
|
|
|
|
if (attr->max_entries == 0 || attr->key_size != 4 ||
|
|
|
|
attr->value_size != 4 ||
|
|
|
|
attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
2019-11-01 19:03:44 +08:00
|
|
|
numa_node = bpf_map_attr_numa_node(attr);
|
|
|
|
size = struct_size(m, xsk_map, attr->max_entries);
|
|
|
|
|
|
|
|
m = bpf_map_area_alloc(size, numa_node);
|
2020-12-02 05:58:56 +08:00
|
|
|
if (!m)
|
2018-05-02 19:01:28 +08:00
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
bpf_map_init_from_attr(&m->map, attr);
|
2019-08-15 17:30:13 +08:00
|
|
|
spin_lock_init(&m->lock);
|
2018-05-02 19:01:28 +08:00
|
|
|
|
|
|
|
return &m->map;
|
|
|
|
}
|
|
|
|
|
2023-03-05 20:46:13 +08:00
|
|
|
static u64 xsk_map_mem_usage(const struct bpf_map *map)
|
|
|
|
{
|
|
|
|
struct xsk_map *m = container_of(map, struct xsk_map, map);
|
|
|
|
|
|
|
|
return struct_size(m, xsk_map, map->max_entries) +
|
|
|
|
(u64)atomic_read(&m->count) * sizeof(struct xsk_map_node);
|
|
|
|
}
|
|
|
|
|
2018-05-02 19:01:28 +08:00
|
|
|
static void xsk_map_free(struct bpf_map *map)
|
|
|
|
{
|
|
|
|
struct xsk_map *m = container_of(map, struct xsk_map, map);
|
|
|
|
|
|
|
|
synchronize_net();
|
2019-11-01 19:03:44 +08:00
|
|
|
bpf_map_area_free(m);
|
2018-05-02 19:01:28 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
|
|
|
|
{
|
|
|
|
struct xsk_map *m = container_of(map, struct xsk_map, map);
|
|
|
|
u32 index = key ? *(u32 *)key : U32_MAX;
|
|
|
|
u32 *next = next_key;
|
|
|
|
|
|
|
|
if (index >= m->map.max_entries) {
|
|
|
|
*next = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (index == m->map.max_entries - 1)
|
|
|
|
return -ENOENT;
|
|
|
|
*next = index + 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-10-11 07:40:03 +08:00
|
|
|
static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
|
2019-11-01 19:03:45 +08:00
|
|
|
{
|
|
|
|
const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2;
|
|
|
|
struct bpf_insn *insn = insn_buf;
|
|
|
|
|
|
|
|
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
|
|
|
|
*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
|
|
|
|
*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *)));
|
|
|
|
*insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map));
|
|
|
|
*insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp);
|
|
|
|
*insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0);
|
|
|
|
*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
|
|
|
|
*insn++ = BPF_MOV64_IMM(ret, 0);
|
|
|
|
return insn - insn_buf;
|
|
|
|
}
|
|
|
|
|
xdp: Add proper __rcu annotations to redirect map entries
XDP_REDIRECT works by a three-step process: the bpf_redirect() and
bpf_redirect_map() helpers will lookup the target of the redirect and store
it (along with some other metadata) in a per-CPU struct bpf_redirect_info.
Next, when the program returns the XDP_REDIRECT return code, the driver
will call xdp_do_redirect() which will use the information thus stored to
actually enqueue the frame into a bulk queue structure (that differs
slightly by map type, but shares the same principle). Finally, before
exiting its NAPI poll loop, the driver will call xdp_do_flush(), which will
flush all the different bulk queues, thus completing the redirect.
Pointers to the map entries will be kept around for this whole sequence of
steps, protected by RCU. However, there is no top-level rcu_read_lock() in
the core code; instead drivers add their own rcu_read_lock() around the XDP
portions of the code, but somewhat inconsistently as Martin discovered[0].
However, things still work because everything happens inside a single NAPI
poll sequence, which means it's between a pair of calls to
local_bh_disable()/local_bh_enable(). So Paul suggested[1] that we could
document this intention by using rcu_dereference_check() with
rcu_read_lock_bh_held() as a second parameter, thus allowing sparse and
lockdep to verify that everything is done correctly.
This patch does just that: we add an __rcu annotation to the map entry
pointers and remove the various comments explaining the NAPI poll assurance
strewn through devmap.c in favour of a longer explanation in filter.c. The
goal is to have one coherent documentation of the entire flow, and rely on
the RCU annotations as a "standard" way of communicating the flow in the
map code (which can additionally be understood by sparse and lockdep).
The RCU annotation replacements result in a fairly straight-forward
replacement where READ_ONCE() becomes rcu_dereference_check(), WRITE_ONCE()
becomes rcu_assign_pointer() and xchg() and cmpxchg() gets wrapped in the
proper constructs to cast the pointer back and forth between __rcu and
__kernel address space (for the benefit of sparse). The one complication is
that xskmap has a few constructions where double-pointers are passed back
and forth; these simply all gain __rcu annotations, and only the final
reference/dereference to the inner-most pointer gets changed.
With this, everything can be run through sparse without eliciting
complaints, and lockdep can verify correctness even without the use of
rcu_read_lock() in the drivers. Subsequent patches will clean these up from
the drivers.
[0] https://lore.kernel.org/bpf/20210415173551.7ma4slcbqeyiba2r@kafai-mbp.dhcp.thefacebook.com/
[1] https://lore.kernel.org/bpf/20210419165837.GA975577@paulmck-ThinkPad-P17-Gen-1/
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210624160609.292325-6-toke@redhat.com
2021-06-25 00:05:55 +08:00
|
|
|
/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
|
|
|
|
* by local_bh_disable() (from XDP calls inside NAPI). The
|
|
|
|
* rcu_read_lock_bh_held() below makes lockdep accept both.
|
|
|
|
*/
|
2021-03-08 19:29:06 +08:00
|
|
|
static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
|
|
|
|
{
|
|
|
|
struct xsk_map *m = container_of(map, struct xsk_map, map);
|
|
|
|
|
|
|
|
if (key >= map->max_entries)
|
|
|
|
return NULL;
|
|
|
|
|
xdp: Add proper __rcu annotations to redirect map entries
XDP_REDIRECT works by a three-step process: the bpf_redirect() and
bpf_redirect_map() helpers will lookup the target of the redirect and store
it (along with some other metadata) in a per-CPU struct bpf_redirect_info.
Next, when the program returns the XDP_REDIRECT return code, the driver
will call xdp_do_redirect() which will use the information thus stored to
actually enqueue the frame into a bulk queue structure (that differs
slightly by map type, but shares the same principle). Finally, before
exiting its NAPI poll loop, the driver will call xdp_do_flush(), which will
flush all the different bulk queues, thus completing the redirect.
Pointers to the map entries will be kept around for this whole sequence of
steps, protected by RCU. However, there is no top-level rcu_read_lock() in
the core code; instead drivers add their own rcu_read_lock() around the XDP
portions of the code, but somewhat inconsistently as Martin discovered[0].
However, things still work because everything happens inside a single NAPI
poll sequence, which means it's between a pair of calls to
local_bh_disable()/local_bh_enable(). So Paul suggested[1] that we could
document this intention by using rcu_dereference_check() with
rcu_read_lock_bh_held() as a second parameter, thus allowing sparse and
lockdep to verify that everything is done correctly.
This patch does just that: we add an __rcu annotation to the map entry
pointers and remove the various comments explaining the NAPI poll assurance
strewn through devmap.c in favour of a longer explanation in filter.c. The
goal is to have one coherent documentation of the entire flow, and rely on
the RCU annotations as a "standard" way of communicating the flow in the
map code (which can additionally be understood by sparse and lockdep).
The RCU annotation replacements result in a fairly straight-forward
replacement where READ_ONCE() becomes rcu_dereference_check(), WRITE_ONCE()
becomes rcu_assign_pointer() and xchg() and cmpxchg() gets wrapped in the
proper constructs to cast the pointer back and forth between __rcu and
__kernel address space (for the benefit of sparse). The one complication is
that xskmap has a few constructions where double-pointers are passed back
and forth; these simply all gain __rcu annotations, and only the final
reference/dereference to the inner-most pointer gets changed.
With this, everything can be run through sparse without eliciting
complaints, and lockdep can verify correctness even without the use of
rcu_read_lock() in the drivers. Subsequent patches will clean these up from
the drivers.
[0] https://lore.kernel.org/bpf/20210415173551.7ma4slcbqeyiba2r@kafai-mbp.dhcp.thefacebook.com/
[1] https://lore.kernel.org/bpf/20210419165837.GA975577@paulmck-ThinkPad-P17-Gen-1/
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210624160609.292325-6-toke@redhat.com
2021-06-25 00:05:55 +08:00
|
|
|
return rcu_dereference_check(m->xsk_map[key], rcu_read_lock_bh_held());
|
2021-03-08 19:29:06 +08:00
|
|
|
}
|
|
|
|
|
2018-05-02 19:01:28 +08:00
|
|
|
static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
|
2019-06-07 04:59:40 +08:00
|
|
|
{
|
|
|
|
return __xsk_map_lookup_elem(map, *(u32 *)key);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
|
2018-05-02 19:01:28 +08:00
|
|
|
{
|
2018-10-09 09:04:50 +08:00
|
|
|
return ERR_PTR(-EOPNOTSUPP);
|
2018-05-02 19:01:28 +08:00
|
|
|
}
|
|
|
|
|
bpf: return long from bpf_map_ops funcs
This patch changes the return types of bpf_map_ops functions to long, where
previously int was returned. Using long allows for bpf programs to maintain
the sign bit in the absence of sign extension during situations where
inlined bpf helper funcs make calls to the bpf_map_ops funcs and a negative
error is returned.
The definitions of the helper funcs are generated from comments in the bpf
uapi header at `include/uapi/linux/bpf.h`. The return type of these
helpers was previously changed from int to long in commit bdb7b79b4ce8. For
any case where one of the map helpers call the bpf_map_ops funcs that are
still returning 32-bit int, a compiler might not include sign extension
instructions to properly convert the 32-bit negative value a 64-bit
negative value.
For example:
bpf assembly excerpt of an inlined helper calling a kernel function and
checking for a specific error:
; err = bpf_map_update_elem(&mymap, &key, &val, BPF_NOEXIST);
...
46: call 0xffffffffe103291c ; htab_map_update_elem
; if (err && err != -EEXIST) {
4b: cmp $0xffffffffffffffef,%rax ; cmp -EEXIST,%rax
kernel function assembly excerpt of return value from
`htab_map_update_elem` returning 32-bit int:
movl $0xffffffef, %r9d
...
movl %r9d, %eax
...results in the comparison:
cmp $0xffffffffffffffef, $0x00000000ffffffef
Fixes: bdb7b79b4ce8 ("bpf: Switch most helper return values from 32-bit int to 64-bit long")
Tested-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
Link: https://lore.kernel.org/r/20230322194754.185781-3-inwardvessel@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2023-03-23 03:47:54 +08:00
|
|
|
static long xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
|
|
|
|
u64 map_flags)
|
2018-05-02 19:01:28 +08:00
|
|
|
{
|
|
|
|
struct xsk_map *m = container_of(map, struct xsk_map, map);
|
xdp: Add proper __rcu annotations to redirect map entries
XDP_REDIRECT works by a three-step process: the bpf_redirect() and
bpf_redirect_map() helpers will lookup the target of the redirect and store
it (along with some other metadata) in a per-CPU struct bpf_redirect_info.
Next, when the program returns the XDP_REDIRECT return code, the driver
will call xdp_do_redirect() which will use the information thus stored to
actually enqueue the frame into a bulk queue structure (that differs
slightly by map type, but shares the same principle). Finally, before
exiting its NAPI poll loop, the driver will call xdp_do_flush(), which will
flush all the different bulk queues, thus completing the redirect.
Pointers to the map entries will be kept around for this whole sequence of
steps, protected by RCU. However, there is no top-level rcu_read_lock() in
the core code; instead drivers add their own rcu_read_lock() around the XDP
portions of the code, but somewhat inconsistently as Martin discovered[0].
However, things still work because everything happens inside a single NAPI
poll sequence, which means it's between a pair of calls to
local_bh_disable()/local_bh_enable(). So Paul suggested[1] that we could
document this intention by using rcu_dereference_check() with
rcu_read_lock_bh_held() as a second parameter, thus allowing sparse and
lockdep to verify that everything is done correctly.
This patch does just that: we add an __rcu annotation to the map entry
pointers and remove the various comments explaining the NAPI poll assurance
strewn through devmap.c in favour of a longer explanation in filter.c. The
goal is to have one coherent documentation of the entire flow, and rely on
the RCU annotations as a "standard" way of communicating the flow in the
map code (which can additionally be understood by sparse and lockdep).
The RCU annotation replacements result in a fairly straight-forward
replacement where READ_ONCE() becomes rcu_dereference_check(), WRITE_ONCE()
becomes rcu_assign_pointer() and xchg() and cmpxchg() gets wrapped in the
proper constructs to cast the pointer back and forth between __rcu and
__kernel address space (for the benefit of sparse). The one complication is
that xskmap has a few constructions where double-pointers are passed back
and forth; these simply all gain __rcu annotations, and only the final
reference/dereference to the inner-most pointer gets changed.
With this, everything can be run through sparse without eliciting
complaints, and lockdep can verify correctness even without the use of
rcu_read_lock() in the drivers. Subsequent patches will clean these up from
the drivers.
[0] https://lore.kernel.org/bpf/20210415173551.7ma4slcbqeyiba2r@kafai-mbp.dhcp.thefacebook.com/
[1] https://lore.kernel.org/bpf/20210419165837.GA975577@paulmck-ThinkPad-P17-Gen-1/
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210624160609.292325-6-toke@redhat.com
2021-06-25 00:05:55 +08:00
|
|
|
struct xdp_sock __rcu **map_entry;
|
|
|
|
struct xdp_sock *xs, *old_xs;
|
2018-05-02 19:01:28 +08:00
|
|
|
u32 i = *(u32 *)key, fd = *(u32 *)value;
|
2019-08-15 17:30:13 +08:00
|
|
|
struct xsk_map_node *node;
|
2018-05-02 19:01:28 +08:00
|
|
|
struct socket *sock;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
if (unlikely(map_flags > BPF_EXIST))
|
|
|
|
return -EINVAL;
|
|
|
|
if (unlikely(i >= m->map.max_entries))
|
|
|
|
return -E2BIG;
|
|
|
|
|
|
|
|
sock = sockfd_lookup(fd, &err);
|
|
|
|
if (!sock)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
if (sock->sk->sk_family != PF_XDP) {
|
|
|
|
sockfd_put(sock);
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
}
|
|
|
|
|
|
|
|
xs = (struct xdp_sock *)sock->sk;
|
|
|
|
|
2019-08-15 17:30:13 +08:00
|
|
|
map_entry = &m->xsk_map[i];
|
|
|
|
node = xsk_map_node_alloc(m, map_entry);
|
|
|
|
if (IS_ERR(node)) {
|
|
|
|
sockfd_put(sock);
|
|
|
|
return PTR_ERR(node);
|
|
|
|
}
|
2018-05-02 19:01:28 +08:00
|
|
|
|
2019-08-15 17:30:13 +08:00
|
|
|
spin_lock_bh(&m->lock);
|
xdp: Add proper __rcu annotations to redirect map entries
XDP_REDIRECT works by a three-step process: the bpf_redirect() and
bpf_redirect_map() helpers will lookup the target of the redirect and store
it (along with some other metadata) in a per-CPU struct bpf_redirect_info.
Next, when the program returns the XDP_REDIRECT return code, the driver
will call xdp_do_redirect() which will use the information thus stored to
actually enqueue the frame into a bulk queue structure (that differs
slightly by map type, but shares the same principle). Finally, before
exiting its NAPI poll loop, the driver will call xdp_do_flush(), which will
flush all the different bulk queues, thus completing the redirect.
Pointers to the map entries will be kept around for this whole sequence of
steps, protected by RCU. However, there is no top-level rcu_read_lock() in
the core code; instead drivers add their own rcu_read_lock() around the XDP
portions of the code, but somewhat inconsistently as Martin discovered[0].
However, things still work because everything happens inside a single NAPI
poll sequence, which means it's between a pair of calls to
local_bh_disable()/local_bh_enable(). So Paul suggested[1] that we could
document this intention by using rcu_dereference_check() with
rcu_read_lock_bh_held() as a second parameter, thus allowing sparse and
lockdep to verify that everything is done correctly.
This patch does just that: we add an __rcu annotation to the map entry
pointers and remove the various comments explaining the NAPI poll assurance
strewn through devmap.c in favour of a longer explanation in filter.c. The
goal is to have one coherent documentation of the entire flow, and rely on
the RCU annotations as a "standard" way of communicating the flow in the
map code (which can additionally be understood by sparse and lockdep).
The RCU annotation replacements result in a fairly straight-forward
replacement where READ_ONCE() becomes rcu_dereference_check(), WRITE_ONCE()
becomes rcu_assign_pointer() and xchg() and cmpxchg() gets wrapped in the
proper constructs to cast the pointer back and forth between __rcu and
__kernel address space (for the benefit of sparse). The one complication is
that xskmap has a few constructions where double-pointers are passed back
and forth; these simply all gain __rcu annotations, and only the final
reference/dereference to the inner-most pointer gets changed.
With this, everything can be run through sparse without eliciting
complaints, and lockdep can verify correctness even without the use of
rcu_read_lock() in the drivers. Subsequent patches will clean these up from
the drivers.
[0] https://lore.kernel.org/bpf/20210415173551.7ma4slcbqeyiba2r@kafai-mbp.dhcp.thefacebook.com/
[1] https://lore.kernel.org/bpf/20210419165837.GA975577@paulmck-ThinkPad-P17-Gen-1/
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210624160609.292325-6-toke@redhat.com
2021-06-25 00:05:55 +08:00
|
|
|
old_xs = rcu_dereference_protected(*map_entry, lockdep_is_held(&m->lock));
|
2019-08-15 17:30:13 +08:00
|
|
|
if (old_xs == xs) {
|
|
|
|
err = 0;
|
|
|
|
goto out;
|
2019-08-15 17:30:14 +08:00
|
|
|
} else if (old_xs && map_flags == BPF_NOEXIST) {
|
|
|
|
err = -EEXIST;
|
|
|
|
goto out;
|
|
|
|
} else if (!old_xs && map_flags == BPF_EXIST) {
|
|
|
|
err = -ENOENT;
|
|
|
|
goto out;
|
2019-08-15 17:30:13 +08:00
|
|
|
}
|
|
|
|
xsk_map_sock_add(xs, node);
|
xdp: Add proper __rcu annotations to redirect map entries
XDP_REDIRECT works by a three-step process: the bpf_redirect() and
bpf_redirect_map() helpers will lookup the target of the redirect and store
it (along with some other metadata) in a per-CPU struct bpf_redirect_info.
Next, when the program returns the XDP_REDIRECT return code, the driver
will call xdp_do_redirect() which will use the information thus stored to
actually enqueue the frame into a bulk queue structure (that differs
slightly by map type, but shares the same principle). Finally, before
exiting its NAPI poll loop, the driver will call xdp_do_flush(), which will
flush all the different bulk queues, thus completing the redirect.
Pointers to the map entries will be kept around for this whole sequence of
steps, protected by RCU. However, there is no top-level rcu_read_lock() in
the core code; instead drivers add their own rcu_read_lock() around the XDP
portions of the code, but somewhat inconsistently as Martin discovered[0].
However, things still work because everything happens inside a single NAPI
poll sequence, which means it's between a pair of calls to
local_bh_disable()/local_bh_enable(). So Paul suggested[1] that we could
document this intention by using rcu_dereference_check() with
rcu_read_lock_bh_held() as a second parameter, thus allowing sparse and
lockdep to verify that everything is done correctly.
This patch does just that: we add an __rcu annotation to the map entry
pointers and remove the various comments explaining the NAPI poll assurance
strewn through devmap.c in favour of a longer explanation in filter.c. The
goal is to have one coherent documentation of the entire flow, and rely on
the RCU annotations as a "standard" way of communicating the flow in the
map code (which can additionally be understood by sparse and lockdep).
The RCU annotation replacements result in a fairly straight-forward
replacement where READ_ONCE() becomes rcu_dereference_check(), WRITE_ONCE()
becomes rcu_assign_pointer() and xchg() and cmpxchg() gets wrapped in the
proper constructs to cast the pointer back and forth between __rcu and
__kernel address space (for the benefit of sparse). The one complication is
that xskmap has a few constructions where double-pointers are passed back
and forth; these simply all gain __rcu annotations, and only the final
reference/dereference to the inner-most pointer gets changed.
With this, everything can be run through sparse without eliciting
complaints, and lockdep can verify correctness even without the use of
rcu_read_lock() in the drivers. Subsequent patches will clean these up from
the drivers.
[0] https://lore.kernel.org/bpf/20210415173551.7ma4slcbqeyiba2r@kafai-mbp.dhcp.thefacebook.com/
[1] https://lore.kernel.org/bpf/20210419165837.GA975577@paulmck-ThinkPad-P17-Gen-1/
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210624160609.292325-6-toke@redhat.com
2021-06-25 00:05:55 +08:00
|
|
|
rcu_assign_pointer(*map_entry, xs);
|
2018-10-09 01:40:16 +08:00
|
|
|
if (old_xs)
|
2019-08-15 17:30:13 +08:00
|
|
|
xsk_map_sock_delete(old_xs, map_entry);
|
|
|
|
spin_unlock_bh(&m->lock);
|
2018-05-02 19:01:28 +08:00
|
|
|
sockfd_put(sock);
|
|
|
|
return 0;
|
2019-08-15 17:30:13 +08:00
|
|
|
|
|
|
|
out:
|
|
|
|
spin_unlock_bh(&m->lock);
|
|
|
|
sockfd_put(sock);
|
|
|
|
xsk_map_node_free(node);
|
|
|
|
return err;
|
2018-05-02 19:01:28 +08:00
|
|
|
}
|
|
|
|
|
bpf: return long from bpf_map_ops funcs
This patch changes the return types of bpf_map_ops functions to long, where
previously int was returned. Using long allows for bpf programs to maintain
the sign bit in the absence of sign extension during situations where
inlined bpf helper funcs make calls to the bpf_map_ops funcs and a negative
error is returned.
The definitions of the helper funcs are generated from comments in the bpf
uapi header at `include/uapi/linux/bpf.h`. The return type of these
helpers was previously changed from int to long in commit bdb7b79b4ce8. For
any case where one of the map helpers call the bpf_map_ops funcs that are
still returning 32-bit int, a compiler might not include sign extension
instructions to properly convert the 32-bit negative value a 64-bit
negative value.
For example:
bpf assembly excerpt of an inlined helper calling a kernel function and
checking for a specific error:
; err = bpf_map_update_elem(&mymap, &key, &val, BPF_NOEXIST);
...
46: call 0xffffffffe103291c ; htab_map_update_elem
; if (err && err != -EEXIST) {
4b: cmp $0xffffffffffffffef,%rax ; cmp -EEXIST,%rax
kernel function assembly excerpt of return value from
`htab_map_update_elem` returning 32-bit int:
movl $0xffffffef, %r9d
...
movl %r9d, %eax
...results in the comparison:
cmp $0xffffffffffffffef, $0x00000000ffffffef
Fixes: bdb7b79b4ce8 ("bpf: Switch most helper return values from 32-bit int to 64-bit long")
Tested-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
Link: https://lore.kernel.org/r/20230322194754.185781-3-inwardvessel@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2023-03-23 03:47:54 +08:00
|
|
|
static long xsk_map_delete_elem(struct bpf_map *map, void *key)
|
2018-05-02 19:01:28 +08:00
|
|
|
{
|
|
|
|
struct xsk_map *m = container_of(map, struct xsk_map, map);
|
xdp: Add proper __rcu annotations to redirect map entries
XDP_REDIRECT works by a three-step process: the bpf_redirect() and
bpf_redirect_map() helpers will lookup the target of the redirect and store
it (along with some other metadata) in a per-CPU struct bpf_redirect_info.
Next, when the program returns the XDP_REDIRECT return code, the driver
will call xdp_do_redirect() which will use the information thus stored to
actually enqueue the frame into a bulk queue structure (that differs
slightly by map type, but shares the same principle). Finally, before
exiting its NAPI poll loop, the driver will call xdp_do_flush(), which will
flush all the different bulk queues, thus completing the redirect.
Pointers to the map entries will be kept around for this whole sequence of
steps, protected by RCU. However, there is no top-level rcu_read_lock() in
the core code; instead drivers add their own rcu_read_lock() around the XDP
portions of the code, but somewhat inconsistently as Martin discovered[0].
However, things still work because everything happens inside a single NAPI
poll sequence, which means it's between a pair of calls to
local_bh_disable()/local_bh_enable(). So Paul suggested[1] that we could
document this intention by using rcu_dereference_check() with
rcu_read_lock_bh_held() as a second parameter, thus allowing sparse and
lockdep to verify that everything is done correctly.
This patch does just that: we add an __rcu annotation to the map entry
pointers and remove the various comments explaining the NAPI poll assurance
strewn through devmap.c in favour of a longer explanation in filter.c. The
goal is to have one coherent documentation of the entire flow, and rely on
the RCU annotations as a "standard" way of communicating the flow in the
map code (which can additionally be understood by sparse and lockdep).
The RCU annotation replacements result in a fairly straight-forward
replacement where READ_ONCE() becomes rcu_dereference_check(), WRITE_ONCE()
becomes rcu_assign_pointer() and xchg() and cmpxchg() gets wrapped in the
proper constructs to cast the pointer back and forth between __rcu and
__kernel address space (for the benefit of sparse). The one complication is
that xskmap has a few constructions where double-pointers are passed back
and forth; these simply all gain __rcu annotations, and only the final
reference/dereference to the inner-most pointer gets changed.
With this, everything can be run through sparse without eliciting
complaints, and lockdep can verify correctness even without the use of
rcu_read_lock() in the drivers. Subsequent patches will clean these up from
the drivers.
[0] https://lore.kernel.org/bpf/20210415173551.7ma4slcbqeyiba2r@kafai-mbp.dhcp.thefacebook.com/
[1] https://lore.kernel.org/bpf/20210419165837.GA975577@paulmck-ThinkPad-P17-Gen-1/
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210624160609.292325-6-toke@redhat.com
2021-06-25 00:05:55 +08:00
|
|
|
struct xdp_sock __rcu **map_entry;
|
|
|
|
struct xdp_sock *old_xs;
|
2018-05-02 19:01:28 +08:00
|
|
|
int k = *(u32 *)key;
|
|
|
|
|
|
|
|
if (k >= map->max_entries)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2019-08-15 17:30:13 +08:00
|
|
|
spin_lock_bh(&m->lock);
|
|
|
|
map_entry = &m->xsk_map[k];
|
xdp: Add proper __rcu annotations to redirect map entries
XDP_REDIRECT works by a three-step process: the bpf_redirect() and
bpf_redirect_map() helpers will lookup the target of the redirect and store
it (along with some other metadata) in a per-CPU struct bpf_redirect_info.
Next, when the program returns the XDP_REDIRECT return code, the driver
will call xdp_do_redirect() which will use the information thus stored to
actually enqueue the frame into a bulk queue structure (that differs
slightly by map type, but shares the same principle). Finally, before
exiting its NAPI poll loop, the driver will call xdp_do_flush(), which will
flush all the different bulk queues, thus completing the redirect.
Pointers to the map entries will be kept around for this whole sequence of
steps, protected by RCU. However, there is no top-level rcu_read_lock() in
the core code; instead drivers add their own rcu_read_lock() around the XDP
portions of the code, but somewhat inconsistently as Martin discovered[0].
However, things still work because everything happens inside a single NAPI
poll sequence, which means it's between a pair of calls to
local_bh_disable()/local_bh_enable(). So Paul suggested[1] that we could
document this intention by using rcu_dereference_check() with
rcu_read_lock_bh_held() as a second parameter, thus allowing sparse and
lockdep to verify that everything is done correctly.
This patch does just that: we add an __rcu annotation to the map entry
pointers and remove the various comments explaining the NAPI poll assurance
strewn through devmap.c in favour of a longer explanation in filter.c. The
goal is to have one coherent documentation of the entire flow, and rely on
the RCU annotations as a "standard" way of communicating the flow in the
map code (which can additionally be understood by sparse and lockdep).
The RCU annotation replacements result in a fairly straight-forward
replacement where READ_ONCE() becomes rcu_dereference_check(), WRITE_ONCE()
becomes rcu_assign_pointer() and xchg() and cmpxchg() gets wrapped in the
proper constructs to cast the pointer back and forth between __rcu and
__kernel address space (for the benefit of sparse). The one complication is
that xskmap has a few constructions where double-pointers are passed back
and forth; these simply all gain __rcu annotations, and only the final
reference/dereference to the inner-most pointer gets changed.
With this, everything can be run through sparse without eliciting
complaints, and lockdep can verify correctness even without the use of
rcu_read_lock() in the drivers. Subsequent patches will clean these up from
the drivers.
[0] https://lore.kernel.org/bpf/20210415173551.7ma4slcbqeyiba2r@kafai-mbp.dhcp.thefacebook.com/
[1] https://lore.kernel.org/bpf/20210419165837.GA975577@paulmck-ThinkPad-P17-Gen-1/
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210624160609.292325-6-toke@redhat.com
2021-06-25 00:05:55 +08:00
|
|
|
old_xs = unrcu_pointer(xchg(map_entry, NULL));
|
2018-10-09 01:40:16 +08:00
|
|
|
if (old_xs)
|
2019-08-15 17:30:13 +08:00
|
|
|
xsk_map_sock_delete(old_xs, map_entry);
|
|
|
|
spin_unlock_bh(&m->lock);
|
2018-05-02 19:01:28 +08:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
bpf: return long from bpf_map_ops funcs
This patch changes the return types of bpf_map_ops functions to long, where
previously int was returned. Using long allows for bpf programs to maintain
the sign bit in the absence of sign extension during situations where
inlined bpf helper funcs make calls to the bpf_map_ops funcs and a negative
error is returned.
The definitions of the helper funcs are generated from comments in the bpf
uapi header at `include/uapi/linux/bpf.h`. The return type of these
helpers was previously changed from int to long in commit bdb7b79b4ce8. For
any case where one of the map helpers call the bpf_map_ops funcs that are
still returning 32-bit int, a compiler might not include sign extension
instructions to properly convert the 32-bit negative value a 64-bit
negative value.
For example:
bpf assembly excerpt of an inlined helper calling a kernel function and
checking for a specific error:
; err = bpf_map_update_elem(&mymap, &key, &val, BPF_NOEXIST);
...
46: call 0xffffffffe103291c ; htab_map_update_elem
; if (err && err != -EEXIST) {
4b: cmp $0xffffffffffffffef,%rax ; cmp -EEXIST,%rax
kernel function assembly excerpt of return value from
`htab_map_update_elem` returning 32-bit int:
movl $0xffffffef, %r9d
...
movl %r9d, %eax
...results in the comparison:
cmp $0xffffffffffffffef, $0x00000000ffffffef
Fixes: bdb7b79b4ce8 ("bpf: Switch most helper return values from 32-bit int to 64-bit long")
Tested-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
Link: https://lore.kernel.org/r/20230322194754.185781-3-inwardvessel@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2023-03-23 03:47:54 +08:00
|
|
|
static long xsk_map_redirect(struct bpf_map *map, u64 index, u64 flags)
|
2021-03-08 19:29:06 +08:00
|
|
|
{
|
2022-11-08 22:06:00 +08:00
|
|
|
return __bpf_xdp_redirect_map(map, index, flags, 0,
|
2021-05-19 17:07:45 +08:00
|
|
|
__xsk_map_lookup_elem);
|
2021-03-08 19:29:06 +08:00
|
|
|
}
|
|
|
|
|
2019-08-15 17:30:13 +08:00
|
|
|
void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
|
xdp: Add proper __rcu annotations to redirect map entries
XDP_REDIRECT works by a three-step process: the bpf_redirect() and
bpf_redirect_map() helpers will lookup the target of the redirect and store
it (along with some other metadata) in a per-CPU struct bpf_redirect_info.
Next, when the program returns the XDP_REDIRECT return code, the driver
will call xdp_do_redirect() which will use the information thus stored to
actually enqueue the frame into a bulk queue structure (that differs
slightly by map type, but shares the same principle). Finally, before
exiting its NAPI poll loop, the driver will call xdp_do_flush(), which will
flush all the different bulk queues, thus completing the redirect.
Pointers to the map entries will be kept around for this whole sequence of
steps, protected by RCU. However, there is no top-level rcu_read_lock() in
the core code; instead drivers add their own rcu_read_lock() around the XDP
portions of the code, but somewhat inconsistently as Martin discovered[0].
However, things still work because everything happens inside a single NAPI
poll sequence, which means it's between a pair of calls to
local_bh_disable()/local_bh_enable(). So Paul suggested[1] that we could
document this intention by using rcu_dereference_check() with
rcu_read_lock_bh_held() as a second parameter, thus allowing sparse and
lockdep to verify that everything is done correctly.
This patch does just that: we add an __rcu annotation to the map entry
pointers and remove the various comments explaining the NAPI poll assurance
strewn through devmap.c in favour of a longer explanation in filter.c. The
goal is to have one coherent documentation of the entire flow, and rely on
the RCU annotations as a "standard" way of communicating the flow in the
map code (which can additionally be understood by sparse and lockdep).
The RCU annotation replacements result in a fairly straight-forward
replacement where READ_ONCE() becomes rcu_dereference_check(), WRITE_ONCE()
becomes rcu_assign_pointer() and xchg() and cmpxchg() gets wrapped in the
proper constructs to cast the pointer back and forth between __rcu and
__kernel address space (for the benefit of sparse). The one complication is
that xskmap has a few constructions where double-pointers are passed back
and forth; these simply all gain __rcu annotations, and only the final
reference/dereference to the inner-most pointer gets changed.
With this, everything can be run through sparse without eliciting
complaints, and lockdep can verify correctness even without the use of
rcu_read_lock() in the drivers. Subsequent patches will clean these up from
the drivers.
[0] https://lore.kernel.org/bpf/20210415173551.7ma4slcbqeyiba2r@kafai-mbp.dhcp.thefacebook.com/
[1] https://lore.kernel.org/bpf/20210419165837.GA975577@paulmck-ThinkPad-P17-Gen-1/
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210624160609.292325-6-toke@redhat.com
2021-06-25 00:05:55 +08:00
|
|
|
struct xdp_sock __rcu **map_entry)
|
2019-08-15 17:30:13 +08:00
|
|
|
{
|
|
|
|
spin_lock_bh(&map->lock);
|
xdp: Add proper __rcu annotations to redirect map entries
XDP_REDIRECT works by a three-step process: the bpf_redirect() and
bpf_redirect_map() helpers will lookup the target of the redirect and store
it (along with some other metadata) in a per-CPU struct bpf_redirect_info.
Next, when the program returns the XDP_REDIRECT return code, the driver
will call xdp_do_redirect() which will use the information thus stored to
actually enqueue the frame into a bulk queue structure (that differs
slightly by map type, but shares the same principle). Finally, before
exiting its NAPI poll loop, the driver will call xdp_do_flush(), which will
flush all the different bulk queues, thus completing the redirect.
Pointers to the map entries will be kept around for this whole sequence of
steps, protected by RCU. However, there is no top-level rcu_read_lock() in
the core code; instead drivers add their own rcu_read_lock() around the XDP
portions of the code, but somewhat inconsistently as Martin discovered[0].
However, things still work because everything happens inside a single NAPI
poll sequence, which means it's between a pair of calls to
local_bh_disable()/local_bh_enable(). So Paul suggested[1] that we could
document this intention by using rcu_dereference_check() with
rcu_read_lock_bh_held() as a second parameter, thus allowing sparse and
lockdep to verify that everything is done correctly.
This patch does just that: we add an __rcu annotation to the map entry
pointers and remove the various comments explaining the NAPI poll assurance
strewn through devmap.c in favour of a longer explanation in filter.c. The
goal is to have one coherent documentation of the entire flow, and rely on
the RCU annotations as a "standard" way of communicating the flow in the
map code (which can additionally be understood by sparse and lockdep).
The RCU annotation replacements result in a fairly straight-forward
replacement where READ_ONCE() becomes rcu_dereference_check(), WRITE_ONCE()
becomes rcu_assign_pointer() and xchg() and cmpxchg() gets wrapped in the
proper constructs to cast the pointer back and forth between __rcu and
__kernel address space (for the benefit of sparse). The one complication is
that xskmap has a few constructions where double-pointers are passed back
and forth; these simply all gain __rcu annotations, and only the final
reference/dereference to the inner-most pointer gets changed.
With this, everything can be run through sparse without eliciting
complaints, and lockdep can verify correctness even without the use of
rcu_read_lock() in the drivers. Subsequent patches will clean these up from
the drivers.
[0] https://lore.kernel.org/bpf/20210415173551.7ma4slcbqeyiba2r@kafai-mbp.dhcp.thefacebook.com/
[1] https://lore.kernel.org/bpf/20210419165837.GA975577@paulmck-ThinkPad-P17-Gen-1/
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210624160609.292325-6-toke@redhat.com
2021-06-25 00:05:55 +08:00
|
|
|
if (rcu_access_pointer(*map_entry) == xs) {
|
|
|
|
rcu_assign_pointer(*map_entry, NULL);
|
2019-08-15 17:30:13 +08:00
|
|
|
xsk_map_sock_delete(xs, map_entry);
|
|
|
|
}
|
|
|
|
spin_unlock_bh(&map->lock);
|
|
|
|
}
|
|
|
|
|
2020-08-28 09:18:13 +08:00
|
|
|
static bool xsk_map_meta_equal(const struct bpf_map *meta0,
|
|
|
|
const struct bpf_map *meta1)
|
|
|
|
{
|
|
|
|
return meta0->max_entries == meta1->max_entries &&
|
|
|
|
bpf_map_meta_equal(meta0, meta1);
|
|
|
|
}
|
|
|
|
|
2022-04-25 21:32:47 +08:00
|
|
|
BTF_ID_LIST_SINGLE(xsk_map_btf_ids, struct, xsk_map)
|
2018-05-02 19:01:28 +08:00
|
|
|
const struct bpf_map_ops xsk_map_ops = {
|
2020-08-28 09:18:13 +08:00
|
|
|
.map_meta_equal = xsk_map_meta_equal,
|
2018-05-02 19:01:28 +08:00
|
|
|
.map_alloc = xsk_map_alloc,
|
|
|
|
.map_free = xsk_map_free,
|
|
|
|
.map_get_next_key = xsk_map_get_next_key,
|
|
|
|
.map_lookup_elem = xsk_map_lookup_elem,
|
2019-11-01 19:03:45 +08:00
|
|
|
.map_gen_lookup = xsk_map_gen_lookup,
|
2019-06-07 04:59:40 +08:00
|
|
|
.map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only,
|
2018-05-02 19:01:28 +08:00
|
|
|
.map_update_elem = xsk_map_update_elem,
|
|
|
|
.map_delete_elem = xsk_map_delete_elem,
|
2018-08-12 07:59:17 +08:00
|
|
|
.map_check_btf = map_check_no_btf,
|
2023-03-05 20:46:13 +08:00
|
|
|
.map_mem_usage = xsk_map_mem_usage,
|
2022-04-25 21:32:47 +08:00
|
|
|
.map_btf_id = &xsk_map_btf_ids[0],
|
2021-03-08 19:29:06 +08:00
|
|
|
.map_redirect = xsk_map_redirect,
|
2018-05-02 19:01:28 +08:00
|
|
|
};
|