OpenCloudOS-Kernel/kernel/bpf/xskmap.c

329 lines
7.5 KiB
C
Raw Permalink Normal View History

// SPDX-License-Identifier: GPL-2.0
/* XSKMAP used for AF_XDP sockets
* Copyright(c) 2018 Intel Corporation.
*/
#include <linux/bpf.h>
#include <linux/capability.h>
#include <net/xdp_sock.h>
#include <linux/slab.h>
#include <linux/sched.h>
struct xsk_map {
struct bpf_map map;
struct list_head __percpu *flush_list;
spinlock_t lock; /* Synchronize map updates */
struct xdp_sock *xsk_map[];
};
int xsk_map_inc(struct xsk_map *map)
{
bpf_map_inc(&map->map);
return 0;
}
void xsk_map_put(struct xsk_map *map)
{
bpf_map_put(&map->map);
}
static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
struct xdp_sock **map_entry)
{
struct xsk_map_node *node;
int err;
node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN);
if (!node)
return ERR_PTR(-ENOMEM);
err = xsk_map_inc(map);
if (err) {
kfree(node);
return ERR_PTR(err);
}
node->map = map;
node->map_entry = map_entry;
return node;
}
static void xsk_map_node_free(struct xsk_map_node *node)
{
xsk_map_put(node->map);
kfree(node);
}
static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node)
{
spin_lock_bh(&xs->map_list_lock);
list_add_tail(&node->node, &xs->map_list);
spin_unlock_bh(&xs->map_list_lock);
}
static void xsk_map_sock_delete(struct xdp_sock *xs,
struct xdp_sock **map_entry)
{
struct xsk_map_node *n, *tmp;
spin_lock_bh(&xs->map_list_lock);
list_for_each_entry_safe(n, tmp, &xs->map_list, node) {
if (map_entry == n->map_entry) {
list_del(&n->node);
xsk_map_node_free(n);
}
}
spin_unlock_bh(&xs->map_list_lock);
}
static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
{
struct bpf_map_memory mem;
int cpu, err, numa_node;
struct xsk_map *m;
u64 cost, size;
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
if (attr->max_entries == 0 || attr->key_size != 4 ||
attr->value_size != 4 ||
attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
return ERR_PTR(-EINVAL);
numa_node = bpf_map_attr_numa_node(attr);
size = struct_size(m, xsk_map, attr->max_entries);
cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus());
err = bpf_map_charge_init(&mem, cost);
if (err < 0)
return ERR_PTR(err);
m = bpf_map_area_alloc(size, numa_node);
if (!m) {
bpf_map_charge_finish(&mem);
return ERR_PTR(-ENOMEM);
}
bpf_map_init_from_attr(&m->map, attr);
bpf_map_charge_move(&m->map.memory, &mem);
spin_lock_init(&m->lock);
m->flush_list = alloc_percpu(struct list_head);
if (!m->flush_list) {
bpf_map_charge_finish(&m->map.memory);
bpf_map_area_free(m);
return ERR_PTR(-ENOMEM);
}
for_each_possible_cpu(cpu)
INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
return &m->map;
}
static void xsk_map_free(struct bpf_map *map)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
bpf: fix redirect to map under tail calls Commits 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") and 7c3001313396 ("bpf: fix ri->map_owner pointer on bpf_prog_realloc") tried to mitigate that buggy programs using bpf_redirect_map() helper call do not leave stale maps behind. Idea was to add a map_owner cookie into the per CPU struct redirect_info which was set to prog->aux by the prog making the helper call as a proof that the map is not stale since the prog is implicitly holding a reference to it. This owner cookie could later on get compared with the program calling into BPF whether they match and therefore the redirect could proceed with processing the map safely. In (obvious) hindsight, this approach breaks down when tail calls are involved since the original caller's prog->aux pointer does not have to match the one from one of the progs out of the tail call chain, and therefore the xdp buffer will be dropped instead of redirected. A way around that would be to fix the issue differently (which also allows to remove related work in fast path at the same time): once the life-time of a redirect map has come to its end we use it's map free callback where we need to wait on synchronize_rcu() for current outstanding xdp buffers and remove such a map pointer from the redirect info if found to be present. At that time no program is using this map anymore so we simply invalidate the map pointers to NULL iff they previously pointed to that instance while making sure that the redirect path only reads out the map once. Fixes: 97f91a7cf04f ("bpf: add bpf_redirect_map helper routine") Fixes: 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") Reported-by: Sebastiano Miano <sebastiano.miano@polito.it> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: John Fastabend <john.fastabend@gmail.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2018-08-18 05:26:14 +08:00
bpf_clear_redirect_map(map);
synchronize_net();
free_percpu(m->flush_list);
bpf_map_area_free(m);
}
static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
u32 index = key ? *(u32 *)key : U32_MAX;
u32 *next = next_key;
if (index >= m->map.max_entries) {
*next = 0;
return 0;
}
if (index == m->map.max_entries - 1)
return -ENOENT;
*next = index + 1;
return 0;
}
struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
struct xdp_sock *xs;
if (key >= map->max_entries)
return NULL;
xs = READ_ONCE(m->xsk_map[key]);
return xs;
}
static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2;
struct bpf_insn *insn = insn_buf;
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *)));
*insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map));
*insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp);
*insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0);
*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
*insn++ = BPF_MOV64_IMM(ret, 0);
return insn - insn_buf;
}
int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
struct xdp_sock *xs)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
struct list_head *flush_list = this_cpu_ptr(m->flush_list);
int err;
err = xsk_rcv(xs, xdp);
if (err)
return err;
if (!xs->flush_node.prev)
list_add(&xs->flush_node, flush_list);
return 0;
}
void __xsk_map_flush(struct bpf_map *map)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
struct list_head *flush_list = this_cpu_ptr(m->flush_list);
struct xdp_sock *xs, *tmp;
list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
xsk_flush(xs);
__list_del_clearprev(&xs->flush_node);
}
}
static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
{
WARN_ON_ONCE(!rcu_read_lock_held());
return __xsk_map_lookup_elem(map, *(u32 *)key);
}
static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
{
return ERR_PTR(-EOPNOTSUPP);
}
static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
struct xdp_sock *xs, *old_xs, **map_entry;
u32 i = *(u32 *)key, fd = *(u32 *)value;
struct xsk_map_node *node;
struct socket *sock;
int err;
if (unlikely(map_flags > BPF_EXIST))
return -EINVAL;
if (unlikely(i >= m->map.max_entries))
return -E2BIG;
sock = sockfd_lookup(fd, &err);
if (!sock)
return err;
if (sock->sk->sk_family != PF_XDP) {
sockfd_put(sock);
return -EOPNOTSUPP;
}
xs = (struct xdp_sock *)sock->sk;
if (!xsk_is_setup_for_bpf_map(xs)) {
sockfd_put(sock);
return -EOPNOTSUPP;
}
map_entry = &m->xsk_map[i];
node = xsk_map_node_alloc(m, map_entry);
if (IS_ERR(node)) {
sockfd_put(sock);
return PTR_ERR(node);
}
spin_lock_bh(&m->lock);
old_xs = READ_ONCE(*map_entry);
if (old_xs == xs) {
err = 0;
goto out;
} else if (old_xs && map_flags == BPF_NOEXIST) {
err = -EEXIST;
goto out;
} else if (!old_xs && map_flags == BPF_EXIST) {
err = -ENOENT;
goto out;
}
xsk_map_sock_add(xs, node);
WRITE_ONCE(*map_entry, xs);
if (old_xs)
xsk_map_sock_delete(old_xs, map_entry);
spin_unlock_bh(&m->lock);
sockfd_put(sock);
return 0;
out:
spin_unlock_bh(&m->lock);
sockfd_put(sock);
xsk_map_node_free(node);
return err;
}
static int xsk_map_delete_elem(struct bpf_map *map, void *key)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
struct xdp_sock *old_xs, **map_entry;
int k = *(u32 *)key;
if (k >= map->max_entries)
return -EINVAL;
spin_lock_bh(&m->lock);
map_entry = &m->xsk_map[k];
old_xs = xchg(map_entry, NULL);
if (old_xs)
xsk_map_sock_delete(old_xs, map_entry);
spin_unlock_bh(&m->lock);
return 0;
}
void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
struct xdp_sock **map_entry)
{
spin_lock_bh(&map->lock);
if (READ_ONCE(*map_entry) == xs) {
WRITE_ONCE(*map_entry, NULL);
xsk_map_sock_delete(xs, map_entry);
}
spin_unlock_bh(&map->lock);
}
static int xsk_map_btf_id;
const struct bpf_map_ops xsk_map_ops = {
.map_alloc = xsk_map_alloc,
.map_free = xsk_map_free,
.map_get_next_key = xsk_map_get_next_key,
.map_lookup_elem = xsk_map_lookup_elem,
.map_gen_lookup = xsk_map_gen_lookup,
.map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only,
.map_update_elem = xsk_map_update_elem,
.map_delete_elem = xsk_map_delete_elem,
.map_check_btf = map_check_no_btf,
.map_btf_name = "xsk_map",
.map_btf_id = &xsk_map_btf_id,
};