OpenCloudOS-Kernel/kernel/bpf/xskmap.c

// SPDX-License-Identifier: GPL-2.0
/* XSKMAP used for AF_XDP sockets
 * Copyright(c) 2018 Intel Corporation.
 */

#include <linux/bpf.h>
#include <linux/capability.h>
#include <net/xdp_sock.h>
#include <linux/slab.h>
#include <linux/sched.h>

/* BPF map whose slots hold AF_XDP sockets, indexed by a u32 key. */
struct xsk_map {
	/* Generic map header; container_of() recovers the xsk_map from it. */
	struct bpf_map map;
	/* Array of max_entries socket pointers; an unused slot is NULL. */
	struct xdp_sock **xsk_map;
	/* Per-CPU list of sockets queued by redirect and awaiting xsk_flush(). */
	struct list_head __percpu *flush_list;
};

static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
{
	int cpu, err = -EINVAL;
	struct xsk_map *m;
	u64 cost;

	if (!capable(CAP_NET_ADMIN))
		return ERR_PTR(-EPERM);

	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size != 4 ||
	    attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
		return ERR_PTR(-EINVAL);

	m = kzalloc(sizeof(*m), GFP_USER);
	if (!m)
		return ERR_PTR(-ENOMEM);

	bpf_map_init_from_attr(&m->map, attr);

	cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
	cost += sizeof(struct list_head) * num_possible_cpus();
	if (cost >= U32_MAX - PAGE_SIZE)
		goto free_m;

	m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	/* Notice returns -EPERM on if map size is larger than memlock limit */
	err = bpf_map_precharge_memlock(m->map.pages);
	if (err)
		goto free_m;

	err = -ENOMEM;

	m->flush_list = alloc_percpu(struct list_head);
	if (!m->flush_list)
		goto free_m;

	for_each_possible_cpu(cpu)
		INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));

	m->xsk_map = bpf_map_area_alloc(m->map.max_entries *
					sizeof(struct xdp_sock *),
					m->map.numa_node);
	if (!m->xsk_map)
		goto free_percpu;
	return &m->map;

free_percpu:
	free_percpu(m->flush_list);
free_m:
	kfree(m);
	return ERR_PTR(err);
}

/*
 * Tear down an XSKMAP.
 *
 * Clears the map from redirect state via bpf_clear_redirect_map(), calls
 * synchronize_net(), drops the reference held on every socket still stored
 * in the map, and finally frees the per-CPU lists, the slot array and the
 * map itself.
 */
static void xsk_map_free(struct bpf_map *map)
{
	struct xsk_map *xmap = container_of(map, struct xsk_map, map);
	int slot;

	bpf_clear_redirect_map(map);
	synchronize_net();

	for (slot = 0; slot < map->max_entries; slot++) {
		struct xdp_sock *xs = xmap->xsk_map[slot];

		/* Empty slots hold NULL and carry no reference. */
		if (xs)
			sock_put((struct sock *)xs);
	}

	free_percpu(xmap->flush_list);
	bpf_map_area_free(xmap->xsk_map);
	kfree(xmap);
}

static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct xsk_map *m = container_of(map, struct xsk_map, map);
	u32 index = key ? *(u32 *)key : U32_MAX;
	u32 *next = next_key;

	if (index >= m->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == m->map.max_entries - 1)
		return -ENOENT;
	*next = index + 1;
	return 0;
}

/*
 * Fetch the socket stored at index key.
 *
 * Returns the slot's current value, read once with READ_ONCE(), which is
 * NULL for an empty slot; also returns NULL when key is out of range.
 */
struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
{
	struct xsk_map *xmap = container_of(map, struct xsk_map, map);

	if (key < map->max_entries)
		return READ_ONCE(xmap->xsk_map[key]);

	return NULL;
}

/*
 * Hand an XDP buffer to socket xs and queue the socket for a later flush.
 *
 * The buffer is passed to xsk_rcv(); on success the socket is added to this
 * CPU's flush list unless it is already linked there.
 *
 * Returns 0 on success or the error from xsk_rcv().
 */
int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
		       struct xdp_sock *xs)
{
	struct xsk_map *xmap = container_of(map, struct xsk_map, map);
	struct list_head *pending = this_cpu_ptr(xmap->flush_list);
	int ret = xsk_rcv(xs, xdp);

	/* A NULL prev pointer marks a socket that is not on a flush list. */
	if (!ret && !xs->flush_node.prev)
		list_add(&xs->flush_node, pending);

	return ret;
}

/*
 * Flush every socket queued on this CPU's flush list.
 *
 * Each socket is flushed with xsk_flush() and then unlinked; its prev
 * pointer is reset to NULL so __xsk_map_redirect() can queue it again.
 */
void __xsk_map_flush(struct bpf_map *map)
{
	struct xsk_map *xmap = container_of(map, struct xsk_map, map);
	struct list_head *pending = this_cpu_ptr(xmap->flush_list);
	struct xdp_sock *cur, *next;

	list_for_each_entry_safe(cur, next, pending, flush_node) {
		struct list_head *node = &cur->flush_node;

		xsk_flush(cur);
		__list_del(node->prev, node->next);
		node->prev = NULL;
	}
}

/*
 * map_lookup_elem callback for this map type.
 *
 * Always returns NULL, whatever the key; the arguments are not used.
 */
static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
{
	(void)map;
	(void)key;

	return NULL;
}

/*
 * Store the AF_XDP socket referred to by the file descriptor in *value at
 * index *key, replacing any socket already there.
 *
 * The map takes its own reference on the new socket with sock_hold(). If a
 * previous socket occupied the slot, synchronize_net() is called before the
 * map's reference on it is dropped.
 *
 * Returns 0 on success, -EINVAL for flags above BPF_EXIST, -E2BIG for an
 * out-of-range index, -EEXIST for BPF_NOEXIST, the error reported by
 * sockfd_lookup() for a bad descriptor, or -EOPNOTSUPP when the socket is
 * not PF_XDP or fails xsk_is_setup_for_bpf_map().
 */
static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
			       u64 map_flags)
{
	struct xsk_map *xmap = container_of(map, struct xsk_map, map);
	u32 idx = *(u32 *)key;
	u32 fd = *(u32 *)value;
	struct xdp_sock *new_xs, *prev_xs;
	struct socket *sock;
	int lookup_err, ret;

	if (unlikely(map_flags > BPF_EXIST))
		return -EINVAL;
	if (unlikely(idx >= xmap->map.max_entries))
		return -E2BIG;
	if (unlikely(map_flags == BPF_NOEXIST))
		return -EEXIST;

	sock = sockfd_lookup(fd, &lookup_err);
	if (!sock)
		return lookup_err;

	/* Both rejection paths below report the same error. */
	ret = -EOPNOTSUPP;
	if (sock->sk->sk_family != PF_XDP)
		goto out_put_fd;

	new_xs = (struct xdp_sock *)sock->sk;
	if (!xsk_is_setup_for_bpf_map(new_xs))
		goto out_put_fd;

	/* Take the map's reference before the socket becomes visible. */
	sock_hold(sock->sk);

	prev_xs = xchg(&xmap->xsk_map[idx], new_xs);
	if (prev_xs) {
		/* Make sure we've flushed everything. */
		synchronize_net();
		sock_put((struct sock *)prev_xs);
	}

	ret = 0;

out_put_fd:
	sockfd_put(sock);
	return ret;
}

static int xsk_map_delete_elem(struct bpf_map *map, void *key)
{
	struct xsk_map *m = container_of(map, struct xsk_map, map);
	struct xdp_sock *old_xs;
	int k = *(u32 *)key;

	if (k >= map->max_entries)
		return -EINVAL;

	old_xs = xchg(&m->xsk_map[k], NULL);
	if (old_xs) {
		/* Make sure we've flushed everything. */
		synchronize_net();
		sock_put((struct sock *)old_xs);
	}

	return 0;
}

/* Map operations table for the XSKMAP type; BTF is rejected via map_check_no_btf. */
const struct bpf_map_ops xsk_map_ops = {
	.map_alloc		= xsk_map_alloc,
	.map_free		= xsk_map_free,
	.map_get_next_key	= xsk_map_get_next_key,
	.map_lookup_elem	= xsk_map_lookup_elem,
	.map_update_elem	= xsk_map_update_elem,
	.map_delete_elem	= xsk_map_delete_elem,
	.map_check_btf		= map_check_no_btf,
};
bpf: introduce new bpf AF_XDP map type BPF_MAP_TYPE_XSKMAP The xskmap is yet another BPF map, very much inspired by dev/cpu/sockmap, and is a holder of AF_XDP sockets. A user application adds AF_XDP sockets into the map, and by using the bpf_redirect_map helper, an XDP program can redirect XDP frames to an AF_XDP socket. Note that a socket that is bound to certain ifindex/queue index will only accept XDP frames from that netdev/queue index. If an XDP program tries to redirect from a netdev/queue index other than what the socket is bound to, the frame will not be received on the socket. A socket can reside in multiple maps. v3: Fixed race and simplified code. v2: Removed one indirection in map lookup. Signed-off-by: Björn Töpel <bjorn.topel@intel.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> 2018-05-02 19:01:28 +08:00			`// SPDX-License-Identifier: GPL-2.0`
			`/* XSKMAP used for AF_XDP sockets`
			`* Copyright(c) 2018 Intel Corporation.`
			`*/`

			`#include <linux/bpf.h>`
			`#include <linux/capability.h>`
			`#include <net/xdp_sock.h>`
			`#include <linux/slab.h>`
			`#include <linux/sched.h>`

			`struct xsk_map {`
			`struct bpf_map map;`
			`struct xdp_sock **xsk_map;`
			`struct list_head __percpu *flush_list;`
			`};`

			`static struct bpf_map xsk_map_alloc(union bpf_attr attr)`
			`{`
			`int cpu, err = -EINVAL;`
			`struct xsk_map *m;`
			`u64 cost;`

			`if (!capable(CAP_NET_ADMIN))`
			`return ERR_PTR(-EPERM);`

			`if (attr->max_entries == 0 \|\| attr->key_size != 4 \|\|`
			`attr->value_size != 4 \|\|`
			`attr->map_flags & ~(BPF_F_NUMA_NODE \| BPF_F_RDONLY \| BPF_F_WRONLY))`
			`return ERR_PTR(-EINVAL);`

			`m = kzalloc(sizeof(*m), GFP_USER);`
			`if (!m)`
			`return ERR_PTR(-ENOMEM);`

			`bpf_map_init_from_attr(&m->map, attr);`

			`cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);`
			`cost += sizeof(struct list_head) * num_possible_cpus();`
			`if (cost >= U32_MAX - PAGE_SIZE)`
			`goto free_m;`

			`m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;`

			`/* Notice returns -EPERM on if map size is larger than memlock limit */`
			`err = bpf_map_precharge_memlock(m->map.pages);`
			`if (err)`
			`goto free_m;`

bpf, xskmap: fix crash in xsk_map_alloc error path handling If bpf_map_precharge_memlock() did not fail, then we set err to zero. However, any subsequent failure from either alloc_percpu() or the bpf_map_area_alloc() will return ERR_PTR(0) which in find_and_alloc_map() will cause NULL pointer deref. In devmap we have the convention that we return -EINVAL on page count overflow, so keep the same logic here and just set err to -ENOMEM after successful bpf_map_precharge_memlock(). Fixes: fbfc504a24f5 ("bpf: introduce new bpf AF_XDP map type BPF_MAP_TYPE_XSKMAP") Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Cc: Björn Töpel <bjorn.topel@intel.com> Acked-by: David S. Miller <davem@davemloft.net> Signed-off-by: Alexei Starovoitov <ast@kernel.org> 2018-05-04 22:27:53 +08:00			`err = -ENOMEM;`

bpf: introduce new bpf AF_XDP map type BPF_MAP_TYPE_XSKMAP The xskmap is yet another BPF map, very much inspired by dev/cpu/sockmap, and is a holder of AF_XDP sockets. A user application adds AF_XDP sockets into the map, and by using the bpf_redirect_map helper, an XDP program can redirect XDP frames to an AF_XDP socket. Note that a socket that is bound to certain ifindex/queue index will only accept XDP frames from that netdev/queue index. If an XDP program tries to redirect from a netdev/queue index other than what the socket is bound to, the frame will not be received on the socket. A socket can reside in multiple maps. v3: Fixed race and simplified code. v2: Removed one indirection in map lookup. Signed-off-by: Björn Töpel <bjorn.topel@intel.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> 2018-05-02 19:01:28 +08:00			`m->flush_list = alloc_percpu(struct list_head);`
			`if (!m->flush_list)`
			`goto free_m;`

			`for_each_possible_cpu(cpu)`
			`INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));`

			`m->xsk_map = bpf_map_area_alloc(m->map.max_entries *`
			`sizeof(struct xdp_sock *),`
			`m->map.numa_node);`
			`if (!m->xsk_map)`
			`goto free_percpu;`
			`return &m->map;`

			`free_percpu:`
			`free_percpu(m->flush_list);`
			`free_m:`
			`kfree(m);`
			`return ERR_PTR(err);`
			`}`

			`static void xsk_map_free(struct bpf_map *map)`
			`{`
			`struct xsk_map *m = container_of(map, struct xsk_map, map);`
			`int i;`

bpf: fix redirect to map under tail calls Commits 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") and 7c3001313396 ("bpf: fix ri->map_owner pointer on bpf_prog_realloc") tried to mitigate that buggy programs using bpf_redirect_map() helper call do not leave stale maps behind. Idea was to add a map_owner cookie into the per CPU struct redirect_info which was set to prog->aux by the prog making the helper call as a proof that the map is not stale since the prog is implicitly holding a reference to it. This owner cookie could later on get compared with the program calling into BPF whether they match and therefore the redirect could proceed with processing the map safely. In (obvious) hindsight, this approach breaks down when tail calls are involved since the original caller's prog->aux pointer does not have to match the one from one of the progs out of the tail call chain, and therefore the xdp buffer will be dropped instead of redirected. A way around that would be to fix the issue differently (which also allows to remove related work in fast path at the same time): once the life-time of a redirect map has come to its end we use its map free callback where we need to wait on synchronize_rcu() for current outstanding xdp buffers and remove such a map pointer from the redirect info if found to be present. At that time no program is using this map anymore so we simply invalidate the map pointers to NULL iff they previously pointed to that instance while making sure that the redirect path only reads out the map once. 
Fixes: 97f91a7cf04f ("bpf: add bpf_redirect_map helper routine") Fixes: 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") Reported-by: Sebastiano Miano <sebastiano.miano@polito.it> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: John Fastabend <john.fastabend@gmail.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> 2018-08-18 05:26:14 +08:00			`bpf_clear_redirect_map(map);`
bpf: introduce new bpf AF_XDP map type BPF_MAP_TYPE_XSKMAP The xskmap is yet another BPF map, very much inspired by dev/cpu/sockmap, and is a holder of AF_XDP sockets. A user application adds AF_XDP sockets into the map, and by using the bpf_redirect_map helper, an XDP program can redirect XDP frames to an AF_XDP socket. Note that a socket that is bound to certain ifindex/queue index will only accept XDP frames from that netdev/queue index. If an XDP program tries to redirect from a netdev/queue index other than what the socket is bound to, the frame will not be received on the socket. A socket can reside in multiple maps. v3: Fixed race and simplified code. v2: Removed one indirection in map lookup. Signed-off-by: Björn Töpel <bjorn.topel@intel.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> 2018-05-02 19:01:28 +08:00			`synchronize_net();`

			`for (i = 0; i < map->max_entries; i++) {`
			`struct xdp_sock *xs;`

			`xs = m->xsk_map[i];`
			`if (!xs)`
			`continue;`

			`sock_put((struct sock *)xs);`
			`}`

			`free_percpu(m->flush_list);`
			`bpf_map_area_free(m->xsk_map);`
			`kfree(m);`
			`}`

			`static int xsk_map_get_next_key(struct bpf_map map, void key, void *next_key)`
			`{`
			`struct xsk_map *m = container_of(map, struct xsk_map, map);`
			`u32 index = key ? (u32 )key : U32_MAX;`
			`u32 *next = next_key;`

			`if (index >= m->map.max_entries) {`
			`*next = 0;`
			`return 0;`
			`}`

			`if (index == m->map.max_entries - 1)`
			`return -ENOENT;`
			`*next = index + 1;`
			`return 0;`
			`}`

			`struct xdp_sock __xsk_map_lookup_elem(struct bpf_map map, u32 key)`
			`{`
			`struct xsk_map *m = container_of(map, struct xsk_map, map);`
			`struct xdp_sock *xs;`

			`if (key >= map->max_entries)`
			`return NULL;`

			`xs = READ_ONCE(m->xsk_map[key]);`
			`return xs;`
			`}`

			`int __xsk_map_redirect(struct bpf_map map, struct xdp_buff xdp,`
			`struct xdp_sock *xs)`
			`{`
			`struct xsk_map *m = container_of(map, struct xsk_map, map);`
			`struct list_head *flush_list = this_cpu_ptr(m->flush_list);`
			`int err;`

			`err = xsk_rcv(xs, xdp);`
			`if (err)`
			`return err;`

			`if (!xs->flush_node.prev)`
			`list_add(&xs->flush_node, flush_list);`

			`return 0;`
			`}`

			`void __xsk_map_flush(struct bpf_map *map)`
			`{`
			`struct xsk_map *m = container_of(map, struct xsk_map, map);`
			`struct list_head *flush_list = this_cpu_ptr(m->flush_list);`
			`struct xdp_sock xs, tmp;`

			`list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {`
			`xsk_flush(xs);`
			`__list_del(xs->flush_node.prev, xs->flush_node.next);`
			`xs->flush_node.prev = NULL;`
			`}`
			`}`

			`static void xsk_map_lookup_elem(struct bpf_map map, void *key)`
			`{`
			`return NULL;`
			`}`

			`static int xsk_map_update_elem(struct bpf_map map, void key, void *value,`
			`u64 map_flags)`
			`{`
			`struct xsk_map *m = container_of(map, struct xsk_map, map);`
			`u32 i = (u32 )key, fd = (u32 )value;`
			`struct xdp_sock xs, old_xs;`
			`struct socket *sock;`
			`int err;`

			`if (unlikely(map_flags > BPF_EXIST))`
			`return -EINVAL;`
			`if (unlikely(i >= m->map.max_entries))`
			`return -E2BIG;`
			`if (unlikely(map_flags == BPF_NOEXIST))`
			`return -EEXIST;`

			`sock = sockfd_lookup(fd, &err);`
			`if (!sock)`
			`return err;`

			`if (sock->sk->sk_family != PF_XDP) {`
			`sockfd_put(sock);`
			`return -EOPNOTSUPP;`
			`}`

			`xs = (struct xdp_sock *)sock->sk;`

			`if (!xsk_is_setup_for_bpf_map(xs)) {`
			`sockfd_put(sock);`
			`return -EOPNOTSUPP;`
			`}`

			`sock_hold(sock->sk);`

			`old_xs = xchg(&m->xsk_map[i], xs);`
			`if (old_xs) {`
			`/* Make sure we've flushed everything. */`
			`synchronize_net();`
			`sock_put((struct sock *)old_xs);`
			`}`

			`sockfd_put(sock);`
			`return 0;`
			`}`

			`static int xsk_map_delete_elem(struct bpf_map map, void key)`
			`{`
			`struct xsk_map *m = container_of(map, struct xsk_map, map);`
			`struct xdp_sock *old_xs;`
			`int k = (u32 )key;`

			`if (k >= map->max_entries)`
			`return -EINVAL;`

			`old_xs = xchg(&m->xsk_map[k], NULL);`
			`if (old_xs) {`
			`/* Make sure we've flushed everything. */`
			`synchronize_net();`
			`sock_put((struct sock *)old_xs);`
			`}`

			`return 0;`
			`}`

			`const struct bpf_map_ops xsk_map_ops = {`
			`.map_alloc = xsk_map_alloc,`
			`.map_free = xsk_map_free,`
			`.map_get_next_key = xsk_map_get_next_key,`
			`.map_lookup_elem = xsk_map_lookup_elem,`
			`.map_update_elem = xsk_map_update_elem,`
			`.map_delete_elem = xsk_map_delete_elem,`
bpf: decouple btf from seq bpf fs dump and enable more maps Commit a26ca7c982cb ("bpf: btf: Add pretty print support to the basic arraymap") and 699c86d6ec21 ("bpf: btf: add pretty print for hash/lru_hash maps") enabled support for BTF and dumping via BPF fs for array and hash/lru map. However, both can be decoupled from each other such that regular BPF maps can be supported for attaching BTF key/value information, while not all maps necessarily need to dump via map_seq_show_elem() callback. The basic sanity check which is a prerequisite for all maps is that key/value size has to match in any case, and some maps can have extra checks via map_check_btf() callback, e.g. probing certain types or indicating no support in general. With that we can also enable retrieving BTF info for per-cpu map types and lpm. Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Alexei Starovoitov <ast@kernel.org> Acked-by: Yonghong Song <yhs@fb.com> 2018-08-12 07:59:17 +08:00			`.map_check_btf = map_check_no_btf,`
bpf: introduce new bpf AF_XDP map type BPF_MAP_TYPE_XSKMAP The xskmap is yet another BPF map, very much inspired by dev/cpu/sockmap, and is a holder of AF_XDP sockets. A user application adds AF_XDP sockets into the map, and by using the bpf_redirect_map helper, an XDP program can redirect XDP frames to an AF_XDP socket. Note that a socket that is bound to certain ifindex/queue index will only accept XDP frames from that netdev/queue index. If an XDP program tries to redirect from a netdev/queue index other than what the socket is bound to, the frame will not be received on the socket. A socket can reside in multiple maps. v3: Fixed race and simplified code. v2: Removed one indirection in map lookup. Signed-off-by: Björn Töpel <bjorn.topel@intel.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> 2018-05-02 19:01:28 +08:00			`};`