From 04919bed948dc22a0032a9da867b7dcb8aece4ca Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 15 Jun 2022 09:20:11 -0700 Subject: [PATCH 01/94] tcp: Introduce tcp_read_skb() This patch inroduces tcp_read_skb() based on tcp_read_sock(), a preparation for the next patch which actually introduces a new sock ops. TCP is special here, because it has tcp_read_sock() which is mainly used by splice(). tcp_read_sock() supports partial read and arbitrary offset, neither of them is needed for sockmap. Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20220615162014.89193-2-xiyou.wangcong@gmail.com --- include/net/tcp.h | 2 ++ net/ipv4/tcp.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/include/net/tcp.h b/include/net/tcp.h index c21a9b516f1e..7547d90fbb57 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -672,6 +672,8 @@ void tcp_get_info(struct sock *, struct tcp_info *); /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); +int tcp_read_skb(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor); void tcp_initialize_rcv_mss(struct sock *sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f7309452bdce..124f384f8695 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1734,6 +1734,53 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, } EXPORT_SYMBOL(tcp_read_sock); +int tcp_read_skb(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 seq = tp->copied_seq; + struct sk_buff *skb; + int copied = 0; + u32 offset; + + if (sk->sk_state == TCP_LISTEN) + return -ENOTCONN; + + while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { + int used; + + __skb_unlink(skb, &sk->sk_receive_queue); + used = recv_actor(desc, skb, 0, skb->len); + if (used <= 0) { + if (!copied) + copied = used; + break; + } + seq += used; + copied += used; + + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { + consume_skb(skb); + ++seq; + break; + } + consume_skb(skb); + if (!desc->count) + break; + WRITE_ONCE(tp->copied_seq, seq); + } + WRITE_ONCE(tp->copied_seq, seq); + + tcp_rcv_space_adjust(sk); + + /* Clean up data we have read: This will do ACK frames. */ + if (copied > 0) + tcp_cleanup_rbuf(sk, copied); + + return copied; +} +EXPORT_SYMBOL(tcp_read_skb); + int tcp_peek_len(struct socket *sock) { return tcp_inq(sock->sk); From 965b57b469a589d64d81b1688b38dcb537011bb0 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 15 Jun 2022 09:20:12 -0700 Subject: [PATCH 02/94] net: Introduce a new proto_ops ->read_skb() Currently both splice() and sockmap use ->read_sock() to read skb from receive queue, but for sockmap we only read one entire skb at a time, so ->read_sock() is too conservative to use. Introduce a new proto_ops ->read_skb() which supports this sematic, with this we can finally pass the ownership of skb to recv actors. For non-TCP protocols, all ->read_sock() can be simply converted to ->read_skb(). Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20220615162014.89193-3-xiyou.wangcong@gmail.com --- include/linux/net.h | 4 ++++ include/net/tcp.h | 3 +-- include/net/udp.h | 3 +-- net/core/skmsg.c | 20 +++++--------------- net/ipv4/af_inet.c | 3 ++- net/ipv4/tcp.c | 9 +++------ net/ipv4/udp.c | 10 ++++------ net/ipv6/af_inet6.c | 3 ++- net/unix/af_unix.c | 23 +++++++++-------------- 9 files changed, 31 insertions(+), 47 deletions(-) diff --git a/include/linux/net.h b/include/linux/net.h index 12093f4db50c..a03485e8cbb2 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -152,6 +152,8 @@ struct module; struct sk_buff; typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, unsigned int, size_t); +typedef int (*skb_read_actor_t)(struct sock *, struct sk_buff *); + struct proto_ops { int family; @@ -214,6 +216,8 @@ struct proto_ops { */ int (*read_sock)(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); + /* This is different from read_sock(), it reads an entire skb at a time. */ + int (*read_skb)(struct sock *sk, skb_read_actor_t recv_actor); int (*sendpage_locked)(struct sock *sk, struct page *page, int offset, size_t size, int flags); int (*sendmsg_locked)(struct sock *sk, struct msghdr *msg, diff --git a/include/net/tcp.h b/include/net/tcp.h index 7547d90fbb57..8e48dc56837b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -672,8 +672,7 @@ void tcp_get_info(struct sock *, struct tcp_info *); /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); -int tcp_read_skb(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor); +int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor); void tcp_initialize_rcv_mss(struct sock *sk); diff --git a/include/net/udp.h b/include/net/udp.h index b60eea2e3fae..987f7fc7c0aa 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -306,8 +306,7 @@ struct sock *__udp6_lib_lookup(struct net *net, struct sk_buff *skb); struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport); -int udp_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor); +int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor); /* UDP uses skb->dev_scratch to cache as much information as possible and avoid * possibly multiple cache miss on dequeue() diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 7e03f96e441b..f7f63b7d990c 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1160,21 +1160,17 @@ static void sk_psock_done_strp(struct sk_psock *psock) } #endif /* CONFIG_BPF_STREAM_PARSER */ -static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, - unsigned int offset, size_t orig_len) +static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb) { - struct sock *sk = (struct sock *)desc->arg.data; struct sk_psock *psock; struct bpf_prog *prog; int ret = __SK_DROP; - int len = orig_len; + int len = skb->len; /* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */ skb = skb_clone(skb, GFP_ATOMIC); - if (!skb) { - desc->error = -ENOMEM; + if (!skb) return 0; - } rcu_read_lock(); psock = sk_psock(sk); @@ -1204,16 +1200,10 @@ out: static void sk_psock_verdict_data_ready(struct sock *sk) { struct socket *sock = sk->sk_socket; - read_descriptor_t desc; - if (unlikely(!sock || !sock->ops || !sock->ops->read_sock)) + if (unlikely(!sock || !sock->ops || !sock->ops->read_skb)) return; - - desc.arg.data = sk; - desc.error = 0; - desc.count = 1; - - sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv); + sock->ops->read_skb(sk, sk_psock_verdict_recv); } void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index da81f56fdd1c..7abd652a558f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1040,6 +1040,7 @@ const struct proto_ops inet_stream_ops = { .sendpage = inet_sendpage, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, + .read_skb = tcp_read_skb, .sendmsg_locked = tcp_sendmsg_locked, .sendpage_locked = tcp_sendpage_locked, .peek_len = tcp_peek_len, @@ -1067,7 +1068,7 @@ const struct proto_ops inet_dgram_ops = { .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, - .read_sock = udp_read_sock, + .read_skb = udp_read_skb, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = inet_sendpage, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 124f384f8695..9d2fd3ced21b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1734,8 +1734,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, } EXPORT_SYMBOL(tcp_read_sock); -int tcp_read_skb(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor) +int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct tcp_sock *tp = tcp_sk(sk); u32 seq = tp->copied_seq; @@ -1750,7 +1749,7 @@ int tcp_read_skb(struct sock *sk, read_descriptor_t *desc, int used; __skb_unlink(skb, &sk->sk_receive_queue); - used = recv_actor(desc, skb, 0, skb->len); + used = recv_actor(sk, skb); if (used <= 0) { if (!copied) copied = used; @@ -1765,9 +1764,7 @@ int tcp_read_skb(struct sock *sk, read_descriptor_t *desc, break; } consume_skb(skb); - if (!desc->count) - break; - WRITE_ONCE(tp->copied_seq, seq); + break; } WRITE_ONCE(tp->copied_seq, seq); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6172b4750a88..c660b0bc4d14 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1797,8 +1797,7 @@ busy_check: } EXPORT_SYMBOL(__skb_recv_udp); -int udp_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor) +int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { int copied = 0; @@ -1820,7 +1819,7 @@ int udp_read_sock(struct sock *sk, read_descriptor_t *desc, continue; } - used = recv_actor(desc, skb, 0, skb->len); + used = recv_actor(sk, skb); if (used <= 0) { if (!copied) copied = used; @@ -1831,13 +1830,12 @@ int udp_read_sock(struct sock *sk, read_descriptor_t *desc, } kfree_skb(skb); - if (!desc->count) - break; + break; } return copied; } -EXPORT_SYMBOL(udp_read_sock); +EXPORT_SYMBOL(udp_read_skb); /* * This should be easy, if there is something there we diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 658823e91eca..0ee0770e79aa 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -702,6 +702,7 @@ const struct proto_ops inet6_stream_ops = { .sendpage_locked = tcp_sendpage_locked, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, + .read_skb = tcp_read_skb, .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, @@ -727,7 +728,7 @@ const struct proto_ops inet6_dgram_ops = { .getsockopt = sock_common_getsockopt, /* ok */ .sendmsg = inet6_sendmsg, /* retpoline's sake */ .recvmsg = inet6_recvmsg, /* retpoline's sake */ - .read_sock = udp_read_sock, + .read_skb = udp_read_skb, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, .set_peek_off = sk_set_peek_off, diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3453e0053f76..1bed3739768c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -741,10 +741,8 @@ static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, unsigned int flags); static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); -static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor); -static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor); +static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor); +static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor); static int unix_dgram_connect(struct socket *, struct sockaddr *, int, int); static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); @@ -798,7 +796,7 @@ static const struct proto_ops unix_stream_ops = { .shutdown = unix_shutdown, .sendmsg = unix_stream_sendmsg, .recvmsg = unix_stream_recvmsg, - .read_sock = unix_stream_read_sock, + .read_skb = unix_stream_read_skb, .mmap = sock_no_mmap, .sendpage = unix_stream_sendpage, .splice_read = unix_stream_splice_read, @@ -823,7 +821,7 @@ static const struct proto_ops unix_dgram_ops = { .listen = sock_no_listen, .shutdown = unix_shutdown, .sendmsg = unix_dgram_sendmsg, - .read_sock = unix_read_sock, + .read_skb = unix_read_skb, .recvmsg = unix_dgram_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, @@ -2487,8 +2485,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t si return __unix_dgram_recvmsg(sk, msg, size, flags); } -static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor) +static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { int copied = 0; @@ -2503,7 +2500,7 @@ static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, if (!skb) return err; - used = recv_actor(desc, skb, 0, skb->len); + used = recv_actor(sk, skb); if (used <= 0) { if (!copied) copied = used; @@ -2514,8 +2511,7 @@ static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, } kfree_skb(skb); - if (!desc->count) - break; + break; } return copied; @@ -2650,13 +2646,12 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, } #endif -static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor) +static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { if (unlikely(sk->sk_state != TCP_ESTABLISHED)) return -ENOTCONN; - return unix_read_sock(sk, desc, recv_actor); + return unix_read_skb(sk, recv_actor); } static int unix_stream_read_generic(struct unix_stream_read_state *state, From 57452d767feaeab405de3bff0d240c3ac84bfe0d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 15 Jun 2022 09:20:13 -0700 Subject: [PATCH 03/94] skmsg: Get rid of skb_clone() With ->read_skb() now we have an entire skb dequeued from receive queue, now we just need to grab an addtional refcnt before passing its ownership to recv actors. And we should not touch them any more, particularly for skb->sk. Fortunately, skb->sk is already set for most of the protocols except UDP where skb->sk has been stolen, so we have to fix it up for UDP case. Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20220615162014.89193-4-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 7 +------ net/ipv4/udp.c | 1 + 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index f7f63b7d990c..8b248d289c11 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1167,10 +1167,7 @@ static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb) int ret = __SK_DROP; int len = skb->len; - /* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */ - skb = skb_clone(skb, GFP_ATOMIC); - if (!skb) - return 0; + skb_get(skb); rcu_read_lock(); psock = sk_psock(sk); @@ -1183,12 +1180,10 @@ static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb) if (!prog) prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { - skb->sk = sk; skb_dst_drop(skb); skb_bpf_redirect_clear(skb); ret = bpf_prog_run_pin_on_cpu(prog, skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); - skb->sk = NULL; } if (sk_psock_verdict_apply(psock, skb, ret) < 0) len = 0; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c660b0bc4d14..2516078aa03e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1819,6 +1819,7 @@ int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) continue; } + WARN_ON(!skb_set_owner_sk_safe(skb, sk)); used = recv_actor(sk, skb); if (used <= 0) { if (!copied) From 43312915b5ba20741617dd2119e835205fa8580c Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 15 Jun 2022 09:20:14 -0700 Subject: [PATCH 04/94] skmsg: Get rid of unncessary memset() We always allocate skmsg with kzalloc(), so there is no need to call memset(0) on it, the only thing we need from sk_msg_init() is sg_init_marker(). So introduce a new helper which is just kzalloc()+sg_init_marker(), this saves an unncessary memset(0) for skmsg on fast path. Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20220615162014.89193-5-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 8b248d289c11..4b297d67edb7 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -497,23 +497,27 @@ bool sk_msg_is_readable(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_msg_is_readable); -static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, - struct sk_buff *skb) +static struct sk_msg *alloc_sk_msg(void) { struct sk_msg *msg; + msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_KERNEL); + if (unlikely(!msg)) + return NULL; + sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS); + return msg; +} + +static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, + struct sk_buff *skb) +{ if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) return NULL; if (!sk_rmem_schedule(sk, skb, skb->truesize)) return NULL; - msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_KERNEL); - if (unlikely(!msg)) - return NULL; - - sk_msg_init(msg); - return msg; + return alloc_sk_msg(); } static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, @@ -590,13 +594,12 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len) { - struct sk_msg *msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC); + struct sk_msg *msg = alloc_sk_msg(); struct sock *sk = psock->sk; int err; if (unlikely(!msg)) return -EAGAIN; - sk_msg_init(msg); skb_set_owner_r(skb, sk); err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg); if (err < 0) From e068c0776b0bbd3fc5a283b0e0eaa1cbb9ef0e3d Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Mon, 20 Jun 2022 13:49:39 +0300 Subject: [PATCH 05/94] selftests/bpf: Enable config options needed for xdp_synproxy test This commit adds the kernel config options needed to run the recently added xdp_synproxy test. Users without these options will hit errors like this: test_synproxy:FAIL:iptables -t raw -I PREROUTING -i tmp1 -p tcp -m tcp --syn --dport 8080 -j CT --notrack unexpected error: 256 (errno 22) Suggested-by: Alexei Starovoitov Signed-off-by: Maxim Mikityanskiy Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220620104939.4094104-1-maximmi@nvidia.com --- tools/testing/selftests/bpf/config | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 3b3edc0fc8a6..c05904d631ec 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -57,3 +57,9 @@ CONFIG_FPROBE=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_MPTCP=y +CONFIG_NETFILTER_SYNPROXY=y +CONFIG_NETFILTER_XT_TARGET_CT=y +CONFIG_NETFILTER_XT_MATCH_STATE=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_TARGET_SYNPROXY=y +CONFIG_IP_NF_RAW=y From aca80dd95e20f1fa0daa212afc83c9fa0ad239e5 Mon Sep 17 00:00:00 2001 From: Delyan Kratunov Date: Mon, 20 Jun 2022 21:47:55 +0000 Subject: [PATCH 06/94] uprobe: gate bpf call behind BPF_EVENTS The call into bpf from uprobes needs to be gated now that it doesn't use the trace_events.h helpers. Randy found this as a randconfig build failure on linux-next [1]. [1]: https://lore.kernel.org/linux-next/2de99180-7d55-2fdf-134d-33198c27cc58@infradead.org/ Reported-by: Randy Dunlap Signed-off-by: Delyan Kratunov Tested-by: Randy Dunlap Acked-by: Randy Dunlap Link: https://lore.kernel.org/r/cb8bfbbcde87ed5d811227a393ef4925f2aadb7b.camel@fb.com Signed-off-by: Alexei Starovoitov --- kernel/trace/trace_uprobe.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 0282c119b1b2..326235fd2346 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1344,6 +1344,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, int size, esize; int rctx; +#ifdef CONFIG_BPF_EVENTS if (bpf_prog_array_valid(call)) { u32 ret; @@ -1351,6 +1352,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, if (!ret) return; } +#endif /* CONFIG_BPF_EVENTS */ esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); From 933ff53191eb7a8492370bad6339a1ca6da2d939 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Jun 2022 02:53:40 +0300 Subject: [PATCH 07/94] selftests/bpf: specify expected instructions in test_verifier tests Allows to specify expected and unexpected instruction sequences in test_verifier test cases. The instructions are requested from kernel after BPF program loading, thus allowing to check some of the transformations applied by BPF verifier. - `expected_insn` field specifies a sequence of instructions expected to be found in the program; - `unexpected_insn` field specifies a sequence of instructions that are not expected to be found in the program; - `INSN_OFF_MASK` and `INSN_IMM_MASK` values could be used to mask `off` and `imm` fields. - `SKIP_INSNS` could be used to specify that some instructions in the (un)expected pattern are not important (behavior similar to usage of `\t` in `errstr` field). The intended usage is as follows: { "inline simple bpf_loop call", .insns = { /* main */ BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 1), BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 6), ... BPF_EXIT_INSN(), /* callback */ BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 1), BPF_EXIT_INSN(), }, .expected_insns = { BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 1), SKIP_INSNS(), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_CALL, 8, 1) }, .unexpected_insns = { BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, INSN_OFF_MASK, INSN_IMM_MASK), }, .prog_type = BPF_PROG_TYPE_TRACEPOINT, .result = ACCEPT, .runs = 0, }, Here it is expected that move of 1 to register 1 would remain in place and helper function call instruction would be replaced by a relative call instruction. Signed-off-by: Eduard Zingerman Acked-by: Song Liu Link: https://lore.kernel.org/r/20220620235344.569325-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_verifier.c | 234 ++++++++++++++++++++ 1 file changed, 234 insertions(+) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 372579c9f45e..1f24eae9e16e 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -51,6 +51,8 @@ #endif #define MAX_INSNS BPF_MAXINSNS +#define MAX_EXPECTED_INSNS 32 +#define MAX_UNEXPECTED_INSNS 32 #define MAX_TEST_INSNS 1000000 #define MAX_FIXUPS 8 #define MAX_NR_MAPS 23 @@ -58,6 +60,10 @@ #define POINTER_VALUE 0xcafe4all #define TEST_DATA_LEN 64 +#define INSN_OFF_MASK ((__s16)0xFFFF) +#define INSN_IMM_MASK ((__s32)0xFFFFFFFF) +#define SKIP_INSNS() BPF_RAW_INSN(0xde, 0xa, 0xd, 0xbeef, 0xdeadbeef) + #define F_NEEDS_EFFICIENT_UNALIGNED_ACCESS (1 << 0) #define F_LOAD_WITH_STRICT_ALIGNMENT (1 << 1) @@ -79,6 +85,23 @@ struct bpf_test { const char *descr; struct bpf_insn insns[MAX_INSNS]; struct bpf_insn *fill_insns; + /* If specified, test engine looks for this sequence of + * instructions in the BPF program after loading. Allows to + * test rewrites applied by verifier. Use values + * INSN_OFF_MASK and INSN_IMM_MASK to mask `off` and `imm` + * fields if content does not matter. The test case fails if + * specified instructions are not found. + * + * The sequence could be split into sub-sequences by adding + * SKIP_INSNS instruction at the end of each sub-sequence. In + * such case sub-sequences are searched for one after another. + */ + struct bpf_insn expected_insns[MAX_EXPECTED_INSNS]; + /* If specified, test engine applies same pattern matching + * logic as for `expected_insns`. If the specified pattern is + * matched test case is marked as failed. + */ + struct bpf_insn unexpected_insns[MAX_UNEXPECTED_INSNS]; int fixup_map_hash_8b[MAX_FIXUPS]; int fixup_map_hash_48b[MAX_FIXUPS]; int fixup_map_hash_16b[MAX_FIXUPS]; @@ -1126,6 +1149,214 @@ static bool cmp_str_seq(const char *log, const char *exp) return true; } +static int get_xlated_program(int fd_prog, struct bpf_insn **buf, int *cnt) +{ + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + __u32 xlated_prog_len; + __u32 buf_element_size = sizeof(struct bpf_insn); + + if (bpf_obj_get_info_by_fd(fd_prog, &info, &info_len)) { + perror("bpf_obj_get_info_by_fd failed"); + return -1; + } + + xlated_prog_len = info.xlated_prog_len; + if (xlated_prog_len % buf_element_size) { + printf("Program length %d is not multiple of %d\n", + xlated_prog_len, buf_element_size); + return -1; + } + + *cnt = xlated_prog_len / buf_element_size; + *buf = calloc(*cnt, buf_element_size); + if (!buf) { + perror("can't allocate xlated program buffer"); + return -ENOMEM; + } + + bzero(&info, sizeof(info)); + info.xlated_prog_len = xlated_prog_len; + info.xlated_prog_insns = (__u64)*buf; + if (bpf_obj_get_info_by_fd(fd_prog, &info, &info_len)) { + perror("second bpf_obj_get_info_by_fd failed"); + goto out_free_buf; + } + + return 0; + +out_free_buf: + free(*buf); + return -1; +} + +static bool is_null_insn(struct bpf_insn *insn) +{ + struct bpf_insn null_insn = {}; + + return memcmp(insn, &null_insn, sizeof(null_insn)) == 0; +} + +static bool is_skip_insn(struct bpf_insn *insn) +{ + struct bpf_insn skip_insn = SKIP_INSNS(); + + return memcmp(insn, &skip_insn, sizeof(skip_insn)) == 0; +} + +static int null_terminated_insn_len(struct bpf_insn *seq, int max_len) +{ + int i; + + for (i = 0; i < max_len; ++i) { + if (is_null_insn(&seq[i])) + return i; + } + return max_len; +} + +static bool compare_masked_insn(struct bpf_insn *orig, struct bpf_insn *masked) +{ + struct bpf_insn orig_masked; + + memcpy(&orig_masked, orig, sizeof(orig_masked)); + if (masked->imm == INSN_IMM_MASK) + orig_masked.imm = INSN_IMM_MASK; + if (masked->off == INSN_OFF_MASK) + orig_masked.off = INSN_OFF_MASK; + + return memcmp(&orig_masked, masked, sizeof(orig_masked)) == 0; +} + +static int find_insn_subseq(struct bpf_insn *seq, struct bpf_insn *subseq, + int seq_len, int subseq_len) +{ + int i, j; + + if (subseq_len > seq_len) + return -1; + + for (i = 0; i < seq_len - subseq_len + 1; ++i) { + bool found = true; + + for (j = 0; j < subseq_len; ++j) { + if (!compare_masked_insn(&seq[i + j], &subseq[j])) { + found = false; + break; + } + } + if (found) + return i; + } + + return -1; +} + +static int find_skip_insn_marker(struct bpf_insn *seq, int len) +{ + int i; + + for (i = 0; i < len; ++i) + if (is_skip_insn(&seq[i])) + return i; + + return -1; +} + +/* Return true if all sub-sequences in `subseqs` could be found in + * `seq` one after another. Sub-sequences are separated by a single + * nil instruction. + */ +static bool find_all_insn_subseqs(struct bpf_insn *seq, struct bpf_insn *subseqs, + int seq_len, int max_subseqs_len) +{ + int subseqs_len = null_terminated_insn_len(subseqs, max_subseqs_len); + + while (subseqs_len > 0) { + int skip_idx = find_skip_insn_marker(subseqs, subseqs_len); + int cur_subseq_len = skip_idx < 0 ? subseqs_len : skip_idx; + int subseq_idx = find_insn_subseq(seq, subseqs, + seq_len, cur_subseq_len); + + if (subseq_idx < 0) + return false; + seq += subseq_idx + cur_subseq_len; + seq_len -= subseq_idx + cur_subseq_len; + subseqs += cur_subseq_len + 1; + subseqs_len -= cur_subseq_len + 1; + } + + return true; +} + +static void print_insn(struct bpf_insn *buf, int cnt) +{ + int i; + + printf(" addr op d s off imm\n"); + for (i = 0; i < cnt; ++i) { + struct bpf_insn *insn = &buf[i]; + + if (is_null_insn(insn)) + break; + + if (is_skip_insn(insn)) + printf(" ...\n"); + else + printf(" %04x: %02x %1x %x %04hx %08x\n", + i, insn->code, insn->dst_reg, + insn->src_reg, insn->off, insn->imm); + } +} + +static bool check_xlated_program(struct bpf_test *test, int fd_prog) +{ + struct bpf_insn *buf; + int cnt; + bool result = true; + bool check_expected = !is_null_insn(test->expected_insns); + bool check_unexpected = !is_null_insn(test->unexpected_insns); + + if (!check_expected && !check_unexpected) + goto out; + + if (get_xlated_program(fd_prog, &buf, &cnt)) { + printf("FAIL: can't get xlated program\n"); + result = false; + goto out; + } + + if (check_expected && + !find_all_insn_subseqs(buf, test->expected_insns, + cnt, MAX_EXPECTED_INSNS)) { + printf("FAIL: can't find expected subsequence of instructions\n"); + result = false; + if (verbose) { + printf("Program:\n"); + print_insn(buf, cnt); + printf("Expected subsequence:\n"); + print_insn(test->expected_insns, MAX_EXPECTED_INSNS); + } + } + + if (check_unexpected && + find_all_insn_subseqs(buf, test->unexpected_insns, + cnt, MAX_UNEXPECTED_INSNS)) { + printf("FAIL: found unexpected subsequence of instructions\n"); + result = false; + if (verbose) { + printf("Program:\n"); + print_insn(buf, cnt); + printf("Un-expected subsequence:\n"); + print_insn(test->unexpected_insns, MAX_UNEXPECTED_INSNS); + } + } + + free(buf); + out: + return result; +} + static void do_test_single(struct bpf_test *test, bool unpriv, int *passes, int *errors) { @@ -1262,6 +1493,9 @@ static void do_test_single(struct bpf_test *test, bool unpriv, if (verbose) printf(", verifier log:\n%s", bpf_vlog); + if (!check_xlated_program(test, fd_prog)) + goto fail_log; + run_errs = 0; run_successes = 0; if (!alignment_prevented_execution && fd_prog >= 0 && test->runs >= 0) { From 7a42008ca5c700819e4b3003025e5e1695fd1f86 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Jun 2022 02:53:41 +0300 Subject: [PATCH 08/94] selftests/bpf: allow BTF specs and func infos in test_verifier tests The BTF and func_info specification for test_verifier tests follows the same notation as in prog_tests/btf.c tests. E.g.: ... .func_info = { { 0, 6 }, { 8, 7 } }, .func_info_cnt = 2, .btf_strings = "\0int\0", .btf_types = { BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), BTF_PTR_ENC(1), }, ... The BTF specification is loaded only when specified. Signed-off-by: Eduard Zingerman Acked-by: Song Liu Link: https://lore.kernel.org/r/20220620235344.569325-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/btf.c | 1 - tools/testing/selftests/bpf/test_btf.h | 2 + tools/testing/selftests/bpf/test_verifier.c | 94 ++++++++++++++++---- 3 files changed, 79 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index edb387163baa..1fd792a92a1c 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -34,7 +34,6 @@ static bool always_log; #undef CHECK #define CHECK(condition, format...) _CHECK(condition, "check", duration, format) -#define BTF_END_RAW 0xdeadbeef #define NAME_TBD 0xdeadb33f #define NAME_NTH(N) (0xfffe0000 | N) diff --git a/tools/testing/selftests/bpf/test_btf.h b/tools/testing/selftests/bpf/test_btf.h index 38782bd47fdc..fb4f4714eeb4 100644 --- a/tools/testing/selftests/bpf/test_btf.h +++ b/tools/testing/selftests/bpf/test_btf.h @@ -4,6 +4,8 @@ #ifndef _TEST_BTF_H #define _TEST_BTF_H +#define BTF_END_RAW 0xdeadbeef + #define BTF_INFO_ENC(kind, kind_flag, vlen) \ ((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN)) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 1f24eae9e16e..7fe897c66d81 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -59,11 +59,17 @@ #define MAX_TEST_RUNS 8 #define POINTER_VALUE 0xcafe4all #define TEST_DATA_LEN 64 +#define MAX_FUNC_INFOS 8 +#define MAX_BTF_STRINGS 256 +#define MAX_BTF_TYPES 256 #define INSN_OFF_MASK ((__s16)0xFFFF) #define INSN_IMM_MASK ((__s32)0xFFFFFFFF) #define SKIP_INSNS() BPF_RAW_INSN(0xde, 0xa, 0xd, 0xbeef, 0xdeadbeef) +#define DEFAULT_LIBBPF_LOG_LEVEL 4 +#define VERBOSE_LIBBPF_LOG_LEVEL 1 + #define F_NEEDS_EFFICIENT_UNALIGNED_ACCESS (1 << 0) #define F_LOAD_WITH_STRICT_ALIGNMENT (1 << 1) @@ -158,6 +164,14 @@ struct bpf_test { }; enum bpf_attach_type expected_attach_type; const char *kfunc; + struct bpf_func_info func_info[MAX_FUNC_INFOS]; + int func_info_cnt; + char btf_strings[MAX_BTF_STRINGS]; + /* A set of BTF types to load when specified, + * use macro definitions from test_btf.h, + * must end with BTF_END_RAW + */ + __u32 btf_types[MAX_BTF_TYPES]; }; /* Note we want this to be 64 bit aligned so that the end of our array is @@ -687,34 +701,66 @@ static __u32 btf_raw_types[] = { BTF_MEMBER_ENC(71, 13, 128), /* struct prog_test_member __kptr_ref *ptr; */ }; -static int load_btf(void) +static char bpf_vlog[UINT_MAX >> 8]; + +static int load_btf_spec(__u32 *types, int types_len, + const char *strings, int strings_len) { struct btf_header hdr = { .magic = BTF_MAGIC, .version = BTF_VERSION, .hdr_len = sizeof(struct btf_header), - .type_len = sizeof(btf_raw_types), - .str_off = sizeof(btf_raw_types), - .str_len = sizeof(btf_str_sec), + .type_len = types_len, + .str_off = types_len, + .str_len = strings_len, }; void *ptr, *raw_btf; int btf_fd; + LIBBPF_OPTS(bpf_btf_load_opts, opts, + .log_buf = bpf_vlog, + .log_size = sizeof(bpf_vlog), + .log_level = (verbose + ? VERBOSE_LIBBPF_LOG_LEVEL + : DEFAULT_LIBBPF_LOG_LEVEL), + ); - ptr = raw_btf = malloc(sizeof(hdr) + sizeof(btf_raw_types) + - sizeof(btf_str_sec)); + raw_btf = malloc(sizeof(hdr) + types_len + strings_len); + ptr = raw_btf; memcpy(ptr, &hdr, sizeof(hdr)); ptr += sizeof(hdr); - memcpy(ptr, btf_raw_types, hdr.type_len); + memcpy(ptr, types, hdr.type_len); ptr += hdr.type_len; - memcpy(ptr, btf_str_sec, hdr.str_len); + memcpy(ptr, strings, hdr.str_len); ptr += hdr.str_len; - btf_fd = bpf_btf_load(raw_btf, ptr - raw_btf, NULL); - free(raw_btf); + btf_fd = bpf_btf_load(raw_btf, ptr - raw_btf, &opts); if (btf_fd < 0) - return -1; - return btf_fd; + printf("Failed to load BTF spec: '%s'\n", strerror(errno)); + + free(raw_btf); + + return btf_fd < 0 ? -1 : btf_fd; +} + +static int load_btf(void) +{ + return load_btf_spec(btf_raw_types, sizeof(btf_raw_types), + btf_str_sec, sizeof(btf_str_sec)); +} + +static int load_btf_for_test(struct bpf_test *test) +{ + int types_num = 0; + + while (types_num < MAX_BTF_TYPES && + test->btf_types[types_num] != BTF_END_RAW) + ++types_num; + + int types_len = types_num * sizeof(test->btf_types[0]); + + return load_btf_spec(test->btf_types, types_len, + test->btf_strings, sizeof(test->btf_strings)); } static int create_map_spin_lock(void) @@ -793,8 +839,6 @@ static int create_map_kptr(void) return fd; } -static char bpf_vlog[UINT_MAX >> 8]; - static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, struct bpf_insn *prog, int *map_fds) { @@ -1360,7 +1404,7 @@ static bool check_xlated_program(struct bpf_test *test, int fd_prog) static void do_test_single(struct bpf_test *test, bool unpriv, int *passes, int *errors) { - int fd_prog, expected_ret, alignment_prevented_execution; + int fd_prog, btf_fd, expected_ret, alignment_prevented_execution; int prog_len, prog_type = test->prog_type; struct bpf_insn *prog = test->insns; LIBBPF_OPTS(bpf_prog_load_opts, opts); @@ -1372,8 +1416,10 @@ static void do_test_single(struct bpf_test *test, bool unpriv, __u32 pflags; int i, err; + fd_prog = -1; for (i = 0; i < MAX_NR_MAPS; i++) map_fds[i] = -1; + btf_fd = -1; if (!prog_type) prog_type = BPF_PROG_TYPE_SOCKET_FILTER; @@ -1406,11 +1452,11 @@ static void do_test_single(struct bpf_test *test, bool unpriv, opts.expected_attach_type = test->expected_attach_type; if (verbose) - opts.log_level = 1; + opts.log_level = VERBOSE_LIBBPF_LOG_LEVEL; else if (expected_ret == VERBOSE_ACCEPT) opts.log_level = 2; else - opts.log_level = 4; + opts.log_level = DEFAULT_LIBBPF_LOG_LEVEL; opts.prog_flags = pflags; if (prog_type == BPF_PROG_TYPE_TRACING && test->kfunc) { @@ -1428,6 +1474,19 @@ static void do_test_single(struct bpf_test *test, bool unpriv, opts.attach_btf_id = attach_btf_id; } + if (test->btf_types[0] != 0) { + btf_fd = load_btf_for_test(test); + if (btf_fd < 0) + goto fail_log; + opts.prog_btf_fd = btf_fd; + } + + if (test->func_info_cnt != 0) { + opts.func_info = test->func_info; + opts.func_info_cnt = test->func_info_cnt; + opts.func_info_rec_size = sizeof(test->func_info[0]); + } + opts.log_buf = bpf_vlog; opts.log_size = sizeof(bpf_vlog); fd_prog = bpf_prog_load(prog_type, NULL, "GPL", prog, prog_len, &opts); @@ -1539,6 +1598,7 @@ close_fds: if (test->fill_insns) free(test->fill_insns); close(fd_prog); + close(btf_fd); for (i = 0; i < MAX_NR_MAPS; i++) close(map_fds[i]); sched_yield(); From 1ade23711971b0eececf0d7fedc29d3c1d2fce01 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Jun 2022 02:53:42 +0300 Subject: [PATCH 09/94] bpf: Inline calls to bpf_loop when callback is known Calls to `bpf_loop` are replaced with direct loops to avoid indirection. E.g. the following: bpf_loop(10, foo, NULL, 0); Is replaced by equivalent of the following: for (int i = 0; i < 10; ++i) foo(i, NULL); This transformation could be applied when: - callback is known and does not change during program execution; - flags passed to `bpf_loop` are always zero. Inlining logic works as follows: - During execution simulation function `update_loop_inline_state` tracks the following information for each `bpf_loop` call instruction: - is callback known and constant? - are flags constant and zero? - Function `optimize_bpf_loop` increases stack depth for functions where `bpf_loop` calls can be inlined and invokes `inline_bpf_loop` to apply the inlining. The additional stack space is used to spill registers R6, R7 and R8. These registers are used as loop counter, loop maximal bound and callback context parameter; Measurements using `benchs/run_bench_bpf_loop.sh` inside QEMU / KVM on i7-4710HQ CPU show a drop in latency from 14 ns/op to 2 ns/op. Signed-off-by: Eduard Zingerman Acked-by: Song Liu Link: https://lore.kernel.org/r/20220620235344.569325-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 + include/linux/bpf_verifier.h | 12 +++ kernel/bpf/bpf_iter.c | 9 +- kernel/bpf/verifier.c | 180 ++++++++++++++++++++++++++++++++++- 4 files changed, 195 insertions(+), 9 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0edd7d2c0064..d05e1495a06e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1286,6 +1286,9 @@ struct bpf_array { #define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ #define MAX_TAIL_CALL_CNT 33 +/* Maximum number of loops for bpf_loop */ +#define BPF_MAX_LOOPS BIT(23) + #define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \ BPF_F_RDONLY_PROG | \ BPF_F_WRONLY | \ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3930c963fa67..81b19669efba 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -344,6 +344,14 @@ struct bpf_verifier_state_list { int miss_cnt, hit_cnt; }; +struct bpf_loop_inline_state { + int initialized:1; /* set to true upon first entry */ + int fit_for_inline:1; /* true if callback function is the same + * at each call and flags are always zero + */ + u32 callback_subprogno; /* valid when fit_for_inline is true */ +}; + /* Possible states for alu_state member. */ #define BPF_ALU_SANITIZE_SRC (1U << 0) #define BPF_ALU_SANITIZE_DST (1U << 1) @@ -373,6 +381,10 @@ struct bpf_insn_aux_data { u32 mem_size; /* mem_size for non-struct typed var */ }; } btf_var; + /* if instruction is a call to bpf_loop this field tracks + * the state of the relevant registers to make decision about inlining + */ + struct bpf_loop_inline_state loop_inline_state; }; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index d5d96ceca105..7e8fd49406f6 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -723,9 +723,6 @@ const struct bpf_func_proto bpf_for_each_map_elem_proto = { .arg4_type = ARG_ANYTHING, }; -/* maximum number of loops */ -#define MAX_LOOPS BIT(23) - BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx, u64, flags) { @@ -733,9 +730,13 @@ BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx, u64 ret; u32 i; + /* Note: these safety checks are also verified when bpf_loop + * is inlined, be careful to modify this code in sync. See + * function verifier.c:inline_bpf_loop. + */ if (flags) return -EINVAL; - if (nr_loops > MAX_LOOPS) + if (nr_loops > BPF_MAX_LOOPS) return -E2BIG; for (i = 0; i < nr_loops; i++) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2859901ffbe3..bf72dc511df6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7124,6 +7124,41 @@ static int check_get_func_ip(struct bpf_verifier_env *env) return -ENOTSUPP; } +static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) +{ + return &env->insn_aux_data[env->insn_idx]; +} + +static bool loop_flag_is_zero(struct bpf_verifier_env *env) +{ + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = ®s[BPF_REG_4]; + bool reg_is_null = register_is_null(reg); + + if (reg_is_null) + mark_chain_precision(env, BPF_REG_4); + + return reg_is_null; +} + +static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno) +{ + struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state; + + if (!state->initialized) { + state->initialized = 1; + state->fit_for_inline = loop_flag_is_zero(env); + state->callback_subprogno = subprogno; + return; + } + + if (!state->fit_for_inline) + return; + + state->fit_for_inline = (loop_flag_is_zero(env) && + state->callback_subprogno == subprogno); +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -7276,6 +7311,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn err = check_bpf_snprintf_call(env, regs); break; case BPF_FUNC_loop: + update_loop_inline_state(env, meta.subprogno); err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_loop_callback_state); break; @@ -7682,11 +7718,6 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, return true; } -static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) -{ - return &env->insn_aux_data[env->insn_idx]; -} - enum { REASON_BOUNDS = -1, REASON_TYPE = -2, @@ -14315,6 +14346,142 @@ patch_call_imm: return 0; } +static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env, + int position, + s32 stack_base, + u32 callback_subprogno, + u32 *cnt) +{ + s32 r6_offset = stack_base + 0 * BPF_REG_SIZE; + s32 r7_offset = stack_base + 1 * BPF_REG_SIZE; + s32 r8_offset = stack_base + 2 * BPF_REG_SIZE; + int reg_loop_max = BPF_REG_6; + int reg_loop_cnt = BPF_REG_7; + int reg_loop_ctx = BPF_REG_8; + + struct bpf_prog *new_prog; + u32 callback_start; + u32 call_insn_offset; + s32 callback_offset; + + /* This represents an inlined version of bpf_iter.c:bpf_loop, + * be careful to modify this code in sync. + */ + struct bpf_insn insn_buf[] = { + /* Return error and jump to the end of the patch if + * expected number of iterations is too big. + */ + BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2), + BPF_MOV32_IMM(BPF_REG_0, -E2BIG), + BPF_JMP_IMM(BPF_JA, 0, 0, 16), + /* spill R6, R7, R8 to use these as loop vars */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset), + /* initialize loop vars */ + BPF_MOV64_REG(reg_loop_max, BPF_REG_1), + BPF_MOV32_IMM(reg_loop_cnt, 0), + BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3), + /* loop header, + * if reg_loop_cnt >= reg_loop_max skip the loop body + */ + BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5), + /* callback call, + * correct callback offset would be set after patching + */ + BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt), + BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx), + BPF_CALL_REL(0), + /* increment loop counter */ + BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1), + /* jump to loop header if callback returned 0 */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6), + /* return value of bpf_loop, + * set R0 to the number of iterations + */ + BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt), + /* restore original values of R6, R7, R8 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset), + BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset), + BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset), + }; + + *cnt = ARRAY_SIZE(insn_buf); + new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt); + if (!new_prog) + return new_prog; + + /* callback start is known only after patching */ + callback_start = env->subprog_info[callback_subprogno].start; + /* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */ + call_insn_offset = position + 12; + callback_offset = callback_start - call_insn_offset - 1; + env->prog->insnsi[call_insn_offset].imm = callback_offset; + + return new_prog; +} + +static bool is_bpf_loop_call(struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == 0 && + insn->imm == BPF_FUNC_loop; +} + +/* For all sub-programs in the program (including main) check + * insn_aux_data to see if there are bpf_loop calls that require + * inlining. If such calls are found the calls are replaced with a + * sequence of instructions produced by `inline_bpf_loop` function and + * subprog stack_depth is increased by the size of 3 registers. + * This stack space is used to spill values of the R6, R7, R8. These + * registers are used to store the loop bound, counter and context + * variables. + */ +static int optimize_bpf_loop(struct bpf_verifier_env *env) +{ + struct bpf_subprog_info *subprogs = env->subprog_info; + int i, cur_subprog = 0, cnt, delta = 0; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + u16 stack_depth = subprogs[cur_subprog].stack_depth; + u16 stack_depth_roundup = round_up(stack_depth, 8) - stack_depth; + u16 stack_depth_extra = 0; + + for (i = 0; i < insn_cnt; i++, insn++) { + struct bpf_loop_inline_state *inline_state = + &env->insn_aux_data[i + delta].loop_inline_state; + + if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) { + struct bpf_prog *new_prog; + + stack_depth_extra = BPF_REG_SIZE * 3 + stack_depth_roundup; + new_prog = inline_bpf_loop(env, + i + delta, + -(stack_depth + stack_depth_extra), + inline_state->callback_subprogno, + &cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = new_prog; + insn = new_prog->insnsi + i + delta; + } + + if (subprogs[cur_subprog + 1].start == i + delta + 1) { + subprogs[cur_subprog].stack_depth += stack_depth_extra; + cur_subprog++; + stack_depth = subprogs[cur_subprog].stack_depth; + stack_depth_roundup = round_up(stack_depth, 8) - stack_depth; + stack_depth_extra = 0; + } + } + + env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; + + return 0; +} + static void free_states(struct bpf_verifier_env *env) { struct bpf_verifier_state_list *sl, *sln; @@ -15052,6 +15219,9 @@ skip_full_check: ret = check_max_stack_depth(env); /* instruction rewrites happen after this point */ + if (ret == 0) + ret = optimize_bpf_loop(env); + if (is_priv) { if (ret == 0) opt_hard_wire_dead_code_branches(env); From f8acfdd04410d26b096a7082444cdc402df10f89 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Jun 2022 02:53:43 +0300 Subject: [PATCH 10/94] selftests/bpf: BPF test_verifier selftests for bpf_loop inlining A number of test cases for BPF selftests test_verifier to check how bpf_loop inline transformation rewrites the BPF program. The following cases are covered: - happy path - no-rewrite when flags is non-zero - no-rewrite when callback is non-constant - subprogno in insn_aux is updated correctly when dead sub-programs are removed - check that correct stack offsets are assigned for spilling of R6-R8 registers Signed-off-by: Eduard Zingerman Acked-by: Song Liu Link: https://lore.kernel.org/r/20220620235344.569325-5-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/verifier/bpf_loop_inline.c | 252 ++++++++++++++++++ 1 file changed, 252 insertions(+) create mode 100644 tools/testing/selftests/bpf/verifier/bpf_loop_inline.c diff --git a/tools/testing/selftests/bpf/verifier/bpf_loop_inline.c b/tools/testing/selftests/bpf/verifier/bpf_loop_inline.c new file mode 100644 index 000000000000..232da07c93b5 --- /dev/null +++ b/tools/testing/selftests/bpf/verifier/bpf_loop_inline.c @@ -0,0 +1,252 @@ +#define BTF_TYPES \ + .btf_strings = "\0int\0i\0ctx\0callback\0main\0", \ + .btf_types = { \ + /* 1: int */ BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), \ + /* 2: int* */ BTF_PTR_ENC(1), \ + /* 3: void* */ BTF_PTR_ENC(0), \ + /* 4: int __(void*) */ BTF_FUNC_PROTO_ENC(1, 1), \ + BTF_FUNC_PROTO_ARG_ENC(7, 3), \ + /* 5: int __(int, int*) */ BTF_FUNC_PROTO_ENC(1, 2), \ + BTF_FUNC_PROTO_ARG_ENC(5, 1), \ + BTF_FUNC_PROTO_ARG_ENC(7, 2), \ + /* 6: main */ BTF_FUNC_ENC(20, 4), \ + /* 7: callback */ BTF_FUNC_ENC(11, 5), \ + BTF_END_RAW \ + } + +#define MAIN_TYPE 6 +#define CALLBACK_TYPE 7 + +/* can't use BPF_CALL_REL, jit_subprogs adjusts IMM & OFF + * fields for pseudo calls + */ +#define PSEUDO_CALL_INSN() \ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_CALL, \ + INSN_OFF_MASK, INSN_IMM_MASK) + +/* can't use BPF_FUNC_loop constant, + * do_mix_fixups adjusts the IMM field + */ +#define HELPER_CALL_INSN() \ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, INSN_OFF_MASK, INSN_IMM_MASK) + +{ + "inline simple bpf_loop call", + .insns = { + /* main */ + /* force verifier state branching to verify logic on first and + * subsequent bpf_loop insn processing steps + */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 777, 2), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 1), + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 2), + + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 6), + BPF_RAW_INSN(0, 0, 0, 0, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_3, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_4, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_loop), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0), + BPF_EXIT_INSN(), + /* callback */ + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .expected_insns = { PSEUDO_CALL_INSN() }, + .unexpected_insns = { HELPER_CALL_INSN() }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .result = ACCEPT, + .runs = 0, + .func_info = { { 0, MAIN_TYPE }, { 12, CALLBACK_TYPE } }, + .func_info_cnt = 2, + BTF_TYPES +}, +{ + "don't inline bpf_loop call, flags non-zero", + .insns = { + /* main */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64), + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64), + BPF_ALU64_REG(BPF_MOV, BPF_REG_7, BPF_REG_0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 9), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_4, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 1), + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 7), + BPF_RAW_INSN(0, 0, 0, 0, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_3, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_loop), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_4, 1), + BPF_JMP_IMM(BPF_JA, 0, 0, -10), + /* callback */ + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .expected_insns = { HELPER_CALL_INSN() }, + .unexpected_insns = { PSEUDO_CALL_INSN() }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .result = ACCEPT, + .runs = 0, + .func_info = { { 0, MAIN_TYPE }, { 16, CALLBACK_TYPE } }, + .func_info_cnt = 2, + BTF_TYPES +}, +{ + "don't inline bpf_loop call, callback non-constant", + .insns = { + /* main */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 777, 4), /* pick a random callback */ + + BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 1), + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 10), + BPF_RAW_INSN(0, 0, 0, 0, 0), + BPF_JMP_IMM(BPF_JA, 0, 0, 3), + + BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 1), + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 8), + BPF_RAW_INSN(0, 0, 0, 0, 0), + + BPF_ALU64_IMM(BPF_MOV, BPF_REG_3, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_4, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_loop), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0), + BPF_EXIT_INSN(), + /* callback */ + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 1), + BPF_EXIT_INSN(), + /* callback #2 */ + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .expected_insns = { HELPER_CALL_INSN() }, + .unexpected_insns = { PSEUDO_CALL_INSN() }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .result = ACCEPT, + .runs = 0, + .func_info = { + { 0, MAIN_TYPE }, + { 14, CALLBACK_TYPE }, + { 16, CALLBACK_TYPE } + }, + .func_info_cnt = 3, + BTF_TYPES +}, +{ + "bpf_loop_inline and a dead func", + .insns = { + /* main */ + + /* A reference to callback #1 to make verifier count it as a func. + * This reference is overwritten below and callback #1 is dead. + */ + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 9), + BPF_RAW_INSN(0, 0, 0, 0, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 1), + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 8), + BPF_RAW_INSN(0, 0, 0, 0, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_3, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_4, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_loop), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0), + BPF_EXIT_INSN(), + /* callback */ + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 1), + BPF_EXIT_INSN(), + /* callback #2 */ + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .expected_insns = { PSEUDO_CALL_INSN() }, + .unexpected_insns = { HELPER_CALL_INSN() }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .result = ACCEPT, + .runs = 0, + .func_info = { + { 0, MAIN_TYPE }, + { 10, CALLBACK_TYPE }, + { 12, CALLBACK_TYPE } + }, + .func_info_cnt = 3, + BTF_TYPES +}, +{ + "bpf_loop_inline stack locations for loop vars", + .insns = { + /* main */ + BPF_ST_MEM(BPF_W, BPF_REG_10, -12, 0x77), + /* bpf_loop call #1 */ + BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 1), + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 22), + BPF_RAW_INSN(0, 0, 0, 0, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_3, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_4, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_loop), + /* bpf_loop call #2 */ + BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 2), + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 16), + BPF_RAW_INSN(0, 0, 0, 0, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_3, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_4, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_loop), + /* call func and exit */ + BPF_CALL_REL(2), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0), + BPF_EXIT_INSN(), + /* func */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -32, 0x55), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 2), + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 6), + BPF_RAW_INSN(0, 0, 0, 0, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_3, 0), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_4, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_loop), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0), + BPF_EXIT_INSN(), + /* callback */ + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .expected_insns = { + BPF_ST_MEM(BPF_W, BPF_REG_10, -12, 0x77), + SKIP_INSNS(), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -40), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, -32), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, -24), + SKIP_INSNS(), + /* offsets are the same as in the first call */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -40), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, -32), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, -24), + SKIP_INSNS(), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -32, 0x55), + SKIP_INSNS(), + /* offsets differ from main because of different offset + * in BPF_ST_MEM instruction + */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -56), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, -48), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, -40), + }, + .unexpected_insns = { HELPER_CALL_INSN() }, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .result = ACCEPT, + .func_info = { + { 0, MAIN_TYPE }, + { 16, MAIN_TYPE }, + { 25, CALLBACK_TYPE }, + }, + .func_info_cnt = 3, + BTF_TYPES +}, + +#undef HELPER_CALL_INSN +#undef PSEUDO_CALL_INSN +#undef CALLBACK_TYPE +#undef MAIN_TYPE +#undef BTF_TYPES From 0e1bf9ed2000c16fa8e0703e255a23d64a4adb27 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Jun 2022 02:53:44 +0300 Subject: [PATCH 11/94] selftests/bpf: BPF test_prog selftests for bpf_loop inlining Two new test BPF programs for test_prog selftests checking bpf_loop behavior. Both are corner cases for bpf_loop inlinig transformation: - check that bpf_loop behaves correctly when callback function is not a compile time constant - check that local function variables are not affected by allocating additional stack storage for registers spilled by loop inlining Signed-off-by: Eduard Zingerman Acked-by: Song Liu Link: https://lore.kernel.org/r/20220620235344.569325-6-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/bpf_loop.c | 62 ++++++++++ tools/testing/selftests/bpf/progs/bpf_loop.c | 114 ++++++++++++++++++ 2 files changed, 176 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_loop.c b/tools/testing/selftests/bpf/prog_tests/bpf_loop.c index 380d7a2072e3..4cd8a25afe68 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_loop.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_loop.c @@ -120,6 +120,64 @@ static void check_nested_calls(struct bpf_loop *skel) bpf_link__destroy(link); } +static void check_non_constant_callback(struct bpf_loop *skel) +{ + struct bpf_link *link = + bpf_program__attach(skel->progs.prog_non_constant_callback); + + if (!ASSERT_OK_PTR(link, "link")) + return; + + skel->bss->callback_selector = 0x0F; + usleep(1); + ASSERT_EQ(skel->bss->g_output, 0x0F, "g_output #1"); + + skel->bss->callback_selector = 0xF0; + usleep(1); + ASSERT_EQ(skel->bss->g_output, 0xF0, "g_output #2"); + + bpf_link__destroy(link); +} + +static void check_stack(struct bpf_loop *skel) +{ + struct bpf_link *link = bpf_program__attach(skel->progs.stack_check); + const int max_key = 12; + int key; + int map_fd; + + if (!ASSERT_OK_PTR(link, "link")) + return; + + map_fd = bpf_map__fd(skel->maps.map1); + + if (!ASSERT_GE(map_fd, 0, "bpf_map__fd")) + goto out; + + for (key = 1; key <= max_key; ++key) { + int val = key; + int err = bpf_map_update_elem(map_fd, &key, &val, BPF_NOEXIST); + + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto out; + } + + usleep(1); + + for (key = 1; key <= max_key; ++key) { + int val; + int err = bpf_map_lookup_elem(map_fd, &key, &val); + + if (!ASSERT_OK(err, "bpf_map_lookup_elem")) + goto out; + if (!ASSERT_EQ(val, key + 1, "bad value in the map")) + goto out; + } + +out: + bpf_link__destroy(link); +} + void test_bpf_loop(void) { struct bpf_loop *skel; @@ -140,6 +198,10 @@ void test_bpf_loop(void) check_invalid_flags(skel); if (test__start_subtest("check_nested_calls")) check_nested_calls(skel); + if (test__start_subtest("check_non_constant_callback")) + check_non_constant_callback(skel); + if (test__start_subtest("check_stack")) + check_stack(skel); bpf_loop__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/bpf_loop.c b/tools/testing/selftests/bpf/progs/bpf_loop.c index e08565282759..de1fc82d2710 100644 --- a/tools/testing/selftests/bpf/progs/bpf_loop.c +++ b/tools/testing/selftests/bpf/progs/bpf_loop.c @@ -11,11 +11,19 @@ struct callback_ctx { int output; }; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 32); + __type(key, int); + __type(value, int); +} map1 SEC(".maps"); + /* These should be set by the user program */ u32 nested_callback_nr_loops; u32 stop_index = -1; u32 nr_loops; int pid; +int callback_selector; /* Making these global variables so that the userspace program * can verify the output through the skeleton @@ -111,3 +119,109 @@ int prog_nested_calls(void *ctx) return 0; } + +static int callback_set_f0(int i, void *ctx) +{ + g_output = 0xF0; + return 0; +} + +static int callback_set_0f(int i, void *ctx) +{ + g_output = 0x0F; + return 0; +} + +/* + * non-constant callback is a corner case for bpf_loop inline logic + */ +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +int prog_non_constant_callback(void *ctx) +{ + struct callback_ctx data = {}; + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + int (*callback)(int i, void *ctx); + + g_output = 0; + + if (callback_selector == 0x0F) + callback = callback_set_0f; + else + callback = callback_set_f0; + + bpf_loop(1, callback, NULL, 0); + + return 0; +} + +static int stack_check_inner_callback(void *ctx) +{ + return 0; +} + +static int map1_lookup_elem(int key) +{ + int *val = bpf_map_lookup_elem(&map1, &key); + + return val ? *val : -1; +} + +static void map1_update_elem(int key, int val) +{ + bpf_map_update_elem(&map1, &key, &val, BPF_ANY); +} + +static int stack_check_outer_callback(void *ctx) +{ + int a = map1_lookup_elem(1); + int b = map1_lookup_elem(2); + int c = map1_lookup_elem(3); + int d = map1_lookup_elem(4); + int e = map1_lookup_elem(5); + int f = map1_lookup_elem(6); + + bpf_loop(1, stack_check_inner_callback, NULL, 0); + + map1_update_elem(1, a + 1); + map1_update_elem(2, b + 1); + map1_update_elem(3, c + 1); + map1_update_elem(4, d + 1); + map1_update_elem(5, e + 1); + map1_update_elem(6, f + 1); + + return 0; +} + +/* Some of the local variables in stack_check and + * stack_check_outer_callback would be allocated on stack by + * compiler. This test should verify that stack content for these + * variables is preserved between calls to bpf_loop (might be an issue + * if loop inlining allocates stack slots incorrectly). + */ +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +int stack_check(void *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + int a = map1_lookup_elem(7); + int b = map1_lookup_elem(8); + int c = map1_lookup_elem(9); + int d = map1_lookup_elem(10); + int e = map1_lookup_elem(11); + int f = map1_lookup_elem(12); + + bpf_loop(1, stack_check_outer_callback, NULL, 0); + + map1_update_elem(7, a + 1); + map1_update_elem(8, b + 1); + map1_update_elem(9, c + 1); + map1_update_elem(10, d + 1); + map1_update_elem(11, e + 1); + map1_update_elem(12, f + 1); + + return 0; +} From 95acd8817e66d031d2e6ee7def3f1e1874819317 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Fri, 17 Jun 2022 12:57:34 +0200 Subject: [PATCH 12/94] bpf, x64: Add predicate for bpf2bpf with tailcalls support in JIT The BPF core/verifier is hard-coded to permit mixing bpf2bpf and tail calls for only x86-64. Change the logic to instead rely on a new weak function 'bool bpf_jit_supports_subprog_tailcalls(void)', which a capable JIT backend can override. Update the x86-64 eBPF JIT to reflect this. Signed-off-by: Tony Ambardar [jakub: drop MIPS bits and tweak patch subject] Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220617105735.733938-2-jakub@cloudflare.com --- arch/x86/net/bpf_jit_comp.c | 6 ++++++ include/linux/filter.h | 1 + kernel/bpf/core.c | 6 ++++++ kernel/bpf/verifier.c | 3 ++- 4 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index f298b18a9a3d..2c51ca9f7cec 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2491,3 +2491,9 @@ void *bpf_arch_text_copy(void *dst, void *src, size_t len) return ERR_PTR(-EINVAL); return dst; } + +/* Indicate the JIT backend supports mixing bpf2bpf and tailcalls. */ +bool bpf_jit_supports_subprog_tailcalls(void) +{ + return true; +} diff --git a/include/linux/filter.h b/include/linux/filter.h index d0cbb31b1b4d..4c1a8b247545 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -914,6 +914,7 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); bool bpf_jit_needs_zext(void); +bool bpf_jit_supports_subprog_tailcalls(void); bool bpf_jit_supports_kfunc_call(void); bool bpf_helper_changes_pkt_data(void *func); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b5ffebcce6cc..f023cb399e3f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2729,6 +2729,12 @@ bool __weak bpf_jit_needs_zext(void) return false; } +/* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */ +bool __weak bpf_jit_supports_subprog_tailcalls(void) +{ + return false; +} + bool __weak bpf_jit_supports_kfunc_call(void) { return false; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bf72dc511df6..a20d7736a5b2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6154,7 +6154,8 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id) static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env) { - return env->prog->jit_requested && IS_ENABLED(CONFIG_X86_64); + return env->prog->jit_requested && + bpf_jit_supports_subprog_tailcalls(); } static int check_map_func_compatibility(struct bpf_verifier_env *env, From d4609a5d8c70d21b4a3f801cf896a3c16c613fe1 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Fri, 17 Jun 2022 12:57:35 +0200 Subject: [PATCH 13/94] bpf, arm64: Keep tail call count across bpf2bpf calls Today doing a BPF tail call after a BPF to BPF call, that is from a subprogram, is allowed only by the x86-64 BPF JIT. Mixing these features requires support from JIT. Tail call count has to be tracked through BPF to BPF calls, as well as through BPF tail calls to prevent unbounded chains of tail calls. arm64 BPF JIT stores the tail call count (TCC) in a dedicated register (X26). This makes it easier to support bpf2bpf calls mixed with tail calls than on x86 platform. In order to keep the tail call count in tact throughout bpf2bpf calls, all we need to do is tweak the program prologue generator. When emitting prologue for a subprogram, we skip the block that initializes the tail call count and emits a jump pad for the tail call. With this change, a sample execution flow where a bpf2bpf call is followed by a tail call would look like so: int entry(struct __sk_buff *skb): 0xffffffc0090151d4: paciasp 0xffffffc0090151d8: stp x29, x30, [sp, #-16]! 0xffffffc0090151dc: mov x29, sp 0xffffffc0090151e0: stp x19, x20, [sp, #-16]! 0xffffffc0090151e4: stp x21, x22, [sp, #-16]! 0xffffffc0090151e8: stp x25, x26, [sp, #-16]! 0xffffffc0090151ec: stp x27, x28, [sp, #-16]! 0xffffffc0090151f0: mov x25, sp 0xffffffc0090151f4: mov x26, #0x0 // <- init TCC only 0xffffffc0090151f8: bti j // in main prog 0xffffffc0090151fc: sub x27, x25, #0x0 0xffffffc009015200: sub sp, sp, #0x10 0xffffffc009015204: mov w1, #0x0 0xffffffc009015208: mov x10, #0xffffffffffffffff 0xffffffc00901520c: strb w1, [x25, x10] 0xffffffc009015210: mov x10, #0xffffffffffffd25c 0xffffffc009015214: movk x10, #0x902, lsl #16 0xffffffc009015218: movk x10, #0xffc0, lsl #32 0xffffffc00901521c: blr x10 -------------------. // bpf2bpf call 0xffffffc009015220: add x7, x0, #0x0 <-------------. 0xffffffc009015224: add sp, sp, #0x10 | | 0xffffffc009015228: ldp x27, x28, [sp], #16 | | 0xffffffc00901522c: ldp x25, x26, [sp], #16 | | 0xffffffc009015230: ldp x21, x22, [sp], #16 | | 0xffffffc009015234: ldp x19, x20, [sp], #16 | | 0xffffffc009015238: ldp x29, x30, [sp], #16 | | 0xffffffc00901523c: add x0, x7, #0x0 | | 0xffffffc009015240: autiasp | | 0xffffffc009015244: ret | | | | int subprog_tail(struct __sk_buff *skb): | | 0xffffffc00902d25c: paciasp <----------------------' | 0xffffffc00902d260: stp x29, x30, [sp, #-16]! | 0xffffffc00902d264: mov x29, sp | 0xffffffc00902d268: stp x19, x20, [sp, #-16]! | 0xffffffc00902d26c: stp x21, x22, [sp, #-16]! | 0xffffffc00902d270: stp x25, x26, [sp, #-16]! | 0xffffffc00902d274: stp x27, x28, [sp, #-16]! | 0xffffffc00902d278: mov x25, sp | 0xffffffc00902d27c: sub x27, x25, #0x0 | 0xffffffc00902d280: sub sp, sp, #0x10 | // <- end of prologue, notice: 0xffffffc00902d284: add x19, x0, #0x0 | // 1) TCC not touched, and 0xffffffc00902d288: mov w0, #0x1 | // 2) no tail call jump pad 0xffffffc00902d28c: mov x10, #0xfffffffffffffffc | 0xffffffc00902d290: str w0, [x25, x10] | 0xffffffc00902d294: mov x20, #0xffffff80ffffffff | 0xffffffc00902d298: movk x20, #0xc033, lsl #16 | 0xffffffc00902d29c: movk x20, #0x4e00 | 0xffffffc00902d2a0: add x0, x19, #0x0 | 0xffffffc00902d2a4: add x1, x20, #0x0 | 0xffffffc00902d2a8: mov x2, #0x0 | 0xffffffc00902d2ac: mov w10, #0x24 | 0xffffffc00902d2b0: ldr w10, [x1, x10] | 0xffffffc00902d2b4: add w2, w2, #0x0 | 0xffffffc00902d2b8: cmp w2, w10 | 0xffffffc00902d2bc: b.cs 0xffffffc00902d2f8 | 0xffffffc00902d2c0: mov w10, #0x21 | 0xffffffc00902d2c4: cmp x26, x10 | // TCC >= MAX_TAIL_CALL_CNT? 0xffffffc00902d2c8: b.cs 0xffffffc00902d2f8 | 0xffffffc00902d2cc: add x26, x26, #0x1 | // TCC++ 0xffffffc00902d2d0: mov w10, #0x110 | 0xffffffc00902d2d4: add x10, x1, x10 | 0xffffffc00902d2d8: lsl x11, x2, #3 | 0xffffffc00902d2dc: ldr x11, [x10, x11] | 0xffffffc00902d2e0: cbz x11, 0xffffffc00902d2f8 | 0xffffffc00902d2e4: mov w10, #0x30 | 0xffffffc00902d2e8: ldr x10, [x11, x10] | 0xffffffc00902d2ec: add x10, x10, #0x24 | 0xffffffc00902d2f0: add sp, sp, #0x10 | // <- destroy just current 0xffffffc00902d2f4: br x10 ---------------------. | // BPF stack frame 0xffffffc00902d2f8: mov x10, #0xfffffffffffffffc | | // before the tail call 0xffffffc00902d2fc: ldr w7, [x25, x10] | | 0xffffffc00902d300: add sp, sp, #0x10 | | 0xffffffc00902d304: ldp x27, x28, [sp], #16 | | 0xffffffc00902d308: ldp x25, x26, [sp], #16 | | 0xffffffc00902d30c: ldp x21, x22, [sp], #16 | | 0xffffffc00902d310: ldp x19, x20, [sp], #16 | | 0xffffffc00902d314: ldp x29, x30, [sp], #16 | | 0xffffffc00902d318: add x0, x7, #0x0 | | 0xffffffc00902d31c: autiasp | | 0xffffffc00902d320: ret | | | | int classifier_0(struct __sk_buff *skb): | | 0xffffffc008ff5874: paciasp | | 0xffffffc008ff5878: stp x29, x30, [sp, #-16]! | | 0xffffffc008ff587c: mov x29, sp | | 0xffffffc008ff5880: stp x19, x20, [sp, #-16]! | | 0xffffffc008ff5884: stp x21, x22, [sp, #-16]! | | 0xffffffc008ff5888: stp x25, x26, [sp, #-16]! | | 0xffffffc008ff588c: stp x27, x28, [sp, #-16]! | | 0xffffffc008ff5890: mov x25, sp | | 0xffffffc008ff5894: mov x26, #0x0 | | 0xffffffc008ff5898: bti j <----------------------' | 0xffffffc008ff589c: sub x27, x25, #0x0 | 0xffffffc008ff58a0: sub sp, sp, #0x0 | 0xffffffc008ff58a4: mov x0, #0xffffffc0ffffffff | 0xffffffc008ff58a8: movk x0, #0x8fc, lsl #16 | 0xffffffc008ff58ac: movk x0, #0x6000 | 0xffffffc008ff58b0: mov w1, #0x1 | 0xffffffc008ff58b4: str w1, [x0] | 0xffffffc008ff58b8: mov w7, #0x0 | 0xffffffc008ff58bc: mov sp, sp | 0xffffffc008ff58c0: ldp x27, x28, [sp], #16 | 0xffffffc008ff58c4: ldp x25, x26, [sp], #16 | 0xffffffc008ff58c8: ldp x21, x22, [sp], #16 | 0xffffffc008ff58cc: ldp x19, x20, [sp], #16 | 0xffffffc008ff58d0: ldp x29, x30, [sp], #16 | 0xffffffc008ff58d4: add x0, x7, #0x0 | 0xffffffc008ff58d8: autiasp | 0xffffffc008ff58dc: ret -------------------------------' Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220617105735.733938-3-jakub@cloudflare.com --- arch/arm64/net/bpf_jit_comp.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 42f2e9a8616c..f08a4447d363 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -246,6 +246,7 @@ static bool is_lsi_offset(int offset, int scale) static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) { const struct bpf_prog *prog = ctx->prog; + const bool is_main_prog = prog->aux->func_idx == 0; const u8 r6 = bpf2a64[BPF_REG_6]; const u8 r7 = bpf2a64[BPF_REG_7]; const u8 r8 = bpf2a64[BPF_REG_8]; @@ -299,7 +300,7 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) /* Set up BPF prog stack base register */ emit(A64_MOV(1, fp, A64_SP), ctx); - if (!ebpf_from_cbpf) { + if (!ebpf_from_cbpf && is_main_prog) { /* Initialize tail_call_cnt */ emit(A64_MOVZ(1, tcc, 0, 0), ctx); @@ -1530,3 +1531,9 @@ void bpf_jit_free_exec(void *addr) { return vfree(addr); } + +/* Indicate the JIT backend supports mixing bpf2bpf and tailcalls. */ +bool bpf_jit_supports_subprog_tailcalls(void) +{ + return true; +} From 772251742262bd651529a3cea3ee756b71e95a29 Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Tue, 21 Jun 2022 17:54:02 +0000 Subject: [PATCH 14/94] samples/bpf: fixup some tools to be able to support xdp multibuffer This changes the section name for the bpf program embedded in these files to "xdp.frags" to allow the programs to be loaded on drivers that are using an MTU greater than PAGE_SIZE. Rather than directly accessing the buffers, the packet data is now accessed via xdp helper functions to provide an example for those who may need to write more complex programs. v2: remove new unnecessary variable Signed-off-by: Andy Gospodarek Acked-by: John Fastabend Acked-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/20220621175402.35327-1-gospo@broadcom.com Signed-off-by: Alexei Starovoitov --- samples/bpf/xdp1_kern.c | 11 ++++++++--- samples/bpf/xdp2_kern.c | 11 ++++++++--- samples/bpf/xdp_tx_iptunnel_kern.c | 2 +- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/samples/bpf/xdp1_kern.c b/samples/bpf/xdp1_kern.c index f0c5d95084de..0a5c704badd0 100644 --- a/samples/bpf/xdp1_kern.c +++ b/samples/bpf/xdp1_kern.c @@ -39,11 +39,13 @@ static int parse_ipv6(void *data, u64 nh_off, void *data_end) return ip6h->nexthdr; } -SEC("xdp1") +#define XDPBUFSIZE 64 +SEC("xdp.frags") int xdp_prog1(struct xdp_md *ctx) { - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; + __u8 pkt[XDPBUFSIZE] = {}; + void *data_end = &pkt[XDPBUFSIZE-1]; + void *data = pkt; struct ethhdr *eth = data; int rc = XDP_DROP; long *value; @@ -51,6 +53,9 @@ int xdp_prog1(struct xdp_md *ctx) u64 nh_off; u32 ipproto; + if (bpf_xdp_load_bytes(ctx, 0, pkt, sizeof(pkt))) + return rc; + nh_off = sizeof(*eth); if (data + nh_off > data_end) return rc; diff --git a/samples/bpf/xdp2_kern.c b/samples/bpf/xdp2_kern.c index d8a64ab077b0..3332ba6bb95f 100644 --- a/samples/bpf/xdp2_kern.c +++ b/samples/bpf/xdp2_kern.c @@ -55,11 +55,13 @@ static int parse_ipv6(void *data, u64 nh_off, void *data_end) return ip6h->nexthdr; } -SEC("xdp1") +#define XDPBUFSIZE 64 +SEC("xdp.frags") int xdp_prog1(struct xdp_md *ctx) { - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; + __u8 pkt[XDPBUFSIZE] = {}; + void *data_end = &pkt[XDPBUFSIZE-1]; + void *data = pkt; struct ethhdr *eth = data; int rc = XDP_DROP; long *value; @@ -67,6 +69,9 @@ int xdp_prog1(struct xdp_md *ctx) u64 nh_off; u32 ipproto; + if (bpf_xdp_load_bytes(ctx, 0, pkt, sizeof(pkt))) + return rc; + nh_off = sizeof(*eth); if (data + nh_off > data_end) return rc; diff --git a/samples/bpf/xdp_tx_iptunnel_kern.c b/samples/bpf/xdp_tx_iptunnel_kern.c index 575d57e4b8d6..0e2bca3a3fff 100644 --- a/samples/bpf/xdp_tx_iptunnel_kern.c +++ b/samples/bpf/xdp_tx_iptunnel_kern.c @@ -212,7 +212,7 @@ static __always_inline int handle_ipv6(struct xdp_md *xdp) return XDP_TX; } -SEC("xdp_tx_iptunnel") +SEC("xdp.frags") int _xdp_tx_iptunnel(struct xdp_md *xdp) { void *data_end = (void *)(long)xdp->data_end; From 73087489250def7cdda2dee5ba685bdeae73b8af Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 20 Jun 2022 15:25:54 -0700 Subject: [PATCH 15/94] selftests/bpf: Add benchmark for local_storage get MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a benchmarks to demonstrate the performance cliff for local_storage get as the number of local_storage maps increases beyond current local_storage implementation's cache size. "sequential get" and "interleaved get" benchmarks are added, both of which do many bpf_task_storage_get calls on sets of task local_storage maps of various counts, while considering a single specific map to be 'important' and counting task_storage_gets to the important map separately in addition to normal 'hits' count of all gets. Goal here is to mimic scenario where a particular program using one map - the important one - is running on a system where many other local_storage maps exist and are accessed often. While "sequential get" benchmark does bpf_task_storage_get for map 0, 1, ..., {9, 99, 999} in order, "interleaved" benchmark interleaves 4 bpf_task_storage_gets for the important map for every 10 map gets. This is meant to highlight performance differences when important map is accessed far more frequently than non-important maps. A "hashmap control" benchmark is also included for easy comparison of standard bpf hashmap lookup vs local_storage get. The benchmark is similar to "sequential get", but creates and uses BPF_MAP_TYPE_HASH instead of local storage. Only one inner map is created - a hashmap meant to hold tid -> data mapping for all tasks. Size of the hashmap is hardcoded to my system's PID_MAX_LIMIT (4,194,304). The number of these keys which are actually fetched as part of the benchmark is configurable. Addition of this benchmark is inspired by conversation with Alexei in a previous patchset's thread [0], which highlighted the need for such a benchmark to motivate and validate improvements to local_storage implementation. My approach in that series focused on improving performance for explicitly-marked 'important' maps and was rejected with feedback to make more generally-applicable improvements while avoiding explicitly marking maps as important. Thus the benchmark reports both general and important-map-focused metrics, so effect of future work on both is clear. Regarding the benchmark results. On a powerful system (Skylake, 20 cores, 256gb ram): Hashmap Control =============== num keys: 10 hashmap (control) sequential get: hits throughput: 20.900 ± 0.334 M ops/s, hits latency: 47.847 ns/op, important_hits throughput: 20.900 ± 0.334 M ops/s num keys: 1000 hashmap (control) sequential get: hits throughput: 13.758 ± 0.219 M ops/s, hits latency: 72.683 ns/op, important_hits throughput: 13.758 ± 0.219 M ops/s num keys: 10000 hashmap (control) sequential get: hits throughput: 6.995 ± 0.034 M ops/s, hits latency: 142.959 ns/op, important_hits throughput: 6.995 ± 0.034 M ops/s num keys: 100000 hashmap (control) sequential get: hits throughput: 4.452 ± 0.371 M ops/s, hits latency: 224.635 ns/op, important_hits throughput: 4.452 ± 0.371 M ops/s num keys: 4194304 hashmap (control) sequential get: hits throughput: 3.043 ± 0.033 M ops/s, hits latency: 328.587 ns/op, important_hits throughput: 3.043 ± 0.033 M ops/s Local Storage ============= num_maps: 1 local_storage cache sequential get: hits throughput: 47.298 ± 0.180 M ops/s, hits latency: 21.142 ns/op, important_hits throughput: 47.298 ± 0.180 M ops/s local_storage cache interleaved get: hits throughput: 55.277 ± 0.888 M ops/s, hits latency: 18.091 ns/op, important_hits throughput: 55.277 ± 0.888 M ops/s num_maps: 10 local_storage cache sequential get: hits throughput: 40.240 ± 0.802 M ops/s, hits latency: 24.851 ns/op, important_hits throughput: 4.024 ± 0.080 M ops/s local_storage cache interleaved get: hits throughput: 48.701 ± 0.722 M ops/s, hits latency: 20.533 ns/op, important_hits throughput: 17.393 ± 0.258 M ops/s num_maps: 16 local_storage cache sequential get: hits throughput: 44.515 ± 0.708 M ops/s, hits latency: 22.464 ns/op, important_hits throughput: 2.782 ± 0.044 M ops/s local_storage cache interleaved get: hits throughput: 49.553 ± 2.260 M ops/s, hits latency: 20.181 ns/op, important_hits throughput: 15.767 ± 0.719 M ops/s num_maps: 17 local_storage cache sequential get: hits throughput: 38.778 ± 0.302 M ops/s, hits latency: 25.788 ns/op, important_hits throughput: 2.284 ± 0.018 M ops/s local_storage cache interleaved get: hits throughput: 43.848 ± 1.023 M ops/s, hits latency: 22.806 ns/op, important_hits throughput: 13.349 ± 0.311 M ops/s num_maps: 24 local_storage cache sequential get: hits throughput: 19.317 ± 0.568 M ops/s, hits latency: 51.769 ns/op, important_hits throughput: 0.806 ± 0.024 M ops/s local_storage cache interleaved get: hits throughput: 24.397 ± 0.272 M ops/s, hits latency: 40.989 ns/op, important_hits throughput: 6.863 ± 0.077 M ops/s num_maps: 32 local_storage cache sequential get: hits throughput: 13.333 ± 0.135 M ops/s, hits latency: 75.000 ns/op, important_hits throughput: 0.417 ± 0.004 M ops/s local_storage cache interleaved get: hits throughput: 16.898 ± 0.383 M ops/s, hits latency: 59.178 ns/op, important_hits throughput: 4.717 ± 0.107 M ops/s num_maps: 100 local_storage cache sequential get: hits throughput: 6.360 ± 0.107 M ops/s, hits latency: 157.233 ns/op, important_hits throughput: 0.064 ± 0.001 M ops/s local_storage cache interleaved get: hits throughput: 7.303 ± 0.362 M ops/s, hits latency: 136.930 ns/op, important_hits throughput: 1.907 ± 0.094 M ops/s num_maps: 1000 local_storage cache sequential get: hits throughput: 0.452 ± 0.010 M ops/s, hits latency: 2214.022 ns/op, important_hits throughput: 0.000 ± 0.000 M ops/s local_storage cache interleaved get: hits throughput: 0.542 ± 0.007 M ops/s, hits latency: 1843.341 ns/op, important_hits throughput: 0.136 ± 0.002 M ops/s Looking at the "sequential get" results, it's clear that as the number of task local_storage maps grows beyond the current cache size (16), there's a significant reduction in hits throughput. Note that current local_storage implementation assigns a cache_idx to maps as they are created. Since "sequential get" is creating maps 0..n in order and then doing bpf_task_storage_get calls in the same order, the benchmark is effectively ensuring that a map will not be in cache when the program tries to access it. For "interleaved get" results, important-map hits throughput is greatly increased as the important map is more likely to be in cache by virtue of being accessed far more frequently. Throughput still reduces as # maps increases, though. To get a sense of the overhead of the benchmark program, I commented out bpf_task_storage_get/bpf_map_lookup_elem in local_storage_bench.c and ran the benchmark on the same host as the 'real' run. Results: Hashmap Control =============== num keys: 10 hashmap (control) sequential get: hits throughput: 54.288 ± 0.655 M ops/s, hits latency: 18.420 ns/op, important_hits throughput: 54.288 ± 0.655 M ops/s num keys: 1000 hashmap (control) sequential get: hits throughput: 52.913 ± 0.519 M ops/s, hits latency: 18.899 ns/op, important_hits throughput: 52.913 ± 0.519 M ops/s num keys: 10000 hashmap (control) sequential get: hits throughput: 53.480 ± 1.235 M ops/s, hits latency: 18.699 ns/op, important_hits throughput: 53.480 ± 1.235 M ops/s num keys: 100000 hashmap (control) sequential get: hits throughput: 54.982 ± 1.902 M ops/s, hits latency: 18.188 ns/op, important_hits throughput: 54.982 ± 1.902 M ops/s num keys: 4194304 hashmap (control) sequential get: hits throughput: 50.858 ± 0.707 M ops/s, hits latency: 19.662 ns/op, important_hits throughput: 50.858 ± 0.707 M ops/s Local Storage ============= num_maps: 1 local_storage cache sequential get: hits throughput: 110.990 ± 4.828 M ops/s, hits latency: 9.010 ns/op, important_hits throughput: 110.990 ± 4.828 M ops/s local_storage cache interleaved get: hits throughput: 161.057 ± 4.090 M ops/s, hits latency: 6.209 ns/op, important_hits throughput: 161.057 ± 4.090 M ops/s num_maps: 10 local_storage cache sequential get: hits throughput: 112.930 ± 1.079 M ops/s, hits latency: 8.855 ns/op, important_hits throughput: 11.293 ± 0.108 M ops/s local_storage cache interleaved get: hits throughput: 115.841 ± 2.088 M ops/s, hits latency: 8.633 ns/op, important_hits throughput: 41.372 ± 0.746 M ops/s num_maps: 16 local_storage cache sequential get: hits throughput: 115.653 ± 0.416 M ops/s, hits latency: 8.647 ns/op, important_hits throughput: 7.228 ± 0.026 M ops/s local_storage cache interleaved get: hits throughput: 138.717 ± 1.649 M ops/s, hits latency: 7.209 ns/op, important_hits throughput: 44.137 ± 0.525 M ops/s num_maps: 17 local_storage cache sequential get: hits throughput: 112.020 ± 1.649 M ops/s, hits latency: 8.927 ns/op, important_hits throughput: 6.598 ± 0.097 M ops/s local_storage cache interleaved get: hits throughput: 128.089 ± 1.960 M ops/s, hits latency: 7.807 ns/op, important_hits throughput: 38.995 ± 0.597 M ops/s num_maps: 24 local_storage cache sequential get: hits throughput: 92.447 ± 5.170 M ops/s, hits latency: 10.817 ns/op, important_hits throughput: 3.855 ± 0.216 M ops/s local_storage cache interleaved get: hits throughput: 128.844 ± 2.808 M ops/s, hits latency: 7.761 ns/op, important_hits throughput: 36.245 ± 0.790 M ops/s num_maps: 32 local_storage cache sequential get: hits throughput: 102.042 ± 1.462 M ops/s, hits latency: 9.800 ns/op, important_hits throughput: 3.194 ± 0.046 M ops/s local_storage cache interleaved get: hits throughput: 126.577 ± 1.818 M ops/s, hits latency: 7.900 ns/op, important_hits throughput: 35.332 ± 0.507 M ops/s num_maps: 100 local_storage cache sequential get: hits throughput: 111.327 ± 1.401 M ops/s, hits latency: 8.983 ns/op, important_hits throughput: 1.113 ± 0.014 M ops/s local_storage cache interleaved get: hits throughput: 131.327 ± 1.339 M ops/s, hits latency: 7.615 ns/op, important_hits throughput: 34.302 ± 0.350 M ops/s num_maps: 1000 local_storage cache sequential get: hits throughput: 101.978 ± 0.563 M ops/s, hits latency: 9.806 ns/op, important_hits throughput: 0.102 ± 0.001 M ops/s local_storage cache interleaved get: hits throughput: 141.084 ± 1.098 M ops/s, hits latency: 7.088 ns/op, important_hits throughput: 35.430 ± 0.276 M ops/s Adjusting for overhead, latency numbers for "hashmap control" and "sequential get" are: hashmap_control_1k: ~53.8ns hashmap_control_10k: ~124.2ns hashmap_control_100k: ~206.5ns sequential_get_1: ~12.1ns sequential_get_10: ~16.0ns sequential_get_16: ~13.8ns sequential_get_17: ~16.8ns sequential_get_24: ~40.9ns sequential_get_32: ~65.2ns sequential_get_100: ~148.2ns sequential_get_1000: ~2204ns Clearly demonstrating a cliff. In the discussion for v1 of this patch, Alexei noted that local_storage was 2.5x faster than a large hashmap when initially implemented [1]. The benchmark results show that local_storage is 5-10x faster: a long-running BPF application putting some pid-specific info into a hashmap for each pid it sees will probably see on the order of 10-100k pids. Bench numbers for hashmaps of this size are ~10x slower than sequential_get_16, but as the number of local_storage maps grows far past local_storage cache size the performance advantage shrinks and eventually reverses. When running the benchmarks it may be necessary to bump 'open files' ulimit for a successful run. [0]: https://lore.kernel.org/all/20220420002143.1096548-1-davemarchevsky@fb.com [1]: https://lore.kernel.org/bpf/20220511173305.ftldpn23m4ski3d3@MBP-98dd607d3435.dhcp.thefacebook.com/ Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20220620222554.270578-1-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 4 +- tools/testing/selftests/bpf/bench.c | 55 ++++ tools/testing/selftests/bpf/bench.h | 4 + .../bpf/benchs/bench_local_storage.c | 287 ++++++++++++++++++ .../bpf/benchs/run_bench_local_storage.sh | 24 ++ .../selftests/bpf/benchs/run_common.sh | 17 ++ .../selftests/bpf/progs/local_storage_bench.c | 104 +++++++ 7 files changed, 494 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/benchs/bench_local_storage.c create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_local_storage.sh create mode 100644 tools/testing/selftests/bpf/progs/local_storage_bench.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index cb8e552e1418..4fbd88a8ed9e 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -571,6 +571,7 @@ $(OUTPUT)/bench_bloom_filter_map.o: $(OUTPUT)/bloom_filter_bench.skel.h $(OUTPUT)/bench_bpf_loop.o: $(OUTPUT)/bpf_loop_bench.skel.h $(OUTPUT)/bench_strncmp.o: $(OUTPUT)/strncmp_bench.skel.h $(OUTPUT)/bench_bpf_hashmap_full_update.o: $(OUTPUT)/bpf_hashmap_full_update_bench.skel.h +$(OUTPUT)/bench_local_storage.o: $(OUTPUT)/local_storage_bench.skel.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ) $(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: $(OUTPUT)/bench.o \ @@ -583,7 +584,8 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(OUTPUT)/bench_bloom_filter_map.o \ $(OUTPUT)/bench_bpf_loop.o \ $(OUTPUT)/bench_strncmp.o \ - $(OUTPUT)/bench_bpf_hashmap_full_update.o + $(OUTPUT)/bench_bpf_hashmap_full_update.o \ + $(OUTPUT)/bench_local_storage.o $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index d8aa62be996b..1e7b5d4b1f11 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -150,6 +150,53 @@ void ops_report_final(struct bench_res res[], int res_cnt) printf("latency %8.3lf ns/op\n", 1000.0 / hits_mean * env.producer_cnt); } +void local_storage_report_progress(int iter, struct bench_res *res, + long delta_ns) +{ + double important_hits_per_sec, hits_per_sec; + double delta_sec = delta_ns / 1000000000.0; + + hits_per_sec = res->hits / 1000000.0 / delta_sec; + important_hits_per_sec = res->important_hits / 1000000.0 / delta_sec; + + printf("Iter %3d (%7.3lfus): ", iter, (delta_ns - 1000000000) / 1000.0); + + printf("hits %8.3lfM/s ", hits_per_sec); + printf("important_hits %8.3lfM/s\n", important_hits_per_sec); +} + +void local_storage_report_final(struct bench_res res[], int res_cnt) +{ + double important_hits_mean = 0.0, important_hits_stddev = 0.0; + double hits_mean = 0.0, hits_stddev = 0.0; + int i; + + for (i = 0; i < res_cnt; i++) { + hits_mean += res[i].hits / 1000000.0 / (0.0 + res_cnt); + important_hits_mean += res[i].important_hits / 1000000.0 / (0.0 + res_cnt); + } + + if (res_cnt > 1) { + for (i = 0; i < res_cnt; i++) { + hits_stddev += (hits_mean - res[i].hits / 1000000.0) * + (hits_mean - res[i].hits / 1000000.0) / + (res_cnt - 1.0); + important_hits_stddev += + (important_hits_mean - res[i].important_hits / 1000000.0) * + (important_hits_mean - res[i].important_hits / 1000000.0) / + (res_cnt - 1.0); + } + + hits_stddev = sqrt(hits_stddev); + important_hits_stddev = sqrt(important_hits_stddev); + } + printf("Summary: hits throughput %8.3lf \u00B1 %5.3lf M ops/s, ", + hits_mean, hits_stddev); + printf("hits latency %8.3lf ns/op, ", 1000.0 / hits_mean); + printf("important_hits throughput %8.3lf \u00B1 %5.3lf M ops/s\n", + important_hits_mean, important_hits_stddev); +} + const char *argp_program_version = "benchmark"; const char *argp_program_bug_address = ""; const char argp_program_doc[] = @@ -188,12 +235,14 @@ static const struct argp_option opts[] = { extern struct argp bench_ringbufs_argp; extern struct argp bench_bloom_map_argp; extern struct argp bench_bpf_loop_argp; +extern struct argp bench_local_storage_argp; extern struct argp bench_strncmp_argp; static const struct argp_child bench_parsers[] = { { &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 }, { &bench_bloom_map_argp, 0, "Bloom filter map benchmark", 0 }, { &bench_bpf_loop_argp, 0, "bpf_loop helper benchmark", 0 }, + { &bench_local_storage_argp, 0, "local_storage benchmark", 0 }, { &bench_strncmp_argp, 0, "bpf_strncmp helper benchmark", 0 }, {}, }; @@ -397,6 +446,9 @@ extern const struct bench bench_bpf_loop; extern const struct bench bench_strncmp_no_helper; extern const struct bench bench_strncmp_helper; extern const struct bench bench_bpf_hashmap_full_update; +extern const struct bench bench_local_storage_cache_seq_get; +extern const struct bench bench_local_storage_cache_interleaved_get; +extern const struct bench bench_local_storage_cache_hashmap_control; static const struct bench *benchs[] = { &bench_count_global, @@ -432,6 +484,9 @@ static const struct bench *benchs[] = { &bench_strncmp_no_helper, &bench_strncmp_helper, &bench_bpf_hashmap_full_update, + &bench_local_storage_cache_seq_get, + &bench_local_storage_cache_interleaved_get, + &bench_local_storage_cache_hashmap_control, }; static void setup_benchmark() diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h index fb3e213df3dc..4b15286753ba 100644 --- a/tools/testing/selftests/bpf/bench.h +++ b/tools/testing/selftests/bpf/bench.h @@ -34,6 +34,7 @@ struct bench_res { long hits; long drops; long false_hits; + long important_hits; }; struct bench { @@ -61,6 +62,9 @@ void false_hits_report_progress(int iter, struct bench_res *res, long delta_ns); void false_hits_report_final(struct bench_res res[], int res_cnt); void ops_report_progress(int iter, struct bench_res *res, long delta_ns); void ops_report_final(struct bench_res res[], int res_cnt); +void local_storage_report_progress(int iter, struct bench_res *res, + long delta_ns); +void local_storage_report_final(struct bench_res res[], int res_cnt); static inline __u64 get_time_ns(void) { diff --git a/tools/testing/selftests/bpf/benchs/bench_local_storage.c b/tools/testing/selftests/bpf/benchs/bench_local_storage.c new file mode 100644 index 000000000000..5a378c84e81f --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_local_storage.c @@ -0,0 +1,287 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ + +#include +#include + +#include "local_storage_bench.skel.h" +#include "bench.h" + +#include + +static struct { + __u32 nr_maps; + __u32 hashmap_nr_keys_used; +} args = { + .nr_maps = 1000, + .hashmap_nr_keys_used = 1000, +}; + +enum { + ARG_NR_MAPS = 6000, + ARG_HASHMAP_NR_KEYS_USED = 6001, +}; + +static const struct argp_option opts[] = { + { "nr_maps", ARG_NR_MAPS, "NR_MAPS", 0, + "Set number of local_storage maps"}, + { "hashmap_nr_keys_used", ARG_HASHMAP_NR_KEYS_USED, "NR_KEYS", + 0, "When doing hashmap test, set number of hashmap keys test uses"}, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + long ret; + + switch (key) { + case ARG_NR_MAPS: + ret = strtol(arg, NULL, 10); + if (ret < 1 || ret > UINT_MAX) { + fprintf(stderr, "invalid nr_maps"); + argp_usage(state); + } + args.nr_maps = ret; + break; + case ARG_HASHMAP_NR_KEYS_USED: + ret = strtol(arg, NULL, 10); + if (ret < 1 || ret > UINT_MAX) { + fprintf(stderr, "invalid hashmap_nr_keys_used"); + argp_usage(state); + } + args.hashmap_nr_keys_used = ret; + break; + default: + return ARGP_ERR_UNKNOWN; + } + + return 0; +} + +const struct argp bench_local_storage_argp = { + .options = opts, + .parser = parse_arg, +}; + +/* Keep in sync w/ array of maps in bpf */ +#define MAX_NR_MAPS 1000 +/* keep in sync w/ same define in bpf */ +#define HASHMAP_SZ 4194304 + +static void validate(void) +{ + if (env.producer_cnt != 1) { + fprintf(stderr, "benchmark doesn't support multi-producer!\n"); + exit(1); + } + if (env.consumer_cnt != 1) { + fprintf(stderr, "benchmark doesn't support multi-consumer!\n"); + exit(1); + } + + if (args.nr_maps > MAX_NR_MAPS) { + fprintf(stderr, "nr_maps must be <= 1000\n"); + exit(1); + } + + if (args.hashmap_nr_keys_used > HASHMAP_SZ) { + fprintf(stderr, "hashmap_nr_keys_used must be <= %u\n", HASHMAP_SZ); + exit(1); + } +} + +static struct { + struct local_storage_bench *skel; + void *bpf_obj; + struct bpf_map *array_of_maps; +} ctx; + +static void prepopulate_hashmap(int fd) +{ + int i, key, val; + + /* local_storage gets will have BPF_LOCAL_STORAGE_GET_F_CREATE flag set, so + * populate the hashmap for a similar comparison + */ + for (i = 0; i < HASHMAP_SZ; i++) { + key = val = i; + if (bpf_map_update_elem(fd, &key, &val, 0)) { + fprintf(stderr, "Error prepopulating hashmap (key %d)\n", key); + exit(1); + } + } +} + +static void __setup(struct bpf_program *prog, bool hashmap) +{ + struct bpf_map *inner_map; + int i, fd, mim_fd, err; + + LIBBPF_OPTS(bpf_map_create_opts, create_opts); + + if (!hashmap) + create_opts.map_flags = BPF_F_NO_PREALLOC; + + ctx.skel->rodata->num_maps = args.nr_maps; + ctx.skel->rodata->hashmap_num_keys = args.hashmap_nr_keys_used; + inner_map = bpf_map__inner_map(ctx.array_of_maps); + create_opts.btf_key_type_id = bpf_map__btf_key_type_id(inner_map); + create_opts.btf_value_type_id = bpf_map__btf_value_type_id(inner_map); + + err = local_storage_bench__load(ctx.skel); + if (err) { + fprintf(stderr, "Error loading skeleton\n"); + goto err_out; + } + + create_opts.btf_fd = bpf_object__btf_fd(ctx.skel->obj); + + mim_fd = bpf_map__fd(ctx.array_of_maps); + if (mim_fd < 0) { + fprintf(stderr, "Error getting map_in_map fd\n"); + goto err_out; + } + + for (i = 0; i < args.nr_maps; i++) { + if (hashmap) + fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(int), + sizeof(int), HASHMAP_SZ, &create_opts); + else + fd = bpf_map_create(BPF_MAP_TYPE_TASK_STORAGE, NULL, sizeof(int), + sizeof(int), 0, &create_opts); + if (fd < 0) { + fprintf(stderr, "Error creating map %d: %d\n", i, fd); + goto err_out; + } + + if (hashmap) + prepopulate_hashmap(fd); + + err = bpf_map_update_elem(mim_fd, &i, &fd, 0); + if (err) { + fprintf(stderr, "Error updating array-of-maps w/ map %d\n", i); + goto err_out; + } + } + + if (!bpf_program__attach(prog)) { + fprintf(stderr, "Error attaching bpf program\n"); + goto err_out; + } + + return; +err_out: + exit(1); +} + +static void hashmap_setup(void) +{ + struct local_storage_bench *skel; + + setup_libbpf(); + + skel = local_storage_bench__open(); + ctx.skel = skel; + ctx.array_of_maps = skel->maps.array_of_hash_maps; + skel->rodata->use_hashmap = 1; + skel->rodata->interleave = 0; + + __setup(skel->progs.get_local, true); +} + +static void local_storage_cache_get_setup(void) +{ + struct local_storage_bench *skel; + + setup_libbpf(); + + skel = local_storage_bench__open(); + ctx.skel = skel; + ctx.array_of_maps = skel->maps.array_of_local_storage_maps; + skel->rodata->use_hashmap = 0; + skel->rodata->interleave = 0; + + __setup(skel->progs.get_local, false); +} + +static void local_storage_cache_get_interleaved_setup(void) +{ + struct local_storage_bench *skel; + + setup_libbpf(); + + skel = local_storage_bench__open(); + ctx.skel = skel; + ctx.array_of_maps = skel->maps.array_of_local_storage_maps; + skel->rodata->use_hashmap = 0; + skel->rodata->interleave = 1; + + __setup(skel->progs.get_local, false); +} + +static void measure(struct bench_res *res) +{ + res->hits = atomic_swap(&ctx.skel->bss->hits, 0); + res->important_hits = atomic_swap(&ctx.skel->bss->important_hits, 0); +} + +static inline void trigger_bpf_program(void) +{ + syscall(__NR_getpgid); +} + +static void *consumer(void *input) +{ + return NULL; +} + +static void *producer(void *input) +{ + while (true) + trigger_bpf_program(); + + return NULL; +} + +/* cache sequential and interleaved get benchs test local_storage get + * performance, specifically they demonstrate performance cliff of + * current list-plus-cache local_storage model. + * + * cache sequential get: call bpf_task_storage_get on n maps in order + * cache interleaved get: like "sequential get", but interleave 4 calls to the + * 'important' map (idx 0 in array_of_maps) for every 10 calls. Goal + * is to mimic environment where many progs are accessing their local_storage + * maps, with 'our' prog needing to access its map more often than others + */ +const struct bench bench_local_storage_cache_seq_get = { + .name = "local-storage-cache-seq-get", + .validate = validate, + .setup = local_storage_cache_get_setup, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = local_storage_report_progress, + .report_final = local_storage_report_final, +}; + +const struct bench bench_local_storage_cache_interleaved_get = { + .name = "local-storage-cache-int-get", + .validate = validate, + .setup = local_storage_cache_get_interleaved_setup, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = local_storage_report_progress, + .report_final = local_storage_report_final, +}; + +const struct bench bench_local_storage_cache_hashmap_control = { + .name = "local-storage-cache-hashmap-control", + .validate = validate, + .setup = hashmap_setup, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = local_storage_report_progress, + .report_final = local_storage_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/run_bench_local_storage.sh b/tools/testing/selftests/bpf/benchs/run_bench_local_storage.sh new file mode 100755 index 000000000000..2eb2b513a173 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/run_bench_local_storage.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source ./benchs/run_common.sh + +set -eufo pipefail + +header "Hashmap Control" +for i in 10 1000 10000 100000 4194304; do +subtitle "num keys: $i" + summarize_local_storage "hashmap (control) sequential get: "\ + "$(./bench --nr_maps 1 --hashmap_nr_keys_used=$i local-storage-cache-hashmap-control)" + printf "\n" +done + +header "Local Storage" +for i in 1 10 16 17 24 32 100 1000; do +subtitle "num_maps: $i" + summarize_local_storage "local_storage cache sequential get: "\ + "$(./bench --nr_maps $i local-storage-cache-seq-get)" + summarize_local_storage "local_storage cache interleaved get: "\ + "$(./bench --nr_maps $i local-storage-cache-int-get)" + printf "\n" +done diff --git a/tools/testing/selftests/bpf/benchs/run_common.sh b/tools/testing/selftests/bpf/benchs/run_common.sh index 6c5e6023a69f..d9f40af82006 100644 --- a/tools/testing/selftests/bpf/benchs/run_common.sh +++ b/tools/testing/selftests/bpf/benchs/run_common.sh @@ -41,6 +41,16 @@ function ops() echo "$*" | sed -E "s/.*latency\s+([0-9]+\.[0-9]+\sns\/op).*/\1/" } +function local_storage() +{ + echo -n "hits throughput: " + echo -n "$*" | sed -E "s/.* hits throughput\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+\sM\sops\/s).*/\1/" + echo -n -e ", hits latency: " + echo -n "$*" | sed -E "s/.* hits latency\s+([0-9]+\.[0-9]+\sns\/op).*/\1/" + echo -n ", important_hits throughput: " + echo "$*" | sed -E "s/.*important_hits throughput\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+\sM\sops\/s).*/\1/" +} + function total() { echo "$*" | sed -E "s/.*total operations\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+M\/s).*/\1/" @@ -67,6 +77,13 @@ function summarize_ops() printf "%-20s %s\n" "$bench" "$(ops $summary)" } +function summarize_local_storage() +{ + bench="$1" + summary=$(echo $2 | tail -n1) + printf "%-20s %s\n" "$bench" "$(local_storage $summary)" +} + function summarize_total() { bench="$1" diff --git a/tools/testing/selftests/bpf/progs/local_storage_bench.c b/tools/testing/selftests/bpf/progs/local_storage_bench.c new file mode 100644 index 000000000000..2c3234c5b73a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/local_storage_bench.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ + +#include "vmlinux.h" +#include +#include "bpf_misc.h" + +#define HASHMAP_SZ 4194304 + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 1000); + __type(key, int); + __type(value, int); + __array(values, struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, int); + }); +} array_of_local_storage_maps SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 1000); + __type(key, int); + __type(value, int); + __array(values, struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, HASHMAP_SZ); + __type(key, int); + __type(value, int); + }); +} array_of_hash_maps SEC(".maps"); + +long important_hits; +long hits; + +/* set from user-space */ +const volatile unsigned int use_hashmap; +const volatile unsigned int hashmap_num_keys; +const volatile unsigned int num_maps; +const volatile unsigned int interleave; + +struct loop_ctx { + struct task_struct *task; + long loop_hits; + long loop_important_hits; +}; + +static int do_lookup(unsigned int elem, struct loop_ctx *lctx) +{ + void *map, *inner_map; + int idx = 0; + + if (use_hashmap) + map = &array_of_hash_maps; + else + map = &array_of_local_storage_maps; + + inner_map = bpf_map_lookup_elem(map, &elem); + if (!inner_map) + return -1; + + if (use_hashmap) { + idx = bpf_get_prandom_u32() % hashmap_num_keys; + bpf_map_lookup_elem(inner_map, &idx); + } else { + bpf_task_storage_get(inner_map, lctx->task, &idx, + BPF_LOCAL_STORAGE_GET_F_CREATE); + } + + lctx->loop_hits++; + if (!elem) + lctx->loop_important_hits++; + return 0; +} + +static long loop(u32 index, void *ctx) +{ + struct loop_ctx *lctx = (struct loop_ctx *)ctx; + unsigned int map_idx = index % num_maps; + + do_lookup(map_idx, lctx); + if (interleave && map_idx % 3 == 0) + do_lookup(0, lctx); + return 0; +} + +SEC("fentry/" SYS_PREFIX "sys_getpgid") +int get_local(void *ctx) +{ + struct loop_ctx lctx; + + lctx.task = bpf_get_current_task_btf(); + lctx.loop_hits = 0; + lctx.loop_important_hits = 0; + bpf_loop(10000, &loop, &lctx, 0); + __sync_add_and_fetch(&hits, lctx.loop_hits); + __sync_add_and_fetch(&important_hits, lctx.loop_important_hits); + return 0; +} + +char _license[] SEC("license") = "GPL"; From 9676feccacdb0571791c88b23e3b7ac4e7c9c457 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Wed, 22 Jun 2022 21:50:02 +0800 Subject: [PATCH 16/94] test_bpf: fix incorrect netdev features The prototype of .features is netdev_features_t, it should use NETIF_F_LLTX and NETIF_F_HW_VLAN_STAG_TX, not NETIF_F_LLTX_BIT and NETIF_F_HW_VLAN_STAG_TX_BIT. Fixes: cf204a718357 ("bpf, testing: Introduce 'gso_linear_no_head_frag' skb_segment test") Signed-off-by: Jian Shen Acked-by: John Fastabend Link: https://lore.kernel.org/r/20220622135002.8263-1-shenjian15@huawei.com Signed-off-by: Alexei Starovoitov --- lib/test_bpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 2a7836e115b4..5820704165a6 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -14733,9 +14733,9 @@ static struct skb_segment_test skb_segment_tests[] __initconst = { .build_skb = build_test_skb_linear_no_head_frag, .features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_GSO | - NETIF_F_LLTX_BIT | NETIF_F_GRO | + NETIF_F_LLTX | NETIF_F_GRO | NETIF_F_IPV6_CSUM | NETIF_F_RXCSUM | - NETIF_F_HW_VLAN_STAG_TX_BIT + NETIF_F_HW_VLAN_STAG_TX } }; From 41c95dd6a604dc3f5fae55c99c138cc8e7fec76e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn-Thorben=20Hinz?= Date: Wed, 22 Jun 2022 21:12:23 +0200 Subject: [PATCH 17/94] bpf: Allow a TCP CC to write sk_pacing_rate and sk_pacing_status MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A CC that implements tcp_congestion_ops.cong_control() should be able to control sk_pacing_rate and set sk_pacing_status, since tcp_update_pacing_rate() is never called in this case. A built-in CC or one from a kernel module is already able to write to both struct sock members. For a BPF program, write access has not been allowed, yet. Signed-off-by: Jörn-Thorben Hinz Reviewed-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220622191227.898118-2-jthinz@mailbox.tu-berlin.de Signed-off-by: Alexei Starovoitov --- net/ipv4/bpf_tcp_ca.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index f79ab942f03b..1f5c53ede4e5 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -111,6 +111,12 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, } switch (off) { + case offsetof(struct sock, sk_pacing_rate): + end = offsetofend(struct sock, sk_pacing_rate); + break; + case offsetof(struct sock, sk_pacing_status): + end = offsetofend(struct sock, sk_pacing_status); + break; case bpf_ctx_range(struct inet_connection_sock, icsk_ca_priv): end = offsetofend(struct inet_connection_sock, icsk_ca_priv); break; From 9f0265e921dee14096943ee11f793fa076aa7a72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn-Thorben=20Hinz?= Date: Wed, 22 Jun 2022 21:12:24 +0200 Subject: [PATCH 18/94] bpf: Require only one of cong_avoid() and cong_control() from a TCP CC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the check for required and optional functions in a struct tcp_congestion_ops from bpf_tcp_ca.c. Rely on tcp_register_congestion_control() to reject a BPF CC that does not implement all required functions, as it will do for a non-BPF CC. When a CC implements tcp_congestion_ops.cong_control(), the alternate cong_avoid() is not in use in the TCP stack. Previously, a BPF CC was still forced to implement cong_avoid() as a no-op since it was non-optional in bpf_tcp_ca.c. Signed-off-by: Jörn-Thorben Hinz Reviewed-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220622191227.898118-3-jthinz@mailbox.tu-berlin.de Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_struct_ops.c | 7 +++---- net/ipv4/bpf_tcp_ca.c | 33 --------------------------------- 2 files changed, 3 insertions(+), 37 deletions(-) diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index d9a3c9207240..7e0068c3399c 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -503,10 +503,9 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto unlock; } - /* Error during st_ops->reg(). It is very unlikely since - * the above init_member() should have caught it earlier - * before reg(). The only possibility is if there was a race - * in registering the struct_ops (under the same name) to + /* Error during st_ops->reg(). Can happen if this struct_ops needs to be + * verified as a whole, after all init_member() calls. Can also happen if + * there was a race in registering the struct_ops (under the same name) to * a sub-system through different struct_ops's maps. */ set_memory_nx((long)st_map->image, 1); diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 1f5c53ede4e5..7a181631b995 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -14,18 +14,6 @@ /* "extern" is to avoid sparse warning. It is only used in bpf_struct_ops.c. */ extern struct bpf_struct_ops bpf_tcp_congestion_ops; -static u32 optional_ops[] = { - offsetof(struct tcp_congestion_ops, init), - offsetof(struct tcp_congestion_ops, release), - offsetof(struct tcp_congestion_ops, set_state), - offsetof(struct tcp_congestion_ops, cwnd_event), - offsetof(struct tcp_congestion_ops, in_ack_event), - offsetof(struct tcp_congestion_ops, pkts_acked), - offsetof(struct tcp_congestion_ops, min_tso_segs), - offsetof(struct tcp_congestion_ops, sndbuf_expand), - offsetof(struct tcp_congestion_ops, cong_control), -}; - static u32 unsupported_ops[] = { offsetof(struct tcp_congestion_ops, get_info), }; @@ -51,18 +39,6 @@ static int bpf_tcp_ca_init(struct btf *btf) return 0; } -static bool is_optional(u32 member_offset) -{ - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(optional_ops); i++) { - if (member_offset == optional_ops[i]) - return true; - } - - return false; -} - static bool is_unsupported(u32 member_offset) { unsigned int i; @@ -246,7 +222,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t, { const struct tcp_congestion_ops *utcp_ca; struct tcp_congestion_ops *tcp_ca; - int prog_fd; u32 moff; utcp_ca = (const struct tcp_congestion_ops *)udata; @@ -268,14 +243,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t, return 1; } - if (!btf_type_resolve_func_ptr(btf_vmlinux, member->type, NULL)) - return 0; - - /* Ensure bpf_prog is provided for compulsory func ptr */ - prog_fd = (int)(*(unsigned long *)(udata + moff)); - if (!prog_fd && !is_optional(moff) && !is_unsupported(moff)) - return -EINVAL; - return 0; } From 6e945d57cc9f6e27893d57a419434a2859ba6f3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn-Thorben=20Hinz?= Date: Wed, 22 Jun 2022 21:12:25 +0200 Subject: [PATCH 19/94] selftests/bpf: Test a BPF CC writing sk_pacing_* MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test whether a TCP CC implemented in BPF is allowed to write sk_pacing_rate and sk_pacing_status in struct sock. This is needed when cong_control() is implemented and used. Signed-off-by: Jörn-Thorben Hinz Link: https://lore.kernel.org/r/20220622191227.898118-4-jthinz@mailbox.tu-berlin.de Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/bpf_tcp_ca.c | 19 ++++++ .../bpf/progs/tcp_ca_write_sk_pacing.c | 60 +++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index e9a9a31b2ffe..e79f3f5a9d33 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -9,6 +9,7 @@ #include "bpf_cubic.skel.h" #include "bpf_tcp_nogpl.skel.h" #include "bpf_dctcp_release.skel.h" +#include "tcp_ca_write_sk_pacing.skel.h" #ifndef ENOTSUPP #define ENOTSUPP 524 @@ -322,6 +323,22 @@ static void test_rel_setsockopt(void) bpf_dctcp_release__destroy(rel_skel); } +static void test_write_sk_pacing(void) +{ + struct tcp_ca_write_sk_pacing *skel; + struct bpf_link *link; + + skel = tcp_ca_write_sk_pacing__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + link = bpf_map__attach_struct_ops(skel->maps.write_sk_pacing); + ASSERT_OK_PTR(link, "attach_struct_ops"); + + bpf_link__destroy(link); + tcp_ca_write_sk_pacing__destroy(skel); +} + void test_bpf_tcp_ca(void) { if (test__start_subtest("dctcp")) @@ -334,4 +351,6 @@ void test_bpf_tcp_ca(void) test_dctcp_fallback(); if (test__start_subtest("rel_setsockopt")) test_rel_setsockopt(); + if (test__start_subtest("write_sk_pacing")) + test_write_sk_pacing(); } diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c new file mode 100644 index 000000000000..43447704cf0e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" + +#include +#include + +char _license[] SEC("license") = "GPL"; + +#define USEC_PER_SEC 1000000UL + +#define min(a, b) ((a) < (b) ? (a) : (b)) + +static inline struct tcp_sock *tcp_sk(const struct sock *sk) +{ + return (struct tcp_sock *)sk; +} + +SEC("struct_ops/write_sk_pacing_init") +void BPF_PROG(write_sk_pacing_init, struct sock *sk) +{ +#ifdef ENABLE_ATOMICS_TESTS + __sync_bool_compare_and_swap(&sk->sk_pacing_status, SK_PACING_NONE, + SK_PACING_NEEDED); +#else + sk->sk_pacing_status = SK_PACING_NEEDED; +#endif +} + +SEC("struct_ops/write_sk_pacing_cong_control") +void BPF_PROG(write_sk_pacing_cong_control, struct sock *sk, + const struct rate_sample *rs) +{ + const struct tcp_sock *tp = tcp_sk(sk); + unsigned long rate = + ((tp->snd_cwnd * tp->mss_cache * USEC_PER_SEC) << 3) / + (tp->srtt_us ?: 1U << 3); + sk->sk_pacing_rate = min(rate, sk->sk_max_pacing_rate); +} + +SEC("struct_ops/write_sk_pacing_ssthresh") +__u32 BPF_PROG(write_sk_pacing_ssthresh, struct sock *sk) +{ + return tcp_sk(sk)->snd_ssthresh; +} + +SEC("struct_ops/write_sk_pacing_undo_cwnd") +__u32 BPF_PROG(write_sk_pacing_undo_cwnd, struct sock *sk) +{ + return tcp_sk(sk)->snd_cwnd; +} + +SEC(".struct_ops") +struct tcp_congestion_ops write_sk_pacing = { + .init = (void *)write_sk_pacing_init, + .cong_control = (void *)write_sk_pacing_cong_control, + .ssthresh = (void *)write_sk_pacing_ssthresh, + .undo_cwnd = (void *)write_sk_pacing_undo_cwnd, + .name = "bpf_w_sk_pacing", +}; From 0735627d78caa56f219dc14608ce0bdbd045e07e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn-Thorben=20Hinz?= Date: Wed, 22 Jun 2022 21:12:26 +0200 Subject: [PATCH 20/94] selftests/bpf: Test an incomplete BPF CC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test whether a TCP CC implemented in BPF providing neither cong_avoid() nor cong_control() is correctly rejected. This check solely depends on tcp_register_congestion_control() now, which is invoked during bpf_map__attach_struct_ops(). Signed-off-by: Jörn-Thorben Hinz Link: https://lore.kernel.org/r/20220622191227.898118-5-jthinz@mailbox.tu-berlin.de Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/bpf_tcp_ca.c | 22 ++++++++++++ .../bpf/progs/tcp_ca_incompl_cong_ops.c | 35 +++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index e79f3f5a9d33..194d07310531 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -10,6 +10,7 @@ #include "bpf_tcp_nogpl.skel.h" #include "bpf_dctcp_release.skel.h" #include "tcp_ca_write_sk_pacing.skel.h" +#include "tcp_ca_incompl_cong_ops.skel.h" #ifndef ENOTSUPP #define ENOTSUPP 524 @@ -339,6 +340,25 @@ static void test_write_sk_pacing(void) tcp_ca_write_sk_pacing__destroy(skel); } +static void test_incompl_cong_ops(void) +{ + struct tcp_ca_incompl_cong_ops *skel; + struct bpf_link *link; + + skel = tcp_ca_incompl_cong_ops__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + /* That cong_avoid() and cong_control() are missing is only reported at + * this point: + */ + link = bpf_map__attach_struct_ops(skel->maps.incompl_cong_ops); + ASSERT_ERR_PTR(link, "attach_struct_ops"); + + bpf_link__destroy(link); + tcp_ca_incompl_cong_ops__destroy(skel); +} + void test_bpf_tcp_ca(void) { if (test__start_subtest("dctcp")) @@ -353,4 +373,6 @@ void test_bpf_tcp_ca(void) test_rel_setsockopt(); if (test__start_subtest("write_sk_pacing")) test_write_sk_pacing(); + if (test__start_subtest("incompl_cong_ops")) + test_incompl_cong_ops(); } diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c b/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c new file mode 100644 index 000000000000..7bb872fb22dd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" + +#include +#include + +char _license[] SEC("license") = "GPL"; + +static inline struct tcp_sock *tcp_sk(const struct sock *sk) +{ + return (struct tcp_sock *)sk; +} + +SEC("struct_ops/incompl_cong_ops_ssthresh") +__u32 BPF_PROG(incompl_cong_ops_ssthresh, struct sock *sk) +{ + return tcp_sk(sk)->snd_ssthresh; +} + +SEC("struct_ops/incompl_cong_ops_undo_cwnd") +__u32 BPF_PROG(incompl_cong_ops_undo_cwnd, struct sock *sk) +{ + return tcp_sk(sk)->snd_cwnd; +} + +SEC(".struct_ops") +struct tcp_congestion_ops incompl_cong_ops = { + /* Intentionally leaving out any of the required cong_avoid() and + * cong_control() here. + */ + .ssthresh = (void *)incompl_cong_ops_ssthresh, + .undo_cwnd = (void *)incompl_cong_ops_undo_cwnd, + .name = "bpf_incompl_ops", +}; From f14a3f644a1c5a2e2dbe6073f51793119a12e6ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn-Thorben=20Hinz?= Date: Wed, 22 Jun 2022 21:12:27 +0200 Subject: [PATCH 21/94] selftests/bpf: Test a BPF CC implementing the unsupported get_info() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test whether a TCP CC implemented in BPF providing get_info() is rejected correctly. get_info() is unsupported in a BPF CC. The check for required functions in a BPF CC has moved, this test ensures unsupported functions are still rejected correctly. Signed-off-by: Jörn-Thorben Hinz Reviewed-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220622191227.898118-6-jthinz@mailbox.tu-berlin.de Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/bpf_tcp_ca.c | 20 ++++++++++++++++++ .../bpf/progs/tcp_ca_unsupp_cong_op.c | 21 +++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 194d07310531..2959a52ced06 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -11,6 +11,7 @@ #include "bpf_dctcp_release.skel.h" #include "tcp_ca_write_sk_pacing.skel.h" #include "tcp_ca_incompl_cong_ops.skel.h" +#include "tcp_ca_unsupp_cong_op.skel.h" #ifndef ENOTSUPP #define ENOTSUPP 524 @@ -359,6 +360,23 @@ static void test_incompl_cong_ops(void) tcp_ca_incompl_cong_ops__destroy(skel); } +static void test_unsupp_cong_op(void) +{ + libbpf_print_fn_t old_print_fn; + struct tcp_ca_unsupp_cong_op *skel; + + err_str = "attach to unsupported member get_info"; + found = false; + old_print_fn = libbpf_set_print(libbpf_debug_print); + + skel = tcp_ca_unsupp_cong_op__open_and_load(); + ASSERT_NULL(skel, "open_and_load"); + ASSERT_EQ(found, true, "expected_err_msg"); + + tcp_ca_unsupp_cong_op__destroy(skel); + libbpf_set_print(old_print_fn); +} + void test_bpf_tcp_ca(void) { if (test__start_subtest("dctcp")) @@ -375,4 +393,6 @@ void test_bpf_tcp_ca(void) test_write_sk_pacing(); if (test__start_subtest("incompl_cong_ops")) test_incompl_cong_ops(); + if (test__start_subtest("unsupp_cong_op")) + test_unsupp_cong_op(); } diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c b/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c new file mode 100644 index 000000000000..c06f4a41c21a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" + +#include +#include + +char _license[] SEC("license") = "GPL"; + +SEC("struct_ops/unsupp_cong_op_get_info") +size_t BPF_PROG(unsupp_cong_op_get_info, struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) +{ + return 0; +} + +SEC(".struct_ops") +struct tcp_congestion_ops unsupp_cong_op = { + .get_info = (void *)unsupp_cong_op_get_info, + .name = "bpf_unsupp_op", +}; From 6dc7a0baf1a70b7d22662d38481824c14ddd80c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn-Thorben=20Hinz?= Date: Tue, 21 Jun 2022 09:01:16 +0200 Subject: [PATCH 22/94] selftests/bpf: Fix rare segfault in sock_fields prog test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_sock_fields__detach() got called with a null pointer here when one of the CHECKs or ASSERTs up to the test_sock_fields__open_and_load() call resulted in a jump to the "done" label. A skeletons *__detach() is not safe to call with a null pointer, though. This led to a segfault. Go the easy route and only call test_sock_fields__destroy() which is null-pointer safe and includes detaching. Came across this while looking[1] to introduce the usage of bpf_tcp_helpers.h (included in progs/test_sock_fields.c) together with vmlinux.h. [1] https://lore.kernel.org/bpf/629bc069dd807d7ac646f836e9dca28bbc1108e2.camel@mailbox.tu-berlin.de/ Fixes: 8f50f16ff39d ("selftests/bpf: Extend verifier and bpf_sock tests for dst_port loads") Signed-off-by: Jörn-Thorben Hinz Signed-off-by: Andrii Nakryiko Reviewed-by: Jakub Sitnicki Reviewed-by: Martin KaFai Lau Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220621070116.307221-1-jthinz@mailbox.tu-berlin.de --- tools/testing/selftests/bpf/prog_tests/sock_fields.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sock_fields.c b/tools/testing/selftests/bpf/prog_tests/sock_fields.c index 9d211b5c22c4..7d23166c77af 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_fields.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_fields.c @@ -394,7 +394,6 @@ void serial_test_sock_fields(void) test(); done: - test_sock_fields__detach(skel); test_sock_fields__destroy(skel); if (child_cg_fd >= 0) close(child_cg_fd); From 395e942d34a25824457da379baf434b5d6da4dcc Mon Sep 17 00:00:00 2001 From: Simon Wang Date: Tue, 21 Jun 2022 23:19:23 -0400 Subject: [PATCH 23/94] bpf: Replace hard-coded 0 with BPF_K in check_alu_op Enhance readability a bit. Signed-off-by: Simon Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220622031923.65692-1-wangchuanguo@inspur.com --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a20d7736a5b2..f228141c01c5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9096,7 +9096,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (opcode == BPF_END || opcode == BPF_NEG) { if (opcode == BPF_NEG) { - if (BPF_SRC(insn->code) != 0 || + if (BPF_SRC(insn->code) != BPF_K || insn->src_reg != BPF_REG_0 || insn->off != 0 || insn->imm != 0) { verbose(env, "BPF_NEG uses reserved fields\n"); From fb4e3b33e3e7f13befdf9ee232e34818c6cc5fb9 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 24 Jun 2022 05:06:12 +0300 Subject: [PATCH 24/94] bpf: Fix for use-after-free bug in inline_bpf_loop As reported by Dan Carpenter, the following statements in inline_bpf_loop() might cause a use-after-free bug: struct bpf_prog *new_prog; // ... new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt); // ... env->prog->insnsi[call_insn_offset].imm = callback_offset; The bpf_patch_insn_data() might free the memory used by env->prog. Fixes: 1ade23711971 ("bpf: Inline calls to bpf_loop when callback is known") Reported-by: Dan Carpenter Signed-off-by: Eduard Zingerman Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220624020613.548108-2-eddyz87@gmail.com --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f228141c01c5..4938477912cd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14417,7 +14417,7 @@ static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env, /* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */ call_insn_offset = position + 12; callback_offset = callback_start - call_insn_offset - 1; - env->prog->insnsi[call_insn_offset].imm = callback_offset; + new_prog->insnsi[call_insn_offset].imm = callback_offset; return new_prog; } From 41188e9e9defa1678abbf860ad7f6dd1ba48ad1c Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 24 Jun 2022 05:06:13 +0300 Subject: [PATCH 25/94] selftest/bpf: Test for use-after-free bug fix in inline_bpf_loop This test verifies that bpf_loop() inlining works as expected when address of `env->prog` is updated. This address is updated upon BPF program reallocation. Reallocation is handled by bpf_prog_realloc(), which reuses old memory if page boundary is not crossed. The value of `len` in the test is chosen to cross this boundary on bpf_loop() patching. Verify that the use-after-free bug in inline_bpf_loop() reported by Dan Carpenter is fixed. Signed-off-by: Eduard Zingerman Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220624020613.548108-3-eddyz87@gmail.com --- tools/testing/selftests/bpf/test_verifier.c | 39 +++++++++++++++++++ .../selftests/bpf/verifier/bpf_loop_inline.c | 11 ++++++ 2 files changed, 50 insertions(+) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 7fe897c66d81..f9d553fbf68a 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -425,6 +425,45 @@ static void bpf_fill_torturous_jumps(struct bpf_test *self) } } +static void bpf_fill_big_prog_with_loop_1(struct bpf_test *self) +{ + struct bpf_insn *insn = self->fill_insns; + /* This test was added to catch a specific use after free + * error, which happened upon BPF program reallocation. + * Reallocation is handled by core.c:bpf_prog_realloc, which + * reuses old memory if page boundary is not crossed. The + * value of `len` is chosen to cross this boundary on bpf_loop + * patching. + */ + const int len = getpagesize() - 25; + int callback_load_idx; + int callback_idx; + int i = 0; + + insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 1); + callback_load_idx = i; + insn[i++] = BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, + BPF_REG_2, BPF_PSEUDO_FUNC, 0, + 777 /* filled below */); + insn[i++] = BPF_RAW_INSN(0, 0, 0, 0, 0); + insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_3, 0); + insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_4, 0); + insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_loop); + + while (i < len - 3) + insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0); + insn[i++] = BPF_EXIT_INSN(); + + callback_idx = i; + insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0); + insn[i++] = BPF_EXIT_INSN(); + + insn[callback_load_idx].imm = callback_idx - callback_load_idx - 1; + self->func_info[1].insn_off = callback_idx; + self->prog_len = i; + assert(i == len); +} + /* BPF_SK_LOOKUP contains 13 instructions, if you need to fix up maps */ #define BPF_SK_LOOKUP(func) \ /* struct bpf_sock_tuple tuple = {} */ \ diff --git a/tools/testing/selftests/bpf/verifier/bpf_loop_inline.c b/tools/testing/selftests/bpf/verifier/bpf_loop_inline.c index 232da07c93b5..2d0023659d88 100644 --- a/tools/testing/selftests/bpf/verifier/bpf_loop_inline.c +++ b/tools/testing/selftests/bpf/verifier/bpf_loop_inline.c @@ -244,6 +244,17 @@ .func_info_cnt = 3, BTF_TYPES }, +{ + "inline bpf_loop call in a big program", + .insns = {}, + .fill_helper = bpf_fill_big_prog_with_loop_1, + .expected_insns = { PSEUDO_CALL_INSN() }, + .unexpected_insns = { HELPER_CALL_INSN() }, + .result = ACCEPT, + .func_info = { { 0, MAIN_TYPE }, { 16, CALLBACK_TYPE } }, + .func_info_cnt = 2, + BTF_TYPES +}, #undef HELPER_CALL_INSN #undef PSEUDO_CALL_INSN From b168852eb8eff57b63e72e3bc584bda3d2cb9577 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 16 Jun 2022 22:22:14 +0200 Subject: [PATCH 26/94] perf tools: Rework prologue generation code Some functions we use for bpf prologue generation are going to be deprecated. This change reworks current code not to use them. We need to replace following functions/struct: bpf_program__set_prep bpf_program__nth_fd struct bpf_prog_prep_result Currently we use bpf_program__set_prep to hook perf callback before program is loaded and provide new instructions with the prologue. We replace this function/ality by taking instructions for specific program, attaching prologue to them and load such new ebpf programs with prologue using separate bpf_prog_load calls (outside libbpf load machinery). Before we can take and use program instructions, we need libbpf to actually load it. This way we get the final shape of its instructions with all relocations and verifier adjustments). There's one glitch though.. perf kprobe program already assumes generated prologue code with proper values in argument registers, so loading such program directly will fail in the verifier. That's where the fallback pre-load handler fits in and prepends the initialization code to the program. Once such program is loaded we take its instructions, cut off the initialization code and prepend the prologue. I know.. sorry ;-) To have access to the program when loading this patch adds support to register 'fallback' section handler to take care of perf kprobe programs. The fallback means that it handles any section definition besides the ones that libbpf handles. The handler serves two purposes: - allows perf programs to have special arguments in section name - allows perf to use pre-load callback where we can attach init code (zeroing all argument registers) to each perf program Suggested-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Tested-by: Arnaldo Carvalho de Melo Acked-by: Arnaldo Carvalho de Melo Link: https://lore.kernel.org/bpf/20220616202214.70359-2-jolsa@kernel.org --- tools/perf/util/bpf-loader.c | 204 ++++++++++++++++++++++++++++++----- 1 file changed, 175 insertions(+), 29 deletions(-) diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index f8ad581ea247..6bd7c288e820 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -49,6 +50,7 @@ struct bpf_prog_priv { struct bpf_insn *insns_buf; int nr_types; int *type_mapping; + int *prologue_fds; }; struct bpf_perf_object { @@ -56,6 +58,11 @@ struct bpf_perf_object { struct bpf_object *obj; }; +struct bpf_preproc_result { + struct bpf_insn *new_insn_ptr; + int new_insn_cnt; +}; + static LIST_HEAD(bpf_objects_list); static struct hashmap *bpf_program_hash; static struct hashmap *bpf_map_hash; @@ -86,6 +93,7 @@ bpf_perf_object__next(struct bpf_perf_object *prev) (perf_obj) = (tmp), (tmp) = bpf_perf_object__next(tmp)) static bool libbpf_initialized; +static int libbpf_sec_handler; static int bpf_perf_object__add(struct bpf_object *obj) { @@ -99,12 +107,76 @@ static int bpf_perf_object__add(struct bpf_object *obj) return perf_obj ? 0 : -ENOMEM; } +static void *program_priv(const struct bpf_program *prog) +{ + void *priv; + + if (IS_ERR_OR_NULL(bpf_program_hash)) + return NULL; + if (!hashmap__find(bpf_program_hash, prog, &priv)) + return NULL; + return priv; +} + +static struct bpf_insn prologue_init_insn[] = { + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_MOV64_IMM(BPF_REG_5, 0), +}; + +static int libbpf_prog_prepare_load_fn(struct bpf_program *prog, + struct bpf_prog_load_opts *opts __maybe_unused, + long cookie __maybe_unused) +{ + size_t init_size_cnt = ARRAY_SIZE(prologue_init_insn); + size_t orig_insn_cnt, insn_cnt, init_size, orig_size; + struct bpf_prog_priv *priv = program_priv(prog); + const struct bpf_insn *orig_insn; + struct bpf_insn *insn; + + if (IS_ERR_OR_NULL(priv)) { + pr_debug("bpf: failed to get private field\n"); + return -BPF_LOADER_ERRNO__INTERNAL; + } + + if (!priv->need_prologue) + return 0; + + /* prepend initialization code to program instructions */ + orig_insn = bpf_program__insns(prog); + orig_insn_cnt = bpf_program__insn_cnt(prog); + init_size = init_size_cnt * sizeof(*insn); + orig_size = orig_insn_cnt * sizeof(*insn); + + insn_cnt = orig_insn_cnt + init_size_cnt; + insn = malloc(insn_cnt * sizeof(*insn)); + if (!insn) + return -ENOMEM; + + memcpy(insn, prologue_init_insn, init_size); + memcpy((char *) insn + init_size, orig_insn, orig_size); + bpf_program__set_insns(prog, insn, insn_cnt); + return 0; +} + static int libbpf_init(void) { + LIBBPF_OPTS(libbpf_prog_handler_opts, handler_opts, + .prog_prepare_load_fn = libbpf_prog_prepare_load_fn, + ); + if (libbpf_initialized) return 0; libbpf_set_print(libbpf_perf_print); + libbpf_sec_handler = libbpf_register_prog_handler(NULL, BPF_PROG_TYPE_KPROBE, + 0, &handler_opts); + if (libbpf_sec_handler < 0) { + pr_debug("bpf: failed to register libbpf section handler: %d\n", + libbpf_sec_handler); + return -BPF_LOADER_ERRNO__INTERNAL; + } libbpf_initialized = true; return 0; } @@ -188,14 +260,31 @@ struct bpf_object *bpf__prepare_load(const char *filename, bool source) return obj; } +static void close_prologue_programs(struct bpf_prog_priv *priv) +{ + struct perf_probe_event *pev; + int i, fd; + + if (!priv->need_prologue) + return; + pev = &priv->pev; + for (i = 0; i < pev->ntevs; i++) { + fd = priv->prologue_fds[i]; + if (fd != -1) + close(fd); + } +} + static void clear_prog_priv(const struct bpf_program *prog __maybe_unused, void *_priv) { struct bpf_prog_priv *priv = _priv; + close_prologue_programs(priv); cleanup_perf_probe_events(&priv->pev, 1); zfree(&priv->insns_buf); + zfree(&priv->prologue_fds); zfree(&priv->type_mapping); zfree(&priv->sys_name); zfree(&priv->evt_name); @@ -243,17 +332,6 @@ static bool ptr_equal(const void *key1, const void *key2, return key1 == key2; } -static void *program_priv(const struct bpf_program *prog) -{ - void *priv; - - if (IS_ERR_OR_NULL(bpf_program_hash)) - return NULL; - if (!hashmap__find(bpf_program_hash, prog, &priv)) - return NULL; - return priv; -} - static int program_set_priv(struct bpf_program *prog, void *priv) { void *old_priv; @@ -558,8 +636,8 @@ static int bpf__prepare_probe(void) static int preproc_gen_prologue(struct bpf_program *prog, int n, - struct bpf_insn *orig_insns, int orig_insns_cnt, - struct bpf_prog_prep_result *res) + const struct bpf_insn *orig_insns, int orig_insns_cnt, + struct bpf_preproc_result *res) { struct bpf_prog_priv *priv = program_priv(prog); struct probe_trace_event *tev; @@ -607,7 +685,6 @@ preproc_gen_prologue(struct bpf_program *prog, int n, res->new_insn_ptr = buf; res->new_insn_cnt = prologue_cnt + orig_insns_cnt; - res->pfd = NULL; return 0; errout: @@ -715,7 +792,7 @@ static int hook_load_preprocessor(struct bpf_program *prog) struct bpf_prog_priv *priv = program_priv(prog); struct perf_probe_event *pev; bool need_prologue = false; - int err, i; + int i; if (IS_ERR_OR_NULL(priv)) { pr_debug("Internal error when hook preprocessor\n"); @@ -753,6 +830,13 @@ static int hook_load_preprocessor(struct bpf_program *prog) return -ENOMEM; } + priv->prologue_fds = malloc(sizeof(int) * pev->ntevs); + if (!priv->prologue_fds) { + pr_debug("Not enough memory: alloc prologue fds failed\n"); + return -ENOMEM; + } + memset(priv->prologue_fds, -1, sizeof(int) * pev->ntevs); + priv->type_mapping = malloc(sizeof(int) * pev->ntevs); if (!priv->type_mapping) { pr_debug("Not enough memory: alloc type_mapping failed\n"); @@ -761,13 +845,7 @@ static int hook_load_preprocessor(struct bpf_program *prog) memset(priv->type_mapping, -1, sizeof(int) * pev->ntevs); - err = map_prologue(pev, priv->type_mapping, &priv->nr_types); - if (err) - return err; - - err = bpf_program__set_prep(prog, priv->nr_types, - preproc_gen_prologue); - return err; + return map_prologue(pev, priv->type_mapping, &priv->nr_types); } int bpf__probe(struct bpf_object *obj) @@ -874,6 +952,77 @@ int bpf__unprobe(struct bpf_object *obj) return ret; } +static int bpf_object__load_prologue(struct bpf_object *obj) +{ + int init_cnt = ARRAY_SIZE(prologue_init_insn); + const struct bpf_insn *orig_insns; + struct bpf_preproc_result res; + struct perf_probe_event *pev; + struct bpf_program *prog; + int orig_insns_cnt; + + bpf_object__for_each_program(prog, obj) { + struct bpf_prog_priv *priv = program_priv(prog); + int err, i, fd; + + if (IS_ERR_OR_NULL(priv)) { + pr_debug("bpf: failed to get private field\n"); + return -BPF_LOADER_ERRNO__INTERNAL; + } + + if (!priv->need_prologue) + continue; + + /* + * For each program that needs prologue we do following: + * + * - take its current instructions and use them + * to generate the new code with prologue + * - load new instructions with bpf_prog_load + * and keep the fd in prologue_fds + * - new fd will be used in bpf__foreach_event + * to connect this program with perf evsel + */ + orig_insns = bpf_program__insns(prog); + orig_insns_cnt = bpf_program__insn_cnt(prog); + + pev = &priv->pev; + for (i = 0; i < pev->ntevs; i++) { + /* + * Skipping artificall prologue_init_insn instructions + * (init_cnt), so the prologue can be generated instead + * of them. + */ + err = preproc_gen_prologue(prog, i, + orig_insns + init_cnt, + orig_insns_cnt - init_cnt, + &res); + if (err) + return err; + + fd = bpf_prog_load(bpf_program__get_type(prog), + bpf_program__name(prog), "GPL", + res.new_insn_ptr, + res.new_insn_cnt, NULL); + if (fd < 0) { + char bf[128]; + + libbpf_strerror(-errno, bf, sizeof(bf)); + pr_debug("bpf: load objects with prologue failed: err=%d: (%s)\n", + -errno, bf); + return -errno; + } + priv->prologue_fds[i] = fd; + } + /* + * We no longer need the original program, + * we can unload it. + */ + bpf_program__unload(prog); + } + return 0; +} + int bpf__load(struct bpf_object *obj) { int err; @@ -885,7 +1034,7 @@ int bpf__load(struct bpf_object *obj) pr_debug("bpf: load objects failed: err=%d: (%s)\n", err, bf); return err; } - return 0; + return bpf_object__load_prologue(obj); } int bpf__foreach_event(struct bpf_object *obj, @@ -920,13 +1069,10 @@ int bpf__foreach_event(struct bpf_object *obj, for (i = 0; i < pev->ntevs; i++) { tev = &pev->tevs[i]; - if (priv->need_prologue) { - int type = priv->type_mapping[i]; - - fd = bpf_program__nth_fd(prog, type); - } else { + if (priv->need_prologue) + fd = priv->prologue_fds[i]; + else fd = bpf_program__fd(prog); - } if (fd < 0) { pr_debug("bpf: failed to get file descriptor\n"); From 2f6d1e0f8ff3cf66b1eef8ed36f6c64d2e679284 Mon Sep 17 00:00:00 2001 From: Shahab Vahedi Date: Fri, 24 Jun 2022 14:14:11 +0000 Subject: [PATCH 27/94] bpf, docs: Fix the code formatting in instruction-set A minor typo fix to include "| BPF_LD" into its previous code phrase: ``BPF_IND`` | BPF_LD --> ``BPF_IND | BPF_LD`` Signed-off-by: Shahab Vahedi Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/b6120b31-3d1d-bf2d-2f2a-aa768d91257b@synopsys.com --- Documentation/bpf/instruction-set.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst index 9e27fbdb2206..1b0e6711dec9 100644 --- a/Documentation/bpf/instruction-set.rst +++ b/Documentation/bpf/instruction-set.rst @@ -351,7 +351,7 @@ These instructions have seven implicit operands: * Register R0 is an implicit output which contains the data fetched from the packet. * Registers R1-R5 are scratch registers that are clobbered after a call to - ``BPF_ABS | BPF_LD`` or ``BPF_IND`` | BPF_LD instructions. + ``BPF_ABS | BPF_LD`` or ``BPF_IND | BPF_LD`` instructions. These instructions have an implicit program exit condition as well. When an eBPF program is trying to access the data beyond the packet boundary, the From fd75733da2f376c0c8c6513c3cb2ac227082ec5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20M=C3=BCller?= Date: Thu, 23 Jun 2022 18:29:34 +0000 Subject: [PATCH 28/94] bpf: Merge "types_are_compat" logic into relo_core.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BPF type compatibility checks (bpf_core_types_are_compat()) are currently duplicated between kernel and user space. That's a historical artifact more than intentional doing and can lead to subtle bugs where one implementation is adjusted but another is forgotten. That happened with the enum64 work, for example, where the libbpf side was changed (commit 23b2a3a8f63a ("libbpf: Add enum64 relocation support")) to use the btf_kind_core_compat() helper function but the kernel side was not (commit 6089fb325cf7 ("bpf: Add btf enum64 support")). This patch addresses both the duplication issue, by merging both implementations and moving them into relo_core.c, and fixes the alluded to kind check (by giving preference to libbpf's already adjusted logic). For discussion of the topic, please refer to: https://lore.kernel.org/bpf/CAADnVQKbWR7oarBdewgOBZUPzryhRYvEbkhyPJQHHuxq=0K1gw@mail.gmail.com/T/#mcc99f4a33ad9a322afaf1b9276fb1f0b7add9665 Changelog: v1 -> v2: - limited libbpf recursion limit to 32 - changed name to __bpf_core_types_are_compat - included warning previously present in libbpf version - merged kernel and user space changes into a single patch Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220623182934.2582827-1-deso@posteo.net --- kernel/bpf/btf.c | 84 +-------------------------------------- tools/lib/bpf/libbpf.c | 72 +-------------------------------- tools/lib/bpf/relo_core.c | 80 +++++++++++++++++++++++++++++++++++++ tools/lib/bpf/relo_core.h | 2 + 4 files changed, 84 insertions(+), 154 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f08037c31dd7..2e2066d6af94 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7416,87 +7416,6 @@ EXPORT_SYMBOL_GPL(register_btf_id_dtor_kfuncs); #define MAX_TYPES_ARE_COMPAT_DEPTH 2 -static -int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, - const struct btf *targ_btf, __u32 targ_id, - int level) -{ - const struct btf_type *local_type, *targ_type; - int depth = 32; /* max recursion depth */ - - /* caller made sure that names match (ignoring flavor suffix) */ - local_type = btf_type_by_id(local_btf, local_id); - targ_type = btf_type_by_id(targ_btf, targ_id); - if (btf_kind(local_type) != btf_kind(targ_type)) - return 0; - -recur: - depth--; - if (depth < 0) - return -EINVAL; - - local_type = btf_type_skip_modifiers(local_btf, local_id, &local_id); - targ_type = btf_type_skip_modifiers(targ_btf, targ_id, &targ_id); - if (!local_type || !targ_type) - return -EINVAL; - - if (btf_kind(local_type) != btf_kind(targ_type)) - return 0; - - switch (btf_kind(local_type)) { - case BTF_KIND_UNKN: - case BTF_KIND_STRUCT: - case BTF_KIND_UNION: - case BTF_KIND_ENUM: - case BTF_KIND_FWD: - case BTF_KIND_ENUM64: - return 1; - case BTF_KIND_INT: - /* just reject deprecated bitfield-like integers; all other - * integers are by default compatible between each other - */ - return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0; - case BTF_KIND_PTR: - local_id = local_type->type; - targ_id = targ_type->type; - goto recur; - case BTF_KIND_ARRAY: - local_id = btf_array(local_type)->type; - targ_id = btf_array(targ_type)->type; - goto recur; - case BTF_KIND_FUNC_PROTO: { - struct btf_param *local_p = btf_params(local_type); - struct btf_param *targ_p = btf_params(targ_type); - __u16 local_vlen = btf_vlen(local_type); - __u16 targ_vlen = btf_vlen(targ_type); - int i, err; - - if (local_vlen != targ_vlen) - return 0; - - for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { - if (level <= 0) - return -EINVAL; - - btf_type_skip_modifiers(local_btf, local_p->type, &local_id); - btf_type_skip_modifiers(targ_btf, targ_p->type, &targ_id); - err = __bpf_core_types_are_compat(local_btf, local_id, - targ_btf, targ_id, - level - 1); - if (err <= 0) - return err; - } - - /* tail recurse for return type check */ - btf_type_skip_modifiers(local_btf, local_type->type, &local_id); - btf_type_skip_modifiers(targ_btf, targ_type->type, &targ_id); - goto recur; - } - default: - return 0; - } -} - /* Check local and target types for compatibility. This check is used for * type-based CO-RE relocations and follow slightly different rules than * field-based relocations. This function assumes that root types were already @@ -7519,8 +7438,7 @@ recur: int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id) { - return __bpf_core_types_are_compat(local_btf, local_id, - targ_btf, targ_id, + return __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, MAX_TYPES_ARE_COMPAT_DEPTH); } diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 49e359cd34df..335467ece75f 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5732,77 +5732,7 @@ err_out: int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id) { - const struct btf_type *local_type, *targ_type; - int depth = 32; /* max recursion depth */ - - /* caller made sure that names match (ignoring flavor suffix) */ - local_type = btf__type_by_id(local_btf, local_id); - targ_type = btf__type_by_id(targ_btf, targ_id); - if (!btf_kind_core_compat(local_type, targ_type)) - return 0; - -recur: - depth--; - if (depth < 0) - return -EINVAL; - - local_type = skip_mods_and_typedefs(local_btf, local_id, &local_id); - targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); - if (!local_type || !targ_type) - return -EINVAL; - - if (!btf_kind_core_compat(local_type, targ_type)) - return 0; - - switch (btf_kind(local_type)) { - case BTF_KIND_UNKN: - case BTF_KIND_STRUCT: - case BTF_KIND_UNION: - case BTF_KIND_ENUM: - case BTF_KIND_ENUM64: - case BTF_KIND_FWD: - return 1; - case BTF_KIND_INT: - /* just reject deprecated bitfield-like integers; all other - * integers are by default compatible between each other - */ - return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0; - case BTF_KIND_PTR: - local_id = local_type->type; - targ_id = targ_type->type; - goto recur; - case BTF_KIND_ARRAY: - local_id = btf_array(local_type)->type; - targ_id = btf_array(targ_type)->type; - goto recur; - case BTF_KIND_FUNC_PROTO: { - struct btf_param *local_p = btf_params(local_type); - struct btf_param *targ_p = btf_params(targ_type); - __u16 local_vlen = btf_vlen(local_type); - __u16 targ_vlen = btf_vlen(targ_type); - int i, err; - - if (local_vlen != targ_vlen) - return 0; - - for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { - skip_mods_and_typedefs(local_btf, local_p->type, &local_id); - skip_mods_and_typedefs(targ_btf, targ_p->type, &targ_id); - err = bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id); - if (err <= 0) - return err; - } - - /* tail recurse for return type check */ - skip_mods_and_typedefs(local_btf, local_type->type, &local_id); - skip_mods_and_typedefs(targ_btf, targ_type->type, &targ_id); - goto recur; - } - default: - pr_warn("unexpected kind %s relocated, local [%d], target [%d]\n", - btf_kind_str(local_type), local_id, targ_id); - return 0; - } + return __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, 32); } static size_t bpf_core_hash_fn(const void *key, void *ctx) diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c index 6ad3c3891a9a..e070123332cd 100644 --- a/tools/lib/bpf/relo_core.c +++ b/tools/lib/bpf/relo_core.c @@ -141,6 +141,86 @@ static bool core_relo_is_enumval_based(enum bpf_core_relo_kind kind) } } +int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id, int level) +{ + const struct btf_type *local_type, *targ_type; + int depth = 32; /* max recursion depth */ + + /* caller made sure that names match (ignoring flavor suffix) */ + local_type = btf_type_by_id(local_btf, local_id); + targ_type = btf_type_by_id(targ_btf, targ_id); + if (!btf_kind_core_compat(local_type, targ_type)) + return 0; + +recur: + depth--; + if (depth < 0) + return -EINVAL; + + local_type = skip_mods_and_typedefs(local_btf, local_id, &local_id); + targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); + if (!local_type || !targ_type) + return -EINVAL; + + if (!btf_kind_core_compat(local_type, targ_type)) + return 0; + + switch (btf_kind(local_type)) { + case BTF_KIND_UNKN: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + case BTF_KIND_FWD: + case BTF_KIND_ENUM64: + return 1; + case BTF_KIND_INT: + /* just reject deprecated bitfield-like integers; all other + * integers are by default compatible between each other + */ + return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0; + case BTF_KIND_PTR: + local_id = local_type->type; + targ_id = targ_type->type; + goto recur; + case BTF_KIND_ARRAY: + local_id = btf_array(local_type)->type; + targ_id = btf_array(targ_type)->type; + goto recur; + case BTF_KIND_FUNC_PROTO: { + struct btf_param *local_p = btf_params(local_type); + struct btf_param *targ_p = btf_params(targ_type); + __u16 local_vlen = btf_vlen(local_type); + __u16 targ_vlen = btf_vlen(targ_type); + int i, err; + + if (local_vlen != targ_vlen) + return 0; + + for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { + if (level <= 0) + return -EINVAL; + + skip_mods_and_typedefs(local_btf, local_p->type, &local_id); + skip_mods_and_typedefs(targ_btf, targ_p->type, &targ_id); + err = __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, + level - 1); + if (err <= 0) + return err; + } + + /* tail recurse for return type check */ + skip_mods_and_typedefs(local_btf, local_type->type, &local_id); + skip_mods_and_typedefs(targ_btf, targ_type->type, &targ_id); + goto recur; + } + default: + pr_warn("unexpected kind %s relocated, local [%d], target [%d]\n", + btf_kind_str(local_type), local_id, targ_id); + return 0; + } +} + /* * Turn bpf_core_relo into a low- and high-level spec representation, * validating correctness along the way, as well as calculating resulting diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h index 7df0da082f2c..3fd3842d4230 100644 --- a/tools/lib/bpf/relo_core.h +++ b/tools/lib/bpf/relo_core.h @@ -68,6 +68,8 @@ struct bpf_core_relo_res { __u32 new_type_id; }; +int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id, int level); int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id); From 697fb80a53642be624f5121b6ca9d66769c180e0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 27 Jun 2022 20:58:03 -0700 Subject: [PATCH 29/94] bpf: Fix sockmap calling sleepable function in teardown path syzbot reproduced the bug ... BUG: sleeping function called from invalid context at kernel/workqueue.c:3010 ... with the following stack trace fragment ... start_flush_work kernel/workqueue.c:3010 [inline] __flush_work+0x109/0xb10 kernel/workqueue.c:3074 __cancel_work_timer+0x3f9/0x570 kernel/workqueue.c:3162 sk_psock_stop+0x4cb/0x630 net/core/skmsg.c:802 sock_map_destroy+0x333/0x760 net/core/sock_map.c:1581 inet_csk_destroy_sock+0x196/0x440 net/ipv4/inet_connection_sock.c:1130 __tcp_close+0xd5b/0x12b0 net/ipv4/tcp.c:2897 tcp_close+0x29/0xc0 net/ipv4/tcp.c:2909 ... introduced by d8616ee2affc. Do a quick trace of the code path and the bug is obvious: inet_csk_destroy_sock(sk) sk_prot->destroy(sk); <--- sock_map_destroy sk_psock_stop(, true); <--- true so cancel workqueue cancel_work_sync() <--- splat, because *_bh_disable() We can not call cancel_work_sync() from inside destroy path. So mark the sk_psock_stop call to skip this cancel_work_sync(). This will avoid the BUG, but means we may run sk_psock_backlog after or during the destroy op. We zapped the ingress_skb queue in sk_psock_stop (safe to do with local_bh_disable) so its empty and the sk_psock_backlog work item will not find any pkts to process here. However, because we are not going to wait for it or clear its ->state its possible it kicks off or is already running. This should be 'safe' up until psock drops its refcnt to psock->sk. The sock_put() that drops this reference is only done at psock destroy time from sk_psock_destroy(). This is done through workqueue when sk_psock_drop() is called on psock refnt reaches 0. And importantly sk_psock_destroy() does a cancel_work_sync(). So trivial fix works. I've had hit or miss luck reproducing this caught it once or twice with the provided reproducer when running with many runners. However, syzkaller is very good at reproducing so relying on syzkaller to verify fix. Fixes: d8616ee2affc ("bpf, sockmap: Fix sk->sk_forward_alloc warn_on in sk_stream_kill_queues") Reported-by: syzbot+140186ceba0c496183bc@syzkaller.appspotmail.com Suggested-by: Hillf Danton Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Cc: Wang Yufen Link: https://lore.kernel.org/bpf/20220628035803.317876-1-john.fastabend@gmail.com --- net/core/sock_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 9f08ccfaf6da..028813dfecb0 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1578,7 +1578,7 @@ void sock_map_destroy(struct sock *sk) saved_destroy = psock->saved_destroy; sock_map_remove_links(sk, psock); rcu_read_unlock(); - sk_psock_stop(psock, true); + sk_psock_stop(psock, false); sk_psock_put(sk, psock); saved_destroy(sk); } From f36600634282a519e1b0abea609acdc8731515d7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:13 -0700 Subject: [PATCH 30/94] libbpf: move xsk.{c,h} into selftests/bpf Remove deprecated xsk APIs from libbpf. But given we have selftests relying on this, move those files (with minimal adjustments to make them compilable) under selftests/bpf. We also remove all the removed APIs from libbpf.map, while overall keeping version inheritance chain, as most APIs are backwards compatible so there is no need to reassign them as LIBBPF_1.0.0 versions. Cc: Magnus Karlsson Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/Build | 2 +- tools/lib/bpf/Makefile | 2 +- tools/lib/bpf/libbpf.map | 12 ---- tools/testing/selftests/bpf/Makefile | 2 + tools/testing/selftests/bpf/xdpxceiver.c | 2 +- tools/{lib => testing/selftests}/bpf/xsk.c | 76 ++++++++++++---------- tools/{lib => testing/selftests}/bpf/xsk.h | 29 ++------- 7 files changed, 49 insertions(+), 76 deletions(-) rename tools/{lib => testing/selftests}/bpf/xsk.c (95%) rename tools/{lib => testing/selftests}/bpf/xsk.h (84%) diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index 31a1a9015902..5a3dfb56d78f 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1,4 +1,4 @@ libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \ - netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o hashmap.o \ + netlink.o bpf_prog_linfo.o libbpf_probes.o hashmap.o \ btf_dump.o ringbuf.o strset.o linker.o gen_loader.o relo_core.o \ usdt.o diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index a1265b152027..4c904ef0b47e 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -237,7 +237,7 @@ install_lib: all_cmd $(call do_install_mkdir,$(libdir_SQ)); \ cp -fpR $(LIB_FILE) $(DESTDIR)$(libdir_SQ) -SRC_HDRS := bpf.h libbpf.h btf.h libbpf_common.h libbpf_legacy.h xsk.h \ +SRC_HDRS := bpf.h libbpf.h btf.h libbpf_common.h libbpf_legacy.h \ bpf_helpers.h bpf_tracing.h bpf_endian.h bpf_core_read.h \ skel_internal.h libbpf_version.h usdt.bpf.h GEN_HDRS := $(BPF_GENERATED) diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 116a2a8ee7c2..da7a4f928452 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -147,12 +147,6 @@ LIBBPF_0.0.2 { btf_ext__new; btf_ext__reloc_func_info; btf_ext__reloc_line_info; - xsk_umem__create; - xsk_socket__create; - xsk_umem__delete; - xsk_socket__delete; - xsk_umem__fd; - xsk_socket__fd; bpf_program__get_prog_info_linear; bpf_program__bpil_addr_to_offs; bpf_program__bpil_offs_to_addr; @@ -183,7 +177,6 @@ LIBBPF_0.0.4 { perf_buffer__new; perf_buffer__new_raw; perf_buffer__poll; - xsk_umem__create; } LIBBPF_0.0.3; LIBBPF_0.0.5 { @@ -336,7 +329,6 @@ LIBBPF_0.2.0 { perf_buffer__buffer_fd; perf_buffer__epoll_fd; perf_buffer__consume_buffer; - xsk_socket__create_shared; } LIBBPF_0.1.0; LIBBPF_0.3.0 { @@ -348,8 +340,6 @@ LIBBPF_0.3.0 { btf__new_empty_split; btf__new_split; ring_buffer__epoll_fd; - xsk_setup_xdp_prog; - xsk_socket__update_xskmap; } LIBBPF_0.2.0; LIBBPF_0.4.0 { @@ -468,6 +458,4 @@ LIBBPF_1.0.0 { libbpf_bpf_link_type_str; libbpf_bpf_map_type_str; libbpf_bpf_prog_type_str; - - local: *; }; diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 4fbd88a8ed9e..e32a28fe8bc1 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -230,6 +230,8 @@ $(OUTPUT)/xdping: $(TESTING_HELPERS) $(OUTPUT)/flow_dissector_load: $(TESTING_HELPERS) $(OUTPUT)/test_maps: $(TESTING_HELPERS) $(OUTPUT)/test_verifier: $(TESTING_HELPERS) $(CAP_HELPERS) +$(OUTPUT)/xsk.o: $(BPFOBJ) +$(OUTPUT)/xdpxceiver: $(OUTPUT)/xsk.o BPFTOOL ?= $(DEFAULT_BPFTOOL) $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index e5992a6b5e09..019c567b6b4e 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -97,7 +97,7 @@ #include #include #include -#include +#include "xsk.h" #include "xdpxceiver.h" #include "../kselftest.h" diff --git a/tools/lib/bpf/xsk.c b/tools/testing/selftests/bpf/xsk.c similarity index 95% rename from tools/lib/bpf/xsk.c rename to tools/testing/selftests/bpf/xsk.c index af136f73b09d..eb50c3f336f8 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/testing/selftests/bpf/xsk.c @@ -30,16 +30,10 @@ #include #include -#include "bpf.h" -#include "libbpf.h" -#include "libbpf_internal.h" +#include +#include #include "xsk.h" -/* entire xsk.h and xsk.c is going away in libbpf 1.0, so ignore all internal - * uses of deprecated APIs - */ -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - #ifndef SOL_XDP #define SOL_XDP 283 #endif @@ -52,6 +46,8 @@ #define PF_XDP AF_XDP #endif +#define pr_warn(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__) + enum xsk_prog { XSK_PROG_FALLBACK, XSK_PROG_REDIRECT_FLAGS, @@ -286,11 +282,10 @@ out_mmap: return err; } -DEFAULT_VERSION(xsk_umem__create_v0_0_4, xsk_umem__create, LIBBPF_0.0.4) -int xsk_umem__create_v0_0_4(struct xsk_umem **umem_ptr, void *umem_area, - __u64 size, struct xsk_ring_prod *fill, - struct xsk_ring_cons *comp, - const struct xsk_umem_config *usr_config) +int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, + __u64 size, struct xsk_ring_prod *fill, + struct xsk_ring_cons *comp, + const struct xsk_umem_config *usr_config) { struct xdp_umem_reg mr; struct xsk_umem *umem; @@ -351,25 +346,9 @@ struct xsk_umem_config_v1 { __u32 frame_headroom; }; -COMPAT_VERSION(xsk_umem__create_v0_0_2, xsk_umem__create, LIBBPF_0.0.2) -int xsk_umem__create_v0_0_2(struct xsk_umem **umem_ptr, void *umem_area, - __u64 size, struct xsk_ring_prod *fill, - struct xsk_ring_cons *comp, - const struct xsk_umem_config *usr_config) -{ - struct xsk_umem_config config; - - memcpy(&config, usr_config, sizeof(struct xsk_umem_config_v1)); - config.flags = 0; - - return xsk_umem__create_v0_0_4(umem_ptr, umem_area, size, fill, comp, - &config); -} - static enum xsk_prog get_xsk_prog(void) { enum xsk_prog detected = XSK_PROG_FALLBACK; - __u32 size_out, retval, duration; char data_in = 0, data_out; struct bpf_insn insns[] = { BPF_LD_MAP_FD(BPF_REG_1, 0), @@ -378,6 +357,12 @@ static enum xsk_prog get_xsk_prog(void) BPF_EMIT_CALL(BPF_FUNC_redirect_map), BPF_EXIT_INSN(), }; + LIBBPF_OPTS(bpf_test_run_opts, opts, + .data_in = &data_in, + .data_size_in = 1, + .data_out = &data_out, + ); + int prog_fd, map_fd, ret, insn_cnt = ARRAY_SIZE(insns); map_fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, NULL, sizeof(int), sizeof(int), 1, NULL); @@ -392,8 +377,8 @@ static enum xsk_prog get_xsk_prog(void) return detected; } - ret = bpf_prog_test_run(prog_fd, 0, &data_in, 1, &data_out, &size_out, &retval, &duration); - if (!ret && retval == XDP_PASS) + ret = bpf_prog_test_run_opts(prog_fd, &opts); + if (!ret && opts.retval == XDP_PASS) detected = XSK_PROG_REDIRECT_FLAGS; close(prog_fd); close(map_fd); @@ -510,7 +495,7 @@ static int xsk_create_bpf_link(struct xsk_socket *xsk) int link_fd; int err; - err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id, xsk->config.xdp_flags); + err = bpf_xdp_query_id(ctx->ifindex, xsk->config.xdp_flags, &prog_id); if (err) { pr_warn("getting XDP prog id failed\n"); return err; @@ -536,6 +521,25 @@ static int xsk_create_bpf_link(struct xsk_socket *xsk) return 0; } +/* Copy up to sz - 1 bytes from zero-terminated src string and ensure that dst + * is zero-terminated string no matter what (unless sz == 0, in which case + * it's a no-op). It's conceptually close to FreeBSD's strlcpy(), but differs + * in what is returned. Given this is internal helper, it's trivial to extend + * this, when necessary. Use this instead of strncpy inside libbpf source code. + */ +static inline void libbpf_strlcpy(char *dst, const char *src, size_t sz) +{ + size_t i; + + if (sz == 0) + return; + + sz--; + for (i = 0; i < sz && src[i]; i++) + dst[i] = src[i]; + dst[i] = '\0'; +} + static int xsk_get_max_queues(struct xsk_socket *xsk) { struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; @@ -792,8 +796,8 @@ static int xsk_init_xdp_res(struct xsk_socket *xsk, if (ctx->has_bpf_link) err = xsk_create_bpf_link(xsk); else - err = bpf_set_link_xdp_fd(xsk->ctx->ifindex, ctx->prog_fd, - xsk->config.xdp_flags); + err = bpf_xdp_attach(xsk->ctx->ifindex, ctx->prog_fd, + xsk->config.xdp_flags, NULL); if (err) goto err_attach_xdp_prog; @@ -811,7 +815,7 @@ err_set_bpf_maps: if (ctx->has_bpf_link) close(ctx->link_fd); else - bpf_set_link_xdp_fd(ctx->ifindex, -1, 0); + bpf_xdp_detach(ctx->ifindex, 0, NULL); err_attach_xdp_prog: close(ctx->prog_fd); err_load_xdp_prog: @@ -862,7 +866,7 @@ static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, int *xsks_map_fd) if (ctx->has_bpf_link) err = xsk_link_lookup(ctx->ifindex, &prog_id, &ctx->link_fd); else - err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id, xsk->config.xdp_flags); + err = bpf_xdp_query_id(ctx->ifindex, xsk->config.xdp_flags, &prog_id); if (err) return err; diff --git a/tools/lib/bpf/xsk.h b/tools/testing/selftests/bpf/xsk.h similarity index 84% rename from tools/lib/bpf/xsk.h rename to tools/testing/selftests/bpf/xsk.h index 64e9c57fd792..915e7135337c 100644 --- a/tools/lib/bpf/xsk.h +++ b/tools/testing/selftests/bpf/xsk.h @@ -9,15 +9,15 @@ * Author(s): Magnus Karlsson */ -#ifndef __LIBBPF_XSK_H -#define __LIBBPF_XSK_H +#ifndef __XSK_H +#define __XSK_H #include #include #include #include -#include "libbpf.h" +#include #ifdef __cplusplus extern "C" { @@ -251,9 +251,7 @@ static inline __u64 xsk_umem__add_offset_to_addr(__u64 addr) return xsk_umem__extract_addr(addr) + xsk_umem__extract_offset(addr); } -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "AF_XDP support deprecated and moved to libxdp") int xsk_umem__fd(const struct xsk_umem *umem); -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "AF_XDP support deprecated and moved to libxdp") int xsk_socket__fd(const struct xsk_socket *xsk); #define XSK_RING_CONS__DEFAULT_NUM_DESCS 2048 @@ -271,9 +269,7 @@ struct xsk_umem_config { __u32 flags; }; -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "AF_XDP support deprecated and moved to libxdp") int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd); -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "AF_XDP support deprecated and moved to libxdp") int xsk_socket__update_xskmap(struct xsk_socket *xsk, int xsks_map_fd); /* Flags for the libbpf_flags field. */ @@ -288,32 +284,17 @@ struct xsk_socket_config { }; /* Set config to NULL to get the default configuration. */ -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "AF_XDP support deprecated and moved to libxdp") int xsk_umem__create(struct xsk_umem **umem, void *umem_area, __u64 size, struct xsk_ring_prod *fill, struct xsk_ring_cons *comp, const struct xsk_umem_config *config); -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "AF_XDP support deprecated and moved to libxdp") -int xsk_umem__create_v0_0_2(struct xsk_umem **umem, - void *umem_area, __u64 size, - struct xsk_ring_prod *fill, - struct xsk_ring_cons *comp, - const struct xsk_umem_config *config); -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "AF_XDP support deprecated and moved to libxdp") -int xsk_umem__create_v0_0_4(struct xsk_umem **umem, - void *umem_area, __u64 size, - struct xsk_ring_prod *fill, - struct xsk_ring_cons *comp, - const struct xsk_umem_config *config); -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "AF_XDP support deprecated and moved to libxdp") int xsk_socket__create(struct xsk_socket **xsk, const char *ifname, __u32 queue_id, struct xsk_umem *umem, struct xsk_ring_cons *rx, struct xsk_ring_prod *tx, const struct xsk_socket_config *config); -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "AF_XDP support deprecated and moved to libxdp") int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, const char *ifname, __u32 queue_id, struct xsk_umem *umem, @@ -324,13 +305,11 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, const struct xsk_socket_config *config); /* Returns 0 for success and -EBUSY if the umem is still in use. */ -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "AF_XDP support deprecated and moved to libxdp") int xsk_umem__delete(struct xsk_umem *umem); -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "AF_XDP support deprecated and moved to libxdp") void xsk_socket__delete(struct xsk_socket *xsk); #ifdef __cplusplus } /* extern "C" */ #endif -#endif /* __LIBBPF_XSK_H */ +#endif /* __XSK_H */ From 765a34130ea506fafefe179bdfa47da296016d92 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:14 -0700 Subject: [PATCH 31/94] libbpf: remove deprecated low-level APIs Drop low-level APIs as well as high-level (and very confusingly named) BPF object loading bpf_prog_load_xattr() and bpf_prog_load_deprecated() APIs. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.c | 174 +-------------------------------------- tools/lib/bpf/bpf.h | 83 ------------------- tools/lib/bpf/libbpf.c | 89 -------------------- tools/lib/bpf/libbpf.h | 16 ---- tools/lib/bpf/libbpf.map | 16 ---- 5 files changed, 4 insertions(+), 374 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 240186aac8e6..2f3495d80d69 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -233,11 +233,10 @@ alloc_zero_tailing_info(const void *orecord, __u32 cnt, return info; } -DEFAULT_VERSION(bpf_prog_load_v0_6_0, bpf_prog_load, LIBBPF_0.6.0) -int bpf_prog_load_v0_6_0(enum bpf_prog_type prog_type, - const char *prog_name, const char *license, - const struct bpf_insn *insns, size_t insn_cnt, - const struct bpf_prog_load_opts *opts) +int bpf_prog_load(enum bpf_prog_type prog_type, + const char *prog_name, const char *license, + const struct bpf_insn *insns, size_t insn_cnt, + const struct bpf_prog_load_opts *opts) { void *finfo = NULL, *linfo = NULL; const char *func_info, *line_info; @@ -384,94 +383,6 @@ done: return libbpf_err_errno(fd); } -__attribute__((alias("bpf_load_program_xattr2"))) -int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, - char *log_buf, size_t log_buf_sz); - -static int bpf_load_program_xattr2(const struct bpf_load_program_attr *load_attr, - char *log_buf, size_t log_buf_sz) -{ - LIBBPF_OPTS(bpf_prog_load_opts, p); - - if (!load_attr || !log_buf != !log_buf_sz) - return libbpf_err(-EINVAL); - - p.expected_attach_type = load_attr->expected_attach_type; - switch (load_attr->prog_type) { - case BPF_PROG_TYPE_STRUCT_OPS: - case BPF_PROG_TYPE_LSM: - p.attach_btf_id = load_attr->attach_btf_id; - break; - case BPF_PROG_TYPE_TRACING: - case BPF_PROG_TYPE_EXT: - p.attach_btf_id = load_attr->attach_btf_id; - p.attach_prog_fd = load_attr->attach_prog_fd; - break; - default: - p.prog_ifindex = load_attr->prog_ifindex; - p.kern_version = load_attr->kern_version; - } - p.log_level = load_attr->log_level; - p.log_buf = log_buf; - p.log_size = log_buf_sz; - p.prog_btf_fd = load_attr->prog_btf_fd; - p.func_info_rec_size = load_attr->func_info_rec_size; - p.func_info_cnt = load_attr->func_info_cnt; - p.func_info = load_attr->func_info; - p.line_info_rec_size = load_attr->line_info_rec_size; - p.line_info_cnt = load_attr->line_info_cnt; - p.line_info = load_attr->line_info; - p.prog_flags = load_attr->prog_flags; - - return bpf_prog_load(load_attr->prog_type, load_attr->name, load_attr->license, - load_attr->insns, load_attr->insns_cnt, &p); -} - -int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, - size_t insns_cnt, const char *license, - __u32 kern_version, char *log_buf, - size_t log_buf_sz) -{ - struct bpf_load_program_attr load_attr; - - memset(&load_attr, 0, sizeof(struct bpf_load_program_attr)); - load_attr.prog_type = type; - load_attr.expected_attach_type = 0; - load_attr.name = NULL; - load_attr.insns = insns; - load_attr.insns_cnt = insns_cnt; - load_attr.license = license; - load_attr.kern_version = kern_version; - - return bpf_load_program_xattr2(&load_attr, log_buf, log_buf_sz); -} - -int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns, - size_t insns_cnt, __u32 prog_flags, const char *license, - __u32 kern_version, char *log_buf, size_t log_buf_sz, - int log_level) -{ - union bpf_attr attr; - int fd; - - bump_rlimit_memlock(); - - memset(&attr, 0, sizeof(attr)); - attr.prog_type = type; - attr.insn_cnt = (__u32)insns_cnt; - attr.insns = ptr_to_u64(insns); - attr.license = ptr_to_u64(license); - attr.log_buf = ptr_to_u64(log_buf); - attr.log_size = log_buf_sz; - attr.log_level = log_level; - log_buf[0] = 0; - attr.kern_version = kern_version; - attr.prog_flags = prog_flags; - - fd = sys_bpf_prog_load(&attr, sizeof(attr), PROG_LOAD_ATTEMPTS); - return libbpf_err_errno(fd); -} - int bpf_map_update_elem(int fd, const void *key, const void *value, __u64 flags) { @@ -910,62 +821,6 @@ int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, return libbpf_err_errno(ret); } -int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, - void *data_out, __u32 *size_out, __u32 *retval, - __u32 *duration) -{ - union bpf_attr attr; - int ret; - - memset(&attr, 0, sizeof(attr)); - attr.test.prog_fd = prog_fd; - attr.test.data_in = ptr_to_u64(data); - attr.test.data_out = ptr_to_u64(data_out); - attr.test.data_size_in = size; - attr.test.repeat = repeat; - - ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, sizeof(attr)); - - if (size_out) - *size_out = attr.test.data_size_out; - if (retval) - *retval = attr.test.retval; - if (duration) - *duration = attr.test.duration; - - return libbpf_err_errno(ret); -} - -int bpf_prog_test_run_xattr(struct bpf_prog_test_run_attr *test_attr) -{ - union bpf_attr attr; - int ret; - - if (!test_attr->data_out && test_attr->data_size_out > 0) - return libbpf_err(-EINVAL); - - memset(&attr, 0, sizeof(attr)); - attr.test.prog_fd = test_attr->prog_fd; - attr.test.data_in = ptr_to_u64(test_attr->data_in); - attr.test.data_out = ptr_to_u64(test_attr->data_out); - attr.test.data_size_in = test_attr->data_size_in; - attr.test.data_size_out = test_attr->data_size_out; - attr.test.ctx_in = ptr_to_u64(test_attr->ctx_in); - attr.test.ctx_out = ptr_to_u64(test_attr->ctx_out); - attr.test.ctx_size_in = test_attr->ctx_size_in; - attr.test.ctx_size_out = test_attr->ctx_size_out; - attr.test.repeat = test_attr->repeat; - - ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, sizeof(attr)); - - test_attr->data_size_out = attr.test.data_size_out; - test_attr->ctx_size_out = attr.test.ctx_size_out; - test_attr->retval = attr.test.retval; - test_attr->duration = attr.test.duration; - - return libbpf_err_errno(ret); -} - int bpf_prog_test_run_opts(int prog_fd, struct bpf_test_run_opts *opts) { union bpf_attr attr; @@ -1162,27 +1017,6 @@ int bpf_btf_load(const void *btf_data, size_t btf_size, const struct bpf_btf_loa return libbpf_err_errno(fd); } -int bpf_load_btf(const void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size, bool do_log) -{ - LIBBPF_OPTS(bpf_btf_load_opts, opts); - int fd; - -retry: - if (do_log && log_buf && log_buf_size) { - opts.log_buf = log_buf; - opts.log_size = log_buf_size; - opts.log_level = 1; - } - - fd = bpf_btf_load(btf, btf_size, &opts); - if (fd < 0 && !do_log && log_buf && log_buf_size) { - do_log = true; - goto retry; - } - - return libbpf_err_errno(fd); -} - int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset, __u64 *probe_addr) diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index cabc03703e29..af7baf5da74d 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -103,54 +103,6 @@ LIBBPF_API int bpf_prog_load(enum bpf_prog_type prog_type, const char *prog_name, const char *license, const struct bpf_insn *insns, size_t insn_cnt, const struct bpf_prog_load_opts *opts); -/* this "specialization" should go away in libbpf 1.0 */ -LIBBPF_API int bpf_prog_load_v0_6_0(enum bpf_prog_type prog_type, - const char *prog_name, const char *license, - const struct bpf_insn *insns, size_t insn_cnt, - const struct bpf_prog_load_opts *opts); - -/* This is an elaborate way to not conflict with deprecated bpf_prog_load() - * API, defined in libbpf.h. Once we hit libbpf 1.0, all this will be gone. - * With this approach, if someone is calling bpf_prog_load() with - * 4 arguments, they will use the deprecated API, which keeps backwards - * compatibility (both source code and binary). If bpf_prog_load() is called - * with 6 arguments, though, it gets redirected to __bpf_prog_load. - * So looking forward to libbpf 1.0 when this hack will be gone and - * __bpf_prog_load() will be called just bpf_prog_load(). - */ -#ifndef bpf_prog_load -#define bpf_prog_load(...) ___libbpf_overload(___bpf_prog_load, __VA_ARGS__) -#define ___bpf_prog_load4(file, type, pobj, prog_fd) \ - bpf_prog_load_deprecated(file, type, pobj, prog_fd) -#define ___bpf_prog_load6(prog_type, prog_name, license, insns, insn_cnt, opts) \ - bpf_prog_load(prog_type, prog_name, license, insns, insn_cnt, opts) -#endif /* bpf_prog_load */ - -struct bpf_load_program_attr { - enum bpf_prog_type prog_type; - enum bpf_attach_type expected_attach_type; - const char *name; - const struct bpf_insn *insns; - size_t insns_cnt; - const char *license; - union { - __u32 kern_version; - __u32 attach_prog_fd; - }; - union { - __u32 prog_ifindex; - __u32 attach_btf_id; - }; - __u32 prog_btf_fd; - __u32 func_info_rec_size; - const void *func_info; - __u32 func_info_cnt; - __u32 line_info_rec_size; - const void *line_info; - __u32 line_info_cnt; - __u32 log_level; - __u32 prog_flags; -}; /* Flags to direct loading requirements */ #define MAPS_RELAX_COMPAT 0x01 @@ -158,22 +110,6 @@ struct bpf_load_program_attr { /* Recommended log buffer size */ #define BPF_LOG_BUF_SIZE (UINT32_MAX >> 8) /* verifier maximum in kernels <= 5.1 */ -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_prog_load() instead") -LIBBPF_API int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, - char *log_buf, size_t log_buf_sz); -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_prog_load() instead") -LIBBPF_API int bpf_load_program(enum bpf_prog_type type, - const struct bpf_insn *insns, size_t insns_cnt, - const char *license, __u32 kern_version, - char *log_buf, size_t log_buf_sz); -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_prog_load() instead") -LIBBPF_API int bpf_verify_program(enum bpf_prog_type type, - const struct bpf_insn *insns, - size_t insns_cnt, __u32 prog_flags, - const char *license, __u32 kern_version, - char *log_buf, size_t log_buf_sz, - int log_level); - struct bpf_btf_load_opts { size_t sz; /* size of this struct for forward/backward compatibility */ @@ -187,10 +123,6 @@ struct bpf_btf_load_opts { LIBBPF_API int bpf_btf_load(const void *btf_data, size_t btf_size, const struct bpf_btf_load_opts *opts); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_btf_load() instead") -LIBBPF_API int bpf_load_btf(const void *btf, __u32 btf_size, char *log_buf, - __u32 log_buf_size, bool do_log); - LIBBPF_API int bpf_map_update_elem(int fd, const void *key, const void *value, __u64 flags); @@ -353,10 +285,6 @@ LIBBPF_API int bpf_prog_attach(int prog_fd, int attachable_fd, LIBBPF_API int bpf_prog_attach_opts(int prog_fd, int attachable_fd, enum bpf_attach_type type, const struct bpf_prog_attach_opts *opts); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_prog_attach_opts() instead") -LIBBPF_API int bpf_prog_attach_xattr(int prog_fd, int attachable_fd, - enum bpf_attach_type type, - const struct bpf_prog_attach_opts *opts); LIBBPF_API int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); LIBBPF_API int bpf_prog_detach2(int prog_fd, int attachable_fd, enum bpf_attach_type type); @@ -422,17 +350,6 @@ struct bpf_prog_test_run_attr { * out: length of cxt_out */ }; -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_prog_test_run_opts() instead") -LIBBPF_API int bpf_prog_test_run_xattr(struct bpf_prog_test_run_attr *test_attr); - -/* - * bpf_prog_test_run does not check that data_out is large enough. Consider - * using bpf_prog_test_run_opts instead. - */ -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_prog_test_run_opts() instead") -LIBBPF_API int bpf_prog_test_run(int prog_fd, int repeat, void *data, - __u32 size, void *data_out, __u32 *size_out, - __u32 *retval, __u32 *duration); LIBBPF_API int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id); LIBBPF_API int bpf_map_get_next_id(__u32 start_id, __u32 *next_id); LIBBPF_API int bpf_btf_get_next_id(__u32 start_id, __u32 *next_id); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 335467ece75f..b11d3e689126 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10205,95 +10205,6 @@ long libbpf_get_error(const void *ptr) return -errno; } -__attribute__((alias("bpf_prog_load_xattr2"))) -int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, - struct bpf_object **pobj, int *prog_fd); - -static int bpf_prog_load_xattr2(const struct bpf_prog_load_attr *attr, - struct bpf_object **pobj, int *prog_fd) -{ - struct bpf_object_open_attr open_attr = {}; - struct bpf_program *prog, *first_prog = NULL; - struct bpf_object *obj; - struct bpf_map *map; - int err; - - if (!attr) - return libbpf_err(-EINVAL); - if (!attr->file) - return libbpf_err(-EINVAL); - - open_attr.file = attr->file; - open_attr.prog_type = attr->prog_type; - - obj = __bpf_object__open_xattr(&open_attr, 0); - err = libbpf_get_error(obj); - if (err) - return libbpf_err(-ENOENT); - - bpf_object__for_each_program(prog, obj) { - enum bpf_attach_type attach_type = attr->expected_attach_type; - /* - * to preserve backwards compatibility, bpf_prog_load treats - * attr->prog_type, if specified, as an override to whatever - * bpf_object__open guessed - */ - if (attr->prog_type != BPF_PROG_TYPE_UNSPEC) { - prog->type = attr->prog_type; - prog->expected_attach_type = attach_type; - } - if (bpf_program__type(prog) == BPF_PROG_TYPE_UNSPEC) { - /* - * we haven't guessed from section name and user - * didn't provide a fallback type, too bad... - */ - bpf_object__close(obj); - return libbpf_err(-EINVAL); - } - - prog->prog_ifindex = attr->ifindex; - prog->log_level = attr->log_level; - prog->prog_flags |= attr->prog_flags; - if (!first_prog) - first_prog = prog; - } - - bpf_object__for_each_map(map, obj) { - if (map->def.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) - map->map_ifindex = attr->ifindex; - } - - if (!first_prog) { - pr_warn("object file doesn't contain bpf program\n"); - bpf_object__close(obj); - return libbpf_err(-ENOENT); - } - - err = bpf_object__load(obj); - if (err) { - bpf_object__close(obj); - return libbpf_err(err); - } - - *pobj = obj; - *prog_fd = bpf_program__fd(first_prog); - return 0; -} - -COMPAT_VERSION(bpf_prog_load_deprecated, bpf_prog_load, LIBBPF_0.0.1) -int bpf_prog_load_deprecated(const char *file, enum bpf_prog_type type, - struct bpf_object **pobj, int *prog_fd) -{ - struct bpf_prog_load_attr attr; - - memset(&attr, 0, sizeof(struct bpf_prog_load_attr)); - attr.file = file; - attr.prog_type = type; - attr.expected_attach_type = 0; - - return bpf_prog_load_xattr2(&attr, pobj, prog_fd); -} - /* Replace link's underlying BPF program with the new one */ int bpf_link__update_program(struct bpf_link *link, struct bpf_program *prog) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index fa27969da0da..0b2de10b5878 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1164,22 +1164,6 @@ LIBBPF_API int bpf_map__get_next_key(const struct bpf_map *map, */ LIBBPF_API long libbpf_get_error(const void *ptr); -struct bpf_prog_load_attr { - const char *file; - enum bpf_prog_type prog_type; - enum bpf_attach_type expected_attach_type; - int ifindex; - int log_level; - int prog_flags; -}; - -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_object__open() and bpf_object__load() instead") -LIBBPF_API int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, - struct bpf_object **pobj, int *prog_fd); -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__open() and bpf_object__load() instead") -LIBBPF_API int bpf_prog_load_deprecated(const char *file, enum bpf_prog_type type, - struct bpf_object **pobj, int *prog_fd); - /* XDP related API */ struct xdp_link_info { __u32 prog_id; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index da7a4f928452..a480490914a4 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -1,15 +1,6 @@ LIBBPF_0.0.1 { global: bpf_btf_get_fd_by_id; - bpf_create_map; - bpf_create_map_in_map; - bpf_create_map_in_map_node; - bpf_create_map_name; - bpf_create_map_node; - bpf_create_map_xattr; - bpf_load_btf; - bpf_load_program; - bpf_load_program_xattr; bpf_map__btf_key_type_id; bpf_map__btf_value_type_id; bpf_map__def; @@ -61,11 +52,7 @@ LIBBPF_0.0.1 { bpf_prog_detach2; bpf_prog_get_fd_by_id; bpf_prog_get_next_id; - bpf_prog_load; - bpf_prog_load_xattr; bpf_prog_query; - bpf_prog_test_run; - bpf_prog_test_run_xattr; bpf_program__fd; bpf_program__is_kprobe; bpf_program__is_perf_event; @@ -106,7 +93,6 @@ LIBBPF_0.0.1 { bpf_raw_tracepoint_open; bpf_set_link_xdp_fd; bpf_task_fd_query; - bpf_verify_program; btf__fd; btf__find_by_name; btf__free; @@ -218,7 +204,6 @@ LIBBPF_0.0.7 { bpf_object__load_skeleton; bpf_object__open_skeleton; bpf_probe_large_insn_limit; - bpf_prog_attach_xattr; bpf_program__attach; bpf_program__name; bpf_program__is_extension; @@ -387,7 +372,6 @@ LIBBPF_0.6.0 { bpf_object__next_program; bpf_object__prev_map; bpf_object__prev_program; - bpf_prog_load_deprecated; bpf_prog_load; bpf_program__flags; bpf_program__insn_cnt; From 53e6af3a761c6913164303f4f31da55ee2d3b134 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:15 -0700 Subject: [PATCH 32/94] libbpf: remove deprecated XDP APIs Get rid of deprecated bpf_set_link*() and bpf_get_link*() APIs. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.h | 20 ------------- tools/lib/bpf/libbpf.map | 4 --- tools/lib/bpf/netlink.c | 62 ++++++---------------------------------- 3 files changed, 8 insertions(+), 78 deletions(-) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 0b2de10b5878..d1c93a1e7d66 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1164,15 +1164,6 @@ LIBBPF_API int bpf_map__get_next_key(const struct bpf_map *map, */ LIBBPF_API long libbpf_get_error(const void *ptr); -/* XDP related API */ -struct xdp_link_info { - __u32 prog_id; - __u32 drv_prog_id; - __u32 hw_prog_id; - __u32 skb_prog_id; - __u8 attach_mode; -}; - struct bpf_xdp_set_link_opts { size_t sz; int old_fd; @@ -1180,17 +1171,6 @@ struct bpf_xdp_set_link_opts { }; #define bpf_xdp_set_link_opts__last_field old_fd -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_xdp_attach() instead") -LIBBPF_API int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_xdp_attach() instead") -LIBBPF_API int bpf_set_link_xdp_fd_opts(int ifindex, int fd, __u32 flags, - const struct bpf_xdp_set_link_opts *opts); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_xdp_query_id() instead") -LIBBPF_API int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_xdp_query() instead") -LIBBPF_API int bpf_get_link_xdp_info(int ifindex, struct xdp_link_info *info, - size_t info_size, __u32 flags); - struct bpf_xdp_attach_opts { size_t sz; int old_prog_fd; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index a480490914a4..713c769f125a 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -91,7 +91,6 @@ LIBBPF_0.0.1 { bpf_prog_linfo__lfind_addr_func; bpf_prog_linfo__lfind; bpf_raw_tracepoint_open; - bpf_set_link_xdp_fd; bpf_task_fd_query; btf__fd; btf__find_by_name; @@ -120,7 +119,6 @@ LIBBPF_0.0.2 { bpf_map_lookup_elem_flags; bpf_object__btf; bpf_object__find_map_fd_by_name; - bpf_get_link_xdp_id; btf__dedup; btf__get_map_kv_tids; btf__get_nr_types; @@ -172,7 +170,6 @@ LIBBPF_0.0.5 { LIBBPF_0.0.6 { global: - bpf_get_link_xdp_info; bpf_map__get_pin_path; bpf_map__is_pinned; bpf_map__set_pin_path; @@ -231,7 +228,6 @@ LIBBPF_0.0.8 { bpf_program__is_lsm; bpf_program__set_attach_target; bpf_program__set_lsm; - bpf_set_link_xdp_fd_opts; } LIBBPF_0.0.7; LIBBPF_0.0.9 { diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index cbc8967d5402..6c013168032d 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -27,6 +27,14 @@ typedef int (*libbpf_dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb); typedef int (*__dump_nlmsg_t)(struct nlmsghdr *nlmsg, libbpf_dump_nlmsg_t, void *cookie); +struct xdp_link_info { + __u32 prog_id; + __u32 drv_prog_id; + __u32 hw_prog_id; + __u32 skb_prog_id; + __u8 attach_mode; +}; + struct xdp_id_md { int ifindex; __u32 flags; @@ -288,31 +296,6 @@ int bpf_xdp_detach(int ifindex, __u32 flags, const struct bpf_xdp_attach_opts *o return bpf_xdp_attach(ifindex, -1, flags, opts); } -int bpf_set_link_xdp_fd_opts(int ifindex, int fd, __u32 flags, - const struct bpf_xdp_set_link_opts *opts) -{ - int old_fd = -1, ret; - - if (!OPTS_VALID(opts, bpf_xdp_set_link_opts)) - return libbpf_err(-EINVAL); - - if (OPTS_HAS(opts, old_fd)) { - old_fd = OPTS_GET(opts, old_fd, -1); - flags |= XDP_FLAGS_REPLACE; - } - - ret = __bpf_set_link_xdp_fd_replace(ifindex, fd, old_fd, flags); - return libbpf_err(ret); -} - -int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags) -{ - int ret; - - ret = __bpf_set_link_xdp_fd_replace(ifindex, fd, 0, flags); - return libbpf_err(ret); -} - static int __dump_link_nlmsg(struct nlmsghdr *nlh, libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie) { @@ -413,30 +396,6 @@ int bpf_xdp_query(int ifindex, int xdp_flags, struct bpf_xdp_query_opts *opts) return 0; } -int bpf_get_link_xdp_info(int ifindex, struct xdp_link_info *info, - size_t info_size, __u32 flags) -{ - LIBBPF_OPTS(bpf_xdp_query_opts, opts); - size_t sz; - int err; - - if (!info_size) - return libbpf_err(-EINVAL); - - err = bpf_xdp_query(ifindex, flags, &opts); - if (err) - return libbpf_err(err); - - /* struct xdp_link_info field layout matches struct bpf_xdp_query_opts - * layout after sz field - */ - sz = min(info_size, offsetofend(struct xdp_link_info, attach_mode)); - memcpy(info, &opts.prog_id, sz); - memset((void *)info + sz, 0, info_size - sz); - - return 0; -} - int bpf_xdp_query_id(int ifindex, int flags, __u32 *prog_id) { LIBBPF_OPTS(bpf_xdp_query_opts, opts); @@ -463,11 +422,6 @@ int bpf_xdp_query_id(int ifindex, int flags, __u32 *prog_id) } -int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags) -{ - return bpf_xdp_query_id(ifindex, flags, prog_id); -} - typedef int (*qdisc_config_t)(struct libbpf_nla_req *req); static int clsact_config(struct libbpf_nla_req *req) From d320fad217b79849d66b53d7fdb361020ab4f64c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:16 -0700 Subject: [PATCH 33/94] libbpf: remove deprecated probing APIs Get rid of deprecated feature-probing APIs. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.h | 8 --- tools/lib/bpf/libbpf.map | 4 -- tools/lib/bpf/libbpf_probes.c | 125 ++-------------------------------- 3 files changed, 5 insertions(+), 132 deletions(-) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index d1c93a1e7d66..71b19ae1659c 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1412,14 +1412,6 @@ bpf_prog_linfo__lfind(const struct bpf_prog_linfo *prog_linfo, * user, causing subsequent probes to fail. In this case, the caller may want * to adjust that limit with setrlimit(). */ -LIBBPF_DEPRECATED_SINCE(0, 8, "use libbpf_probe_bpf_prog_type() instead") -LIBBPF_API bool bpf_probe_prog_type(enum bpf_prog_type prog_type, __u32 ifindex); -LIBBPF_DEPRECATED_SINCE(0, 8, "use libbpf_probe_bpf_map_type() instead") -LIBBPF_API bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex); -LIBBPF_DEPRECATED_SINCE(0, 8, "use libbpf_probe_bpf_helper() instead") -LIBBPF_API bool bpf_probe_helper(enum bpf_func_id id, enum bpf_prog_type prog_type, __u32 ifindex); -LIBBPF_DEPRECATED_SINCE(0, 8, "implement your own or use bpftool for feature detection") -LIBBPF_API bool bpf_probe_large_insn_limit(__u32 ifindex); /** * @brief **libbpf_probe_bpf_prog_type()** detects if host kernel supports diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 713c769f125a..3cea0bab95ea 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -112,9 +112,6 @@ LIBBPF_0.0.1 { LIBBPF_0.0.2 { global: - bpf_probe_helper; - bpf_probe_map_type; - bpf_probe_prog_type; bpf_map__resize; bpf_map_lookup_elem_flags; bpf_object__btf; @@ -200,7 +197,6 @@ LIBBPF_0.0.7 { bpf_object__detach_skeleton; bpf_object__load_skeleton; bpf_object__open_skeleton; - bpf_probe_large_insn_limit; bpf_program__attach; bpf_program__name; bpf_program__is_extension; diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 97b06cede56f..0b5398786bf3 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -17,47 +17,14 @@ #include "libbpf.h" #include "libbpf_internal.h" -static bool grep(const char *buffer, const char *pattern) -{ - return !!strstr(buffer, pattern); -} - -static int get_vendor_id(int ifindex) -{ - char ifname[IF_NAMESIZE], path[64], buf[8]; - ssize_t len; - int fd; - - if (!if_indextoname(ifindex, ifname)) - return -1; - - snprintf(path, sizeof(path), "/sys/class/net/%s/device/vendor", ifname); - - fd = open(path, O_RDONLY | O_CLOEXEC); - if (fd < 0) - return -1; - - len = read(fd, buf, sizeof(buf)); - close(fd); - if (len < 0) - return -1; - if (len >= (ssize_t)sizeof(buf)) - return -1; - buf[len] = '\0'; - - return strtol(buf, NULL, 0); -} - static int probe_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, size_t insns_cnt, - char *log_buf, size_t log_buf_sz, - __u32 ifindex) + char *log_buf, size_t log_buf_sz) { LIBBPF_OPTS(bpf_prog_load_opts, opts, .log_buf = log_buf, .log_size = log_buf_sz, .log_level = log_buf ? 1 : 0, - .prog_ifindex = ifindex, ); int fd, err, exp_err = 0; const char *exp_msg = NULL; @@ -161,31 +128,10 @@ int libbpf_probe_bpf_prog_type(enum bpf_prog_type prog_type, const void *opts) if (opts) return libbpf_err(-EINVAL); - ret = probe_prog_load(prog_type, insns, insn_cnt, NULL, 0, 0); + ret = probe_prog_load(prog_type, insns, insn_cnt, NULL, 0); return libbpf_err(ret); } -bool bpf_probe_prog_type(enum bpf_prog_type prog_type, __u32 ifindex) -{ - struct bpf_insn insns[2] = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN() - }; - - /* prefer libbpf_probe_bpf_prog_type() unless offload is requested */ - if (ifindex == 0) - return libbpf_probe_bpf_prog_type(prog_type, NULL) == 1; - - if (ifindex && prog_type == BPF_PROG_TYPE_SCHED_CLS) - /* nfp returns -EINVAL on exit(0) with TC offload */ - insns[0].imm = 2; - - errno = 0; - probe_prog_load(prog_type, insns, ARRAY_SIZE(insns), NULL, 0, ifindex); - - return errno != EINVAL && errno != EOPNOTSUPP; -} - int libbpf__load_raw_btf(const char *raw_types, size_t types_len, const char *str_sec, size_t str_len) { @@ -242,15 +188,13 @@ static int load_local_storage_btf(void) strs, sizeof(strs)); } -static int probe_map_create(enum bpf_map_type map_type, __u32 ifindex) +static int probe_map_create(enum bpf_map_type map_type) { LIBBPF_OPTS(bpf_map_create_opts, opts); int key_size, value_size, max_entries; __u32 btf_key_type_id = 0, btf_value_type_id = 0; int fd = -1, btf_fd = -1, fd_inner = -1, exp_err = 0, err; - opts.map_ifindex = ifindex; - key_size = sizeof(__u32); value_size = sizeof(__u32); max_entries = 1; @@ -326,12 +270,6 @@ static int probe_map_create(enum bpf_map_type map_type, __u32 ifindex) if (map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { - /* TODO: probe for device, once libbpf has a function to create - * map-in-map for offload - */ - if (ifindex) - goto cleanup; - fd_inner = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(__u32), sizeof(__u32), 1, NULL); if (fd_inner < 0) @@ -370,15 +308,10 @@ int libbpf_probe_bpf_map_type(enum bpf_map_type map_type, const void *opts) if (opts) return libbpf_err(-EINVAL); - ret = probe_map_create(map_type, 0); + ret = probe_map_create(map_type); return libbpf_err(ret); } -bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex) -{ - return probe_map_create(map_type, ifindex) == 1; -} - int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helper_id, const void *opts) { @@ -407,7 +340,7 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe } buf[0] = '\0'; - ret = probe_prog_load(prog_type, insns, insn_cnt, buf, sizeof(buf), 0); + ret = probe_prog_load(prog_type, insns, insn_cnt, buf, sizeof(buf)); if (ret < 0) return libbpf_err(ret); @@ -427,51 +360,3 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe return 0; return 1; /* assume supported */ } - -bool bpf_probe_helper(enum bpf_func_id id, enum bpf_prog_type prog_type, - __u32 ifindex) -{ - struct bpf_insn insns[2] = { - BPF_EMIT_CALL(id), - BPF_EXIT_INSN() - }; - char buf[4096] = {}; - bool res; - - probe_prog_load(prog_type, insns, ARRAY_SIZE(insns), buf, sizeof(buf), ifindex); - res = !grep(buf, "invalid func ") && !grep(buf, "unknown func "); - - if (ifindex) { - switch (get_vendor_id(ifindex)) { - case 0x19ee: /* Netronome specific */ - res = res && !grep(buf, "not supported by FW") && - !grep(buf, "unsupported function id"); - break; - default: - break; - } - } - - return res; -} - -/* - * Probe for availability of kernel commit (5.3): - * - * c04c0d2b968a ("bpf: increase complexity limit and maximum program size") - */ -bool bpf_probe_large_insn_limit(__u32 ifindex) -{ - struct bpf_insn insns[BPF_MAXINSNS + 1]; - int i; - - for (i = 0; i < BPF_MAXINSNS; i++) - insns[i] = BPF_MOV64_IMM(BPF_REG_0, 1); - insns[BPF_MAXINSNS] = BPF_EXIT_INSN(); - - errno = 0; - probe_prog_load(BPF_PROG_TYPE_SCHED_CLS, insns, ARRAY_SIZE(insns), NULL, 0, - ifindex); - - return errno != E2BIG && errno != EINVAL; -} From aaf6886d9b53d34f0a3d2f577ddc1224026d12ab Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:17 -0700 Subject: [PATCH 34/94] libbpf: remove deprecated BTF APIs Get rid of deprecated BTF-related APIs. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/btf.c | 183 +-------------------------------------- tools/lib/bpf/btf.h | 86 +----------------- tools/lib/bpf/btf_dump.c | 23 ++--- tools/lib/bpf/libbpf.c | 44 +++------- tools/lib/bpf/libbpf.map | 13 --- 5 files changed, 24 insertions(+), 325 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index ae1520f7e1b0..2d14f1a52d7a 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -448,11 +448,6 @@ static int btf_parse_type_sec(struct btf *btf) return 0; } -__u32 btf__get_nr_types(const struct btf *btf) -{ - return btf->start_id + btf->nr_types - 1; -} - __u32 btf__type_cnt(const struct btf *btf) { return btf->start_id + btf->nr_types; @@ -1408,92 +1403,6 @@ struct btf *btf__load_from_kernel_by_id(__u32 id) return btf__load_from_kernel_by_id_split(id, NULL); } -int btf__get_from_id(__u32 id, struct btf **btf) -{ - struct btf *res; - int err; - - *btf = NULL; - res = btf__load_from_kernel_by_id(id); - err = libbpf_get_error(res); - - if (err) - return libbpf_err(err); - - *btf = res; - return 0; -} - -int btf__get_map_kv_tids(const struct btf *btf, const char *map_name, - __u32 expected_key_size, __u32 expected_value_size, - __u32 *key_type_id, __u32 *value_type_id) -{ - const struct btf_type *container_type; - const struct btf_member *key, *value; - const size_t max_name = 256; - char container_name[max_name]; - __s64 key_size, value_size; - __s32 container_id; - - if (snprintf(container_name, max_name, "____btf_map_%s", map_name) == max_name) { - pr_warn("map:%s length of '____btf_map_%s' is too long\n", - map_name, map_name); - return libbpf_err(-EINVAL); - } - - container_id = btf__find_by_name(btf, container_name); - if (container_id < 0) { - pr_debug("map:%s container_name:%s cannot be found in BTF. Missing BPF_ANNOTATE_KV_PAIR?\n", - map_name, container_name); - return libbpf_err(container_id); - } - - container_type = btf__type_by_id(btf, container_id); - if (!container_type) { - pr_warn("map:%s cannot find BTF type for container_id:%u\n", - map_name, container_id); - return libbpf_err(-EINVAL); - } - - if (!btf_is_struct(container_type) || btf_vlen(container_type) < 2) { - pr_warn("map:%s container_name:%s is an invalid container struct\n", - map_name, container_name); - return libbpf_err(-EINVAL); - } - - key = btf_members(container_type); - value = key + 1; - - key_size = btf__resolve_size(btf, key->type); - if (key_size < 0) { - pr_warn("map:%s invalid BTF key_type_size\n", map_name); - return libbpf_err(key_size); - } - - if (expected_key_size != key_size) { - pr_warn("map:%s btf_key_type_size:%u != map_def_key_size:%u\n", - map_name, (__u32)key_size, expected_key_size); - return libbpf_err(-EINVAL); - } - - value_size = btf__resolve_size(btf, value->type); - if (value_size < 0) { - pr_warn("map:%s invalid BTF value_type_size\n", map_name); - return libbpf_err(value_size); - } - - if (expected_value_size != value_size) { - pr_warn("map:%s btf_value_type_size:%u != map_def_value_size:%u\n", - map_name, (__u32)value_size, expected_value_size); - return libbpf_err(-EINVAL); - } - - *key_type_id = key->type; - *value_type_id = value->type; - - return 0; -} - static void btf_invalidate_raw_data(struct btf *btf) { if (btf->raw_data) { @@ -2965,81 +2874,6 @@ const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext, __u32 *size) return btf_ext->data; } -static int btf_ext_reloc_info(const struct btf *btf, - const struct btf_ext_info *ext_info, - const char *sec_name, __u32 insns_cnt, - void **info, __u32 *cnt) -{ - __u32 sec_hdrlen = sizeof(struct btf_ext_info_sec); - __u32 i, record_size, existing_len, records_len; - struct btf_ext_info_sec *sinfo; - const char *info_sec_name; - __u64 remain_len; - void *data; - - record_size = ext_info->rec_size; - sinfo = ext_info->info; - remain_len = ext_info->len; - while (remain_len > 0) { - records_len = sinfo->num_info * record_size; - info_sec_name = btf__name_by_offset(btf, sinfo->sec_name_off); - if (strcmp(info_sec_name, sec_name)) { - remain_len -= sec_hdrlen + records_len; - sinfo = (void *)sinfo + sec_hdrlen + records_len; - continue; - } - - existing_len = (*cnt) * record_size; - data = realloc(*info, existing_len + records_len); - if (!data) - return libbpf_err(-ENOMEM); - - memcpy(data + existing_len, sinfo->data, records_len); - /* adjust insn_off only, the rest data will be passed - * to the kernel. - */ - for (i = 0; i < sinfo->num_info; i++) { - __u32 *insn_off; - - insn_off = data + existing_len + (i * record_size); - *insn_off = *insn_off / sizeof(struct bpf_insn) + insns_cnt; - } - *info = data; - *cnt += sinfo->num_info; - return 0; - } - - return libbpf_err(-ENOENT); -} - -int btf_ext__reloc_func_info(const struct btf *btf, - const struct btf_ext *btf_ext, - const char *sec_name, __u32 insns_cnt, - void **func_info, __u32 *cnt) -{ - return btf_ext_reloc_info(btf, &btf_ext->func_info, sec_name, - insns_cnt, func_info, cnt); -} - -int btf_ext__reloc_line_info(const struct btf *btf, - const struct btf_ext *btf_ext, - const char *sec_name, __u32 insns_cnt, - void **line_info, __u32 *cnt) -{ - return btf_ext_reloc_info(btf, &btf_ext->line_info, sec_name, - insns_cnt, line_info, cnt); -} - -__u32 btf_ext__func_info_rec_size(const struct btf_ext *btf_ext) -{ - return btf_ext->func_info.rec_size; -} - -__u32 btf_ext__line_info_rec_size(const struct btf_ext *btf_ext) -{ - return btf_ext->line_info.rec_size; -} - struct btf_dedup; static struct btf_dedup *btf_dedup_new(struct btf *btf, const struct btf_dedup_opts *opts); @@ -3189,9 +3023,7 @@ static int btf_dedup_remap_types(struct btf_dedup *d); * deduplicating structs/unions is described in greater details in comments for * `btf_dedup_is_equiv` function. */ - -DEFAULT_VERSION(btf__dedup_v0_6_0, btf__dedup, LIBBPF_0.6.0) -int btf__dedup_v0_6_0(struct btf *btf, const struct btf_dedup_opts *opts) +int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts) { struct btf_dedup *d; int err; @@ -3251,19 +3083,6 @@ done: return libbpf_err(err); } -COMPAT_VERSION(btf__dedup_deprecated, btf__dedup, LIBBPF_0.0.2) -int btf__dedup_deprecated(struct btf *btf, struct btf_ext *btf_ext, const void *unused_opts) -{ - LIBBPF_OPTS(btf_dedup_opts, opts, .btf_ext = btf_ext); - - if (unused_opts) { - pr_warn("please use new version of btf__dedup() that supports options\n"); - return libbpf_err(-ENOTSUP); - } - - return btf__dedup(btf, &opts); -} - #define BTF_UNPROCESSED_ID ((__u32)-1) #define BTF_IN_PROGRESS_ID ((__u32)-2) diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 9fb416eb5644..583760df83b4 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -120,20 +120,12 @@ LIBBPF_API struct btf *libbpf_find_kernel_btf(void); LIBBPF_API struct btf *btf__load_from_kernel_by_id(__u32 id); LIBBPF_API struct btf *btf__load_from_kernel_by_id_split(__u32 id, struct btf *base_btf); -LIBBPF_DEPRECATED_SINCE(0, 6, "use btf__load_from_kernel_by_id instead") -LIBBPF_API int btf__get_from_id(__u32 id, struct btf **btf); -LIBBPF_DEPRECATED_SINCE(0, 6, "intended for internal libbpf use only") -LIBBPF_API int btf__finalize_data(struct bpf_object *obj, struct btf *btf); -LIBBPF_DEPRECATED_SINCE(0, 6, "use btf__load_into_kernel instead") -LIBBPF_API int btf__load(struct btf *btf); LIBBPF_API int btf__load_into_kernel(struct btf *btf); LIBBPF_API __s32 btf__find_by_name(const struct btf *btf, const char *type_name); LIBBPF_API __s32 btf__find_by_name_kind(const struct btf *btf, const char *type_name, __u32 kind); -LIBBPF_DEPRECATED_SINCE(0, 7, "use btf__type_cnt() instead; note that btf__get_nr_types() == btf__type_cnt() - 1") -LIBBPF_API __u32 btf__get_nr_types(const struct btf *btf); LIBBPF_API __u32 btf__type_cnt(const struct btf *btf); LIBBPF_API const struct btf *btf__base_btf(const struct btf *btf); LIBBPF_API const struct btf_type *btf__type_by_id(const struct btf *btf, @@ -150,29 +142,10 @@ LIBBPF_API void btf__set_fd(struct btf *btf, int fd); LIBBPF_API const void *btf__raw_data(const struct btf *btf, __u32 *size); LIBBPF_API const char *btf__name_by_offset(const struct btf *btf, __u32 offset); LIBBPF_API const char *btf__str_by_offset(const struct btf *btf, __u32 offset); -LIBBPF_DEPRECATED_SINCE(0, 7, "this API is not necessary when BTF-defined maps are used") -LIBBPF_API int btf__get_map_kv_tids(const struct btf *btf, const char *map_name, - __u32 expected_key_size, - __u32 expected_value_size, - __u32 *key_type_id, __u32 *value_type_id); LIBBPF_API struct btf_ext *btf_ext__new(const __u8 *data, __u32 size); LIBBPF_API void btf_ext__free(struct btf_ext *btf_ext); LIBBPF_API const void *btf_ext__raw_data(const struct btf_ext *btf_ext, __u32 *size); -LIBBPF_API LIBBPF_DEPRECATED("btf_ext__reloc_func_info was never meant as a public API and has wrong assumptions embedded in it; it will be removed in the future libbpf versions") -int btf_ext__reloc_func_info(const struct btf *btf, - const struct btf_ext *btf_ext, - const char *sec_name, __u32 insns_cnt, - void **func_info, __u32 *cnt); -LIBBPF_API LIBBPF_DEPRECATED("btf_ext__reloc_line_info was never meant as a public API and has wrong assumptions embedded in it; it will be removed in the future libbpf versions") -int btf_ext__reloc_line_info(const struct btf *btf, - const struct btf_ext *btf_ext, - const char *sec_name, __u32 insns_cnt, - void **line_info, __u32 *cnt); -LIBBPF_API LIBBPF_DEPRECATED("btf_ext__reloc_func_info is deprecated; write custom func_info parsing to fetch rec_size") -__u32 btf_ext__func_info_rec_size(const struct btf_ext *btf_ext); -LIBBPF_API LIBBPF_DEPRECATED("btf_ext__reloc_line_info is deprecated; write custom line_info parsing to fetch rec_size") -__u32 btf_ext__line_info_rec_size(const struct btf_ext *btf_ext); LIBBPF_API int btf__find_str(struct btf *btf, const char *s); LIBBPF_API int btf__add_str(struct btf *btf, const char *s); @@ -259,22 +232,12 @@ struct btf_dedup_opts { LIBBPF_API int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts); -LIBBPF_API int btf__dedup_v0_6_0(struct btf *btf, const struct btf_dedup_opts *opts); - -LIBBPF_DEPRECATED_SINCE(0, 7, "use btf__dedup() instead") -LIBBPF_API int btf__dedup_deprecated(struct btf *btf, struct btf_ext *btf_ext, const void *opts); -#define btf__dedup(...) ___libbpf_overload(___btf_dedup, __VA_ARGS__) -#define ___btf_dedup3(btf, btf_ext, opts) btf__dedup_deprecated(btf, btf_ext, opts) -#define ___btf_dedup2(btf, opts) btf__dedup(btf, opts) - struct btf_dump; struct btf_dump_opts { - union { - size_t sz; - void *ctx; /* DEPRECATED: will be gone in v1.0 */ - }; + size_t sz; }; +#define btf_dump_opts__last_field sz typedef void (*btf_dump_printf_fn_t)(void *ctx, const char *fmt, va_list args); @@ -283,51 +246,6 @@ LIBBPF_API struct btf_dump *btf_dump__new(const struct btf *btf, void *ctx, const struct btf_dump_opts *opts); -LIBBPF_API struct btf_dump *btf_dump__new_v0_6_0(const struct btf *btf, - btf_dump_printf_fn_t printf_fn, - void *ctx, - const struct btf_dump_opts *opts); - -LIBBPF_API struct btf_dump *btf_dump__new_deprecated(const struct btf *btf, - const struct btf_ext *btf_ext, - const struct btf_dump_opts *opts, - btf_dump_printf_fn_t printf_fn); - -/* Choose either btf_dump__new() or btf_dump__new_deprecated() based on the - * type of 4th argument. If it's btf_dump's print callback, use deprecated - * API; otherwise, choose the new btf_dump__new(). ___libbpf_override() - * doesn't work here because both variants have 4 input arguments. - * - * (void *) casts are necessary to avoid compilation warnings about type - * mismatches, because even though __builtin_choose_expr() only ever evaluates - * one side the other side still has to satisfy type constraints (this is - * compiler implementation limitation which might be lifted eventually, - * according to the documentation). So passing struct btf_ext in place of - * btf_dump_printf_fn_t would be generating compilation warning. Casting to - * void * avoids this issue. - * - * Also, two type compatibility checks for a function and function pointer are - * required because passing function reference into btf_dump__new() as - * btf_dump__new(..., my_callback, ...) and as btf_dump__new(..., - * &my_callback, ...) (not explicit ampersand in the latter case) actually - * differs as far as __builtin_types_compatible_p() is concerned. Thus two - * checks are combined to detect callback argument. - * - * The rest works just like in case of ___libbpf_override() usage with symbol - * versioning. - * - * C++ compilers don't support __builtin_types_compatible_p(), so at least - * don't screw up compilation for them and let C++ users pick btf_dump__new - * vs btf_dump__new_deprecated explicitly. - */ -#ifndef __cplusplus -#define btf_dump__new(a1, a2, a3, a4) __builtin_choose_expr( \ - __builtin_types_compatible_p(typeof(a4), btf_dump_printf_fn_t) || \ - __builtin_types_compatible_p(typeof(a4), void(void *, const char *, va_list)), \ - btf_dump__new_deprecated((void *)a1, (void *)a2, (void *)a3, (void *)a4), \ - btf_dump__new((void *)a1, (void *)a2, (void *)a3, (void *)a4)) -#endif - LIBBPF_API void btf_dump__free(struct btf_dump *d); LIBBPF_API int btf_dump__dump_type(struct btf_dump *d, __u32 id); diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index f5275f819027..400e84fd0578 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -144,15 +144,17 @@ static void btf_dump_printf(const struct btf_dump *d, const char *fmt, ...) static int btf_dump_mark_referenced(struct btf_dump *d); static int btf_dump_resize(struct btf_dump *d); -DEFAULT_VERSION(btf_dump__new_v0_6_0, btf_dump__new, LIBBPF_0.6.0) -struct btf_dump *btf_dump__new_v0_6_0(const struct btf *btf, - btf_dump_printf_fn_t printf_fn, - void *ctx, - const struct btf_dump_opts *opts) +struct btf_dump *btf_dump__new(const struct btf *btf, + btf_dump_printf_fn_t printf_fn, + void *ctx, + const struct btf_dump_opts *opts) { struct btf_dump *d; int err; + if (!OPTS_VALID(opts, btf_dump_opts)) + return libbpf_err_ptr(-EINVAL); + if (!printf_fn) return libbpf_err_ptr(-EINVAL); @@ -188,17 +190,6 @@ err: return libbpf_err_ptr(err); } -COMPAT_VERSION(btf_dump__new_deprecated, btf_dump__new, LIBBPF_0.0.4) -struct btf_dump *btf_dump__new_deprecated(const struct btf *btf, - const struct btf_ext *btf_ext, - const struct btf_dump_opts *opts, - btf_dump_printf_fn_t printf_fn) -{ - if (!printf_fn) - return libbpf_err_ptr(-EINVAL); - return btf_dump__new_v0_6_0(btf, printf_fn, opts ? opts->ctx : NULL, opts); -} - static int btf_dump_resize(struct btf_dump *d) { int err, last_id = btf__type_cnt(d->btf) - 1; diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index b11d3e689126..c0150a7dbd81 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -3061,11 +3061,6 @@ static int btf_finalize_data(struct bpf_object *obj, struct btf *btf) return libbpf_err(err); } -int btf__finalize_data(struct bpf_object *obj, struct btf *btf) -{ - return btf_finalize_data(obj, btf); -} - static int bpf_object__finalize_btf(struct bpf_object *obj) { int err; @@ -4397,9 +4392,7 @@ bpf_object__collect_prog_relos(struct bpf_object *obj, Elf64_Shdr *shdr, Elf_Dat static int bpf_map_find_btf_info(struct bpf_object *obj, struct bpf_map *map) { - struct bpf_map_def *def = &map->def; - __u32 key_type_id = 0, value_type_id = 0; - int ret; + int id; if (!obj->btf) return -ENOENT; @@ -4408,31 +4401,22 @@ static int bpf_map_find_btf_info(struct bpf_object *obj, struct bpf_map *map) * For struct_ops map, it does not need btf_key_type_id and * btf_value_type_id. */ - if (map->sec_idx == obj->efile.btf_maps_shndx || - bpf_map__is_struct_ops(map)) + if (map->sec_idx == obj->efile.btf_maps_shndx || bpf_map__is_struct_ops(map)) return 0; - if (!bpf_map__is_internal(map)) { - pr_warn("Use of BPF_ANNOTATE_KV_PAIR is deprecated, use BTF-defined maps in .maps section instead\n"); -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - ret = btf__get_map_kv_tids(obj->btf, map->name, def->key_size, - def->value_size, &key_type_id, - &value_type_id); -#pragma GCC diagnostic pop - } else { - /* - * LLVM annotates global data differently in BTF, that is, - * only as '.data', '.bss' or '.rodata'. - */ - ret = btf__find_by_name(obj->btf, map->real_name); - } - if (ret < 0) - return ret; + /* + * LLVM annotates global data differently in BTF, that is, + * only as '.data', '.bss' or '.rodata'. + */ + if (!bpf_map__is_internal(map)) + return -ENOENT; - map->btf_key_type_id = key_type_id; - map->btf_value_type_id = bpf_map__is_internal(map) ? - ret : value_type_id; + id = btf__find_by_name(obj->btf, map->real_name); + if (id < 0) + return id; + + map->btf_key_type_id = 0; + map->btf_value_type_id = id; return 0; } diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 3cea0bab95ea..ff5fba17764b 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -95,7 +95,6 @@ LIBBPF_0.0.1 { btf__fd; btf__find_by_name; btf__free; - btf__get_from_id; btf__name_by_offset; btf__new; btf__resolve_size; @@ -116,18 +115,10 @@ LIBBPF_0.0.2 { bpf_map_lookup_elem_flags; bpf_object__btf; bpf_object__find_map_fd_by_name; - btf__dedup; - btf__get_map_kv_tids; - btf__get_nr_types; btf__get_raw_data; - btf__load; btf_ext__free; - btf_ext__func_info_rec_size; btf_ext__get_raw_data; - btf_ext__line_info_rec_size; btf_ext__new; - btf_ext__reloc_func_info; - btf_ext__reloc_line_info; bpf_program__get_prog_info_linear; bpf_program__bpil_addr_to_offs; bpf_program__bpil_offs_to_addr; @@ -137,7 +128,6 @@ LIBBPF_0.0.3 { global: bpf_map__is_internal; bpf_map_freeze; - btf__finalize_data; } LIBBPF_0.0.2; LIBBPF_0.0.4 { @@ -151,7 +141,6 @@ LIBBPF_0.0.4 { bpf_program__attach_uprobe; btf_dump__dump_type; btf_dump__free; - btf_dump__new; btf__parse_elf; libbpf_num_possible_cpus; perf_buffer__free; @@ -373,11 +362,9 @@ LIBBPF_0.6.0 { btf__add_decl_tag; btf__add_type_tag; btf__dedup; - btf__dedup_deprecated; btf__raw_data; btf__type_cnt; btf_dump__new; - btf_dump__new_deprecated; libbpf_major_version; libbpf_minor_version; libbpf_version_string; From 22dd7a58b2e9965c1a612b6a09c4a6146cbb5873 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:18 -0700 Subject: [PATCH 35/94] libbpf: clean up perfbuf APIs Remove deprecated perfbuf APIs and clean up opts structs. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 54 +++++++----------------------- tools/lib/bpf/libbpf.h | 71 ++++------------------------------------ tools/lib/bpf/libbpf.map | 5 --- 3 files changed, 18 insertions(+), 112 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index c0150a7dbd81..38dc00e440c3 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -11975,6 +11975,9 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) return link; } +typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(struct perf_event_header *hdr, + void *private_data); + static enum bpf_perf_event_ret perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, void **copy_mem, size_t *copy_size, @@ -12023,12 +12026,6 @@ perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, return libbpf_err(ret); } -__attribute__((alias("perf_event_read_simple"))) -enum bpf_perf_event_ret -bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, - void **copy_mem, size_t *copy_size, - bpf_perf_event_print_t fn, void *private_data); - struct perf_buffer; struct perf_buffer_params { @@ -12162,12 +12159,11 @@ error: static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, struct perf_buffer_params *p); -DEFAULT_VERSION(perf_buffer__new_v0_6_0, perf_buffer__new, LIBBPF_0.6.0) -struct perf_buffer *perf_buffer__new_v0_6_0(int map_fd, size_t page_cnt, - perf_buffer_sample_fn sample_cb, - perf_buffer_lost_fn lost_cb, - void *ctx, - const struct perf_buffer_opts *opts) +struct perf_buffer *perf_buffer__new(int map_fd, size_t page_cnt, + perf_buffer_sample_fn sample_cb, + perf_buffer_lost_fn lost_cb, + void *ctx, + const struct perf_buffer_opts *opts) { struct perf_buffer_params p = {}; struct perf_event_attr attr = {}; @@ -12189,22 +12185,10 @@ struct perf_buffer *perf_buffer__new_v0_6_0(int map_fd, size_t page_cnt, return libbpf_ptr(__perf_buffer__new(map_fd, page_cnt, &p)); } -COMPAT_VERSION(perf_buffer__new_deprecated, perf_buffer__new, LIBBPF_0.0.4) -struct perf_buffer *perf_buffer__new_deprecated(int map_fd, size_t page_cnt, - const struct perf_buffer_opts *opts) -{ - return perf_buffer__new_v0_6_0(map_fd, page_cnt, - opts ? opts->sample_cb : NULL, - opts ? opts->lost_cb : NULL, - opts ? opts->ctx : NULL, - NULL); -} - -DEFAULT_VERSION(perf_buffer__new_raw_v0_6_0, perf_buffer__new_raw, LIBBPF_0.6.0) -struct perf_buffer *perf_buffer__new_raw_v0_6_0(int map_fd, size_t page_cnt, - struct perf_event_attr *attr, - perf_buffer_event_fn event_cb, void *ctx, - const struct perf_buffer_raw_opts *opts) +struct perf_buffer *perf_buffer__new_raw(int map_fd, size_t page_cnt, + struct perf_event_attr *attr, + perf_buffer_event_fn event_cb, void *ctx, + const struct perf_buffer_raw_opts *opts) { struct perf_buffer_params p = {}; @@ -12224,20 +12208,6 @@ struct perf_buffer *perf_buffer__new_raw_v0_6_0(int map_fd, size_t page_cnt, return libbpf_ptr(__perf_buffer__new(map_fd, page_cnt, &p)); } -COMPAT_VERSION(perf_buffer__new_raw_deprecated, perf_buffer__new_raw, LIBBPF_0.0.4) -struct perf_buffer *perf_buffer__new_raw_deprecated(int map_fd, size_t page_cnt, - const struct perf_buffer_raw_opts *opts) -{ - LIBBPF_OPTS(perf_buffer_raw_opts, inner_opts, - .cpu_cnt = opts->cpu_cnt, - .cpus = opts->cpus, - .map_keys = opts->map_keys, - ); - - return perf_buffer__new_raw_v0_6_0(map_fd, page_cnt, opts->attr, - opts->event_cb, opts->ctx, &inner_opts); -} - static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, struct perf_buffer_params *p) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 71b19ae1659c..5dc4271fcdb7 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1269,17 +1269,7 @@ typedef void (*perf_buffer_lost_fn)(void *ctx, int cpu, __u64 cnt); /* common use perf buffer options */ struct perf_buffer_opts { - union { - size_t sz; - struct { /* DEPRECATED: will be removed in v1.0 */ - /* if specified, sample_cb is called for each sample */ - perf_buffer_sample_fn sample_cb; - /* if specified, lost_cb is called for each batch of lost samples */ - perf_buffer_lost_fn lost_cb; - /* ctx is provided to sample_cb and lost_cb */ - void *ctx; - }; - }; + size_t sz; }; #define perf_buffer_opts__last_field sz @@ -1300,21 +1290,6 @@ perf_buffer__new(int map_fd, size_t page_cnt, perf_buffer_sample_fn sample_cb, perf_buffer_lost_fn lost_cb, void *ctx, const struct perf_buffer_opts *opts); -LIBBPF_API struct perf_buffer * -perf_buffer__new_v0_6_0(int map_fd, size_t page_cnt, - perf_buffer_sample_fn sample_cb, perf_buffer_lost_fn lost_cb, void *ctx, - const struct perf_buffer_opts *opts); - -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "use new variant of perf_buffer__new() instead") -struct perf_buffer *perf_buffer__new_deprecated(int map_fd, size_t page_cnt, - const struct perf_buffer_opts *opts); - -#define perf_buffer__new(...) ___libbpf_overload(___perf_buffer_new, __VA_ARGS__) -#define ___perf_buffer_new6(map_fd, page_cnt, sample_cb, lost_cb, ctx, opts) \ - perf_buffer__new(map_fd, page_cnt, sample_cb, lost_cb, ctx, opts) -#define ___perf_buffer_new3(map_fd, page_cnt, opts) \ - perf_buffer__new_deprecated(map_fd, page_cnt, opts) - enum bpf_perf_event_ret { LIBBPF_PERF_EVENT_DONE = 0, LIBBPF_PERF_EVENT_ERROR = -1, @@ -1328,21 +1303,9 @@ typedef enum bpf_perf_event_ret /* raw perf buffer options, giving most power and control */ struct perf_buffer_raw_opts { - union { - struct { - size_t sz; - long :0; - long :0; - }; - struct { /* DEPRECATED: will be removed in v1.0 */ - /* perf event attrs passed directly into perf_event_open() */ - struct perf_event_attr *attr; - /* raw event callback */ - perf_buffer_event_fn event_cb; - /* ctx is provided to event_cb */ - void *ctx; - }; - }; + size_t sz; + long :0; + long :0; /* if cpu_cnt == 0, open all on all possible CPUs (up to the number of * max_entries of given PERF_EVENT_ARRAY map) */ @@ -1354,26 +1317,13 @@ struct perf_buffer_raw_opts { }; #define perf_buffer_raw_opts__last_field map_keys +struct perf_event_attr; + LIBBPF_API struct perf_buffer * perf_buffer__new_raw(int map_fd, size_t page_cnt, struct perf_event_attr *attr, perf_buffer_event_fn event_cb, void *ctx, const struct perf_buffer_raw_opts *opts); -LIBBPF_API struct perf_buffer * -perf_buffer__new_raw_v0_6_0(int map_fd, size_t page_cnt, struct perf_event_attr *attr, - perf_buffer_event_fn event_cb, void *ctx, - const struct perf_buffer_raw_opts *opts); - -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "use new variant of perf_buffer__new_raw() instead") -struct perf_buffer *perf_buffer__new_raw_deprecated(int map_fd, size_t page_cnt, - const struct perf_buffer_raw_opts *opts); - -#define perf_buffer__new_raw(...) ___libbpf_overload(___perf_buffer_new_raw, __VA_ARGS__) -#define ___perf_buffer_new_raw6(map_fd, page_cnt, attr, event_cb, ctx, opts) \ - perf_buffer__new_raw(map_fd, page_cnt, attr, event_cb, ctx, opts) -#define ___perf_buffer_new_raw3(map_fd, page_cnt, opts) \ - perf_buffer__new_raw_deprecated(map_fd, page_cnt, opts) - LIBBPF_API void perf_buffer__free(struct perf_buffer *pb); LIBBPF_API int perf_buffer__epoll_fd(const struct perf_buffer *pb); LIBBPF_API int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms); @@ -1382,15 +1332,6 @@ LIBBPF_API int perf_buffer__consume_buffer(struct perf_buffer *pb, size_t buf_id LIBBPF_API size_t perf_buffer__buffer_cnt(const struct perf_buffer *pb); LIBBPF_API int perf_buffer__buffer_fd(const struct perf_buffer *pb, size_t buf_idx); -typedef enum bpf_perf_event_ret - (*bpf_perf_event_print_t)(struct perf_event_header *hdr, - void *private_data); -LIBBPF_DEPRECATED_SINCE(0, 8, "use perf_buffer__poll() or perf_buffer__consume() instead") -LIBBPF_API enum bpf_perf_event_ret -bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, - void **copy_mem, size_t *copy_size, - bpf_perf_event_print_t fn, void *private_data); - struct bpf_prog_linfo; struct bpf_prog_info; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index ff5fba17764b..f327adc66933 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -46,7 +46,6 @@ LIBBPF_0.0.1 { bpf_object__unload; bpf_object__unpin_maps; bpf_object__unpin_programs; - bpf_perf_event_read_simple; bpf_prog_attach; bpf_prog_detach; bpf_prog_detach2; @@ -144,8 +143,6 @@ LIBBPF_0.0.4 { btf__parse_elf; libbpf_num_possible_cpus; perf_buffer__free; - perf_buffer__new; - perf_buffer__new_raw; perf_buffer__poll; } LIBBPF_0.0.3; @@ -369,9 +366,7 @@ LIBBPF_0.6.0 { libbpf_minor_version; libbpf_version_string; perf_buffer__new; - perf_buffer__new_deprecated; perf_buffer__new_raw; - perf_buffer__new_raw_deprecated; } LIBBPF_0.5.0; LIBBPF_0.7.0 { From 9a590538ba4fbe6d0a4586cdab039df782153fcb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:19 -0700 Subject: [PATCH 36/94] libbpf: remove prog_info_linear APIs Remove prog_info_linear-related APIs previously used by perf. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-8-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 248 --------------------------------------- tools/lib/bpf/libbpf.h | 66 ----------- tools/lib/bpf/libbpf.map | 3 - 3 files changed, 317 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 38dc00e440c3..cd06989812c9 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -12509,254 +12509,6 @@ int perf_buffer__consume(struct perf_buffer *pb) return 0; } -struct bpf_prog_info_array_desc { - int array_offset; /* e.g. offset of jited_prog_insns */ - int count_offset; /* e.g. offset of jited_prog_len */ - int size_offset; /* > 0: offset of rec size, - * < 0: fix size of -size_offset - */ -}; - -static struct bpf_prog_info_array_desc bpf_prog_info_array_desc[] = { - [BPF_PROG_INFO_JITED_INSNS] = { - offsetof(struct bpf_prog_info, jited_prog_insns), - offsetof(struct bpf_prog_info, jited_prog_len), - -1, - }, - [BPF_PROG_INFO_XLATED_INSNS] = { - offsetof(struct bpf_prog_info, xlated_prog_insns), - offsetof(struct bpf_prog_info, xlated_prog_len), - -1, - }, - [BPF_PROG_INFO_MAP_IDS] = { - offsetof(struct bpf_prog_info, map_ids), - offsetof(struct bpf_prog_info, nr_map_ids), - -(int)sizeof(__u32), - }, - [BPF_PROG_INFO_JITED_KSYMS] = { - offsetof(struct bpf_prog_info, jited_ksyms), - offsetof(struct bpf_prog_info, nr_jited_ksyms), - -(int)sizeof(__u64), - }, - [BPF_PROG_INFO_JITED_FUNC_LENS] = { - offsetof(struct bpf_prog_info, jited_func_lens), - offsetof(struct bpf_prog_info, nr_jited_func_lens), - -(int)sizeof(__u32), - }, - [BPF_PROG_INFO_FUNC_INFO] = { - offsetof(struct bpf_prog_info, func_info), - offsetof(struct bpf_prog_info, nr_func_info), - offsetof(struct bpf_prog_info, func_info_rec_size), - }, - [BPF_PROG_INFO_LINE_INFO] = { - offsetof(struct bpf_prog_info, line_info), - offsetof(struct bpf_prog_info, nr_line_info), - offsetof(struct bpf_prog_info, line_info_rec_size), - }, - [BPF_PROG_INFO_JITED_LINE_INFO] = { - offsetof(struct bpf_prog_info, jited_line_info), - offsetof(struct bpf_prog_info, nr_jited_line_info), - offsetof(struct bpf_prog_info, jited_line_info_rec_size), - }, - [BPF_PROG_INFO_PROG_TAGS] = { - offsetof(struct bpf_prog_info, prog_tags), - offsetof(struct bpf_prog_info, nr_prog_tags), - -(int)sizeof(__u8) * BPF_TAG_SIZE, - }, - -}; - -static __u32 bpf_prog_info_read_offset_u32(struct bpf_prog_info *info, - int offset) -{ - __u32 *array = (__u32 *)info; - - if (offset >= 0) - return array[offset / sizeof(__u32)]; - return -(int)offset; -} - -static __u64 bpf_prog_info_read_offset_u64(struct bpf_prog_info *info, - int offset) -{ - __u64 *array = (__u64 *)info; - - if (offset >= 0) - return array[offset / sizeof(__u64)]; - return -(int)offset; -} - -static void bpf_prog_info_set_offset_u32(struct bpf_prog_info *info, int offset, - __u32 val) -{ - __u32 *array = (__u32 *)info; - - if (offset >= 0) - array[offset / sizeof(__u32)] = val; -} - -static void bpf_prog_info_set_offset_u64(struct bpf_prog_info *info, int offset, - __u64 val) -{ - __u64 *array = (__u64 *)info; - - if (offset >= 0) - array[offset / sizeof(__u64)] = val; -} - -struct bpf_prog_info_linear * -bpf_program__get_prog_info_linear(int fd, __u64 arrays) -{ - struct bpf_prog_info_linear *info_linear; - struct bpf_prog_info info = {}; - __u32 info_len = sizeof(info); - __u32 data_len = 0; - int i, err; - void *ptr; - - if (arrays >> BPF_PROG_INFO_LAST_ARRAY) - return libbpf_err_ptr(-EINVAL); - - /* step 1: get array dimensions */ - err = bpf_obj_get_info_by_fd(fd, &info, &info_len); - if (err) { - pr_debug("can't get prog info: %s", strerror(errno)); - return libbpf_err_ptr(-EFAULT); - } - - /* step 2: calculate total size of all arrays */ - for (i = BPF_PROG_INFO_FIRST_ARRAY; i < BPF_PROG_INFO_LAST_ARRAY; ++i) { - bool include_array = (arrays & (1UL << i)) > 0; - struct bpf_prog_info_array_desc *desc; - __u32 count, size; - - desc = bpf_prog_info_array_desc + i; - - /* kernel is too old to support this field */ - if (info_len < desc->array_offset + sizeof(__u32) || - info_len < desc->count_offset + sizeof(__u32) || - (desc->size_offset > 0 && info_len < desc->size_offset)) - include_array = false; - - if (!include_array) { - arrays &= ~(1UL << i); /* clear the bit */ - continue; - } - - count = bpf_prog_info_read_offset_u32(&info, desc->count_offset); - size = bpf_prog_info_read_offset_u32(&info, desc->size_offset); - - data_len += count * size; - } - - /* step 3: allocate continuous memory */ - data_len = roundup(data_len, sizeof(__u64)); - info_linear = malloc(sizeof(struct bpf_prog_info_linear) + data_len); - if (!info_linear) - return libbpf_err_ptr(-ENOMEM); - - /* step 4: fill data to info_linear->info */ - info_linear->arrays = arrays; - memset(&info_linear->info, 0, sizeof(info)); - ptr = info_linear->data; - - for (i = BPF_PROG_INFO_FIRST_ARRAY; i < BPF_PROG_INFO_LAST_ARRAY; ++i) { - struct bpf_prog_info_array_desc *desc; - __u32 count, size; - - if ((arrays & (1UL << i)) == 0) - continue; - - desc = bpf_prog_info_array_desc + i; - count = bpf_prog_info_read_offset_u32(&info, desc->count_offset); - size = bpf_prog_info_read_offset_u32(&info, desc->size_offset); - bpf_prog_info_set_offset_u32(&info_linear->info, - desc->count_offset, count); - bpf_prog_info_set_offset_u32(&info_linear->info, - desc->size_offset, size); - bpf_prog_info_set_offset_u64(&info_linear->info, - desc->array_offset, - ptr_to_u64(ptr)); - ptr += count * size; - } - - /* step 5: call syscall again to get required arrays */ - err = bpf_obj_get_info_by_fd(fd, &info_linear->info, &info_len); - if (err) { - pr_debug("can't get prog info: %s", strerror(errno)); - free(info_linear); - return libbpf_err_ptr(-EFAULT); - } - - /* step 6: verify the data */ - for (i = BPF_PROG_INFO_FIRST_ARRAY; i < BPF_PROG_INFO_LAST_ARRAY; ++i) { - struct bpf_prog_info_array_desc *desc; - __u32 v1, v2; - - if ((arrays & (1UL << i)) == 0) - continue; - - desc = bpf_prog_info_array_desc + i; - v1 = bpf_prog_info_read_offset_u32(&info, desc->count_offset); - v2 = bpf_prog_info_read_offset_u32(&info_linear->info, - desc->count_offset); - if (v1 != v2) - pr_warn("%s: mismatch in element count\n", __func__); - - v1 = bpf_prog_info_read_offset_u32(&info, desc->size_offset); - v2 = bpf_prog_info_read_offset_u32(&info_linear->info, - desc->size_offset); - if (v1 != v2) - pr_warn("%s: mismatch in rec size\n", __func__); - } - - /* step 7: update info_len and data_len */ - info_linear->info_len = sizeof(struct bpf_prog_info); - info_linear->data_len = data_len; - - return info_linear; -} - -void bpf_program__bpil_addr_to_offs(struct bpf_prog_info_linear *info_linear) -{ - int i; - - for (i = BPF_PROG_INFO_FIRST_ARRAY; i < BPF_PROG_INFO_LAST_ARRAY; ++i) { - struct bpf_prog_info_array_desc *desc; - __u64 addr, offs; - - if ((info_linear->arrays & (1UL << i)) == 0) - continue; - - desc = bpf_prog_info_array_desc + i; - addr = bpf_prog_info_read_offset_u64(&info_linear->info, - desc->array_offset); - offs = addr - ptr_to_u64(info_linear->data); - bpf_prog_info_set_offset_u64(&info_linear->info, - desc->array_offset, offs); - } -} - -void bpf_program__bpil_offs_to_addr(struct bpf_prog_info_linear *info_linear) -{ - int i; - - for (i = BPF_PROG_INFO_FIRST_ARRAY; i < BPF_PROG_INFO_LAST_ARRAY; ++i) { - struct bpf_prog_info_array_desc *desc; - __u64 addr, offs; - - if ((info_linear->arrays & (1UL << i)) == 0) - continue; - - desc = bpf_prog_info_array_desc + i; - offs = bpf_prog_info_read_offset_u64(&info_linear->info, - desc->array_offset); - addr = offs + ptr_to_u64(info_linear->data); - bpf_prog_info_set_offset_u64(&info_linear->info, - desc->array_offset, addr); - } -} - int bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd, const char *attach_func_name) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 5dc4271fcdb7..e6357ea7bf41 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1396,72 +1396,6 @@ LIBBPF_API int libbpf_probe_bpf_map_type(enum bpf_map_type map_type, const void LIBBPF_API int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helper_id, const void *opts); -/* - * Get bpf_prog_info in continuous memory - * - * struct bpf_prog_info has multiple arrays. The user has option to choose - * arrays to fetch from kernel. The following APIs provide an uniform way to - * fetch these data. All arrays in bpf_prog_info are stored in a single - * continuous memory region. This makes it easy to store the info in a - * file. - * - * Before writing bpf_prog_info_linear to files, it is necessary to - * translate pointers in bpf_prog_info to offsets. Helper functions - * bpf_program__bpil_addr_to_offs() and bpf_program__bpil_offs_to_addr() - * are introduced to switch between pointers and offsets. - * - * Examples: - * # To fetch map_ids and prog_tags: - * __u64 arrays = (1UL << BPF_PROG_INFO_MAP_IDS) | - * (1UL << BPF_PROG_INFO_PROG_TAGS); - * struct bpf_prog_info_linear *info_linear = - * bpf_program__get_prog_info_linear(fd, arrays); - * - * # To save data in file - * bpf_program__bpil_addr_to_offs(info_linear); - * write(f, info_linear, sizeof(*info_linear) + info_linear->data_len); - * - * # To read data from file - * read(f, info_linear, ); - * bpf_program__bpil_offs_to_addr(info_linear); - */ -enum bpf_prog_info_array { - BPF_PROG_INFO_FIRST_ARRAY = 0, - BPF_PROG_INFO_JITED_INSNS = 0, - BPF_PROG_INFO_XLATED_INSNS, - BPF_PROG_INFO_MAP_IDS, - BPF_PROG_INFO_JITED_KSYMS, - BPF_PROG_INFO_JITED_FUNC_LENS, - BPF_PROG_INFO_FUNC_INFO, - BPF_PROG_INFO_LINE_INFO, - BPF_PROG_INFO_JITED_LINE_INFO, - BPF_PROG_INFO_PROG_TAGS, - BPF_PROG_INFO_LAST_ARRAY, -}; - -struct bpf_prog_info_linear { - /* size of struct bpf_prog_info, when the tool is compiled */ - __u32 info_len; - /* total bytes allocated for data, round up to 8 bytes */ - __u32 data_len; - /* which arrays are included in data */ - __u64 arrays; - struct bpf_prog_info info; - __u8 data[]; -}; - -LIBBPF_DEPRECATED_SINCE(0, 6, "use a custom linear prog_info wrapper") -LIBBPF_API struct bpf_prog_info_linear * -bpf_program__get_prog_info_linear(int fd, __u64 arrays); - -LIBBPF_DEPRECATED_SINCE(0, 6, "use a custom linear prog_info wrapper") -LIBBPF_API void -bpf_program__bpil_addr_to_offs(struct bpf_prog_info_linear *info_linear); - -LIBBPF_DEPRECATED_SINCE(0, 6, "use a custom linear prog_info wrapper") -LIBBPF_API void -bpf_program__bpil_offs_to_addr(struct bpf_prog_info_linear *info_linear); - /** * @brief **libbpf_num_possible_cpus()** is a helper function to get the * number of possible CPUs that the host kernel supports and expects. diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index f327adc66933..93c7bcc058c6 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -118,9 +118,6 @@ LIBBPF_0.0.2 { btf_ext__free; btf_ext__get_raw_data; btf_ext__new; - bpf_program__get_prog_info_linear; - bpf_program__bpil_addr_to_offs; - bpf_program__bpil_offs_to_addr; } LIBBPF_0.0.1; LIBBPF_0.0.3 { From 146bf811f5ac133e619fcf4e77bc7a97a9e401e7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:20 -0700 Subject: [PATCH 37/94] libbpf: remove most other deprecated high-level APIs Remove a bunch of high-level bpf_object/bpf_map/bpf_program related APIs. All the APIs related to private per-object/map/prog state, program preprocessing callback, and generally everything multi-instance related is removed in a separate patch. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-9-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 241 ++++----------------------------------- tools/lib/bpf/libbpf.h | 163 +------------------------- tools/lib/bpf/libbpf.map | 43 ------- 3 files changed, 26 insertions(+), 421 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index cd06989812c9..149392604aa4 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -484,6 +483,14 @@ enum libbpf_map_type { LIBBPF_MAP_KCONFIG, }; +struct bpf_map_def { + unsigned int type; + unsigned int key_size; + unsigned int value_size; + unsigned int max_entries; + unsigned int map_flags; +}; + struct bpf_map { struct bpf_object *obj; char *name; @@ -568,8 +575,6 @@ struct extern_desc { }; }; -static LIST_HEAD(bpf_objects_list); - struct module_btf { struct btf *btf; char *name; @@ -638,12 +643,6 @@ struct bpf_object { /* Information when doing ELF related work. Only valid if efile.elf is not NULL */ struct elf_state efile; - /* - * All loaded bpf_object are linked in a list, which is - * hidden to caller. bpf_objects__ handlers deal with - * all objects. - */ - struct list_head list; struct btf *btf; struct btf_ext *btf_ext; @@ -1313,7 +1312,6 @@ static struct bpf_object *bpf_object__new(const char *path, size_t obj_buf_sz, const char *obj_name) { - bool strict = (libbpf_mode & LIBBPF_STRICT_NO_OBJECT_LIST); struct bpf_object *obj; char *end; @@ -1351,9 +1349,6 @@ static struct bpf_object *bpf_object__new(const char *path, obj->kern_version = get_kernel_version(); obj->loaded = false; - INIT_LIST_HEAD(&obj->list); - if (!strict) - list_add(&obj->list, &bpf_objects_list); return obj; } @@ -1386,10 +1381,7 @@ static int bpf_object__elf_init(struct bpf_object *obj) } if (obj->efile.obj_buf_sz > 0) { - /* - * obj_buf should have been validated by - * bpf_object__open_buffer(). - */ + /* obj_buf should have been validated by bpf_object__open_mem(). */ elf = elf_memory((char *)obj->efile.obj_buf, obj->efile.obj_buf_sz); } else { obj->efile.fd = open(obj->path, O_RDONLY | O_CLOEXEC); @@ -2306,6 +2298,13 @@ static int build_map_pin_path(struct bpf_map *map, const char *path) return bpf_map__set_pin_path(map, buf); } +/* should match definition in bpf_helpers.h */ +enum libbpf_pin_type { + LIBBPF_PIN_NONE, + /* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */ + LIBBPF_PIN_BY_NAME, +}; + int parse_btf_map_def(const char *map_name, struct btf *btf, const struct btf_type *def_t, bool strict, struct btf_map_def *map_def, struct btf_map_def *inner_def) @@ -4017,19 +4016,6 @@ static int bpf_object__collect_externs(struct bpf_object *obj) return 0; } -struct bpf_program * -bpf_object__find_program_by_title(const struct bpf_object *obj, - const char *title) -{ - struct bpf_program *pos; - - bpf_object__for_each_program(pos, obj) { - if (pos->sec_name && !strcmp(pos->sec_name, title)) - return pos; - } - return errno = ENOENT, NULL; -} - static bool prog_is_subprog(const struct bpf_object *obj, const struct bpf_program *prog) { @@ -4548,14 +4534,6 @@ int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries) return 0; } -int bpf_map__resize(struct bpf_map *map, __u32 max_entries) -{ - if (!map || !max_entries) - return libbpf_err(-EINVAL); - - return bpf_map__set_max_entries(map, max_entries); -} - static int bpf_object__probe_loading(struct bpf_object *obj) { @@ -7339,11 +7317,6 @@ out: return libbpf_err(err); } -int bpf_program__load(struct bpf_program *prog, const char *license, __u32 kern_ver) -{ - return bpf_object_load_prog(prog->obj, prog, license, kern_ver); -} - static int bpf_object__load_progs(struct bpf_object *obj, int log_level) { @@ -7395,13 +7368,6 @@ static int bpf_object_init_progs(struct bpf_object *obj, const struct bpf_object prog->type = prog->sec_def->prog_type; prog->expected_attach_type = prog->sec_def->expected_attach_type; -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - if (prog->sec_def->prog_type == BPF_PROG_TYPE_TRACING || - prog->sec_def->prog_type == BPF_PROG_TYPE_EXT) - prog->attach_prog_fd = OPTS_GET(opts, attach_prog_fd, 0); -#pragma GCC diagnostic pop - /* sec_def can have custom callback which should be called * after bpf_program is initialized to adjust its properties */ @@ -7507,36 +7473,6 @@ out: return ERR_PTR(err); } -static struct bpf_object * -__bpf_object__open_xattr(struct bpf_object_open_attr *attr, int flags) -{ - DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, - .relaxed_maps = flags & MAPS_RELAX_COMPAT, - ); - - /* param validation */ - if (!attr->file) - return NULL; - - pr_debug("loading %s\n", attr->file); - return bpf_object_open(attr->file, NULL, 0, &opts); -} - -struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr) -{ - return libbpf_ptr(__bpf_object__open_xattr(attr, 0)); -} - -struct bpf_object *bpf_object__open(const char *path) -{ - struct bpf_object_open_attr attr = { - .file = path, - .prog_type = BPF_PROG_TYPE_UNSPEC, - }; - - return libbpf_ptr(__bpf_object__open_xattr(&attr, 0)); -} - struct bpf_object * bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts) { @@ -7548,6 +7484,11 @@ bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts) return libbpf_ptr(bpf_object_open(path, NULL, 0, opts)); } +struct bpf_object *bpf_object__open(const char *path) +{ + return bpf_object__open_file(path, NULL); +} + struct bpf_object * bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, const struct bpf_object_open_opts *opts) @@ -7558,23 +7499,6 @@ bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, return libbpf_ptr(bpf_object_open(NULL, obj_buf, obj_buf_sz, opts)); } -struct bpf_object * -bpf_object__open_buffer(const void *obj_buf, size_t obj_buf_sz, - const char *name) -{ - DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, - .object_name = name, - /* wrong default, but backwards-compatible */ - .relaxed_maps = true, - ); - - /* returning NULL is wrong, but backwards-compatible */ - if (!obj_buf || obj_buf_sz == 0) - return errno = EINVAL, NULL; - - return libbpf_ptr(bpf_object_open(NULL, obj_buf, obj_buf_sz, &opts)); -} - static int bpf_object_unload(struct bpf_object *obj) { size_t i; @@ -8007,11 +7931,6 @@ out: return libbpf_err(err); } -int bpf_object__load_xattr(struct bpf_object_load_attr *attr) -{ - return bpf_object_load(attr->obj, attr->log_level, attr->target_btf_path); -} - int bpf_object__load(struct bpf_object *obj) { return bpf_object_load(obj, 0, NULL); @@ -8642,33 +8561,9 @@ void bpf_object__close(struct bpf_object *obj) } zfree(&obj->programs); - list_del(&obj->list); free(obj); } -struct bpf_object * -bpf_object__next(struct bpf_object *prev) -{ - struct bpf_object *next; - bool strict = (libbpf_mode & LIBBPF_STRICT_NO_OBJECT_LIST); - - if (strict) - return NULL; - - if (!prev) - next = list_first_entry(&bpf_objects_list, - struct bpf_object, - list); - else - next = list_next_entry(prev, list); - - /* Empty list is noticed here so don't need checking on entry. */ - if (&next->list == &bpf_objects_list) - return NULL; - - return next; -} - const char *bpf_object__name(const struct bpf_object *obj) { return obj ? obj->name : libbpf_err_ptr(-EINVAL); @@ -8757,12 +8652,6 @@ __bpf_program__iter(const struct bpf_program *p, const struct bpf_object *obj, return &obj->programs[idx]; } -struct bpf_program * -bpf_program__next(struct bpf_program *prev, const struct bpf_object *obj) -{ - return bpf_object__next_program(obj, prev); -} - struct bpf_program * bpf_object__next_program(const struct bpf_object *obj, struct bpf_program *prev) { @@ -8775,12 +8664,6 @@ bpf_object__next_program(const struct bpf_object *obj, struct bpf_program *prev) return prog; } -struct bpf_program * -bpf_program__prev(struct bpf_program *next, const struct bpf_object *obj) -{ - return bpf_object__prev_program(obj, next); -} - struct bpf_program * bpf_object__prev_program(const struct bpf_object *obj, struct bpf_program *next) { @@ -8824,22 +8707,6 @@ const char *bpf_program__section_name(const struct bpf_program *prog) return prog->sec_name; } -const char *bpf_program__title(const struct bpf_program *prog, bool needs_copy) -{ - const char *title; - - title = prog->sec_name; - if (needs_copy) { - title = strdup(title); - if (!title) { - pr_warn("failed to strdup program title\n"); - return libbpf_err_ptr(-ENOMEM); - } - } - - return title; -} - bool bpf_program__autoload(const struct bpf_program *prog) { return prog->autoload; @@ -8861,11 +8728,6 @@ int bpf_program__fd(const struct bpf_program *prog) return bpf_program_nth_fd(prog, 0); } -size_t bpf_program__size(const struct bpf_program *prog) -{ - return prog->insns_cnt * BPF_INSN_SZ; -} - const struct bpf_insn *bpf_program__insns(const struct bpf_program *prog) { return prog->insns; @@ -8967,39 +8829,6 @@ int bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type) return 0; } -static bool bpf_program__is_type(const struct bpf_program *prog, - enum bpf_prog_type type) -{ - return prog ? (prog->type == type) : false; -} - -#define BPF_PROG_TYPE_FNS(NAME, TYPE) \ -int bpf_program__set_##NAME(struct bpf_program *prog) \ -{ \ - if (!prog) \ - return libbpf_err(-EINVAL); \ - return bpf_program__set_type(prog, TYPE); \ -} \ - \ -bool bpf_program__is_##NAME(const struct bpf_program *prog) \ -{ \ - return bpf_program__is_type(prog, TYPE); \ -} \ - -BPF_PROG_TYPE_FNS(socket_filter, BPF_PROG_TYPE_SOCKET_FILTER); -BPF_PROG_TYPE_FNS(lsm, BPF_PROG_TYPE_LSM); -BPF_PROG_TYPE_FNS(kprobe, BPF_PROG_TYPE_KPROBE); -BPF_PROG_TYPE_FNS(sched_cls, BPF_PROG_TYPE_SCHED_CLS); -BPF_PROG_TYPE_FNS(sched_act, BPF_PROG_TYPE_SCHED_ACT); -BPF_PROG_TYPE_FNS(tracepoint, BPF_PROG_TYPE_TRACEPOINT); -BPF_PROG_TYPE_FNS(raw_tracepoint, BPF_PROG_TYPE_RAW_TRACEPOINT); -BPF_PROG_TYPE_FNS(xdp, BPF_PROG_TYPE_XDP); -BPF_PROG_TYPE_FNS(perf_event, BPF_PROG_TYPE_PERF_EVENT); -BPF_PROG_TYPE_FNS(tracing, BPF_PROG_TYPE_TRACING); -BPF_PROG_TYPE_FNS(struct_ops, BPF_PROG_TYPE_STRUCT_OPS); -BPF_PROG_TYPE_FNS(extension, BPF_PROG_TYPE_EXT); -BPF_PROG_TYPE_FNS(sk_lookup, BPF_PROG_TYPE_SK_LOOKUP); - __alias(bpf_program__expected_attach_type) enum bpf_attach_type bpf_program__get_expected_attach_type(const struct bpf_program *prog); @@ -9773,11 +9602,6 @@ int bpf_map__fd(const struct bpf_map *map) return map ? map->fd : libbpf_err(-EINVAL); } -const struct bpf_map_def *bpf_map__def(const struct bpf_map *map) -{ - return map ? &map->def : libbpf_err_ptr(-EINVAL); -} - static bool map_uses_real_name(const struct bpf_map *map) { /* Since libbpf started to support custom .data.* and .rodata.* maps, @@ -9932,11 +9756,6 @@ const void *bpf_map__initial_value(struct bpf_map *map, size_t *psize) return map->mmaped; } -bool bpf_map__is_offload_neutral(const struct bpf_map *map) -{ - return map->def.type == BPF_MAP_TYPE_PERF_EVENT_ARRAY; -} - bool bpf_map__is_internal(const struct bpf_map *map) { return map->libbpf_type != LIBBPF_MAP_UNSPEC; @@ -9997,12 +9816,6 @@ __bpf_map__iter(const struct bpf_map *m, const struct bpf_object *obj, int i) return &obj->maps[idx]; } -struct bpf_map * -bpf_map__next(const struct bpf_map *prev, const struct bpf_object *obj) -{ - return bpf_object__next_map(obj, prev); -} - struct bpf_map * bpf_object__next_map(const struct bpf_object *obj, const struct bpf_map *prev) { @@ -10012,12 +9825,6 @@ bpf_object__next_map(const struct bpf_object *obj, const struct bpf_map *prev) return __bpf_map__iter(prev, obj, 1); } -struct bpf_map * -bpf_map__prev(const struct bpf_map *next, const struct bpf_object *obj) -{ - return bpf_object__prev_map(obj, next); -} - struct bpf_map * bpf_object__prev_map(const struct bpf_object *obj, const struct bpf_map *next) { @@ -10063,12 +9870,6 @@ bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name) return bpf_map__fd(bpf_object__find_map_by_name(obj, name)); } -struct bpf_map * -bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset) -{ - return libbpf_err_ptr(-ENOTSUP); -} - static int validate_map_op(const struct bpf_map *map, size_t key_sz, size_t value_sz, bool check_value_sz) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index e6357ea7bf41..9c2279b7b6ed 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -101,11 +101,6 @@ LIBBPF_API libbpf_print_fn_t libbpf_set_print(libbpf_print_fn_t fn); /* Hide internal to user */ struct bpf_object; -struct bpf_object_open_attr { - const char *file; - enum bpf_prog_type prog_type; -}; - struct bpf_object_open_opts { /* size of this struct, for forward/backward compatibility */ size_t sz; @@ -118,21 +113,12 @@ struct bpf_object_open_opts { const char *object_name; /* parse map definitions non-strictly, allowing extra attributes/data */ bool relaxed_maps; - /* DEPRECATED: handle CO-RE relocations non-strictly, allowing failures. - * Value is ignored. Relocations always are processed non-strictly. - * Non-relocatable instructions are replaced with invalid ones to - * prevent accidental errors. - * */ - LIBBPF_DEPRECATED_SINCE(0, 6, "field has no effect") - bool relaxed_core_relocs; /* maps that set the 'pinning' attribute in their definition will have * their pin_path attribute set to a file in this directory, and be * auto-pinned to that path on load; defaults to "/sys/fs/bpf". */ const char *pin_root_path; - - LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_program__set_attach_target() on each individual bpf_program") - __u32 attach_prog_fd; + long :0; /* Additional kernel config content that augments and overrides * system Kconfig for CONFIG_xxx externs. */ @@ -215,20 +201,10 @@ LIBBPF_API struct bpf_object * bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, const struct bpf_object_open_opts *opts); -/* deprecated bpf_object__open variants */ -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_object__open_mem() instead") -LIBBPF_API struct bpf_object * -bpf_object__open_buffer(const void *obj_buf, size_t obj_buf_sz, - const char *name); -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__open_file() instead") -LIBBPF_API struct bpf_object * -bpf_object__open_xattr(struct bpf_object_open_attr *attr); +/* Load/unload object into/from kernel */ +LIBBPF_API int bpf_object__load(struct bpf_object *obj); -enum libbpf_pin_type { - LIBBPF_PIN_NONE, - /* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */ - LIBBPF_PIN_BY_NAME, -}; +LIBBPF_API void bpf_object__close(struct bpf_object *object); /* pin_maps and unpin_maps can both be called with a NULL path, in which case * they will use the pin_path attribute of each map (and ignore all maps that @@ -242,20 +218,6 @@ LIBBPF_API int bpf_object__pin_programs(struct bpf_object *obj, LIBBPF_API int bpf_object__unpin_programs(struct bpf_object *obj, const char *path); LIBBPF_API int bpf_object__pin(struct bpf_object *object, const char *path); -LIBBPF_API void bpf_object__close(struct bpf_object *object); - -struct bpf_object_load_attr { - struct bpf_object *obj; - int log_level; - const char *target_btf_path; -}; - -/* Load/unload object into/from kernel */ -LIBBPF_API int bpf_object__load(struct bpf_object *obj); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_object__load() instead") -LIBBPF_API int bpf_object__load_xattr(struct bpf_object_load_attr *attr); -LIBBPF_DEPRECATED_SINCE(0, 6, "bpf_object__unload() is deprecated, use bpf_object__close() instead") -LIBBPF_API int bpf_object__unload(struct bpf_object *obj); LIBBPF_API const char *bpf_object__name(const struct bpf_object *obj); LIBBPF_API unsigned int bpf_object__kversion(const struct bpf_object *obj); @@ -265,22 +227,10 @@ struct btf; LIBBPF_API struct btf *bpf_object__btf(const struct bpf_object *obj); LIBBPF_API int bpf_object__btf_fd(const struct bpf_object *obj); -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__find_program_by_name() instead") -LIBBPF_API struct bpf_program * -bpf_object__find_program_by_title(const struct bpf_object *obj, - const char *title); LIBBPF_API struct bpf_program * bpf_object__find_program_by_name(const struct bpf_object *obj, const char *name); -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "track bpf_objects in application code instead") -struct bpf_object *bpf_object__next(struct bpf_object *prev); -#define bpf_object__for_each_safe(pos, tmp) \ - for ((pos) = bpf_object__next(NULL), \ - (tmp) = bpf_object__next(pos); \ - (pos) != NULL; \ - (pos) = (tmp), (tmp) = bpf_object__next(tmp)) - typedef void (*bpf_object_clear_priv_t)(struct bpf_object *, void *); LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated") LIBBPF_API int bpf_object__set_priv(struct bpf_object *obj, void *priv, @@ -298,9 +248,7 @@ LIBBPF_API int libbpf_find_vmlinux_btf_id(const char *name, /* Accessors of bpf_program */ struct bpf_program; -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__next_program() instead") -struct bpf_program *bpf_program__next(struct bpf_program *prog, - const struct bpf_object *obj); + LIBBPF_API struct bpf_program * bpf_object__next_program(const struct bpf_object *obj, struct bpf_program *prog); @@ -309,9 +257,6 @@ bpf_object__next_program(const struct bpf_object *obj, struct bpf_program *prog) (pos) != NULL; \ (pos) = bpf_object__next_program((obj), (pos))) -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__prev_program() instead") -struct bpf_program *bpf_program__prev(struct bpf_program *prog, - const struct bpf_object *obj); LIBBPF_API struct bpf_program * bpf_object__prev_program(const struct bpf_object *obj, struct bpf_program *prog); @@ -327,15 +272,9 @@ LIBBPF_API void bpf_program__set_ifindex(struct bpf_program *prog, LIBBPF_API const char *bpf_program__name(const struct bpf_program *prog); LIBBPF_API const char *bpf_program__section_name(const struct bpf_program *prog); -LIBBPF_API LIBBPF_DEPRECATED("BPF program title is confusing term; please use bpf_program__section_name() instead") -const char *bpf_program__title(const struct bpf_program *prog, bool needs_copy); LIBBPF_API bool bpf_program__autoload(const struct bpf_program *prog); LIBBPF_API int bpf_program__set_autoload(struct bpf_program *prog, bool autoload); -/* returns program size in bytes */ -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_program__insn_cnt() instead") -LIBBPF_API size_t bpf_program__size(const struct bpf_program *prog); - struct bpf_insn; /** @@ -388,8 +327,6 @@ LIBBPF_API int bpf_program__set_insns(struct bpf_program *prog, */ LIBBPF_API size_t bpf_program__insn_cnt(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 6, "use bpf_object__load() instead") -LIBBPF_API int bpf_program__load(struct bpf_program *prog, const char *license, __u32 kern_version); LIBBPF_API int bpf_program__fd(const struct bpf_program *prog); LIBBPF_DEPRECATED_SINCE(0, 7, "multi-instance bpf_program support is deprecated") LIBBPF_API int bpf_program__pin_instance(struct bpf_program *prog, @@ -761,36 +698,6 @@ LIBBPF_API int bpf_program__set_prep(struct bpf_program *prog, int nr_instance, LIBBPF_DEPRECATED_SINCE(0, 7, "multi-instance bpf_program support is deprecated") LIBBPF_API int bpf_program__nth_fd(const struct bpf_program *prog, int n); -/* - * Adjust type of BPF program. Default is kprobe. - */ -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_socket_filter(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_tracepoint(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_raw_tracepoint(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_kprobe(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_lsm(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_sched_cls(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_sched_act(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_xdp(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_perf_event(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_tracing(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_struct_ops(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_extension(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_sk_lookup(struct bpf_program *prog); - LIBBPF_API enum bpf_prog_type bpf_program__type(const struct bpf_program *prog); /** @@ -853,47 +760,6 @@ LIBBPF_API int bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd, const char *attach_func_name); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_socket_filter(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_tracepoint(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_raw_tracepoint(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_kprobe(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_lsm(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_sched_cls(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_sched_act(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_xdp(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_perf_event(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_tracing(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_struct_ops(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_extension(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_sk_lookup(const struct bpf_program *prog); - -/* - * No need for __attribute__((packed)), all members of 'bpf_map_def' - * are all aligned. In addition, using __attribute__((packed)) - * would trigger a -Wpacked warning message, and lead to an error - * if -Werror is set. - */ -struct bpf_map_def { - unsigned int type; - unsigned int key_size; - unsigned int value_size; - unsigned int max_entries; - unsigned int map_flags; -}; - /** * @brief **bpf_object__find_map_by_name()** returns BPF map of * the given name, if it exists within the passed BPF object @@ -908,16 +774,6 @@ bpf_object__find_map_by_name(const struct bpf_object *obj, const char *name); LIBBPF_API int bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name); -/* - * Get bpf_map through the offset of corresponding struct bpf_map_def - * in the BPF object file. - */ -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_object__find_map_by_name() instead") -struct bpf_map * -bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset); - -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__next_map() instead") -struct bpf_map *bpf_map__next(const struct bpf_map *map, const struct bpf_object *obj); LIBBPF_API struct bpf_map * bpf_object__next_map(const struct bpf_object *obj, const struct bpf_map *map); @@ -927,8 +783,6 @@ bpf_object__next_map(const struct bpf_object *obj, const struct bpf_map *map); (pos) = bpf_object__next_map((obj), (pos))) #define bpf_map__for_each bpf_object__for_each_map -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__prev_map() instead") -struct bpf_map *bpf_map__prev(const struct bpf_map *map, const struct bpf_object *obj); LIBBPF_API struct bpf_map * bpf_object__prev_map(const struct bpf_object *obj, const struct bpf_map *map); @@ -962,9 +816,6 @@ LIBBPF_API bool bpf_map__autocreate(const struct bpf_map *map); */ LIBBPF_API int bpf_map__fd(const struct bpf_map *map); LIBBPF_API int bpf_map__reuse_fd(struct bpf_map *map, int fd); -/* get map definition */ -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 8, "use appropriate getters or setters instead") -const struct bpf_map_def *bpf_map__def(const struct bpf_map *map); /* get map name */ LIBBPF_API const char *bpf_map__name(const struct bpf_map *map); /* get/set map type */ @@ -973,8 +824,6 @@ LIBBPF_API int bpf_map__set_type(struct bpf_map *map, enum bpf_map_type type); /* get/set map size (max_entries) */ LIBBPF_API __u32 bpf_map__max_entries(const struct bpf_map *map); LIBBPF_API int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_map__set_max_entries() instead") -LIBBPF_API int bpf_map__resize(struct bpf_map *map, __u32 max_entries); /* get/set map flags */ LIBBPF_API __u32 bpf_map__map_flags(const struct bpf_map *map); LIBBPF_API int bpf_map__set_map_flags(struct bpf_map *map, __u32 flags); @@ -1006,8 +855,6 @@ LIBBPF_API void *bpf_map__priv(const struct bpf_map *map); LIBBPF_API int bpf_map__set_initial_value(struct bpf_map *map, const void *data, size_t size); LIBBPF_API const void *bpf_map__initial_value(struct bpf_map *map, size_t *psize); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_map__type() instead") -LIBBPF_API bool bpf_map__is_offload_neutral(const struct bpf_map *map); /** * @brief **bpf_map__is_internal()** tells the caller whether or not the diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 93c7bcc058c6..7f19c43028f1 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -3,13 +3,9 @@ LIBBPF_0.0.1 { bpf_btf_get_fd_by_id; bpf_map__btf_key_type_id; bpf_map__btf_value_type_id; - bpf_map__def; bpf_map__fd; - bpf_map__is_offload_neutral; bpf_map__name; - bpf_map__next; bpf_map__pin; - bpf_map__prev; bpf_map__priv; bpf_map__reuse_fd; bpf_map__set_ifindex; @@ -29,21 +25,15 @@ LIBBPF_0.0.1 { bpf_object__btf_fd; bpf_object__close; bpf_object__find_map_by_name; - bpf_object__find_map_by_offset; - bpf_object__find_program_by_title; bpf_object__kversion; bpf_object__load; bpf_object__name; - bpf_object__next; bpf_object__open; - bpf_object__open_buffer; - bpf_object__open_xattr; bpf_object__pin; bpf_object__pin_maps; bpf_object__pin_programs; bpf_object__priv; bpf_object__set_priv; - bpf_object__unload; bpf_object__unpin_maps; bpf_object__unpin_programs; bpf_prog_attach; @@ -53,35 +43,15 @@ LIBBPF_0.0.1 { bpf_prog_get_next_id; bpf_prog_query; bpf_program__fd; - bpf_program__is_kprobe; - bpf_program__is_perf_event; - bpf_program__is_raw_tracepoint; - bpf_program__is_sched_act; - bpf_program__is_sched_cls; - bpf_program__is_socket_filter; - bpf_program__is_tracepoint; - bpf_program__is_xdp; - bpf_program__load; - bpf_program__next; bpf_program__nth_fd; bpf_program__pin; bpf_program__pin_instance; - bpf_program__prev; bpf_program__priv; bpf_program__set_expected_attach_type; bpf_program__set_ifindex; - bpf_program__set_kprobe; - bpf_program__set_perf_event; bpf_program__set_prep; bpf_program__set_priv; - bpf_program__set_raw_tracepoint; - bpf_program__set_sched_act; - bpf_program__set_sched_cls; - bpf_program__set_socket_filter; - bpf_program__set_tracepoint; bpf_program__set_type; - bpf_program__set_xdp; - bpf_program__title; bpf_program__unload; bpf_program__unpin; bpf_program__unpin_instance; @@ -110,7 +80,6 @@ LIBBPF_0.0.1 { LIBBPF_0.0.2 { global: - bpf_map__resize; bpf_map_lookup_elem_flags; bpf_object__btf; bpf_object__find_map_fd_by_name; @@ -129,7 +98,6 @@ LIBBPF_0.0.3 { LIBBPF_0.0.4 { global: bpf_link__destroy; - bpf_object__load_xattr; bpf_program__attach_kprobe; bpf_program__attach_perf_event; bpf_program__attach_raw_tracepoint; @@ -158,9 +126,6 @@ LIBBPF_0.0.6 { bpf_program__attach_trace; bpf_program__get_expected_attach_type; bpf_program__get_type; - bpf_program__is_tracing; - bpf_program__set_tracing; - bpf_program__size; btf__find_by_name_kind; libbpf_find_vmlinux_btf_id; } LIBBPF_0.0.5; @@ -182,10 +147,6 @@ LIBBPF_0.0.7 { bpf_object__open_skeleton; bpf_program__attach; bpf_program__name; - bpf_program__is_extension; - bpf_program__is_struct_ops; - bpf_program__set_extension; - bpf_program__set_struct_ops; btf__align_of; libbpf_find_kernel_btf; } LIBBPF_0.0.6; @@ -204,9 +165,7 @@ LIBBPF_0.0.8 { bpf_prog_attach_opts; bpf_program__attach_cgroup; bpf_program__attach_lsm; - bpf_program__is_lsm; bpf_program__set_attach_target; - bpf_program__set_lsm; } LIBBPF_0.0.7; LIBBPF_0.0.9 { @@ -244,9 +203,7 @@ LIBBPF_0.1.0 { bpf_map__value_size; bpf_program__attach_xdp; bpf_program__autoload; - bpf_program__is_sk_lookup; bpf_program__set_autoload; - bpf_program__set_sk_lookup; btf__parse; btf__parse_raw; btf__pointer_size; From b4bda502dfa28c7cb78e25f24593fc1d4628b21c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:21 -0700 Subject: [PATCH 38/94] libbpf: remove multi-instance and custom private data APIs Remove all the public APIs that are related to creating multi-instance bpf_programs through custom preprocessing callback and generally working with them. Also remove all the bpf_{object,map,program}__[set_]priv() APIs. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-10-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 121 ++++----------------------------------- tools/lib/bpf/libbpf.h | 91 ----------------------------- tools/lib/bpf/libbpf.map | 10 ---- 3 files changed, 10 insertions(+), 212 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 149392604aa4..fe98789d1776 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -366,6 +366,16 @@ struct bpf_sec_def { libbpf_prog_attach_fn_t prog_attach_fn; }; +struct bpf_prog_prep_result { + struct bpf_insn *new_insn_ptr; + int new_insn_cnt; + int *pfd; +}; + +typedef int (*bpf_program_prep_t)(struct bpf_program *prog, int n, + struct bpf_insn *insns, int insns_cnt, + struct bpf_prog_prep_result *res); + /* * bpf_prog should be a better name but it has been used in * linux/filter.h. @@ -426,8 +436,6 @@ struct bpf_program { bpf_program_prep_t preprocessor; struct bpf_object *obj; - void *priv; - bpf_program_clear_priv_t clear_priv; bool autoload; bool mark_btf_static; @@ -511,8 +519,6 @@ struct bpf_map { __u32 btf_key_type_id; __u32 btf_value_type_id; __u32 btf_vmlinux_value_type_id; - void *priv; - bpf_map_clear_priv_t clear_priv; enum libbpf_map_type libbpf_type; void *mmaped; struct bpf_struct_ops *st_ops; @@ -668,9 +674,6 @@ struct bpf_object { size_t log_size; __u32 log_level; - void *priv; - bpf_object_clear_priv_t clear_priv; - int *fd_array; size_t fd_array_cap; size_t fd_array_cnt; @@ -721,12 +724,6 @@ static void bpf_program__exit(struct bpf_program *prog) if (!prog) return; - if (prog->clear_priv) - prog->clear_priv(prog, prog->priv); - - prog->priv = NULL; - prog->clear_priv = NULL; - bpf_program__unload(prog); zfree(&prog->name); zfree(&prog->sec_name); @@ -8051,12 +8048,6 @@ static int bpf_program_unpin_instance(struct bpf_program *prog, const char *path return 0; } -__attribute__((alias("bpf_program_pin_instance"))) -int bpf_object__pin_instance(struct bpf_program *prog, const char *path, int instance); - -__attribute__((alias("bpf_program_unpin_instance"))) -int bpf_program__unpin_instance(struct bpf_program *prog, const char *path, int instance); - int bpf_program__pin(struct bpf_program *prog, const char *path) { int i, err; @@ -8492,11 +8483,6 @@ int bpf_object__pin(struct bpf_object *obj, const char *path) static void bpf_map__destroy(struct bpf_map *map) { - if (map->clear_priv) - map->clear_priv(map, map->priv); - map->priv = NULL; - map->clear_priv = NULL; - if (map->inner_map) { bpf_map__destroy(map->inner_map); zfree(&map->inner_map); @@ -8532,9 +8518,6 @@ void bpf_object__close(struct bpf_object *obj) if (IS_ERR_OR_NULL(obj)) return; - if (obj->clear_priv) - obj->clear_priv(obj, obj->priv); - usdt_manager_free(obj->usdt_man); obj->usdt_man = NULL; @@ -8594,22 +8577,6 @@ int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version) return 0; } -int bpf_object__set_priv(struct bpf_object *obj, void *priv, - bpf_object_clear_priv_t clear_priv) -{ - if (obj->priv && obj->clear_priv) - obj->clear_priv(obj, obj->priv); - - obj->priv = priv; - obj->clear_priv = clear_priv; - return 0; -} - -void *bpf_object__priv(const struct bpf_object *obj) -{ - return obj ? obj->priv : libbpf_err_ptr(-EINVAL); -} - int bpf_object__gen_loader(struct bpf_object *obj, struct gen_loader_opts *opts) { struct bpf_gen *gen; @@ -8676,22 +8643,6 @@ bpf_object__prev_program(const struct bpf_object *obj, struct bpf_program *next) return prog; } -int bpf_program__set_priv(struct bpf_program *prog, void *priv, - bpf_program_clear_priv_t clear_priv) -{ - if (prog->priv && prog->clear_priv) - prog->clear_priv(prog, prog->priv); - - prog->priv = priv; - prog->clear_priv = clear_priv; - return 0; -} - -void *bpf_program__priv(const struct bpf_program *prog) -{ - return prog ? prog->priv : libbpf_err_ptr(-EINVAL); -} - void bpf_program__set_ifindex(struct bpf_program *prog, __u32 ifindex) { prog->prog_ifindex = ifindex; @@ -8758,37 +8709,6 @@ int bpf_program__set_insns(struct bpf_program *prog, return 0; } -int bpf_program__set_prep(struct bpf_program *prog, int nr_instances, - bpf_program_prep_t prep) -{ - int *instances_fds; - - if (nr_instances <= 0 || !prep) - return libbpf_err(-EINVAL); - - if (prog->instances.nr > 0 || prog->instances.fds) { - pr_warn("Can't set pre-processor after loading\n"); - return libbpf_err(-EINVAL); - } - - instances_fds = malloc(sizeof(int) * nr_instances); - if (!instances_fds) { - pr_warn("alloc memory failed for fds\n"); - return libbpf_err(-ENOMEM); - } - - /* fill all fd with -1 */ - memset(instances_fds, -1, sizeof(int) * nr_instances); - - prog->instances.nr = nr_instances; - prog->instances.fds = instances_fds; - prog->preprocessor = prep; - return 0; -} - -__attribute__((alias("bpf_program_nth_fd"))) -int bpf_program__nth_fd(const struct bpf_program *prog, int n); - static int bpf_program_nth_fd(const struct bpf_program *prog, int n) { int fd; @@ -9716,27 +9636,6 @@ __u32 bpf_map__btf_value_type_id(const struct bpf_map *map) return map ? map->btf_value_type_id : 0; } -int bpf_map__set_priv(struct bpf_map *map, void *priv, - bpf_map_clear_priv_t clear_priv) -{ - if (!map) - return libbpf_err(-EINVAL); - - if (map->priv) { - if (map->clear_priv) - map->clear_priv(map, map->priv); - } - - map->priv = priv; - map->clear_priv = clear_priv; - return 0; -} - -void *bpf_map__priv(const struct bpf_map *map) -{ - return map ? map->priv : libbpf_err_ptr(-EINVAL); -} - int bpf_map__set_initial_value(struct bpf_map *map, const void *data, size_t size) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 9c2279b7b6ed..5eb3145b8945 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -231,13 +231,6 @@ LIBBPF_API struct bpf_program * bpf_object__find_program_by_name(const struct bpf_object *obj, const char *name); -typedef void (*bpf_object_clear_priv_t)(struct bpf_object *, void *); -LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated") -LIBBPF_API int bpf_object__set_priv(struct bpf_object *obj, void *priv, - bpf_object_clear_priv_t clear_priv); -LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated") -LIBBPF_API void *bpf_object__priv(const struct bpf_object *prog); - LIBBPF_API int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, enum bpf_attach_type *expected_attach_type); @@ -260,13 +253,6 @@ bpf_object__next_program(const struct bpf_object *obj, struct bpf_program *prog) LIBBPF_API struct bpf_program * bpf_object__prev_program(const struct bpf_object *obj, struct bpf_program *prog); -typedef void (*bpf_program_clear_priv_t)(struct bpf_program *, void *); - -LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated") -LIBBPF_API int bpf_program__set_priv(struct bpf_program *prog, void *priv, - bpf_program_clear_priv_t clear_priv); -LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated") -LIBBPF_API void *bpf_program__priv(const struct bpf_program *prog); LIBBPF_API void bpf_program__set_ifindex(struct bpf_program *prog, __u32 ifindex); @@ -328,14 +314,6 @@ LIBBPF_API int bpf_program__set_insns(struct bpf_program *prog, LIBBPF_API size_t bpf_program__insn_cnt(const struct bpf_program *prog); LIBBPF_API int bpf_program__fd(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 7, "multi-instance bpf_program support is deprecated") -LIBBPF_API int bpf_program__pin_instance(struct bpf_program *prog, - const char *path, - int instance); -LIBBPF_DEPRECATED_SINCE(0, 7, "multi-instance bpf_program support is deprecated") -LIBBPF_API int bpf_program__unpin_instance(struct bpf_program *prog, - const char *path, - int instance); /** * @brief **bpf_program__pin()** pins the BPF program to a file @@ -635,69 +613,6 @@ LIBBPF_API struct bpf_link * bpf_program__attach_iter(const struct bpf_program *prog, const struct bpf_iter_attach_opts *opts); -/* - * Libbpf allows callers to adjust BPF programs before being loaded - * into kernel. One program in an object file can be transformed into - * multiple variants to be attached to different hooks. - * - * bpf_program_prep_t, bpf_program__set_prep and bpf_program__nth_fd - * form an API for this purpose. - * - * - bpf_program_prep_t: - * Defines a 'preprocessor', which is a caller defined function - * passed to libbpf through bpf_program__set_prep(), and will be - * called before program is loaded. The processor should adjust - * the program one time for each instance according to the instance id - * passed to it. - * - * - bpf_program__set_prep: - * Attaches a preprocessor to a BPF program. The number of instances - * that should be created is also passed through this function. - * - * - bpf_program__nth_fd: - * After the program is loaded, get resulting FD of a given instance - * of the BPF program. - * - * If bpf_program__set_prep() is not used, the program would be loaded - * without adjustment during bpf_object__load(). The program has only - * one instance. In this case bpf_program__fd(prog) is equal to - * bpf_program__nth_fd(prog, 0). - */ -struct bpf_prog_prep_result { - /* - * If not NULL, load new instruction array. - * If set to NULL, don't load this instance. - */ - struct bpf_insn *new_insn_ptr; - int new_insn_cnt; - - /* If not NULL, result FD is written to it. */ - int *pfd; -}; - -/* - * Parameters of bpf_program_prep_t: - * - prog: The bpf_program being loaded. - * - n: Index of instance being generated. - * - insns: BPF instructions array. - * - insns_cnt:Number of instructions in insns. - * - res: Output parameter, result of transformation. - * - * Return value: - * - Zero: pre-processing success. - * - Non-zero: pre-processing error, stop loading. - */ -typedef int (*bpf_program_prep_t)(struct bpf_program *prog, int n, - struct bpf_insn *insns, int insns_cnt, - struct bpf_prog_prep_result *res); - -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_program__insns() for getting bpf_program instructions") -LIBBPF_API int bpf_program__set_prep(struct bpf_program *prog, int nr_instance, - bpf_program_prep_t prep); - -LIBBPF_DEPRECATED_SINCE(0, 7, "multi-instance bpf_program support is deprecated") -LIBBPF_API int bpf_program__nth_fd(const struct bpf_program *prog, int n); - LIBBPF_API enum bpf_prog_type bpf_program__type(const struct bpf_program *prog); /** @@ -846,12 +761,6 @@ LIBBPF_API int bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex); LIBBPF_API __u64 bpf_map__map_extra(const struct bpf_map *map); LIBBPF_API int bpf_map__set_map_extra(struct bpf_map *map, __u64 map_extra); -typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void *); -LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated") -LIBBPF_API int bpf_map__set_priv(struct bpf_map *map, void *priv, - bpf_map_clear_priv_t clear_priv); -LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated") -LIBBPF_API void *bpf_map__priv(const struct bpf_map *map); LIBBPF_API int bpf_map__set_initial_value(struct bpf_map *map, const void *data, size_t size); LIBBPF_API const void *bpf_map__initial_value(struct bpf_map *map, size_t *psize); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 7f19c43028f1..236efc7338f1 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -6,11 +6,9 @@ LIBBPF_0.0.1 { bpf_map__fd; bpf_map__name; bpf_map__pin; - bpf_map__priv; bpf_map__reuse_fd; bpf_map__set_ifindex; bpf_map__set_inner_map_fd; - bpf_map__set_priv; bpf_map__unpin; bpf_map_delete_elem; bpf_map_get_fd_by_id; @@ -32,8 +30,6 @@ LIBBPF_0.0.1 { bpf_object__pin; bpf_object__pin_maps; bpf_object__pin_programs; - bpf_object__priv; - bpf_object__set_priv; bpf_object__unpin_maps; bpf_object__unpin_programs; bpf_prog_attach; @@ -43,18 +39,12 @@ LIBBPF_0.0.1 { bpf_prog_get_next_id; bpf_prog_query; bpf_program__fd; - bpf_program__nth_fd; bpf_program__pin; - bpf_program__pin_instance; - bpf_program__priv; bpf_program__set_expected_attach_type; bpf_program__set_ifindex; - bpf_program__set_prep; - bpf_program__set_priv; bpf_program__set_type; bpf_program__unload; bpf_program__unpin; - bpf_program__unpin_instance; bpf_prog_linfo__free; bpf_prog_linfo__new; bpf_prog_linfo__lfind_addr_func; From a11113a2dcbedd488ca82f46853cf3e1ee486a6e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:22 -0700 Subject: [PATCH 39/94] libbpf: cleanup LIBBPF_DEPRECATED_SINCE supporting macros for v0.x Keep the LIBBPF_DEPRECATED_SINCE macro "framework" for future deprecations, but clean up 0.x related helper macros. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-11-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf_common.h | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/tools/lib/bpf/libbpf_common.h b/tools/lib/bpf/libbpf_common.h index 000e37798ff2..9a7937f339df 100644 --- a/tools/lib/bpf/libbpf_common.h +++ b/tools/lib/bpf/libbpf_common.h @@ -30,20 +30,10 @@ /* Add checks for other versions below when planning deprecation of API symbols * with the LIBBPF_DEPRECATED_SINCE macro. */ -#if __LIBBPF_CURRENT_VERSION_GEQ(0, 6) -#define __LIBBPF_MARK_DEPRECATED_0_6(X) X +#if __LIBBPF_CURRENT_VERSION_GEQ(1, 0) +#define __LIBBPF_MARK_DEPRECATED_1_0(X) X #else -#define __LIBBPF_MARK_DEPRECATED_0_6(X) -#endif -#if __LIBBPF_CURRENT_VERSION_GEQ(0, 7) -#define __LIBBPF_MARK_DEPRECATED_0_7(X) X -#else -#define __LIBBPF_MARK_DEPRECATED_0_7(X) -#endif -#if __LIBBPF_CURRENT_VERSION_GEQ(0, 8) -#define __LIBBPF_MARK_DEPRECATED_0_8(X) X -#else -#define __LIBBPF_MARK_DEPRECATED_0_8(X) +#define __LIBBPF_MARK_DEPRECATED_1_0(X) #endif /* This set of internal macros allows to do "function overloading" based on From cf90a20db878fddedefbdfeaff6695a0ee20f690 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:23 -0700 Subject: [PATCH 40/94] libbpf: remove internal multi-instance prog support Clean up internals that had to deal with the possibility of multi-instance bpf_programs. Libbpf 1.0 doesn't support this, so all this is not necessary now and can be simplified. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-12-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 323 +++++------------------------------------ 1 file changed, 37 insertions(+), 286 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index fe98789d1776..65f2a57bc78d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -366,16 +366,6 @@ struct bpf_sec_def { libbpf_prog_attach_fn_t prog_attach_fn; }; -struct bpf_prog_prep_result { - struct bpf_insn *new_insn_ptr; - int new_insn_cnt; - int *pfd; -}; - -typedef int (*bpf_program_prep_t)(struct bpf_program *prog, int n, - struct bpf_insn *insns, int insns_cnt, - struct bpf_prog_prep_result *res); - /* * bpf_prog should be a better name but it has been used in * linux/filter.h. @@ -429,22 +419,19 @@ struct bpf_program { size_t log_size; __u32 log_level; - struct { - int nr; - int *fds; - } instances; - bpf_program_prep_t preprocessor; - struct bpf_object *obj; + int fd; bool autoload; bool mark_btf_static; enum bpf_prog_type type; enum bpf_attach_type expected_attach_type; + int prog_ifindex; __u32 attach_btf_obj_fd; __u32 attach_btf_id; __u32 attach_prog_fd; + void *func_info; __u32 func_info_rec_size; __u32 func_info_cnt; @@ -695,25 +682,10 @@ static Elf64_Rel *elf_rel_by_idx(Elf_Data *data, size_t idx); void bpf_program__unload(struct bpf_program *prog) { - int i; - if (!prog) return; - /* - * If the object is opened but the program was never loaded, - * it is possible that prog->instances.nr == -1. - */ - if (prog->instances.nr > 0) { - for (i = 0; i < prog->instances.nr; i++) - zclose(prog->instances.fds[i]); - } else if (prog->instances.nr != -1) { - pr_warn("Internal error: instances.nr is %d\n", - prog->instances.nr); - } - - prog->instances.nr = -1; - zfree(&prog->instances.fds); + zclose(prog->fd); zfree(&prog->func_info); zfree(&prog->line_info); @@ -797,6 +769,7 @@ bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, prog->insns_cnt = prog->sec_insn_cnt; prog->type = BPF_PROG_TYPE_UNSPEC; + prog->fd = -1; /* libbpf's convention for SEC("?abc...") is that it's just like * SEC("abc...") but the corresponding bpf_program starts out with @@ -810,9 +783,6 @@ bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, prog->autoload = true; } - prog->instances.fds = NULL; - prog->instances.nr = -1; - /* inherit object's log_level */ prog->log_level = obj->log_level; @@ -6862,10 +6832,9 @@ static int libbpf_prepare_prog_load(struct bpf_program *prog, static void fixup_verifier_log(struct bpf_program *prog, char *buf, size_t buf_sz); -static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_program *prog, - struct bpf_insn *insns, int insns_cnt, - const char *license, __u32 kern_version, - int *prog_fd) +static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog, + struct bpf_insn *insns, int insns_cnt, + const char *license, __u32 kern_version, int *prog_fd) { LIBBPF_OPTS(bpf_prog_load_opts, load_attr); const char *prog_name = NULL; @@ -7232,88 +7201,6 @@ static int bpf_program_record_relos(struct bpf_program *prog) return 0; } -static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog, - const char *license, __u32 kern_ver) -{ - int err = 0, fd, i; - - if (obj->loaded) { - pr_warn("prog '%s': can't load after object was loaded\n", prog->name); - return libbpf_err(-EINVAL); - } - - if (prog->instances.nr < 0 || !prog->instances.fds) { - if (prog->preprocessor) { - pr_warn("Internal error: can't load program '%s'\n", - prog->name); - return libbpf_err(-LIBBPF_ERRNO__INTERNAL); - } - - prog->instances.fds = malloc(sizeof(int)); - if (!prog->instances.fds) { - pr_warn("Not enough memory for BPF fds\n"); - return libbpf_err(-ENOMEM); - } - prog->instances.nr = 1; - prog->instances.fds[0] = -1; - } - - if (!prog->preprocessor) { - if (prog->instances.nr != 1) { - pr_warn("prog '%s': inconsistent nr(%d) != 1\n", - prog->name, prog->instances.nr); - } - if (obj->gen_loader) - bpf_program_record_relos(prog); - err = bpf_object_load_prog_instance(obj, prog, - prog->insns, prog->insns_cnt, - license, kern_ver, &fd); - if (!err) - prog->instances.fds[0] = fd; - goto out; - } - - for (i = 0; i < prog->instances.nr; i++) { - struct bpf_prog_prep_result result; - bpf_program_prep_t preprocessor = prog->preprocessor; - - memset(&result, 0, sizeof(result)); - err = preprocessor(prog, i, prog->insns, - prog->insns_cnt, &result); - if (err) { - pr_warn("Preprocessing the %dth instance of program '%s' failed\n", - i, prog->name); - goto out; - } - - if (!result.new_insn_ptr || !result.new_insn_cnt) { - pr_debug("Skip loading the %dth instance of program '%s'\n", - i, prog->name); - prog->instances.fds[i] = -1; - if (result.pfd) - *result.pfd = -1; - continue; - } - - err = bpf_object_load_prog_instance(obj, prog, - result.new_insn_ptr, result.new_insn_cnt, - license, kern_ver, &fd); - if (err) { - pr_warn("Loading the %dth instance of program '%s' failed\n", - i, prog->name); - goto out; - } - - if (result.pfd) - *result.pfd = fd; - prog->instances.fds[i] = fd; - } -out: - if (err) - pr_warn("failed to load program '%s'\n", prog->name); - return libbpf_err(err); -} - static int bpf_object__load_progs(struct bpf_object *obj, int log_level) { @@ -7337,9 +7224,16 @@ bpf_object__load_progs(struct bpf_object *obj, int log_level) continue; } prog->log_level |= log_level; - err = bpf_object_load_prog(obj, prog, obj->license, obj->kern_version); - if (err) + + if (obj->gen_loader) + bpf_program_record_relos(prog); + + err = bpf_object_load_prog(obj, prog, prog->insns, prog->insns_cnt, + obj->license, obj->kern_version, &prog->fd); + if (err) { + pr_warn("prog '%s': failed to load: %d\n", prog->name, err); return err; + } } bpf_object__free_relocs(obj); @@ -7985,11 +7879,16 @@ static int check_path(const char *path) return err; } -static int bpf_program_pin_instance(struct bpf_program *prog, const char *path, int instance) +int bpf_program__pin(struct bpf_program *prog, const char *path) { char *cp, errmsg[STRERR_BUFSIZE]; int err; + if (prog->fd < 0) { + pr_warn("prog '%s': can't pin program that wasn't loaded\n", prog->name); + return libbpf_err(-EINVAL); + } + err = make_parent_dir(path); if (err) return libbpf_err(err); @@ -7998,164 +7897,35 @@ static int bpf_program_pin_instance(struct bpf_program *prog, const char *path, if (err) return libbpf_err(err); - if (prog == NULL) { - pr_warn("invalid program pointer\n"); - return libbpf_err(-EINVAL); - } - - if (instance < 0 || instance >= prog->instances.nr) { - pr_warn("invalid prog instance %d of prog %s (max %d)\n", - instance, prog->name, prog->instances.nr); - return libbpf_err(-EINVAL); - } - - if (bpf_obj_pin(prog->instances.fds[instance], path)) { + if (bpf_obj_pin(prog->fd, path)) { err = -errno; cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); - pr_warn("failed to pin program: %s\n", cp); + pr_warn("prog '%s': failed to pin at '%s': %s\n", prog->name, path, cp); return libbpf_err(err); } - pr_debug("pinned program '%s'\n", path); + pr_debug("prog '%s': pinned at '%s'\n", prog->name, path); return 0; } -static int bpf_program_unpin_instance(struct bpf_program *prog, const char *path, int instance) -{ - int err; - - err = check_path(path); - if (err) - return libbpf_err(err); - - if (prog == NULL) { - pr_warn("invalid program pointer\n"); - return libbpf_err(-EINVAL); - } - - if (instance < 0 || instance >= prog->instances.nr) { - pr_warn("invalid prog instance %d of prog %s (max %d)\n", - instance, prog->name, prog->instances.nr); - return libbpf_err(-EINVAL); - } - - err = unlink(path); - if (err != 0) - return libbpf_err(-errno); - - pr_debug("unpinned program '%s'\n", path); - - return 0; -} - -int bpf_program__pin(struct bpf_program *prog, const char *path) -{ - int i, err; - - err = make_parent_dir(path); - if (err) - return libbpf_err(err); - - err = check_path(path); - if (err) - return libbpf_err(err); - - if (prog == NULL) { - pr_warn("invalid program pointer\n"); - return libbpf_err(-EINVAL); - } - - if (prog->instances.nr <= 0) { - pr_warn("no instances of prog %s to pin\n", prog->name); - return libbpf_err(-EINVAL); - } - - if (prog->instances.nr == 1) { - /* don't create subdirs when pinning single instance */ - return bpf_program_pin_instance(prog, path, 0); - } - - for (i = 0; i < prog->instances.nr; i++) { - char buf[PATH_MAX]; - int len; - - len = snprintf(buf, PATH_MAX, "%s/%d", path, i); - if (len < 0) { - err = -EINVAL; - goto err_unpin; - } else if (len >= PATH_MAX) { - err = -ENAMETOOLONG; - goto err_unpin; - } - - err = bpf_program_pin_instance(prog, buf, i); - if (err) - goto err_unpin; - } - - return 0; - -err_unpin: - for (i = i - 1; i >= 0; i--) { - char buf[PATH_MAX]; - int len; - - len = snprintf(buf, PATH_MAX, "%s/%d", path, i); - if (len < 0) - continue; - else if (len >= PATH_MAX) - continue; - - bpf_program_unpin_instance(prog, buf, i); - } - - rmdir(path); - - return libbpf_err(err); -} - int bpf_program__unpin(struct bpf_program *prog, const char *path) { - int i, err; + int err; + + if (prog->fd < 0) { + pr_warn("prog '%s': can't unpin program that wasn't loaded\n", prog->name); + return libbpf_err(-EINVAL); + } err = check_path(path); if (err) return libbpf_err(err); - if (prog == NULL) { - pr_warn("invalid program pointer\n"); - return libbpf_err(-EINVAL); - } - - if (prog->instances.nr <= 0) { - pr_warn("no instances of prog %s to pin\n", prog->name); - return libbpf_err(-EINVAL); - } - - if (prog->instances.nr == 1) { - /* don't create subdirs when pinning single instance */ - return bpf_program_unpin_instance(prog, path, 0); - } - - for (i = 0; i < prog->instances.nr; i++) { - char buf[PATH_MAX]; - int len; - - len = snprintf(buf, PATH_MAX, "%s/%d", path, i); - if (len < 0) - return libbpf_err(-EINVAL); - else if (len >= PATH_MAX) - return libbpf_err(-ENAMETOOLONG); - - err = bpf_program_unpin_instance(prog, buf, i); - if (err) - return err; - } - - err = rmdir(path); + err = unlink(path); if (err) return libbpf_err(-errno); + pr_debug("prog '%s': unpinned from '%s'\n", prog->name, path); return 0; } @@ -8672,13 +8442,6 @@ int bpf_program__set_autoload(struct bpf_program *prog, bool autoload) return 0; } -static int bpf_program_nth_fd(const struct bpf_program *prog, int n); - -int bpf_program__fd(const struct bpf_program *prog) -{ - return bpf_program_nth_fd(prog, 0); -} - const struct bpf_insn *bpf_program__insns(const struct bpf_program *prog) { return prog->insns; @@ -8709,27 +8472,15 @@ int bpf_program__set_insns(struct bpf_program *prog, return 0; } -static int bpf_program_nth_fd(const struct bpf_program *prog, int n) +int bpf_program__fd(const struct bpf_program *prog) { - int fd; - if (!prog) return libbpf_err(-EINVAL); - if (n >= prog->instances.nr || n < 0) { - pr_warn("Can't get the %dth fd from program %s: only %d instances\n", - n, prog->name, prog->instances.nr); - return libbpf_err(-EINVAL); - } - - fd = prog->instances.fds[n]; - if (fd < 0) { - pr_warn("%dth instance of program '%s' is invalid\n", - n, prog->name); + if (prog->fd < 0) return libbpf_err(-ENOENT); - } - return fd; + return prog->fd; } __alias(bpf_program__type) From 450b167fb9be91a8164d3f3d734674f5fe95b22d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:24 -0700 Subject: [PATCH 41/94] libbpf: clean up SEC() handling Get rid of sloppy prefix logic and remove deprecated xdp_{devmap,cpumap} sections. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-13-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 119 ++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 72 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 65f2a57bc78d..ae58cf2898ad 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -346,12 +346,8 @@ enum sec_def_flags { SEC_ATTACH_BTF = 4, /* BPF program type allows sleeping/blocking in kernel */ SEC_SLEEPABLE = 8, - /* allow non-strict prefix matching */ - SEC_SLOPPY_PFX = 16, /* BPF program support non-linear XDP buffer */ - SEC_XDP_FRAGS = 32, - /* deprecated sec definitions not supposed to be used */ - SEC_DEPRECATED = 64, + SEC_XDP_FRAGS = 16, }; struct bpf_sec_def { @@ -6785,11 +6781,6 @@ static int libbpf_prepare_prog_load(struct bpf_program *prog, if (prog->type == BPF_PROG_TYPE_XDP && (def & SEC_XDP_FRAGS)) opts->prog_flags |= BPF_F_XDP_HAS_FRAGS; - if (def & SEC_DEPRECATED) { - pr_warn("SEC(\"%s\") is deprecated, please see https://github.com/libbpf/libbpf/wiki/Libbpf-1.0-migration-guide#bpf-program-sec-annotation-deprecations for details\n", - prog->sec_name); - } - if ((def & SEC_ATTACH_BTF) && !prog->attach_btf_id) { int btf_obj_fd = 0, btf_type_id = 0, err; const char *attach_name; @@ -8586,9 +8577,9 @@ static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_li static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link); static const struct bpf_sec_def section_defs[] = { - SEC_DEF("socket", SOCKET_FILTER, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("sk_reuseport/migrate", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("sk_reuseport", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT, SEC_ATTACHABLE | SEC_SLOPPY_PFX), + SEC_DEF("socket", SOCKET_FILTER, 0, SEC_NONE), + SEC_DEF("sk_reuseport/migrate", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, SEC_ATTACHABLE), + SEC_DEF("sk_reuseport", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT, SEC_ATTACHABLE), SEC_DEF("kprobe+", KPROBE, 0, SEC_NONE, attach_kprobe), SEC_DEF("uprobe+", KPROBE, 0, SEC_NONE, attach_uprobe), SEC_DEF("uprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe), @@ -8599,8 +8590,8 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("kretprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), SEC_DEF("usdt+", KPROBE, 0, SEC_NONE, attach_usdt), SEC_DEF("tc", SCHED_CLS, 0, SEC_NONE), - SEC_DEF("classifier", SCHED_CLS, 0, SEC_NONE | SEC_SLOPPY_PFX | SEC_DEPRECATED), - SEC_DEF("action", SCHED_ACT, 0, SEC_NONE | SEC_SLOPPY_PFX), + SEC_DEF("classifier", SCHED_CLS, 0, SEC_NONE), + SEC_DEF("action", SCHED_ACT, 0, SEC_NONE), SEC_DEF("tracepoint+", TRACEPOINT, 0, SEC_NONE, attach_tp), SEC_DEF("tp+", TRACEPOINT, 0, SEC_NONE, attach_tp), SEC_DEF("raw_tracepoint+", RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp), @@ -8622,50 +8613,48 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("syscall", SYSCALL, 0, SEC_SLEEPABLE), SEC_DEF("xdp.frags/devmap", XDP, BPF_XDP_DEVMAP, SEC_XDP_FRAGS), SEC_DEF("xdp/devmap", XDP, BPF_XDP_DEVMAP, SEC_ATTACHABLE), - SEC_DEF("xdp_devmap/", XDP, BPF_XDP_DEVMAP, SEC_ATTACHABLE | SEC_DEPRECATED), SEC_DEF("xdp.frags/cpumap", XDP, BPF_XDP_CPUMAP, SEC_XDP_FRAGS), SEC_DEF("xdp/cpumap", XDP, BPF_XDP_CPUMAP, SEC_ATTACHABLE), - SEC_DEF("xdp_cpumap/", XDP, BPF_XDP_CPUMAP, SEC_ATTACHABLE | SEC_DEPRECATED), SEC_DEF("xdp.frags", XDP, BPF_XDP, SEC_XDP_FRAGS), - SEC_DEF("xdp", XDP, BPF_XDP, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("perf_event", PERF_EVENT, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("lwt_in", LWT_IN, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("lwt_out", LWT_OUT, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("lwt_xmit", LWT_XMIT, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("lwt_seg6local", LWT_SEG6LOCAL, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup_skb/ingress", CGROUP_SKB, BPF_CGROUP_INET_INGRESS, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("cgroup_skb/egress", CGROUP_SKB, BPF_CGROUP_INET_EGRESS, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/skb", CGROUP_SKB, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/sock_create", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_CREATE, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/sock_release", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_RELEASE, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/sock", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_CREATE, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/post_bind4", CGROUP_SOCK, BPF_CGROUP_INET4_POST_BIND, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/post_bind6", CGROUP_SOCK, BPF_CGROUP_INET6_POST_BIND, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/dev", CGROUP_DEVICE, BPF_CGROUP_DEVICE, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("sockops", SOCK_OPS, BPF_CGROUP_SOCK_OPS, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("sk_skb/stream_parser", SK_SKB, BPF_SK_SKB_STREAM_PARSER, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("sk_skb/stream_verdict",SK_SKB, BPF_SK_SKB_STREAM_VERDICT, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("sk_skb", SK_SKB, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("sk_msg", SK_MSG, BPF_SK_MSG_VERDICT, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("lirc_mode2", LIRC_MODE2, BPF_LIRC_MODE2, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("flow_dissector", FLOW_DISSECTOR, BPF_FLOW_DISSECTOR, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/bind4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_BIND, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/bind6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_BIND, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/connect4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_CONNECT, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/connect6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_CONNECT, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/sendmsg4", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_SENDMSG, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/sendmsg6", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_SENDMSG, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/recvmsg4", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_RECVMSG, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/recvmsg6", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_RECVMSG, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/getpeername4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_GETPEERNAME, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/getpeername6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETPEERNAME, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/getsockname4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_GETSOCKNAME, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/getsockname6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETSOCKNAME, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/sysctl", CGROUP_SYSCTL, BPF_CGROUP_SYSCTL, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/getsockopt", CGROUP_SOCKOPT, BPF_CGROUP_GETSOCKOPT, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/setsockopt", CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT, SEC_ATTACHABLE | SEC_SLOPPY_PFX), + SEC_DEF("xdp", XDP, BPF_XDP, SEC_ATTACHABLE_OPT), + SEC_DEF("perf_event", PERF_EVENT, 0, SEC_NONE), + SEC_DEF("lwt_in", LWT_IN, 0, SEC_NONE), + SEC_DEF("lwt_out", LWT_OUT, 0, SEC_NONE), + SEC_DEF("lwt_xmit", LWT_XMIT, 0, SEC_NONE), + SEC_DEF("lwt_seg6local", LWT_SEG6LOCAL, 0, SEC_NONE), + SEC_DEF("sockops", SOCK_OPS, BPF_CGROUP_SOCK_OPS, SEC_ATTACHABLE_OPT), + SEC_DEF("sk_skb/stream_parser", SK_SKB, BPF_SK_SKB_STREAM_PARSER, SEC_ATTACHABLE_OPT), + SEC_DEF("sk_skb/stream_verdict",SK_SKB, BPF_SK_SKB_STREAM_VERDICT, SEC_ATTACHABLE_OPT), + SEC_DEF("sk_skb", SK_SKB, 0, SEC_NONE), + SEC_DEF("sk_msg", SK_MSG, BPF_SK_MSG_VERDICT, SEC_ATTACHABLE_OPT), + SEC_DEF("lirc_mode2", LIRC_MODE2, BPF_LIRC_MODE2, SEC_ATTACHABLE_OPT), + SEC_DEF("flow_dissector", FLOW_DISSECTOR, BPF_FLOW_DISSECTOR, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup_skb/ingress", CGROUP_SKB, BPF_CGROUP_INET_INGRESS, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup_skb/egress", CGROUP_SKB, BPF_CGROUP_INET_EGRESS, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup/skb", CGROUP_SKB, 0, SEC_NONE), + SEC_DEF("cgroup/sock_create", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_CREATE, SEC_ATTACHABLE), + SEC_DEF("cgroup/sock_release", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_RELEASE, SEC_ATTACHABLE), + SEC_DEF("cgroup/sock", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_CREATE, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup/post_bind4", CGROUP_SOCK, BPF_CGROUP_INET4_POST_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/post_bind6", CGROUP_SOCK, BPF_CGROUP_INET6_POST_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/bind4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/bind6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/connect4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_CONNECT, SEC_ATTACHABLE), + SEC_DEF("cgroup/connect6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_CONNECT, SEC_ATTACHABLE), + SEC_DEF("cgroup/sendmsg4", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_SENDMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/sendmsg6", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_SENDMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/recvmsg4", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_RECVMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/recvmsg6", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_RECVMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/getpeername4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_GETPEERNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/getpeername6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETPEERNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/getsockname4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_GETSOCKNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/getsockname6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETSOCKNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/sysctl", CGROUP_SYSCTL, BPF_CGROUP_SYSCTL, SEC_ATTACHABLE), + SEC_DEF("cgroup/getsockopt", CGROUP_SOCKOPT, BPF_CGROUP_GETSOCKOPT, SEC_ATTACHABLE), + SEC_DEF("cgroup/setsockopt", CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT, SEC_ATTACHABLE), + SEC_DEF("cgroup/dev", CGROUP_DEVICE, BPF_CGROUP_DEVICE, SEC_ATTACHABLE_OPT), SEC_DEF("struct_ops+", STRUCT_OPS, 0, SEC_NONE), - SEC_DEF("sk_lookup", SK_LOOKUP, BPF_SK_LOOKUP, SEC_ATTACHABLE | SEC_SLOPPY_PFX), + SEC_DEF("sk_lookup", SK_LOOKUP, BPF_SK_LOOKUP, SEC_ATTACHABLE), }; static size_t custom_sec_def_cnt; @@ -8760,8 +8749,7 @@ int libbpf_unregister_prog_handler(int handler_id) return 0; } -static bool sec_def_matches(const struct bpf_sec_def *sec_def, const char *sec_name, - bool allow_sloppy) +static bool sec_def_matches(const struct bpf_sec_def *sec_def, const char *sec_name) { size_t len = strlen(sec_def->sec); @@ -8786,17 +8774,6 @@ static bool sec_def_matches(const struct bpf_sec_def *sec_def, const char *sec_n return false; } - /* SEC_SLOPPY_PFX definitions are allowed to be just prefix - * matches, unless strict section name mode - * (LIBBPF_STRICT_SEC_NAME) is enabled, in which case the - * match has to be exact. - */ - if (allow_sloppy && str_has_pfx(sec_name, sec_def->sec)) - return true; - - /* Definitions not marked SEC_SLOPPY_PFX (e.g., - * SEC("syscall")) are exact matches in both modes. - */ return strcmp(sec_name, sec_def->sec) == 0; } @@ -8804,20 +8781,18 @@ static const struct bpf_sec_def *find_sec_def(const char *sec_name) { const struct bpf_sec_def *sec_def; int i, n; - bool strict = libbpf_mode & LIBBPF_STRICT_SEC_NAME, allow_sloppy; n = custom_sec_def_cnt; for (i = 0; i < n; i++) { sec_def = &custom_sec_defs[i]; - if (sec_def_matches(sec_def, sec_name, false)) + if (sec_def_matches(sec_def, sec_name)) return sec_def; } n = ARRAY_SIZE(section_defs); for (i = 0; i < n; i++) { sec_def = §ion_defs[i]; - allow_sloppy = (sec_def->cookie & SEC_SLOPPY_PFX) && !strict; - if (sec_def_matches(sec_def, sec_name, allow_sloppy)) + if (sec_def_matches(sec_def, sec_name)) return sec_def; } From 31e42721976b9c445477038f8a4006150cd27a60 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:25 -0700 Subject: [PATCH 42/94] selftests/bpf: remove last tests with legacy BPF map definitions Libbpf 1.0 stops support legacy-style BPF map definitions. Selftests has been migrated away from using legacy BPF map definitions except for two selftests, to make sure that legacy functionality still worked in pre-1.0 libbpf. Now it's time to let those tests go as libbpf 1.0 is imminent. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-14-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_legacy.h | 9 ---- tools/testing/selftests/bpf/prog_tests/btf.c | 1 - .../selftests/bpf/progs/test_btf_haskv.c | 51 ------------------- .../selftests/bpf/progs/test_btf_newkv.c | 18 ------- 4 files changed, 79 deletions(-) delete mode 100644 tools/testing/selftests/bpf/progs/test_btf_haskv.c diff --git a/tools/testing/selftests/bpf/bpf_legacy.h b/tools/testing/selftests/bpf/bpf_legacy.h index 719ab56cdb5d..845209581440 100644 --- a/tools/testing/selftests/bpf/bpf_legacy.h +++ b/tools/testing/selftests/bpf/bpf_legacy.h @@ -2,15 +2,6 @@ #ifndef __BPF_LEGACY__ #define __BPF_LEGACY__ -#define BPF_ANNOTATE_KV_PAIR(name, type_key, type_val) \ - struct ____btf_map_##name { \ - type_key key; \ - type_val value; \ - }; \ - struct ____btf_map_##name \ - __attribute__ ((section(".maps." #name), used)) \ - ____btf_map_##name = { } - /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions */ diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 1fd792a92a1c..941b0100bafa 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -4651,7 +4651,6 @@ struct btf_file_test { }; static struct btf_file_test file_tests[] = { - { .file = "test_btf_haskv.o", }, { .file = "test_btf_newkv.o", }, { .file = "test_btf_nokv.o", .btf_kv_notfound = true, }, }; diff --git a/tools/testing/selftests/bpf/progs/test_btf_haskv.c b/tools/testing/selftests/bpf/progs/test_btf_haskv.c deleted file mode 100644 index 07c94df13660..000000000000 --- a/tools/testing/selftests/bpf/progs/test_btf_haskv.c +++ /dev/null @@ -1,51 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018 Facebook */ -#include -#include -#include "bpf_legacy.h" - -struct ipv_counts { - unsigned int v4; - unsigned int v6; -}; - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -struct bpf_map_def SEC("maps") btf_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(struct ipv_counts), - .max_entries = 4, -}; -#pragma GCC diagnostic pop - -BPF_ANNOTATE_KV_PAIR(btf_map, int, struct ipv_counts); - -__attribute__((noinline)) -int test_long_fname_2(void) -{ - struct ipv_counts *counts; - int key = 0; - - counts = bpf_map_lookup_elem(&btf_map, &key); - if (!counts) - return 0; - - counts->v6++; - - return 0; -} - -__attribute__((noinline)) -int test_long_fname_1(void) -{ - return test_long_fname_2(); -} - -SEC("dummy_tracepoint") -int _dummy_tracepoint(void *arg) -{ - return test_long_fname_1(); -} - -char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_btf_newkv.c b/tools/testing/selftests/bpf/progs/test_btf_newkv.c index 762671a2e90c..251854a041b5 100644 --- a/tools/testing/selftests/bpf/progs/test_btf_newkv.c +++ b/tools/testing/selftests/bpf/progs/test_btf_newkv.c @@ -9,19 +9,6 @@ struct ipv_counts { unsigned int v6; }; -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -/* just to validate we can handle maps in multiple sections */ -struct bpf_map_def SEC("maps") btf_map_legacy = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(long long), - .max_entries = 4, -}; -#pragma GCC diagnostic pop - -BPF_ANNOTATE_KV_PAIR(btf_map_legacy, int, struct ipv_counts); - struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 4); @@ -41,11 +28,6 @@ int test_long_fname_2(void) counts->v6++; - /* just verify we can reference both maps */ - counts = bpf_map_lookup_elem(&btf_map_legacy, &key); - if (!counts) - return 0; - return 0; } From bd054102a8c7f36ff03446cc41822601180241f3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:26 -0700 Subject: [PATCH 43/94] libbpf: enforce strict libbpf 1.0 behaviors Remove support for legacy features and behaviors that previously had to be disabled by calling libbpf_set_strict_mode(): - legacy BPF map definitions are not supported now; - RLIMIT_MEMLOCK auto-setting, if necessary, is always on (but see libbpf_set_memlock_rlim()); - program name is used for program pinning (instead of section name); - cleaned up error returning logic; - entry BPF programs should have SEC() always. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-15-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.c | 4 - tools/lib/bpf/libbpf.c | 210 ++------------------------------ tools/lib/bpf/libbpf.h | 34 ------ tools/lib/bpf/libbpf_internal.h | 24 +--- tools/lib/bpf/libbpf_legacy.h | 24 ++++ 5 files changed, 36 insertions(+), 260 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 2f3495d80d69..6d848b02c1e0 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -147,10 +147,6 @@ int bump_rlimit_memlock(void) { struct rlimit rlim; - /* this the default in libbpf 1.0, but for now user has to opt-in explicitly */ - if (!(libbpf_mode & LIBBPF_STRICT_AUTO_RLIMIT_MEMLOCK)) - return 0; - /* if kernel supports memcg-based accounting, skip bumping RLIMIT_MEMLOCK */ if (memlock_bumped || kernel_supports(NULL, FEAT_MEMCG_ACCOUNT)) return 0; diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index ae58cf2898ad..e994797bcd48 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -278,12 +278,9 @@ static inline __u64 ptr_to_u64(const void *ptr) return (__u64) (unsigned long) ptr; } -/* this goes away in libbpf 1.0 */ -enum libbpf_strict_mode libbpf_mode = LIBBPF_STRICT_NONE; - int libbpf_set_strict_mode(enum libbpf_strict_mode mode) { - libbpf_mode = mode; + /* as of v1.0 libbpf_set_strict_mode() is a no-op */ return 0; } @@ -367,9 +364,10 @@ struct bpf_sec_def { * linux/filter.h. */ struct bpf_program { - const struct bpf_sec_def *sec_def; + char *name; char *sec_name; size_t sec_idx; + const struct bpf_sec_def *sec_def; /* this program's instruction offset (in number of instructions) * within its containing ELF section */ @@ -389,12 +387,6 @@ struct bpf_program { */ size_t sub_insn_off; - char *name; - /* name with / replaced by _; makes recursive pinning - * in bpf_object__pin_programs easier - */ - char *pin_name; - /* instructions that belong to BPF program; insns[0] is located at * sec_insn_off instruction within its ELF section in ELF file, so * when mapping ELF file instruction index to the local instruction, @@ -695,7 +687,6 @@ static void bpf_program__exit(struct bpf_program *prog) bpf_program__unload(prog); zfree(&prog->name); zfree(&prog->sec_name); - zfree(&prog->pin_name); zfree(&prog->insns); zfree(&prog->reloc_desc); @@ -704,26 +695,6 @@ static void bpf_program__exit(struct bpf_program *prog) prog->sec_idx = -1; } -static char *__bpf_program__pin_name(struct bpf_program *prog) -{ - char *name, *p; - - if (libbpf_mode & LIBBPF_STRICT_SEC_NAME) - name = strdup(prog->name); - else - name = strdup(prog->sec_name); - - if (!name) - return NULL; - - p = name; - - while ((p = strchr(p, '/'))) - *p = '_'; - - return name; -} - static bool insn_is_subprog_call(const struct bpf_insn *insn) { return BPF_CLASS(insn->code) == BPF_JMP && @@ -790,10 +761,6 @@ bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, if (!prog->name) goto errout; - prog->pin_name = __bpf_program__pin_name(prog); - if (!prog->pin_name) - goto errout; - prog->insns = malloc(insn_data_sz); if (!prog->insns) goto errout; @@ -2007,143 +1974,6 @@ static int bpf_object__init_kconfig_map(struct bpf_object *obj) return 0; } -static int bpf_object__init_user_maps(struct bpf_object *obj, bool strict) -{ - Elf_Data *symbols = obj->efile.symbols; - int i, map_def_sz = 0, nr_maps = 0, nr_syms; - Elf_Data *data = NULL; - Elf_Scn *scn; - - if (obj->efile.maps_shndx < 0) - return 0; - - if (libbpf_mode & LIBBPF_STRICT_MAP_DEFINITIONS) { - pr_warn("legacy map definitions in SEC(\"maps\") are not supported\n"); - return -EOPNOTSUPP; - } - - if (!symbols) - return -EINVAL; - - scn = elf_sec_by_idx(obj, obj->efile.maps_shndx); - data = elf_sec_data(obj, scn); - if (!scn || !data) { - pr_warn("elf: failed to get legacy map definitions for %s\n", - obj->path); - return -EINVAL; - } - - /* - * Count number of maps. Each map has a name. - * Array of maps is not supported: only the first element is - * considered. - * - * TODO: Detect array of map and report error. - */ - nr_syms = symbols->d_size / sizeof(Elf64_Sym); - for (i = 0; i < nr_syms; i++) { - Elf64_Sym *sym = elf_sym_by_idx(obj, i); - - if (sym->st_shndx != obj->efile.maps_shndx) - continue; - if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION) - continue; - nr_maps++; - } - /* Assume equally sized map definitions */ - pr_debug("elf: found %d legacy map definitions (%zd bytes) in %s\n", - nr_maps, data->d_size, obj->path); - - if (!data->d_size || nr_maps == 0 || (data->d_size % nr_maps) != 0) { - pr_warn("elf: unable to determine legacy map definition size in %s\n", - obj->path); - return -EINVAL; - } - map_def_sz = data->d_size / nr_maps; - - /* Fill obj->maps using data in "maps" section. */ - for (i = 0; i < nr_syms; i++) { - Elf64_Sym *sym = elf_sym_by_idx(obj, i); - const char *map_name; - struct bpf_map_def *def; - struct bpf_map *map; - - if (sym->st_shndx != obj->efile.maps_shndx) - continue; - if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION) - continue; - - map = bpf_object__add_map(obj); - if (IS_ERR(map)) - return PTR_ERR(map); - - map_name = elf_sym_str(obj, sym->st_name); - if (!map_name) { - pr_warn("failed to get map #%d name sym string for obj %s\n", - i, obj->path); - return -LIBBPF_ERRNO__FORMAT; - } - - pr_warn("map '%s' (legacy): legacy map definitions are deprecated, use BTF-defined maps instead\n", map_name); - - if (ELF64_ST_BIND(sym->st_info) == STB_LOCAL) { - pr_warn("map '%s' (legacy): static maps are not supported\n", map_name); - return -ENOTSUP; - } - - map->libbpf_type = LIBBPF_MAP_UNSPEC; - map->sec_idx = sym->st_shndx; - map->sec_offset = sym->st_value; - pr_debug("map '%s' (legacy): at sec_idx %d, offset %zu.\n", - map_name, map->sec_idx, map->sec_offset); - if (sym->st_value + map_def_sz > data->d_size) { - pr_warn("corrupted maps section in %s: last map \"%s\" too small\n", - obj->path, map_name); - return -EINVAL; - } - - map->name = strdup(map_name); - if (!map->name) { - pr_warn("map '%s': failed to alloc map name\n", map_name); - return -ENOMEM; - } - pr_debug("map %d is \"%s\"\n", i, map->name); - def = (struct bpf_map_def *)(data->d_buf + sym->st_value); - /* - * If the definition of the map in the object file fits in - * bpf_map_def, copy it. Any extra fields in our version - * of bpf_map_def will default to zero as a result of the - * calloc above. - */ - if (map_def_sz <= sizeof(struct bpf_map_def)) { - memcpy(&map->def, def, map_def_sz); - } else { - /* - * Here the map structure being read is bigger than what - * we expect, truncate if the excess bits are all zero. - * If they are not zero, reject this map as - * incompatible. - */ - char *b; - - for (b = ((char *)def) + sizeof(struct bpf_map_def); - b < ((char *)def) + map_def_sz; b++) { - if (*b != 0) { - pr_warn("maps section in %s: \"%s\" has unrecognized, non-zero options\n", - obj->path, map_name); - if (strict) - return -EINVAL; - } - } - memcpy(&map->def, def, sizeof(struct bpf_map_def)); - } - - /* btf info may not exist but fill it in if it does exist */ - (void) bpf_map_find_btf_info(obj, map); - } - return 0; -} - const struct btf_type * skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id) { @@ -2700,12 +2530,11 @@ static int bpf_object__init_maps(struct bpf_object *obj, { const char *pin_root_path; bool strict; - int err; + int err = 0; strict = !OPTS_GET(opts, relaxed_maps, false); pin_root_path = OPTS_GET(opts, pin_root_path, NULL); - err = bpf_object__init_user_maps(obj, strict); err = err ?: bpf_object__init_user_btf_maps(obj, strict, pin_root_path); err = err ?: bpf_object__init_global_data_maps(obj); err = err ?: bpf_object__init_kconfig_map(obj); @@ -3979,28 +3808,8 @@ static int bpf_object__collect_externs(struct bpf_object *obj) return 0; } -static bool prog_is_subprog(const struct bpf_object *obj, - const struct bpf_program *prog) +static bool prog_is_subprog(const struct bpf_object *obj, const struct bpf_program *prog) { - /* For legacy reasons, libbpf supports an entry-point BPF programs - * without SEC() attribute, i.e., those in the .text section. But if - * there are 2 or more such programs in the .text section, they all - * must be subprograms called from entry-point BPF programs in - * designated SEC()'tions, otherwise there is no way to distinguish - * which of those programs should be loaded vs which are a subprogram. - * Similarly, if there is a function/program in .text and at least one - * other BPF program with custom SEC() attribute, then we just assume - * .text programs are subprograms (even if they are not called from - * other programs), because libbpf never explicitly supported mixing - * SEC()-designated BPF programs and .text entry-point BPF programs. - * - * In libbpf 1.0 strict mode, we always consider .text - * programs to be subprograms. - */ - - if (libbpf_mode & LIBBPF_STRICT_SEC_NAME) - return prog->sec_idx == obj->efile.text_shndx; - return prog->sec_idx == obj->efile.text_shndx && obj->nr_programs > 1; } @@ -8163,8 +7972,7 @@ int bpf_object__pin_programs(struct bpf_object *obj, const char *path) char buf[PATH_MAX]; int len; - len = snprintf(buf, PATH_MAX, "%s/%s", path, - prog->pin_name); + len = snprintf(buf, PATH_MAX, "%s/%s", path, prog->name); if (len < 0) { err = -EINVAL; goto err_unpin_programs; @@ -8185,8 +7993,7 @@ err_unpin_programs: char buf[PATH_MAX]; int len; - len = snprintf(buf, PATH_MAX, "%s/%s", path, - prog->pin_name); + len = snprintf(buf, PATH_MAX, "%s/%s", path, prog->name); if (len < 0) continue; else if (len >= PATH_MAX) @@ -8210,8 +8017,7 @@ int bpf_object__unpin_programs(struct bpf_object *obj, const char *path) char buf[PATH_MAX]; int len; - len = snprintf(buf, PATH_MAX, "%s/%s", path, - prog->pin_name); + len = snprintf(buf, PATH_MAX, "%s/%s", path, prog->name); if (len < 0) return libbpf_err(-EINVAL); else if (len >= PATH_MAX) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 5eb3145b8945..e4d5353f757b 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -886,40 +886,6 @@ LIBBPF_API int bpf_map__lookup_and_delete_elem(const struct bpf_map *map, LIBBPF_API int bpf_map__get_next_key(const struct bpf_map *map, const void *cur_key, void *next_key, size_t key_sz); -/** - * @brief **libbpf_get_error()** extracts the error code from the passed - * pointer - * @param ptr pointer returned from libbpf API function - * @return error code; or 0 if no error occured - * - * Many libbpf API functions which return pointers have logic to encode error - * codes as pointers, and do not return NULL. Meaning **libbpf_get_error()** - * should be used on the return value from these functions immediately after - * calling the API function, with no intervening calls that could clobber the - * `errno` variable. Consult the individual functions documentation to verify - * if this logic applies should be used. - * - * For these API functions, if `libbpf_set_strict_mode(LIBBPF_STRICT_CLEAN_PTRS)` - * is enabled, NULL is returned on error instead. - * - * If ptr is NULL, then errno should be already set by the failing - * API, because libbpf never returns NULL on success and it now always - * sets errno on error. - * - * Example usage: - * - * struct perf_buffer *pb; - * - * pb = perf_buffer__new(bpf_map__fd(obj->maps.events), PERF_BUFFER_PAGES, &opts); - * err = libbpf_get_error(pb); - * if (err) { - * pb = NULL; - * fprintf(stderr, "failed to open perf buffer: %d\n", err); - * goto cleanup; - * } - */ -LIBBPF_API long libbpf_get_error(const void *ptr); - struct bpf_xdp_set_link_opts { size_t sz; int old_fd; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index a1ad145ffa74..9cd7829cbe41 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -15,7 +15,6 @@ #include #include #include -#include "libbpf_legacy.h" #include "relo_core.h" /* make sure libbpf doesn't use kernel-only integer typedefs */ @@ -478,8 +477,6 @@ int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void __s32 btf__find_by_name_kind_own(const struct btf *btf, const char *type_name, __u32 kind); -extern enum libbpf_strict_mode libbpf_mode; - typedef int (*kallsyms_cb_t)(unsigned long long sym_addr, char sym_type, const char *sym_name, void *ctx); @@ -498,12 +495,8 @@ static inline int libbpf_err(int ret) */ static inline int libbpf_err_errno(int ret) { - if (libbpf_mode & LIBBPF_STRICT_DIRECT_ERRS) - /* errno is already assumed to be set on error */ - return ret < 0 ? -errno : ret; - - /* legacy: on error return -1 directly and don't touch errno */ - return ret; + /* errno is already assumed to be set on error */ + return ret < 0 ? -errno : ret; } /* handle error for pointer-returning APIs, err is assumed to be < 0 always */ @@ -511,12 +504,7 @@ static inline void *libbpf_err_ptr(int err) { /* set errno on error, this doesn't break anything */ errno = -err; - - if (libbpf_mode & LIBBPF_STRICT_CLEAN_PTRS) - return NULL; - - /* legacy: encode err as ptr */ - return ERR_PTR(err); + return NULL; } /* handle pointer-returning APIs' error handling */ @@ -526,11 +514,7 @@ static inline void *libbpf_ptr(void *ret) if (IS_ERR(ret)) errno = -PTR_ERR(ret); - if (libbpf_mode & LIBBPF_STRICT_CLEAN_PTRS) - return IS_ERR(ret) ? NULL : ret; - - /* legacy: pass-through original pointer */ - return ret; + return IS_ERR(ret) ? NULL : ret; } static inline bool str_is_empty(const char *s) diff --git a/tools/lib/bpf/libbpf_legacy.h b/tools/lib/bpf/libbpf_legacy.h index d7bcbd01f66f..863f49df8bf4 100644 --- a/tools/lib/bpf/libbpf_legacy.h +++ b/tools/lib/bpf/libbpf_legacy.h @@ -20,6 +20,11 @@ extern "C" { #endif +/* As of libbpf 1.0 libbpf_set_strict_mode() and enum libbpf_struct_mode have + * no effect. But they are left in libbpf_legacy.h so that applications that + * prepared for libbpf 1.0 before final release by using + * libbpf_set_strict_mode() still work with libbpf 1.0+ without any changes. + */ enum libbpf_strict_mode { /* Turn on all supported strict features of libbpf to simulate libbpf * v1.0 behavior. @@ -88,6 +93,25 @@ enum libbpf_strict_mode { LIBBPF_API int libbpf_set_strict_mode(enum libbpf_strict_mode mode); +/** + * @brief **libbpf_get_error()** extracts the error code from the passed + * pointer + * @param ptr pointer returned from libbpf API function + * @return error code; or 0 if no error occured + * + * Note, as of libbpf 1.0 this function is not necessary and not recommended + * to be used. Libbpf doesn't return error code embedded into the pointer + * itself. Instead, NULL is returned on error and error code is passed through + * thread-local errno variable. **libbpf_get_error()** is just returning -errno + * value if it receives NULL, which is correct only if errno hasn't been + * modified between libbpf API call and corresponding **libbpf_get_error()** + * call. Prefer to check return for NULL and use errno directly. + * + * This API is left in libbpf 1.0 to allow applications that were 1.0-ready + * before final libbpf 1.0 without needing to change them. + */ +LIBBPF_API long libbpf_get_error(const void *ptr); + #define DECLARE_LIBBPF_OPTS LIBBPF_OPTS /* "Discouraged" APIs which don't follow consistent libbpf naming patterns. From ab9a5a05dc480f8994eddd31093a8920b08ee71d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jun 2022 14:15:27 -0700 Subject: [PATCH 44/94] libbpf: fix up few libbpf.map problems Seems like we missed to add 2 APIs to libbpf.map and another API was misspelled. Fix it in libbpf.map. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220627211527.2245459-16-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.map | 3 ++- tools/lib/bpf/libbpf_legacy.h | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 236efc7338f1..43c6697a6d27 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -326,10 +326,11 @@ LIBBPF_0.7.0 { bpf_xdp_detach; bpf_xdp_query; bpf_xdp_query_id; + btf_ext__raw_data; libbpf_probe_bpf_helper; libbpf_probe_bpf_map_type; libbpf_probe_bpf_prog_type; - libbpf_set_memlock_rlim_max; + libbpf_set_memlock_rlim; } LIBBPF_0.6.0; LIBBPF_0.8.0 { diff --git a/tools/lib/bpf/libbpf_legacy.h b/tools/lib/bpf/libbpf_legacy.h index 863f49df8bf4..5b7e0155db6a 100644 --- a/tools/lib/bpf/libbpf_legacy.h +++ b/tools/lib/bpf/libbpf_legacy.h @@ -76,8 +76,8 @@ enum libbpf_strict_mode { * first BPF program or map creation operation. This is done only if * kernel is too old to support memcg-based memory accounting for BPF * subsystem. By default, RLIMIT_MEMLOCK limit is set to RLIM_INFINITY, - * but it can be overriden with libbpf_set_memlock_rlim_max() API. - * Note that libbpf_set_memlock_rlim_max() needs to be called before + * but it can be overriden with libbpf_set_memlock_rlim() API. + * Note that libbpf_set_memlock_rlim() needs to be called before * the very first bpf_prog_load(), bpf_map_create() or bpf_object__load() * operation. */ From af3f4134006bf3bf894179c08aaf98ed5938cf4e Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:04 -0700 Subject: [PATCH 45/94] bpf: add bpf_func_t and trampoline helpers I'll be adding lsm cgroup specific helpers that grab trampoline mutex. No functional changes. Reviewed-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-2-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 ++++--- kernel/bpf/trampoline.c | 63 +++++++++++++++++++++-------------------- 2 files changed, 38 insertions(+), 36 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d05e1495a06e..d547be9db75f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -56,6 +56,8 @@ typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64); typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, struct bpf_iter_aux_info *aux); typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); +typedef unsigned int (*bpf_func_t)(const void *, + const struct bpf_insn *); struct bpf_iter_seq_info { const struct seq_operations *seq_ops; bpf_iter_init_seq_priv_t init_seq_private; @@ -879,8 +881,7 @@ struct bpf_dispatcher { static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func( const void *ctx, const struct bpf_insn *insnsi, - unsigned int (*bpf_func)(const void *, - const struct bpf_insn *)) + bpf_func_t bpf_func) { return bpf_func(ctx, insnsi); } @@ -909,8 +910,7 @@ int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs); noinline __nocfi unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ - unsigned int (*bpf_func)(const void *, \ - const struct bpf_insn *)) \ + bpf_func_t bpf_func) \ { \ return bpf_func(ctx, insnsi); \ } \ @@ -921,8 +921,7 @@ int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs); unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ - unsigned int (*bpf_func)(const void *, \ - const struct bpf_insn *)); \ + bpf_func_t bpf_func); \ extern struct bpf_dispatcher bpf_dispatcher_##name; #define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_##name##_func #define BPF_DISPATCHER_PTR(name) (&bpf_dispatcher_##name) diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 93c7675f0c9e..5466e15be61f 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -410,7 +410,7 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) } } -int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; struct bpf_tramp_link *link_exiting; @@ -418,44 +418,33 @@ int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline int cnt = 0, i; kind = bpf_attach_type_to_tramp(link->link.prog); - mutex_lock(&tr->mutex); - if (tr->extension_prog) { + if (tr->extension_prog) /* cannot attach fentry/fexit if extension prog is attached. * cannot overwrite extension prog either. */ - err = -EBUSY; - goto out; - } + return -EBUSY; for (i = 0; i < BPF_TRAMP_MAX; i++) cnt += tr->progs_cnt[i]; if (kind == BPF_TRAMP_REPLACE) { /* Cannot attach extension if fentry/fexit are in use. */ - if (cnt) { - err = -EBUSY; - goto out; - } + if (cnt) + return -EBUSY; tr->extension_prog = link->link.prog; - err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, - link->link.prog->bpf_func); - goto out; + return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, + link->link.prog->bpf_func); } - if (cnt >= BPF_MAX_TRAMP_LINKS) { - err = -E2BIG; - goto out; - } - if (!hlist_unhashed(&link->tramp_hlist)) { + if (cnt >= BPF_MAX_TRAMP_LINKS) + return -E2BIG; + if (!hlist_unhashed(&link->tramp_hlist)) /* prog already linked */ - err = -EBUSY; - goto out; - } + return -EBUSY; hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) { if (link_exiting->link.prog != link->link.prog) continue; /* prog already linked */ - err = -EBUSY; - goto out; + return -EBUSY; } hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); @@ -465,30 +454,44 @@ int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; } -out: + return err; +} + +int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +{ + int err; + + mutex_lock(&tr->mutex); + err = __bpf_trampoline_link_prog(link, tr); mutex_unlock(&tr->mutex); return err; } -/* bpf_trampoline_unlink_prog() should never fail. */ -int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; int err; kind = bpf_attach_type_to_tramp(link->link.prog); - mutex_lock(&tr->mutex); if (kind == BPF_TRAMP_REPLACE) { WARN_ON_ONCE(!tr->extension_prog); err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, tr->extension_prog->bpf_func, NULL); tr->extension_prog = NULL; - goto out; + return err; } hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; - err = bpf_trampoline_update(tr); -out: + return bpf_trampoline_update(tr); +} + +/* bpf_trampoline_unlink_prog() should never fail. */ +int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +{ + int err; + + mutex_lock(&tr->mutex); + err = __bpf_trampoline_unlink_prog(link, tr); mutex_unlock(&tr->mutex); return err; } From 00442143a2ab7f1da46fbf4d2a99c85df767d49a Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:05 -0700 Subject: [PATCH 46/94] bpf: convert cgroup_bpf.progs to hlist This lets us reclaim some space to be used by new cgroup lsm slots. Before: struct cgroup_bpf { struct bpf_prog_array * effective[23]; /* 0 184 */ /* --- cacheline 2 boundary (128 bytes) was 56 bytes ago --- */ struct list_head progs[23]; /* 184 368 */ /* --- cacheline 8 boundary (512 bytes) was 40 bytes ago --- */ u32 flags[23]; /* 552 92 */ /* XXX 4 bytes hole, try to pack */ /* --- cacheline 10 boundary (640 bytes) was 8 bytes ago --- */ struct list_head storages; /* 648 16 */ struct bpf_prog_array * inactive; /* 664 8 */ struct percpu_ref refcnt; /* 672 16 */ struct work_struct release_work; /* 688 32 */ /* size: 720, cachelines: 12, members: 7 */ /* sum members: 716, holes: 1, sum holes: 4 */ /* last cacheline: 16 bytes */ }; After: struct cgroup_bpf { struct bpf_prog_array * effective[23]; /* 0 184 */ /* --- cacheline 2 boundary (128 bytes) was 56 bytes ago --- */ struct hlist_head progs[23]; /* 184 184 */ /* --- cacheline 5 boundary (320 bytes) was 48 bytes ago --- */ u8 flags[23]; /* 368 23 */ /* XXX 1 byte hole, try to pack */ /* --- cacheline 6 boundary (384 bytes) was 8 bytes ago --- */ struct list_head storages; /* 392 16 */ struct bpf_prog_array * inactive; /* 408 8 */ struct percpu_ref refcnt; /* 416 16 */ struct work_struct release_work; /* 432 72 */ /* size: 504, cachelines: 8, members: 7 */ /* sum members: 503, holes: 1, sum holes: 1 */ /* last cacheline: 56 bytes */ }; Suggested-by: Jakub Sitnicki Reviewed-by: Jakub Sitnicki Reviewed-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-3-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup-defs.h | 4 +- include/linux/bpf-cgroup.h | 2 +- kernel/bpf/cgroup.c | 76 +++++++++++++++++++-------------- 3 files changed, 47 insertions(+), 35 deletions(-) diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h index 695d1224a71b..5d268e76d8e6 100644 --- a/include/linux/bpf-cgroup-defs.h +++ b/include/linux/bpf-cgroup-defs.h @@ -47,8 +47,8 @@ struct cgroup_bpf { * have either zero or one element * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS */ - struct list_head progs[MAX_CGROUP_BPF_ATTACH_TYPE]; - u32 flags[MAX_CGROUP_BPF_ATTACH_TYPE]; + struct hlist_head progs[MAX_CGROUP_BPF_ATTACH_TYPE]; + u8 flags[MAX_CGROUP_BPF_ATTACH_TYPE]; /* list of cgroup shared storages */ struct list_head storages; diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 669d96d074ad..6673acfbf2ef 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -95,7 +95,7 @@ struct bpf_cgroup_link { }; struct bpf_prog_list { - struct list_head node; + struct hlist_node node; struct bpf_prog *prog; struct bpf_cgroup_link *link; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 7a394f7c205c..4adb4f3ecb7f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -157,11 +157,12 @@ static void cgroup_bpf_release(struct work_struct *work) mutex_lock(&cgroup_mutex); for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) { - struct list_head *progs = &cgrp->bpf.progs[atype]; - struct bpf_prog_list *pl, *pltmp; + struct hlist_head *progs = &cgrp->bpf.progs[atype]; + struct bpf_prog_list *pl; + struct hlist_node *pltmp; - list_for_each_entry_safe(pl, pltmp, progs, node) { - list_del(&pl->node); + hlist_for_each_entry_safe(pl, pltmp, progs, node) { + hlist_del(&pl->node); if (pl->prog) bpf_prog_put(pl->prog); if (pl->link) @@ -217,12 +218,12 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl) /* count number of elements in the list. * it's slow but the list cannot be long */ -static u32 prog_list_length(struct list_head *head) +static u32 prog_list_length(struct hlist_head *head) { struct bpf_prog_list *pl; u32 cnt = 0; - list_for_each_entry(pl, head, node) { + hlist_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; cnt++; @@ -291,7 +292,7 @@ static int compute_effective_progs(struct cgroup *cgrp, if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; - list_for_each_entry(pl, &p->bpf.progs[atype], node) { + hlist_for_each_entry(pl, &p->bpf.progs[atype], node) { if (!prog_list_prog(pl)) continue; @@ -342,7 +343,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) cgroup_bpf_get(p); for (i = 0; i < NR; i++) - INIT_LIST_HEAD(&cgrp->bpf.progs[i]); + INIT_HLIST_HEAD(&cgrp->bpf.progs[i]); INIT_LIST_HEAD(&cgrp->bpf.storages); @@ -418,7 +419,7 @@ cleanup: #define BPF_CGROUP_MAX_PROGS 64 -static struct bpf_prog_list *find_attach_entry(struct list_head *progs, +static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs, struct bpf_prog *prog, struct bpf_cgroup_link *link, struct bpf_prog *replace_prog, @@ -428,12 +429,12 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs, /* single-attach case */ if (!allow_multi) { - if (list_empty(progs)) + if (hlist_empty(progs)) return NULL; - return list_first_entry(progs, typeof(*pl), node); + return hlist_entry(progs->first, typeof(*pl), node); } - list_for_each_entry(pl, progs, node) { + hlist_for_each_entry(pl, progs, node) { if (prog && pl->prog == prog && prog != replace_prog) /* disallow attaching the same prog twice */ return ERR_PTR(-EINVAL); @@ -444,7 +445,7 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs, /* direct prog multi-attach w/ replacement case */ if (replace_prog) { - list_for_each_entry(pl, progs, node) { + hlist_for_each_entry(pl, progs, node) { if (pl->prog == replace_prog) /* a match found */ return pl; @@ -480,7 +481,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; enum cgroup_bpf_attach_type atype; struct bpf_prog_list *pl; - struct list_head *progs; + struct hlist_head *progs; int err; if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || @@ -503,7 +504,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, if (!hierarchy_allows_attach(cgrp, atype)) return -EPERM; - if (!list_empty(progs) && cgrp->bpf.flags[atype] != saved_flags) + if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags) /* Disallow attaching non-overridable on top * of existing overridable in this cgroup. * Disallow attaching multi-prog if overridable or none @@ -525,12 +526,22 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, if (pl) { old_prog = pl->prog; } else { + struct hlist_node *last = NULL; + pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { bpf_cgroup_storages_free(new_storage); return -ENOMEM; } - list_add_tail(&pl->node, progs); + if (hlist_empty(progs)) + hlist_add_head(&pl->node, progs); + else + hlist_for_each(last, progs) { + if (last->next) + continue; + hlist_add_behind(&pl->node, last); + break; + } } pl->prog = prog; @@ -556,7 +567,7 @@ cleanup: } bpf_cgroup_storages_free(new_storage); if (!old_prog) { - list_del(&pl->node); + hlist_del(&pl->node); kfree(pl); } return err; @@ -587,7 +598,7 @@ static void replace_effective_prog(struct cgroup *cgrp, struct cgroup_subsys_state *css; struct bpf_prog_array *progs; struct bpf_prog_list *pl; - struct list_head *head; + struct hlist_head *head; struct cgroup *cg; int pos; @@ -603,7 +614,7 @@ static void replace_effective_prog(struct cgroup *cgrp, continue; head = &cg->bpf.progs[atype]; - list_for_each_entry(pl, head, node) { + hlist_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; if (pl->link == link) @@ -637,7 +648,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; struct bpf_prog_list *pl; - struct list_head *progs; + struct hlist_head *progs; bool found = false; atype = to_cgroup_bpf_attach_type(link->type); @@ -649,7 +660,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, if (link->link.prog->type != new_prog->type) return -EINVAL; - list_for_each_entry(pl, progs, node) { + hlist_for_each_entry(pl, progs, node) { if (pl->link == link) { found = true; break; @@ -688,7 +699,7 @@ out_unlock: return ret; } -static struct bpf_prog_list *find_detach_entry(struct list_head *progs, +static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs, struct bpf_prog *prog, struct bpf_cgroup_link *link, bool allow_multi) @@ -696,14 +707,14 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs, struct bpf_prog_list *pl; if (!allow_multi) { - if (list_empty(progs)) + if (hlist_empty(progs)) /* report error when trying to detach and nothing is attached */ return ERR_PTR(-ENOENT); /* to maintain backward compatibility NONE and OVERRIDE cgroups * allow detaching with invalid FD (prog==NULL) in legacy mode */ - return list_first_entry(progs, typeof(*pl), node); + return hlist_entry(progs->first, typeof(*pl), node); } if (!prog && !link) @@ -713,7 +724,7 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs, return ERR_PTR(-EINVAL); /* find the prog or link and detach it */ - list_for_each_entry(pl, progs, node) { + hlist_for_each_entry(pl, progs, node) { if (pl->prog == prog && pl->link == link) return pl; } @@ -737,7 +748,7 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog, struct cgroup_subsys_state *css; struct bpf_prog_array *progs; struct bpf_prog_list *pl; - struct list_head *head; + struct hlist_head *head; struct cgroup *cg; int pos; @@ -754,7 +765,7 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog, continue; head = &cg->bpf.progs[atype]; - list_for_each_entry(pl, head, node) { + hlist_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; if (pl->prog == prog && pl->link == link) @@ -791,7 +802,7 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; struct bpf_prog_list *pl; - struct list_head *progs; + struct hlist_head *progs; u32 flags; atype = to_cgroup_bpf_attach_type(type); @@ -822,9 +833,10 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, } /* now can actually delete it from this cgroup list */ - list_del(&pl->node); + hlist_del(&pl->node); + kfree(pl); - if (list_empty(progs)) + if (hlist_empty(progs)) /* last program was detached, reset flags to zero */ cgrp->bpf.flags[atype] = 0; if (old_prog) @@ -852,7 +864,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, enum bpf_attach_type type = attr->query.attach_type; enum cgroup_bpf_attach_type atype; struct bpf_prog_array *effective; - struct list_head *progs; + struct hlist_head *progs; struct bpf_prog *prog; int cnt, ret = 0, i; u32 flags; @@ -891,7 +903,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, u32 id; i = 0; - list_for_each_entry(pl, progs, node) { + hlist_for_each_entry(pl, progs, node) { prog = prog_list_prog(pl); id = prog->aux->id; if (copy_to_user(prog_ids + i, &id, sizeof(id))) From 69fd337a975c7e690dfe49d9cb4fe5ba1e6db44e Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:06 -0700 Subject: [PATCH 47/94] bpf: per-cgroup lsm flavor Allow attaching to lsm hooks in the cgroup context. Attaching to per-cgroup LSM works exactly like attaching to other per-cgroup hooks. New BPF_LSM_CGROUP is added to trigger new mode; the actual lsm hook we attach to is signaled via existing attach_btf_id. For the hooks that have 'struct socket' or 'struct sock' as its first argument, we use the cgroup associated with that socket. For the rest, we use 'current' cgroup (this is all on default hierarchy == v2 only). Note that for some hooks that work on 'struct sock' we still take the cgroup from 'current' because some of them work on the socket that hasn't been properly initialized yet. Behind the scenes, we allocate a shim program that is attached to the trampoline and runs cgroup effective BPF programs array. This shim has some rudimentary ref counting and can be shared between several programs attaching to the same lsm hook from different cgroups. Note that this patch bloats cgroup size because we add 211 cgroup_bpf_attach_type(s) for simplicity sake. This will be addressed in the subsequent patch. Also note that we only add non-sleepable flavor for now. To enable sleepable use-cases, bpf_prog_run_array_cg has to grab trace rcu, shim programs have to be freed via trace rcu, cgroup_bpf.effective should be also trace-rcu-managed + maybe some other changes that I'm not aware of. Reviewed-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-4-sdf@google.com Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 24 ++-- include/linux/bpf-cgroup-defs.h | 8 ++ include/linux/bpf-cgroup.h | 7 ++ include/linux/bpf.h | 24 ++++ include/linux/bpf_lsm.h | 13 +++ include/linux/btf_ids.h | 3 +- include/uapi/linux/bpf.h | 1 + kernel/bpf/bpf_lsm.c | 48 ++++++++ kernel/bpf/btf.c | 11 ++ kernel/bpf/cgroup.c | 136 ++++++++++++++++++++-- kernel/bpf/core.c | 2 + kernel/bpf/syscall.c | 10 ++ kernel/bpf/trampoline.c | 198 ++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 32 ++++++ tools/include/uapi/linux/bpf.h | 1 + 15 files changed, 498 insertions(+), 20 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 2c51ca9f7cec..2f460c67f9c7 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1770,6 +1770,10 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, struct bpf_tramp_link *l, int stack_size, int run_ctx_off, bool save_ret) { + void (*exit)(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) = __bpf_prog_exit; + u64 (*enter)(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) = __bpf_prog_enter; u8 *prog = *pprog; u8 *jmp_insn; int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie); @@ -1788,15 +1792,21 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_1, -run_ctx_off + ctx_cookie_off); + if (p->aux->sleepable) { + enter = __bpf_prog_enter_sleepable; + exit = __bpf_prog_exit_sleepable; + } else if (p->expected_attach_type == BPF_LSM_CGROUP) { + enter = __bpf_prog_enter_lsm_cgroup; + exit = __bpf_prog_exit_lsm_cgroup; + } + /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); /* arg2: lea rsi, [rbp - ctx_cookie_off] */ EMIT4(0x48, 0x8D, 0x75, -run_ctx_off); - if (emit_call(&prog, - p->aux->sleepable ? __bpf_prog_enter_sleepable : - __bpf_prog_enter, prog)) - return -EINVAL; + if (emit_call(&prog, enter, prog)) + return -EINVAL; /* remember prog start time returned by __bpf_prog_enter */ emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); @@ -1840,10 +1850,8 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); /* arg3: lea rdx, [rbp - run_ctx_off] */ EMIT4(0x48, 0x8D, 0x55, -run_ctx_off); - if (emit_call(&prog, - p->aux->sleepable ? __bpf_prog_exit_sleepable : - __bpf_prog_exit, prog)) - return -EINVAL; + if (emit_call(&prog, exit, prog)) + return -EINVAL; *pprog = prog; return 0; diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h index 5d268e76d8e6..b99f8c3e37ea 100644 --- a/include/linux/bpf-cgroup-defs.h +++ b/include/linux/bpf-cgroup-defs.h @@ -10,6 +10,12 @@ struct bpf_prog_array; +#ifdef CONFIG_BPF_LSM +#define CGROUP_LSM_NUM 211 /* will be addressed in the next patch */ +#else +#define CGROUP_LSM_NUM 0 +#endif + enum cgroup_bpf_attach_type { CGROUP_BPF_ATTACH_TYPE_INVALID = -1, CGROUP_INET_INGRESS = 0, @@ -35,6 +41,8 @@ enum cgroup_bpf_attach_type { CGROUP_INET4_GETSOCKNAME, CGROUP_INET6_GETSOCKNAME, CGROUP_INET_SOCK_RELEASE, + CGROUP_LSM_START, + CGROUP_LSM_END = CGROUP_LSM_START + CGROUP_LSM_NUM - 1, MAX_CGROUP_BPF_ATTACH_TYPE }; diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 6673acfbf2ef..2bd1b5f8de9b 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -23,6 +23,13 @@ struct ctl_table; struct ctl_table_header; struct task_struct; +unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx, + const struct bpf_insn *insn); +unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx, + const struct bpf_insn *insn); +unsigned int __cgroup_bpf_run_lsm_current(const void *ctx, + const struct bpf_insn *insn); + #ifdef CONFIG_CGROUP_BPF #define CGROUP_ATYPE(type) \ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d547be9db75f..77cd613a00bd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -794,6 +794,10 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_ u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); +u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx); +void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); @@ -1060,6 +1064,7 @@ struct bpf_prog_aux { struct user_struct *user; u64 load_time; /* ns since boottime */ u32 verified_insns; + int cgroup_atype; /* enum cgroup_bpf_attach_type */ struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; char name[BPF_OBJ_NAME_LEN]; #ifdef CONFIG_SECURITY @@ -1167,6 +1172,11 @@ struct bpf_tramp_link { u64 cookie; }; +struct bpf_shim_tramp_link { + struct bpf_tramp_link link; + struct bpf_trampoline *trampoline; +}; + struct bpf_tracing_link { struct bpf_tramp_link link; enum bpf_attach_type attach_type; @@ -1245,6 +1255,9 @@ struct bpf_dummy_ops { int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); #endif +int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, + int cgroup_atype); +void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog); #else static inline const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) { @@ -1268,6 +1281,14 @@ static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, { return -EINVAL; } +static inline int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, + int cgroup_atype) +{ + return -EOPNOTSUPP; +} +static inline void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) +{ +} #endif struct bpf_array { @@ -2368,6 +2389,8 @@ extern const struct bpf_func_proto bpf_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_find_vma_proto; extern const struct bpf_func_proto bpf_loop_proto; extern const struct bpf_func_proto bpf_copy_from_user_task_proto; +extern const struct bpf_func_proto bpf_set_retval_proto; +extern const struct bpf_func_proto bpf_get_retval_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); @@ -2485,6 +2508,7 @@ int bpf_arch_text_invalidate(void *dst, size_t len); struct btf_id_set; bool btf_id_set_contains(const struct btf_id_set *set, u32 id); +int btf_id_set_index(const struct btf_id_set *set, u32 id); #define MAX_BPRINTF_VARARGS 12 diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 479c101546ad..61787a5f6af9 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -42,6 +42,9 @@ extern const struct bpf_func_proto bpf_inode_storage_get_proto; extern const struct bpf_func_proto bpf_inode_storage_delete_proto; void bpf_inode_storage_free(struct inode *inode); +int bpf_lsm_hook_idx(u32 btf_id); +void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func); + #else /* !CONFIG_BPF_LSM */ static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id) @@ -65,6 +68,16 @@ static inline void bpf_inode_storage_free(struct inode *inode) { } +static inline void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, + bpf_func_t *bpf_func) +{ +} + +static inline int bpf_lsm_hook_idx(u32 btf_id) +{ + return -EINVAL; +} + #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 335a19092368..252a4befeab1 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -179,7 +179,8 @@ extern struct btf_id_set name; BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, udp_sock) \ BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock) \ BTF_SOCK_TYPE(BTF_SOCK_TYPE_UNIX, unix_sock) \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_MPTCP, mptcp_sock) + BTF_SOCK_TYPE(BTF_SOCK_TYPE_MPTCP, mptcp_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCKET, socket) enum { #define BTF_SOCK_TYPE(name, str) name, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e81362891596..b7479898c879 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -998,6 +998,7 @@ enum bpf_attach_type { BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, BPF_PERF_EVENT, BPF_TRACE_KPROBE_MULTI, + BPF_LSM_CGROUP, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index c1351df9f7ee..0f72020bfdcf 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -16,6 +16,7 @@ #include #include #include +#include /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. @@ -35,6 +36,44 @@ BTF_SET_START(bpf_lsm_hooks) #undef LSM_HOOK BTF_SET_END(bpf_lsm_hooks) +/* List of LSM hooks that should operate on 'current' cgroup regardless + * of function signature. + */ +BTF_SET_START(bpf_lsm_current_hooks) +/* operate on freshly allocated sk without any cgroup association */ +BTF_ID(func, bpf_lsm_sk_alloc_security) +BTF_ID(func, bpf_lsm_sk_free_security) +BTF_SET_END(bpf_lsm_current_hooks) + +void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, + bpf_func_t *bpf_func) +{ + const struct btf_param *args; + + if (btf_type_vlen(prog->aux->attach_func_proto) < 1 || + btf_id_set_contains(&bpf_lsm_current_hooks, + prog->aux->attach_btf_id)) { + *bpf_func = __cgroup_bpf_run_lsm_current; + return; + } + + args = btf_params(prog->aux->attach_func_proto); + +#ifdef CONFIG_NET + if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCKET]) + *bpf_func = __cgroup_bpf_run_lsm_socket; + else if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCK]) + *bpf_func = __cgroup_bpf_run_lsm_sock; + else +#endif + *bpf_func = __cgroup_bpf_run_lsm_current; +} + +int bpf_lsm_hook_idx(u32 btf_id) +{ + return btf_id_set_index(&bpf_lsm_hooks, btf_id); +} + int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { @@ -158,6 +197,15 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL; case BPF_FUNC_get_attach_cookie: return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL; + case BPF_FUNC_get_local_storage: + return prog->expected_attach_type == BPF_LSM_CGROUP ? + &bpf_get_local_storage_proto : NULL; + case BPF_FUNC_set_retval: + return prog->expected_attach_type == BPF_LSM_CGROUP ? + &bpf_set_retval_proto : NULL; + case BPF_FUNC_get_retval: + return prog->expected_attach_type == BPF_LSM_CGROUP ? + &bpf_get_retval_proto : NULL; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2e2066d6af94..7c1fe422ed3f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5363,6 +5363,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (arg == nr_args) { switch (prog->expected_attach_type) { + case BPF_LSM_CGROUP: case BPF_LSM_MAC: case BPF_TRACE_FEXIT: /* When LSM programs are attached to void LSM hooks @@ -6842,6 +6843,16 @@ static int btf_id_cmp_func(const void *a, const void *b) return *pa - *pb; } +int btf_id_set_index(const struct btf_id_set *set, u32 id) +{ + const u32 *p; + + p = bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func); + if (!p) + return -1; + return p - set->ids; +} + bool btf_id_set_contains(const struct btf_id_set *set, u32 id) { return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4adb4f3ecb7f..9cf41dd4f96f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include @@ -61,6 +63,87 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, return run_ctx.retval; } +unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx, + const struct bpf_insn *insn) +{ + const struct bpf_prog *shim_prog; + struct sock *sk; + struct cgroup *cgrp; + int ret = 0; + u64 *args; + + args = (u64 *)ctx; + sk = (void *)(unsigned long)args[0]; + /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ + shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + if (likely(cgrp)) + ret = bpf_prog_run_array_cg(&cgrp->bpf, + shim_prog->aux->cgroup_atype, + ctx, bpf_prog_run, 0, NULL); + return ret; +} + +unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx, + const struct bpf_insn *insn) +{ + const struct bpf_prog *shim_prog; + struct socket *sock; + struct cgroup *cgrp; + int ret = 0; + u64 *args; + + args = (u64 *)ctx; + sock = (void *)(unsigned long)args[0]; + /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ + shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); + + cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data); + if (likely(cgrp)) + ret = bpf_prog_run_array_cg(&cgrp->bpf, + shim_prog->aux->cgroup_atype, + ctx, bpf_prog_run, 0, NULL); + return ret; +} + +unsigned int __cgroup_bpf_run_lsm_current(const void *ctx, + const struct bpf_insn *insn) +{ + const struct bpf_prog *shim_prog; + struct cgroup *cgrp; + int ret = 0; + + /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ + shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); + + /* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. */ + cgrp = task_dfl_cgroup(current); + if (likely(cgrp)) + ret = bpf_prog_run_array_cg(&cgrp->bpf, + shim_prog->aux->cgroup_atype, + ctx, bpf_prog_run, 0, NULL); + return ret; +} + +#ifdef CONFIG_BPF_LSM +static enum cgroup_bpf_attach_type +bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) +{ + if (attach_type != BPF_LSM_CGROUP) + return to_cgroup_bpf_attach_type(attach_type); + return CGROUP_LSM_START + bpf_lsm_hook_idx(attach_btf_id); +} +#else +static enum cgroup_bpf_attach_type +bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) +{ + if (attach_type != BPF_LSM_CGROUP) + return to_cgroup_bpf_attach_type(attach_type); + return -EOPNOTSUPP; +} +#endif /* CONFIG_BPF_LSM */ + void cgroup_bpf_offline(struct cgroup *cgrp) { cgroup_get(cgrp); @@ -163,10 +246,16 @@ static void cgroup_bpf_release(struct work_struct *work) hlist_for_each_entry_safe(pl, pltmp, progs, node) { hlist_del(&pl->node); - if (pl->prog) + if (pl->prog) { + if (pl->prog->expected_attach_type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(pl->prog); bpf_prog_put(pl->prog); - if (pl->link) + } + if (pl->link) { + if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog); bpf_cgroup_link_auto_detach(pl->link); + } kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key[atype]); } @@ -479,6 +568,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + struct bpf_prog *new_prog = prog ? : link->link.prog; enum cgroup_bpf_attach_type atype; struct bpf_prog_list *pl; struct hlist_head *progs; @@ -495,7 +585,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, /* replace_prog implies BPF_F_REPLACE, and vice versa */ return -EINVAL; - atype = to_cgroup_bpf_attach_type(type); + atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; @@ -549,17 +639,30 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, bpf_cgroup_storages_assign(pl->storage, storage); cgrp->bpf.flags[atype] = saved_flags; + if (type == BPF_LSM_CGROUP) { + err = bpf_trampoline_link_cgroup_shim(new_prog, atype); + if (err) + goto cleanup; + } + err = update_effective_progs(cgrp, atype); if (err) - goto cleanup; + goto cleanup_trampoline; - if (old_prog) + if (old_prog) { + if (type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(old_prog); bpf_prog_put(old_prog); - else + } else { static_branch_inc(&cgroup_bpf_enabled_key[atype]); + } bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; +cleanup_trampoline: + if (type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(new_prog); + cleanup: if (old_prog) { pl->prog = old_prog; @@ -651,7 +754,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, struct hlist_head *progs; bool found = false; - atype = to_cgroup_bpf_attach_type(link->type); + atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; @@ -803,9 +906,15 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog *old_prog; struct bpf_prog_list *pl; struct hlist_head *progs; + u32 attach_btf_id = 0; u32 flags; - atype = to_cgroup_bpf_attach_type(type); + if (prog) + attach_btf_id = prog->aux->attach_btf_id; + if (link) + attach_btf_id = link->link.prog->aux->attach_btf_id; + + atype = bpf_cgroup_atype_find(type, attach_btf_id); if (atype < 0) return -EINVAL; @@ -839,8 +948,11 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, if (hlist_empty(progs)) /* last program was detached, reset flags to zero */ cgrp->bpf.flags[atype] = 0; - if (old_prog) + if (old_prog) { + if (type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(old_prog); bpf_prog_put(old_prog); + } static_branch_dec(&cgroup_bpf_enabled_key[atype]); return 0; } @@ -999,6 +1111,8 @@ static void bpf_cgroup_link_release(struct bpf_link *link) WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, cg_link->type)); + if (cg_link->type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog); cg = cg_link->cgroup; cg_link->cgroup = NULL; @@ -1343,7 +1457,7 @@ BPF_CALL_0(bpf_get_retval) return ctx->retval; } -static const struct bpf_func_proto bpf_get_retval_proto = { +const struct bpf_func_proto bpf_get_retval_proto = { .func = bpf_get_retval, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1358,7 +1472,7 @@ BPF_CALL_1(bpf_set_retval, int, retval) return 0; } -static const struct bpf_func_proto bpf_set_retval_proto = { +const struct bpf_func_proto bpf_set_retval_proto = { .func = bpf_set_retval, .gpl_only = false, .ret_type = RET_INTEGER, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f023cb399e3f..4cc10b942a3c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2666,6 +2666,8 @@ const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_snprintf_btf_proto __weak; const struct bpf_func_proto bpf_seq_printf_btf_proto __weak; +const struct bpf_func_proto bpf_set_retval_proto __weak; +const struct bpf_func_proto bpf_get_retval_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7d5af5b99f0d..626b8f7d237b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3416,6 +3416,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_SK_LOOKUP; case BPF_XDP: return BPF_PROG_TYPE_XDP; + case BPF_LSM_CGROUP: + return BPF_PROG_TYPE_LSM; default: return BPF_PROG_TYPE_UNSPEC; } @@ -3469,6 +3471,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_LSM: + if (ptype == BPF_PROG_TYPE_LSM && + prog->expected_attach_type != BPF_LSM_CGROUP) + return -EINVAL; + ret = cgroup_bpf_prog_attach(attr, ptype, prog); break; default: @@ -3506,6 +3513,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_LSM: return cgroup_bpf_prog_detach(attr, ptype); default: return -EINVAL; @@ -4540,6 +4548,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_raw_tp_link_attach(prog, NULL); else if (prog->expected_attach_type == BPF_TRACE_ITER) ret = bpf_iter_link_attach(attr, uattr, prog); + else if (prog->expected_attach_type == BPF_LSM_CGROUP) + ret = cgroup_bpf_link_attach(attr, prog); else ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 5466e15be61f..d7c251d7fbcd 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include /* dummy _ops. The verifier will operate on target program's ops. */ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -496,6 +498,177 @@ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampolin return err; } +#if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) +static void bpf_shim_tramp_link_release(struct bpf_link *link) +{ + struct bpf_shim_tramp_link *shim_link = + container_of(link, struct bpf_shim_tramp_link, link.link); + + /* paired with 'shim_link->trampoline = tr' in bpf_trampoline_link_cgroup_shim */ + if (!shim_link->trampoline) + return; + + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline)); + bpf_trampoline_put(shim_link->trampoline); +} + +static void bpf_shim_tramp_link_dealloc(struct bpf_link *link) +{ + struct bpf_shim_tramp_link *shim_link = + container_of(link, struct bpf_shim_tramp_link, link.link); + + kfree(shim_link); +} + +static const struct bpf_link_ops bpf_shim_tramp_link_lops = { + .release = bpf_shim_tramp_link_release, + .dealloc = bpf_shim_tramp_link_dealloc, +}; + +static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog, + bpf_func_t bpf_func, + int cgroup_atype) +{ + struct bpf_shim_tramp_link *shim_link = NULL; + struct bpf_prog *p; + + shim_link = kzalloc(sizeof(*shim_link), GFP_USER); + if (!shim_link) + return NULL; + + p = bpf_prog_alloc(1, 0); + if (!p) { + kfree(shim_link); + return NULL; + } + + p->jited = false; + p->bpf_func = bpf_func; + + p->aux->cgroup_atype = cgroup_atype; + p->aux->attach_func_proto = prog->aux->attach_func_proto; + p->aux->attach_btf_id = prog->aux->attach_btf_id; + p->aux->attach_btf = prog->aux->attach_btf; + btf_get(p->aux->attach_btf); + p->type = BPF_PROG_TYPE_LSM; + p->expected_attach_type = BPF_LSM_MAC; + bpf_prog_inc(p); + bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC, + &bpf_shim_tramp_link_lops, p); + + return shim_link; +} + +static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr, + bpf_func_t bpf_func) +{ + struct bpf_tramp_link *link; + int kind; + + for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { + hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { + struct bpf_prog *p = link->link.prog; + + if (p->bpf_func == bpf_func) + return container_of(link, struct bpf_shim_tramp_link, link); + } + } + + return NULL; +} + +int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, + int cgroup_atype) +{ + struct bpf_shim_tramp_link *shim_link = NULL; + struct bpf_attach_target_info tgt_info = {}; + struct bpf_trampoline *tr; + bpf_func_t bpf_func; + u64 key; + int err; + + err = bpf_check_attach_target(NULL, prog, NULL, + prog->aux->attach_btf_id, + &tgt_info); + if (err) + return err; + + key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, + prog->aux->attach_btf_id); + + bpf_lsm_find_cgroup_shim(prog, &bpf_func); + tr = bpf_trampoline_get(key, &tgt_info); + if (!tr) + return -ENOMEM; + + mutex_lock(&tr->mutex); + + shim_link = cgroup_shim_find(tr, bpf_func); + if (shim_link) { + /* Reusing existing shim attached by the other program. */ + bpf_link_inc(&shim_link->link.link); + + mutex_unlock(&tr->mutex); + bpf_trampoline_put(tr); /* bpf_trampoline_get above */ + return 0; + } + + /* Allocate and install new shim. */ + + shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype); + if (!shim_link) { + err = -ENOMEM; + goto err; + } + + err = __bpf_trampoline_link_prog(&shim_link->link, tr); + if (err) + goto err; + + shim_link->trampoline = tr; + /* note, we're still holding tr refcnt from above */ + + mutex_unlock(&tr->mutex); + + return 0; +err: + mutex_unlock(&tr->mutex); + + if (shim_link) + bpf_link_put(&shim_link->link.link); + + /* have to release tr while _not_ holding its mutex */ + bpf_trampoline_put(tr); /* bpf_trampoline_get above */ + + return err; +} + +void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) +{ + struct bpf_shim_tramp_link *shim_link = NULL; + struct bpf_trampoline *tr; + bpf_func_t bpf_func; + u64 key; + + key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, + prog->aux->attach_btf_id); + + bpf_lsm_find_cgroup_shim(prog, &bpf_func); + tr = bpf_trampoline_lookup(key); + if (WARN_ON_ONCE(!tr)) + return; + + mutex_lock(&tr->mutex); + shim_link = cgroup_shim_find(tr, bpf_func); + mutex_unlock(&tr->mutex); + + if (shim_link) + bpf_link_put(&shim_link->link.link); + + bpf_trampoline_put(tr); /* bpf_trampoline_lookup above */ +} +#endif + struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info) { @@ -628,6 +801,31 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_ rcu_read_unlock(); } +u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) + __acquires(RCU) +{ + /* Runtime stats are exported via actual BPF_LSM_CGROUP + * programs, not the shims. + */ + rcu_read_lock(); + migrate_disable(); + + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + + return NO_START_TIME; +} + +void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) + __releases(RCU) +{ + bpf_reset_run_ctx(run_ctx->saved_run_ctx); + + migrate_enable(); + rcu_read_unlock(); +} + u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) { rcu_read_lock_trace(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4938477912cd..df3ec6b05f05 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7322,6 +7322,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn reg_type_str(env, regs[BPF_REG_1].type)); return -EACCES; } + break; + case BPF_FUNC_set_retval: + if (env->prog->expected_attach_type == BPF_LSM_CGROUP) { + if (!env->prog->aux->attach_func_proto->type) { + /* Make sure programs that attach to void + * hooks don't try to modify return value. + */ + verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); + return -EINVAL; + } + } + break; } if (err) @@ -10527,6 +10539,22 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_SK_LOOKUP: range = tnum_range(SK_DROP, SK_PASS); break; + + case BPF_PROG_TYPE_LSM: + if (env->prog->expected_attach_type != BPF_LSM_CGROUP) { + /* Regular BPF_PROG_TYPE_LSM programs can return + * any value. + */ + return 0; + } + if (!env->prog->aux->attach_func_proto->type) { + /* Make sure programs that attach to void + * hooks don't try to modify return value. + */ + range = tnum_range(1, 1); + } + break; + case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value * depends on the to-be-replaced kernel func or bpf program. @@ -10543,6 +10571,9 @@ static int check_return_code(struct bpf_verifier_env *env) if (!tnum_in(range, reg->var_off)) { verbose_invalid_scalar(env, reg, &range, "program exit", "R0"); + if (prog->expected_attach_type == BPF_LSM_CGROUP && + !prog->aux->attach_func_proto->type) + verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); return -EINVAL; } @@ -14902,6 +14933,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, fallthrough; case BPF_MODIFY_RETURN: case BPF_LSM_MAC: + case BPF_LSM_CGROUP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: if (!btf_type_is_func(t)) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e81362891596..b7479898c879 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -998,6 +998,7 @@ enum bpf_attach_type { BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, BPF_PERF_EVENT, BPF_TRACE_KPROBE_MULTI, + BPF_LSM_CGROUP, __MAX_BPF_ATTACH_TYPE }; From c0e19f2c9a3edd38e4b1bdae98eb44555d02bc31 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:07 -0700 Subject: [PATCH 48/94] bpf: minimize number of allocated lsm slots per program Previous patch adds 1:1 mapping between all 211 LSM hooks and bpf_cgroup program array. Instead of reserving a slot per possible hook, reserve 10 slots per cgroup for lsm programs. Those slots are dynamically allocated on demand and reclaimed. struct cgroup_bpf { struct bpf_prog_array * effective[33]; /* 0 264 */ /* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */ struct hlist_head progs[33]; /* 264 264 */ /* --- cacheline 8 boundary (512 bytes) was 16 bytes ago --- */ u8 flags[33]; /* 528 33 */ /* XXX 7 bytes hole, try to pack */ struct list_head storages; /* 568 16 */ /* --- cacheline 9 boundary (576 bytes) was 8 bytes ago --- */ struct bpf_prog_array * inactive; /* 584 8 */ struct percpu_ref refcnt; /* 592 16 */ struct work_struct release_work; /* 608 72 */ /* size: 680, cachelines: 11, members: 7 */ /* sum members: 673, holes: 1, sum holes: 7 */ /* last cacheline: 40 bytes */ }; Reviewed-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-5-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup-defs.h | 3 ++- include/linux/bpf.h | 9 ++++++- include/linux/bpf_lsm.h | 6 ----- kernel/bpf/bpf_lsm.c | 5 ---- kernel/bpf/btf.c | 10 ------- kernel/bpf/cgroup.c | 47 ++++++++++++++++++++++++++++++++- kernel/bpf/core.c | 7 +++++ kernel/bpf/trampoline.c | 1 + 8 files changed, 64 insertions(+), 24 deletions(-) diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h index b99f8c3e37ea..7b121bd780eb 100644 --- a/include/linux/bpf-cgroup-defs.h +++ b/include/linux/bpf-cgroup-defs.h @@ -11,7 +11,8 @@ struct bpf_prog_array; #ifdef CONFIG_BPF_LSM -#define CGROUP_LSM_NUM 211 /* will be addressed in the next patch */ +/* Maximum number of concurrently attachable per-cgroup LSM hooks. */ +#define CGROUP_LSM_NUM 10 #else #define CGROUP_LSM_NUM 0 #endif diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 77cd613a00bd..5d2afa55c7c3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2508,7 +2508,6 @@ int bpf_arch_text_invalidate(void *dst, size_t len); struct btf_id_set; bool btf_id_set_contains(const struct btf_id_set *set, u32 id); -int btf_id_set_index(const struct btf_id_set *set, u32 id); #define MAX_BPRINTF_VARARGS 12 @@ -2545,4 +2544,12 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); int bpf_dynptr_check_size(u32 size); +#ifdef CONFIG_BPF_LSM +void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); +void bpf_cgroup_atype_put(int cgroup_atype); +#else +static inline void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) {} +static inline void bpf_cgroup_atype_put(int cgroup_atype) {} +#endif /* CONFIG_BPF_LSM */ + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 61787a5f6af9..4bcf76a9bb06 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -42,7 +42,6 @@ extern const struct bpf_func_proto bpf_inode_storage_get_proto; extern const struct bpf_func_proto bpf_inode_storage_delete_proto; void bpf_inode_storage_free(struct inode *inode); -int bpf_lsm_hook_idx(u32 btf_id); void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func); #else /* !CONFIG_BPF_LSM */ @@ -73,11 +72,6 @@ static inline void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, { } -static inline int bpf_lsm_hook_idx(u32 btf_id) -{ - return -EINVAL; -} - #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 0f72020bfdcf..83aa431dd52e 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -69,11 +69,6 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, *bpf_func = __cgroup_bpf_run_lsm_current; } -int bpf_lsm_hook_idx(u32 btf_id) -{ - return btf_id_set_index(&bpf_lsm_hooks, btf_id); -} - int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7c1fe422ed3f..8d3c7ab8af46 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6843,16 +6843,6 @@ static int btf_id_cmp_func(const void *a, const void *b) return *pa - *pb; } -int btf_id_set_index(const struct btf_id_set *set, u32 id) -{ - const u32 *p; - - p = bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func); - if (!p) - return -1; - return p - set->ids; -} - bool btf_id_set_contains(const struct btf_id_set *set, u32 id) { return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9cf41dd4f96f..169cbd0de797 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -127,12 +127,57 @@ unsigned int __cgroup_bpf_run_lsm_current(const void *ctx, } #ifdef CONFIG_BPF_LSM +struct cgroup_lsm_atype { + u32 attach_btf_id; + int refcnt; +}; + +static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM]; + static enum cgroup_bpf_attach_type bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) { + int i; + + lockdep_assert_held(&cgroup_mutex); + if (attach_type != BPF_LSM_CGROUP) return to_cgroup_bpf_attach_type(attach_type); - return CGROUP_LSM_START + bpf_lsm_hook_idx(attach_btf_id); + + for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++) + if (cgroup_lsm_atype[i].attach_btf_id == attach_btf_id) + return CGROUP_LSM_START + i; + + for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++) + if (cgroup_lsm_atype[i].attach_btf_id == 0) + return CGROUP_LSM_START + i; + + return -E2BIG; + +} + +void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) +{ + int i = cgroup_atype - CGROUP_LSM_START; + + lockdep_assert_held(&cgroup_mutex); + + WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id && + cgroup_lsm_atype[i].attach_btf_id != attach_btf_id); + + cgroup_lsm_atype[i].attach_btf_id = attach_btf_id; + cgroup_lsm_atype[i].refcnt++; +} + +void bpf_cgroup_atype_put(int cgroup_atype) +{ + int i = cgroup_atype - CGROUP_LSM_START; + + mutex_lock(&cgroup_mutex); + if (--cgroup_lsm_atype[i].refcnt <= 0) + cgroup_lsm_atype[i].attach_btf_id = 0; + WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0); + mutex_unlock(&cgroup_mutex); } #else static enum cgroup_bpf_attach_type diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4cc10b942a3c..805c2ad5c793 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -107,6 +107,9 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); fp->blinding_requested = bpf_jit_blinding_enabled(fp); +#ifdef CONFIG_CGROUP_BPF + aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID; +#endif INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); mutex_init(&fp->aux->used_maps_mutex); @@ -2569,6 +2572,10 @@ static void bpf_prog_free_deferred(struct work_struct *work) aux = container_of(work, struct bpf_prog_aux, work); #ifdef CONFIG_BPF_SYSCALL bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab); +#endif +#ifdef CONFIG_CGROUP_BPF + if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID) + bpf_cgroup_atype_put(aux->cgroup_atype); #endif bpf_free_used_maps(aux); bpf_free_used_btfs(aux); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index d7c251d7fbcd..6cd226584c33 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -555,6 +555,7 @@ static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog bpf_prog_inc(p); bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC, &bpf_shim_tramp_link_lops, p); + bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype); return shim_link; } From b79c9fc9551b45953a94abf550b7bd3b00e3a0f9 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:08 -0700 Subject: [PATCH 49/94] bpf: implement BPF_PROG_QUERY for BPF_LSM_CGROUP We have two options: 1. Treat all BPF_LSM_CGROUP the same, regardless of attach_btf_id 2. Treat BPF_LSM_CGROUP+attach_btf_id as a separate hook point I was doing (2) in the original patch, but switching to (1) here: * bpf_prog_query returns all attached BPF_LSM_CGROUP programs regardless of attach_btf_id * attach_btf_id is exported via bpf_prog_info Reviewed-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-6-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 3 ++ kernel/bpf/cgroup.c | 95 +++++++++++++++++++++++++++------------- kernel/bpf/syscall.c | 8 +++- 3 files changed, 74 insertions(+), 32 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b7479898c879..ad9e7311c4cf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1432,6 +1432,7 @@ union bpf_attr { __u32 attach_flags; __aligned_u64 prog_ids; __u32 prog_cnt; + __aligned_u64 prog_attach_flags; /* output: per-program attach_flags */ } query; struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ @@ -6076,6 +6077,8 @@ struct bpf_prog_info { __u64 run_cnt; __u64 recursion_misses; __u32 verified_insns; + __u32 attach_btf_obj_id; + __u32 attach_btf_id; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 169cbd0de797..59b7eb60d5b4 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1017,57 +1017,90 @@ static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr) { + __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags); __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); enum bpf_attach_type type = attr->query.attach_type; + enum cgroup_bpf_attach_type from_atype, to_atype; enum cgroup_bpf_attach_type atype; struct bpf_prog_array *effective; - struct hlist_head *progs; - struct bpf_prog *prog; int cnt, ret = 0, i; + int total_cnt = 0; u32 flags; - atype = to_cgroup_bpf_attach_type(type); - if (atype < 0) - return -EINVAL; + if (type == BPF_LSM_CGROUP) { + if (attr->query.prog_cnt && prog_ids && !prog_attach_flags) + return -EINVAL; - progs = &cgrp->bpf.progs[atype]; - flags = cgrp->bpf.flags[atype]; + from_atype = CGROUP_LSM_START; + to_atype = CGROUP_LSM_END; + flags = 0; + } else { + from_atype = to_cgroup_bpf_attach_type(type); + if (from_atype < 0) + return -EINVAL; + to_atype = from_atype; + flags = cgrp->bpf.flags[from_atype]; + } - effective = rcu_dereference_protected(cgrp->bpf.effective[atype], - lockdep_is_held(&cgroup_mutex)); - - if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) - cnt = bpf_prog_array_length(effective); - else - cnt = prog_list_length(progs); + for (atype = from_atype; atype <= to_atype; atype++) { + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + effective = rcu_dereference_protected(cgrp->bpf.effective[atype], + lockdep_is_held(&cgroup_mutex)); + total_cnt += bpf_prog_array_length(effective); + } else { + total_cnt += prog_list_length(&cgrp->bpf.progs[atype]); + } + } if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) return -EFAULT; - if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) + if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt))) return -EFAULT; - if (attr->query.prog_cnt == 0 || !prog_ids || !cnt) + if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt) /* return early if user requested only program count + flags */ return 0; - if (attr->query.prog_cnt < cnt) { - cnt = attr->query.prog_cnt; + + if (attr->query.prog_cnt < total_cnt) { + total_cnt = attr->query.prog_cnt; ret = -ENOSPC; } - if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { - return bpf_prog_array_copy_to_user(effective, prog_ids, cnt); - } else { - struct bpf_prog_list *pl; - u32 id; + for (atype = from_atype; atype <= to_atype && total_cnt; atype++) { + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + effective = rcu_dereference_protected(cgrp->bpf.effective[atype], + lockdep_is_held(&cgroup_mutex)); + cnt = min_t(int, bpf_prog_array_length(effective), total_cnt); + ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt); + } else { + struct hlist_head *progs; + struct bpf_prog_list *pl; + struct bpf_prog *prog; + u32 id; - i = 0; - hlist_for_each_entry(pl, progs, node) { - prog = prog_list_prog(pl); - id = prog->aux->id; - if (copy_to_user(prog_ids + i, &id, sizeof(id))) - return -EFAULT; - if (++i == cnt) - break; + progs = &cgrp->bpf.progs[atype]; + cnt = min_t(int, prog_list_length(progs), total_cnt); + i = 0; + hlist_for_each_entry(pl, progs, node) { + prog = prog_list_prog(pl); + id = prog->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) + return -EFAULT; + if (++i == cnt) + break; + } } + + if (prog_attach_flags) { + flags = cgrp->bpf.flags[atype]; + + for (i = 0; i < cnt; i++) + if (copy_to_user(prog_attach_flags + i, &flags, sizeof(flags))) + return -EFAULT; + prog_attach_flags += cnt; + } + + prog_ids += cnt; + total_cnt -= cnt; } return ret; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 626b8f7d237b..ab688d85b2c6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3520,7 +3520,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) } } -#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt +#define BPF_PROG_QUERY_LAST_FIELD query.prog_attach_flags static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -3556,6 +3556,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_SYSCTL: case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: + case BPF_LSM_CGROUP: return cgroup_bpf_prog_query(attr, uattr); case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); @@ -4066,6 +4067,11 @@ static int bpf_prog_get_info_by_fd(struct file *file, if (prog->aux->btf) info.btf_id = btf_obj_id(prog->aux->btf); + info.attach_btf_id = prog->aux->attach_btf_id; + if (prog->aux->attach_btf) + info.attach_btf_obj_id = btf_obj_id(prog->aux->attach_btf); + else if (prog->aux->dst_prog) + info.attach_btf_obj_id = btf_obj_id(prog->aux->dst_prog->aux->attach_btf); ulen = info.nr_func_info; info.nr_func_info = prog->aux->func_info_cnt; From 9113d7e48e9128522b9f5a54dfd30dff10509a92 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:09 -0700 Subject: [PATCH 50/94] bpf: expose bpf_{g,s}etsockopt to lsm cgroup I don't see how to make it nice without introducing btf id lists for the hooks where these helpers are allowed. Some LSM hooks work on the locked sockets, some are triggering early and don't grab any locks, so have two lists for now: 1. LSM hooks which trigger under socket lock - minority of the hooks, but ideal case for us, we can expose existing BTF-based helpers 2. LSM hooks which trigger without socket lock, but they trigger early in the socket creation path where it should be safe to do setsockopt without any locks 3. The rest are prohibited. I'm thinking that this use-case might be a good gateway to sleeping lsm cgroup hooks in the future. We can either expose lock/unlock operations (and add tracking to the verifier) or have another set of bpf_setsockopt wrapper that grab the locks and might sleep. Reviewed-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-7-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ kernel/bpf/bpf_lsm.c | 38 ++++++++++++++++++++++++++++ net/core/filter.c | 60 ++++++++++++++++++++++++++++++++++++++------ 3 files changed, 93 insertions(+), 7 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5d2afa55c7c3..2b21f2a3452f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2386,6 +2386,8 @@ extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_sk_getsockopt_proto; +extern const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto; +extern const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_find_vma_proto; extern const struct bpf_func_proto bpf_loop_proto; extern const struct bpf_func_proto bpf_copy_from_user_task_proto; diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 83aa431dd52e..d469b7f3deef 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -45,6 +45,24 @@ BTF_ID(func, bpf_lsm_sk_alloc_security) BTF_ID(func, bpf_lsm_sk_free_security) BTF_SET_END(bpf_lsm_current_hooks) +/* List of LSM hooks that trigger while the socket is properly locked. + */ +BTF_SET_START(bpf_lsm_locked_sockopt_hooks) +BTF_ID(func, bpf_lsm_socket_sock_rcv_skb) +BTF_ID(func, bpf_lsm_sock_graft) +BTF_ID(func, bpf_lsm_inet_csk_clone) +BTF_ID(func, bpf_lsm_inet_conn_established) +BTF_SET_END(bpf_lsm_locked_sockopt_hooks) + +/* List of LSM hooks that trigger while the socket is _not_ locked, + * but it's ok to call bpf_{g,s}etsockopt because the socket is still + * in the early init phase. + */ +BTF_SET_START(bpf_lsm_unlocked_sockopt_hooks) +BTF_ID(func, bpf_lsm_socket_post_create) +BTF_ID(func, bpf_lsm_socket_socketpair) +BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks) + void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func) { @@ -201,6 +219,26 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_retval: return prog->expected_attach_type == BPF_LSM_CGROUP ? &bpf_get_retval_proto : NULL; + case BPF_FUNC_setsockopt: + if (prog->expected_attach_type != BPF_LSM_CGROUP) + return NULL; + if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks, + prog->aux->attach_btf_id)) + return &bpf_sk_setsockopt_proto; + if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks, + prog->aux->attach_btf_id)) + return &bpf_unlocked_sk_setsockopt_proto; + return NULL; + case BPF_FUNC_getsockopt: + if (prog->expected_attach_type != BPF_LSM_CGROUP) + return NULL; + if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks, + prog->aux->attach_btf_id)) + return &bpf_sk_getsockopt_proto; + if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks, + prog->aux->attach_btf_id)) + return &bpf_unlocked_sk_getsockopt_proto; + return NULL; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/net/core/filter.c b/net/core/filter.c index 151aa4756bd6..c6941ab0eb52 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5012,8 +5012,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static int _bpf_setsockopt(struct sock *sk, int level, int optname, - char *optval, int optlen) +static int __bpf_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) { char devname[IFNAMSIZ]; int val, valbool; @@ -5024,8 +5024,6 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, if (!sk_fullsock(sk)) return -EINVAL; - sock_owned_by_me(sk); - if (level == SOL_SOCKET) { if (optlen != sizeof(int) && optname != SO_BINDTODEVICE) return -EINVAL; @@ -5258,14 +5256,20 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, return ret; } -static int _bpf_getsockopt(struct sock *sk, int level, int optname, +static int _bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) +{ + if (sk_fullsock(sk)) + sock_owned_by_me(sk); + return __bpf_setsockopt(sk, level, optname, optval, optlen); +} + +static int __bpf_getsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) { if (!sk_fullsock(sk)) goto err_clear; - sock_owned_by_me(sk); - if (level == SOL_SOCKET) { if (optlen != sizeof(int)) goto err_clear; @@ -5360,6 +5364,14 @@ err_clear: return -EINVAL; } +static int _bpf_getsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + if (sk_fullsock(sk)) + sock_owned_by_me(sk); + return __bpf_getsockopt(sk, level, optname, optval, optlen); +} + BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { @@ -5400,6 +5412,40 @@ const struct bpf_func_proto bpf_sk_getsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + return __bpf_setsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto = { + .func = bpf_unlocked_sk_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_unlocked_sk_getsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + return __bpf_getsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto = { + .func = bpf_unlocked_sk_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { From 3b34bcb946c2a8240ef6761be2ee404ceb7e1079 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:10 -0700 Subject: [PATCH 51/94] tools/bpf: Sync btf_ids.h to tools Has been slowly getting out of sync, let's update it. resolve_btfids usage has been updated to match the header changes. Also bring new parts of tools/include/uapi/linux/bpf.h. Acked-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-8-sdf@google.com Signed-off-by: Alexei Starovoitov --- tools/include/linux/btf_ids.h | 35 +++++++++++++++---- tools/include/uapi/linux/bpf.h | 3 ++ .../selftests/bpf/prog_tests/resolve_btfids.c | 2 +- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/tools/include/linux/btf_ids.h b/tools/include/linux/btf_ids.h index 57890b357f85..71e54b1e3796 100644 --- a/tools/include/linux/btf_ids.h +++ b/tools/include/linux/btf_ids.h @@ -73,7 +73,7 @@ asm( \ __BTF_ID_LIST(name, local) \ extern u32 name[]; -#define BTF_ID_LIST_GLOBAL(name) \ +#define BTF_ID_LIST_GLOBAL(name, n) \ __BTF_ID_LIST(name, globl) /* The BTF_ID_LIST_SINGLE macro defines a BTF_ID_LIST with @@ -82,6 +82,9 @@ __BTF_ID_LIST(name, globl) #define BTF_ID_LIST_SINGLE(name, prefix, typename) \ BTF_ID_LIST(name) \ BTF_ID(prefix, typename) +#define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) \ + BTF_ID_LIST_GLOBAL(name, 1) \ + BTF_ID(prefix, typename) /* * The BTF_ID_UNUSED macro defines 4 zero bytes. @@ -143,13 +146,14 @@ extern struct btf_id_set name; #else -#define BTF_ID_LIST(name) static u32 name[5]; +#define BTF_ID_LIST(name) static u32 __maybe_unused name[5]; #define BTF_ID(prefix, name) #define BTF_ID_UNUSED -#define BTF_ID_LIST_GLOBAL(name) u32 name[1]; -#define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 name[1]; -#define BTF_SET_START(name) static struct btf_id_set name = { 0 }; -#define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 }; +#define BTF_ID_LIST_GLOBAL(name, n) u32 __maybe_unused name[n]; +#define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 __maybe_unused name[1]; +#define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) u32 __maybe_unused name[1]; +#define BTF_SET_START(name) static struct btf_id_set __maybe_unused name = { 0 }; +#define BTF_SET_START_GLOBAL(name) static struct btf_id_set __maybe_unused name = { 0 }; #define BTF_SET_END(name) #endif /* CONFIG_DEBUG_INFO_BTF */ @@ -172,7 +176,10 @@ extern struct btf_id_set name; BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_TW, tcp_timewait_sock) \ BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, tcp6_sock) \ BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, udp_sock) \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock) + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UNIX, unix_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_MPTCP, mptcp_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCKET, socket) enum { #define BTF_SOCK_TYPE(name, str) name, @@ -184,4 +191,18 @@ MAX_BTF_SOCK_TYPE, extern u32 btf_sock_ids[]; #endif +#define BTF_TRACING_TYPE_xxx \ + BTF_TRACING_TYPE(BTF_TRACING_TYPE_TASK, task_struct) \ + BTF_TRACING_TYPE(BTF_TRACING_TYPE_FILE, file) \ + BTF_TRACING_TYPE(BTF_TRACING_TYPE_VMA, vm_area_struct) + +enum { +#define BTF_TRACING_TYPE(name, type) name, +BTF_TRACING_TYPE_xxx +#undef BTF_TRACING_TYPE +MAX_BTF_TRACING_TYPE, +}; + +extern u32 btf_tracing_ids[]; + #endif diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b7479898c879..ad9e7311c4cf 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1432,6 +1432,7 @@ union bpf_attr { __u32 attach_flags; __aligned_u64 prog_ids; __u32 prog_cnt; + __aligned_u64 prog_attach_flags; /* output: per-program attach_flags */ } query; struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ @@ -6076,6 +6077,8 @@ struct bpf_prog_info { __u64 run_cnt; __u64 recursion_misses; __u32 verified_insns; + __u32 attach_btf_obj_id; + __u32 attach_btf_id; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c index f4a13d9dd5c8..c197261d02e2 100644 --- a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c +++ b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c @@ -44,7 +44,7 @@ BTF_ID(union, U) BTF_ID(func, func) extern __u32 test_list_global[]; -BTF_ID_LIST_GLOBAL(test_list_global) +BTF_ID_LIST_GLOBAL(test_list_global, 1) BTF_ID_UNUSED BTF_ID(typedef, S) BTF_ID(typedef, T) From bffcf34878b1c16c322c6606e4a48d82bd5f7f24 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:11 -0700 Subject: [PATCH 52/94] libbpf: add lsm_cgoup_sock type lsm_cgroup/ is the prefix for BPF_LSM_CGROUP. Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-9-sdf@google.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e994797bcd48..8a45a84eb9b2 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -106,6 +106,7 @@ static const char * const attach_type_name[] = { [BPF_TRACE_FEXIT] = "trace_fexit", [BPF_MODIFY_RETURN] = "modify_return", [BPF_LSM_MAC] = "lsm_mac", + [BPF_LSM_CGROUP] = "lsm_cgroup", [BPF_SK_LOOKUP] = "sk_lookup", [BPF_TRACE_ITER] = "trace_iter", [BPF_XDP_DEVMAP] = "xdp_devmap", @@ -8414,6 +8415,7 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("freplace+", EXT, 0, SEC_ATTACH_BTF, attach_trace), SEC_DEF("lsm+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm), SEC_DEF("lsm.s+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_lsm), + SEC_DEF("lsm_cgroup+", LSM, BPF_LSM_CGROUP, SEC_ATTACH_BTF), SEC_DEF("iter+", TRACING, BPF_TRACE_ITER, SEC_ATTACH_BTF, attach_iter), SEC_DEF("iter.s+", TRACING, BPF_TRACE_ITER, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_iter), SEC_DEF("syscall", SYSCALL, 0, SEC_SLEEPABLE), @@ -8851,6 +8853,7 @@ void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type, *kind = BTF_KIND_TYPEDEF; break; case BPF_LSM_MAC: + case BPF_LSM_CGROUP: *prefix = BTF_LSM_PREFIX; *kind = BTF_KIND_FUNC; break; From a4b2f3cf699f8ec3964722e7ef3f37f809701172 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:12 -0700 Subject: [PATCH 53/94] libbpf: implement bpf_prog_query_opts Implement bpf_prog_query_opts as a more expendable version of bpf_prog_query. Expose new prog_attach_flags and attach_btf_func_id as well: * prog_attach_flags is a per-program attach_type; relevant only for lsm cgroup program which might have different attach_flags per attach_btf_id * attach_btf_func_id is a new field expose for prog_query which specifies real btf function id for lsm cgroup attachments Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-10-sdf@google.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.c | 38 +++++++++++++++++++++++++++++++------- tools/lib/bpf/bpf.h | 15 +++++++++++++++ tools/lib/bpf/libbpf.map | 1 + 3 files changed, 47 insertions(+), 7 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 6d848b02c1e0..5eb0df90eb2b 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -795,24 +795,48 @@ int bpf_iter_create(int link_fd) return libbpf_err_errno(fd); } -int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, - __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt) +int bpf_prog_query_opts(int target_fd, + enum bpf_attach_type type, + struct bpf_prog_query_opts *opts) { union bpf_attr attr; int ret; + if (!OPTS_VALID(opts, bpf_prog_query_opts)) + return libbpf_err(-EINVAL); + memset(&attr, 0, sizeof(attr)); + attr.query.target_fd = target_fd; attr.query.attach_type = type; - attr.query.query_flags = query_flags; - attr.query.prog_cnt = *prog_cnt; - attr.query.prog_ids = ptr_to_u64(prog_ids); + attr.query.query_flags = OPTS_GET(opts, query_flags, 0); + attr.query.prog_cnt = OPTS_GET(opts, prog_cnt, 0); + attr.query.prog_ids = ptr_to_u64(OPTS_GET(opts, prog_ids, NULL)); + attr.query.prog_attach_flags = ptr_to_u64(OPTS_GET(opts, prog_attach_flags, NULL)); ret = sys_bpf(BPF_PROG_QUERY, &attr, sizeof(attr)); + OPTS_SET(opts, attach_flags, attr.query.attach_flags); + OPTS_SET(opts, prog_cnt, attr.query.prog_cnt); + + return libbpf_err_errno(ret); +} + +int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, + __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt) +{ + LIBBPF_OPTS(bpf_prog_query_opts, opts); + int ret; + + opts.query_flags = query_flags; + opts.prog_ids = prog_ids; + opts.prog_cnt = *prog_cnt; + + ret = bpf_prog_query_opts(target_fd, type, &opts); + if (attach_flags) - *attach_flags = attr.query.attach_flags; - *prog_cnt = attr.query.prog_cnt; + *attach_flags = opts.attach_flags; + *prog_cnt = opts.prog_cnt; return libbpf_err_errno(ret); } diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index af7baf5da74d..88a7cc4bd76f 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -359,9 +359,24 @@ LIBBPF_API int bpf_map_get_fd_by_id(__u32 id); LIBBPF_API int bpf_btf_get_fd_by_id(__u32 id); LIBBPF_API int bpf_link_get_fd_by_id(__u32 id); LIBBPF_API int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len); + +struct bpf_prog_query_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 query_flags; + __u32 attach_flags; /* output argument */ + __u32 *prog_ids; + __u32 prog_cnt; /* input+output argument */ + __u32 *prog_attach_flags; +}; +#define bpf_prog_query_opts__last_field prog_attach_flags + +LIBBPF_API int bpf_prog_query_opts(int target_fd, + enum bpf_attach_type type, + struct bpf_prog_query_opts *opts); LIBBPF_API int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt); + LIBBPF_API int bpf_raw_tracepoint_open(const char *name, int prog_fd); LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 43c6697a6d27..94b589ecfeaa 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -355,6 +355,7 @@ LIBBPF_0.8.0 { LIBBPF_1.0.0 { global: + bpf_prog_query_opts; btf__add_enum64; btf__add_enum64_value; libbpf_bpf_attach_type_str; From 596f5fb2ea2a38032abb8043606ce7168f21f6ab Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:13 -0700 Subject: [PATCH 54/94] bpftool: implement cgroup tree for BPF_LSM_CGROUP $ bpftool --nomount prog loadall $KDIR/tools/testing/selftests/bpf/lsm_cgroup.o /sys/fs/bpf/x $ bpftool cgroup attach /sys/fs/cgroup lsm_cgroup pinned /sys/fs/bpf/x/socket_alloc $ bpftool cgroup attach /sys/fs/cgroup lsm_cgroup pinned /sys/fs/bpf/x/socket_bind $ bpftool cgroup attach /sys/fs/cgroup lsm_cgroup pinned /sys/fs/bpf/x/socket_clone $ bpftool cgroup attach /sys/fs/cgroup lsm_cgroup pinned /sys/fs/bpf/x/socket_post_create $ bpftool cgroup tree CgroupPath ID AttachType AttachFlags Name /sys/fs/cgroup 6 lsm_cgroup socket_post_create bpf_lsm_socket_post_create 8 lsm_cgroup socket_bind bpf_lsm_socket_bind 10 lsm_cgroup socket_alloc bpf_lsm_sk_alloc_security 11 lsm_cgroup socket_clone bpf_lsm_inet_csk_clone $ bpftool cgroup detach /sys/fs/cgroup lsm_cgroup pinned /sys/fs/bpf/x/socket_post_create $ bpftool cgroup tree CgroupPath ID AttachType AttachFlags Name /sys/fs/cgroup 8 lsm_cgroup socket_bind bpf_lsm_socket_bind 10 lsm_cgroup socket_alloc bpf_lsm_sk_alloc_security 11 lsm_cgroup socket_clone bpf_lsm_inet_csk_clone Reviewed-by: Quentin Monnet Acked-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-11-sdf@google.com Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/cgroup.c | 109 +++++++++++++++++++++++++++++-------- 1 file changed, 87 insertions(+), 22 deletions(-) diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c index 42421fe47a58..cced668fb2a3 100644 --- a/tools/bpf/bpftool/cgroup.c +++ b/tools/bpf/bpftool/cgroup.c @@ -15,6 +15,7 @@ #include #include +#include #include "main.h" @@ -36,6 +37,8 @@ " cgroup_inet_sock_release }" static unsigned int query_flags; +static struct btf *btf_vmlinux; +static __u32 btf_vmlinux_id; static enum bpf_attach_type parse_attach_type(const char *str) { @@ -64,11 +67,38 @@ static enum bpf_attach_type parse_attach_type(const char *str) return __MAX_BPF_ATTACH_TYPE; } +static void guess_vmlinux_btf_id(__u32 attach_btf_obj_id) +{ + struct bpf_btf_info btf_info = {}; + __u32 btf_len = sizeof(btf_info); + char name[16] = {}; + int err; + int fd; + + btf_info.name = ptr_to_u64(name); + btf_info.name_len = sizeof(name); + + fd = bpf_btf_get_fd_by_id(attach_btf_obj_id); + if (fd < 0) + return; + + err = bpf_obj_get_info_by_fd(fd, &btf_info, &btf_len); + if (err) + goto out; + + if (btf_info.kernel_btf && strncmp(name, "vmlinux", sizeof(name)) == 0) + btf_vmlinux_id = btf_info.id; + +out: + close(fd); +} + static int show_bpf_prog(int id, enum bpf_attach_type attach_type, const char *attach_flags_str, int level) { char prog_name[MAX_PROG_FULL_NAME]; + const char *attach_btf_name = NULL; struct bpf_prog_info info = {}; const char *attach_type_str; __u32 info_len = sizeof(info); @@ -84,6 +114,20 @@ static int show_bpf_prog(int id, enum bpf_attach_type attach_type, } attach_type_str = libbpf_bpf_attach_type_str(attach_type); + + if (btf_vmlinux) { + if (!btf_vmlinux_id) + guess_vmlinux_btf_id(info.attach_btf_obj_id); + + if (btf_vmlinux_id == info.attach_btf_obj_id && + info.attach_btf_id < btf__type_cnt(btf_vmlinux)) { + const struct btf_type *t = + btf__type_by_id(btf_vmlinux, info.attach_btf_id); + attach_btf_name = + btf__name_by_offset(btf_vmlinux, t->name_off); + } + } + get_prog_full_name(&info, prog_fd, prog_name, sizeof(prog_name)); if (json_output) { jsonw_start_object(json_wtr); @@ -95,6 +139,10 @@ static int show_bpf_prog(int id, enum bpf_attach_type attach_type, jsonw_string_field(json_wtr, "attach_flags", attach_flags_str); jsonw_string_field(json_wtr, "name", prog_name); + if (attach_btf_name) + jsonw_string_field(json_wtr, "attach_btf_name", attach_btf_name); + jsonw_uint_field(json_wtr, "attach_btf_obj_id", info.attach_btf_obj_id); + jsonw_uint_field(json_wtr, "attach_btf_id", info.attach_btf_id); jsonw_end_object(json_wtr); } else { printf("%s%-8u ", level ? " " : "", info.id); @@ -102,7 +150,13 @@ static int show_bpf_prog(int id, enum bpf_attach_type attach_type, printf("%-15s", attach_type_str); else printf("type %-10u", attach_type); - printf(" %-15s %-15s\n", attach_flags_str, prog_name); + printf(" %-15s %-15s", attach_flags_str, prog_name); + if (attach_btf_name) + printf(" %-15s", attach_btf_name); + else if (info.attach_btf_id) + printf(" attach_btf_obj_id=%d attach_btf_id=%d", + info.attach_btf_obj_id, info.attach_btf_id); + printf("\n"); } close(prog_fd); @@ -144,40 +198,49 @@ static int cgroup_has_attached_progs(int cgroup_fd) static int show_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type, int level) { + LIBBPF_OPTS(bpf_prog_query_opts, p); + __u32 prog_attach_flags[1024] = {0}; const char *attach_flags_str; __u32 prog_ids[1024] = {0}; - __u32 prog_cnt, iter; - __u32 attach_flags; char buf[32]; + __u32 iter; int ret; - prog_cnt = ARRAY_SIZE(prog_ids); - ret = bpf_prog_query(cgroup_fd, type, query_flags, &attach_flags, - prog_ids, &prog_cnt); + p.query_flags = query_flags; + p.prog_cnt = ARRAY_SIZE(prog_ids); + p.prog_ids = prog_ids; + p.prog_attach_flags = prog_attach_flags; + + ret = bpf_prog_query_opts(cgroup_fd, type, &p); if (ret) return ret; - if (prog_cnt == 0) + if (p.prog_cnt == 0) return 0; - switch (attach_flags) { - case BPF_F_ALLOW_MULTI: - attach_flags_str = "multi"; - break; - case BPF_F_ALLOW_OVERRIDE: - attach_flags_str = "override"; - break; - case 0: - attach_flags_str = ""; - break; - default: - snprintf(buf, sizeof(buf), "unknown(%x)", attach_flags); - attach_flags_str = buf; - } + for (iter = 0; iter < p.prog_cnt; iter++) { + __u32 attach_flags; + + attach_flags = prog_attach_flags[iter] ?: p.attach_flags; + + switch (attach_flags) { + case BPF_F_ALLOW_MULTI: + attach_flags_str = "multi"; + break; + case BPF_F_ALLOW_OVERRIDE: + attach_flags_str = "override"; + break; + case 0: + attach_flags_str = ""; + break; + default: + snprintf(buf, sizeof(buf), "unknown(%x)", attach_flags); + attach_flags_str = buf; + } - for (iter = 0; iter < prog_cnt; iter++) show_bpf_prog(prog_ids[iter], type, attach_flags_str, level); + } return 0; } @@ -233,6 +296,7 @@ static int do_show(int argc, char **argv) printf("%-8s %-15s %-15s %-15s\n", "ID", "AttachType", "AttachFlags", "Name"); + btf_vmlinux = libbpf_find_kernel_btf(); for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) { /* * Not all attach types may be supported, so it's expected, @@ -296,6 +360,7 @@ static int do_show_tree_fn(const char *fpath, const struct stat *sb, printf("%s\n", fpath); } + btf_vmlinux = libbpf_find_kernel_btf(); for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) show_attached_bpf_progs(cgroup_fd, type, ftw->level); From dca85aac8895708b74c7f2264e3ab5048c02b8b2 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:14 -0700 Subject: [PATCH 55/94] selftests/bpf: lsm_cgroup functional test Functional test that exercises the following: 1. apply default sk_priority policy 2. permit TX-only AF_PACKET socket 3. cgroup attach/detach/replace 4. reusing trampoline shim Signed-off-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220628174314.1216643-12-sdf@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/lsm_cgroup.c | 293 ++++++++++++++++++ .../selftests/bpf/progs/bpf_tracing_net.h | 1 + .../testing/selftests/bpf/progs/lsm_cgroup.c | 180 +++++++++++ 3 files changed, 474 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c create mode 100644 tools/testing/selftests/bpf/progs/lsm_cgroup.c diff --git a/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c b/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c new file mode 100644 index 000000000000..d40810a742fa --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c @@ -0,0 +1,293 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include + +#include "lsm_cgroup.skel.h" +#include "cgroup_helpers.h" +#include "network_helpers.h" + +static struct btf *btf; + +static __u32 query_prog_cnt(int cgroup_fd, const char *attach_func) +{ + LIBBPF_OPTS(bpf_prog_query_opts, p); + int cnt = 0; + int i; + + ASSERT_OK(bpf_prog_query_opts(cgroup_fd, BPF_LSM_CGROUP, &p), "prog_query"); + + if (!attach_func) + return p.prog_cnt; + + /* When attach_func is provided, count the number of progs that + * attach to the given symbol. + */ + + if (!btf) + btf = btf__load_vmlinux_btf(); + if (!ASSERT_OK(libbpf_get_error(btf), "btf_vmlinux")) + return -1; + + p.prog_ids = malloc(sizeof(u32) * p.prog_cnt); + p.prog_attach_flags = malloc(sizeof(u32) * p.prog_cnt); + ASSERT_OK(bpf_prog_query_opts(cgroup_fd, BPF_LSM_CGROUP, &p), "prog_query"); + + for (i = 0; i < p.prog_cnt; i++) { + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + int fd; + + fd = bpf_prog_get_fd_by_id(p.prog_ids[i]); + ASSERT_GE(fd, 0, "prog_get_fd_by_id"); + ASSERT_OK(bpf_obj_get_info_by_fd(fd, &info, &info_len), "prog_info_by_fd"); + close(fd); + + if (info.attach_btf_id == + btf__find_by_name_kind(btf, attach_func, BTF_KIND_FUNC)) + cnt++; + } + + free(p.prog_ids); + free(p.prog_attach_flags); + + return cnt; +} + +static void test_lsm_cgroup_functional(void) +{ + DECLARE_LIBBPF_OPTS(bpf_prog_attach_opts, attach_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + int cgroup_fd = -1, cgroup_fd2 = -1, cgroup_fd3 = -1; + int listen_fd, client_fd, accepted_fd; + struct lsm_cgroup *skel = NULL; + int post_create_prog_fd2 = -1; + int post_create_prog_fd = -1; + int bind_link_fd2 = -1; + int bind_prog_fd2 = -1; + int alloc_prog_fd = -1; + int bind_prog_fd = -1; + int bind_link_fd = -1; + int clone_prog_fd = -1; + int err, fd, prio; + socklen_t socklen; + + cgroup_fd3 = test__join_cgroup("/sock_policy_empty"); + if (!ASSERT_GE(cgroup_fd3, 0, "create empty cgroup")) + goto close_cgroup; + + cgroup_fd2 = test__join_cgroup("/sock_policy_reuse"); + if (!ASSERT_GE(cgroup_fd2, 0, "create cgroup for reuse")) + goto close_cgroup; + + cgroup_fd = test__join_cgroup("/sock_policy"); + if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup")) + goto close_cgroup; + + skel = lsm_cgroup__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + goto close_cgroup; + + post_create_prog_fd = bpf_program__fd(skel->progs.socket_post_create); + post_create_prog_fd2 = bpf_program__fd(skel->progs.socket_post_create2); + bind_prog_fd = bpf_program__fd(skel->progs.socket_bind); + bind_prog_fd2 = bpf_program__fd(skel->progs.socket_bind2); + alloc_prog_fd = bpf_program__fd(skel->progs.socket_alloc); + clone_prog_fd = bpf_program__fd(skel->progs.socket_clone); + + ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_sk_alloc_security"), 0, "prog count"); + ASSERT_EQ(query_prog_cnt(cgroup_fd, NULL), 0, "total prog count"); + err = bpf_prog_attach(alloc_prog_fd, cgroup_fd, BPF_LSM_CGROUP, 0); + if (!ASSERT_OK(err, "attach alloc_prog_fd")) + goto detach_cgroup; + ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_sk_alloc_security"), 1, "prog count"); + ASSERT_EQ(query_prog_cnt(cgroup_fd, NULL), 1, "total prog count"); + + ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_inet_csk_clone"), 0, "prog count"); + err = bpf_prog_attach(clone_prog_fd, cgroup_fd, BPF_LSM_CGROUP, 0); + if (!ASSERT_OK(err, "attach clone_prog_fd")) + goto detach_cgroup; + ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_inet_csk_clone"), 1, "prog count"); + ASSERT_EQ(query_prog_cnt(cgroup_fd, NULL), 2, "total prog count"); + + /* Make sure replacing works. */ + + ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_socket_post_create"), 0, "prog count"); + err = bpf_prog_attach(post_create_prog_fd, cgroup_fd, + BPF_LSM_CGROUP, 0); + if (!ASSERT_OK(err, "attach post_create_prog_fd")) + goto detach_cgroup; + ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_socket_post_create"), 1, "prog count"); + ASSERT_EQ(query_prog_cnt(cgroup_fd, NULL), 3, "total prog count"); + + attach_opts.replace_prog_fd = post_create_prog_fd; + err = bpf_prog_attach_opts(post_create_prog_fd2, cgroup_fd, + BPF_LSM_CGROUP, &attach_opts); + if (!ASSERT_OK(err, "prog replace post_create_prog_fd")) + goto detach_cgroup; + ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_socket_post_create"), 1, "prog count"); + ASSERT_EQ(query_prog_cnt(cgroup_fd, NULL), 3, "total prog count"); + + /* Try the same attach/replace via link API. */ + + ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_socket_bind"), 0, "prog count"); + bind_link_fd = bpf_link_create(bind_prog_fd, cgroup_fd, + BPF_LSM_CGROUP, NULL); + if (!ASSERT_GE(bind_link_fd, 0, "link create bind_prog_fd")) + goto detach_cgroup; + ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_socket_bind"), 1, "prog count"); + ASSERT_EQ(query_prog_cnt(cgroup_fd, NULL), 4, "total prog count"); + + update_opts.old_prog_fd = bind_prog_fd; + update_opts.flags = BPF_F_REPLACE; + + err = bpf_link_update(bind_link_fd, bind_prog_fd2, &update_opts); + if (!ASSERT_OK(err, "link update bind_prog_fd")) + goto detach_cgroup; + ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_socket_bind"), 1, "prog count"); + ASSERT_EQ(query_prog_cnt(cgroup_fd, NULL), 4, "total prog count"); + + /* Attach another instance of bind program to another cgroup. + * This should trigger the reuse of the trampoline shim (two + * programs attaching to the same btf_id). + */ + + ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_socket_bind"), 1, "prog count"); + ASSERT_EQ(query_prog_cnt(cgroup_fd2, "bpf_lsm_socket_bind"), 0, "prog count"); + bind_link_fd2 = bpf_link_create(bind_prog_fd2, cgroup_fd2, + BPF_LSM_CGROUP, NULL); + if (!ASSERT_GE(bind_link_fd2, 0, "link create bind_prog_fd2")) + goto detach_cgroup; + ASSERT_EQ(query_prog_cnt(cgroup_fd2, "bpf_lsm_socket_bind"), 1, "prog count"); + ASSERT_EQ(query_prog_cnt(cgroup_fd, NULL), 4, "total prog count"); + ASSERT_EQ(query_prog_cnt(cgroup_fd2, NULL), 1, "total prog count"); + + /* AF_UNIX is prohibited. */ + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_LT(fd, 0, "socket(AF_UNIX)"); + close(fd); + + /* AF_INET6 gets default policy (sk_priority). */ + + fd = socket(AF_INET6, SOCK_STREAM, 0); + if (!ASSERT_GE(fd, 0, "socket(SOCK_STREAM)")) + goto detach_cgroup; + + prio = 0; + socklen = sizeof(prio); + ASSERT_GE(getsockopt(fd, SOL_SOCKET, SO_PRIORITY, &prio, &socklen), 0, + "getsockopt"); + ASSERT_EQ(prio, 123, "sk_priority"); + + close(fd); + + /* TX-only AF_PACKET is allowed. */ + + ASSERT_LT(socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL)), 0, + "socket(AF_PACKET, ..., ETH_P_ALL)"); + + fd = socket(AF_PACKET, SOCK_RAW, 0); + ASSERT_GE(fd, 0, "socket(AF_PACKET, ..., 0)"); + + /* TX-only AF_PACKET can not be rebound. */ + + struct sockaddr_ll sa = { + .sll_family = AF_PACKET, + .sll_protocol = htons(ETH_P_ALL), + }; + ASSERT_LT(bind(fd, (struct sockaddr *)&sa, sizeof(sa)), 0, + "bind(ETH_P_ALL)"); + + close(fd); + + /* Trigger passive open. */ + + listen_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); + ASSERT_GE(listen_fd, 0, "start_server"); + client_fd = connect_to_fd(listen_fd, 0); + ASSERT_GE(client_fd, 0, "connect_to_fd"); + accepted_fd = accept(listen_fd, NULL, NULL); + ASSERT_GE(accepted_fd, 0, "accept"); + + prio = 0; + socklen = sizeof(prio); + ASSERT_GE(getsockopt(accepted_fd, SOL_SOCKET, SO_PRIORITY, &prio, &socklen), 0, + "getsockopt"); + ASSERT_EQ(prio, 234, "sk_priority"); + + /* These are replaced and never called. */ + ASSERT_EQ(skel->bss->called_socket_post_create, 0, "called_create"); + ASSERT_EQ(skel->bss->called_socket_bind, 0, "called_bind"); + + /* AF_INET6+SOCK_STREAM + * AF_PACKET+SOCK_RAW + * listen_fd + * client_fd + * accepted_fd + */ + ASSERT_EQ(skel->bss->called_socket_post_create2, 5, "called_create2"); + + /* start_server + * bind(ETH_P_ALL) + */ + ASSERT_EQ(skel->bss->called_socket_bind2, 2, "called_bind2"); + /* Single accept(). */ + ASSERT_EQ(skel->bss->called_socket_clone, 1, "called_clone"); + + /* AF_UNIX+SOCK_STREAM (failed) + * AF_INET6+SOCK_STREAM + * AF_PACKET+SOCK_RAW (failed) + * AF_PACKET+SOCK_RAW + * listen_fd + * client_fd + * accepted_fd + */ + ASSERT_EQ(skel->bss->called_socket_alloc, 7, "called_alloc"); + + close(listen_fd); + close(client_fd); + close(accepted_fd); + + /* Make sure other cgroup doesn't trigger the programs. */ + + if (!ASSERT_OK(join_cgroup("/sock_policy_empty"), "join root cgroup")) + goto detach_cgroup; + + fd = socket(AF_INET6, SOCK_STREAM, 0); + if (!ASSERT_GE(fd, 0, "socket(SOCK_STREAM)")) + goto detach_cgroup; + + prio = 0; + socklen = sizeof(prio); + ASSERT_GE(getsockopt(fd, SOL_SOCKET, SO_PRIORITY, &prio, &socklen), 0, + "getsockopt"); + ASSERT_EQ(prio, 0, "sk_priority"); + + close(fd); + +detach_cgroup: + ASSERT_GE(bpf_prog_detach2(post_create_prog_fd2, cgroup_fd, + BPF_LSM_CGROUP), 0, "detach_create"); + close(bind_link_fd); + /* Don't close bind_link_fd2, exercise cgroup release cleanup. */ + ASSERT_GE(bpf_prog_detach2(alloc_prog_fd, cgroup_fd, + BPF_LSM_CGROUP), 0, "detach_alloc"); + ASSERT_GE(bpf_prog_detach2(clone_prog_fd, cgroup_fd, + BPF_LSM_CGROUP), 0, "detach_clone"); + +close_cgroup: + close(cgroup_fd); + close(cgroup_fd2); + close(cgroup_fd3); + lsm_cgroup__destroy(skel); +} + +void test_lsm_cgroup(void) +{ + if (test__start_subtest("functional")) + test_lsm_cgroup_functional(); + btf__free(btf); +} diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index 1c1289ba5fc5..98dd2c4815f0 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -8,6 +8,7 @@ #define SOL_SOCKET 1 #define SO_SNDBUF 7 #define __SO_ACCEPTCON (1 << 16) +#define SO_PRIORITY 12 #define SOL_TCP 6 #define TCP_CONGESTION 13 diff --git a/tools/testing/selftests/bpf/progs/lsm_cgroup.c b/tools/testing/selftests/bpf/progs/lsm_cgroup.c new file mode 100644 index 000000000000..89f3b1e961a8 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/lsm_cgroup.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include "bpf_tracing_net.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +#ifndef AF_PACKET +#define AF_PACKET 17 +#endif + +#ifndef AF_UNIX +#define AF_UNIX 1 +#endif + +#ifndef EPERM +#define EPERM 1 +#endif + +struct { + __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE); + __type(key, __u64); + __type(value, __u64); +} cgroup_storage SEC(".maps"); + +int called_socket_post_create; +int called_socket_post_create2; +int called_socket_bind; +int called_socket_bind2; +int called_socket_alloc; +int called_socket_clone; + +static __always_inline int test_local_storage(void) +{ + __u64 *val; + + val = bpf_get_local_storage(&cgroup_storage, 0); + if (!val) + return 0; + *val += 1; + + return 1; +} + +static __always_inline int real_create(struct socket *sock, int family, + int protocol) +{ + struct sock *sk; + int prio = 123; + + /* Reject non-tx-only AF_PACKET. */ + if (family == AF_PACKET && protocol != 0) + return 0; /* EPERM */ + + sk = sock->sk; + if (!sk) + return 1; + + /* The rest of the sockets get default policy. */ + if (bpf_setsockopt(sk, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio))) + return 0; /* EPERM */ + + /* Make sure bpf_getsockopt is allowed and works. */ + prio = 0; + if (bpf_getsockopt(sk, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio))) + return 0; /* EPERM */ + if (prio != 123) + return 0; /* EPERM */ + + /* Can access cgroup local storage. */ + if (!test_local_storage()) + return 0; /* EPERM */ + + return 1; +} + +/* __cgroup_bpf_run_lsm_socket */ +SEC("lsm_cgroup/socket_post_create") +int BPF_PROG(socket_post_create, struct socket *sock, int family, + int type, int protocol, int kern) +{ + called_socket_post_create++; + return real_create(sock, family, protocol); +} + +/* __cgroup_bpf_run_lsm_socket */ +SEC("lsm_cgroup/socket_post_create") +int BPF_PROG(socket_post_create2, struct socket *sock, int family, + int type, int protocol, int kern) +{ + called_socket_post_create2++; + return real_create(sock, family, protocol); +} + +static __always_inline int real_bind(struct socket *sock, + struct sockaddr *address, + int addrlen) +{ + struct sockaddr_ll sa = {}; + + if (sock->sk->__sk_common.skc_family != AF_PACKET) + return 1; + + if (sock->sk->sk_kern_sock) + return 1; + + bpf_probe_read_kernel(&sa, sizeof(sa), address); + if (sa.sll_protocol) + return 0; /* EPERM */ + + /* Can access cgroup local storage. */ + if (!test_local_storage()) + return 0; /* EPERM */ + + return 1; +} + +/* __cgroup_bpf_run_lsm_socket */ +SEC("lsm_cgroup/socket_bind") +int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address, + int addrlen) +{ + called_socket_bind++; + return real_bind(sock, address, addrlen); +} + +/* __cgroup_bpf_run_lsm_socket */ +SEC("lsm_cgroup/socket_bind") +int BPF_PROG(socket_bind2, struct socket *sock, struct sockaddr *address, + int addrlen) +{ + called_socket_bind2++; + return real_bind(sock, address, addrlen); +} + +/* __cgroup_bpf_run_lsm_current (via bpf_lsm_current_hooks) */ +SEC("lsm_cgroup/sk_alloc_security") +int BPF_PROG(socket_alloc, struct sock *sk, int family, gfp_t priority) +{ + called_socket_alloc++; + if (family == AF_UNIX) + return 0; /* EPERM */ + + /* Can access cgroup local storage. */ + if (!test_local_storage()) + return 0; /* EPERM */ + + return 1; +} + +/* __cgroup_bpf_run_lsm_sock */ +SEC("lsm_cgroup/inet_csk_clone") +int BPF_PROG(socket_clone, struct sock *newsk, const struct request_sock *req) +{ + int prio = 234; + + called_socket_clone++; + + if (!newsk) + return 1; + + /* Accepted request sockets get a different priority. */ + if (bpf_setsockopt(newsk, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio))) + return 0; /* EPERM */ + + /* Make sure bpf_getsockopt is allowed and works. */ + prio = 0; + if (bpf_getsockopt(newsk, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio))) + return 0; /* EPERM */ + if (prio != 234) + return 0; /* EPERM */ + + /* Can access cgroup local storage. */ + if (!test_local_storage()) + return 0; /* EPERM */ + + return 1; +} From f0cf642c56b76dfbbb5f2be67fa180191d5ab0ef Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 29 Jun 2022 12:13:51 +0100 Subject: [PATCH 56/94] bpftool: Probe for memcg-based accounting before bumping rlimit Bpftool used to bump the memlock rlimit to make sure to be able to load BPF objects. After the kernel has switched to memcg-based memory accounting [0] in 5.11, bpftool has relied on libbpf to probe the system for memcg-based accounting support and for raising the rlimit if necessary [1]. But this was later reverted, because the probe would sometimes fail, resulting in bpftool not being able to load all required objects [2]. Here we add a more efficient probe, in bpftool itself. We first lower the rlimit to 0, then we attempt to load a BPF object (and finally reset the rlimit): if the load succeeds, then memcg-based memory accounting is supported. This approach was earlier proposed for the probe in libbpf itself [3], but given that the library may be used in multithreaded applications, the probe could have undesirable consequences if one thread attempts to lock kernel memory while memlock rlimit is at 0. Since bpftool is single-threaded and the rlimit is process-based, this is fine to do in bpftool itself. This probe was inspired by the similar one from the cilium/ebpf Go library [4]. [0] commit 97306be45fbe ("Merge branch 'switch to memcg-based memory accounting'") [1] commit a777e18f1bcd ("bpftool: Use libbpf 1.0 API mode instead of RLIMIT_MEMLOCK") [2] commit 6b4384ff1088 ("Revert "bpftool: Use libbpf 1.0 API mode instead of RLIMIT_MEMLOCK"") [3] https://lore.kernel.org/bpf/20220609143614.97837-1-quentin@isovalent.com/t/#u [4] https://github.com/cilium/ebpf/blob/v0.9.0/rlimit/rlimit.go#L39 Suggested-by: Daniel Borkmann Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Acked-by: Yafang Shao Link: https://lore.kernel.org/bpf/20220629111351.47699-1-quentin@isovalent.com --- tools/bpf/bpftool/common.c | 71 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 3 deletions(-) diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index a0d4acd7c54a..fc8172a4969a 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -13,14 +13,17 @@ #include #include #include -#include -#include #include #include #include #include #include +#include +#include +#include +#include + #include #include #include /* libbpf_num_possible_cpus */ @@ -73,11 +76,73 @@ static bool is_bpffs(char *path) return (unsigned long)st_fs.f_type == BPF_FS_MAGIC; } +/* Probe whether kernel switched from memlock-based (RLIMIT_MEMLOCK) to + * memcg-based memory accounting for BPF maps and programs. This was done in + * commit 97306be45fbe ("Merge branch 'switch to memcg-based memory + * accounting'"), in Linux 5.11. + * + * Libbpf also offers to probe for memcg-based accounting vs rlimit, but does + * so by checking for the availability of a given BPF helper and this has + * failed on some kernels with backports in the past, see commit 6b4384ff1088 + * ("Revert "bpftool: Use libbpf 1.0 API mode instead of RLIMIT_MEMLOCK""). + * Instead, we can probe by lowering the process-based rlimit to 0, trying to + * load a BPF object, and resetting the rlimit. If the load succeeds then + * memcg-based accounting is supported. + * + * This would be too dangerous to do in the library, because multithreaded + * applications might attempt to load items while the rlimit is at 0. Given + * that bpftool is single-threaded, this is fine to do here. + */ +static bool known_to_need_rlimit(void) +{ + struct rlimit rlim_init, rlim_cur_zero = {}; + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + size_t insn_cnt = ARRAY_SIZE(insns); + union bpf_attr attr; + int prog_fd, err; + + memset(&attr, 0, sizeof(attr)); + attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + attr.insns = ptr_to_u64(insns); + attr.insn_cnt = insn_cnt; + attr.license = ptr_to_u64("GPL"); + + if (getrlimit(RLIMIT_MEMLOCK, &rlim_init)) + return false; + + /* Drop the soft limit to zero. We maintain the hard limit to its + * current value, because lowering it would be a permanent operation + * for unprivileged users. + */ + rlim_cur_zero.rlim_max = rlim_init.rlim_max; + if (setrlimit(RLIMIT_MEMLOCK, &rlim_cur_zero)) + return false; + + /* Do not use bpf_prog_load() from libbpf here, because it calls + * bump_rlimit_memlock(), interfering with the current probe. + */ + prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); + err = errno; + + /* reset soft rlimit to its initial value */ + setrlimit(RLIMIT_MEMLOCK, &rlim_init); + + if (prog_fd < 0) + return err == EPERM; + + close(prog_fd); + return false; +} + void set_max_rlimit(void) { struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY }; - setrlimit(RLIMIT_MEMLOCK, &rinf); + if (known_to_need_rlimit()) + setrlimit(RLIMIT_MEMLOCK, &rinf); } static int From b0cbd6154a9a3964490cafa0a9444be626271bd9 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Thu, 30 Jun 2022 11:36:38 +0200 Subject: [PATCH 57/94] bpftool: Remove attach_type_name forward declaration The attach_type_name definition was removed in commit 1ba5ad36e00f ("bpftool: Use libbpf_bpf_attach_type_str"). Remove its forward declaration in main.h as well. Signed-off-by: Tobias Klauser Signed-off-by: Daniel Borkmann Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20220630093638.25916-1-tklauser@distanz.ch --- tools/bpf/bpftool/main.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 589cb76b227a..5e5060c2ac04 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -63,8 +63,6 @@ static inline void *u64_to_ptr(__u64 ptr) #define HELP_SPEC_LINK \ "LINK := { id LINK_ID | pinned FILE }" -extern const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE]; - /* keep in sync with the definition in skeleton/pid_iter.bpf.c */ enum bpf_obj_type { BPF_OBJ_UNKNOWN, From 27b3f70553432114b3d26f4d9c72cf02f38b84ee Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 29 Jun 2022 21:36:36 +0100 Subject: [PATCH 58/94] bpftool: Add feature list (prog/map/link/attach types, helpers) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a "bpftool feature list" subcommand to list BPF "features". Contrarily to "bpftool feature probe", this is not about the features available on the system. Instead, it lists all features known to bpftool from compilation time; in other words, all program, map, attach, link types known to the libbpf version in use, and all helpers found in the UAPI BPF header. The first use case for this feature is bash completion: running the command provides a list of types that can be used to produce the list of candidate map types, for example. Now that bpftool uses "standard" names provided by libbpf for the program, map, link, and attach types, having the ability to list these types and helpers could also be useful in scripts to loop over existing items. Sample output: # bpftool feature list prog_types | grep -vw unspec | head -n 6 socket_filter kprobe sched_cls sched_act tracepoint xdp # bpftool -p feature list map_types | jq '.[1]' "hash" # bpftool feature list attach_types | grep '^cgroup_' cgroup_inet_ingress cgroup_inet_egress [...] cgroup_inet_sock_release # bpftool feature list helpers | grep -vw bpf_unspec | wc -l 207 The "unspec" types and helpers are not filtered out by bpftool, so as to remain closer to the enums, and to preserve the indices in the JSON arrays (e.g. "hash" at index 1 == BPF_MAP_TYPE_HASH in map types list). Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Acked-by: Daniel Müller Link: https://lore.kernel.org/bpf/20220629203637.138944-2-quentin@isovalent.com --- .../bpftool/Documentation/bpftool-feature.rst | 12 ++++ tools/bpf/bpftool/bash-completion/bpftool | 7 ++- tools/bpf/bpftool/feature.c | 55 +++++++++++++++++++ 3 files changed, 73 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index 4ce9a77bc1e0..c08064628d39 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -24,9 +24,11 @@ FEATURE COMMANDS ================ | **bpftool** **feature probe** [*COMPONENT*] [**full**] [**unprivileged**] [**macros** [**prefix** *PREFIX*]] +| **bpftool** **feature list** *GROUP* | **bpftool** **feature help** | | *COMPONENT* := { **kernel** | **dev** *NAME* } +| *GROUP* := { **prog_types** | **map_types** | **attach_types** | **link_types** | **helpers** } DESCRIPTION =========== @@ -70,6 +72,16 @@ DESCRIPTION The keywords **full**, **macros** and **prefix** have the same role as when probing the kernel. + **bpftool feature list** *GROUP* + List items known to bpftool. These can be BPF program types + (**prog_types**), BPF map types (**map_types**), attach types + (**attach_types**), link types (**link_types**), or BPF helper + functions (**helpers**). The command does not probe the system, but + simply lists the elements that bpftool knows from compilation time, + as provided from libbpf (for all object types) or from the BPF UAPI + header (list of helpers). This can be used in scripts to iterate over + BPF types or helpers. + **bpftool feature help** Print short help message. diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 91f89a9a5b36..9cef6516320b 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -1175,9 +1175,14 @@ _bpftool() _bpftool_once_attr 'full unprivileged' return 0 ;; + list) + [[ $prev != "$command" ]] && return 0 + COMPREPLY=( $( compgen -W 'prog_types map_types \ + attach_types link_types helpers' -- "$cur" ) ) + ;; *) [[ $prev == $object ]] && \ - COMPREPLY=( $( compgen -W 'help probe' -- "$cur" ) ) + COMPREPLY=( $( compgen -W 'help list probe' -- "$cur" ) ) ;; esac ;; diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index bac4ef428a02..576cc6b90c6a 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -1258,6 +1258,58 @@ exit_close_json: return 0; } +static const char *get_helper_name(unsigned int id) +{ + if (id >= ARRAY_SIZE(helper_name)) + return NULL; + + return helper_name[id]; +} + +static int do_list(int argc, char **argv) +{ + const char *(*get_name)(unsigned int id); + unsigned int id = 0; + + if (argc < 1) + usage(); + + if (is_prefix(*argv, "prog_types")) { + get_name = (const char *(*)(unsigned int))libbpf_bpf_prog_type_str; + } else if (is_prefix(*argv, "map_types")) { + get_name = (const char *(*)(unsigned int))libbpf_bpf_map_type_str; + } else if (is_prefix(*argv, "attach_types")) { + get_name = (const char *(*)(unsigned int))libbpf_bpf_attach_type_str; + } else if (is_prefix(*argv, "link_types")) { + get_name = (const char *(*)(unsigned int))libbpf_bpf_link_type_str; + } else if (is_prefix(*argv, "helpers")) { + get_name = get_helper_name; + } else { + p_err("expected 'prog_types', 'map_types', 'attach_types', 'link_types' or 'helpers', got: %s", *argv); + return -1; + } + + if (json_output) + jsonw_start_array(json_wtr); /* root array */ + + while (true) { + const char *name; + + name = get_name(id++); + if (!name) + break; + if (json_output) + jsonw_string(json_wtr, name); + else + printf("%s\n", name); + } + + if (json_output) + jsonw_end_array(json_wtr); /* root array */ + + return 0; +} + static int do_help(int argc, char **argv) { if (json_output) { @@ -1267,9 +1319,11 @@ static int do_help(int argc, char **argv) fprintf(stderr, "Usage: %1$s %2$s probe [COMPONENT] [full] [unprivileged] [macros [prefix PREFIX]]\n" + " %1$s %2$s list GROUP\n" " %1$s %2$s help\n" "\n" " COMPONENT := { kernel | dev NAME }\n" + " GROUP := { prog_types | map_types | attach_types | link_types | helpers }\n" " " HELP_SPEC_OPTIONS " }\n" "", bin_name, argv[-2]); @@ -1279,6 +1333,7 @@ static int do_help(int argc, char **argv) static const struct cmd cmds[] = { { "probe", do_probe }, + { "list", do_list }, { "help", do_help }, { 0 } }; From 6d304871e3ef4c339c06aa9b4ab55b6c77642884 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 29 Jun 2022 21:36:37 +0100 Subject: [PATCH 59/94] bpftool: Use feature list in bash completion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that bpftool is able to produce a list of known program, map, attach types, let's use as much of this as we can in the bash completion file, so that we don't have to expand the list each time a new type is added to the kernel. Also update the relevant test script to remove some checks that are no longer needed. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Acked-by: Daniel Müller Link: https://lore.kernel.org/bpf/20220629203637.138944-3-quentin@isovalent.com --- tools/bpf/bpftool/bash-completion/bpftool | 21 ++++--------------- .../selftests/bpf/test_bpftool_synctypes.py | 20 +++--------------- 2 files changed, 7 insertions(+), 34 deletions(-) diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 9cef6516320b..ee177f83b179 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -703,15 +703,8 @@ _bpftool() return 0 ;; type) - local BPFTOOL_MAP_CREATE_TYPES='hash array \ - prog_array perf_event_array percpu_hash \ - percpu_array stack_trace cgroup_array lru_hash \ - lru_percpu_hash lpm_trie array_of_maps \ - hash_of_maps devmap devmap_hash sockmap cpumap \ - xskmap sockhash cgroup_storage reuseport_sockarray \ - percpu_cgroup_storage queue stack sk_storage \ - struct_ops ringbuf inode_storage task_storage \ - bloom_filter' + local BPFTOOL_MAP_CREATE_TYPES="$(bpftool feature list map_types | \ + grep -v '^unspec$')" COMPREPLY=( $( compgen -W "$BPFTOOL_MAP_CREATE_TYPES" -- "$cur" ) ) return 0 ;; @@ -1039,14 +1032,8 @@ _bpftool() return 0 ;; attach|detach) - local BPFTOOL_CGROUP_ATTACH_TYPES='cgroup_inet_ingress cgroup_inet_egress \ - cgroup_inet_sock_create cgroup_sock_ops cgroup_device cgroup_inet4_bind \ - cgroup_inet6_bind cgroup_inet4_post_bind cgroup_inet6_post_bind \ - cgroup_inet4_connect cgroup_inet6_connect cgroup_inet4_getpeername \ - cgroup_inet6_getpeername cgroup_inet4_getsockname cgroup_inet6_getsockname \ - cgroup_udp4_sendmsg cgroup_udp6_sendmsg cgroup_udp4_recvmsg \ - cgroup_udp6_recvmsg cgroup_sysctl cgroup_getsockopt cgroup_setsockopt \ - cgroup_inet_sock_release' + local BPFTOOL_CGROUP_ATTACH_TYPES="$(bpftool feature list attach_types | \ + grep '^cgroup_')" local ATTACH_FLAGS='multi override' local PROG_TYPE='id pinned tag name' # Check for $prev = $command first diff --git a/tools/testing/selftests/bpf/test_bpftool_synctypes.py b/tools/testing/selftests/bpf/test_bpftool_synctypes.py index e443e6542cb9..a6410bebe603 100755 --- a/tools/testing/selftests/bpf/test_bpftool_synctypes.py +++ b/tools/testing/selftests/bpf/test_bpftool_synctypes.py @@ -471,12 +471,6 @@ class BashcompExtractor(FileExtractor): def get_prog_attach_types(self): return self.get_bashcomp_list('BPFTOOL_PROG_ATTACH_TYPES') - def get_map_types(self): - return self.get_bashcomp_list('BPFTOOL_MAP_CREATE_TYPES') - - def get_cgroup_attach_types(self): - return self.get_bashcomp_list('BPFTOOL_CGROUP_ATTACH_TYPES') - def verify(first_set, second_set, message): """ Print all values that differ between two sets. @@ -516,17 +510,12 @@ def main(): man_map_types = man_map_info.get_map_types() man_map_info.close() - bashcomp_info = BashcompExtractor() - bashcomp_map_types = bashcomp_info.get_map_types() - verify(source_map_types, help_map_types, f'Comparing {BpfHeaderExtractor.filename} (bpf_map_type) and {MapFileExtractor.filename} (do_help() TYPE):') verify(source_map_types, man_map_types, f'Comparing {BpfHeaderExtractor.filename} (bpf_map_type) and {ManMapExtractor.filename} (TYPE):') verify(help_map_options, man_map_options, f'Comparing {MapFileExtractor.filename} (do_help() OPTIONS) and {ManMapExtractor.filename} (OPTIONS):') - verify(source_map_types, bashcomp_map_types, - f'Comparing {BpfHeaderExtractor.filename} (bpf_map_type) and {BashcompExtractor.filename} (BPFTOOL_MAP_CREATE_TYPES):') # Attach types (names) @@ -542,8 +531,10 @@ def main(): man_prog_attach_types = man_prog_info.get_attach_types() man_prog_info.close() - bashcomp_info.reset_read() # We stopped at map types, rewind + + bashcomp_info = BashcompExtractor() bashcomp_prog_attach_types = bashcomp_info.get_prog_attach_types() + bashcomp_info.close() verify(source_prog_attach_types, help_prog_attach_types, f'Comparing {ProgFileExtractor.filename} (bpf_attach_type) and {ProgFileExtractor.filename} (do_help() ATTACH_TYPE):') @@ -568,17 +559,12 @@ def main(): man_cgroup_attach_types = man_cgroup_info.get_attach_types() man_cgroup_info.close() - bashcomp_cgroup_attach_types = bashcomp_info.get_cgroup_attach_types() - bashcomp_info.close() - verify(source_cgroup_attach_types, help_cgroup_attach_types, f'Comparing {BpfHeaderExtractor.filename} (bpf_attach_type) and {CgroupFileExtractor.filename} (do_help() ATTACH_TYPE):') verify(source_cgroup_attach_types, man_cgroup_attach_types, f'Comparing {BpfHeaderExtractor.filename} (bpf_attach_type) and {ManCgroupExtractor.filename} (ATTACH_TYPE):') verify(help_cgroup_options, man_cgroup_options, f'Comparing {CgroupFileExtractor.filename} (do_help() OPTIONS) and {ManCgroupExtractor.filename} (OPTIONS):') - verify(source_cgroup_attach_types, bashcomp_cgroup_attach_types, - f'Comparing {BpfHeaderExtractor.filename} (bpf_attach_type) and {BashcompExtractor.filename} (BPFTOOL_CGROUP_ATTACH_TYPES):') # Options for remaining commands From 24d2e5d9da608445e2e5c2c9ab5cf418f0fc4612 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 29 Jun 2022 16:34:55 +0200 Subject: [PATCH 60/94] selftests/xsk: Avoid bpf_link probe for existing xsk Currently bpf_link probe is done for each call of xsk_socket__create(). For cases where xsk context was previously created and current socket creation uses it, has_bpf_link will be overwritten, where it has already been initialized. Optimize this by moving the query to the xsk_create_ctx() so that when xsk_get_ctx() finds a ctx then no further bpf_link probes are needed. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20220629143458.934337-2-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/xsk.c b/tools/testing/selftests/bpf/xsk.c index eb50c3f336f8..fa13d2c44517 100644 --- a/tools/testing/selftests/bpf/xsk.c +++ b/tools/testing/selftests/bpf/xsk.c @@ -958,6 +958,7 @@ static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk, ctx->fill = fill; ctx->comp = comp; list_add(&ctx->list, &umem->ctx_list); + ctx->has_bpf_link = xsk_probe_bpf_link(); return ctx; } @@ -1059,7 +1060,6 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, } } xsk->ctx = ctx; - xsk->ctx->has_bpf_link = xsk_probe_bpf_link(); if (rx && !rx_setup_done) { err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING, From 61333008d01e18716a7050fdc9479cc8d6e63883 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 29 Jun 2022 16:34:56 +0200 Subject: [PATCH 61/94] selftests/xsk: Introduce XDP prog load based on existing AF_XDP socket Currently, xsk_setup_xdp_prog() uses anonymous xsk_socket struct which means that during xsk_create_bpf_link() call, xsk->config.xdp_flags is always 0. This in turn means that from xdpxceiver it is impossible to use xdpgeneric attachment, so since commit 3b22523bca02 ("selftests, xsk: Fix bpf_res cleanup test") we were not testing SKB mode at all. To fix this, introduce a function, called xsk_setup_xdp_prog_xsk(), that will load XDP prog based on the existing xsk_socket, so that xsk context's refcount is correctly bumped and flags from application side are respected. Use this from xdpxceiver side so we get coverage of generic and native XDP program attach points. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20220629143458.934337-3-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xdpxceiver.c | 2 +- tools/testing/selftests/bpf/xsk.c | 5 +++++ tools/testing/selftests/bpf/xsk.h | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 019c567b6b4e..c024aa91ea02 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -1130,7 +1130,7 @@ static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject) if (!ifindex) exit_with_error(errno); - ret = xsk_setup_xdp_prog(ifindex, &ifobject->xsk_map_fd); + ret = xsk_setup_xdp_prog_xsk(ifobject->xsk->xsk, &ifobject->xsk_map_fd); if (ret) exit_with_error(-ret); diff --git a/tools/testing/selftests/bpf/xsk.c b/tools/testing/selftests/bpf/xsk.c index fa13d2c44517..db911127720e 100644 --- a/tools/testing/selftests/bpf/xsk.c +++ b/tools/testing/selftests/bpf/xsk.c @@ -880,6 +880,11 @@ static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, int *xsks_map_fd) return err; } +int xsk_setup_xdp_prog_xsk(struct xsk_socket *xsk, int *xsks_map_fd) +{ + return __xsk_setup_xdp_prog(xsk, xsks_map_fd); +} + static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex, __u32 queue_id) { diff --git a/tools/testing/selftests/bpf/xsk.h b/tools/testing/selftests/bpf/xsk.h index 915e7135337c..997723b0bfb2 100644 --- a/tools/testing/selftests/bpf/xsk.h +++ b/tools/testing/selftests/bpf/xsk.h @@ -269,6 +269,7 @@ struct xsk_umem_config { __u32 flags; }; +int xsk_setup_xdp_prog_xsk(struct xsk_socket *xsk, int *xsks_map_fd); int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd); int xsk_socket__update_xskmap(struct xsk_socket *xsk, int xsks_map_fd); From 6d4c767c032b165cc290c51f4e82bc54b14b1cf1 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 29 Jun 2022 16:34:57 +0200 Subject: [PATCH 62/94] selftests/xsk: Verify correctness of XDP prog attach point To prevent the case we had previously where for TEST_MODE_SKB, XDP prog was attached in native mode, call bpf_xdp_query() after loading prog and make sure that attach_mode is as expected. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20220629143458.934337-4-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xdpxceiver.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index c024aa91ea02..4c425a43e5b0 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -1085,6 +1085,7 @@ static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject) { u64 umem_sz = ifobject->umem->num_frames * ifobject->umem->frame_size; int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE; + LIBBPF_OPTS(bpf_xdp_query_opts, opts); int ret, ifindex; void *bufs; u32 i; @@ -1134,6 +1135,22 @@ static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject) if (ret) exit_with_error(-ret); + ret = bpf_xdp_query(ifindex, ifobject->xdp_flags, &opts); + if (ret) + exit_with_error(-ret); + + if (ifobject->xdp_flags & XDP_FLAGS_SKB_MODE) { + if (opts.attach_mode != XDP_ATTACHED_SKB) { + ksft_print_msg("ERROR: [%s] XDP prog not in SKB mode\n"); + exit_with_error(-EINVAL); + } + } else if (ifobject->xdp_flags & XDP_FLAGS_DRV_MODE) { + if (opts.attach_mode != XDP_ATTACHED_DRV) { + ksft_print_msg("ERROR: [%s] XDP prog not in DRV mode\n"); + exit_with_error(-EINVAL); + } + } + ret = xsk_socket__update_xskmap(ifobject->xsk->xsk, ifobject->xsk_map_fd); if (ret) exit_with_error(-ret); From 39e940d4abfabb08b6937a315546b24d10be67e3 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 29 Jun 2022 16:34:58 +0200 Subject: [PATCH 63/94] selftests/xsk: Destroy BPF resources only when ctx refcount drops to 0 Currently, xsk_socket__delete frees BPF resources regardless of ctx refcount. Xdpxceiver has a test to verify whether underlying BPF resources would not be wiped out after closing XSK socket that was bound to interface with other active sockets. From library's xsk part perspective it also means that the internal xsk context is shared and its refcount is bumped accordingly. After a switch to loading XDP prog based on previously opened XSK socket, mentioned xdpxceiver test fails with: not ok 16 [xdpxceiver.c:swap_xsk_resources:1334]: ERROR: 9/"Bad file descriptor which means that in swap_xsk_resources(), xsk_socket__delete() released xskmap which in turn caused a failure of xsk_socket__update_xskmap(). To fix this, when deleting socket, decrement ctx refcount before releasing BPF resources and do so only when refcount dropped to 0 which means there are no more active sockets for this ctx so BPF resources can be freed safely. Fixes: 2f6324a3937f ("libbpf: Support shared umems between queues and devices") Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20220629143458.934337-5-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xsk.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/xsk.c b/tools/testing/selftests/bpf/xsk.c index db911127720e..f2721a4ae7c5 100644 --- a/tools/testing/selftests/bpf/xsk.c +++ b/tools/testing/selftests/bpf/xsk.c @@ -1156,8 +1156,6 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, goto out_mmap_tx; } - ctx->prog_fd = -1; - if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) { err = __xsk_setup_xdp_prog(xsk, NULL); if (err) @@ -1238,7 +1236,10 @@ void xsk_socket__delete(struct xsk_socket *xsk) ctx = xsk->ctx; umem = ctx->umem; - if (ctx->prog_fd != -1) { + + xsk_put_ctx(ctx, true); + + if (!ctx->refcount) { xsk_delete_bpf_maps(xsk); close(ctx->prog_fd); if (ctx->has_bpf_link) @@ -1257,8 +1258,6 @@ void xsk_socket__delete(struct xsk_socket *xsk) } } - xsk_put_ctx(ctx, true); - umem->refcount--; /* Do not close an fd that also has an associated umem connected * to it. From 7a255ae77216237a4ce83ddea595aa4e0a812f46 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 29 Jun 2022 15:48:32 +0000 Subject: [PATCH 64/94] bpftool: Show also the name of type BPF_OBJ_LINK For example, /sys/fs/bpf/maps.debug is a BPF link. When you run `bpftool map show` to show it: Before: $ bpftool map show pinned /sys/fs/bpf/maps.debug Error: incorrect object type: unknown After: $ bpftool map show pinned /sys/fs/bpf/maps.debug Error: incorrect object type: link Signed-off-by: Yafang Shao Signed-off-by: Daniel Borkmann Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20220629154832.56986-5-laoar.shao@gmail.com --- tools/bpf/bpftool/common.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index fc8172a4969a..067e9ea59e3b 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -316,6 +316,7 @@ const char *get_fd_type_name(enum bpf_obj_type type) [BPF_OBJ_UNKNOWN] = "unknown", [BPF_OBJ_PROG] = "prog", [BPF_OBJ_MAP] = "map", + [BPF_OBJ_LINK] = "link", }; if (type < 0 || type >= ARRAY_SIZE(names) || !names[type]) From b0d93b44641a83c28014ca38001e85bf6dc8501e Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 30 Jun 2022 15:42:03 -0700 Subject: [PATCH 65/94] selftests/bpf: Skip lsm_cgroup when we don't have trampolines With arch_prepare_bpf_trampoline removed on x86: [...] #98/1 lsm_cgroup/functional:SKIP #98 lsm_cgroup:SKIP Summary: 1/0 PASSED, 1 SKIPPED, 0 FAILED Fixes: dca85aac8895 ("selftests/bpf: lsm_cgroup functional test") Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Hao Luo Link: https://lore.kernel.org/bpf/20220630224203.512815-1-sdf@google.com --- tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c b/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c index d40810a742fa..c542d7e80a5b 100644 --- a/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c +++ b/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c @@ -9,6 +9,10 @@ #include "cgroup_helpers.h" #include "network_helpers.h" +#ifndef ENOTSUPP +#define ENOTSUPP 524 +#endif + static struct btf *btf; static __u32 query_prog_cnt(int cgroup_fd, const char *attach_func) @@ -100,6 +104,10 @@ static void test_lsm_cgroup_functional(void) ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_sk_alloc_security"), 0, "prog count"); ASSERT_EQ(query_prog_cnt(cgroup_fd, NULL), 0, "total prog count"); err = bpf_prog_attach(alloc_prog_fd, cgroup_fd, BPF_LSM_CGROUP, 0); + if (err == -ENOTSUPP) { + test__skip(); + goto close_cgroup; + } if (!ASSERT_OK(err, "attach alloc_prog_fd")) goto detach_cgroup; ASSERT_EQ(query_prog_cnt(cgroup_fd, "bpf_lsm_sk_alloc_security"), 1, "prog count"); From 2064a132c0de3426d5ba43023200994e0c77e652 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Thu, 30 Jun 2022 10:26:18 +0200 Subject: [PATCH 66/94] bpf: Omit superfluous address family check in __bpf_skc_lookup family is only set to either AF_INET or AF_INET6 based on len. In all other cases we return early. Thus the check against AF_UNSPEC can be omitted. Signed-off-by: Tobias Klauser Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220630082618.15649-1-tklauser@distanz.ch --- net/core/filter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index c6941ab0eb52..4fae91984359 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6516,8 +6516,8 @@ __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, u64 flags) { struct sock *sk = NULL; - u8 family = AF_UNSPEC; struct net *net; + u8 family; int sdif; if (len == sizeof(tuple->ipv4)) @@ -6527,8 +6527,7 @@ __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, else return NULL; - if (unlikely(family == AF_UNSPEC || flags || - !((s32)netns_id < 0 || netns_id <= S32_MAX))) + if (unlikely(flags || !((s32)netns_id < 0 || netns_id <= S32_MAX))) goto out; if (family == AF_INET) From 990a6194f7e16cc23334892287f32899e241b1a9 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 1 Jul 2022 10:38:05 +0100 Subject: [PATCH 67/94] bpftool: Rename "bpftool feature list" into "... feature list_builtins" To make it more explicit that the features listed with "bpftool feature list" are known to bpftool, but not necessary available on the system (as opposed to the probed features), rename the "feature list" command into "feature list_builtins". Note that "bpftool feature list" still works as before given that we recognise arguments from their prefixes; but the real name of the subcommand, in particular as displayed in the man page or the interactive help, will now include "_builtins". Since we update the bash completion accordingly, let's also take this chance to redirect error output to /dev/null in the completion script, to avoid displaying unexpected error messages when users attempt to tab-complete. Suggested-by: Daniel Borkmann Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220701093805.16920-1-quentin@isovalent.com --- tools/bpf/bpftool/Documentation/bpftool-feature.rst | 4 ++-- tools/bpf/bpftool/bash-completion/bpftool | 8 ++++---- tools/bpf/bpftool/feature.c | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index c08064628d39..e44039f89be7 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -24,7 +24,7 @@ FEATURE COMMANDS ================ | **bpftool** **feature probe** [*COMPONENT*] [**full**] [**unprivileged**] [**macros** [**prefix** *PREFIX*]] -| **bpftool** **feature list** *GROUP* +| **bpftool** **feature list_builtins** *GROUP* | **bpftool** **feature help** | | *COMPONENT* := { **kernel** | **dev** *NAME* } @@ -72,7 +72,7 @@ DESCRIPTION The keywords **full**, **macros** and **prefix** have the same role as when probing the kernel. - **bpftool feature list** *GROUP* + **bpftool feature list_builtins** *GROUP* List items known to bpftool. These can be BPF program types (**prog_types**), BPF map types (**map_types**), attach types (**attach_types**), link types (**link_types**), or BPF helper diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index ee177f83b179..dc1641e3670e 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -703,7 +703,7 @@ _bpftool() return 0 ;; type) - local BPFTOOL_MAP_CREATE_TYPES="$(bpftool feature list map_types | \ + local BPFTOOL_MAP_CREATE_TYPES="$(bpftool feature list_builtins map_types 2>/dev/null | \ grep -v '^unspec$')" COMPREPLY=( $( compgen -W "$BPFTOOL_MAP_CREATE_TYPES" -- "$cur" ) ) return 0 @@ -1032,7 +1032,7 @@ _bpftool() return 0 ;; attach|detach) - local BPFTOOL_CGROUP_ATTACH_TYPES="$(bpftool feature list attach_types | \ + local BPFTOOL_CGROUP_ATTACH_TYPES="$(bpftool feature list_builtins attach_types 2>/dev/null | \ grep '^cgroup_')" local ATTACH_FLAGS='multi override' local PROG_TYPE='id pinned tag name' @@ -1162,14 +1162,14 @@ _bpftool() _bpftool_once_attr 'full unprivileged' return 0 ;; - list) + list_builtins) [[ $prev != "$command" ]] && return 0 COMPREPLY=( $( compgen -W 'prog_types map_types \ attach_types link_types helpers' -- "$cur" ) ) ;; *) [[ $prev == $object ]] && \ - COMPREPLY=( $( compgen -W 'help list probe' -- "$cur" ) ) + COMPREPLY=( $( compgen -W 'help list_builtins probe' -- "$cur" ) ) ;; esac ;; diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 576cc6b90c6a..7ecabf7947fb 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -1266,7 +1266,7 @@ static const char *get_helper_name(unsigned int id) return helper_name[id]; } -static int do_list(int argc, char **argv) +static int do_list_builtins(int argc, char **argv) { const char *(*get_name)(unsigned int id); unsigned int id = 0; @@ -1319,7 +1319,7 @@ static int do_help(int argc, char **argv) fprintf(stderr, "Usage: %1$s %2$s probe [COMPONENT] [full] [unprivileged] [macros [prefix PREFIX]]\n" - " %1$s %2$s list GROUP\n" + " %1$s %2$s list_builtins GROUP\n" " %1$s %2$s help\n" "\n" " COMPONENT := { kernel | dev NAME }\n" @@ -1332,9 +1332,9 @@ static int do_help(int argc, char **argv) } static const struct cmd cmds[] = { - { "probe", do_probe }, - { "list", do_list }, - { "help", do_help }, + { "probe", do_probe }, + { "list_builtins", do_list_builtins }, + { "help", do_help }, { 0 } }; From cfb5a2dbf1413a0086e987d99ad591b91fc9cf5c Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 30 Jun 2022 11:37:17 +0200 Subject: [PATCH 68/94] bpf, samples: Remove AF_XDP samples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the AF_XDP samples from samples/bpf/ as they are dependent on the AF_XDP support in libbpf. This support has now been removed in the 1.0 release, so these samples cannot be compiled anymore. Please start to use libxdp instead. It is backwards compatible with the AF_XDP support that was offered in libbpf. New samples can be found in the various xdp-project repositories connected to libxdp and by googling. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Toke Høiland-Jørgensen Acked-by: Maciej Fijalkowski Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20220630093717.8664-1-magnus.karlsson@gmail.com --- MAINTAINERS | 2 - samples/bpf/Makefile | 9 - samples/bpf/xdpsock.h | 19 - samples/bpf/xdpsock_ctrl_proc.c | 190 --- samples/bpf/xdpsock_kern.c | 24 - samples/bpf/xdpsock_user.c | 2019 ------------------------------- samples/bpf/xsk_fwd.c | 1085 ----------------- 7 files changed, 3348 deletions(-) delete mode 100644 samples/bpf/xdpsock.h delete mode 100644 samples/bpf/xdpsock_ctrl_proc.c delete mode 100644 samples/bpf/xdpsock_kern.c delete mode 100644 samples/bpf/xdpsock_user.c delete mode 100644 samples/bpf/xsk_fwd.c diff --git a/MAINTAINERS b/MAINTAINERS index ca95b1833b97..ccd774c37c56 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21749,8 +21749,6 @@ F: include/uapi/linux/if_xdp.h F: include/uapi/linux/xdp_diag.h F: include/net/netns/xdp.h F: net/xdp/ -F: samples/bpf/xdpsock* -F: tools/lib/bpf/xsk* XEN BLOCK SUBSYSTEM M: Roger Pau Monné diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 03e3d3529ac9..5002a5b9a7da 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -45,9 +45,6 @@ tprogs-y += xdp_rxq_info tprogs-y += syscall_tp tprogs-y += cpustat tprogs-y += xdp_adjust_tail -tprogs-y += xdpsock -tprogs-y += xdpsock_ctrl_proc -tprogs-y += xsk_fwd tprogs-y += xdp_fwd tprogs-y += task_fd_query tprogs-y += xdp_sample_pkts @@ -109,9 +106,6 @@ xdp_rxq_info-objs := xdp_rxq_info_user.o syscall_tp-objs := syscall_tp_user.o cpustat-objs := cpustat_user.o xdp_adjust_tail-objs := xdp_adjust_tail_user.o -xdpsock-objs := xdpsock_user.o -xdpsock_ctrl_proc-objs := xdpsock_ctrl_proc.o -xsk_fwd-objs := xsk_fwd.o xdp_fwd-objs := xdp_fwd_user.o task_fd_query-objs := task_fd_query_user.o $(TRACE_HELPERS) xdp_sample_pkts-objs := xdp_sample_pkts_user.o @@ -179,7 +173,6 @@ always-y += xdp_sample_pkts_kern.o always-y += ibumad_kern.o always-y += hbm_out_kern.o always-y += hbm_edt_kern.o -always-y += xdpsock_kern.o ifeq ($(ARCH), arm) # Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux @@ -224,8 +217,6 @@ TPROGLDLIBS_tracex4 += -lrt TPROGLDLIBS_trace_output += -lrt TPROGLDLIBS_map_perf_test += -lrt TPROGLDLIBS_test_overhead += -lrt -TPROGLDLIBS_xdpsock += -pthread -lcap -TPROGLDLIBS_xsk_fwd += -pthread # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: # make M=samples/bpf LLC=~/git/llvm-project/llvm/build/bin/llc CLANG=~/git/llvm-project/llvm/build/bin/clang diff --git a/samples/bpf/xdpsock.h b/samples/bpf/xdpsock.h deleted file mode 100644 index fd70cce60712..000000000000 --- a/samples/bpf/xdpsock.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * Copyright(c) 2019 Intel Corporation. - */ - -#ifndef XDPSOCK_H_ -#define XDPSOCK_H_ - -#define MAX_SOCKS 4 - -#define SOCKET_NAME "sock_cal_bpf_fd" -#define MAX_NUM_OF_CLIENTS 10 - -#define CLOSE_CONN 1 - -typedef __u64 u64; -typedef __u32 u32; - -#endif /* XDPSOCK_H */ diff --git a/samples/bpf/xdpsock_ctrl_proc.c b/samples/bpf/xdpsock_ctrl_proc.c deleted file mode 100644 index 28b5f2a9fa08..000000000000 --- a/samples/bpf/xdpsock_ctrl_proc.c +++ /dev/null @@ -1,190 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright(c) 2017 - 2018 Intel Corporation. */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include "xdpsock.h" - -/* libbpf APIs for AF_XDP are deprecated starting from v0.7 */ -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - -static const char *opt_if = ""; - -static struct option long_options[] = { - {"interface", required_argument, 0, 'i'}, - {0, 0, 0, 0} -}; - -static void usage(const char *prog) -{ - const char *str = - " Usage: %s [OPTIONS]\n" - " Options:\n" - " -i, --interface=n Run on interface n\n" - "\n"; - fprintf(stderr, "%s\n", str); - - exit(0); -} - -static void parse_command_line(int argc, char **argv) -{ - int option_index, c; - - opterr = 0; - - for (;;) { - c = getopt_long(argc, argv, "i:", - long_options, &option_index); - if (c == -1) - break; - - switch (c) { - case 'i': - opt_if = optarg; - break; - default: - usage(basename(argv[0])); - } - } -} - -static int send_xsks_map_fd(int sock, int fd) -{ - char cmsgbuf[CMSG_SPACE(sizeof(int))]; - struct msghdr msg; - struct iovec iov; - int value = 0; - - if (fd == -1) { - fprintf(stderr, "Incorrect fd = %d\n", fd); - return -1; - } - iov.iov_base = &value; - iov.iov_len = sizeof(int); - - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - msg.msg_flags = 0; - msg.msg_control = cmsgbuf; - msg.msg_controllen = CMSG_LEN(sizeof(int)); - - struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); - - cmsg->cmsg_level = SOL_SOCKET; - cmsg->cmsg_type = SCM_RIGHTS; - cmsg->cmsg_len = CMSG_LEN(sizeof(int)); - - *(int *)CMSG_DATA(cmsg) = fd; - int ret = sendmsg(sock, &msg, 0); - - if (ret == -1) { - fprintf(stderr, "Sendmsg failed with %s", strerror(errno)); - return -errno; - } - - return ret; -} - -int -main(int argc, char **argv) -{ - struct sockaddr_un server; - int listening = 1; - int rval, msgsock; - int ifindex = 0; - int flag = 1; - int cmd = 0; - int sock; - int err; - int xsks_map_fd; - - parse_command_line(argc, argv); - - ifindex = if_nametoindex(opt_if); - if (ifindex == 0) { - fprintf(stderr, "Unable to get ifindex for Interface %s. Reason:%s", - opt_if, strerror(errno)); - return -errno; - } - - sock = socket(AF_UNIX, SOCK_STREAM, 0); - if (sock < 0) { - fprintf(stderr, "Opening socket stream failed: %s", strerror(errno)); - return -errno; - } - - server.sun_family = AF_UNIX; - strcpy(server.sun_path, SOCKET_NAME); - - setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &flag, sizeof(int)); - - if (bind(sock, (struct sockaddr *)&server, sizeof(struct sockaddr_un))) { - fprintf(stderr, "Binding to socket stream failed: %s", strerror(errno)); - return -errno; - } - - listen(sock, MAX_NUM_OF_CLIENTS); - - err = xsk_setup_xdp_prog(ifindex, &xsks_map_fd); - if (err) { - fprintf(stderr, "Setup of xdp program failed\n"); - goto close_sock; - } - - while (listening) { - msgsock = accept(sock, 0, 0); - if (msgsock == -1) { - fprintf(stderr, "Error accepting connection: %s", strerror(errno)); - err = -errno; - goto close_sock; - } - err = send_xsks_map_fd(msgsock, xsks_map_fd); - if (err <= 0) { - fprintf(stderr, "Error %d sending xsks_map_fd\n", err); - goto cleanup; - } - do { - rval = read(msgsock, &cmd, sizeof(int)); - if (rval < 0) { - fprintf(stderr, "Error reading stream message"); - } else { - if (cmd != CLOSE_CONN) - fprintf(stderr, "Recv unknown cmd = %d\n", cmd); - listening = 0; - break; - } - } while (rval > 0); - } - close(msgsock); - close(sock); - unlink(SOCKET_NAME); - - /* Unset fd for given ifindex */ - err = bpf_xdp_detach(ifindex, 0, NULL); - if (err) { - fprintf(stderr, "Error when unsetting bpf prog_fd for ifindex(%d)\n", ifindex); - return err; - } - - return 0; - -cleanup: - close(msgsock); -close_sock: - close(sock); - unlink(SOCKET_NAME); - return err; -} diff --git a/samples/bpf/xdpsock_kern.c b/samples/bpf/xdpsock_kern.c deleted file mode 100644 index 05430484375c..000000000000 --- a/samples/bpf/xdpsock_kern.c +++ /dev/null @@ -1,24 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include "xdpsock.h" - -/* This XDP program is only needed for the XDP_SHARED_UMEM mode. - * If you do not use this mode, libbpf can supply an XDP program for you. - */ - -struct { - __uint(type, BPF_MAP_TYPE_XSKMAP); - __uint(max_entries, MAX_SOCKS); - __uint(key_size, sizeof(int)); - __uint(value_size, sizeof(int)); -} xsks_map SEC(".maps"); - -static unsigned int rr; - -SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx) -{ - rr = (rr + 1) & (MAX_SOCKS - 1); - - return bpf_redirect_map(&xsks_map, rr, XDP_DROP); -} diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c deleted file mode 100644 index be7d2572e3e6..000000000000 --- a/samples/bpf/xdpsock_user.c +++ /dev/null @@ -1,2019 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright(c) 2017 - 2018 Intel Corporation. */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include "xdpsock.h" - -/* libbpf APIs for AF_XDP are deprecated starting from v0.7 */ -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - -#ifndef SOL_XDP -#define SOL_XDP 283 -#endif - -#ifndef AF_XDP -#define AF_XDP 44 -#endif - -#ifndef PF_XDP -#define PF_XDP AF_XDP -#endif - -#define NUM_FRAMES (4 * 1024) -#define MIN_PKT_SIZE 64 - -#define DEBUG_HEXDUMP 0 - -#define VLAN_PRIO_MASK 0xe000 /* Priority Code Point */ -#define VLAN_PRIO_SHIFT 13 -#define VLAN_VID_MASK 0x0fff /* VLAN Identifier */ -#define VLAN_VID__DEFAULT 1 -#define VLAN_PRI__DEFAULT 0 - -#define NSEC_PER_SEC 1000000000UL -#define NSEC_PER_USEC 1000 - -#define SCHED_PRI__DEFAULT 0 - -typedef __u64 u64; -typedef __u32 u32; -typedef __u16 u16; -typedef __u8 u8; - -static unsigned long prev_time; -static long tx_cycle_diff_min; -static long tx_cycle_diff_max; -static double tx_cycle_diff_ave; -static long tx_cycle_cnt; - -enum benchmark_type { - BENCH_RXDROP = 0, - BENCH_TXONLY = 1, - BENCH_L2FWD = 2, -}; - -static enum benchmark_type opt_bench = BENCH_RXDROP; -static u32 opt_xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; -static const char *opt_if = ""; -static int opt_ifindex; -static int opt_queue; -static unsigned long opt_duration; -static unsigned long start_time; -static bool benchmark_done; -static u32 opt_batch_size = 64; -static int opt_pkt_count; -static u16 opt_pkt_size = MIN_PKT_SIZE; -static u32 opt_pkt_fill_pattern = 0x12345678; -static bool opt_vlan_tag; -static u16 opt_pkt_vlan_id = VLAN_VID__DEFAULT; -static u16 opt_pkt_vlan_pri = VLAN_PRI__DEFAULT; -static struct ether_addr opt_txdmac = {{ 0x3c, 0xfd, 0xfe, - 0x9e, 0x7f, 0x71 }}; -static struct ether_addr opt_txsmac = {{ 0xec, 0xb1, 0xd7, - 0x98, 0x3a, 0xc0 }}; -static bool opt_extra_stats; -static bool opt_quiet; -static bool opt_app_stats; -static const char *opt_irq_str = ""; -static u32 irq_no; -static int irqs_at_init = -1; -static u32 sequence; -static int opt_poll; -static int opt_interval = 1; -static int opt_retries = 3; -static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; -static u32 opt_umem_flags; -static int opt_unaligned_chunks; -static int opt_mmap_flags; -static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; -static int opt_timeout = 1000; -static bool opt_need_wakeup = true; -static u32 opt_num_xsks = 1; -static u32 prog_id; -static bool opt_busy_poll; -static bool opt_reduced_cap; -static clockid_t opt_clock = CLOCK_MONOTONIC; -static unsigned long opt_tx_cycle_ns; -static int opt_schpolicy = SCHED_OTHER; -static int opt_schprio = SCHED_PRI__DEFAULT; -static bool opt_tstamp; - -struct vlan_ethhdr { - unsigned char h_dest[6]; - unsigned char h_source[6]; - __be16 h_vlan_proto; - __be16 h_vlan_TCI; - __be16 h_vlan_encapsulated_proto; -}; - -#define PKTGEN_MAGIC 0xbe9be955 -struct pktgen_hdr { - __be32 pgh_magic; - __be32 seq_num; - __be32 tv_sec; - __be32 tv_usec; -}; - -struct xsk_ring_stats { - unsigned long rx_npkts; - unsigned long tx_npkts; - unsigned long rx_dropped_npkts; - unsigned long rx_invalid_npkts; - unsigned long tx_invalid_npkts; - unsigned long rx_full_npkts; - unsigned long rx_fill_empty_npkts; - unsigned long tx_empty_npkts; - unsigned long prev_rx_npkts; - unsigned long prev_tx_npkts; - unsigned long prev_rx_dropped_npkts; - unsigned long prev_rx_invalid_npkts; - unsigned long prev_tx_invalid_npkts; - unsigned long prev_rx_full_npkts; - unsigned long prev_rx_fill_empty_npkts; - unsigned long prev_tx_empty_npkts; -}; - -struct xsk_driver_stats { - unsigned long intrs; - unsigned long prev_intrs; -}; - -struct xsk_app_stats { - unsigned long rx_empty_polls; - unsigned long fill_fail_polls; - unsigned long copy_tx_sendtos; - unsigned long tx_wakeup_sendtos; - unsigned long opt_polls; - unsigned long prev_rx_empty_polls; - unsigned long prev_fill_fail_polls; - unsigned long prev_copy_tx_sendtos; - unsigned long prev_tx_wakeup_sendtos; - unsigned long prev_opt_polls; -}; - -struct xsk_umem_info { - struct xsk_ring_prod fq; - struct xsk_ring_cons cq; - struct xsk_umem *umem; - void *buffer; -}; - -struct xsk_socket_info { - struct xsk_ring_cons rx; - struct xsk_ring_prod tx; - struct xsk_umem_info *umem; - struct xsk_socket *xsk; - struct xsk_ring_stats ring_stats; - struct xsk_app_stats app_stats; - struct xsk_driver_stats drv_stats; - u32 outstanding_tx; -}; - -static const struct clockid_map { - const char *name; - clockid_t clockid; -} clockids_map[] = { - { "REALTIME", CLOCK_REALTIME }, - { "TAI", CLOCK_TAI }, - { "BOOTTIME", CLOCK_BOOTTIME }, - { "MONOTONIC", CLOCK_MONOTONIC }, - { NULL } -}; - -static const struct sched_map { - const char *name; - int policy; -} schmap[] = { - { "OTHER", SCHED_OTHER }, - { "FIFO", SCHED_FIFO }, - { NULL } -}; - -static int num_socks; -struct xsk_socket_info *xsks[MAX_SOCKS]; -int sock; - -static int get_clockid(clockid_t *id, const char *name) -{ - const struct clockid_map *clk; - - for (clk = clockids_map; clk->name; clk++) { - if (strcasecmp(clk->name, name) == 0) { - *id = clk->clockid; - return 0; - } - } - - return -1; -} - -static int get_schpolicy(int *policy, const char *name) -{ - const struct sched_map *sch; - - for (sch = schmap; sch->name; sch++) { - if (strcasecmp(sch->name, name) == 0) { - *policy = sch->policy; - return 0; - } - } - - return -1; -} - -static unsigned long get_nsecs(void) -{ - struct timespec ts; - - clock_gettime(opt_clock, &ts); - return ts.tv_sec * 1000000000UL + ts.tv_nsec; -} - -static void print_benchmark(bool running) -{ - const char *bench_str = "INVALID"; - - if (opt_bench == BENCH_RXDROP) - bench_str = "rxdrop"; - else if (opt_bench == BENCH_TXONLY) - bench_str = "txonly"; - else if (opt_bench == BENCH_L2FWD) - bench_str = "l2fwd"; - - printf("%s:%d %s ", opt_if, opt_queue, bench_str); - if (opt_xdp_flags & XDP_FLAGS_SKB_MODE) - printf("xdp-skb "); - else if (opt_xdp_flags & XDP_FLAGS_DRV_MODE) - printf("xdp-drv "); - else - printf(" "); - - if (opt_poll) - printf("poll() "); - - if (running) { - printf("running..."); - fflush(stdout); - } -} - -static int xsk_get_xdp_stats(int fd, struct xsk_socket_info *xsk) -{ - struct xdp_statistics stats; - socklen_t optlen; - int err; - - optlen = sizeof(stats); - err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen); - if (err) - return err; - - if (optlen == sizeof(struct xdp_statistics)) { - xsk->ring_stats.rx_dropped_npkts = stats.rx_dropped; - xsk->ring_stats.rx_invalid_npkts = stats.rx_invalid_descs; - xsk->ring_stats.tx_invalid_npkts = stats.tx_invalid_descs; - xsk->ring_stats.rx_full_npkts = stats.rx_ring_full; - xsk->ring_stats.rx_fill_empty_npkts = stats.rx_fill_ring_empty_descs; - xsk->ring_stats.tx_empty_npkts = stats.tx_ring_empty_descs; - return 0; - } - - return -EINVAL; -} - -static void dump_app_stats(long dt) -{ - int i; - - for (i = 0; i < num_socks && xsks[i]; i++) { - char *fmt = "%-18s %'-14.0f %'-14lu\n"; - double rx_empty_polls_ps, fill_fail_polls_ps, copy_tx_sendtos_ps, - tx_wakeup_sendtos_ps, opt_polls_ps; - - rx_empty_polls_ps = (xsks[i]->app_stats.rx_empty_polls - - xsks[i]->app_stats.prev_rx_empty_polls) * 1000000000. / dt; - fill_fail_polls_ps = (xsks[i]->app_stats.fill_fail_polls - - xsks[i]->app_stats.prev_fill_fail_polls) * 1000000000. / dt; - copy_tx_sendtos_ps = (xsks[i]->app_stats.copy_tx_sendtos - - xsks[i]->app_stats.prev_copy_tx_sendtos) * 1000000000. / dt; - tx_wakeup_sendtos_ps = (xsks[i]->app_stats.tx_wakeup_sendtos - - xsks[i]->app_stats.prev_tx_wakeup_sendtos) - * 1000000000. / dt; - opt_polls_ps = (xsks[i]->app_stats.opt_polls - - xsks[i]->app_stats.prev_opt_polls) * 1000000000. / dt; - - printf("\n%-18s %-14s %-14s\n", "", "calls/s", "count"); - printf(fmt, "rx empty polls", rx_empty_polls_ps, xsks[i]->app_stats.rx_empty_polls); - printf(fmt, "fill fail polls", fill_fail_polls_ps, - xsks[i]->app_stats.fill_fail_polls); - printf(fmt, "copy tx sendtos", copy_tx_sendtos_ps, - xsks[i]->app_stats.copy_tx_sendtos); - printf(fmt, "tx wakeup sendtos", tx_wakeup_sendtos_ps, - xsks[i]->app_stats.tx_wakeup_sendtos); - printf(fmt, "opt polls", opt_polls_ps, xsks[i]->app_stats.opt_polls); - - xsks[i]->app_stats.prev_rx_empty_polls = xsks[i]->app_stats.rx_empty_polls; - xsks[i]->app_stats.prev_fill_fail_polls = xsks[i]->app_stats.fill_fail_polls; - xsks[i]->app_stats.prev_copy_tx_sendtos = xsks[i]->app_stats.copy_tx_sendtos; - xsks[i]->app_stats.prev_tx_wakeup_sendtos = xsks[i]->app_stats.tx_wakeup_sendtos; - xsks[i]->app_stats.prev_opt_polls = xsks[i]->app_stats.opt_polls; - } - - if (opt_tx_cycle_ns) { - printf("\n%-18s %-10s %-10s %-10s %-10s %-10s\n", - "", "period", "min", "ave", "max", "cycle"); - printf("%-18s %-10lu %-10lu %-10lu %-10lu %-10lu\n", - "Cyclic TX", opt_tx_cycle_ns, tx_cycle_diff_min, - (long)(tx_cycle_diff_ave / tx_cycle_cnt), - tx_cycle_diff_max, tx_cycle_cnt); - } -} - -static bool get_interrupt_number(void) -{ - FILE *f_int_proc; - char line[4096]; - bool found = false; - - f_int_proc = fopen("/proc/interrupts", "r"); - if (f_int_proc == NULL) { - printf("Failed to open /proc/interrupts.\n"); - return found; - } - - while (!feof(f_int_proc) && !found) { - /* Make sure to read a full line at a time */ - if (fgets(line, sizeof(line), f_int_proc) == NULL || - line[strlen(line) - 1] != '\n') { - printf("Error reading from interrupts file\n"); - break; - } - - /* Extract interrupt number from line */ - if (strstr(line, opt_irq_str) != NULL) { - irq_no = atoi(line); - found = true; - break; - } - } - - fclose(f_int_proc); - - return found; -} - -static int get_irqs(void) -{ - char count_path[PATH_MAX]; - int total_intrs = -1; - FILE *f_count_proc; - char line[4096]; - - snprintf(count_path, sizeof(count_path), - "/sys/kernel/irq/%i/per_cpu_count", irq_no); - f_count_proc = fopen(count_path, "r"); - if (f_count_proc == NULL) { - printf("Failed to open %s\n", count_path); - return total_intrs; - } - - if (fgets(line, sizeof(line), f_count_proc) == NULL || - line[strlen(line) - 1] != '\n') { - printf("Error reading from %s\n", count_path); - } else { - static const char com[2] = ","; - char *token; - - total_intrs = 0; - token = strtok(line, com); - while (token != NULL) { - /* sum up interrupts across all cores */ - total_intrs += atoi(token); - token = strtok(NULL, com); - } - } - - fclose(f_count_proc); - - return total_intrs; -} - -static void dump_driver_stats(long dt) -{ - int i; - - for (i = 0; i < num_socks && xsks[i]; i++) { - char *fmt = "%-18s %'-14.0f %'-14lu\n"; - double intrs_ps; - int n_ints = get_irqs(); - - if (n_ints < 0) { - printf("error getting intr info for intr %i\n", irq_no); - return; - } - xsks[i]->drv_stats.intrs = n_ints - irqs_at_init; - - intrs_ps = (xsks[i]->drv_stats.intrs - xsks[i]->drv_stats.prev_intrs) * - 1000000000. / dt; - - printf("\n%-18s %-14s %-14s\n", "", "intrs/s", "count"); - printf(fmt, "irqs", intrs_ps, xsks[i]->drv_stats.intrs); - - xsks[i]->drv_stats.prev_intrs = xsks[i]->drv_stats.intrs; - } -} - -static void dump_stats(void) -{ - unsigned long now = get_nsecs(); - long dt = now - prev_time; - int i; - - prev_time = now; - - for (i = 0; i < num_socks && xsks[i]; i++) { - char *fmt = "%-18s %'-14.0f %'-14lu\n"; - double rx_pps, tx_pps, dropped_pps, rx_invalid_pps, full_pps, fill_empty_pps, - tx_invalid_pps, tx_empty_pps; - - rx_pps = (xsks[i]->ring_stats.rx_npkts - xsks[i]->ring_stats.prev_rx_npkts) * - 1000000000. / dt; - tx_pps = (xsks[i]->ring_stats.tx_npkts - xsks[i]->ring_stats.prev_tx_npkts) * - 1000000000. / dt; - - printf("\n sock%d@", i); - print_benchmark(false); - printf("\n"); - - printf("%-18s %-14s %-14s %-14.2f\n", "", "pps", "pkts", - dt / 1000000000.); - printf(fmt, "rx", rx_pps, xsks[i]->ring_stats.rx_npkts); - printf(fmt, "tx", tx_pps, xsks[i]->ring_stats.tx_npkts); - - xsks[i]->ring_stats.prev_rx_npkts = xsks[i]->ring_stats.rx_npkts; - xsks[i]->ring_stats.prev_tx_npkts = xsks[i]->ring_stats.tx_npkts; - - if (opt_extra_stats) { - if (!xsk_get_xdp_stats(xsk_socket__fd(xsks[i]->xsk), xsks[i])) { - dropped_pps = (xsks[i]->ring_stats.rx_dropped_npkts - - xsks[i]->ring_stats.prev_rx_dropped_npkts) * - 1000000000. / dt; - rx_invalid_pps = (xsks[i]->ring_stats.rx_invalid_npkts - - xsks[i]->ring_stats.prev_rx_invalid_npkts) * - 1000000000. / dt; - tx_invalid_pps = (xsks[i]->ring_stats.tx_invalid_npkts - - xsks[i]->ring_stats.prev_tx_invalid_npkts) * - 1000000000. / dt; - full_pps = (xsks[i]->ring_stats.rx_full_npkts - - xsks[i]->ring_stats.prev_rx_full_npkts) * - 1000000000. / dt; - fill_empty_pps = (xsks[i]->ring_stats.rx_fill_empty_npkts - - xsks[i]->ring_stats.prev_rx_fill_empty_npkts) * - 1000000000. / dt; - tx_empty_pps = (xsks[i]->ring_stats.tx_empty_npkts - - xsks[i]->ring_stats.prev_tx_empty_npkts) * - 1000000000. / dt; - - printf(fmt, "rx dropped", dropped_pps, - xsks[i]->ring_stats.rx_dropped_npkts); - printf(fmt, "rx invalid", rx_invalid_pps, - xsks[i]->ring_stats.rx_invalid_npkts); - printf(fmt, "tx invalid", tx_invalid_pps, - xsks[i]->ring_stats.tx_invalid_npkts); - printf(fmt, "rx queue full", full_pps, - xsks[i]->ring_stats.rx_full_npkts); - printf(fmt, "fill ring empty", fill_empty_pps, - xsks[i]->ring_stats.rx_fill_empty_npkts); - printf(fmt, "tx ring empty", tx_empty_pps, - xsks[i]->ring_stats.tx_empty_npkts); - - xsks[i]->ring_stats.prev_rx_dropped_npkts = - xsks[i]->ring_stats.rx_dropped_npkts; - xsks[i]->ring_stats.prev_rx_invalid_npkts = - xsks[i]->ring_stats.rx_invalid_npkts; - xsks[i]->ring_stats.prev_tx_invalid_npkts = - xsks[i]->ring_stats.tx_invalid_npkts; - xsks[i]->ring_stats.prev_rx_full_npkts = - xsks[i]->ring_stats.rx_full_npkts; - xsks[i]->ring_stats.prev_rx_fill_empty_npkts = - xsks[i]->ring_stats.rx_fill_empty_npkts; - xsks[i]->ring_stats.prev_tx_empty_npkts = - xsks[i]->ring_stats.tx_empty_npkts; - } else { - printf("%-15s\n", "Error retrieving extra stats"); - } - } - } - - if (opt_app_stats) - dump_app_stats(dt); - if (irq_no) - dump_driver_stats(dt); -} - -static bool is_benchmark_done(void) -{ - if (opt_duration > 0) { - unsigned long dt = (get_nsecs() - start_time); - - if (dt >= opt_duration) - benchmark_done = true; - } - return benchmark_done; -} - -static void *poller(void *arg) -{ - (void)arg; - while (!is_benchmark_done()) { - sleep(opt_interval); - dump_stats(); - } - - return NULL; -} - -static void remove_xdp_program(void) -{ - u32 curr_prog_id = 0; - - if (bpf_xdp_query_id(opt_ifindex, opt_xdp_flags, &curr_prog_id)) { - printf("bpf_xdp_query_id failed\n"); - exit(EXIT_FAILURE); - } - - if (prog_id == curr_prog_id) - bpf_xdp_detach(opt_ifindex, opt_xdp_flags, NULL); - else if (!curr_prog_id) - printf("couldn't find a prog id on a given interface\n"); - else - printf("program on interface changed, not removing\n"); -} - -static void int_exit(int sig) -{ - benchmark_done = true; -} - -static void __exit_with_error(int error, const char *file, const char *func, - int line) -{ - fprintf(stderr, "%s:%s:%i: errno: %d/\"%s\"\n", file, func, - line, error, strerror(error)); - - if (opt_num_xsks > 1) - remove_xdp_program(); - exit(EXIT_FAILURE); -} - -#define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__) - -static void xdpsock_cleanup(void) -{ - struct xsk_umem *umem = xsks[0]->umem->umem; - int i, cmd = CLOSE_CONN; - - dump_stats(); - for (i = 0; i < num_socks; i++) - xsk_socket__delete(xsks[i]->xsk); - (void)xsk_umem__delete(umem); - - if (opt_reduced_cap) { - if (write(sock, &cmd, sizeof(int)) < 0) - exit_with_error(errno); - } - - if (opt_num_xsks > 1) - remove_xdp_program(); -} - -static void swap_mac_addresses(void *data) -{ - struct ether_header *eth = (struct ether_header *)data; - struct ether_addr *src_addr = (struct ether_addr *)ð->ether_shost; - struct ether_addr *dst_addr = (struct ether_addr *)ð->ether_dhost; - struct ether_addr tmp; - - tmp = *src_addr; - *src_addr = *dst_addr; - *dst_addr = tmp; -} - -static void hex_dump(void *pkt, size_t length, u64 addr) -{ - const unsigned char *address = (unsigned char *)pkt; - const unsigned char *line = address; - size_t line_size = 32; - unsigned char c; - char buf[32]; - int i = 0; - - if (!DEBUG_HEXDUMP) - return; - - sprintf(buf, "addr=%llu", addr); - printf("length = %zu\n", length); - printf("%s | ", buf); - while (length-- > 0) { - printf("%02X ", *address++); - if (!(++i % line_size) || (length == 0 && i % line_size)) { - if (length == 0) { - while (i++ % line_size) - printf("__ "); - } - printf(" | "); /* right close */ - while (line < address) { - c = *line++; - printf("%c", (c < 33 || c == 255) ? 0x2E : c); - } - printf("\n"); - if (length > 0) - printf("%s | ", buf); - } - } - printf("\n"); -} - -static void *memset32_htonl(void *dest, u32 val, u32 size) -{ - u32 *ptr = (u32 *)dest; - int i; - - val = htonl(val); - - for (i = 0; i < (size & (~0x3)); i += 4) - ptr[i >> 2] = val; - - for (; i < size; i++) - ((char *)dest)[i] = ((char *)&val)[i & 3]; - - return dest; -} - -/* - * This function code has been taken from - * Linux kernel lib/checksum.c - */ -static inline unsigned short from32to16(unsigned int x) -{ - /* add up 16-bit and 16-bit for 16+c bit */ - x = (x & 0xffff) + (x >> 16); - /* add up carry.. */ - x = (x & 0xffff) + (x >> 16); - return x; -} - -/* - * This function code has been taken from - * Linux kernel lib/checksum.c - */ -static unsigned int do_csum(const unsigned char *buff, int len) -{ - unsigned int result = 0; - int odd; - - if (len <= 0) - goto out; - odd = 1 & (unsigned long)buff; - if (odd) { -#ifdef __LITTLE_ENDIAN - result += (*buff << 8); -#else - result = *buff; -#endif - len--; - buff++; - } - if (len >= 2) { - if (2 & (unsigned long)buff) { - result += *(unsigned short *)buff; - len -= 2; - buff += 2; - } - if (len >= 4) { - const unsigned char *end = buff + - ((unsigned int)len & ~3); - unsigned int carry = 0; - - do { - unsigned int w = *(unsigned int *)buff; - - buff += 4; - result += carry; - result += w; - carry = (w > result); - } while (buff < end); - result += carry; - result = (result & 0xffff) + (result >> 16); - } - if (len & 2) { - result += *(unsigned short *)buff; - buff += 2; - } - } - if (len & 1) -#ifdef __LITTLE_ENDIAN - result += *buff; -#else - result += (*buff << 8); -#endif - result = from32to16(result); - if (odd) - result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); -out: - return result; -} - -/* - * This is a version of ip_compute_csum() optimized for IP headers, - * which always checksum on 4 octet boundaries. - * This function code has been taken from - * Linux kernel lib/checksum.c - */ -static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) -{ - return (__sum16)~do_csum(iph, ihl * 4); -} - -/* - * Fold a partial checksum - * This function code has been taken from - * Linux kernel include/asm-generic/checksum.h - */ -static inline __sum16 csum_fold(__wsum csum) -{ - u32 sum = (u32)csum; - - sum = (sum & 0xffff) + (sum >> 16); - sum = (sum & 0xffff) + (sum >> 16); - return (__sum16)~sum; -} - -/* - * This function code has been taken from - * Linux kernel lib/checksum.c - */ -static inline u32 from64to32(u64 x) -{ - /* add up 32-bit and 32-bit for 32+c bit */ - x = (x & 0xffffffff) + (x >> 32); - /* add up carry.. */ - x = (x & 0xffffffff) + (x >> 32); - return (u32)x; -} - -__wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, - __u32 len, __u8 proto, __wsum sum); - -/* - * This function code has been taken from - * Linux kernel lib/checksum.c - */ -__wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, - __u32 len, __u8 proto, __wsum sum) -{ - unsigned long long s = (u32)sum; - - s += (u32)saddr; - s += (u32)daddr; -#ifdef __BIG_ENDIAN__ - s += proto + len; -#else - s += (proto + len) << 8; -#endif - return (__wsum)from64to32(s); -} - -/* - * This function has been taken from - * Linux kernel include/asm-generic/checksum.h - */ -static inline __sum16 -csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, - __u8 proto, __wsum sum) -{ - return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); -} - -static inline u16 udp_csum(u32 saddr, u32 daddr, u32 len, - u8 proto, u16 *udp_pkt) -{ - u32 csum = 0; - u32 cnt = 0; - - /* udp hdr and data */ - for (; cnt < len; cnt += 2) - csum += udp_pkt[cnt >> 1]; - - return csum_tcpudp_magic(saddr, daddr, len, proto, csum); -} - -#define ETH_FCS_SIZE 4 - -#define ETH_HDR_SIZE (opt_vlan_tag ? sizeof(struct vlan_ethhdr) : \ - sizeof(struct ethhdr)) -#define PKTGEN_HDR_SIZE (opt_tstamp ? sizeof(struct pktgen_hdr) : 0) -#define PKT_HDR_SIZE (ETH_HDR_SIZE + sizeof(struct iphdr) + \ - sizeof(struct udphdr) + PKTGEN_HDR_SIZE) -#define PKTGEN_HDR_OFFSET (ETH_HDR_SIZE + sizeof(struct iphdr) + \ - sizeof(struct udphdr)) -#define PKTGEN_SIZE_MIN (PKTGEN_HDR_OFFSET + sizeof(struct pktgen_hdr) + \ - ETH_FCS_SIZE) - -#define PKT_SIZE (opt_pkt_size - ETH_FCS_SIZE) -#define IP_PKT_SIZE (PKT_SIZE - ETH_HDR_SIZE) -#define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr)) -#define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - \ - (sizeof(struct udphdr) + PKTGEN_HDR_SIZE)) - -static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE]; - -static void gen_eth_hdr_data(void) -{ - struct pktgen_hdr *pktgen_hdr; - struct udphdr *udp_hdr; - struct iphdr *ip_hdr; - - if (opt_vlan_tag) { - struct vlan_ethhdr *veth_hdr = (struct vlan_ethhdr *)pkt_data; - u16 vlan_tci = 0; - - udp_hdr = (struct udphdr *)(pkt_data + - sizeof(struct vlan_ethhdr) + - sizeof(struct iphdr)); - ip_hdr = (struct iphdr *)(pkt_data + - sizeof(struct vlan_ethhdr)); - pktgen_hdr = (struct pktgen_hdr *)(pkt_data + - sizeof(struct vlan_ethhdr) + - sizeof(struct iphdr) + - sizeof(struct udphdr)); - /* ethernet & VLAN header */ - memcpy(veth_hdr->h_dest, &opt_txdmac, ETH_ALEN); - memcpy(veth_hdr->h_source, &opt_txsmac, ETH_ALEN); - veth_hdr->h_vlan_proto = htons(ETH_P_8021Q); - vlan_tci = opt_pkt_vlan_id & VLAN_VID_MASK; - vlan_tci |= (opt_pkt_vlan_pri << VLAN_PRIO_SHIFT) & VLAN_PRIO_MASK; - veth_hdr->h_vlan_TCI = htons(vlan_tci); - veth_hdr->h_vlan_encapsulated_proto = htons(ETH_P_IP); - } else { - struct ethhdr *eth_hdr = (struct ethhdr *)pkt_data; - - udp_hdr = (struct udphdr *)(pkt_data + - sizeof(struct ethhdr) + - sizeof(struct iphdr)); - ip_hdr = (struct iphdr *)(pkt_data + - sizeof(struct ethhdr)); - pktgen_hdr = (struct pktgen_hdr *)(pkt_data + - sizeof(struct ethhdr) + - sizeof(struct iphdr) + - sizeof(struct udphdr)); - /* ethernet header */ - memcpy(eth_hdr->h_dest, &opt_txdmac, ETH_ALEN); - memcpy(eth_hdr->h_source, &opt_txsmac, ETH_ALEN); - eth_hdr->h_proto = htons(ETH_P_IP); - } - - - /* IP header */ - ip_hdr->version = IPVERSION; - ip_hdr->ihl = 0x5; /* 20 byte header */ - ip_hdr->tos = 0x0; - ip_hdr->tot_len = htons(IP_PKT_SIZE); - ip_hdr->id = 0; - ip_hdr->frag_off = 0; - ip_hdr->ttl = IPDEFTTL; - ip_hdr->protocol = IPPROTO_UDP; - ip_hdr->saddr = htonl(0x0a0a0a10); - ip_hdr->daddr = htonl(0x0a0a0a20); - - /* IP header checksum */ - ip_hdr->check = 0; - ip_hdr->check = ip_fast_csum((const void *)ip_hdr, ip_hdr->ihl); - - /* UDP header */ - udp_hdr->source = htons(0x1000); - udp_hdr->dest = htons(0x1000); - udp_hdr->len = htons(UDP_PKT_SIZE); - - if (opt_tstamp) - pktgen_hdr->pgh_magic = htonl(PKTGEN_MAGIC); - - /* UDP data */ - memset32_htonl(pkt_data + PKT_HDR_SIZE, opt_pkt_fill_pattern, - UDP_PKT_DATA_SIZE); - - /* UDP header checksum */ - udp_hdr->check = 0; - udp_hdr->check = udp_csum(ip_hdr->saddr, ip_hdr->daddr, UDP_PKT_SIZE, - IPPROTO_UDP, (u16 *)udp_hdr); -} - -static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr) -{ - memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data, - PKT_SIZE); -} - -static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size) -{ - struct xsk_umem_info *umem; - struct xsk_umem_config cfg = { - /* We recommend that you set the fill ring size >= HW RX ring size + - * AF_XDP RX ring size. Make sure you fill up the fill ring - * with buffers at regular intervals, and you will with this setting - * avoid allocation failures in the driver. These are usually quite - * expensive since drivers have not been written to assume that - * allocation failures are common. For regular sockets, kernel - * allocated memory is used that only runs out in OOM situations - * that should be rare. - */ - .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, - .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, - .frame_size = opt_xsk_frame_size, - .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM, - .flags = opt_umem_flags - }; - int ret; - - umem = calloc(1, sizeof(*umem)); - if (!umem) - exit_with_error(errno); - - ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq, - &cfg); - if (ret) - exit_with_error(-ret); - - umem->buffer = buffer; - return umem; -} - -static void xsk_populate_fill_ring(struct xsk_umem_info *umem) -{ - int ret, i; - u32 idx; - - ret = xsk_ring_prod__reserve(&umem->fq, - XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, &idx); - if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS * 2) - exit_with_error(-ret); - for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS * 2; i++) - *xsk_ring_prod__fill_addr(&umem->fq, idx++) = - i * opt_xsk_frame_size; - xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS * 2); -} - -static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem, - bool rx, bool tx) -{ - struct xsk_socket_config cfg; - struct xsk_socket_info *xsk; - struct xsk_ring_cons *rxr; - struct xsk_ring_prod *txr; - int ret; - - xsk = calloc(1, sizeof(*xsk)); - if (!xsk) - exit_with_error(errno); - - xsk->umem = umem; - cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; - cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; - if (opt_num_xsks > 1 || opt_reduced_cap) - cfg.libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD; - else - cfg.libbpf_flags = 0; - cfg.xdp_flags = opt_xdp_flags; - cfg.bind_flags = opt_xdp_bind_flags; - - rxr = rx ? &xsk->rx : NULL; - txr = tx ? &xsk->tx : NULL; - ret = xsk_socket__create(&xsk->xsk, opt_if, opt_queue, umem->umem, - rxr, txr, &cfg); - if (ret) - exit_with_error(-ret); - - ret = bpf_xdp_query_id(opt_ifindex, opt_xdp_flags, &prog_id); - if (ret) - exit_with_error(-ret); - - xsk->app_stats.rx_empty_polls = 0; - xsk->app_stats.fill_fail_polls = 0; - xsk->app_stats.copy_tx_sendtos = 0; - xsk->app_stats.tx_wakeup_sendtos = 0; - xsk->app_stats.opt_polls = 0; - xsk->app_stats.prev_rx_empty_polls = 0; - xsk->app_stats.prev_fill_fail_polls = 0; - xsk->app_stats.prev_copy_tx_sendtos = 0; - xsk->app_stats.prev_tx_wakeup_sendtos = 0; - xsk->app_stats.prev_opt_polls = 0; - - return xsk; -} - -static struct option long_options[] = { - {"rxdrop", no_argument, 0, 'r'}, - {"txonly", no_argument, 0, 't'}, - {"l2fwd", no_argument, 0, 'l'}, - {"interface", required_argument, 0, 'i'}, - {"queue", required_argument, 0, 'q'}, - {"poll", no_argument, 0, 'p'}, - {"xdp-skb", no_argument, 0, 'S'}, - {"xdp-native", no_argument, 0, 'N'}, - {"interval", required_argument, 0, 'n'}, - {"retries", required_argument, 0, 'O'}, - {"zero-copy", no_argument, 0, 'z'}, - {"copy", no_argument, 0, 'c'}, - {"frame-size", required_argument, 0, 'f'}, - {"no-need-wakeup", no_argument, 0, 'm'}, - {"unaligned", no_argument, 0, 'u'}, - {"shared-umem", no_argument, 0, 'M'}, - {"force", no_argument, 0, 'F'}, - {"duration", required_argument, 0, 'd'}, - {"clock", required_argument, 0, 'w'}, - {"batch-size", required_argument, 0, 'b'}, - {"tx-pkt-count", required_argument, 0, 'C'}, - {"tx-pkt-size", required_argument, 0, 's'}, - {"tx-pkt-pattern", required_argument, 0, 'P'}, - {"tx-vlan", no_argument, 0, 'V'}, - {"tx-vlan-id", required_argument, 0, 'J'}, - {"tx-vlan-pri", required_argument, 0, 'K'}, - {"tx-dmac", required_argument, 0, 'G'}, - {"tx-smac", required_argument, 0, 'H'}, - {"tx-cycle", required_argument, 0, 'T'}, - {"tstamp", no_argument, 0, 'y'}, - {"policy", required_argument, 0, 'W'}, - {"schpri", required_argument, 0, 'U'}, - {"extra-stats", no_argument, 0, 'x'}, - {"quiet", no_argument, 0, 'Q'}, - {"app-stats", no_argument, 0, 'a'}, - {"irq-string", no_argument, 0, 'I'}, - {"busy-poll", no_argument, 0, 'B'}, - {"reduce-cap", no_argument, 0, 'R'}, - {0, 0, 0, 0} -}; - -static void usage(const char *prog) -{ - const char *str = - " Usage: %s [OPTIONS]\n" - " Options:\n" - " -r, --rxdrop Discard all incoming packets (default)\n" - " -t, --txonly Only send packets\n" - " -l, --l2fwd MAC swap L2 forwarding\n" - " -i, --interface=n Run on interface n\n" - " -q, --queue=n Use queue n (default 0)\n" - " -p, --poll Use poll syscall\n" - " -S, --xdp-skb=n Use XDP skb-mod\n" - " -N, --xdp-native=n Enforce XDP native mode\n" - " -n, --interval=n Specify statistics update interval (default 1 sec).\n" - " -O, --retries=n Specify time-out retries (1s interval) attempt (default 3).\n" - " -z, --zero-copy Force zero-copy mode.\n" - " -c, --copy Force copy mode.\n" - " -m, --no-need-wakeup Turn off use of driver need wakeup flag.\n" - " -f, --frame-size=n Set the frame size (must be a power of two in aligned mode, default is %d).\n" - " -u, --unaligned Enable unaligned chunk placement\n" - " -M, --shared-umem Enable XDP_SHARED_UMEM (cannot be used with -R)\n" - " -F, --force Force loading the XDP prog\n" - " -d, --duration=n Duration in secs to run command.\n" - " Default: forever.\n" - " -w, --clock=CLOCK Clock NAME (default MONOTONIC).\n" - " -b, --batch-size=n Batch size for sending or receiving\n" - " packets. Default: %d\n" - " -C, --tx-pkt-count=n Number of packets to send.\n" - " Default: Continuous packets.\n" - " -s, --tx-pkt-size=n Transmit packet size.\n" - " (Default: %d bytes)\n" - " Min size: %d, Max size %d.\n" - " -P, --tx-pkt-pattern=nPacket fill pattern. Default: 0x%x\n" - " -V, --tx-vlan Send VLAN tagged packets (For -t|--txonly)\n" - " -J, --tx-vlan-id=n Tx VLAN ID [1-4095]. Default: %d (For -V|--tx-vlan)\n" - " -K, --tx-vlan-pri=n Tx VLAN Priority [0-7]. Default: %d (For -V|--tx-vlan)\n" - " -G, --tx-dmac= Dest MAC addr of TX frame in aa:bb:cc:dd:ee:ff format (For -V|--tx-vlan)\n" - " -H, --tx-smac= Src MAC addr of TX frame in aa:bb:cc:dd:ee:ff format (For -V|--tx-vlan)\n" - " -T, --tx-cycle=n Tx cycle time in micro-seconds (For -t|--txonly).\n" - " -y, --tstamp Add time-stamp to packet (For -t|--txonly).\n" - " -W, --policy=POLICY Schedule policy. Default: SCHED_OTHER\n" - " -U, --schpri=n Schedule priority. Default: %d\n" - " -x, --extra-stats Display extra statistics.\n" - " -Q, --quiet Do not display any stats.\n" - " -a, --app-stats Display application (syscall) statistics.\n" - " -I, --irq-string Display driver interrupt statistics for interface associated with irq-string.\n" - " -B, --busy-poll Busy poll.\n" - " -R, --reduce-cap Use reduced capabilities (cannot be used with -M)\n" - "\n"; - fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, - opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, - XSK_UMEM__DEFAULT_FRAME_SIZE, opt_pkt_fill_pattern, - VLAN_VID__DEFAULT, VLAN_PRI__DEFAULT, - SCHED_PRI__DEFAULT); - - exit(EXIT_FAILURE); -} - -static void parse_command_line(int argc, char **argv) -{ - int option_index, c; - - opterr = 0; - - for (;;) { - c = getopt_long(argc, argv, - "Frtli:q:pSNn:w:O:czf:muMd:b:C:s:P:VJ:K:G:H:T:yW:U:xQaI:BR", - long_options, &option_index); - if (c == -1) - break; - - switch (c) { - case 'r': - opt_bench = BENCH_RXDROP; - break; - case 't': - opt_bench = BENCH_TXONLY; - break; - case 'l': - opt_bench = BENCH_L2FWD; - break; - case 'i': - opt_if = optarg; - break; - case 'q': - opt_queue = atoi(optarg); - break; - case 'p': - opt_poll = 1; - break; - case 'S': - opt_xdp_flags |= XDP_FLAGS_SKB_MODE; - opt_xdp_bind_flags |= XDP_COPY; - break; - case 'N': - /* default, set below */ - break; - case 'n': - opt_interval = atoi(optarg); - break; - case 'w': - if (get_clockid(&opt_clock, optarg)) { - fprintf(stderr, - "ERROR: Invalid clock %s. Default to CLOCK_MONOTONIC.\n", - optarg); - opt_clock = CLOCK_MONOTONIC; - } - break; - case 'O': - opt_retries = atoi(optarg); - break; - case 'z': - opt_xdp_bind_flags |= XDP_ZEROCOPY; - break; - case 'c': - opt_xdp_bind_flags |= XDP_COPY; - break; - case 'u': - opt_umem_flags |= XDP_UMEM_UNALIGNED_CHUNK_FLAG; - opt_unaligned_chunks = 1; - opt_mmap_flags = MAP_HUGETLB; - break; - case 'F': - opt_xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; - break; - case 'f': - opt_xsk_frame_size = atoi(optarg); - break; - case 'm': - opt_need_wakeup = false; - opt_xdp_bind_flags &= ~XDP_USE_NEED_WAKEUP; - break; - case 'M': - opt_num_xsks = MAX_SOCKS; - break; - case 'd': - opt_duration = atoi(optarg); - opt_duration *= 1000000000; - break; - case 'b': - opt_batch_size = atoi(optarg); - break; - case 'C': - opt_pkt_count = atoi(optarg); - break; - case 's': - opt_pkt_size = atoi(optarg); - if (opt_pkt_size > (XSK_UMEM__DEFAULT_FRAME_SIZE) || - opt_pkt_size < MIN_PKT_SIZE) { - fprintf(stderr, - "ERROR: Invalid frame size %d\n", - opt_pkt_size); - usage(basename(argv[0])); - } - break; - case 'P': - opt_pkt_fill_pattern = strtol(optarg, NULL, 16); - break; - case 'V': - opt_vlan_tag = true; - break; - case 'J': - opt_pkt_vlan_id = atoi(optarg); - break; - case 'K': - opt_pkt_vlan_pri = atoi(optarg); - break; - case 'G': - if (!ether_aton_r(optarg, - (struct ether_addr *)&opt_txdmac)) { - fprintf(stderr, "Invalid dmac address:%s\n", - optarg); - usage(basename(argv[0])); - } - break; - case 'H': - if (!ether_aton_r(optarg, - (struct ether_addr *)&opt_txsmac)) { - fprintf(stderr, "Invalid smac address:%s\n", - optarg); - usage(basename(argv[0])); - } - break; - case 'T': - opt_tx_cycle_ns = atoi(optarg); - opt_tx_cycle_ns *= NSEC_PER_USEC; - break; - case 'y': - opt_tstamp = 1; - break; - case 'W': - if (get_schpolicy(&opt_schpolicy, optarg)) { - fprintf(stderr, - "ERROR: Invalid policy %s. Default to SCHED_OTHER.\n", - optarg); - opt_schpolicy = SCHED_OTHER; - } - break; - case 'U': - opt_schprio = atoi(optarg); - break; - case 'x': - opt_extra_stats = 1; - break; - case 'Q': - opt_quiet = 1; - break; - case 'a': - opt_app_stats = 1; - break; - case 'I': - opt_irq_str = optarg; - if (get_interrupt_number()) - irqs_at_init = get_irqs(); - if (irqs_at_init < 0) { - fprintf(stderr, "ERROR: Failed to get irqs for %s\n", opt_irq_str); - usage(basename(argv[0])); - } - break; - case 'B': - opt_busy_poll = 1; - break; - case 'R': - opt_reduced_cap = true; - break; - default: - usage(basename(argv[0])); - } - } - - if (!(opt_xdp_flags & XDP_FLAGS_SKB_MODE)) - opt_xdp_flags |= XDP_FLAGS_DRV_MODE; - - opt_ifindex = if_nametoindex(opt_if); - if (!opt_ifindex) { - fprintf(stderr, "ERROR: interface \"%s\" does not exist\n", - opt_if); - usage(basename(argv[0])); - } - - if ((opt_xsk_frame_size & (opt_xsk_frame_size - 1)) && - !opt_unaligned_chunks) { - fprintf(stderr, "--frame-size=%d is not a power of two\n", - opt_xsk_frame_size); - usage(basename(argv[0])); - } - - if (opt_reduced_cap && opt_num_xsks > 1) { - fprintf(stderr, "ERROR: -M and -R cannot be used together\n"); - usage(basename(argv[0])); - } -} - -static void kick_tx(struct xsk_socket_info *xsk) -{ - int ret; - - ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); - if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN || - errno == EBUSY || errno == ENETDOWN) - return; - exit_with_error(errno); -} - -static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk) -{ - struct xsk_umem_info *umem = xsk->umem; - u32 idx_cq = 0, idx_fq = 0; - unsigned int rcvd; - size_t ndescs; - - if (!xsk->outstanding_tx) - return; - - /* In copy mode, Tx is driven by a syscall so we need to use e.g. sendto() to - * really send the packets. In zero-copy mode we do not have to do this, since Tx - * is driven by the NAPI loop. So as an optimization, we do not have to call - * sendto() all the time in zero-copy mode for l2fwd. - */ - if (opt_xdp_bind_flags & XDP_COPY) { - xsk->app_stats.copy_tx_sendtos++; - kick_tx(xsk); - } - - ndescs = (xsk->outstanding_tx > opt_batch_size) ? opt_batch_size : - xsk->outstanding_tx; - - /* re-add completed Tx buffers */ - rcvd = xsk_ring_cons__peek(&umem->cq, ndescs, &idx_cq); - if (rcvd > 0) { - unsigned int i; - int ret; - - ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); - while (ret != rcvd) { - if (ret < 0) - exit_with_error(-ret); - if (opt_busy_poll || xsk_ring_prod__needs_wakeup(&umem->fq)) { - xsk->app_stats.fill_fail_polls++; - recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, - NULL); - } - ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); - } - - for (i = 0; i < rcvd; i++) - *xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) = - *xsk_ring_cons__comp_addr(&umem->cq, idx_cq++); - - xsk_ring_prod__submit(&xsk->umem->fq, rcvd); - xsk_ring_cons__release(&xsk->umem->cq, rcvd); - xsk->outstanding_tx -= rcvd; - } -} - -static inline void complete_tx_only(struct xsk_socket_info *xsk, - int batch_size) -{ - unsigned int rcvd; - u32 idx; - - if (!xsk->outstanding_tx) - return; - - if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) { - xsk->app_stats.tx_wakeup_sendtos++; - kick_tx(xsk); - } - - rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); - if (rcvd > 0) { - xsk_ring_cons__release(&xsk->umem->cq, rcvd); - xsk->outstanding_tx -= rcvd; - } -} - -static void rx_drop(struct xsk_socket_info *xsk) -{ - unsigned int rcvd, i; - u32 idx_rx = 0, idx_fq = 0; - int ret; - - rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); - if (!rcvd) { - if (opt_busy_poll || xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { - xsk->app_stats.rx_empty_polls++; - recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); - } - return; - } - - ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); - while (ret != rcvd) { - if (ret < 0) - exit_with_error(-ret); - if (opt_busy_poll || xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { - xsk->app_stats.fill_fail_polls++; - recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); - } - ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); - } - - for (i = 0; i < rcvd; i++) { - u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; - u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; - u64 orig = xsk_umem__extract_addr(addr); - - addr = xsk_umem__add_offset_to_addr(addr); - char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); - - hex_dump(pkt, len, addr); - *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig; - } - - xsk_ring_prod__submit(&xsk->umem->fq, rcvd); - xsk_ring_cons__release(&xsk->rx, rcvd); - xsk->ring_stats.rx_npkts += rcvd; -} - -static void rx_drop_all(void) -{ - struct pollfd fds[MAX_SOCKS] = {}; - int i, ret; - - for (i = 0; i < num_socks; i++) { - fds[i].fd = xsk_socket__fd(xsks[i]->xsk); - fds[i].events = POLLIN; - } - - for (;;) { - if (opt_poll) { - for (i = 0; i < num_socks; i++) - xsks[i]->app_stats.opt_polls++; - ret = poll(fds, num_socks, opt_timeout); - if (ret <= 0) - continue; - } - - for (i = 0; i < num_socks; i++) - rx_drop(xsks[i]); - - if (benchmark_done) - break; - } -} - -static int tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, - int batch_size, unsigned long tx_ns) -{ - u32 idx, tv_sec, tv_usec; - unsigned int i; - - while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) < - batch_size) { - complete_tx_only(xsk, batch_size); - if (benchmark_done) - return 0; - } - - if (opt_tstamp) { - tv_sec = (u32)(tx_ns / NSEC_PER_SEC); - tv_usec = (u32)((tx_ns % NSEC_PER_SEC) / 1000); - } - - for (i = 0; i < batch_size; i++) { - struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, - idx + i); - tx_desc->addr = (*frame_nb + i) * opt_xsk_frame_size; - tx_desc->len = PKT_SIZE; - - if (opt_tstamp) { - struct pktgen_hdr *pktgen_hdr; - u64 addr = tx_desc->addr; - char *pkt; - - pkt = xsk_umem__get_data(xsk->umem->buffer, addr); - pktgen_hdr = (struct pktgen_hdr *)(pkt + PKTGEN_HDR_OFFSET); - - pktgen_hdr->seq_num = htonl(sequence++); - pktgen_hdr->tv_sec = htonl(tv_sec); - pktgen_hdr->tv_usec = htonl(tv_usec); - - hex_dump(pkt, PKT_SIZE, addr); - } - } - - xsk_ring_prod__submit(&xsk->tx, batch_size); - xsk->ring_stats.tx_npkts += batch_size; - xsk->outstanding_tx += batch_size; - *frame_nb += batch_size; - *frame_nb %= NUM_FRAMES; - complete_tx_only(xsk, batch_size); - - return batch_size; -} - -static inline int get_batch_size(int pkt_cnt) -{ - if (!opt_pkt_count) - return opt_batch_size; - - if (pkt_cnt + opt_batch_size <= opt_pkt_count) - return opt_batch_size; - - return opt_pkt_count - pkt_cnt; -} - -static void complete_tx_only_all(void) -{ - bool pending; - int i; - - do { - pending = false; - for (i = 0; i < num_socks; i++) { - if (xsks[i]->outstanding_tx) { - complete_tx_only(xsks[i], opt_batch_size); - pending = !!xsks[i]->outstanding_tx; - } - } - sleep(1); - } while (pending && opt_retries-- > 0); -} - -static void tx_only_all(void) -{ - struct pollfd fds[MAX_SOCKS] = {}; - u32 frame_nb[MAX_SOCKS] = {}; - unsigned long next_tx_ns = 0; - int pkt_cnt = 0; - int i, ret; - - if (opt_poll && opt_tx_cycle_ns) { - fprintf(stderr, - "Error: --poll and --tx-cycles are both set\n"); - return; - } - - for (i = 0; i < num_socks; i++) { - fds[0].fd = xsk_socket__fd(xsks[i]->xsk); - fds[0].events = POLLOUT; - } - - if (opt_tx_cycle_ns) { - /* Align Tx time to micro-second boundary */ - next_tx_ns = (get_nsecs() / NSEC_PER_USEC + 1) * - NSEC_PER_USEC; - next_tx_ns += opt_tx_cycle_ns; - - /* Initialize periodic Tx scheduling variance */ - tx_cycle_diff_min = 1000000000; - tx_cycle_diff_max = 0; - tx_cycle_diff_ave = 0.0; - } - - while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) { - int batch_size = get_batch_size(pkt_cnt); - unsigned long tx_ns = 0; - struct timespec next; - int tx_cnt = 0; - long diff; - int err; - - if (opt_poll) { - for (i = 0; i < num_socks; i++) - xsks[i]->app_stats.opt_polls++; - ret = poll(fds, num_socks, opt_timeout); - if (ret <= 0) - continue; - - if (!(fds[0].revents & POLLOUT)) - continue; - } - - if (opt_tx_cycle_ns) { - next.tv_sec = next_tx_ns / NSEC_PER_SEC; - next.tv_nsec = next_tx_ns % NSEC_PER_SEC; - err = clock_nanosleep(opt_clock, TIMER_ABSTIME, &next, NULL); - if (err) { - if (err != EINTR) - fprintf(stderr, - "clock_nanosleep failed. Err:%d errno:%d\n", - err, errno); - break; - } - - /* Measure periodic Tx scheduling variance */ - tx_ns = get_nsecs(); - diff = tx_ns - next_tx_ns; - if (diff < tx_cycle_diff_min) - tx_cycle_diff_min = diff; - - if (diff > tx_cycle_diff_max) - tx_cycle_diff_max = diff; - - tx_cycle_diff_ave += (double)diff; - tx_cycle_cnt++; - } else if (opt_tstamp) { - tx_ns = get_nsecs(); - } - - for (i = 0; i < num_socks; i++) - tx_cnt += tx_only(xsks[i], &frame_nb[i], batch_size, tx_ns); - - pkt_cnt += tx_cnt; - - if (benchmark_done) - break; - - if (opt_tx_cycle_ns) - next_tx_ns += opt_tx_cycle_ns; - } - - if (opt_pkt_count) - complete_tx_only_all(); -} - -static void l2fwd(struct xsk_socket_info *xsk) -{ - unsigned int rcvd, i; - u32 idx_rx = 0, idx_tx = 0; - int ret; - - complete_tx_l2fwd(xsk); - - rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); - if (!rcvd) { - if (opt_busy_poll || xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { - xsk->app_stats.rx_empty_polls++; - recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); - } - return; - } - xsk->ring_stats.rx_npkts += rcvd; - - ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); - while (ret != rcvd) { - if (ret < 0) - exit_with_error(-ret); - complete_tx_l2fwd(xsk); - if (opt_busy_poll || xsk_ring_prod__needs_wakeup(&xsk->tx)) { - xsk->app_stats.tx_wakeup_sendtos++; - kick_tx(xsk); - } - ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); - } - - for (i = 0; i < rcvd; i++) { - u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; - u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; - u64 orig = addr; - - addr = xsk_umem__add_offset_to_addr(addr); - char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); - - swap_mac_addresses(pkt); - - hex_dump(pkt, len, addr); - xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = orig; - xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len; - } - - xsk_ring_prod__submit(&xsk->tx, rcvd); - xsk_ring_cons__release(&xsk->rx, rcvd); - - xsk->ring_stats.tx_npkts += rcvd; - xsk->outstanding_tx += rcvd; -} - -static void l2fwd_all(void) -{ - struct pollfd fds[MAX_SOCKS] = {}; - int i, ret; - - for (;;) { - if (opt_poll) { - for (i = 0; i < num_socks; i++) { - fds[i].fd = xsk_socket__fd(xsks[i]->xsk); - fds[i].events = POLLOUT | POLLIN; - xsks[i]->app_stats.opt_polls++; - } - ret = poll(fds, num_socks, opt_timeout); - if (ret <= 0) - continue; - } - - for (i = 0; i < num_socks; i++) - l2fwd(xsks[i]); - - if (benchmark_done) - break; - } -} - -static void load_xdp_program(char **argv, struct bpf_object **obj) -{ - struct bpf_prog_load_attr prog_load_attr = { - .prog_type = BPF_PROG_TYPE_XDP, - }; - char xdp_filename[256]; - int prog_fd; - - snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv[0]); - prog_load_attr.file = xdp_filename; - - if (bpf_prog_load_xattr(&prog_load_attr, obj, &prog_fd)) - exit(EXIT_FAILURE); - if (prog_fd < 0) { - fprintf(stderr, "ERROR: no program found: %s\n", - strerror(prog_fd)); - exit(EXIT_FAILURE); - } - - if (bpf_xdp_attach(opt_ifindex, prog_fd, opt_xdp_flags, NULL) < 0) { - fprintf(stderr, "ERROR: link set xdp fd failed\n"); - exit(EXIT_FAILURE); - } -} - -static void enter_xsks_into_map(struct bpf_object *obj) -{ - struct bpf_map *map; - int i, xsks_map; - - map = bpf_object__find_map_by_name(obj, "xsks_map"); - xsks_map = bpf_map__fd(map); - if (xsks_map < 0) { - fprintf(stderr, "ERROR: no xsks map found: %s\n", - strerror(xsks_map)); - exit(EXIT_FAILURE); - } - - for (i = 0; i < num_socks; i++) { - int fd = xsk_socket__fd(xsks[i]->xsk); - int key, ret; - - key = i; - ret = bpf_map_update_elem(xsks_map, &key, &fd, 0); - if (ret) { - fprintf(stderr, "ERROR: bpf_map_update_elem %d\n", i); - exit(EXIT_FAILURE); - } - } -} - -static void apply_setsockopt(struct xsk_socket_info *xsk) -{ - int sock_opt; - - if (!opt_busy_poll) - return; - - sock_opt = 1; - if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_PREFER_BUSY_POLL, - (void *)&sock_opt, sizeof(sock_opt)) < 0) - exit_with_error(errno); - - sock_opt = 20; - if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL, - (void *)&sock_opt, sizeof(sock_opt)) < 0) - exit_with_error(errno); - - sock_opt = opt_batch_size; - if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL_BUDGET, - (void *)&sock_opt, sizeof(sock_opt)) < 0) - exit_with_error(errno); -} - -static int recv_xsks_map_fd_from_ctrl_node(int sock, int *_fd) -{ - char cms[CMSG_SPACE(sizeof(int))]; - struct cmsghdr *cmsg; - struct msghdr msg; - struct iovec iov; - int value; - int len; - - iov.iov_base = &value; - iov.iov_len = sizeof(int); - - msg.msg_name = 0; - msg.msg_namelen = 0; - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - msg.msg_flags = 0; - msg.msg_control = (caddr_t)cms; - msg.msg_controllen = sizeof(cms); - - len = recvmsg(sock, &msg, 0); - - if (len < 0) { - fprintf(stderr, "Recvmsg failed length incorrect.\n"); - return -EINVAL; - } - - if (len == 0) { - fprintf(stderr, "Recvmsg failed no data\n"); - return -EINVAL; - } - - cmsg = CMSG_FIRSTHDR(&msg); - *_fd = *(int *)CMSG_DATA(cmsg); - - return 0; -} - -static int -recv_xsks_map_fd(int *xsks_map_fd) -{ - struct sockaddr_un server; - int err; - - sock = socket(AF_UNIX, SOCK_STREAM, 0); - if (sock < 0) { - fprintf(stderr, "Error opening socket stream: %s", strerror(errno)); - return errno; - } - - server.sun_family = AF_UNIX; - strcpy(server.sun_path, SOCKET_NAME); - - if (connect(sock, (struct sockaddr *)&server, sizeof(struct sockaddr_un)) < 0) { - close(sock); - fprintf(stderr, "Error connecting stream socket: %s", strerror(errno)); - return errno; - } - - err = recv_xsks_map_fd_from_ctrl_node(sock, xsks_map_fd); - if (err) { - fprintf(stderr, "Error %d receiving fd\n", err); - return err; - } - return 0; -} - -int main(int argc, char **argv) -{ - struct __user_cap_header_struct hdr = { _LINUX_CAPABILITY_VERSION_3, 0 }; - struct __user_cap_data_struct data[2] = { { 0 } }; - bool rx = false, tx = false; - struct sched_param schparam; - struct xsk_umem_info *umem; - struct bpf_object *obj; - int xsks_map_fd = 0; - pthread_t pt; - int i, ret; - void *bufs; - - parse_command_line(argc, argv); - - if (opt_reduced_cap) { - if (capget(&hdr, data) < 0) - fprintf(stderr, "Error getting capabilities\n"); - - data->effective &= CAP_TO_MASK(CAP_NET_RAW); - data->permitted &= CAP_TO_MASK(CAP_NET_RAW); - - if (capset(&hdr, data) < 0) - fprintf(stderr, "Setting capabilities failed\n"); - - if (capget(&hdr, data) < 0) { - fprintf(stderr, "Error getting capabilities\n"); - } else { - fprintf(stderr, "Capabilities EFF %x Caps INH %x Caps Per %x\n", - data[0].effective, data[0].inheritable, data[0].permitted); - fprintf(stderr, "Capabilities EFF %x Caps INH %x Caps Per %x\n", - data[1].effective, data[1].inheritable, data[1].permitted); - } - } else { - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - if (opt_num_xsks > 1) - load_xdp_program(argv, &obj); - } - - /* Reserve memory for the umem. Use hugepages if unaligned chunk mode */ - bufs = mmap(NULL, NUM_FRAMES * opt_xsk_frame_size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | opt_mmap_flags, -1, 0); - if (bufs == MAP_FAILED) { - printf("ERROR: mmap failed\n"); - exit(EXIT_FAILURE); - } - - /* Create sockets... */ - umem = xsk_configure_umem(bufs, NUM_FRAMES * opt_xsk_frame_size); - if (opt_bench == BENCH_RXDROP || opt_bench == BENCH_L2FWD) { - rx = true; - xsk_populate_fill_ring(umem); - } - if (opt_bench == BENCH_L2FWD || opt_bench == BENCH_TXONLY) - tx = true; - for (i = 0; i < opt_num_xsks; i++) - xsks[num_socks++] = xsk_configure_socket(umem, rx, tx); - - for (i = 0; i < opt_num_xsks; i++) - apply_setsockopt(xsks[i]); - - if (opt_bench == BENCH_TXONLY) { - if (opt_tstamp && opt_pkt_size < PKTGEN_SIZE_MIN) - opt_pkt_size = PKTGEN_SIZE_MIN; - - gen_eth_hdr_data(); - - for (i = 0; i < NUM_FRAMES; i++) - gen_eth_frame(umem, i * opt_xsk_frame_size); - } - - if (opt_num_xsks > 1 && opt_bench != BENCH_TXONLY) - enter_xsks_into_map(obj); - - if (opt_reduced_cap) { - ret = recv_xsks_map_fd(&xsks_map_fd); - if (ret) { - fprintf(stderr, "Error %d receiving xsks_map_fd\n", ret); - exit_with_error(ret); - } - if (xsks[0]->xsk) { - ret = xsk_socket__update_xskmap(xsks[0]->xsk, xsks_map_fd); - if (ret) { - fprintf(stderr, "Update of BPF map failed(%d)\n", ret); - exit_with_error(ret); - } - } - } - - signal(SIGINT, int_exit); - signal(SIGTERM, int_exit); - signal(SIGABRT, int_exit); - - setlocale(LC_ALL, ""); - - prev_time = get_nsecs(); - start_time = prev_time; - - if (!opt_quiet) { - ret = pthread_create(&pt, NULL, poller, NULL); - if (ret) - exit_with_error(ret); - } - - /* Configure sched priority for better wake-up accuracy */ - memset(&schparam, 0, sizeof(schparam)); - schparam.sched_priority = opt_schprio; - ret = sched_setscheduler(0, opt_schpolicy, &schparam); - if (ret) { - fprintf(stderr, "Error(%d) in setting priority(%d): %s\n", - errno, opt_schprio, strerror(errno)); - goto out; - } - - if (opt_bench == BENCH_RXDROP) - rx_drop_all(); - else if (opt_bench == BENCH_TXONLY) - tx_only_all(); - else - l2fwd_all(); - -out: - benchmark_done = true; - - if (!opt_quiet) - pthread_join(pt, NULL); - - xdpsock_cleanup(); - - munmap(bufs, NUM_FRAMES * opt_xsk_frame_size); - - return 0; -} diff --git a/samples/bpf/xsk_fwd.c b/samples/bpf/xsk_fwd.c deleted file mode 100644 index 2324e18ccc7e..000000000000 --- a/samples/bpf/xsk_fwd.c +++ /dev/null @@ -1,1085 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright(c) 2020 Intel Corporation. */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include - -/* libbpf APIs for AF_XDP are deprecated starting from v0.7 */ -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) - -typedef __u64 u64; -typedef __u32 u32; -typedef __u16 u16; -typedef __u8 u8; - -/* This program illustrates the packet forwarding between multiple AF_XDP - * sockets in multi-threaded environment. All threads are sharing a common - * buffer pool, with each socket having its own private buffer cache. - * - * Example 1: Single thread handling two sockets. The packets received by socket - * A (interface IFA, queue QA) are forwarded to socket B (interface IFB, queue - * QB), while the packets received by socket B are forwarded to socket A. The - * thread is running on CPU core X: - * - * ./xsk_fwd -i IFA -q QA -i IFB -q QB -c X - * - * Example 2: Two threads, each handling two sockets. The thread running on CPU - * core X forwards all the packets received by socket A to socket B, and all the - * packets received by socket B to socket A. The thread running on CPU core Y is - * performing the same packet forwarding between sockets C and D: - * - * ./xsk_fwd -i IFA -q QA -i IFB -q QB -i IFC -q QC -i IFD -q QD - * -c CX -c CY - */ - -/* - * Buffer pool and buffer cache - * - * For packet forwarding, the packet buffers are typically allocated from the - * pool for packet reception and freed back to the pool for further reuse once - * the packet transmission is completed. - * - * The buffer pool is shared between multiple threads. In order to minimize the - * access latency to the shared buffer pool, each thread creates one (or - * several) buffer caches, which, unlike the buffer pool, are private to the - * thread that creates them and therefore cannot be shared with other threads. - * The access to the shared pool is only needed either (A) when the cache gets - * empty due to repeated buffer allocations and it needs to be replenished from - * the pool, or (B) when the cache gets full due to repeated buffer free and it - * needs to be flushed back to the pull. - * - * In a packet forwarding system, a packet received on any input port can - * potentially be transmitted on any output port, depending on the forwarding - * configuration. For AF_XDP sockets, for this to work with zero-copy of the - * packet buffers when, it is required that the buffer pool memory fits into the - * UMEM area shared by all the sockets. - */ - -struct bpool_params { - u32 n_buffers; - u32 buffer_size; - int mmap_flags; - - u32 n_users_max; - u32 n_buffers_per_slab; -}; - -/* This buffer pool implementation organizes the buffers into equally sized - * slabs of *n_buffers_per_slab*. Initially, there are *n_slabs* slabs in the - * pool that are completely filled with buffer pointers (full slabs). - * - * Each buffer cache has a slab for buffer allocation and a slab for buffer - * free, with both of these slabs initially empty. When the cache's allocation - * slab goes empty, it is swapped with one of the available full slabs from the - * pool, if any is available. When the cache's free slab goes full, it is - * swapped for one of the empty slabs from the pool, which is guaranteed to - * succeed. - * - * Partially filled slabs never get traded between the cache and the pool - * (except when the cache itself is destroyed), which enables fast operation - * through pointer swapping. - */ -struct bpool { - struct bpool_params params; - pthread_mutex_t lock; - void *addr; - - u64 **slabs; - u64 **slabs_reserved; - u64 *buffers; - u64 *buffers_reserved; - - u64 n_slabs; - u64 n_slabs_reserved; - u64 n_buffers; - - u64 n_slabs_available; - u64 n_slabs_reserved_available; - - struct xsk_umem_config umem_cfg; - struct xsk_ring_prod umem_fq; - struct xsk_ring_cons umem_cq; - struct xsk_umem *umem; -}; - -static struct bpool * -bpool_init(struct bpool_params *params, - struct xsk_umem_config *umem_cfg) -{ - u64 n_slabs, n_slabs_reserved, n_buffers, n_buffers_reserved; - u64 slabs_size, slabs_reserved_size; - u64 buffers_size, buffers_reserved_size; - u64 total_size, i; - struct bpool *bp; - u8 *p; - int status; - - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - /* bpool internals dimensioning. */ - n_slabs = (params->n_buffers + params->n_buffers_per_slab - 1) / - params->n_buffers_per_slab; - n_slabs_reserved = params->n_users_max * 2; - n_buffers = n_slabs * params->n_buffers_per_slab; - n_buffers_reserved = n_slabs_reserved * params->n_buffers_per_slab; - - slabs_size = n_slabs * sizeof(u64 *); - slabs_reserved_size = n_slabs_reserved * sizeof(u64 *); - buffers_size = n_buffers * sizeof(u64); - buffers_reserved_size = n_buffers_reserved * sizeof(u64); - - total_size = sizeof(struct bpool) + - slabs_size + slabs_reserved_size + - buffers_size + buffers_reserved_size; - - /* bpool memory allocation. */ - p = calloc(total_size, sizeof(u8)); - if (!p) - return NULL; - - /* bpool memory initialization. */ - bp = (struct bpool *)p; - memcpy(&bp->params, params, sizeof(*params)); - bp->params.n_buffers = n_buffers; - - bp->slabs = (u64 **)&p[sizeof(struct bpool)]; - bp->slabs_reserved = (u64 **)&p[sizeof(struct bpool) + - slabs_size]; - bp->buffers = (u64 *)&p[sizeof(struct bpool) + - slabs_size + slabs_reserved_size]; - bp->buffers_reserved = (u64 *)&p[sizeof(struct bpool) + - slabs_size + slabs_reserved_size + buffers_size]; - - bp->n_slabs = n_slabs; - bp->n_slabs_reserved = n_slabs_reserved; - bp->n_buffers = n_buffers; - - for (i = 0; i < n_slabs; i++) - bp->slabs[i] = &bp->buffers[i * params->n_buffers_per_slab]; - bp->n_slabs_available = n_slabs; - - for (i = 0; i < n_slabs_reserved; i++) - bp->slabs_reserved[i] = &bp->buffers_reserved[i * - params->n_buffers_per_slab]; - bp->n_slabs_reserved_available = n_slabs_reserved; - - for (i = 0; i < n_buffers; i++) - bp->buffers[i] = i * params->buffer_size; - - /* lock. */ - status = pthread_mutex_init(&bp->lock, NULL); - if (status) { - free(p); - return NULL; - } - - /* mmap. */ - bp->addr = mmap(NULL, - n_buffers * params->buffer_size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | params->mmap_flags, - -1, - 0); - if (bp->addr == MAP_FAILED) { - pthread_mutex_destroy(&bp->lock); - free(p); - return NULL; - } - - /* umem. */ - status = xsk_umem__create(&bp->umem, - bp->addr, - bp->params.n_buffers * bp->params.buffer_size, - &bp->umem_fq, - &bp->umem_cq, - umem_cfg); - if (status) { - munmap(bp->addr, bp->params.n_buffers * bp->params.buffer_size); - pthread_mutex_destroy(&bp->lock); - free(p); - return NULL; - } - memcpy(&bp->umem_cfg, umem_cfg, sizeof(*umem_cfg)); - - return bp; -} - -static void -bpool_free(struct bpool *bp) -{ - if (!bp) - return; - - xsk_umem__delete(bp->umem); - munmap(bp->addr, bp->params.n_buffers * bp->params.buffer_size); - pthread_mutex_destroy(&bp->lock); - free(bp); -} - -struct bcache { - struct bpool *bp; - - u64 *slab_cons; - u64 *slab_prod; - - u64 n_buffers_cons; - u64 n_buffers_prod; -}; - -static u32 -bcache_slab_size(struct bcache *bc) -{ - struct bpool *bp = bc->bp; - - return bp->params.n_buffers_per_slab; -} - -static struct bcache * -bcache_init(struct bpool *bp) -{ - struct bcache *bc; - - bc = calloc(1, sizeof(struct bcache)); - if (!bc) - return NULL; - - bc->bp = bp; - bc->n_buffers_cons = 0; - bc->n_buffers_prod = 0; - - pthread_mutex_lock(&bp->lock); - if (bp->n_slabs_reserved_available == 0) { - pthread_mutex_unlock(&bp->lock); - free(bc); - return NULL; - } - - bc->slab_cons = bp->slabs_reserved[bp->n_slabs_reserved_available - 1]; - bc->slab_prod = bp->slabs_reserved[bp->n_slabs_reserved_available - 2]; - bp->n_slabs_reserved_available -= 2; - pthread_mutex_unlock(&bp->lock); - - return bc; -} - -static void -bcache_free(struct bcache *bc) -{ - struct bpool *bp; - - if (!bc) - return; - - /* In order to keep this example simple, the case of freeing any - * existing buffers from the cache back to the pool is ignored. - */ - - bp = bc->bp; - pthread_mutex_lock(&bp->lock); - bp->slabs_reserved[bp->n_slabs_reserved_available] = bc->slab_prod; - bp->slabs_reserved[bp->n_slabs_reserved_available + 1] = bc->slab_cons; - bp->n_slabs_reserved_available += 2; - pthread_mutex_unlock(&bp->lock); - - free(bc); -} - -/* To work correctly, the implementation requires that the *n_buffers* input - * argument is never greater than the buffer pool's *n_buffers_per_slab*. This - * is typically the case, with one exception taking place when large number of - * buffers are allocated at init time (e.g. for the UMEM fill queue setup). - */ -static inline u32 -bcache_cons_check(struct bcache *bc, u32 n_buffers) -{ - struct bpool *bp = bc->bp; - u64 n_buffers_per_slab = bp->params.n_buffers_per_slab; - u64 n_buffers_cons = bc->n_buffers_cons; - u64 n_slabs_available; - u64 *slab_full; - - /* - * Consumer slab is not empty: Use what's available locally. Do not - * look for more buffers from the pool when the ask can only be - * partially satisfied. - */ - if (n_buffers_cons) - return (n_buffers_cons < n_buffers) ? - n_buffers_cons : - n_buffers; - - /* - * Consumer slab is empty: look to trade the current consumer slab - * (full) for a full slab from the pool, if any is available. - */ - pthread_mutex_lock(&bp->lock); - n_slabs_available = bp->n_slabs_available; - if (!n_slabs_available) { - pthread_mutex_unlock(&bp->lock); - return 0; - } - - n_slabs_available--; - slab_full = bp->slabs[n_slabs_available]; - bp->slabs[n_slabs_available] = bc->slab_cons; - bp->n_slabs_available = n_slabs_available; - pthread_mutex_unlock(&bp->lock); - - bc->slab_cons = slab_full; - bc->n_buffers_cons = n_buffers_per_slab; - return n_buffers; -} - -static inline u64 -bcache_cons(struct bcache *bc) -{ - u64 n_buffers_cons = bc->n_buffers_cons - 1; - u64 buffer; - - buffer = bc->slab_cons[n_buffers_cons]; - bc->n_buffers_cons = n_buffers_cons; - return buffer; -} - -static inline void -bcache_prod(struct bcache *bc, u64 buffer) -{ - struct bpool *bp = bc->bp; - u64 n_buffers_per_slab = bp->params.n_buffers_per_slab; - u64 n_buffers_prod = bc->n_buffers_prod; - u64 n_slabs_available; - u64 *slab_empty; - - /* - * Producer slab is not yet full: store the current buffer to it. - */ - if (n_buffers_prod < n_buffers_per_slab) { - bc->slab_prod[n_buffers_prod] = buffer; - bc->n_buffers_prod = n_buffers_prod + 1; - return; - } - - /* - * Producer slab is full: trade the cache's current producer slab - * (full) for an empty slab from the pool, then store the current - * buffer to the new producer slab. As one full slab exists in the - * cache, it is guaranteed that there is at least one empty slab - * available in the pool. - */ - pthread_mutex_lock(&bp->lock); - n_slabs_available = bp->n_slabs_available; - slab_empty = bp->slabs[n_slabs_available]; - bp->slabs[n_slabs_available] = bc->slab_prod; - bp->n_slabs_available = n_slabs_available + 1; - pthread_mutex_unlock(&bp->lock); - - slab_empty[0] = buffer; - bc->slab_prod = slab_empty; - bc->n_buffers_prod = 1; -} - -/* - * Port - * - * Each of the forwarding ports sits on top of an AF_XDP socket. In order for - * packet forwarding to happen with no packet buffer copy, all the sockets need - * to share the same UMEM area, which is used as the buffer pool memory. - */ -#ifndef MAX_BURST_RX -#define MAX_BURST_RX 64 -#endif - -#ifndef MAX_BURST_TX -#define MAX_BURST_TX 64 -#endif - -struct burst_rx { - u64 addr[MAX_BURST_RX]; - u32 len[MAX_BURST_RX]; -}; - -struct burst_tx { - u64 addr[MAX_BURST_TX]; - u32 len[MAX_BURST_TX]; - u32 n_pkts; -}; - -struct port_params { - struct xsk_socket_config xsk_cfg; - struct bpool *bp; - const char *iface; - u32 iface_queue; -}; - -struct port { - struct port_params params; - - struct bcache *bc; - - struct xsk_ring_cons rxq; - struct xsk_ring_prod txq; - struct xsk_ring_prod umem_fq; - struct xsk_ring_cons umem_cq; - struct xsk_socket *xsk; - int umem_fq_initialized; - - u64 n_pkts_rx; - u64 n_pkts_tx; -}; - -static void -port_free(struct port *p) -{ - if (!p) - return; - - /* To keep this example simple, the code to free the buffers from the - * socket's receive and transmit queues, as well as from the UMEM fill - * and completion queues, is not included. - */ - - if (p->xsk) - xsk_socket__delete(p->xsk); - - bcache_free(p->bc); - - free(p); -} - -static struct port * -port_init(struct port_params *params) -{ - struct port *p; - u32 umem_fq_size, pos = 0; - int status, i; - - /* Memory allocation and initialization. */ - p = calloc(sizeof(struct port), 1); - if (!p) - return NULL; - - memcpy(&p->params, params, sizeof(p->params)); - umem_fq_size = params->bp->umem_cfg.fill_size; - - /* bcache. */ - p->bc = bcache_init(params->bp); - if (!p->bc || - (bcache_slab_size(p->bc) < umem_fq_size) || - (bcache_cons_check(p->bc, umem_fq_size) < umem_fq_size)) { - port_free(p); - return NULL; - } - - /* xsk socket. */ - status = xsk_socket__create_shared(&p->xsk, - params->iface, - params->iface_queue, - params->bp->umem, - &p->rxq, - &p->txq, - &p->umem_fq, - &p->umem_cq, - ¶ms->xsk_cfg); - if (status) { - port_free(p); - return NULL; - } - - /* umem fq. */ - xsk_ring_prod__reserve(&p->umem_fq, umem_fq_size, &pos); - - for (i = 0; i < umem_fq_size; i++) - *xsk_ring_prod__fill_addr(&p->umem_fq, pos + i) = - bcache_cons(p->bc); - - xsk_ring_prod__submit(&p->umem_fq, umem_fq_size); - p->umem_fq_initialized = 1; - - return p; -} - -static inline u32 -port_rx_burst(struct port *p, struct burst_rx *b) -{ - u32 n_pkts, pos, i; - - /* Free buffers for FQ replenish. */ - n_pkts = ARRAY_SIZE(b->addr); - - n_pkts = bcache_cons_check(p->bc, n_pkts); - if (!n_pkts) - return 0; - - /* RXQ. */ - n_pkts = xsk_ring_cons__peek(&p->rxq, n_pkts, &pos); - if (!n_pkts) { - if (xsk_ring_prod__needs_wakeup(&p->umem_fq)) { - struct pollfd pollfd = { - .fd = xsk_socket__fd(p->xsk), - .events = POLLIN, - }; - - poll(&pollfd, 1, 0); - } - return 0; - } - - for (i = 0; i < n_pkts; i++) { - b->addr[i] = xsk_ring_cons__rx_desc(&p->rxq, pos + i)->addr; - b->len[i] = xsk_ring_cons__rx_desc(&p->rxq, pos + i)->len; - } - - xsk_ring_cons__release(&p->rxq, n_pkts); - p->n_pkts_rx += n_pkts; - - /* UMEM FQ. */ - for ( ; ; ) { - int status; - - status = xsk_ring_prod__reserve(&p->umem_fq, n_pkts, &pos); - if (status == n_pkts) - break; - - if (xsk_ring_prod__needs_wakeup(&p->umem_fq)) { - struct pollfd pollfd = { - .fd = xsk_socket__fd(p->xsk), - .events = POLLIN, - }; - - poll(&pollfd, 1, 0); - } - } - - for (i = 0; i < n_pkts; i++) - *xsk_ring_prod__fill_addr(&p->umem_fq, pos + i) = - bcache_cons(p->bc); - - xsk_ring_prod__submit(&p->umem_fq, n_pkts); - - return n_pkts; -} - -static inline void -port_tx_burst(struct port *p, struct burst_tx *b) -{ - u32 n_pkts, pos, i; - int status; - - /* UMEM CQ. */ - n_pkts = p->params.bp->umem_cfg.comp_size; - - n_pkts = xsk_ring_cons__peek(&p->umem_cq, n_pkts, &pos); - - for (i = 0; i < n_pkts; i++) { - u64 addr = *xsk_ring_cons__comp_addr(&p->umem_cq, pos + i); - - bcache_prod(p->bc, addr); - } - - xsk_ring_cons__release(&p->umem_cq, n_pkts); - - /* TXQ. */ - n_pkts = b->n_pkts; - - for ( ; ; ) { - status = xsk_ring_prod__reserve(&p->txq, n_pkts, &pos); - if (status == n_pkts) - break; - - if (xsk_ring_prod__needs_wakeup(&p->txq)) - sendto(xsk_socket__fd(p->xsk), NULL, 0, MSG_DONTWAIT, - NULL, 0); - } - - for (i = 0; i < n_pkts; i++) { - xsk_ring_prod__tx_desc(&p->txq, pos + i)->addr = b->addr[i]; - xsk_ring_prod__tx_desc(&p->txq, pos + i)->len = b->len[i]; - } - - xsk_ring_prod__submit(&p->txq, n_pkts); - if (xsk_ring_prod__needs_wakeup(&p->txq)) - sendto(xsk_socket__fd(p->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); - p->n_pkts_tx += n_pkts; -} - -/* - * Thread - * - * Packet forwarding threads. - */ -#ifndef MAX_PORTS_PER_THREAD -#define MAX_PORTS_PER_THREAD 16 -#endif - -struct thread_data { - struct port *ports_rx[MAX_PORTS_PER_THREAD]; - struct port *ports_tx[MAX_PORTS_PER_THREAD]; - u32 n_ports_rx; - struct burst_rx burst_rx; - struct burst_tx burst_tx[MAX_PORTS_PER_THREAD]; - u32 cpu_core_id; - int quit; -}; - -static void swap_mac_addresses(void *data) -{ - struct ether_header *eth = (struct ether_header *)data; - struct ether_addr *src_addr = (struct ether_addr *)ð->ether_shost; - struct ether_addr *dst_addr = (struct ether_addr *)ð->ether_dhost; - struct ether_addr tmp; - - tmp = *src_addr; - *src_addr = *dst_addr; - *dst_addr = tmp; -} - -static void * -thread_func(void *arg) -{ - struct thread_data *t = arg; - cpu_set_t cpu_cores; - u32 i; - - CPU_ZERO(&cpu_cores); - CPU_SET(t->cpu_core_id, &cpu_cores); - pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpu_cores); - - for (i = 0; !t->quit; i = (i + 1) & (t->n_ports_rx - 1)) { - struct port *port_rx = t->ports_rx[i]; - struct port *port_tx = t->ports_tx[i]; - struct burst_rx *brx = &t->burst_rx; - struct burst_tx *btx = &t->burst_tx[i]; - u32 n_pkts, j; - - /* RX. */ - n_pkts = port_rx_burst(port_rx, brx); - if (!n_pkts) - continue; - - /* Process & TX. */ - for (j = 0; j < n_pkts; j++) { - u64 addr = xsk_umem__add_offset_to_addr(brx->addr[j]); - u8 *pkt = xsk_umem__get_data(port_rx->params.bp->addr, - addr); - - swap_mac_addresses(pkt); - - btx->addr[btx->n_pkts] = brx->addr[j]; - btx->len[btx->n_pkts] = brx->len[j]; - btx->n_pkts++; - - if (btx->n_pkts == MAX_BURST_TX) { - port_tx_burst(port_tx, btx); - btx->n_pkts = 0; - } - } - } - - return NULL; -} - -/* - * Process - */ -static const struct bpool_params bpool_params_default = { - .n_buffers = 64 * 1024, - .buffer_size = XSK_UMEM__DEFAULT_FRAME_SIZE, - .mmap_flags = 0, - - .n_users_max = 16, - .n_buffers_per_slab = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, -}; - -static const struct xsk_umem_config umem_cfg_default = { - .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, - .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, - .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, - .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM, - .flags = 0, -}; - -static const struct port_params port_params_default = { - .xsk_cfg = { - .rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, - .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, - .libbpf_flags = 0, - .xdp_flags = XDP_FLAGS_DRV_MODE, - .bind_flags = XDP_USE_NEED_WAKEUP | XDP_ZEROCOPY, - }, - - .bp = NULL, - .iface = NULL, - .iface_queue = 0, -}; - -#ifndef MAX_PORTS -#define MAX_PORTS 64 -#endif - -#ifndef MAX_THREADS -#define MAX_THREADS 64 -#endif - -static struct bpool_params bpool_params; -static struct xsk_umem_config umem_cfg; -static struct bpool *bp; - -static struct port_params port_params[MAX_PORTS]; -static struct port *ports[MAX_PORTS]; -static u64 n_pkts_rx[MAX_PORTS]; -static u64 n_pkts_tx[MAX_PORTS]; -static int n_ports; - -static pthread_t threads[MAX_THREADS]; -static struct thread_data thread_data[MAX_THREADS]; -static int n_threads; - -static void -print_usage(char *prog_name) -{ - const char *usage = - "Usage:\n" - "\t%s [ -b SIZE ] -c CORE -i INTERFACE [ -q QUEUE ]\n" - "\n" - "-c CORE CPU core to run a packet forwarding thread\n" - " on. May be invoked multiple times.\n" - "\n" - "-b SIZE Number of buffers in the buffer pool shared\n" - " by all the forwarding threads. Default: %u.\n" - "\n" - "-i INTERFACE Network interface. Each (INTERFACE, QUEUE)\n" - " pair specifies one forwarding port. May be\n" - " invoked multiple times.\n" - "\n" - "-q QUEUE Network interface queue for RX and TX. Each\n" - " (INTERFACE, QUEUE) pair specified one\n" - " forwarding port. Default: %u. May be invoked\n" - " multiple times.\n" - "\n"; - printf(usage, - prog_name, - bpool_params_default.n_buffers, - port_params_default.iface_queue); -} - -static int -parse_args(int argc, char **argv) -{ - struct option lgopts[] = { - { NULL, 0, 0, 0 } - }; - int opt, option_index; - - /* Parse the input arguments. */ - for ( ; ;) { - opt = getopt_long(argc, argv, "c:i:q:", lgopts, &option_index); - if (opt == EOF) - break; - - switch (opt) { - case 'b': - bpool_params.n_buffers = atoi(optarg); - break; - - case 'c': - if (n_threads == MAX_THREADS) { - printf("Max number of threads (%d) reached.\n", - MAX_THREADS); - return -1; - } - - thread_data[n_threads].cpu_core_id = atoi(optarg); - n_threads++; - break; - - case 'i': - if (n_ports == MAX_PORTS) { - printf("Max number of ports (%d) reached.\n", - MAX_PORTS); - return -1; - } - - port_params[n_ports].iface = optarg; - port_params[n_ports].iface_queue = 0; - n_ports++; - break; - - case 'q': - if (n_ports == 0) { - printf("No port specified for queue.\n"); - return -1; - } - port_params[n_ports - 1].iface_queue = atoi(optarg); - break; - - default: - printf("Illegal argument.\n"); - return -1; - } - } - - optind = 1; /* reset getopt lib */ - - /* Check the input arguments. */ - if (!n_ports) { - printf("No ports specified.\n"); - return -1; - } - - if (!n_threads) { - printf("No threads specified.\n"); - return -1; - } - - if (n_ports % n_threads) { - printf("Ports cannot be evenly distributed to threads.\n"); - return -1; - } - - return 0; -} - -static void -print_port(u32 port_id) -{ - struct port *port = ports[port_id]; - - printf("Port %u: interface = %s, queue = %u\n", - port_id, port->params.iface, port->params.iface_queue); -} - -static void -print_thread(u32 thread_id) -{ - struct thread_data *t = &thread_data[thread_id]; - u32 i; - - printf("Thread %u (CPU core %u): ", - thread_id, t->cpu_core_id); - - for (i = 0; i < t->n_ports_rx; i++) { - struct port *port_rx = t->ports_rx[i]; - struct port *port_tx = t->ports_tx[i]; - - printf("(%s, %u) -> (%s, %u), ", - port_rx->params.iface, - port_rx->params.iface_queue, - port_tx->params.iface, - port_tx->params.iface_queue); - } - - printf("\n"); -} - -static void -print_port_stats_separator(void) -{ - printf("+-%4s-+-%12s-+-%13s-+-%12s-+-%13s-+\n", - "----", - "------------", - "-------------", - "------------", - "-------------"); -} - -static void -print_port_stats_header(void) -{ - print_port_stats_separator(); - printf("| %4s | %12s | %13s | %12s | %13s |\n", - "Port", - "RX packets", - "RX rate (pps)", - "TX packets", - "TX_rate (pps)"); - print_port_stats_separator(); -} - -static void -print_port_stats_trailer(void) -{ - print_port_stats_separator(); - printf("\n"); -} - -static void -print_port_stats(int port_id, u64 ns_diff) -{ - struct port *p = ports[port_id]; - double rx_pps, tx_pps; - - rx_pps = (p->n_pkts_rx - n_pkts_rx[port_id]) * 1000000000. / ns_diff; - tx_pps = (p->n_pkts_tx - n_pkts_tx[port_id]) * 1000000000. / ns_diff; - - printf("| %4d | %12llu | %13.0f | %12llu | %13.0f |\n", - port_id, - p->n_pkts_rx, - rx_pps, - p->n_pkts_tx, - tx_pps); - - n_pkts_rx[port_id] = p->n_pkts_rx; - n_pkts_tx[port_id] = p->n_pkts_tx; -} - -static void -print_port_stats_all(u64 ns_diff) -{ - int i; - - print_port_stats_header(); - for (i = 0; i < n_ports; i++) - print_port_stats(i, ns_diff); - print_port_stats_trailer(); -} - -static int quit; - -static void -signal_handler(int sig) -{ - quit = 1; -} - -static void remove_xdp_program(void) -{ - int i; - - for (i = 0 ; i < n_ports; i++) - bpf_xdp_detach(if_nametoindex(port_params[i].iface), - port_params[i].xsk_cfg.xdp_flags, NULL); -} - -int main(int argc, char **argv) -{ - struct timespec time; - u64 ns0; - int i; - - /* Parse args. */ - memcpy(&bpool_params, &bpool_params_default, - sizeof(struct bpool_params)); - memcpy(&umem_cfg, &umem_cfg_default, - sizeof(struct xsk_umem_config)); - for (i = 0; i < MAX_PORTS; i++) - memcpy(&port_params[i], &port_params_default, - sizeof(struct port_params)); - - if (parse_args(argc, argv)) { - print_usage(argv[0]); - return -1; - } - - /* Buffer pool initialization. */ - bp = bpool_init(&bpool_params, &umem_cfg); - if (!bp) { - printf("Buffer pool initialization failed.\n"); - return -1; - } - printf("Buffer pool created successfully.\n"); - - /* Ports initialization. */ - for (i = 0; i < MAX_PORTS; i++) - port_params[i].bp = bp; - - for (i = 0; i < n_ports; i++) { - ports[i] = port_init(&port_params[i]); - if (!ports[i]) { - printf("Port %d initialization failed.\n", i); - return -1; - } - print_port(i); - } - printf("All ports created successfully.\n"); - - /* Threads. */ - for (i = 0; i < n_threads; i++) { - struct thread_data *t = &thread_data[i]; - u32 n_ports_per_thread = n_ports / n_threads, j; - - for (j = 0; j < n_ports_per_thread; j++) { - t->ports_rx[j] = ports[i * n_ports_per_thread + j]; - t->ports_tx[j] = ports[i * n_ports_per_thread + - (j + 1) % n_ports_per_thread]; - } - - t->n_ports_rx = n_ports_per_thread; - - print_thread(i); - } - - for (i = 0; i < n_threads; i++) { - int status; - - status = pthread_create(&threads[i], - NULL, - thread_func, - &thread_data[i]); - if (status) { - printf("Thread %d creation failed.\n", i); - return -1; - } - } - printf("All threads created successfully.\n"); - - /* Print statistics. */ - signal(SIGINT, signal_handler); - signal(SIGTERM, signal_handler); - signal(SIGABRT, signal_handler); - - clock_gettime(CLOCK_MONOTONIC, &time); - ns0 = time.tv_sec * 1000000000UL + time.tv_nsec; - for ( ; !quit; ) { - u64 ns1, ns_diff; - - sleep(1); - clock_gettime(CLOCK_MONOTONIC, &time); - ns1 = time.tv_sec * 1000000000UL + time.tv_nsec; - ns_diff = ns1 - ns0; - ns0 = ns1; - - print_port_stats_all(ns_diff); - } - - /* Threads completion. */ - printf("Quit.\n"); - for (i = 0; i < n_threads; i++) - thread_data[i].quit = 1; - - for (i = 0; i < n_threads; i++) - pthread_join(threads[i], NULL); - - for (i = 0; i < n_ports; i++) - port_free(ports[i]); - - bpool_free(bp); - - remove_xdp_program(); - - return 0; -} From 3c660a5d86f4c01cf641bfea004a49f5860a5bed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20M=C3=BCller?= Date: Tue, 28 Jun 2022 16:01:18 +0000 Subject: [PATCH 69/94] bpf: Introduce TYPE_MATCH related constants/macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to provide type match support we require a new type of relocation which, in turn, requires toolchain support. Recent LLVM/Clang versions support a new value for the last argument to the __builtin_preserve_type_info builtin, for example. With this change we introduce the necessary constants into relevant header files, mirroring what the compiler may support. Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220628160127.607834-2-deso@posteo.net --- include/uapi/linux/bpf.h | 1 + tools/include/uapi/linux/bpf.h | 1 + tools/lib/bpf/bpf_core_read.h | 1 + 3 files changed, 3 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ad9e7311c4cf..379e68fb866f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6786,6 +6786,7 @@ enum bpf_core_relo_kind { BPF_CORE_TYPE_SIZE = 9, /* type size in bytes */ BPF_CORE_ENUMVAL_EXISTS = 10, /* enum value existence in target kernel */ BPF_CORE_ENUMVAL_VALUE = 11, /* enum value integer value */ + BPF_CORE_TYPE_MATCHES = 12, /* type match in target kernel */ }; /* diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ad9e7311c4cf..379e68fb866f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6786,6 +6786,7 @@ enum bpf_core_relo_kind { BPF_CORE_TYPE_SIZE = 9, /* type size in bytes */ BPF_CORE_ENUMVAL_EXISTS = 10, /* enum value existence in target kernel */ BPF_CORE_ENUMVAL_VALUE = 11, /* enum value integer value */ + BPF_CORE_TYPE_MATCHES = 12, /* type match in target kernel */ }; /* diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h index fd48b1ff59ca..2308f4990e96 100644 --- a/tools/lib/bpf/bpf_core_read.h +++ b/tools/lib/bpf/bpf_core_read.h @@ -29,6 +29,7 @@ enum bpf_type_id_kind { enum bpf_type_info_kind { BPF_TYPE_EXISTS = 0, /* type existence in target kernel */ BPF_TYPE_SIZE = 1, /* type size in target kernel */ + BPF_TYPE_MATCHES = 2, /* type match in target kernel */ }; /* second argument to __builtin_preserve_enum_value() built-in */ From 633e7ceb2cbbae9b2f5ca69106b0de65728c5988 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20M=C3=BCller?= Date: Tue, 28 Jun 2022 16:01:19 +0000 Subject: [PATCH 70/94] bpftool: Honor BPF_CORE_TYPE_MATCHES relocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bpftool needs to know about the newly introduced BPF_CORE_TYPE_MATCHES relocation for its 'gen min_core_btf' command to work properly in the present of this relocation. Specifically, we need to make sure to mark types and fields so that they are present in the minimized BTF for "type match" checks to work out. However, contrary to the existing btfgen_record_field_relo, we need to rely on the BTF -- and not the spec -- to find fields. With this change we handle this new variant correctly. The functionality will be tested with follow on changes to BPF selftests, which already run against a minimized BTF created with bpftool. Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20220628160127.607834-3-deso@posteo.net --- tools/bpf/bpftool/gen.c | 108 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 480cbd859359..3d35fbc5fe16 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -1856,6 +1856,112 @@ static int btfgen_record_field_relo(struct btfgen_info *info, struct bpf_core_sp return 0; } +/* Mark types, members, and member types. Compared to btfgen_record_field_relo, + * this function does not rely on the target spec for inferring members, but + * uses the associated BTF. + * + * The `behind_ptr` argument is used to stop marking of composite types reached + * through a pointer. This way, we can keep BTF size in check while providing + * reasonable match semantics. + */ +static int btfgen_mark_type_match(struct btfgen_info *info, __u32 type_id, bool behind_ptr) +{ + const struct btf_type *btf_type; + struct btf *btf = info->src_btf; + struct btf_type *cloned_type; + int i, err; + + if (type_id == 0) + return 0; + + btf_type = btf__type_by_id(btf, type_id); + /* mark type on cloned BTF as used */ + cloned_type = (struct btf_type *)btf__type_by_id(info->marked_btf, type_id); + cloned_type->name_off = MARKED; + + switch (btf_kind(btf_type)) { + case BTF_KIND_UNKN: + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + struct btf_member *m = btf_members(btf_type); + __u16 vlen = btf_vlen(btf_type); + + if (behind_ptr) + break; + + for (i = 0; i < vlen; i++, m++) { + /* mark member */ + btfgen_mark_member(info, type_id, i); + + /* mark member's type */ + err = btfgen_mark_type_match(info, m->type, false); + if (err) + return err; + } + break; + } + case BTF_KIND_CONST: + case BTF_KIND_FWD: + case BTF_KIND_RESTRICT: + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + return btfgen_mark_type_match(info, btf_type->type, behind_ptr); + case BTF_KIND_PTR: + return btfgen_mark_type_match(info, btf_type->type, true); + case BTF_KIND_ARRAY: { + struct btf_array *array; + + array = btf_array(btf_type); + /* mark array type */ + err = btfgen_mark_type_match(info, array->type, false); + /* mark array's index type */ + err = err ? : btfgen_mark_type_match(info, array->index_type, false); + if (err) + return err; + break; + } + case BTF_KIND_FUNC_PROTO: { + __u16 vlen = btf_vlen(btf_type); + struct btf_param *param; + + /* mark ret type */ + err = btfgen_mark_type_match(info, btf_type->type, false); + if (err) + return err; + + /* mark parameters types */ + param = btf_params(btf_type); + for (i = 0; i < vlen; i++) { + err = btfgen_mark_type_match(info, param->type, false); + if (err) + return err; + param++; + } + break; + } + /* tells if some other type needs to be handled */ + default: + p_err("unsupported kind: %s (%d)", btf_kind_str(btf_type), type_id); + return -EINVAL; + } + + return 0; +} + +/* Mark types, members, and member types. Compared to btfgen_record_field_relo, + * this function does not rely on the target spec for inferring members, but + * uses the associated BTF. + */ +static int btfgen_record_type_match_relo(struct btfgen_info *info, struct bpf_core_spec *targ_spec) +{ + return btfgen_mark_type_match(info, targ_spec->root_type_id, false); +} + static int btfgen_record_type_relo(struct btfgen_info *info, struct bpf_core_spec *targ_spec) { return btfgen_mark_type(info, targ_spec->root_type_id, true); @@ -1882,6 +1988,8 @@ static int btfgen_record_reloc(struct btfgen_info *info, struct bpf_core_spec *r case BPF_CORE_TYPE_EXISTS: case BPF_CORE_TYPE_SIZE: return btfgen_record_type_relo(info, res); + case BPF_CORE_TYPE_MATCHES: + return btfgen_record_type_match_relo(info, res); case BPF_CORE_ENUMVAL_EXISTS: case BPF_CORE_ENUMVAL_VALUE: return btfgen_record_enumval_relo(info, res); From ec6209c8d42f815bc3bef10934637ca92114cd1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20M=C3=BCller?= Date: Tue, 28 Jun 2022 16:01:21 +0000 Subject: [PATCH 71/94] bpf, libbpf: Add type match support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds support for the proposed type match relation to relo_core where it is shared between userspace and kernel. It plumbs through both kernel-side and libbpf-side support. The matching relation is defined as follows (copy from source): - modifiers and typedefs are stripped (and, hence, effectively ignored) - generally speaking types need to be of same kind (struct vs. struct, union vs. union, etc.) - exceptions are struct/union behind a pointer which could also match a forward declaration of a struct or union, respectively, and enum vs. enum64 (see below) Then, depending on type: - integers: - match if size and signedness match - arrays & pointers: - target types are recursively matched - structs & unions: - local members need to exist in target with the same name - for each member we recursively check match unless it is already behind a pointer, in which case we only check matching names and compatible kind - enums: - local variants have to have a match in target by symbolic name (but not numeric value) - size has to match (but enum may match enum64 and vice versa) - function pointers: - number and position of arguments in local type has to match target - for each argument and the return value we recursively check match Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220628160127.607834-5-deso@posteo.net --- kernel/bpf/btf.c | 9 ++ tools/lib/bpf/libbpf.c | 6 + tools/lib/bpf/relo_core.c | 279 +++++++++++++++++++++++++++++++++++++- tools/lib/bpf/relo_core.h | 4 + 4 files changed, 294 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8d3c7ab8af46..4f2408a4df08 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7443,6 +7443,15 @@ int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, MAX_TYPES_ARE_COMPAT_DEPTH); } +#define MAX_TYPES_MATCH_DEPTH 2 + +int bpf_core_types_match(const struct btf *local_btf, u32 local_id, + const struct btf *targ_btf, u32 targ_id) +{ + return __bpf_core_types_match(local_btf, local_id, targ_btf, targ_id, false, + MAX_TYPES_MATCH_DEPTH); +} + static bool bpf_core_is_flavor_sep(const char *s) { /* check X___Y name pattern, where X and Y are not underscores */ diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 8a45a84eb9b2..64c4cc6140d3 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5470,6 +5470,12 @@ int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, return __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, 32); } +int bpf_core_types_match(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id) +{ + return __bpf_core_types_match(local_btf, local_id, targ_btf, targ_id, false, 32); +} + static size_t bpf_core_hash_fn(const void *key, void *ctx) { return (size_t)key; diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c index e070123332cd..fe2533022aa9 100644 --- a/tools/lib/bpf/relo_core.c +++ b/tools/lib/bpf/relo_core.c @@ -95,6 +95,7 @@ static const char *core_relo_kind_str(enum bpf_core_relo_kind kind) case BPF_CORE_TYPE_ID_LOCAL: return "local_type_id"; case BPF_CORE_TYPE_ID_TARGET: return "target_type_id"; case BPF_CORE_TYPE_EXISTS: return "type_exists"; + case BPF_CORE_TYPE_MATCHES: return "type_matches"; case BPF_CORE_TYPE_SIZE: return "type_size"; case BPF_CORE_ENUMVAL_EXISTS: return "enumval_exists"; case BPF_CORE_ENUMVAL_VALUE: return "enumval_value"; @@ -123,6 +124,7 @@ static bool core_relo_is_type_based(enum bpf_core_relo_kind kind) case BPF_CORE_TYPE_ID_LOCAL: case BPF_CORE_TYPE_ID_TARGET: case BPF_CORE_TYPE_EXISTS: + case BPF_CORE_TYPE_MATCHES: case BPF_CORE_TYPE_SIZE: return true; default: @@ -251,7 +253,7 @@ recur: * - field 'a' access (corresponds to '2' in low-level spec); * - array element #3 access (corresponds to '3' in low-level spec). * - * Type-based relocations (TYPE_EXISTS/TYPE_SIZE, + * Type-based relocations (TYPE_EXISTS/TYPE_MATCHES/TYPE_SIZE, * TYPE_ID_LOCAL/TYPE_ID_TARGET) don't capture any field information. Their * spec and raw_spec are kept empty. * @@ -568,9 +570,14 @@ static int bpf_core_spec_match(struct bpf_core_spec *local_spec, targ_spec->relo_kind = local_spec->relo_kind; if (core_relo_is_type_based(local_spec->relo_kind)) { - return bpf_core_types_are_compat(local_spec->btf, - local_spec->root_type_id, - targ_btf, targ_id); + if (local_spec->relo_kind == BPF_CORE_TYPE_MATCHES) + return bpf_core_types_match(local_spec->btf, + local_spec->root_type_id, + targ_btf, targ_id); + else + return bpf_core_types_are_compat(local_spec->btf, + local_spec->root_type_id, + targ_btf, targ_id); } local_acc = &local_spec->spec[0]; @@ -819,6 +826,7 @@ static int bpf_core_calc_type_relo(const struct bpf_core_relo *relo, *validate = false; break; case BPF_CORE_TYPE_EXISTS: + case BPF_CORE_TYPE_MATCHES: *val = 1; break; case BPF_CORE_TYPE_SIZE: @@ -1410,3 +1418,266 @@ int bpf_core_calc_relo_insn(const char *prog_name, return 0; } + +static bool bpf_core_names_match(const struct btf *local_btf, size_t local_name_off, + const struct btf *targ_btf, size_t targ_name_off) +{ + const char *local_n, *targ_n; + size_t local_len, targ_len; + + local_n = btf__name_by_offset(local_btf, local_name_off); + targ_n = btf__name_by_offset(targ_btf, targ_name_off); + + if (str_is_empty(targ_n)) + return str_is_empty(local_n); + + targ_len = bpf_core_essential_name_len(targ_n); + local_len = bpf_core_essential_name_len(local_n); + + return targ_len == local_len && strncmp(local_n, targ_n, local_len) == 0; +} + +static int bpf_core_enums_match(const struct btf *local_btf, const struct btf_type *local_t, + const struct btf *targ_btf, const struct btf_type *targ_t) +{ + __u16 local_vlen = btf_vlen(local_t); + __u16 targ_vlen = btf_vlen(targ_t); + int i, j; + + if (local_t->size != targ_t->size) + return 0; + + if (local_vlen > targ_vlen) + return 0; + + /* iterate over the local enum's variants and make sure each has + * a symbolic name correspondent in the target + */ + for (i = 0; i < local_vlen; i++) { + bool matched = false; + __u32 local_n_off, targ_n_off; + + local_n_off = btf_is_enum(local_t) ? btf_enum(local_t)[i].name_off : + btf_enum64(local_t)[i].name_off; + + for (j = 0; j < targ_vlen; j++) { + targ_n_off = btf_is_enum(targ_t) ? btf_enum(targ_t)[j].name_off : + btf_enum64(targ_t)[j].name_off; + + if (bpf_core_names_match(local_btf, local_n_off, targ_btf, targ_n_off)) { + matched = true; + break; + } + } + + if (!matched) + return 0; + } + return 1; +} + +static int bpf_core_composites_match(const struct btf *local_btf, const struct btf_type *local_t, + const struct btf *targ_btf, const struct btf_type *targ_t, + bool behind_ptr, int level) +{ + const struct btf_member *local_m = btf_members(local_t); + __u16 local_vlen = btf_vlen(local_t); + __u16 targ_vlen = btf_vlen(targ_t); + int i, j, err; + + if (local_vlen > targ_vlen) + return 0; + + /* check that all local members have a match in the target */ + for (i = 0; i < local_vlen; i++, local_m++) { + const struct btf_member *targ_m = btf_members(targ_t); + bool matched = false; + + for (j = 0; j < targ_vlen; j++, targ_m++) { + if (!bpf_core_names_match(local_btf, local_m->name_off, + targ_btf, targ_m->name_off)) + continue; + + err = __bpf_core_types_match(local_btf, local_m->type, targ_btf, + targ_m->type, behind_ptr, level - 1); + if (err > 0) { + matched = true; + break; + } + } + + if (!matched) + return 0; + } + return 1; +} + +/* Check that two types "match". + * + * The matching relation is defined as follows: + * - modifiers and typedefs are stripped (and, hence, effectively ignored) + * - generally speaking types need to be of same kind (struct vs. struct, union + * vs. union, etc.) + * - exceptions are struct/union behind a pointer which could also match a + * forward declaration of a struct or union, respectively, and enum vs. + * enum64 (see below) + * Then, depending on type: + * - integers: + * - match if size and signedness match + * - arrays & pointers: + * - target types are recursively matched + * - structs & unions: + * - local members need to exist in target with the same name + * - for each member we recursively check match unless it is already behind a + * pointer, in which case we only check matching names and compatible kind + * - enums: + * - local variants have to have a match in target by symbolic name (but not + * numeric value) + * - size has to match (but enum may match enum64 and vice versa) + * - function pointers: + * - number and position of arguments in local type has to match target + * - for each argument and the return value we recursively check match + */ +int __bpf_core_types_match(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, + __u32 targ_id, bool behind_ptr, int level) +{ + const struct btf_type *local_t, *targ_t; + int depth = 32; /* max recursion depth */ + __u16 local_k, targ_k; + + if (level <= 0) + return -EINVAL; + + local_t = btf_type_by_id(local_btf, local_id); + targ_t = btf_type_by_id(targ_btf, targ_id); + +recur: + depth--; + if (depth < 0) + return -EINVAL; + + local_t = skip_mods_and_typedefs(local_btf, local_id, &local_id); + targ_t = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); + if (!local_t || !targ_t) + return -EINVAL; + + if (!bpf_core_names_match(local_btf, local_t->name_off, targ_btf, targ_t->name_off)) + return 0; + + local_k = btf_kind(local_t); + targ_k = btf_kind(targ_t); + + switch (local_k) { + case BTF_KIND_UNKN: + return local_k == targ_k; + case BTF_KIND_FWD: { + bool local_f = BTF_INFO_KFLAG(local_t->info); + + if (behind_ptr) { + if (local_k == targ_k) + return local_f == BTF_INFO_KFLAG(targ_t->info); + + /* for forward declarations kflag dictates whether the + * target is a struct (0) or union (1) + */ + return (targ_k == BTF_KIND_STRUCT && !local_f) || + (targ_k == BTF_KIND_UNION && local_f); + } else { + if (local_k != targ_k) + return 0; + + /* match if the forward declaration is for the same kind */ + return local_f == BTF_INFO_KFLAG(targ_t->info); + } + } + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + if (!btf_is_any_enum(targ_t)) + return 0; + + return bpf_core_enums_match(local_btf, local_t, targ_btf, targ_t); + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + if (behind_ptr) { + bool targ_f = BTF_INFO_KFLAG(targ_t->info); + + if (local_k == targ_k) + return 1; + + if (targ_k != BTF_KIND_FWD) + return 0; + + return (local_k == BTF_KIND_UNION) == targ_f; + } else { + if (local_k != targ_k) + return 0; + + return bpf_core_composites_match(local_btf, local_t, targ_btf, targ_t, + behind_ptr, level); + } + case BTF_KIND_INT: { + __u8 local_sgn; + __u8 targ_sgn; + + if (local_k != targ_k) + return 0; + + local_sgn = btf_int_encoding(local_t) & BTF_INT_SIGNED; + targ_sgn = btf_int_encoding(targ_t) & BTF_INT_SIGNED; + + return local_t->size == targ_t->size && local_sgn == targ_sgn; + } + case BTF_KIND_PTR: + if (local_k != targ_k) + return 0; + + behind_ptr = true; + + local_id = local_t->type; + targ_id = targ_t->type; + goto recur; + case BTF_KIND_ARRAY: { + const struct btf_array *local_array = btf_array(local_t); + const struct btf_array *targ_array = btf_array(targ_t); + + if (local_k != targ_k) + return 0; + + if (local_array->nelems != targ_array->nelems) + return 0; + + local_id = local_array->type; + targ_id = targ_array->type; + goto recur; + } + case BTF_KIND_FUNC_PROTO: { + struct btf_param *local_p = btf_params(local_t); + struct btf_param *targ_p = btf_params(targ_t); + __u16 local_vlen = btf_vlen(local_t); + __u16 targ_vlen = btf_vlen(targ_t); + int i, err; + + if (local_k != targ_k) + return 0; + + if (local_vlen != targ_vlen) + return 0; + + for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { + err = __bpf_core_types_match(local_btf, local_p->type, targ_btf, + targ_p->type, behind_ptr, level - 1); + if (err <= 0) + return err; + } + + /* tail recurse for return type check */ + local_id = local_t->type; + targ_id = targ_t->type; + goto recur; + } + default: + pr_warn("unexpected kind %s relocated, local [%d], target [%d]\n", + btf_kind_str(local_t), local_id, targ_id); + return 0; + } +} diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h index 3fd3842d4230..1c0566daf8e8 100644 --- a/tools/lib/bpf/relo_core.h +++ b/tools/lib/bpf/relo_core.h @@ -72,6 +72,10 @@ int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id, int level); int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id); +int __bpf_core_types_match(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, + __u32 targ_id, bool behind_ptr, int level); +int bpf_core_types_match(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, + __u32 targ_id); size_t bpf_core_essential_name_len(const char *name); From b8a195dc299391bc171c47971974ec6d5ea6fb14 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 5 Jul 2022 20:56:48 -0700 Subject: [PATCH 72/94] libbpf: add bpf_core_type_matches() helper macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch finalizes support for the proposed type match relation in libbpf by adding bpf_core_type_matches() macro which emits TYPE_MATCH relocation. Clang support for this relocation was added in [0]. [0] https://reviews.llvm.org/D126838 Signed-off-by: Daniel Müller ¬ Signed-off-by: Andrii Nakryiko ¬ Link: https://lore.kernel.org/bpf/20220628160127.607834-7-deso@posteo.net¬ --- tools/lib/bpf/bpf_core_read.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h index 2308f4990e96..496e6a8ee0dc 100644 --- a/tools/lib/bpf/bpf_core_read.h +++ b/tools/lib/bpf/bpf_core_read.h @@ -184,6 +184,16 @@ enum bpf_enum_value_kind { #define bpf_core_type_exists(type) \ __builtin_preserve_type_info(*(typeof(type) *)0, BPF_TYPE_EXISTS) +/* + * Convenience macro to check that provided named type + * (struct/union/enum/typedef) "matches" that in a target kernel. + * Returns: + * 1, if the type matches in the target kernel's BTF; + * 0, if the type does not match any in the target kernel + */ +#define bpf_core_type_matches(type) \ + __builtin_preserve_type_info(*(typeof(type) *)0, BPF_TYPE_MATCHES) + /* * Convenience macro to get the byte size of a provided named type * (struct/union/enum/typedef) in a target kernel. From 67d8ed4295258cb17e2bed7ed5ada92526a643f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20M=C3=BCller?= Date: Tue, 28 Jun 2022 16:01:24 +0000 Subject: [PATCH 73/94] selftests/bpf: Add type-match checks to type-based tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we have type-match logic in both libbpf and the kernel, this change adjusts the existing BPF self tests to check this functionality. Specifically, we extend the existing type-based tests to check the previously introduced bpf_core_type_matches macro. Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220628160127.607834-8-deso@posteo.net --- .../selftests/bpf/prog_tests/core_reloc.c | 31 ++++++++++++++++-- .../selftests/bpf/progs/core_reloc_types.h | 14 +++++++- .../bpf/progs/test_core_reloc_type_based.c | 32 ++++++++++++++++++- 3 files changed, 73 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 2f92feb809be..328dd744e5bd 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -543,7 +543,6 @@ static int __trigger_module_test_read(const struct core_reloc_test_case *test) return 0; } - static const struct core_reloc_test_case test_cases[] = { /* validate we can find kernel image and use its BTF for relocs */ { @@ -752,7 +751,7 @@ static const struct core_reloc_test_case test_cases[] = { SIZE_CASE(size___diff_offs), SIZE_ERR_CASE(size___err_ambiguous), - /* validate type existence and size relocations */ + /* validate type existence, match, and size relocations */ TYPE_BASED_CASE(type_based, { .struct_exists = 1, .union_exists = 1, @@ -765,6 +764,19 @@ static const struct core_reloc_test_case test_cases[] = { .typedef_void_ptr_exists = 1, .typedef_func_proto_exists = 1, .typedef_arr_exists = 1, + + .struct_matches = 1, + .union_matches = 1, + .enum_matches = 1, + .typedef_named_struct_matches = 1, + .typedef_anon_struct_matches = 1, + .typedef_struct_ptr_matches = 1, + .typedef_int_matches = 1, + .typedef_enum_matches = 1, + .typedef_void_ptr_matches = 1, + .typedef_func_proto_matches = 1, + .typedef_arr_matches = 1, + .struct_sz = sizeof(struct a_struct), .union_sz = sizeof(union a_union), .enum_sz = sizeof(enum an_enum), @@ -792,6 +804,19 @@ static const struct core_reloc_test_case test_cases[] = { .typedef_void_ptr_exists = 1, .typedef_func_proto_exists = 1, .typedef_arr_exists = 1, + + .struct_matches = 0, + .union_matches = 0, + .enum_matches = 0, + .typedef_named_struct_matches = 0, + .typedef_anon_struct_matches = 0, + .typedef_struct_ptr_matches = 1, + .typedef_int_matches = 0, + .typedef_enum_matches = 0, + .typedef_void_ptr_matches = 1, + .typedef_func_proto_matches = 0, + .typedef_arr_matches = 0, + .struct_sz = sizeof(struct a_struct___diff_sz), .union_sz = sizeof(union a_union___diff_sz), .enum_sz = sizeof(enum an_enum___diff_sz), @@ -806,10 +831,12 @@ static const struct core_reloc_test_case test_cases[] = { }), TYPE_BASED_CASE(type_based___incompat, { .enum_exists = 1, + .enum_matches = 1, .enum_sz = sizeof(enum an_enum), }), TYPE_BASED_CASE(type_based___fn_wrong_args, { .struct_exists = 1, + .struct_matches = 1, .struct_sz = sizeof(struct a_struct), }), diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index 26e103302c05..6a44f3e63ae5 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -860,7 +860,7 @@ struct core_reloc_size___err_ambiguous2 { }; /* - * TYPE EXISTENCE & SIZE + * TYPE EXISTENCE, MATCH & SIZE */ struct core_reloc_type_based_output { bool struct_exists; @@ -875,6 +875,18 @@ struct core_reloc_type_based_output { bool typedef_func_proto_exists; bool typedef_arr_exists; + bool struct_matches; + bool union_matches; + bool enum_matches; + bool typedef_named_struct_matches; + bool typedef_anon_struct_matches; + bool typedef_struct_ptr_matches; + bool typedef_int_matches; + bool typedef_enum_matches; + bool typedef_void_ptr_matches; + bool typedef_func_proto_matches; + bool typedef_arr_matches; + int struct_sz; int union_sz; int enum_sz; diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c b/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c index fb60f8195c53..325ead666130 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c @@ -61,6 +61,18 @@ struct core_reloc_type_based_output { bool typedef_func_proto_exists; bool typedef_arr_exists; + bool struct_matches; + bool union_matches; + bool enum_matches; + bool typedef_named_struct_matches; + bool typedef_anon_struct_matches; + bool typedef_struct_ptr_matches; + bool typedef_int_matches; + bool typedef_enum_matches; + bool typedef_void_ptr_matches; + bool typedef_func_proto_matches; + bool typedef_arr_matches; + int struct_sz; int union_sz; int enum_sz; @@ -77,7 +89,13 @@ struct core_reloc_type_based_output { SEC("raw_tracepoint/sys_enter") int test_core_type_based(void *ctx) { -#if __has_builtin(__builtin_preserve_type_info) + /* Support for the BPF_TYPE_MATCHES argument to the + * __builtin_preserve_type_info builtin was added at some point during + * development of clang 15 and it's what we require for this test. Part of it + * could run with merely __builtin_preserve_type_info (which could be checked + * separately), but we have to find an upper bound. + */ +#if __has_builtin(__builtin_preserve_type_info) && __clang_major__ >= 15 struct core_reloc_type_based_output *out = (void *)&data.out; out->struct_exists = bpf_core_type_exists(struct a_struct); @@ -92,6 +110,18 @@ int test_core_type_based(void *ctx) out->typedef_func_proto_exists = bpf_core_type_exists(func_proto_typedef); out->typedef_arr_exists = bpf_core_type_exists(arr_typedef); + out->struct_matches = bpf_core_type_matches(struct a_struct); + out->union_matches = bpf_core_type_matches(union a_union); + out->enum_matches = bpf_core_type_matches(enum an_enum); + out->typedef_named_struct_matches = bpf_core_type_matches(named_struct_typedef); + out->typedef_anon_struct_matches = bpf_core_type_matches(anon_struct_typedef); + out->typedef_struct_ptr_matches = bpf_core_type_matches(struct_ptr_typedef); + out->typedef_int_matches = bpf_core_type_matches(int_typedef); + out->typedef_enum_matches = bpf_core_type_matches(enum_typedef); + out->typedef_void_ptr_matches = bpf_core_type_matches(void_ptr_typedef); + out->typedef_func_proto_matches = bpf_core_type_matches(func_proto_typedef); + out->typedef_arr_matches = bpf_core_type_matches(arr_typedef); + out->struct_sz = bpf_core_type_size(struct a_struct); out->union_sz = bpf_core_type_size(union a_union); out->enum_sz = bpf_core_type_size(enum an_enum); From bed56a6dd4cbf76ff757679a9333f9d3a72ed7ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20M=C3=BCller?= Date: Tue, 28 Jun 2022 16:01:25 +0000 Subject: [PATCH 74/94] selftests/bpf: Add test checking more characteristics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change adds another type-based self-test that specifically aims to test some more characteristics of the TYPE_MATCH logic. Specifically, it covers a few more potential differences between types, such as different orders, enum variant values, and integer signedness. Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220628160127.607834-9-deso@posteo.net --- .../selftests/bpf/prog_tests/core_reloc.c | 37 ++++++++++++++ .../progs/btf__core_reloc_type_based___diff.c | 3 ++ .../selftests/bpf/progs/core_reloc_types.h | 51 +++++++++++++++++++ 3 files changed, 91 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___diff.c diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 328dd744e5bd..eb47bfde459c 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -792,6 +792,43 @@ static const struct core_reloc_test_case test_cases[] = { TYPE_BASED_CASE(type_based___all_missing, { /* all zeros */ }), + TYPE_BASED_CASE(type_based___diff, { + .struct_exists = 1, + .union_exists = 1, + .enum_exists = 1, + .typedef_named_struct_exists = 1, + .typedef_anon_struct_exists = 1, + .typedef_struct_ptr_exists = 1, + .typedef_int_exists = 1, + .typedef_enum_exists = 1, + .typedef_void_ptr_exists = 1, + .typedef_func_proto_exists = 1, + .typedef_arr_exists = 1, + + .struct_matches = 1, + .union_matches = 1, + .enum_matches = 1, + .typedef_named_struct_matches = 1, + .typedef_anon_struct_matches = 1, + .typedef_struct_ptr_matches = 1, + .typedef_int_matches = 0, + .typedef_enum_matches = 1, + .typedef_void_ptr_matches = 1, + .typedef_func_proto_matches = 0, + .typedef_arr_matches = 0, + + .struct_sz = sizeof(struct a_struct___diff), + .union_sz = sizeof(union a_union___diff), + .enum_sz = sizeof(enum an_enum___diff), + .typedef_named_struct_sz = sizeof(named_struct_typedef___diff), + .typedef_anon_struct_sz = sizeof(anon_struct_typedef___diff), + .typedef_struct_ptr_sz = sizeof(struct_ptr_typedef___diff), + .typedef_int_sz = sizeof(int_typedef___diff), + .typedef_enum_sz = sizeof(enum_typedef___diff), + .typedef_void_ptr_sz = sizeof(void_ptr_typedef___diff), + .typedef_func_proto_sz = sizeof(func_proto_typedef___diff), + .typedef_arr_sz = sizeof(arr_typedef___diff), + }), TYPE_BASED_CASE(type_based___diff_sz, { .struct_exists = 1, .union_exists = 1, diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___diff.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___diff.c new file mode 100644 index 000000000000..57ae2c258928 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___diff.c @@ -0,0 +1,3 @@ +#include "core_reloc_types.h" + +void f(struct core_reloc_type_based___diff x) {} diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index 6a44f3e63ae5..e326b6a781e5 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -951,6 +951,57 @@ struct core_reloc_type_based { struct core_reloc_type_based___all_missing { }; +/* different member orders, enum variant values, signedness, etc */ +struct a_struct___diff { + int x; + int a; +}; + +union a_union___diff { + int z; + int y; +}; + +typedef struct a_struct___diff named_struct_typedef___diff; + +typedef struct { int z, x, y; } anon_struct_typedef___diff; + +typedef struct { + int c; + int b; + int a; +} *struct_ptr_typedef___diff; + +enum an_enum___diff { + AN_ENUM_VAL2___diff = 0, + AN_ENUM_VAL1___diff = 42, + AN_ENUM_VAL3___diff = 1, +}; + +typedef unsigned int int_typedef___diff; + +typedef enum { TYPEDEF_ENUM_VAL2___diff, TYPEDEF_ENUM_VAL1___diff = 50 } enum_typedef___diff; + +typedef const void *void_ptr_typedef___diff; + +typedef int_typedef___diff (*func_proto_typedef___diff)(long); + +typedef char arr_typedef___diff[3]; + +struct core_reloc_type_based___diff { + struct a_struct___diff f1; + union a_union___diff f2; + enum an_enum___diff f3; + named_struct_typedef___diff f4; + anon_struct_typedef___diff f5; + struct_ptr_typedef___diff f6; + int_typedef___diff f7; + enum_typedef___diff f8; + void_ptr_typedef___diff f9; + func_proto_typedef___diff f10; + arr_typedef___diff f11; +}; + /* different type sizes, extra modifiers, anon vs named enums, etc */ struct a_struct___diff_sz { long x; From 537905c4b68fe3a32c4925122fe792eb2f594b73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20M=C3=BCller?= Date: Tue, 28 Jun 2022 16:01:26 +0000 Subject: [PATCH 75/94] selftests/bpf: Add nested type to type based tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change extends the type based tests with another struct type (in addition to a_struct) to check relocations against: a_complex_struct. This type is nested more deeply to provide additional coverage of certain paths in the type match logic. Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220628160127.607834-10-deso@posteo.net --- .../selftests/bpf/prog_tests/core_reloc.c | 4 ++ .../selftests/bpf/progs/core_reloc_types.h | 62 +++++++++++++------ .../bpf/progs/test_core_reloc_type_based.c | 12 ++++ 3 files changed, 58 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index eb47bfde459c..8882c9c1519b 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -754,6 +754,7 @@ static const struct core_reloc_test_case test_cases[] = { /* validate type existence, match, and size relocations */ TYPE_BASED_CASE(type_based, { .struct_exists = 1, + .complex_struct_exists = 1, .union_exists = 1, .enum_exists = 1, .typedef_named_struct_exists = 1, @@ -766,6 +767,7 @@ static const struct core_reloc_test_case test_cases[] = { .typedef_arr_exists = 1, .struct_matches = 1, + .complex_struct_matches = 1, .union_matches = 1, .enum_matches = 1, .typedef_named_struct_matches = 1, @@ -794,6 +796,7 @@ static const struct core_reloc_test_case test_cases[] = { }), TYPE_BASED_CASE(type_based___diff, { .struct_exists = 1, + .complex_struct_exists = 1, .union_exists = 1, .enum_exists = 1, .typedef_named_struct_exists = 1, @@ -806,6 +809,7 @@ static const struct core_reloc_test_case test_cases[] = { .typedef_arr_exists = 1, .struct_matches = 1, + .complex_struct_matches = 1, .union_matches = 1, .enum_matches = 1, .typedef_named_struct_matches = 1, diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index e326b6a781e5..474411c4b908 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -864,6 +864,7 @@ struct core_reloc_size___err_ambiguous2 { */ struct core_reloc_type_based_output { bool struct_exists; + bool complex_struct_exists; bool union_exists; bool enum_exists; bool typedef_named_struct_exists; @@ -876,6 +877,7 @@ struct core_reloc_type_based_output { bool typedef_arr_exists; bool struct_matches; + bool complex_struct_matches; bool union_matches; bool enum_matches; bool typedef_named_struct_matches; @@ -904,6 +906,14 @@ struct a_struct { int x; }; +struct a_complex_struct { + union { + struct a_struct * restrict a; + void *b; + } x; + volatile long y; +}; + union a_union { int y; int z; @@ -935,16 +945,17 @@ typedef char arr_typedef[20]; struct core_reloc_type_based { struct a_struct f1; - union a_union f2; - enum an_enum f3; - named_struct_typedef f4; - anon_struct_typedef f5; - struct_ptr_typedef f6; - int_typedef f7; - enum_typedef f8; - void_ptr_typedef f9; - func_proto_typedef f10; - arr_typedef f11; + struct a_complex_struct f2; + union a_union f3; + enum an_enum f4; + named_struct_typedef f5; + anon_struct_typedef f6; + struct_ptr_typedef f7; + int_typedef f8; + enum_typedef f9; + void_ptr_typedef f10; + func_proto_typedef f11; + arr_typedef f12; }; /* no types in target */ @@ -957,6 +968,16 @@ struct a_struct___diff { int a; }; +struct a_struct___forward; + +struct a_complex_struct___diff { + union { + struct a_struct___forward *a; + void *b; + } x; + volatile long y; +}; + union a_union___diff { int z; int y; @@ -990,16 +1011,17 @@ typedef char arr_typedef___diff[3]; struct core_reloc_type_based___diff { struct a_struct___diff f1; - union a_union___diff f2; - enum an_enum___diff f3; - named_struct_typedef___diff f4; - anon_struct_typedef___diff f5; - struct_ptr_typedef___diff f6; - int_typedef___diff f7; - enum_typedef___diff f8; - void_ptr_typedef___diff f9; - func_proto_typedef___diff f10; - arr_typedef___diff f11; + struct a_complex_struct___diff f2; + union a_union___diff f3; + enum an_enum___diff f4; + named_struct_typedef___diff f5; + anon_struct_typedef___diff f6; + struct_ptr_typedef___diff f7; + int_typedef___diff f8; + enum_typedef___diff f9; + void_ptr_typedef___diff f10; + func_proto_typedef___diff f11; + arr_typedef___diff f12; }; /* different type sizes, extra modifiers, anon vs named enums, etc */ diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c b/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c index 325ead666130..d95bc08b75c1 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c @@ -19,6 +19,14 @@ struct a_struct { int x; }; +struct a_complex_struct { + union { + struct a_struct *a; + void *b; + } x; + volatile long y; +}; + union a_union { int y; int z; @@ -50,6 +58,7 @@ typedef char arr_typedef[20]; struct core_reloc_type_based_output { bool struct_exists; + bool complex_struct_exists; bool union_exists; bool enum_exists; bool typedef_named_struct_exists; @@ -62,6 +71,7 @@ struct core_reloc_type_based_output { bool typedef_arr_exists; bool struct_matches; + bool complex_struct_matches; bool union_matches; bool enum_matches; bool typedef_named_struct_matches; @@ -99,6 +109,7 @@ int test_core_type_based(void *ctx) struct core_reloc_type_based_output *out = (void *)&data.out; out->struct_exists = bpf_core_type_exists(struct a_struct); + out->complex_struct_exists = bpf_core_type_exists(struct a_complex_struct); out->union_exists = bpf_core_type_exists(union a_union); out->enum_exists = bpf_core_type_exists(enum an_enum); out->typedef_named_struct_exists = bpf_core_type_exists(named_struct_typedef); @@ -111,6 +122,7 @@ int test_core_type_based(void *ctx) out->typedef_arr_exists = bpf_core_type_exists(arr_typedef); out->struct_matches = bpf_core_type_matches(struct a_struct); + out->complex_struct_matches = bpf_core_type_matches(struct a_complex_struct); out->union_matches = bpf_core_type_matches(union a_union); out->enum_matches = bpf_core_type_matches(enum an_enum); out->typedef_named_struct_matches = bpf_core_type_matches(named_struct_typedef); From 950b347787224e62f59c099e3e3f3f6ecc720d61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20M=C3=BCller?= Date: Tue, 28 Jun 2022 16:01:27 +0000 Subject: [PATCH 76/94] selftests/bpf: Add type match test against kernel's task_struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change extends the existing core_reloc/kernel test to include a type match check of a local task_struct against the kernel's definition -- which we assume to succeed. Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220628160127.607834-11-deso@posteo.net --- .../selftests/bpf/prog_tests/core_reloc.c | 1 + .../selftests/bpf/progs/core_reloc_types.h | 1 + .../bpf/progs/test_core_reloc_kernel.c | 19 +++++++++++++++++++ 3 files changed, 21 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 8882c9c1519b..a6f65e2236f4 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -555,6 +555,7 @@ static const struct core_reloc_test_case test_cases[] = { .valid = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, }, .comm = "test_progs", .comm_len = sizeof("test_progs"), + .local_task_struct_matches = true, }, .output_len = sizeof(struct core_reloc_kernel_output), .raw_tp_name = "sys_enter", diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index 474411c4b908..7ef91d19c66e 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -13,6 +13,7 @@ struct core_reloc_kernel_output { int valid[10]; char comm[sizeof("test_progs")]; int comm_len; + bool local_task_struct_matches; }; /* diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c b/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c index 145028b52ad8..a17dd83eae67 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c @@ -21,6 +21,7 @@ struct core_reloc_kernel_output { /* we have test_progs[-flavor], so cut flavor part */ char comm[sizeof("test_progs")]; int comm_len; + bool local_task_struct_matches; }; struct task_struct { @@ -30,11 +31,25 @@ struct task_struct { struct task_struct *group_leader; }; +struct mm_struct___wrong { + int abc_whatever_should_not_exist; +}; + +struct task_struct___local { + int pid; + struct mm_struct___wrong *mm; +}; + #define CORE_READ(dst, src) bpf_core_read(dst, sizeof(*(dst)), src) SEC("raw_tracepoint/sys_enter") int test_core_kernel(void *ctx) { + /* Support for the BPF_TYPE_MATCHES argument to the + * __builtin_preserve_type_info builtin was added at some point during + * development of clang 15 and it's what we require for this test. + */ +#if __has_builtin(__builtin_preserve_type_info) && __clang_major__ >= 15 struct task_struct *task = (void *)bpf_get_current_task(); struct core_reloc_kernel_output *out = (void *)&data.out; uint64_t pid_tgid = bpf_get_current_pid_tgid(); @@ -93,6 +108,10 @@ int test_core_kernel(void *ctx) group_leader, group_leader, group_leader, group_leader, comm); + out->local_task_struct_matches = bpf_core_type_matches(struct task_struct___local); +#else + data.skip = true; +#endif return 0; } From 8094029330a2f03fb406ecff80671cf27ce28d42 Mon Sep 17 00:00:00 2001 From: Chuang Wang Date: Wed, 29 Jun 2022 23:18:45 +0800 Subject: [PATCH 77/94] libbpf: Cleanup the legacy kprobe_event on failed add/attach_event() Before the 0bc11ed5ab60 commit ("kprobes: Allow kprobes coexist with livepatch"), in a scenario where livepatch and kprobe coexist on the same function entry, the creation of kprobe_event using add_kprobe_event_legacy() will be successful, at the same time as a trace event (e.g. /debugfs/tracing/events/kprobe/XXX) will exist, but perf_event_open() will return an error because both livepatch and kprobe use FTRACE_OPS_FL_IPMODIFY. As follows: 1) add a livepatch $ insmod livepatch-XXX.ko 2) add a kprobe using tracefs API (i.e. add_kprobe_event_legacy) $ echo 'p:mykprobe XXX' > /sys/kernel/debug/tracing/kprobe_events 3) enable this kprobe (i.e. sys_perf_event_open) This will return an error, -EBUSY. On Andrii Nakryiko's comment, few error paths in bpf_program__attach_kprobe_opts() that should need to call remove_kprobe_event_legacy(). With this patch, whenever an error is returned after add_kprobe_event_legacy() or bpf_program__attach_perf_event_opts(), this ensures that the created kprobe_event is cleaned. Signed-off-by: Chuang Wang Signed-off-by: Jingren Zhou Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220629151848.65587-2-nashuiliang@gmail.com --- tools/lib/bpf/libbpf.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 64c4cc6140d3..8a441d304924 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -9877,10 +9877,11 @@ static int perf_event_kprobe_open_legacy(const char *probe_name, bool retprobe, } type = determine_kprobe_perf_type_legacy(probe_name, retprobe); if (type < 0) { + err = type; pr_warn("failed to determine legacy kprobe event id for '%s+0x%zx': %s\n", kfunc_name, offset, - libbpf_strerror_r(type, errmsg, sizeof(errmsg))); - return type; + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_clean_legacy; } attr.size = sizeof(attr); attr.config = type; @@ -9894,9 +9895,14 @@ static int perf_event_kprobe_open_legacy(const char *probe_name, bool retprobe, err = -errno; pr_warn("legacy kprobe perf_event_open() failed: %s\n", libbpf_strerror_r(err, errmsg, sizeof(errmsg))); - return err; + goto err_clean_legacy; } return pfd; + +err_clean_legacy: + /* Clear the newly added legacy kprobe_event */ + remove_kprobe_event_legacy(probe_name, retprobe); + return err; } struct bpf_link * @@ -9953,7 +9959,7 @@ bpf_program__attach_kprobe_opts(const struct bpf_program *prog, prog->name, retprobe ? "kretprobe" : "kprobe", func_name, offset, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); - goto err_out; + goto err_clean_legacy; } if (legacy) { struct bpf_link_perf *perf_link = container_of(link, struct bpf_link_perf, link); @@ -9964,6 +9970,10 @@ bpf_program__attach_kprobe_opts(const struct bpf_program *prog, } return link; + +err_clean_legacy: + if (legacy) + remove_kprobe_event_legacy(legacy_probe, retprobe); err_out: free(legacy_probe); return libbpf_err_ptr(err); From 5666fc997ccb93859ea1d4437936c64c6d75c060 Mon Sep 17 00:00:00 2001 From: Chuang Wang Date: Wed, 29 Jun 2022 23:18:46 +0800 Subject: [PATCH 78/94] libbpf: Fix wrong variable used in perf_event_uprobe_open_legacy() Use "type" as opposed to "err" in pr_warn() after determine_uprobe_perf_type_legacy() returns an error. Signed-off-by: Chuang Wang Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220629151848.65587-3-nashuiliang@gmail.com --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 8a441d304924..571acd614dce 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10249,7 +10249,7 @@ static int perf_event_uprobe_open_legacy(const char *probe_name, bool retprobe, type = determine_uprobe_perf_type_legacy(probe_name, retprobe); if (type < 0) { pr_warn("failed to determine legacy uprobe event id for %s:0x%zx: %d\n", - binary_path, offset, err); + binary_path, offset, type); return type; } From 2655144fb49bae26eae038c6d056f824a7db2726 Mon Sep 17 00:00:00 2001 From: Chuang Wang Date: Wed, 29 Jun 2022 23:18:47 +0800 Subject: [PATCH 79/94] libbpf: Cleanup the legacy uprobe_event on failed add/attach_event() A potential scenario, when an error is returned after add_uprobe_event_legacy() in perf_event_uprobe_open_legacy(), or bpf_program__attach_perf_event_opts() in bpf_program__attach_uprobe_opts() returns an error, the uprobe_event that was previously created is not cleaned. So, with this patch, when an error is returned, fix this by adding remove_uprobe_event_legacy() Signed-off-by: Chuang Wang Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220629151848.65587-4-nashuiliang@gmail.com --- tools/lib/bpf/libbpf.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 571acd614dce..cb49408eb298 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10248,9 +10248,10 @@ static int perf_event_uprobe_open_legacy(const char *probe_name, bool retprobe, } type = determine_uprobe_perf_type_legacy(probe_name, retprobe); if (type < 0) { + err = type; pr_warn("failed to determine legacy uprobe event id for %s:0x%zx: %d\n", - binary_path, offset, type); - return type; + binary_path, offset, err); + goto err_clean_legacy; } memset(&attr, 0, sizeof(attr)); @@ -10265,9 +10266,14 @@ static int perf_event_uprobe_open_legacy(const char *probe_name, bool retprobe, if (pfd < 0) { err = -errno; pr_warn("legacy uprobe perf_event_open() failed: %d\n", err); - return err; + goto err_clean_legacy; } return pfd; + +err_clean_legacy: + /* Clear the newly added legacy uprobe_event */ + remove_uprobe_event_legacy(probe_name, retprobe); + return err; } /* Return next ELF section of sh_type after scn, or first of that type if scn is NULL. */ @@ -10601,7 +10607,7 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, prog->name, retprobe ? "uretprobe" : "uprobe", binary_path, func_offset, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); - goto err_out; + goto err_clean_legacy; } if (legacy) { struct bpf_link_perf *perf_link = container_of(link, struct bpf_link_perf, link); @@ -10611,10 +10617,13 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, perf_link->legacy_is_retprobe = retprobe; } return link; + +err_clean_legacy: + if (legacy) + remove_uprobe_event_legacy(legacy_probe, retprobe); err_out: free(legacy_probe); return libbpf_err_ptr(err); - } /* Format of u[ret]probe section definition supporting auto-attach: From 450a8dcb8c7f819431b09e5c1debbf0b6c2e824e Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 5 Jul 2022 21:04:56 +0100 Subject: [PATCH 80/94] bpftool: Remove zlib feature test from Makefile The feature test to detect the availability of zlib in bpftool's Makefile does not bring much. The library is not optional: it may or may not be required along libbfd for disassembling instructions, but in any case it is necessary to build feature.o or even libbpf, on which bpftool depends. If we remove the feature test, we lose the nicely formatted error message, but we get a compiler error about "zlib.h: No such file or directory", which is equally informative. Let's get rid of the test. Suggested-by: Andrii Nakryiko Signed-off-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220705200456.285943-1-quentin@isovalent.com --- tools/bpf/bpftool/Makefile | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index c19e0e4c41bd..6b5b3a99f79d 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -93,10 +93,8 @@ INSTALL ?= install RM ?= rm -f FEATURE_USER = .bpftool -FEATURE_TESTS = libbfd disassembler-four-args zlib libcap \ - clang-bpf-co-re -FEATURE_DISPLAY = libbfd disassembler-four-args zlib libcap \ - clang-bpf-co-re +FEATURE_TESTS = libbfd disassembler-four-args libcap clang-bpf-co-re +FEATURE_DISPLAY = libbfd disassembler-four-args libcap clang-bpf-co-re check_feat := 1 NON_CHECK_FEAT_TARGETS := clean uninstall doc doc-clean doc-install doc-uninstall @@ -204,11 +202,6 @@ $(BOOTSTRAP_OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD $< -o $@ -$(OUTPUT)feature.o: -ifneq ($(feature-zlib), 1) - $(error "No zlib found") -endif - $(BPFTOOL_BOOTSTRAP): $(BOOTSTRAP_OBJS) $(LIBBPF_BOOTSTRAP) $(QUIET_LINK)$(HOSTCC) $(HOST_CFLAGS) $(LDFLAGS) $(BOOTSTRAP_OBJS) $(LIBS_BOOTSTRAP) -o $@ From 645d5d3bc001a7d77459cb8360c36a60954d6008 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 5 Jul 2022 15:48:16 -0700 Subject: [PATCH 81/94] selftests/bpf: Fix bogus uninitialized variable warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compiling selftests/bpf in optimized mode (-O2), GCC erroneously complains about uninitialized token variable: In file included from network_helpers.c:22: network_helpers.c: In function ‘open_netns’: test_progs.h:355:22: error: ‘token’ may be used uninitialized [-Werror=maybe-uninitialized] 355 | int ___err = libbpf_get_error(___res); \ | ^~~~~~~~~~~~~~~~~~~~~~~~ network_helpers.c:440:14: note: in expansion of macro ‘ASSERT_OK_PTR’ 440 | if (!ASSERT_OK_PTR(token, "malloc token")) | ^~~~~~~~~~~~~ In file included from /data/users/andriin/linux/tools/testing/selftests/bpf/tools/include/bpf/libbpf.h:21, from bpf_util.h:9, from network_helpers.c:20: /data/users/andriin/linux/tools/testing/selftests/bpf/tools/include/bpf/libbpf_legacy.h:113:17: note: by argument 1 of type ‘const void *’ to ‘libbpf_get_error’ declared here 113 | LIBBPF_API long libbpf_get_error(const void *ptr); | ^~~~~~~~~~~~~~~~ cc1: all warnings being treated as errors make: *** [Makefile:522: /data/users/andriin/linux/tools/testing/selftests/bpf/network_helpers.o] Error 1 This is completely bogus becuase libbpf_get_error() doesn't dereference pointer, but the only easy way to silence this is to allocate initialized memory with calloc(). Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20220705224818.4026623-2-andrii@kernel.org --- tools/testing/selftests/bpf/network_helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 59cf81ec55af..bec15558fd93 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -436,7 +436,7 @@ struct nstoken *open_netns(const char *name) int err; struct nstoken *token; - token = malloc(sizeof(struct nstoken)); + token = calloc(1, sizeof(struct nstoken)); if (!ASSERT_OK_PTR(token, "malloc token")) return NULL; From c46a12200114ae5ad7b0922e6c8969a22c3afbd7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 5 Jul 2022 15:48:17 -0700 Subject: [PATCH 82/94] selftests/bpf: Fix few more compiler warnings When compiling with -O2, GCC detects few problems with selftests/bpf, so fix all of them. Two are real issues (uninitialized err and nums out-of-bounds access), but two other uninitialized variables warnings are due to GCC not being able to prove that variables are indeed initialized under conditions under which they are used. Fix all 4 cases, though. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20220705224818.4026623-3-andrii@kernel.org --- tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c | 4 ++-- tools/testing/selftests/bpf/prog_tests/usdt.c | 2 +- tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 586dc52d6fb9..a8cb8a96ddaf 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -329,7 +329,7 @@ static int get_syms(char ***symsp, size_t *cntp) struct hashmap *map; char buf[256]; FILE *f; - int err; + int err = 0; /* * The available_filter_functions contains many duplicates, @@ -404,7 +404,7 @@ static void test_bench_attach(void) double attach_delta, detach_delta; struct bpf_link *link = NULL; char **syms = NULL; - size_t cnt, i; + size_t cnt = 0, i; if (!ASSERT_OK(get_syms(&syms, &cnt), "get_syms")) return; diff --git a/tools/testing/selftests/bpf/prog_tests/usdt.c b/tools/testing/selftests/bpf/prog_tests/usdt.c index 5f733d50b0d7..9ad9da0f215e 100644 --- a/tools/testing/selftests/bpf/prog_tests/usdt.c +++ b/tools/testing/selftests/bpf/prog_tests/usdt.c @@ -12,7 +12,7 @@ int lets_test_this(int); static volatile int idx = 2; static volatile __u64 bla = 0xFEDCBA9876543210ULL; -static volatile short nums[] = {-1, -2, -3, }; +static volatile short nums[] = {-1, -2, -3, -4}; static volatile struct { int x; diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c index fb77a123fe89..874a846e298c 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c @@ -63,7 +63,7 @@ static bool expect_str(char *buf, size_t size, const char *str, const char *name static void test_synproxy(bool xdp) { int server_fd = -1, client_fd = -1, accept_fd = -1; - char *prog_id, *prog_id_end; + char *prog_id = NULL, *prog_id_end; struct nstoken *ns = NULL; FILE *ctrl_file = NULL; char buf[CMD_OUT_BUF_SIZE]; From 7c8121af1bfe29feedfa4fcb3154886660ecbe3a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 5 Jul 2022 15:48:18 -0700 Subject: [PATCH 83/94] libbpf: Remove unnecessary usdt_rel_ip assignments Coverity detected that usdt_rel_ip is unconditionally overwritten anyways, so there is no need to unnecessarily initialize it with unused value. Clean this up. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20220705224818.4026623-4-andrii@kernel.org --- tools/lib/bpf/usdt.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index 5159207cbfd9..d18e37982344 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -652,11 +652,9 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * * * [0] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation */ - usdt_rel_ip = usdt_abs_ip = note.loc_addr; - if (base_addr) { + usdt_abs_ip = note.loc_addr; + if (base_addr) usdt_abs_ip += base_addr - note.base_addr; - usdt_rel_ip += base_addr - note.base_addr; - } /* When attaching uprobes (which is what USDTs basically are) * kernel expects file offset to be specified, not a relative From 935dc35c75318fa213d26808ad8bb130fb0b486e Mon Sep 17 00:00:00 2001 From: Yixun Lan Date: Wed, 6 Jul 2022 22:02:04 +0800 Subject: [PATCH 84/94] libbpf, riscv: Use a0 for RC register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to the RISC-V calling convention register usage here [0], a0 is used as return value register, so rename it to make it consistent with the spec. [0] section 18.2, table 18.2 https://riscv.org/wp-content/uploads/2015/01/riscv-calling.pdf Fixes: 589fed479ba1 ("riscv, libbpf: Add RISC-V (RV64) support to bpf_tracing.h") Signed-off-by: Yixun Lan Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Acked-by: Amjad OULED-AMEUR Link: https://lore.kernel.org/bpf/20220706140204.47926-1-dlan@gentoo.org --- tools/lib/bpf/bpf_tracing.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index 01ce121c302d..11f9096407fc 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -233,7 +233,7 @@ struct pt_regs___arm64 { #define __PT_PARM5_REG a4 #define __PT_RET_REG ra #define __PT_FP_REG s0 -#define __PT_RC_REG a5 +#define __PT_RC_REG a0 #define __PT_SP_REG sp #define __PT_IP_REG pc /* riscv does not select ARCH_HAS_SYSCALL_WRAPPER. */ From 2b4b2621fd6401865b31b9f403e4b936b7439e94 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 5 Jul 2022 12:00:18 -0700 Subject: [PATCH 85/94] selftests/bpf: Add benchmark for local_storage RCU Tasks Trace usage This benchmark measures grace period latency and kthread cpu usage of RCU Tasks Trace when many processes are creating/deleting BPF local_storage. Intent here is to quantify improvement on these metrics after Paul's recent RCU Tasks patches [0]. Specifically, fork 15k tasks which call a bpf prog that creates/destroys task local_storage and sleep in a loop, resulting in many call_rcu_tasks_trace calls. To determine grace period latency, trace time elapsed between rcu_tasks_trace_pregp_step and rcu_tasks_trace_postgp; for cpu usage look at rcu_task_trace_kthread's stime in /proc/PID/stat. On my virtualized test environment (Skylake, 8 cpus) benchmark results demonstrate significant improvement: BEFORE Paul's patches: SUMMARY tasks_trace grace period latency avg 22298.551 us stddev 1302.165 us SUMMARY ticks per tasks_trace grace period avg 2.291 stddev 0.324 AFTER Paul's patches: SUMMARY tasks_trace grace period latency avg 16969.197 us stddev 2525.053 us SUMMARY ticks per tasks_trace grace period avg 1.146 stddev 0.178 Note that since these patches are not in bpf-next benchmarking was done by cherry-picking this patch onto rcu tree. [0] https://lore.kernel.org/rcu/20220620225402.GA3842369@paulmck-ThinkPad-P17-Gen-1/ Signed-off-by: Dave Marchevsky Signed-off-by: Daniel Borkmann Acked-by: Paul E. McKenney Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220705190018.3239050-1-davemarchevsky@fb.com --- tools/testing/selftests/bpf/Makefile | 4 +- tools/testing/selftests/bpf/bench.c | 42 +++ tools/testing/selftests/bpf/bench.h | 12 + .../bench_local_storage_rcu_tasks_trace.c | 281 ++++++++++++++++++ ...run_bench_local_storage_rcu_tasks_trace.sh | 11 + .../local_storage_rcu_tasks_trace_bench.c | 67 +++++ 6 files changed, 416 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/benchs/bench_local_storage_rcu_tasks_trace.c create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_local_storage_rcu_tasks_trace.sh create mode 100644 tools/testing/selftests/bpf/progs/local_storage_rcu_tasks_trace_bench.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index e32a28fe8bc1..dfaac97222af 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -574,6 +574,7 @@ $(OUTPUT)/bench_bpf_loop.o: $(OUTPUT)/bpf_loop_bench.skel.h $(OUTPUT)/bench_strncmp.o: $(OUTPUT)/strncmp_bench.skel.h $(OUTPUT)/bench_bpf_hashmap_full_update.o: $(OUTPUT)/bpf_hashmap_full_update_bench.skel.h $(OUTPUT)/bench_local_storage.o: $(OUTPUT)/local_storage_bench.skel.h +$(OUTPUT)/bench_local_storage_rcu_tasks_trace.o: $(OUTPUT)/local_storage_rcu_tasks_trace_bench.skel.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ) $(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: $(OUTPUT)/bench.o \ @@ -587,7 +588,8 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(OUTPUT)/bench_bpf_loop.o \ $(OUTPUT)/bench_strncmp.o \ $(OUTPUT)/bench_bpf_hashmap_full_update.o \ - $(OUTPUT)/bench_local_storage.o + $(OUTPUT)/bench_local_storage.o \ + $(OUTPUT)/bench_local_storage_rcu_tasks_trace.o $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 1e7b5d4b1f11..c1f20a147462 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -79,6 +79,43 @@ void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns) hits_per_sec, hits_per_prod, drops_per_sec, hits_per_sec + drops_per_sec); } +void +grace_period_latency_basic_stats(struct bench_res res[], int res_cnt, struct basic_stats *gp_stat) +{ + int i; + + memset(gp_stat, 0, sizeof(struct basic_stats)); + + for (i = 0; i < res_cnt; i++) + gp_stat->mean += res[i].gp_ns / 1000.0 / (double)res[i].gp_ct / (0.0 + res_cnt); + +#define IT_MEAN_DIFF (res[i].gp_ns / 1000.0 / (double)res[i].gp_ct - gp_stat->mean) + if (res_cnt > 1) { + for (i = 0; i < res_cnt; i++) + gp_stat->stddev += (IT_MEAN_DIFF * IT_MEAN_DIFF) / (res_cnt - 1.0); + } + gp_stat->stddev = sqrt(gp_stat->stddev); +#undef IT_MEAN_DIFF +} + +void +grace_period_ticks_basic_stats(struct bench_res res[], int res_cnt, struct basic_stats *gp_stat) +{ + int i; + + memset(gp_stat, 0, sizeof(struct basic_stats)); + for (i = 0; i < res_cnt; i++) + gp_stat->mean += res[i].stime / (double)res[i].gp_ct / (0.0 + res_cnt); + +#define IT_MEAN_DIFF (res[i].stime / (double)res[i].gp_ct - gp_stat->mean) + if (res_cnt > 1) { + for (i = 0; i < res_cnt; i++) + gp_stat->stddev += (IT_MEAN_DIFF * IT_MEAN_DIFF) / (res_cnt - 1.0); + } + gp_stat->stddev = sqrt(gp_stat->stddev); +#undef IT_MEAN_DIFF +} + void hits_drops_report_final(struct bench_res res[], int res_cnt) { int i; @@ -236,6 +273,7 @@ extern struct argp bench_ringbufs_argp; extern struct argp bench_bloom_map_argp; extern struct argp bench_bpf_loop_argp; extern struct argp bench_local_storage_argp; +extern struct argp bench_local_storage_rcu_tasks_trace_argp; extern struct argp bench_strncmp_argp; static const struct argp_child bench_parsers[] = { @@ -244,6 +282,8 @@ static const struct argp_child bench_parsers[] = { { &bench_bpf_loop_argp, 0, "bpf_loop helper benchmark", 0 }, { &bench_local_storage_argp, 0, "local_storage benchmark", 0 }, { &bench_strncmp_argp, 0, "bpf_strncmp helper benchmark", 0 }, + { &bench_local_storage_rcu_tasks_trace_argp, 0, + "local_storage RCU Tasks Trace slowdown benchmark", 0 }, {}, }; @@ -449,6 +489,7 @@ extern const struct bench bench_bpf_hashmap_full_update; extern const struct bench bench_local_storage_cache_seq_get; extern const struct bench bench_local_storage_cache_interleaved_get; extern const struct bench bench_local_storage_cache_hashmap_control; +extern const struct bench bench_local_storage_tasks_trace; static const struct bench *benchs[] = { &bench_count_global, @@ -487,6 +528,7 @@ static const struct bench *benchs[] = { &bench_local_storage_cache_seq_get, &bench_local_storage_cache_interleaved_get, &bench_local_storage_cache_hashmap_control, + &bench_local_storage_tasks_trace, }; static void setup_benchmark() diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h index 4b15286753ba..d748255877e2 100644 --- a/tools/testing/selftests/bpf/bench.h +++ b/tools/testing/selftests/bpf/bench.h @@ -30,11 +30,19 @@ struct env { struct cpu_set cons_cpus; }; +struct basic_stats { + double mean; + double stddev; +}; + struct bench_res { long hits; long drops; long false_hits; long important_hits; + unsigned long gp_ns; + unsigned long gp_ct; + unsigned int stime; }; struct bench { @@ -65,6 +73,10 @@ void ops_report_final(struct bench_res res[], int res_cnt); void local_storage_report_progress(int iter, struct bench_res *res, long delta_ns); void local_storage_report_final(struct bench_res res[], int res_cnt); +void grace_period_latency_basic_stats(struct bench_res res[], int res_cnt, + struct basic_stats *gp_stat); +void grace_period_ticks_basic_stats(struct bench_res res[], int res_cnt, + struct basic_stats *gp_stat); static inline __u64 get_time_ns(void) { diff --git a/tools/testing/selftests/bpf/benchs/bench_local_storage_rcu_tasks_trace.c b/tools/testing/selftests/bpf/benchs/bench_local_storage_rcu_tasks_trace.c new file mode 100644 index 000000000000..43f109d93130 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_local_storage_rcu_tasks_trace.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ + +#include + +#include +#include "local_storage_rcu_tasks_trace_bench.skel.h" +#include "bench.h" + +#include + +static struct { + __u32 nr_procs; + __u32 kthread_pid; + bool quiet; +} args = { + .nr_procs = 1000, + .kthread_pid = 0, + .quiet = false, +}; + +enum { + ARG_NR_PROCS = 7000, + ARG_KTHREAD_PID = 7001, + ARG_QUIET = 7002, +}; + +static const struct argp_option opts[] = { + { "nr_procs", ARG_NR_PROCS, "NR_PROCS", 0, + "Set number of user processes to spin up"}, + { "kthread_pid", ARG_KTHREAD_PID, "PID", 0, + "Pid of rcu_tasks_trace kthread for ticks tracking"}, + { "quiet", ARG_QUIET, "{0,1}", 0, + "If true, don't report progress"}, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + long ret; + + switch (key) { + case ARG_NR_PROCS: + ret = strtol(arg, NULL, 10); + if (ret < 1 || ret > UINT_MAX) { + fprintf(stderr, "invalid nr_procs\n"); + argp_usage(state); + } + args.nr_procs = ret; + break; + case ARG_KTHREAD_PID: + ret = strtol(arg, NULL, 10); + if (ret < 1) { + fprintf(stderr, "invalid kthread_pid\n"); + argp_usage(state); + } + args.kthread_pid = ret; + break; + case ARG_QUIET: + ret = strtol(arg, NULL, 10); + if (ret < 0 || ret > 1) { + fprintf(stderr, "invalid quiet %ld\n", ret); + argp_usage(state); + } + args.quiet = ret; + break; +break; + default: + return ARGP_ERR_UNKNOWN; + } + + return 0; +} + +const struct argp bench_local_storage_rcu_tasks_trace_argp = { + .options = opts, + .parser = parse_arg, +}; + +#define MAX_SLEEP_PROCS 150000 + +static void validate(void) +{ + if (env.producer_cnt != 1) { + fprintf(stderr, "benchmark doesn't support multi-producer!\n"); + exit(1); + } + if (env.consumer_cnt != 1) { + fprintf(stderr, "benchmark doesn't support multi-consumer!\n"); + exit(1); + } + + if (args.nr_procs > MAX_SLEEP_PROCS) { + fprintf(stderr, "benchmark supports up to %u sleeper procs!\n", + MAX_SLEEP_PROCS); + exit(1); + } +} + +static long kthread_pid_ticks(void) +{ + char procfs_path[100]; + long stime; + FILE *f; + + if (!args.kthread_pid) + return -1; + + sprintf(procfs_path, "/proc/%u/stat", args.kthread_pid); + f = fopen(procfs_path, "r"); + if (!f) { + fprintf(stderr, "couldn't open %s, exiting\n", procfs_path); + goto err_out; + } + if (fscanf(f, "%*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %ld", &stime) != 1) { + fprintf(stderr, "fscanf of %s failed, exiting\n", procfs_path); + goto err_out; + } + fclose(f); + return stime; + +err_out: + if (f) + fclose(f); + exit(1); + return 0; +} + +static struct { + struct local_storage_rcu_tasks_trace_bench *skel; + long prev_kthread_stime; +} ctx; + +static void sleep_and_loop(void) +{ + while (true) { + sleep(rand() % 4); + syscall(__NR_getpgid); + } +} + +static void local_storage_tasks_trace_setup(void) +{ + int i, err, forkret, runner_pid; + + runner_pid = getpid(); + + for (i = 0; i < args.nr_procs; i++) { + forkret = fork(); + if (forkret < 0) { + fprintf(stderr, "Error forking sleeper proc %u of %u, exiting\n", i, + args.nr_procs); + goto err_out; + } + + if (!forkret) { + err = prctl(PR_SET_PDEATHSIG, SIGKILL); + if (err < 0) { + fprintf(stderr, "prctl failed with err %d, exiting\n", errno); + goto err_out; + } + + if (getppid() != runner_pid) { + fprintf(stderr, "Runner died while spinning up procs, exiting\n"); + goto err_out; + } + sleep_and_loop(); + } + } + printf("Spun up %u procs (our pid %d)\n", args.nr_procs, runner_pid); + + setup_libbpf(); + + ctx.skel = local_storage_rcu_tasks_trace_bench__open_and_load(); + if (!ctx.skel) { + fprintf(stderr, "Error doing open_and_load, exiting\n"); + goto err_out; + } + + ctx.prev_kthread_stime = kthread_pid_ticks(); + + if (!bpf_program__attach(ctx.skel->progs.get_local)) { + fprintf(stderr, "Error attaching bpf program\n"); + goto err_out; + } + + if (!bpf_program__attach(ctx.skel->progs.pregp_step)) { + fprintf(stderr, "Error attaching bpf program\n"); + goto err_out; + } + + if (!bpf_program__attach(ctx.skel->progs.postgp)) { + fprintf(stderr, "Error attaching bpf program\n"); + goto err_out; + } + + return; +err_out: + exit(1); +} + +static void measure(struct bench_res *res) +{ + long ticks; + + res->gp_ct = atomic_swap(&ctx.skel->bss->gp_hits, 0); + res->gp_ns = atomic_swap(&ctx.skel->bss->gp_times, 0); + ticks = kthread_pid_ticks(); + res->stime = ticks - ctx.prev_kthread_stime; + ctx.prev_kthread_stime = ticks; +} + +static void *consumer(void *input) +{ + return NULL; +} + +static void *producer(void *input) +{ + while (true) + syscall(__NR_getpgid); + return NULL; +} + +static void report_progress(int iter, struct bench_res *res, long delta_ns) +{ + if (ctx.skel->bss->unexpected) { + fprintf(stderr, "Error: Unexpected order of bpf prog calls (postgp after pregp)."); + fprintf(stderr, "Data can't be trusted, exiting\n"); + exit(1); + } + + if (args.quiet) + return; + + printf("Iter %d\t avg tasks_trace grace period latency\t%lf ns\n", + iter, res->gp_ns / (double)res->gp_ct); + printf("Iter %d\t avg ticks per tasks_trace grace period\t%lf\n", + iter, res->stime / (double)res->gp_ct); +} + +static void report_final(struct bench_res res[], int res_cnt) +{ + struct basic_stats gp_stat; + + grace_period_latency_basic_stats(res, res_cnt, &gp_stat); + printf("SUMMARY tasks_trace grace period latency"); + printf("\tavg %.3lf us\tstddev %.3lf us\n", gp_stat.mean, gp_stat.stddev); + grace_period_ticks_basic_stats(res, res_cnt, &gp_stat); + printf("SUMMARY ticks per tasks_trace grace period"); + printf("\tavg %.3lf\tstddev %.3lf\n", gp_stat.mean, gp_stat.stddev); +} + +/* local-storage-tasks-trace: Benchmark performance of BPF local_storage's use + * of RCU Tasks-Trace. + * + * Stress RCU Tasks Trace by forking many tasks, all of which do no work aside + * from sleep() loop, and creating/destroying BPF task-local storage on wakeup. + * The number of forked tasks is configurable. + * + * exercising code paths which call call_rcu_tasks_trace while there are many + * thousands of tasks on the system should result in RCU Tasks-Trace having to + * do a noticeable amount of work. + * + * This should be observable by measuring rcu_tasks_trace_kthread CPU usage + * after the grace period has ended, or by measuring grace period latency. + * + * This benchmark uses both approaches, attaching to rcu_tasks_trace_pregp_step + * and rcu_tasks_trace_postgp functions to measure grace period latency and + * using /proc/PID/stat to measure rcu_tasks_trace_kthread kernel ticks + */ +const struct bench bench_local_storage_tasks_trace = { + .name = "local-storage-tasks-trace", + .validate = validate, + .setup = local_storage_tasks_trace_setup, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = report_progress, + .report_final = report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/run_bench_local_storage_rcu_tasks_trace.sh b/tools/testing/selftests/bpf/benchs/run_bench_local_storage_rcu_tasks_trace.sh new file mode 100755 index 000000000000..5dac1f02892c --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/run_bench_local_storage_rcu_tasks_trace.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +kthread_pid=`pgrep rcu_tasks_trace_kthread` + +if [ -z $kthread_pid ]; then + echo "error: Couldn't find rcu_tasks_trace_kthread" + exit 1 +fi + +./bench --nr_procs 15000 --kthread_pid $kthread_pid -d 600 --quiet 1 local-storage-tasks-trace diff --git a/tools/testing/selftests/bpf/progs/local_storage_rcu_tasks_trace_bench.c b/tools/testing/selftests/bpf/progs/local_storage_rcu_tasks_trace_bench.c new file mode 100644 index 000000000000..03bf69f49075 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/local_storage_rcu_tasks_trace_bench.c @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ + +#include "vmlinux.h" +#include +#include "bpf_misc.h" + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, int); +} task_storage SEC(".maps"); + +long hits; +long gp_hits; +long gp_times; +long current_gp_start; +long unexpected; +bool postgp_seen; + +SEC("fentry/" SYS_PREFIX "sys_getpgid") +int get_local(void *ctx) +{ + struct task_struct *task; + int idx; + int *s; + + idx = 0; + task = bpf_get_current_task_btf(); + s = bpf_task_storage_get(&task_storage, task, &idx, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!s) + return 0; + + *s = 3; + bpf_task_storage_delete(&task_storage, task); + __sync_add_and_fetch(&hits, 1); + return 0; +} + +SEC("fentry/rcu_tasks_trace_pregp_step") +int pregp_step(struct pt_regs *ctx) +{ + current_gp_start = bpf_ktime_get_ns(); + return 0; +} + +SEC("fentry/rcu_tasks_trace_postgp") +int postgp(struct pt_regs *ctx) +{ + if (!current_gp_start && postgp_seen) { + /* Will only happen if prog tracing rcu_tasks_trace_pregp_step doesn't + * execute before this prog + */ + __sync_add_and_fetch(&unexpected, 1); + return 0; + } + + __sync_add_and_fetch(&gp_times, bpf_ktime_get_ns() - current_gp_start); + __sync_add_and_fetch(&gp_hits, 1); + current_gp_start = 0; + postgp_seen = true; + return 0; +} + +char _license[] SEC("license") = "GPL"; From fb8ddf24c71dc97d6b07ecb7791b4fa5e7f48efc Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Fri, 8 Jul 2022 12:27:36 +0800 Subject: [PATCH 86/94] bpf, docs: Remove deprecated xsk libbpf APIs description Since xsk APIs has been removed from libbpf, let's clean up the BPF docs simutaneously. Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20220708042736.669132-1-pulehui@huawei.com --- .../bpf/libbpf/libbpf_naming_convention.rst | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/Documentation/bpf/libbpf/libbpf_naming_convention.rst b/Documentation/bpf/libbpf/libbpf_naming_convention.rst index f86360f734a8..c5ac97f3d4c4 100644 --- a/Documentation/bpf/libbpf/libbpf_naming_convention.rst +++ b/Documentation/bpf/libbpf/libbpf_naming_convention.rst @@ -9,8 +9,8 @@ described here. It's recommended to follow these conventions whenever a new function or type is added to keep libbpf API clean and consistent. All types and functions provided by libbpf API should have one of the -following prefixes: ``bpf_``, ``btf_``, ``libbpf_``, ``xsk_``, -``btf_dump_``, ``ring_buffer_``, ``perf_buffer_``. +following prefixes: ``bpf_``, ``btf_``, ``libbpf_``, ``btf_dump_``, +``ring_buffer_``, ``perf_buffer_``. System call wrappers -------------------- @@ -59,15 +59,6 @@ Auxiliary functions and types that don't fit well in any of categories described above should have ``libbpf_`` prefix, e.g. ``libbpf_get_error`` or ``libbpf_prog_type_by_name``. -AF_XDP functions -------------------- - -AF_XDP functions should have an ``xsk_`` prefix, e.g. -``xsk_umem__get_data`` or ``xsk_umem__create``. The interface consists -of both low-level ring access functions and high-level configuration -functions. These can be mixed and matched. Note that these functions -are not reentrant for performance reasons. - ABI --- From 018a8e75b49cb846ebfa48076bc4fe0bb67c9c24 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 7 Jul 2022 13:16:12 +0200 Subject: [PATCH 87/94] selftests, xsk: Rename AF_XDP testing app Recently, xsk part of libbpf was moved to selftests/bpf directory and lives on its own because there is an AF_XDP testing application that needs it called xdpxceiver. That name makes it a bit hard to indicate who maintains it as there are other XDP samples in there, whereas this one is strictly about AF_XDP. Do s/xdpxceiver/xskxceiver so that it will be easier to figure out who maintains it. A follow-up patch will correct MAINTAINERS file. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220707111613.49031-2-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/.gitignore | 2 +- tools/testing/selftests/bpf/Makefile | 4 ++-- tools/testing/selftests/bpf/test_xsk.sh | 6 +++--- tools/testing/selftests/bpf/xsk_prereqs.sh | 4 ++-- tools/testing/selftests/bpf/{xdpxceiver.c => xskxceiver.c} | 4 ++-- tools/testing/selftests/bpf/{xdpxceiver.h => xskxceiver.h} | 6 +++--- 6 files changed, 13 insertions(+), 13 deletions(-) rename tools/testing/selftests/bpf/{xdpxceiver.c => xskxceiver.c} (99%) rename tools/testing/selftests/bpf/{xdpxceiver.h => xskxceiver.h} (98%) diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index ca2f47f45670..3a8cb2404ea6 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -41,6 +41,6 @@ test_cpp /bench *.ko *.tmp -xdpxceiver +xskxceiver xdp_redirect_multi xdp_synproxy diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index dfaac97222af..8d59ec7f4c2d 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -82,7 +82,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \ - xdpxceiver xdp_redirect_multi xdp_synproxy + xskxceiver xdp_redirect_multi xdp_synproxy TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read @@ -231,7 +231,7 @@ $(OUTPUT)/flow_dissector_load: $(TESTING_HELPERS) $(OUTPUT)/test_maps: $(TESTING_HELPERS) $(OUTPUT)/test_verifier: $(TESTING_HELPERS) $(CAP_HELPERS) $(OUTPUT)/xsk.o: $(BPFOBJ) -$(OUTPUT)/xdpxceiver: $(OUTPUT)/xsk.o +$(OUTPUT)/xskxceiver: $(OUTPUT)/xsk.o BPFTOOL ?= $(DEFAULT_BPFTOOL) $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh index 567500299231..096a957594cd 100755 --- a/tools/testing/selftests/bpf/test_xsk.sh +++ b/tools/testing/selftests/bpf/test_xsk.sh @@ -47,7 +47,7 @@ # conflict with any existing interface # * tests the veth and xsk layers of the topology # -# See the source xdpxceiver.c for information on each test +# See the source xskxceiver.c for information on each test # # Kernel configuration: # --------------------- @@ -160,14 +160,14 @@ statusList=() TEST_NAME="XSK_SELFTESTS_SOFTIRQ" -execxdpxceiver +exec_xskxceiver cleanup_exit ${VETH0} ${VETH1} ${NS1} TEST_NAME="XSK_SELFTESTS_BUSY_POLL" busy_poll=1 setup_vethPairs -execxdpxceiver +exec_xskxceiver ## END TESTS diff --git a/tools/testing/selftests/bpf/xsk_prereqs.sh b/tools/testing/selftests/bpf/xsk_prereqs.sh index 684e813803ec..a0b71723a818 100755 --- a/tools/testing/selftests/bpf/xsk_prereqs.sh +++ b/tools/testing/selftests/bpf/xsk_prereqs.sh @@ -8,7 +8,7 @@ ksft_xfail=2 ksft_xpass=3 ksft_skip=4 -XSKOBJ=xdpxceiver +XSKOBJ=xskxceiver validate_root_exec() { @@ -77,7 +77,7 @@ validate_ip_utility() [ ! $(type -P ip) ] && { echo "'ip' not found. Skipping tests."; test_exit $ksft_skip; } } -execxdpxceiver() +exec_xskxceiver() { if [[ $busy_poll -eq 1 ]]; then ARGS+="-b " diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c similarity index 99% rename from tools/testing/selftests/bpf/xdpxceiver.c rename to tools/testing/selftests/bpf/xskxceiver.c index 4c425a43e5b0..74d56d971baf 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -98,11 +98,11 @@ #include #include #include "xsk.h" -#include "xdpxceiver.h" +#include "xskxceiver.h" #include "../kselftest.h" /* AF_XDP APIs were moved into libxdp and marked as deprecated in libbpf. - * Until xdpxceiver is either moved or re-writed into libxdp, suppress + * Until xskxceiver is either moved or re-writed into libxdp, suppress * deprecation warnings in this file */ #pragma GCC diagnostic ignored "-Wdeprecated-declarations" diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h similarity index 98% rename from tools/testing/selftests/bpf/xdpxceiver.h rename to tools/testing/selftests/bpf/xskxceiver.h index 8f672b0fe0e1..3d17053f98e5 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xskxceiver.h @@ -2,8 +2,8 @@ * Copyright(c) 2020 Intel Corporation. */ -#ifndef XDPXCEIVER_H_ -#define XDPXCEIVER_H_ +#ifndef XSKXCEIVER_H_ +#define XSKXCEIVER_H_ #ifndef SOL_XDP #define SOL_XDP 283 @@ -169,4 +169,4 @@ pthread_cond_t pacing_cond = PTHREAD_COND_INITIALIZER; int pkts_in_flight; -#endif /* XDPXCEIVER_H */ +#endif /* XSKXCEIVER_H_ */ From d6f34f7f77fba25b07b0a4f1e55054637e4027b3 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 7 Jul 2022 13:16:13 +0200 Subject: [PATCH 88/94] MAINTAINERS: Add entry for AF_XDP selftests files Lukas reported that after commit f36600634282 ("libbpf: move xsk.{c,h} into selftests/bpf") MAINTAINERS file needed an update. In the meantime, Magnus removed AF_XDP samples in commit cfb5a2dbf141 ("bpf, samples: Remove AF_XDP samples"), but selftests part still misses its entry in MAINTAINERS. Now that xdpxceiver became xskxceiver, tools/testing/selftests/bpf/*xsk* will match all of the files related to AF_XDP testing (test_xsk.sh, xskxceiver, xsk_prereqs.sh, xsk.{c,h}). Reported-by: Lukas Bulwahn Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220707111613.49031-3-maciej.fijalkowski@intel.com --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index ccd774c37c56..1da8a8d287d7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21749,6 +21749,7 @@ F: include/uapi/linux/if_xdp.h F: include/uapi/linux/xdp_diag.h F: include/net/netns/xdp.h F: net/xdp/ +F: tools/testing/selftests/bpf/*xsk* XEN BLOCK SUBSYSTEM M: Roger Pau Monné From aad53f17f0ad7485872d66fbcb53cc0c60e811f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20M=C3=BCller?= Date: Wed, 6 Jul 2022 21:28:54 +0000 Subject: [PATCH 89/94] bpftool: Add support for KIND_RESTRICT to gen min_core_btf command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change adjusts bpftool's type marking logic, as used in conjunction with TYPE_EXISTS relocations, to correctly recognize and handle the RESTRICT BTF kind. Suggested-by: Andrii Nakryiko Signed-off-by: Daniel Müller Signed-off-by: Daniel Borkmann Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20220623212205.2805002-1-deso@posteo.net/T/#m4c75205145701762a4b398e0cdb911d5b5305ffc Link: https://lore.kernel.org/bpf/20220706212855.1700615-2-deso@posteo.net --- tools/bpf/bpftool/gen.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 3d35fbc5fe16..1cf53bb01936 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -1762,6 +1762,7 @@ btfgen_mark_type(struct btfgen_info *info, unsigned int type_id, bool follow_poi } break; case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: case BTF_KIND_VOLATILE: case BTF_KIND_TYPEDEF: err = btfgen_mark_type(info, btf_type->type, follow_pointers); From 32e0d9b3104845e0b3f24d89033a17a317ba37f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20M=C3=BCller?= Date: Wed, 6 Jul 2022 21:28:55 +0000 Subject: [PATCH 90/94] selftests/bpf: Add test involving restrict type qualifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change adds a type based test involving the restrict type qualifier to the BPF selftests. On the btfgen path, this will verify that bpftool correctly handles the corresponding RESTRICT BTF kind. Signed-off-by: Daniel Müller Signed-off-by: Daniel Borkmann Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20220706212855.1700615-3-deso@posteo.net --- tools/testing/selftests/bpf/prog_tests/core_reloc.c | 2 ++ tools/testing/selftests/bpf/progs/core_reloc_types.h | 8 ++++++-- .../selftests/bpf/progs/test_core_reloc_type_based.c | 5 +++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index a6f65e2236f4..c8655ba9a88f 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -764,6 +764,7 @@ static const struct core_reloc_test_case test_cases[] = { .typedef_int_exists = 1, .typedef_enum_exists = 1, .typedef_void_ptr_exists = 1, + .typedef_restrict_ptr_exists = 1, .typedef_func_proto_exists = 1, .typedef_arr_exists = 1, @@ -777,6 +778,7 @@ static const struct core_reloc_test_case test_cases[] = { .typedef_int_matches = 1, .typedef_enum_matches = 1, .typedef_void_ptr_matches = 1, + .typedef_restrict_ptr_matches = 1, .typedef_func_proto_matches = 1, .typedef_arr_matches = 1, diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index 7ef91d19c66e..fd8e1b4c6762 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -874,6 +874,7 @@ struct core_reloc_type_based_output { bool typedef_int_exists; bool typedef_enum_exists; bool typedef_void_ptr_exists; + bool typedef_restrict_ptr_exists; bool typedef_func_proto_exists; bool typedef_arr_exists; @@ -887,6 +888,7 @@ struct core_reloc_type_based_output { bool typedef_int_matches; bool typedef_enum_matches; bool typedef_void_ptr_matches; + bool typedef_restrict_ptr_matches; bool typedef_func_proto_matches; bool typedef_arr_matches; @@ -939,6 +941,7 @@ typedef int int_typedef; typedef enum { TYPEDEF_ENUM_VAL1, TYPEDEF_ENUM_VAL2 } enum_typedef; typedef void *void_ptr_typedef; +typedef int *restrict restrict_ptr_typedef; typedef int (*func_proto_typedef)(long); @@ -955,8 +958,9 @@ struct core_reloc_type_based { int_typedef f8; enum_typedef f9; void_ptr_typedef f10; - func_proto_typedef f11; - arr_typedef f12; + restrict_ptr_typedef f11; + func_proto_typedef f12; + arr_typedef f13; }; /* no types in target */ diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c b/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c index d95bc08b75c1..2edb4df35e6e 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c @@ -51,6 +51,7 @@ typedef int int_typedef; typedef enum { TYPEDEF_ENUM_VAL1, TYPEDEF_ENUM_VAL2 } enum_typedef; typedef void *void_ptr_typedef; +typedef int *restrict restrict_ptr_typedef; typedef int (*func_proto_typedef)(long); @@ -67,6 +68,7 @@ struct core_reloc_type_based_output { bool typedef_int_exists; bool typedef_enum_exists; bool typedef_void_ptr_exists; + bool typedef_restrict_ptr_exists; bool typedef_func_proto_exists; bool typedef_arr_exists; @@ -80,6 +82,7 @@ struct core_reloc_type_based_output { bool typedef_int_matches; bool typedef_enum_matches; bool typedef_void_ptr_matches; + bool typedef_restrict_ptr_matches; bool typedef_func_proto_matches; bool typedef_arr_matches; @@ -118,6 +121,7 @@ int test_core_type_based(void *ctx) out->typedef_int_exists = bpf_core_type_exists(int_typedef); out->typedef_enum_exists = bpf_core_type_exists(enum_typedef); out->typedef_void_ptr_exists = bpf_core_type_exists(void_ptr_typedef); + out->typedef_restrict_ptr_exists = bpf_core_type_exists(restrict_ptr_typedef); out->typedef_func_proto_exists = bpf_core_type_exists(func_proto_typedef); out->typedef_arr_exists = bpf_core_type_exists(arr_typedef); @@ -131,6 +135,7 @@ int test_core_type_based(void *ctx) out->typedef_int_matches = bpf_core_type_matches(int_typedef); out->typedef_enum_matches = bpf_core_type_matches(enum_typedef); out->typedef_void_ptr_matches = bpf_core_type_matches(void_ptr_typedef); + out->typedef_restrict_ptr_matches = bpf_core_type_matches(restrict_ptr_typedef); out->typedef_func_proto_matches = bpf_core_type_matches(func_proto_typedef); out->typedef_arr_matches = bpf_core_type_matches(arr_typedef); From d1a6edecc1fddfb6ef92c8f720631d2c02bf2744 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 8 Jul 2022 10:50:00 -0700 Subject: [PATCH 91/94] bpf: Check attach_func_proto more carefully in check_return_code Syzkaller reports the following crash: RIP: 0010:check_return_code kernel/bpf/verifier.c:10575 [inline] RIP: 0010:do_check kernel/bpf/verifier.c:12346 [inline] RIP: 0010:do_check_common+0xb3d2/0xd250 kernel/bpf/verifier.c:14610 With the following reproducer: bpf$PROG_LOAD_XDP(0x5, &(0x7f00000004c0)={0xd, 0x3, &(0x7f0000000000)=ANY=[@ANYBLOB="1800000000000019000000000000000095"], &(0x7f0000000300)='GPL\x00', 0x0, 0x0, 0x0, 0x0, 0x0, '\x00', 0x0, 0x2b, 0xffffffffffffffff, 0x8, 0x0, 0x0, 0x10, 0x0}, 0x80) Because we don't enforce expected_attach_type for XDP programs, we end up in hitting 'if (prog->expected_attach_type == BPF_LSM_CGROUP' part in check_return_code and follow up with testing `prog->aux->attach_func_proto->type`, but `prog->aux->attach_func_proto` is NULL. Add explicit prog_type check for the "Note, BPF_LSM_CGROUP that attach ..." condition. Also, don't skip return code check for LSM/STRUCT_OPS. The above actually brings an issue with existing selftest which tries to return EPERM from void inet_csk_clone. Fix the test (and move called_socket_clone to make sure it's not incremented in case of an error) and add a new one to explicitly verify this condition. Fixes: 69fd337a975c ("bpf: per-cgroup lsm flavor") Reported-by: syzbot+5cc0730bd4b4d2c5f152@syzkaller.appspotmail.com Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220708175000.2603078-1-sdf@google.com --- kernel/bpf/verifier.c | 21 ++++++++++++++----- .../selftests/bpf/prog_tests/lsm_cgroup.c | 12 +++++++++++ .../testing/selftests/bpf/progs/lsm_cgroup.c | 12 +++++------ .../selftests/bpf/progs/lsm_cgroup_nonvoid.c | 14 +++++++++++++ 4 files changed, 48 insertions(+), 11 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/lsm_cgroup_nonvoid.c diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index df3ec6b05f05..e3cf6194c24f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10444,11 +10444,21 @@ static int check_return_code(struct bpf_verifier_env *env) const bool is_subprog = frame->subprogno; /* LSM and struct_ops func-ptr's return type could be "void" */ - if (!is_subprog && - (prog_type == BPF_PROG_TYPE_STRUCT_OPS || - prog_type == BPF_PROG_TYPE_LSM) && - !prog->aux->attach_func_proto->type) - return 0; + if (!is_subprog) { + switch (prog_type) { + case BPF_PROG_TYPE_LSM: + if (prog->expected_attach_type == BPF_LSM_CGROUP) + /* See below, can be 0 or 0-1 depending on hook. */ + break; + fallthrough; + case BPF_PROG_TYPE_STRUCT_OPS: + if (!prog->aux->attach_func_proto->type) + return 0; + break; + default: + break; + } + } /* eBPF calling convention is such that R0 is used * to return the value from eBPF program. @@ -10572,6 +10582,7 @@ static int check_return_code(struct bpf_verifier_env *env) if (!tnum_in(range, reg->var_off)) { verbose_invalid_scalar(env, reg, &range, "program exit", "R0"); if (prog->expected_attach_type == BPF_LSM_CGROUP && + prog_type == BPF_PROG_TYPE_LSM && !prog->aux->attach_func_proto->type) verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); return -EINVAL; diff --git a/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c b/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c index c542d7e80a5b..1102e4f42d2d 100644 --- a/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c +++ b/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c @@ -6,6 +6,7 @@ #include #include "lsm_cgroup.skel.h" +#include "lsm_cgroup_nonvoid.skel.h" #include "cgroup_helpers.h" #include "network_helpers.h" @@ -293,9 +294,20 @@ close_cgroup: lsm_cgroup__destroy(skel); } +static void test_lsm_cgroup_nonvoid(void) +{ + struct lsm_cgroup_nonvoid *skel = NULL; + + skel = lsm_cgroup_nonvoid__open_and_load(); + ASSERT_NULL(skel, "open succeeds"); + lsm_cgroup_nonvoid__destroy(skel); +} + void test_lsm_cgroup(void) { if (test__start_subtest("functional")) test_lsm_cgroup_functional(); + if (test__start_subtest("nonvoid")) + test_lsm_cgroup_nonvoid(); btf__free(btf); } diff --git a/tools/testing/selftests/bpf/progs/lsm_cgroup.c b/tools/testing/selftests/bpf/progs/lsm_cgroup.c index 89f3b1e961a8..4f2d60b87b75 100644 --- a/tools/testing/selftests/bpf/progs/lsm_cgroup.c +++ b/tools/testing/selftests/bpf/progs/lsm_cgroup.c @@ -156,25 +156,25 @@ int BPF_PROG(socket_clone, struct sock *newsk, const struct request_sock *req) { int prio = 234; - called_socket_clone++; - if (!newsk) return 1; /* Accepted request sockets get a different priority. */ if (bpf_setsockopt(newsk, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio))) - return 0; /* EPERM */ + return 1; /* Make sure bpf_getsockopt is allowed and works. */ prio = 0; if (bpf_getsockopt(newsk, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio))) - return 0; /* EPERM */ + return 1; if (prio != 234) - return 0; /* EPERM */ + return 1; /* Can access cgroup local storage. */ if (!test_local_storage()) - return 0; /* EPERM */ + return 1; + + called_socket_clone++; return 1; } diff --git a/tools/testing/selftests/bpf/progs/lsm_cgroup_nonvoid.c b/tools/testing/selftests/bpf/progs/lsm_cgroup_nonvoid.c new file mode 100644 index 000000000000..6cb0f161f417 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/lsm_cgroup_nonvoid.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +SEC("lsm_cgroup/inet_csk_clone") +int BPF_PROG(nonvoid_socket_clone, struct sock *newsk, const struct request_sock *req) +{ + /* Can not return any errors from void LSM hooks. */ + return 0; +} From 18410251f66aee7e82234073ce6656ca20a732a9 Mon Sep 17 00:00:00 2001 From: James Hilliard Date: Wed, 6 Jul 2022 05:18:38 -0600 Subject: [PATCH 92/94] libbpf: Disable SEC pragma macro on GCC It seems the gcc preprocessor breaks with pragmas when surrounding __attribute__. Disable these pragmas on GCC due to upstream bugs see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55578 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90400 Fixes errors like: error: expected identifier or '(' before '#pragma' 106 | SEC("cgroup/bind6") | ^~~ error: expected '=', ',', ';', 'asm' or '__attribute__' before '#pragma' 114 | char _license[] SEC("license") = "GPL"; | ^~~ Signed-off-by: James Hilliard Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220706111839.1247911-1-james.hilliard1@gmail.com --- tools/lib/bpf/bpf_helpers.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index fb04eaf367f1..7349b16b8e2f 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -22,12 +22,25 @@ * To allow use of SEC() with externs (e.g., for extern .maps declarations), * make sure __attribute__((unused)) doesn't trigger compilation warning. */ +#if __GNUC__ && !__clang__ + +/* + * Pragma macros are broken on GCC + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55578 + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90400 + */ +#define SEC(name) __attribute__((section(name), used)) + +#else + #define SEC(name) \ _Pragma("GCC diagnostic push") \ _Pragma("GCC diagnostic ignored \"-Wignored-attributes\"") \ __attribute__((section(name), used)) \ _Pragma("GCC diagnostic pop") \ +#endif + /* Avoid 'linux/stddef.h' definition of '__always_inline'. */ #undef __always_inline #define __always_inline inline __attribute__((always_inline)) From 06cd4e9d5d969d713e6b710f0e5ca0bc8476ae41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20M=C3=BCller?= Date: Thu, 7 Jul 2022 21:19:31 +0000 Subject: [PATCH 93/94] bpf: Correctly propagate errors up from bpf_core_composites_match MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change addresses a comment made earlier [0] about a missing return of an error when __bpf_core_types_match is invoked from bpf_core_composites_match, which could have let to us erroneously ignoring errors. Regarding the typedef name check pointed out in the same context, it is not actually an issue, because callers of the function perform a name check for the root type anyway. To make that more obvious, let's add comments to the function (similar to what we have for bpf_core_types_are_compat, which is called in pretty much the same context). [0]: https://lore.kernel.org/bpf/165708121449.4919.13204634393477172905.git-patchwork-notify@kernel.org/T/#m55141e8f8cfd2e8d97e65328fa04852870d01af6 Suggested-by: Andrii Nakryiko Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220707211931.3415440-1-deso@posteo.net --- tools/lib/bpf/relo_core.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c index fe2533022aa9..c4b0e81ae293 100644 --- a/tools/lib/bpf/relo_core.c +++ b/tools/lib/bpf/relo_core.c @@ -1500,6 +1500,8 @@ static int bpf_core_composites_match(const struct btf *local_btf, const struct b err = __bpf_core_types_match(local_btf, local_m->type, targ_btf, targ_m->type, behind_ptr, level - 1); + if (err < 0) + return err; if (err > 0) { matched = true; break; @@ -1512,7 +1514,8 @@ static int bpf_core_composites_match(const struct btf *local_btf, const struct b return 1; } -/* Check that two types "match". +/* Check that two types "match". This function assumes that root types were + * already checked for name match. * * The matching relation is defined as follows: * - modifiers and typedefs are stripped (and, hence, effectively ignored) @@ -1561,6 +1564,10 @@ recur: if (!local_t || !targ_t) return -EINVAL; + /* While the name check happens after typedefs are skipped, root-level + * typedefs would still be name-matched as that's the contract with + * callers. + */ if (!bpf_core_names_match(local_btf, local_t->name_off, targ_btf, targ_t->name_off)) return 0; From 24bdfdd2ec343c94adf38fb5bc699f12e543713b Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Fri, 8 Jul 2022 16:03:19 +0300 Subject: [PATCH 94/94] selftests/bpf: Fix xdp_synproxy build failure if CONFIG_NF_CONNTRACK=m/n When CONFIG_NF_CONNTRACK=m, struct bpf_ct_opts and enum member BPF_F_CURRENT_NETNS are not exposed. This commit allows building the xdp_synproxy selftest in such cases. Note that nf_conntrack must be loaded before running the test if it's compiled as a module. This commit also allows this selftest to be successfully compiled when CONFIG_NF_CONNTRACK is disabled. One unused local variable of type struct bpf_ct_opts is also removed. Fixes: fb5cd0ce70d4 ("selftests/bpf: Add selftests for raw syncookie helpers") Reported-by: Yauheni Kaliuta Signed-off-by: Maxim Mikityanskiy Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220708130319.1016294-1-maximmi@nvidia.com --- .../selftests/bpf/progs/xdp_synproxy_kern.c | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c index 9fd62e94b5e6..736686e903f6 100644 --- a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c +++ b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c @@ -77,16 +77,30 @@ struct { __uint(max_entries, MAX_ALLOWED_PORTS); } allowed_ports SEC(".maps"); +/* Some symbols defined in net/netfilter/nf_conntrack_bpf.c are unavailable in + * vmlinux.h if CONFIG_NF_CONNTRACK=m, so they are redefined locally. + */ + +struct bpf_ct_opts___local { + s32 netns_id; + s32 error; + u8 l4proto; + u8 dir; + u8 reserved[2]; +} __attribute__((preserve_access_index)); + +#define BPF_F_CURRENT_NETNS (-1) + extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, __u32 len_tuple, - struct bpf_ct_opts *opts, + struct bpf_ct_opts___local *opts, __u32 len_opts) __ksym; extern struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, u32 len_tuple, - struct bpf_ct_opts *opts, + struct bpf_ct_opts___local *opts, u32 len_opts) __ksym; extern void bpf_ct_release(struct nf_conn *ct) __ksym; @@ -393,7 +407,7 @@ static __always_inline int tcp_dissect(void *data, void *data_end, static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bool xdp) { - struct bpf_ct_opts ct_lookup_opts = { + struct bpf_ct_opts___local ct_lookup_opts = { .netns_id = BPF_F_CURRENT_NETNS, .l4proto = IPPROTO_TCP, }; @@ -714,10 +728,6 @@ static __always_inline int syncookie_handle_ack(struct header_pointers *hdr) static __always_inline int syncookie_part1(void *ctx, void *data, void *data_end, struct header_pointers *hdr, bool xdp) { - struct bpf_ct_opts ct_lookup_opts = { - .netns_id = BPF_F_CURRENT_NETNS, - .l4proto = IPPROTO_TCP, - }; int ret; ret = tcp_dissect(data, data_end, hdr);