From a6e944f25cdbe6b82275402b8bc9a55ad7aac10b Mon Sep 17 00:00:00 2001 From: Ciara Loftus Date: Tue, 14 Jun 2022 07:07:46 +0000 Subject: [PATCH 01/12] xsk: Fix generic transmit when completion queue reservation fails Two points of potential failure in the generic transmit function are: 1. completion queue (cq) reservation failure. 2. skb allocation failure Originally the cq reservation was performed first, followed by the skb allocation. Commit 675716400da6 ("xdp: fix possible cq entry leak") reversed the order because at the time there was no mechanism available to undo the cq reservation which could have led to possible cq entry leaks in the event of skb allocation failure. However if the skb allocation is performed first and the cq reservation then fails, the xsk skb destructor is called which blindly adds the skb address to the already full cq leading to undefined behavior. This commit restores the original order (cq reservation followed by skb allocation) and uses the xskq_prod_cancel helper to undo the cq reserve in event of skb allocation failure. Fixes: 675716400da6 ("xdp: fix possible cq entry leak") Signed-off-by: Ciara Loftus Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20220614070746.8871-1-ciara.loftus@intel.com --- net/xdp/xsk.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 19ac872a6624..09002387987e 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -538,12 +538,6 @@ static int xsk_generic_xmit(struct sock *sk) goto out; } - skb = xsk_build_skb(xs, &desc); - if (IS_ERR(skb)) { - err = PTR_ERR(skb); - goto out; - } - /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed * if there is space in it. This avoids having to implement @@ -552,11 +546,19 @@ static int xsk_generic_xmit(struct sock *sk) spin_lock_irqsave(&xs->pool->cq_lock, flags); if (xskq_prod_reserve(xs->pool->cq)) { spin_unlock_irqrestore(&xs->pool->cq_lock, flags); - kfree_skb(skb); goto out; } spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + skb = xsk_build_skb(xs, &desc); + if (IS_ERR(skb)) { + err = PTR_ERR(skb); + spin_lock_irqsave(&xs->pool->cq_lock, flags); + xskq_prod_cancel(xs->pool->cq); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + goto out; + } + err = __dev_direct_xmit(skb, xs->queue_id); if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ From 3046a827316c0e55fc563b4fb78c93b9ca5c7c37 Mon Sep 17 00:00:00 2001 From: Jon Maxwell Date: Wed, 15 Jun 2022 11:15:40 +1000 Subject: [PATCH 02/12] bpf: Fix request_sock leak in sk lookup helpers A customer reported a request_socket leak in a Calico cloud environment. We found that a BPF program was doing a socket lookup with takes a refcnt on the socket and that it was finding the request_socket but returning the parent LISTEN socket via sk_to_full_sk() without decrementing the child request socket 1st, resulting in request_sock slab object leak. This patch retains the existing behaviour of returning full socks to the caller but it also decrements the child request_socket if one is present before doing so to prevent the leak. Thanks to Curtis Taylor for all the help in diagnosing and testing this. And thanks to Antoine Tenart for the reproducer and patch input. v2 of this patch contains, refactor as per Daniel Borkmann's suggestions to validate RCU flags on the listen socket so that it balances with bpf_sk_release() and update comments as per Martin KaFai Lau's suggestion. One small change to Daniels suggestion, put "sk = sk2" under "if (sk2 != sk)" to avoid an extra instruction. Fixes: f7355a6c0497 ("bpf: Check sk_fullsock() before returning from bpf_sk_lookup()") Fixes: edbf8c01de5a ("bpf: add skc_lookup_tcp helper") Co-developed-by: Antoine Tenart Signed-off-by: Antoine Tenart Signed-off-by: Jon Maxwell Signed-off-by: Daniel Borkmann Tested-by: Curtis Taylor Cc: Martin KaFai Lau Link: https://lore.kernel.org/bpf/56d6f898-bde0-bb25-3427-12a330b29fb8@iogearbox.net Link: https://lore.kernel.org/bpf/20220615011540.813025-1-jmaxwell37@gmail.com --- net/core/filter.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 5af58eb48587..5d16d66727fc 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6516,10 +6516,21 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, ifindex, proto, netns_id, flags); if (sk) { - sk = sk_to_full_sk(sk); - if (!sk_fullsock(sk)) { + struct sock *sk2 = sk_to_full_sk(sk); + + /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk + * sock refcnt is decremented to prevent a request_sock leak. + */ + if (!sk_fullsock(sk2)) + sk2 = NULL; + if (sk2 != sk) { sock_gen_put(sk); - return NULL; + /* Ensure there is no need to bump sk2 refcnt */ + if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + return NULL; + } + sk = sk2; } } @@ -6553,10 +6564,21 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, flags); if (sk) { - sk = sk_to_full_sk(sk); - if (!sk_fullsock(sk)) { + struct sock *sk2 = sk_to_full_sk(sk); + + /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk + * sock refcnt is decremented to prevent a request_sock leak. + */ + if (!sk_fullsock(sk2)) + sk2 = NULL; + if (sk2 != sk) { sock_gen_put(sk); - return NULL; + /* Ensure there is no need to bump sk2 refcnt */ + if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + return NULL; + } + sk = sk2; } } From d1a374a1aeb7e31191448e225ed2f9c5e894f280 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 15 Jun 2022 09:51:51 +0530 Subject: [PATCH 03/12] bpf: Limit maximum modifier chain length in btf_check_type_tags On processing a module BTF of module built for an older kernel, we might sometimes find that some type points to itself forming a loop. If such a type is a modifier, btf_check_type_tags's while loop following modifier chain will be caught in an infinite loop. Fix this by defining a maximum chain length and bailing out if we spin any longer than that. Fixes: eb596b090558 ("bpf: Ensure type tags precede modifiers in BTF") Reported-by: Daniel Borkmann Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220615042151.2266537-1-memxor@gmail.com --- kernel/bpf/btf.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 63d0ac7dfe2f..eb12d4f705cc 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4815,6 +4815,7 @@ static int btf_check_type_tags(struct btf_verifier_env *env, n = btf_nr_types(btf); for (i = start_id; i < n; i++) { const struct btf_type *t; + int chain_limit = 32; u32 cur_id = i; t = btf_type_by_id(btf, i); @@ -4827,6 +4828,10 @@ static int btf_check_type_tags(struct btf_verifier_env *env, in_tags = btf_type_is_type_tag(t); while (btf_type_is_modifier(t)) { + if (!chain_limit--) { + btf_verifier_log(env, "Max chain length or cycle detected"); + return -ELOOP; + } if (btf_type_is_type_tag(t)) { if (!in_tags) { btf_verifier_log(env, "Type tags don't precede modifiers"); From ff672c67ee7635ca1e28fb13729e8ef0d1f08ce5 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 16 Jun 2022 18:20:36 +0200 Subject: [PATCH 04/12] bpf, x86: Fix tail call count offset calculation on bpf2bpf call On x86-64 the tail call count is passed from one BPF function to another through %rax. Additionally, on function entry, the tail call count value is stored on stack right after the BPF program stack, due to register shortage. The stored count is later loaded from stack either when performing a tail call - to check if we have not reached the tail call limit - or before calling another BPF function call in order to pass it via %rax. In the latter case, we miscalculate the offset at which the tail call count was stored on function entry. The JIT does not take into account that the allocated BPF program stack is always a multiple of 8 on x86, while the actual stack depth does not have to be. This leads to a load from an offset that belongs to the BPF stack, as shown in the example below: SEC("tc") int entry(struct __sk_buff *skb) { /* Have data on stack which size is not a multiple of 8 */ volatile char arr[1] = {}; return subprog_tail(skb); } int entry(struct __sk_buff * skb): 0: (b4) w2 = 0 1: (73) *(u8 *)(r10 -1) = r2 2: (85) call pc+1#bpf_prog_ce2f79bb5f3e06dd_F 3: (95) exit int entry(struct __sk_buff * skb): 0xffffffffa0201788: nop DWORD PTR [rax+rax*1+0x0] 0xffffffffa020178d: xor eax,eax 0xffffffffa020178f: push rbp 0xffffffffa0201790: mov rbp,rsp 0xffffffffa0201793: sub rsp,0x8 0xffffffffa020179a: push rax 0xffffffffa020179b: xor esi,esi 0xffffffffa020179d: mov BYTE PTR [rbp-0x1],sil 0xffffffffa02017a1: mov rax,QWORD PTR [rbp-0x9] !!! tail call count 0xffffffffa02017a8: call 0xffffffffa02017d8 !!! is at rbp-0x10 0xffffffffa02017ad: leave 0xffffffffa02017ae: ret Fix it by rounding up the BPF stack depth to a multiple of 8, when calculating the tail call count offset on stack. Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT") Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: Maciej Fijalkowski Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220616162037.535469-2-jakub@cloudflare.com --- arch/x86/net/bpf_jit_comp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index f298b18a9a3d..c98b8c0ed3b8 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1420,8 +1420,9 @@ st: if (is_imm8(insn->off)) case BPF_JMP | BPF_CALL: func = (u8 *) __bpf_call_base + imm32; if (tail_call_reachable) { + /* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */ EMIT3_off32(0x48, 0x8B, 0x85, - -(bpf_prog->aux->stack_depth + 8)); + -round_up(bpf_prog->aux->stack_depth, 8) - 8); if (!imm32 || emit_call(&prog, func, image + addrs[i - 1] + 7)) return -EINVAL; } else { From 5e0b0a4c52d30bb09659446f40b77a692361600d Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 16 Jun 2022 18:20:37 +0200 Subject: [PATCH 05/12] selftests/bpf: Test tail call counting with bpf2bpf and data on stack Cover the case when tail call count needs to be passed from BPF function to BPF function, and the caller has data on stack. Specifically when the size of data allocated on BPF stack is not a multiple on 8. Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220616162037.535469-3-jakub@cloudflare.com --- .../selftests/bpf/prog_tests/tailcalls.c | 55 +++++++++++++++++++ .../selftests/bpf/progs/tailcall_bpf2bpf6.c | 42 ++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/tailcall_bpf2bpf6.c diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c index c4da87ec3ba4..19c70880cfb3 100644 --- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c +++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c @@ -831,6 +831,59 @@ out: bpf_object__close(obj); } +#include "tailcall_bpf2bpf6.skel.h" + +/* Tail call counting works even when there is data on stack which is + * not aligned to 8 bytes. + */ +static void test_tailcall_bpf2bpf_6(void) +{ + struct tailcall_bpf2bpf6 *obj; + int err, map_fd, prog_fd, main_fd, data_fd, i, val; + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1, + ); + + obj = tailcall_bpf2bpf6__open_and_load(); + if (!ASSERT_OK_PTR(obj, "open and load")) + return; + + main_fd = bpf_program__fd(obj->progs.entry); + if (!ASSERT_GE(main_fd, 0, "entry prog fd")) + goto out; + + map_fd = bpf_map__fd(obj->maps.jmp_table); + if (!ASSERT_GE(map_fd, 0, "jmp_table map fd")) + goto out; + + prog_fd = bpf_program__fd(obj->progs.classifier_0); + if (!ASSERT_GE(prog_fd, 0, "classifier_0 prog fd")) + goto out; + + i = 0; + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (!ASSERT_OK(err, "jmp_table map update")) + goto out; + + err = bpf_prog_test_run_opts(main_fd, &topts); + ASSERT_OK(err, "entry prog test run"); + ASSERT_EQ(topts.retval, 0, "tailcall retval"); + + data_fd = bpf_map__fd(obj->maps.bss); + if (!ASSERT_GE(map_fd, 0, "bss map fd")) + goto out; + + i = 0; + err = bpf_map_lookup_elem(data_fd, &i, &val); + ASSERT_OK(err, "bss map lookup"); + ASSERT_EQ(val, 1, "done flag is set"); + +out: + tailcall_bpf2bpf6__destroy(obj); +} + void test_tailcalls(void) { if (test__start_subtest("tailcall_1")) @@ -855,4 +908,6 @@ void test_tailcalls(void) test_tailcall_bpf2bpf_4(false); if (test__start_subtest("tailcall_bpf2bpf_5")) test_tailcall_bpf2bpf_4(true); + if (test__start_subtest("tailcall_bpf2bpf_6")) + test_tailcall_bpf2bpf_6(); } diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf6.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf6.c new file mode 100644 index 000000000000..41ce83da78e8 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf6.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#define __unused __attribute__((unused)) + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +int done = 0; + +SEC("tc") +int classifier_0(struct __sk_buff *skb __unused) +{ + done = 1; + return 0; +} + +static __noinline +int subprog_tail(struct __sk_buff *skb) +{ + /* Don't propagate the constant to the caller */ + volatile int ret = 1; + + bpf_tail_call_static(skb, &jmp_table, 0); + return ret; +} + +SEC("tc") +int entry(struct __sk_buff *skb) +{ + /* Have data on stack which size is not a multiple of 8 */ + volatile char arr[1] = {}; + + return subprog_tail(skb); +} + +char __license[] SEC("license") = "GPL"; From ad8848535e97f4a5374fc68f7a5d16e2565940cc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Jun 2022 13:21:15 +0200 Subject: [PATCH 06/12] selftests/bpf: Shuffle cookies symbols in kprobe multi test There's a kernel bug that causes cookies to be misplaced and the reason we did not catch this with this test is that we provide bpf_fentry_test* functions already sorted by name. Shuffling function bpf_fentry_test2 deeper in the list and keeping the current cookie values as before will trigger the bug. The kernel fix is coming in following changes. Acked-by: Song Liu Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220615112118.497303-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/bpf_cookie.c | 78 +++++++++---------- .../selftests/bpf/progs/kprobe_multi.c | 24 +++--- 2 files changed, 51 insertions(+), 51 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c index 83ef55e3caa4..2974b44f80fa 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c @@ -121,24 +121,24 @@ static void kprobe_multi_link_api_subtest(void) }) GET_ADDR("bpf_fentry_test1", addrs[0]); - GET_ADDR("bpf_fentry_test2", addrs[1]); - GET_ADDR("bpf_fentry_test3", addrs[2]); - GET_ADDR("bpf_fentry_test4", addrs[3]); - GET_ADDR("bpf_fentry_test5", addrs[4]); - GET_ADDR("bpf_fentry_test6", addrs[5]); - GET_ADDR("bpf_fentry_test7", addrs[6]); + GET_ADDR("bpf_fentry_test3", addrs[1]); + GET_ADDR("bpf_fentry_test4", addrs[2]); + GET_ADDR("bpf_fentry_test5", addrs[3]); + GET_ADDR("bpf_fentry_test6", addrs[4]); + GET_ADDR("bpf_fentry_test7", addrs[5]); + GET_ADDR("bpf_fentry_test2", addrs[6]); GET_ADDR("bpf_fentry_test8", addrs[7]); #undef GET_ADDR - cookies[0] = 1; - cookies[1] = 2; - cookies[2] = 3; - cookies[3] = 4; - cookies[4] = 5; - cookies[5] = 6; - cookies[6] = 7; - cookies[7] = 8; + cookies[0] = 1; /* bpf_fentry_test1 */ + cookies[1] = 2; /* bpf_fentry_test3 */ + cookies[2] = 3; /* bpf_fentry_test4 */ + cookies[3] = 4; /* bpf_fentry_test5 */ + cookies[4] = 5; /* bpf_fentry_test6 */ + cookies[5] = 6; /* bpf_fentry_test7 */ + cookies[6] = 7; /* bpf_fentry_test2 */ + cookies[7] = 8; /* bpf_fentry_test8 */ opts.kprobe_multi.addrs = (const unsigned long *) &addrs; opts.kprobe_multi.cnt = ARRAY_SIZE(addrs); @@ -149,14 +149,14 @@ static void kprobe_multi_link_api_subtest(void) if (!ASSERT_GE(link1_fd, 0, "link1_fd")) goto cleanup; - cookies[0] = 8; - cookies[1] = 7; - cookies[2] = 6; - cookies[3] = 5; - cookies[4] = 4; - cookies[5] = 3; - cookies[6] = 2; - cookies[7] = 1; + cookies[0] = 8; /* bpf_fentry_test1 */ + cookies[1] = 7; /* bpf_fentry_test3 */ + cookies[2] = 6; /* bpf_fentry_test4 */ + cookies[3] = 5; /* bpf_fentry_test5 */ + cookies[4] = 4; /* bpf_fentry_test6 */ + cookies[5] = 3; /* bpf_fentry_test7 */ + cookies[6] = 2; /* bpf_fentry_test2 */ + cookies[7] = 1; /* bpf_fentry_test8 */ opts.kprobe_multi.flags = BPF_F_KPROBE_MULTI_RETURN; prog_fd = bpf_program__fd(skel->progs.test_kretprobe); @@ -181,12 +181,12 @@ static void kprobe_multi_attach_api_subtest(void) struct kprobe_multi *skel = NULL; const char *syms[8] = { "bpf_fentry_test1", - "bpf_fentry_test2", "bpf_fentry_test3", "bpf_fentry_test4", "bpf_fentry_test5", "bpf_fentry_test6", "bpf_fentry_test7", + "bpf_fentry_test2", "bpf_fentry_test8", }; __u64 cookies[8]; @@ -198,14 +198,14 @@ static void kprobe_multi_attach_api_subtest(void) skel->bss->pid = getpid(); skel->bss->test_cookie = true; - cookies[0] = 1; - cookies[1] = 2; - cookies[2] = 3; - cookies[3] = 4; - cookies[4] = 5; - cookies[5] = 6; - cookies[6] = 7; - cookies[7] = 8; + cookies[0] = 1; /* bpf_fentry_test1 */ + cookies[1] = 2; /* bpf_fentry_test3 */ + cookies[2] = 3; /* bpf_fentry_test4 */ + cookies[3] = 4; /* bpf_fentry_test5 */ + cookies[4] = 5; /* bpf_fentry_test6 */ + cookies[5] = 6; /* bpf_fentry_test7 */ + cookies[6] = 7; /* bpf_fentry_test2 */ + cookies[7] = 8; /* bpf_fentry_test8 */ opts.syms = syms; opts.cnt = ARRAY_SIZE(syms); @@ -216,14 +216,14 @@ static void kprobe_multi_attach_api_subtest(void) if (!ASSERT_OK_PTR(link1, "bpf_program__attach_kprobe_multi_opts")) goto cleanup; - cookies[0] = 8; - cookies[1] = 7; - cookies[2] = 6; - cookies[3] = 5; - cookies[4] = 4; - cookies[5] = 3; - cookies[6] = 2; - cookies[7] = 1; + cookies[0] = 8; /* bpf_fentry_test1 */ + cookies[1] = 7; /* bpf_fentry_test3 */ + cookies[2] = 6; /* bpf_fentry_test4 */ + cookies[3] = 5; /* bpf_fentry_test5 */ + cookies[4] = 4; /* bpf_fentry_test6 */ + cookies[5] = 3; /* bpf_fentry_test7 */ + cookies[6] = 2; /* bpf_fentry_test2 */ + cookies[7] = 1; /* bpf_fentry_test8 */ opts.retprobe = true; diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi.c b/tools/testing/selftests/bpf/progs/kprobe_multi.c index 93510f4f0f3a..08f95a8155d1 100644 --- a/tools/testing/selftests/bpf/progs/kprobe_multi.c +++ b/tools/testing/selftests/bpf/progs/kprobe_multi.c @@ -54,21 +54,21 @@ static void kprobe_multi_check(void *ctx, bool is_return) if (is_return) { SET(kretprobe_test1_result, &bpf_fentry_test1, 8); - SET(kretprobe_test2_result, &bpf_fentry_test2, 7); - SET(kretprobe_test3_result, &bpf_fentry_test3, 6); - SET(kretprobe_test4_result, &bpf_fentry_test4, 5); - SET(kretprobe_test5_result, &bpf_fentry_test5, 4); - SET(kretprobe_test6_result, &bpf_fentry_test6, 3); - SET(kretprobe_test7_result, &bpf_fentry_test7, 2); + SET(kretprobe_test2_result, &bpf_fentry_test2, 2); + SET(kretprobe_test3_result, &bpf_fentry_test3, 7); + SET(kretprobe_test4_result, &bpf_fentry_test4, 6); + SET(kretprobe_test5_result, &bpf_fentry_test5, 5); + SET(kretprobe_test6_result, &bpf_fentry_test6, 4); + SET(kretprobe_test7_result, &bpf_fentry_test7, 3); SET(kretprobe_test8_result, &bpf_fentry_test8, 1); } else { SET(kprobe_test1_result, &bpf_fentry_test1, 1); - SET(kprobe_test2_result, &bpf_fentry_test2, 2); - SET(kprobe_test3_result, &bpf_fentry_test3, 3); - SET(kprobe_test4_result, &bpf_fentry_test4, 4); - SET(kprobe_test5_result, &bpf_fentry_test5, 5); - SET(kprobe_test6_result, &bpf_fentry_test6, 6); - SET(kprobe_test7_result, &bpf_fentry_test7, 7); + SET(kprobe_test2_result, &bpf_fentry_test2, 7); + SET(kprobe_test3_result, &bpf_fentry_test3, 2); + SET(kprobe_test4_result, &bpf_fentry_test4, 3); + SET(kprobe_test5_result, &bpf_fentry_test5, 4); + SET(kprobe_test6_result, &bpf_fentry_test6, 5); + SET(kprobe_test7_result, &bpf_fentry_test7, 6); SET(kprobe_test8_result, &bpf_fentry_test8, 8); } From eb1b2985fe5c5f02e43e4c0d47bbe7ed835007f3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Jun 2022 13:21:16 +0200 Subject: [PATCH 07/12] ftrace: Keep address offset in ftrace_lookup_symbols We want to store the resolved address on the same index as the symbol string, because that's the user (bpf kprobe link) code assumption. Also making sure we don't store duplicates that might be present in kallsyms. Acked-by: Song Liu Acked-by: Steven Rostedt (Google) Fixes: bed0d9a50dac ("ftrace: Add ftrace_lookup_symbols function") Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220615112118.497303-3-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/ftrace.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e750fe141a60..601ccf1b2f09 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -8029,15 +8029,23 @@ static int kallsyms_callback(void *data, const char *name, struct module *mod, unsigned long addr) { struct kallsyms_data *args = data; + const char **sym; + int idx; - if (!bsearch(&name, args->syms, args->cnt, sizeof(*args->syms), symbols_cmp)) + sym = bsearch(&name, args->syms, args->cnt, sizeof(*args->syms), symbols_cmp); + if (!sym) + return 0; + + idx = sym - args->syms; + if (args->addrs[idx]) return 0; addr = ftrace_location(addr); if (!addr) return 0; - args->addrs[args->found++] = addr; + args->addrs[idx] = addr; + args->found++; return args->found == args->cnt ? 1 : 0; } @@ -8062,6 +8070,7 @@ int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *a struct kallsyms_data args; int err; + memset(addrs, 0, sizeof(*addrs) * cnt); args.addrs = addrs; args.syms = sorted_syms; args.cnt = cnt; From eb5fb0325698d05f0bf78d322de82c451a3685a2 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Jun 2022 13:21:17 +0200 Subject: [PATCH 08/12] bpf: Force cookies array to follow symbols sorting When user specifies symbols and cookies for kprobe_multi link interface it's very likely the cookies will be misplaced and returned to wrong functions (via get_attach_cookie helper). The reason is that to resolve the provided functions we sort them before passing them to ftrace_lookup_symbols, but we do not do the same sort on the cookie values. Fixing this by using sort_r function with custom swap callback that swaps cookie values as well. Fixes: 0236fec57a15 ("bpf: Resolve symbols with ftrace_lookup_symbols for kprobe multi link") Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220615112118.497303-4-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 70 ++++++++++++++++++++++++++++------------ 1 file changed, 50 insertions(+), 20 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7a13e6ac6327..88589d74a892 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2423,7 +2423,7 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip, kprobe_multi_link_prog_run(link, entry_ip, regs); } -static int symbols_cmp(const void *a, const void *b) +static int symbols_cmp_r(const void *a, const void *b, const void *priv) { const char **str_a = (const char **) a; const char **str_b = (const char **) b; @@ -2431,6 +2431,28 @@ static int symbols_cmp(const void *a, const void *b) return strcmp(*str_a, *str_b); } +struct multi_symbols_sort { + const char **funcs; + u64 *cookies; +}; + +static void symbols_swap_r(void *a, void *b, int size, const void *priv) +{ + const struct multi_symbols_sort *data = priv; + const char **name_a = a, **name_b = b; + + swap(*name_a, *name_b); + + /* If defined, swap also related cookies. */ + if (data->cookies) { + u64 *cookie_a, *cookie_b; + + cookie_a = data->cookies + (name_a - data->funcs); + cookie_b = data->cookies + (name_b - data->funcs); + swap(*cookie_a, *cookie_b); + } +} + int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_kprobe_multi_link *link = NULL; @@ -2468,25 +2490,6 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (!addrs) return -ENOMEM; - if (uaddrs) { - if (copy_from_user(addrs, uaddrs, size)) { - err = -EFAULT; - goto error; - } - } else { - struct user_syms us; - - err = copy_user_syms(&us, usyms, cnt); - if (err) - goto error; - - sort(us.syms, cnt, sizeof(*us.syms), symbols_cmp, NULL); - err = ftrace_lookup_symbols(us.syms, cnt, addrs); - free_user_syms(&us); - if (err) - goto error; - } - ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies); if (ucookies) { cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); @@ -2500,6 +2503,33 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr } } + if (uaddrs) { + if (copy_from_user(addrs, uaddrs, size)) { + err = -EFAULT; + goto error; + } + } else { + struct multi_symbols_sort data = { + .cookies = cookies, + }; + struct user_syms us; + + err = copy_user_syms(&us, usyms, cnt); + if (err) + goto error; + + if (cookies) + data.funcs = us.syms; + + sort_r(us.syms, cnt, sizeof(*us.syms), symbols_cmp_r, + symbols_swap_r, &data); + + err = ftrace_lookup_symbols(us.syms, cnt, addrs); + free_user_syms(&us); + if (err) + goto error; + } + link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) { err = -ENOMEM; From 730067022c0137691b27726377c2d088f7f8e33c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Jun 2022 13:21:18 +0200 Subject: [PATCH 09/12] selftest/bpf: Fix kprobe_multi bench test With [1] the available_filter_functions file contains records starting with __ftrace_invalid_address___ and marking disabled entries. We need to filter them out for the bench test to pass only resolvable symbols to kernel. [1] commit b39181f7c690 ("ftrace: Add FTRACE_MCOUNT_MAX_OFFSET to avoid adding weak function") Fixes: b39181f7c690 ("ftrace: Add FTRACE_MCOUNT_MAX_OFFSET to avoid adding weak function") Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220615112118.497303-5-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 586dc52d6fb9..5b93d5d0bd93 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -364,6 +364,9 @@ static int get_syms(char ***symsp, size_t *cntp) continue; if (!strncmp(name, "rcu_", 4)) continue; + if (!strncmp(name, "__ftrace_invalid_address__", + sizeof("__ftrace_invalid_address__") - 1)) + continue; err = hashmap__add(map, name, NULL); if (err) { free(name); From 63ce81d1c40459e2d9d28f90e2a3e3863e2f63d4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 17 Jun 2022 21:42:33 +0200 Subject: [PATCH 10/12] bpf, docs: Update some of the JIT/maintenance entries Various minor updates around some of the BPF-related entries: JITs for ARM32/NFP/SPARC/X86-32 haven't seen updates in quite a while, thus for now, mark them as 'Odd Fixes' until they become more actively developed. JITs for POWERPC/S390 are in good shape and receive active development and review, thus bump to 'Supported' similar as we have with X86-64/ARM64. JITs for MIPS/RISC-V are in similar good shape as the ones mentioned above, but looked after mostly in spare time, thus leave for now in 'Maintained' state. Add Michael to PPC JIT given he's picking up the patches there, so it better reflects today's state. Also, I haven't done much reviewing around BPF sockmap/kTLS after John and I did the big rework back in the days to integrate sockmap with kTLS. These days, most of this is taken care by John, Jakub {Sitnicki,Kicinski} and others in the community, so remove myself from these two. Lastly, move all BPF-related entries into one place, that is, move the sockmap one over near rest of BPF. Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/r/f9b8a63a0b48dc764bd4c50f87632889f5813f69.1655494758.git.daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- MAINTAINERS | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 96158b337b40..795f7e40230c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3662,7 +3662,7 @@ BPF JIT for ARM M: Shubham Bansal L: netdev@vger.kernel.org L: bpf@vger.kernel.org -S: Maintained +S: Odd Fixes F: arch/arm/net/ BPF JIT for ARM64 @@ -3686,14 +3686,15 @@ BPF JIT for NFP NICs M: Jakub Kicinski L: netdev@vger.kernel.org L: bpf@vger.kernel.org -S: Supported +S: Odd Fixes F: drivers/net/ethernet/netronome/nfp/bpf/ BPF JIT for POWERPC (32-BIT AND 64-BIT) M: Naveen N. Rao +M: Michael Ellerman L: netdev@vger.kernel.org L: bpf@vger.kernel.org -S: Maintained +S: Supported F: arch/powerpc/net/ BPF JIT for RISC-V (32-bit) @@ -3719,7 +3720,7 @@ M: Heiko Carstens M: Vasily Gorbik L: netdev@vger.kernel.org L: bpf@vger.kernel.org -S: Maintained +S: Supported F: arch/s390/net/ X: arch/s390/net/pnet.c @@ -3727,14 +3728,14 @@ BPF JIT for SPARC (32-BIT AND 64-BIT) M: David S. Miller L: netdev@vger.kernel.org L: bpf@vger.kernel.org -S: Maintained +S: Odd Fixes F: arch/sparc/net/ BPF JIT for X86 32-BIT M: Wang YanQing L: netdev@vger.kernel.org L: bpf@vger.kernel.org -S: Maintained +S: Odd Fixes F: arch/x86/net/bpf_jit_comp32.c BPF JIT for X86 64-BIT @@ -3757,6 +3758,19 @@ F: include/linux/bpf_lsm.h F: kernel/bpf/bpf_lsm.c F: security/bpf/ +BPF L7 FRAMEWORK +M: John Fastabend +M: Jakub Sitnicki +L: netdev@vger.kernel.org +L: bpf@vger.kernel.org +S: Maintained +F: include/linux/skmsg.h +F: net/core/skmsg.c +F: net/core/sock_map.c +F: net/ipv4/tcp_bpf.c +F: net/ipv4/udp_bpf.c +F: net/unix/unix_bpf.c + BPFTOOL M: Quentin Monnet L: bpf@vger.kernel.org @@ -11095,20 +11109,6 @@ S: Maintained F: include/net/l3mdev.h F: net/l3mdev -L7 BPF FRAMEWORK -M: John Fastabend -M: Daniel Borkmann -M: Jakub Sitnicki -L: netdev@vger.kernel.org -L: bpf@vger.kernel.org -S: Maintained -F: include/linux/skmsg.h -F: net/core/skmsg.c -F: net/core/sock_map.c -F: net/ipv4/tcp_bpf.c -F: net/ipv4/udp_bpf.c -F: net/unix/unix_bpf.c - LANDLOCK SECURITY MODULE M: Mickaël Salaün L: linux-security-module@vger.kernel.org @@ -13950,7 +13950,6 @@ F: net/ipv6/tcp*.c NETWORKING [TLS] M: Boris Pismenny M: John Fastabend -M: Daniel Borkmann M: Jakub Kicinski L: netdev@vger.kernel.org S: Maintained From c88dbbcd88c233cb759ec857b57864c5bfcea26a Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 8 Jun 2022 01:11:02 +0900 Subject: [PATCH 11/12] fprobe, samples: Add use_trace option and show hit/missed counter Add use_trace option to use trace_printk() instead of pr_info() so that the handler doesn't involve the RCU operations. And show the hit and missed counter so that the user can check how many times the probe handler hit and missed. Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Daniel Borkmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/165461826247.280167.11939123218334322352.stgit@devnote2 --- samples/fprobe/fprobe_example.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/samples/fprobe/fprobe_example.c b/samples/fprobe/fprobe_example.c index 24d3cf109140..01ee6c8c8382 100644 --- a/samples/fprobe/fprobe_example.c +++ b/samples/fprobe/fprobe_example.c @@ -21,6 +21,7 @@ #define BACKTRACE_DEPTH 16 #define MAX_SYMBOL_LEN 4096 struct fprobe sample_probe; +static unsigned long nhit; static char symbol[MAX_SYMBOL_LEN] = "kernel_clone"; module_param_string(symbol, symbol, sizeof(symbol), 0644); @@ -28,6 +29,8 @@ static char nosymbol[MAX_SYMBOL_LEN] = ""; module_param_string(nosymbol, nosymbol, sizeof(nosymbol), 0644); static bool stackdump = true; module_param(stackdump, bool, 0644); +static bool use_trace = false; +module_param(use_trace, bool, 0644); static void show_backtrace(void) { @@ -40,7 +43,15 @@ static void show_backtrace(void) static void sample_entry_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs) { - pr_info("Enter <%pS> ip = 0x%p\n", (void *)ip, (void *)ip); + if (use_trace) + /* + * This is just an example, no kernel code should call + * trace_printk() except when actively debugging. + */ + trace_printk("Enter <%pS> ip = 0x%p\n", (void *)ip, (void *)ip); + else + pr_info("Enter <%pS> ip = 0x%p\n", (void *)ip, (void *)ip); + nhit++; if (stackdump) show_backtrace(); } @@ -49,8 +60,17 @@ static void sample_exit_handler(struct fprobe *fp, unsigned long ip, struct pt_r { unsigned long rip = instruction_pointer(regs); - pr_info("Return from <%pS> ip = 0x%p to rip = 0x%p (%pS)\n", - (void *)ip, (void *)ip, (void *)rip, (void *)rip); + if (use_trace) + /* + * This is just an example, no kernel code should call + * trace_printk() except when actively debugging. + */ + trace_printk("Return from <%pS> ip = 0x%p to rip = 0x%p (%pS)\n", + (void *)ip, (void *)ip, (void *)rip, (void *)rip); + else + pr_info("Return from <%pS> ip = 0x%p to rip = 0x%p (%pS)\n", + (void *)ip, (void *)ip, (void *)rip, (void *)rip); + nhit++; if (stackdump) show_backtrace(); } @@ -112,7 +132,8 @@ static void __exit fprobe_exit(void) { unregister_fprobe(&sample_probe); - pr_info("fprobe at %s unregistered\n", symbol); + pr_info("fprobe at %s unregistered. %ld times hit, %ld times missed\n", + symbol, nhit, sample_probe.nmissed); } module_init(fprobe_init) From c0f3bb4054ef036e5f67e27f2e3cad9e6512cf00 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 8 Jun 2022 01:11:12 +0900 Subject: [PATCH 12/12] rethook: Reject getting a rethook if RCU is not watching Since the rethook_recycle() will involve the call_rcu() for reclaiming the rethook_instance, the rethook must be set up at the RCU available context (non idle). This rethook_recycle() in the rethook trampoline handler is inevitable, thus the RCU available check must be done before setting the rethook trampoline. This adds a rcu_is_watching() check in the rethook_try_get() so that it will return NULL if it is called when !rcu_is_watching(). Fixes: 54ecbe6f1ed5 ("rethook: Add a generic return hook") Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Daniel Borkmann Acked-by: Steven Rostedt (Google) Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/165461827269.280167.7379263615545598958.stgit@devnote2 --- kernel/trace/rethook.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index b56833700d23..c69d82273ce7 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -154,6 +154,15 @@ struct rethook_node *rethook_try_get(struct rethook *rh) if (unlikely(!handler)) return NULL; + /* + * This expects the caller will set up a rethook on a function entry. + * When the function returns, the rethook will eventually be reclaimed + * or released in the rethook_recycle() with call_rcu(). + * This means the caller must be run in the RCU-availabe context. + */ + if (unlikely(!rcu_is_watching())) + return NULL; + fn = freelist_try_get(&rh->pool); if (!fn) return NULL;