diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8e6092d0ea95..f3e88afdaffe 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -5,6 +5,7 @@ #define _LINUX_BPF_H 1 #include +#include #include #include @@ -22,8 +23,10 @@ #include #include #include +#include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -1084,6 +1087,40 @@ struct bpf_prog_aux { }; }; +struct bpf_prog { + u16 pages; /* Number of allocated pages */ + u16 jited:1, /* Is our filter JIT'ed? */ + jit_requested:1,/* archs need to JIT the prog */ + gpl_compatible:1, /* Is filter GPL compatible? */ + cb_access:1, /* Is control block accessed? */ + dst_needed:1, /* Do we need dst entry? */ + blinding_requested:1, /* needs constant blinding */ + blinded:1, /* Was blinded */ + is_func:1, /* program is a bpf function */ + kprobe_override:1, /* Do we override a kprobe? */ + has_callchain_buf:1, /* callchain buffer allocated? */ + enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ + call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ + call_get_func_ip:1, /* Do we call get_func_ip() */ + tstamp_type_access:1; /* Accessed __sk_buff->tstamp_type */ + enum bpf_prog_type type; /* Type of BPF program */ + enum bpf_attach_type expected_attach_type; /* For some prog types */ + u32 len; /* Number of filter blocks */ + u32 jited_len; /* Size of jited insns in bytes */ + u8 tag[BPF_TAG_SIZE]; + struct bpf_prog_stats __percpu *stats; + int __percpu *active; + unsigned int (*bpf_func)(const void *ctx, + const struct bpf_insn *insn); + struct bpf_prog_aux *aux; /* Auxiliary fields */ + struct sock_fprog_kern *orig_prog; /* Original BPF program */ + /* Instructions for interpreter */ + union { + DECLARE_FLEX_ARRAY(struct sock_filter, insns); + DECLARE_FLEX_ARRAY(struct bpf_insn, insnsi); + }; +}; + struct bpf_array_aux { /* Programs with direct jumps into programs part of this array. */ struct list_head poke_progs; @@ -1336,6 +1373,8 @@ extern struct bpf_empty_prog_array bpf_empty_prog_array; struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array *progs); +/* Use when traversal over the bpf_prog_array uses tasks_trace rcu */ +void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs); int bpf_prog_array_length(struct bpf_prog_array *progs); bool bpf_prog_array_is_empty(struct bpf_prog_array *array); int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs, @@ -1427,6 +1466,55 @@ bpf_prog_run_array(const struct bpf_prog_array *array, return ret; } +/* Notes on RCU design for bpf_prog_arrays containing sleepable programs: + * + * We use the tasks_trace rcu flavor read section to protect the bpf_prog_array + * overall. As a result, we must use the bpf_prog_array_free_sleepable + * in order to use the tasks_trace rcu grace period. + * + * When a non-sleepable program is inside the array, we take the rcu read + * section and disable preemption for that program alone, so it can access + * rcu-protected dynamically sized maps. + */ +static __always_inline u32 +bpf_prog_run_array_sleepable(const struct bpf_prog_array __rcu *array_rcu, + const void *ctx, bpf_prog_run_fn run_prog) +{ + const struct bpf_prog_array_item *item; + const struct bpf_prog *prog; + const struct bpf_prog_array *array; + struct bpf_run_ctx *old_run_ctx; + struct bpf_trace_run_ctx run_ctx; + u32 ret = 1; + + might_fault(); + + rcu_read_lock_trace(); + migrate_disable(); + + array = rcu_dereference_check(array_rcu, rcu_read_lock_trace_held()); + if (unlikely(!array)) + goto out; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + item = &array->items[0]; + while ((prog = READ_ONCE(item->prog))) { + if (!prog->aux->sleepable) + rcu_read_lock(); + + run_ctx.bpf_cookie = item->bpf_cookie; + ret &= run_prog(prog, ctx); + item++; + + if (!prog->aux->sleepable) + rcu_read_unlock(); + } + bpf_reset_run_ctx(old_run_ctx); +out: + migrate_enable(); + rcu_read_unlock_trace(); + return ret; +} + #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); extern struct mutex bpf_stats_enabled_mutex; diff --git a/include/linux/filter.h b/include/linux/filter.h index ed0c0ff42ad5..d0cbb31b1b4d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -559,40 +559,6 @@ struct bpf_prog_stats { struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); -struct bpf_prog { - u16 pages; /* Number of allocated pages */ - u16 jited:1, /* Is our filter JIT'ed? */ - jit_requested:1,/* archs need to JIT the prog */ - gpl_compatible:1, /* Is filter GPL compatible? */ - cb_access:1, /* Is control block accessed? */ - dst_needed:1, /* Do we need dst entry? */ - blinding_requested:1, /* needs constant blinding */ - blinded:1, /* Was blinded */ - is_func:1, /* program is a bpf function */ - kprobe_override:1, /* Do we override a kprobe? */ - has_callchain_buf:1, /* callchain buffer allocated? */ - enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ - call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ - call_get_func_ip:1, /* Do we call get_func_ip() */ - tstamp_type_access:1; /* Accessed __sk_buff->tstamp_type */ - enum bpf_prog_type type; /* Type of BPF program */ - enum bpf_attach_type expected_attach_type; /* For some prog types */ - u32 len; /* Number of filter blocks */ - u32 jited_len; /* Size of jited insns in bytes */ - u8 tag[BPF_TAG_SIZE]; - struct bpf_prog_stats __percpu *stats; - int __percpu *active; - unsigned int (*bpf_func)(const void *ctx, - const struct bpf_insn *insn); - struct bpf_prog_aux *aux; /* Auxiliary fields */ - struct sock_fprog_kern *orig_prog; /* Original BPF program */ - /* Instructions for interpreter */ - union { - DECLARE_FLEX_ARRAY(struct sock_filter, insns); - DECLARE_FLEX_ARRAY(struct bpf_insn, insnsi); - }; -}; - struct sk_filter { refcount_t refcnt; struct rcu_head rcu; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e78cc5eea4a5..b5ffebcce6cc 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2279,6 +2279,21 @@ void bpf_prog_array_free(struct bpf_prog_array *progs) kfree_rcu(progs, rcu); } +static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu) +{ + struct bpf_prog_array *progs; + + progs = container_of(rcu, struct bpf_prog_array, rcu); + kfree_rcu(progs, rcu); +} + +void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs) +{ + if (!progs || progs == &bpf_empty_prog_array.hdr) + return; + call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb); +} + int bpf_prog_array_length(struct bpf_prog_array *array) { struct bpf_prog_array_item *item; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2d2872682278..eadc23a8452c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14829,8 +14829,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING && - prog->type != BPF_PROG_TYPE_LSM) { - verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n"); + prog->type != BPF_PROG_TYPE_LSM && prog->type != BPF_PROG_TYPE_KPROBE) { + verbose(env, "Only fentry/fexit/fmod_ret, lsm, and kprobe/uprobe programs can be sleepable\n"); return -EINVAL; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 950b25c3f210..deee6815bdd3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10069,26 +10069,30 @@ static inline bool perf_event_is_tracing(struct perf_event *event) int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { - bool is_kprobe, is_tracepoint, is_syscall_tp; + bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp; if (!perf_event_is_tracing(event)) return perf_event_set_bpf_handler(event, prog, bpf_cookie); - is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; + is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE; + is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; is_syscall_tp = is_syscall_trace_event(event->tp_event); - if (!is_kprobe && !is_tracepoint && !is_syscall_tp) + if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp) /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL; - if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) || + if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) || (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) || (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) return -EINVAL; + if (prog->type == BPF_PROG_TYPE_KPROBE && prog->aux->sleepable && !is_uprobe) + /* only uprobe programs are allowed to be sleepable */ + return -EINVAL; + /* Kprobe override only works for kprobes, not uprobes. */ - if (prog->kprobe_override && - !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) + if (prog->kprobe_override && !is_kprobe) return -EINVAL; if (is_tracepoint || is_syscall_tp) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 10b157a6d73e..d1c22594dbf9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1936,7 +1936,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event, event->prog = prog; event->bpf_cookie = bpf_cookie; rcu_assign_pointer(event->tp_event->prog_array, new_array); - bpf_prog_array_free(old_array); + bpf_prog_array_free_sleepable(old_array); unlock: mutex_unlock(&bpf_event_mutex); @@ -1962,7 +1962,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event) bpf_prog_array_delete_safe(old_array, event->prog); } else { rcu_assign_pointer(event->tp_event->prog_array, new_array); - bpf_prog_array_free(old_array); + bpf_prog_array_free_sleepable(old_array); } bpf_prog_put(event->prog); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9711589273cd..0282c119b1b2 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "trace_dynevent.h" #include "trace_probe.h" @@ -1346,9 +1347,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, if (bpf_prog_array_valid(call)) { u32 ret; - preempt_disable(); - ret = trace_call_bpf(call, regs); - preempt_enable(); + ret = bpf_prog_run_array_sleepable(call->prog_array, regs, bpf_prog_run); if (!ret) return; } diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index d989b0a17a89..49e359cd34df 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -9177,8 +9177,10 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("sk_reuseport", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT, SEC_ATTACHABLE | SEC_SLOPPY_PFX), SEC_DEF("kprobe+", KPROBE, 0, SEC_NONE, attach_kprobe), SEC_DEF("uprobe+", KPROBE, 0, SEC_NONE, attach_uprobe), + SEC_DEF("uprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe), SEC_DEF("kretprobe+", KPROBE, 0, SEC_NONE, attach_kprobe), SEC_DEF("uretprobe+", KPROBE, 0, SEC_NONE, attach_uprobe), + SEC_DEF("uretprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe), SEC_DEF("kprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), SEC_DEF("kretprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), SEC_DEF("usdt+", KPROBE, 0, SEC_NONE, attach_usdt), @@ -11571,7 +11573,8 @@ static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf break; case 3: case 4: - opts.retprobe = strcmp(probe_type, "uretprobe") == 0; + opts.retprobe = strcmp(probe_type, "uretprobe") == 0 || + strcmp(probe_type, "uretprobe.s") == 0; if (opts.retprobe && offset != 0) { pr_warn("prog '%s': uretprobes do not support offset specification\n", prog->name); diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index 08c0601b3e84..0b899d2d8ea7 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -17,6 +17,14 @@ static void trigger_func2(void) asm volatile (""); } +/* attach point for byname sleepable uprobe */ +static void trigger_func3(void) +{ + asm volatile (""); +} + +static char test_data[] = "test_data"; + void test_attach_probe(void) { DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts); @@ -49,9 +57,17 @@ void test_attach_probe(void) if (!ASSERT_GE(ref_ctr_offset, 0, "ref_ctr_offset")) return; - skel = test_attach_probe__open_and_load(); + skel = test_attach_probe__open(); if (!ASSERT_OK_PTR(skel, "skel_open")) return; + + /* sleepable kprobe test case needs flags set before loading */ + if (!ASSERT_OK(bpf_program__set_flags(skel->progs.handle_kprobe_sleepable, + BPF_F_SLEEPABLE), "kprobe_sleepable_flags")) + goto cleanup; + + if (!ASSERT_OK(test_attach_probe__load(skel), "skel_load")) + goto cleanup; if (!ASSERT_OK_PTR(skel->bss, "check_bss")) goto cleanup; @@ -151,6 +167,30 @@ void test_attach_probe(void) if (!ASSERT_OK_PTR(skel->links.handle_uretprobe_byname2, "attach_uretprobe_byname2")) goto cleanup; + /* sleepable kprobes should not attach successfully */ + skel->links.handle_kprobe_sleepable = bpf_program__attach(skel->progs.handle_kprobe_sleepable); + if (!ASSERT_ERR_PTR(skel->links.handle_kprobe_sleepable, "attach_kprobe_sleepable")) + goto cleanup; + + /* test sleepable uprobe and uretprobe variants */ + skel->links.handle_uprobe_byname3_sleepable = bpf_program__attach(skel->progs.handle_uprobe_byname3_sleepable); + if (!ASSERT_OK_PTR(skel->links.handle_uprobe_byname3_sleepable, "attach_uprobe_byname3_sleepable")) + goto cleanup; + + skel->links.handle_uprobe_byname3 = bpf_program__attach(skel->progs.handle_uprobe_byname3); + if (!ASSERT_OK_PTR(skel->links.handle_uprobe_byname3, "attach_uprobe_byname3")) + goto cleanup; + + skel->links.handle_uretprobe_byname3_sleepable = bpf_program__attach(skel->progs.handle_uretprobe_byname3_sleepable); + if (!ASSERT_OK_PTR(skel->links.handle_uretprobe_byname3_sleepable, "attach_uretprobe_byname3_sleepable")) + goto cleanup; + + skel->links.handle_uretprobe_byname3 = bpf_program__attach(skel->progs.handle_uretprobe_byname3); + if (!ASSERT_OK_PTR(skel->links.handle_uretprobe_byname3, "attach_uretprobe_byname3")) + goto cleanup; + + skel->bss->user_ptr = test_data; + /* trigger & validate kprobe && kretprobe */ usleep(1); @@ -164,6 +204,9 @@ void test_attach_probe(void) /* trigger & validate uprobe attached by name */ trigger_func2(); + /* trigger & validate sleepable uprobe attached by name */ + trigger_func3(); + ASSERT_EQ(skel->bss->kprobe_res, 1, "check_kprobe_res"); ASSERT_EQ(skel->bss->kprobe2_res, 11, "check_kprobe_auto_res"); ASSERT_EQ(skel->bss->kretprobe_res, 2, "check_kretprobe_res"); @@ -174,6 +217,10 @@ void test_attach_probe(void) ASSERT_EQ(skel->bss->uretprobe_byname_res, 6, "check_uretprobe_byname_res"); ASSERT_EQ(skel->bss->uprobe_byname2_res, 7, "check_uprobe_byname2_res"); ASSERT_EQ(skel->bss->uretprobe_byname2_res, 8, "check_uretprobe_byname2_res"); + ASSERT_EQ(skel->bss->uprobe_byname3_sleepable_res, 9, "check_uprobe_byname3_sleepable_res"); + ASSERT_EQ(skel->bss->uprobe_byname3_res, 10, "check_uprobe_byname3_res"); + ASSERT_EQ(skel->bss->uretprobe_byname3_sleepable_res, 11, "check_uretprobe_byname3_sleepable_res"); + ASSERT_EQ(skel->bss->uretprobe_byname3_res, 12, "check_uretprobe_byname3_res"); cleanup: test_attach_probe__destroy(skel); diff --git a/tools/testing/selftests/bpf/progs/test_attach_probe.c b/tools/testing/selftests/bpf/progs/test_attach_probe.c index ce9acf4db8d2..f1c88ad368ef 100644 --- a/tools/testing/selftests/bpf/progs/test_attach_probe.c +++ b/tools/testing/selftests/bpf/progs/test_attach_probe.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "bpf_misc.h" int kprobe_res = 0; @@ -17,6 +18,11 @@ int uprobe_byname_res = 0; int uretprobe_byname_res = 0; int uprobe_byname2_res = 0; int uretprobe_byname2_res = 0; +int uprobe_byname3_sleepable_res = 0; +int uprobe_byname3_res = 0; +int uretprobe_byname3_sleepable_res = 0; +int uretprobe_byname3_res = 0; +void *user_ptr = 0; SEC("kprobe") int handle_kprobe(struct pt_regs *ctx) @@ -32,6 +38,17 @@ int BPF_KPROBE(handle_kprobe_auto) return 0; } +/** + * This program will be manually made sleepable on the userspace side + * and should thus be unattachable. + */ +SEC("kprobe/" SYS_PREFIX "sys_nanosleep") +int handle_kprobe_sleepable(struct pt_regs *ctx) +{ + kprobe_res = 2; + return 0; +} + SEC("kretprobe") int handle_kretprobe(struct pt_regs *ctx) { @@ -93,4 +110,47 @@ int handle_uretprobe_byname2(struct pt_regs *ctx) return 0; } +static __always_inline bool verify_sleepable_user_copy(void) +{ + char data[9]; + + bpf_copy_from_user(data, sizeof(data), user_ptr); + return bpf_strncmp(data, sizeof(data), "test_data") == 0; +} + +SEC("uprobe.s//proc/self/exe:trigger_func3") +int handle_uprobe_byname3_sleepable(struct pt_regs *ctx) +{ + if (verify_sleepable_user_copy()) + uprobe_byname3_sleepable_res = 9; + return 0; +} + +/** + * same target as the uprobe.s above to force sleepable and non-sleepable + * programs in the same bpf_prog_array + */ +SEC("uprobe//proc/self/exe:trigger_func3") +int handle_uprobe_byname3(struct pt_regs *ctx) +{ + uprobe_byname3_res = 10; + return 0; +} + +SEC("uretprobe.s//proc/self/exe:trigger_func3") +int handle_uretprobe_byname3_sleepable(struct pt_regs *ctx) +{ + if (verify_sleepable_user_copy()) + uretprobe_byname3_sleepable_res = 11; + return 0; +} + +SEC("uretprobe//proc/self/exe:trigger_func3") +int handle_uretprobe_byname3(struct pt_regs *ctx) +{ + uretprobe_byname3_res = 12; + return 0; +} + + char _license[] SEC("license") = "GPL";