diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 2fe7a3188282..12d115ef5e74 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -371,7 +371,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, struct receive_queue *rq, struct page *page, unsigned int offset, unsigned int len, unsigned int truesize, - bool hdr_valid) + bool hdr_valid, unsigned int metasize) { struct sk_buff *skb; struct virtio_net_hdr_mrg_rxbuf *hdr; @@ -393,6 +393,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, else hdr_padded_len = sizeof(struct padded_vnet_hdr); + /* hdr_valid means no XDP, so we can copy the vnet header */ if (hdr_valid) memcpy(hdr, p, hdr_len); @@ -405,6 +406,11 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, copy = skb_tailroom(skb); skb_put_data(skb, p, copy); + if (metasize) { + __skb_pull(skb, metasize); + skb_metadata_set(skb, metasize); + } + len -= copy; offset += copy; @@ -450,10 +456,6 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi, struct virtio_net_hdr_mrg_rxbuf *hdr; int err; - /* virtqueue want to use data area in-front of packet */ - if (unlikely(xdpf->metasize > 0)) - return -EOPNOTSUPP; - if (unlikely(xdpf->headroom < vi->hdr_len)) return -EOVERFLOW; @@ -644,6 +646,7 @@ static struct sk_buff *receive_small(struct net_device *dev, unsigned int delta = 0; struct page *xdp_page; int err; + unsigned int metasize = 0; len -= vi->hdr_len; stats->bytes += len; @@ -683,8 +686,8 @@ static struct sk_buff *receive_small(struct net_device *dev, xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len; xdp.data = xdp.data_hard_start + xdp_headroom; - xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; + xdp.data_meta = xdp.data; xdp.rxq = &rq->xdp_rxq; orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); @@ -695,6 +698,7 @@ static struct sk_buff *receive_small(struct net_device *dev, /* Recalculate length in case bpf program changed it */ delta = orig_data - xdp.data; len = xdp.data_end - xdp.data; + metasize = xdp.data - xdp.data_meta; break; case XDP_TX: stats->xdp_tx++; @@ -735,10 +739,13 @@ static struct sk_buff *receive_small(struct net_device *dev, } skb_reserve(skb, headroom - delta); skb_put(skb, len); - if (!delta) { + if (!xdp_prog) { buf += header_offset; memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len); - } /* keep zeroed vnet hdr since packet was changed by bpf */ + } /* keep zeroed vnet hdr since XDP is loaded */ + + if (metasize) + skb_metadata_set(skb, metasize); err: return skb; @@ -760,8 +767,8 @@ static struct sk_buff *receive_big(struct net_device *dev, struct virtnet_rq_stats *stats) { struct page *page = buf; - struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, - PAGE_SIZE, true); + struct sk_buff *skb = + page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0); stats->bytes += len - vi->hdr_len; if (unlikely(!skb)) @@ -793,6 +800,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, unsigned int truesize; unsigned int headroom = mergeable_ctx_to_headroom(ctx); int err; + unsigned int metasize = 0; head_skb = NULL; stats->bytes += len - vi->hdr_len; @@ -839,8 +847,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, data = page_address(xdp_page) + offset; xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len; xdp.data = data + vi->hdr_len; - xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + (len - vi->hdr_len); + xdp.data_meta = xdp.data; xdp.rxq = &rq->xdp_rxq; act = bpf_prog_run_xdp(xdp_prog, &xdp); @@ 
-848,24 +856,27 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, switch (act) { case XDP_PASS: - /* recalculate offset to account for any header - * adjustments. Note other cases do not build an - * skb and avoid using offset - */ - offset = xdp.data - - page_address(xdp_page) - vi->hdr_len; + metasize = xdp.data - xdp.data_meta; - /* recalculate len if xdp.data or xdp.data_end were - * adjusted + /* recalculate offset to account for any header + * adjustments and minus the metasize to copy the + * metadata in page_to_skb(). Note other cases do not + * build an skb and avoid using offset */ - len = xdp.data_end - xdp.data + vi->hdr_len; + offset = xdp.data - page_address(xdp_page) - + vi->hdr_len - metasize; + + /* recalculate len if xdp.data, xdp.data_end or + * xdp.data_meta were adjusted + */ + len = xdp.data_end - xdp.data + vi->hdr_len + metasize; /* We can only create skb based on xdp_page. */ if (unlikely(xdp_page != page)) { rcu_read_unlock(); put_page(page); - head_skb = page_to_skb(vi, rq, xdp_page, - offset, len, - PAGE_SIZE, false); + head_skb = page_to_skb(vi, rq, xdp_page, offset, + len, PAGE_SIZE, false, + metasize); return head_skb; } break; @@ -921,7 +932,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, goto err_skb; } - head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog); + head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog, + metasize); curr_skb = head_skb; if (unlikely(!curr_skb)) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a11d5b7dbbf3..a7cd5c7a2509 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -36,7 +36,7 @@ struct bpf_cgroup_storage_map; struct bpf_storage_buffer { struct rcu_head rcu; - char data[0]; + char data[]; }; struct bpf_cgroup_storage { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 49b1a70e12c8..6015a4daf118 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -859,7 +859,7 @@ struct bpf_prog_array_item { struct bpf_prog_array { struct rcu_head rcu; - struct bpf_prog_array_item items[0]; + struct bpf_prog_array_item items[]; }; struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); @@ -885,7 +885,7 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *_prog; \ struct bpf_prog_array *_array; \ u32 _ret = 1; \ - preempt_disable(); \ + migrate_disable(); \ rcu_read_lock(); \ _array = rcu_dereference(array); \ if (unlikely(check_non_null && !_array))\ @@ -898,7 +898,7 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, } \ _out: \ rcu_read_unlock(); \ - preempt_enable(); \ + migrate_enable(); \ _ret; \ }) @@ -932,7 +932,7 @@ _out: \ u32 ret; \ u32 _ret = 1; \ u32 _cn = 0; \ - preempt_disable(); \ + migrate_disable(); \ rcu_read_lock(); \ _array = rcu_dereference(array); \ _item = &_array->items[0]; \ @@ -944,7 +944,7 @@ _out: \ _item++; \ } \ rcu_read_unlock(); \ - preempt_enable(); \ + migrate_enable(); \ if (_ret) \ _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ else \ @@ -961,6 +961,36 @@ _out: \ #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); +/* + * Block execution of BPF programs attached to instrumentation (perf, + * kprobes, tracepoints) to prevent deadlocks on map operations as any of + * these events can happen inside a region which holds a map bucket lock + * and can deadlock on it. 
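For illustration, a minimal XDP program (not part of this patch; names are invented here) that reserves metadata which the virtio_net hunks above now preserve into the skb via skb_metadata_set() instead of rejecting:

/*
 * Minimal sketch, assuming a libbpf-era toolchain. The 4 bytes
 * reserved via bpf_xdp_adjust_meta() survive XDP_PASS on virtio_net
 * with the change above.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int xdp_mark_meta(struct xdp_md *ctx)
{
	void *data, *data_meta;
	__u32 *val;

	/* Grow the metadata area in front of the packet by 4 bytes. */
	if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(*val)))
		return XDP_PASS;

	data = (void *)(long)ctx->data;
	data_meta = (void *)(long)ctx->data_meta;
	val = data_meta;
	if ((void *)(val + 1) > data)	/* bounds check for the verifier */
		return XDP_PASS;

	*val = 0xcafe;	/* visible later to skb consumers, e.g. a TC classifier */
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";
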
+ * + * Use the preemption safe inc/dec variants on RT because migrate disable + * is preemptible on RT and preemption in the middle of the RMW operation + * might lead to inconsistent state. Use the raw variants for non RT + * kernels as migrate_disable() maps to preempt_disable() so the slightly + * more expensive save operation can be avoided. + */ +static inline void bpf_disable_instrumentation(void) +{ + migrate_disable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + this_cpu_inc(bpf_prog_active); + else + __this_cpu_inc(bpf_prog_active); +} + +static inline void bpf_enable_instrumentation(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + this_cpu_dec(bpf_prog_active); + else + __this_cpu_dec(bpf_prog_active); + migrate_enable(); +} + extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; @@ -993,6 +1023,7 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux, void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); +struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); void bpf_map_inc(struct bpf_map *map); diff --git a/include/linux/filter.h b/include/linux/filter.h index f349e2c0884c..43b5e455d2f5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -561,7 +561,7 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); #define __BPF_PROG_RUN(prog, ctx, dfunc) ({ \ u32 ret; \ - cant_sleep(); \ + cant_migrate(); \ if (static_branch_unlikely(&bpf_stats_enabled_key)) { \ struct bpf_prog_stats *stats; \ u64 start = sched_clock(); \ @@ -576,8 +576,30 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); } \ ret; }) -#define BPF_PROG_RUN(prog, ctx) __BPF_PROG_RUN(prog, ctx, \ - bpf_dispatcher_nopfunc) +#define BPF_PROG_RUN(prog, ctx) \ + __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nopfunc) + +/* + * Use in preemptible and therefore migratable context to make sure that + * the execution of the BPF program runs on one CPU. + * + * This uses migrate_disable/enable() explicitly to document that the + * invocation of a BPF program does not require reentrancy protection + * against a BPF program which is invoked from a preempting task. + * + * For non RT enabled kernels migrate_disable/enable() maps to + * preempt_disable/enable(), i.e. it disables also preemption. 
+ */ +static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog, + const void *ctx) +{ + u32 ret; + + migrate_disable(); + ret = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nopfunc); + migrate_enable(); + return ret; +} #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN @@ -655,6 +677,7 @@ static inline u8 *bpf_skb_cb(struct sk_buff *skb) return qdisc_skb_cb(skb)->data; } +/* Must be invoked with migration disabled */ static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog, struct sk_buff *skb) { @@ -680,9 +703,9 @@ static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog, { u32 res; - preempt_disable(); + migrate_disable(); res = __bpf_prog_run_save_cb(prog, skb); - preempt_enable(); + migrate_enable(); return res; } @@ -695,9 +718,7 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog, if (unlikely(prog->cb_access)) memset(cb_data, 0, BPF_SKB_CB_LEN); - preempt_disable(); - res = BPF_PROG_RUN(prog, skb); - preempt_enable(); + res = bpf_prog_run_pin_on_cpu(prog, skb); return res; } diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 39faaaf843e1..e4ba25d63913 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -15,11 +15,9 @@ struct netlink_callback; struct inet_diag_handler { void (*dump)(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - struct nlattr *bc); + const struct inet_diag_req_v2 *r); - int (*dump_one)(struct sk_buff *in_skb, - const struct nlmsghdr *nlh, + int (*dump_one)(struct netlink_callback *cb, const struct inet_diag_req_v2 *req); void (*idiag_get_info)(struct sock *sk, @@ -40,18 +38,25 @@ struct inet_diag_handler { __u16 idiag_info_size; }; +struct bpf_sk_storage_diag; +struct inet_diag_dump_data { + struct nlattr *req_nlas[__INET_DIAG_REQ_MAX]; +#define inet_diag_nla_bc req_nlas[INET_DIAG_REQ_BYTECODE] +#define inet_diag_nla_bpf_stgs req_nlas[INET_DIAG_REQ_SK_BPF_STORAGES] + + struct bpf_sk_storage_diag *bpf_stg_diag; +}; + struct inet_connection_sock; int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, - struct sk_buff *skb, const struct inet_diag_req_v2 *req, - struct user_namespace *user_ns, - u32 pid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh, bool net_admin); + struct sk_buff *skb, struct netlink_callback *cb, + const struct inet_diag_req_v2 *req, + u16 nlmsg_flags, bool net_admin); void inet_diag_dump_icsk(struct inet_hashinfo *h, struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - struct nlattr *bc); + const struct inet_diag_req_v2 *r); int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, - struct sk_buff *in_skb, const struct nlmsghdr *nlh, + struct netlink_callback *cb, const struct inet_diag_req_v2 *req); struct sock *inet_diag_find_one_icsk(struct net *net, diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 0d9db2a14f44..9b7a8d74a9d6 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -257,6 +257,13 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) +#ifndef CONFIG_PREEMPT_RT +# define cant_migrate() cant_sleep() +#else + /* Placeholder for now */ +# define cant_migrate() do { } while (0) +#endif + /** * abs - return absolute value of an argument * @x: the value. If it is unsigned type, it is converted to signed type first. 
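To make the migrate_disable()/migrate_enable() semantics concrete: the task cannot change CPUs between the two calls, so a this_cpu_ptr() reference stays valid even if the task is preempted on PREEMPT_RT. A hypothetical kernel-style sketch (not from this series; the names are invented here):

/*
 * Sketch only: per-CPU state accessed under migrate_disable(). The
 * pointer stays on one CPU, but a preempting task on the same CPU is
 * not excluded; BPF program invocation is documented above to not
 * need such reentrancy protection, which is why this suffices for
 * bpf_prog_run_pin_on_cpu().
 */
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/types.h>

struct pkt_scratch {
	u64 last_len;
};

static DEFINE_PER_CPU(struct pkt_scratch, pkt_scratch);

static void record_pkt_len(u64 len)
{
	struct pkt_scratch *s;

	migrate_disable();		/* pin to this CPU ... */
	s = this_cpu_ptr(&pkt_scratch);	/* ... so this pointer stays ours */
	s->last_len = len;
	migrate_enable();		/* may have been preempted on RT, never migrated */
}
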
diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 205fa7b1f07a..788969ccbbde 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -188,10 +188,10 @@ struct netlink_callback { struct module *module; struct netlink_ext_ack *extack; u16 family; - u16 min_dump_alloc; - bool strict_check; u16 answer_flags; + u32 min_dump_alloc; unsigned int prev_seq, seq; + bool strict_check; union { u8 ctx[48]; diff --git a/include/linux/preempt.h b/include/linux/preempt.h index bbb68dba37cc..bc3f1aecaa19 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -322,4 +322,34 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, #endif +/** + * migrate_disable - Prevent migration of the current task + * + * Maps to preempt_disable() which also disables preemption. Use + * migrate_disable() to annotate that the intent is to prevent migration, + * but not necessarily preemption. + * + * Can be invoked nested like preempt_disable() and needs the corresponding + * number of migrate_enable() invocations. + */ +static __always_inline void migrate_disable(void) +{ + preempt_disable(); +} + +/** + * migrate_enable - Allow migration of the current task + * + * Counterpart to migrate_disable(). + * + * As migrate_disable() can be invoked nested, only the outermost invocation + * reenables migration. + * + * Currently mapped to preempt_enable(). + */ +static __always_inline void migrate_enable(void) +{ + preempt_enable(); +} + #endif /* __LINUX_PREEMPT_H */ diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index 8e4f831d2e52..5036c94c0503 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -10,14 +10,41 @@ void bpf_sk_storage_free(struct sock *sk); extern const struct bpf_func_proto bpf_sk_storage_get_proto; extern const struct bpf_func_proto bpf_sk_storage_delete_proto; +struct bpf_sk_storage_diag; +struct sk_buff; +struct nlattr; +struct sock; + #ifdef CONFIG_BPF_SYSCALL int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk); +struct bpf_sk_storage_diag * +bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs); +void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag); +int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, + struct sock *sk, struct sk_buff *skb, + int stg_array_type, + unsigned int *res_diag_size); #else static inline int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) { return 0; } +static inline struct bpf_sk_storage_diag * +bpf_sk_storage_diag_alloc(const struct nlattr *nla) +{ + return NULL; +} +static inline void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag) +{ +} +static inline int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, + struct sock *sk, struct sk_buff *skb, + int stg_array_type, + unsigned int *res_diag_size) +{ + return 0; +} #endif #endif /* _BPF_SK_STORAGE_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 906e9f2752db..8e98ced0963b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -73,7 +73,7 @@ struct bpf_insn { /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ struct bpf_lpm_trie_key { __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ - __u8 data[0]; /* Arbitrary size */ + __u8 data[]; /* Arbitrary size */ }; struct bpf_cgroup_storage_key { diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index a1ff345b3f33..75dffd78363a 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ 
-64,9 +64,11 @@ struct inet_diag_req_raw { enum { INET_DIAG_REQ_NONE, INET_DIAG_REQ_BYTECODE, + INET_DIAG_REQ_SK_BPF_STORAGES, + __INET_DIAG_REQ_MAX, }; -#define INET_DIAG_REQ_MAX INET_DIAG_REQ_BYTECODE +#define INET_DIAG_REQ_MAX (__INET_DIAG_REQ_MAX - 1) /* Bytecode is sequence of 4 byte commands followed by variable arguments. * All the commands identified by "code" are conditional jumps forward: @@ -154,6 +156,7 @@ enum { INET_DIAG_CLASS_ID, /* request as INET_DIAG_TCLASS */ INET_DIAG_MD5SIG, INET_DIAG_ULP_INFO, + INET_DIAG_SK_BPF_STORAGES, __INET_DIAG_MAX, }; diff --git a/include/uapi/linux/sock_diag.h b/include/uapi/linux/sock_diag.h index e5925009a652..5f74a5f6091d 100644 --- a/include/uapi/linux/sock_diag.h +++ b/include/uapi/linux/sock_diag.h @@ -36,4 +36,30 @@ enum sknetlink_groups { }; #define SKNLGRP_MAX (__SKNLGRP_MAX - 1) +enum { + SK_DIAG_BPF_STORAGE_REQ_NONE, + SK_DIAG_BPF_STORAGE_REQ_MAP_FD, + __SK_DIAG_BPF_STORAGE_REQ_MAX, +}; + +#define SK_DIAG_BPF_STORAGE_REQ_MAX (__SK_DIAG_BPF_STORAGE_REQ_MAX - 1) + +enum { + SK_DIAG_BPF_STORAGE_REP_NONE, + SK_DIAG_BPF_STORAGE, + __SK_DIAG_BPF_STORAGE_REP_MAX, +}; + +#define SK_DIAG_BPF_STORAGE_REP_MAX (__SK_DIAG_BPF_STORAGE_REP_MAX - 1) + +enum { + SK_DIAG_BPF_STORAGE_NONE, + SK_DIAG_BPF_STORAGE_PAD, + SK_DIAG_BPF_STORAGE_MAP_ID, + SK_DIAG_BPF_STORAGE_MAP_VALUE, + __SK_DIAG_BPF_STORAGE_MAX, +}; + +#define SK_DIAG_BPF_STORAGE_MAX (__SK_DIAG_BPF_STORAGE_MAX - 1) + #endif /* _UAPI__SOCK_DIAG_H__ */ diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 042f95534f86..c498f0fffb40 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -23,7 +23,7 @@ enum bpf_struct_ops_state { struct bpf_struct_ops_value { BPF_STRUCT_OPS_COMMON_VALUE; - char data[0] ____cacheline_aligned_in_smp; + char data[] ____cacheline_aligned_in_smp; }; struct bpf_struct_ops_map { diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index a1468e3f5af2..d541c8486c95 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -27,9 +27,62 @@ .map_delete_batch = \ generic_map_delete_batch +/* + * The bucket lock has two protection scopes: + * + * 1) Serializing concurrent operations from BPF programs on different + * CPUs + * + * 2) Serializing concurrent operations from BPF programs and sys_bpf() + * + * BPF programs can execute in any context including perf, kprobes and + * tracing. As there are almost no limits where perf, kprobes and tracing + * can be invoked from, the lock operations need to be protected against + * deadlocks. Deadlocks can be caused by recursion and by an invocation in + * the lock held section when functions which acquire this lock are invoked + * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU + * variable bpf_prog_active, which prevents BPF programs attached to perf + * events, kprobes and tracing from being invoked before the prior invocation + * from one of these contexts has completed. sys_bpf() uses the same mechanism + * by pinning the task to the current CPU and incrementing the recursion + * protection across the map operation. + * + * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain + * operations like memory allocations (even with GFP_ATOMIC) from atomic + * contexts. This is required because even with GFP_ATOMIC the memory + * allocator calls into code paths which acquire locks with long held lock + * sections. To ensure deterministic behaviour, these locks are regular + * spinlocks, which are converted to 'sleepable' spinlocks on RT.
The only + * true atomic contexts on an RT kernel are the low level hardware + * handling, scheduling, low level interrupt handling, NMIs etc. None of + * these contexts should ever do memory allocations. + * + * As regular device interrupt handlers and soft interrupts are forced into + * thread context, the existing code which does + * spin_lock*(); alloc(GPF_ATOMIC); spin_unlock*(); + * just works. + * + * In theory the BPF locks could be converted to regular spinlocks as well, + * but the bucket locks and percpu_freelist locks can be taken from + * arbitrary contexts (perf, kprobes, tracepoints) which are required to be + * atomic contexts even on RT. These mechanisms require preallocated maps, + * so there is no need to invoke memory allocations within the lock held + * sections. + * + * BPF maps which need dynamic allocation are only used from (forced) + * thread context on RT and can therefore use regular spinlocks which in + * turn allows to invoke memory allocations from the lock held section. + * + * On a non RT kernel this distinction is neither possible nor required. + * spinlock maps to raw_spinlock and the extra code is optimized out by the + * compiler. + */ struct bucket { struct hlist_nulls_head head; - raw_spinlock_t lock; + union { + raw_spinlock_t raw_lock; + spinlock_t lock; + }; }; struct bpf_htab { @@ -65,9 +118,54 @@ struct htab_elem { struct bpf_lru_node lru_node; }; u32 hash; - char key[0] __aligned(8); + char key[] __aligned(8); }; +static inline bool htab_is_prealloc(const struct bpf_htab *htab) +{ + return !(htab->map.map_flags & BPF_F_NO_PREALLOC); +} + +static inline bool htab_use_raw_lock(const struct bpf_htab *htab) +{ + return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab)); +} + +static void htab_init_buckets(struct bpf_htab *htab) +{ + unsigned i; + + for (i = 0; i < htab->n_buckets; i++) { + INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); + if (htab_use_raw_lock(htab)) + raw_spin_lock_init(&htab->buckets[i].raw_lock); + else + spin_lock_init(&htab->buckets[i].lock); + } +} + +static inline unsigned long htab_lock_bucket(const struct bpf_htab *htab, + struct bucket *b) +{ + unsigned long flags; + + if (htab_use_raw_lock(htab)) + raw_spin_lock_irqsave(&b->raw_lock, flags); + else + spin_lock_irqsave(&b->lock, flags); + return flags; +} + +static inline void htab_unlock_bucket(const struct bpf_htab *htab, + struct bucket *b, + unsigned long flags) +{ + if (htab_use_raw_lock(htab)) + raw_spin_unlock_irqrestore(&b->raw_lock, flags); + else + spin_unlock_irqrestore(&b->lock, flags); +} + static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); static bool htab_is_lru(const struct bpf_htab *htab) @@ -82,11 +180,6 @@ static bool htab_is_percpu(const struct bpf_htab *htab) htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } -static bool htab_is_prealloc(const struct bpf_htab *htab) -{ - return !(htab->map.map_flags & BPF_F_NO_PREALLOC); -} - static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { @@ -328,8 +421,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; - int err, i; u64 cost; + int err; htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) @@ -391,10 +484,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) else htab->hashrnd = get_random_int(); - for (i = 0; i < htab->n_buckets; i++) { - 
INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); - raw_spin_lock_init(&htab->buckets[i].lock); - } + htab_init_buckets(htab); if (prealloc) { err = prealloc_init(htab); @@ -602,7 +692,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) b = __select_bucket(htab, tgt_l->hash); head = &b->head; - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l == tgt_l) { @@ -610,7 +700,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) break; } - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); return l == tgt_l; } @@ -686,15 +776,7 @@ static void htab_elem_free_rcu(struct rcu_head *head) struct htab_elem *l = container_of(head, struct htab_elem, rcu); struct bpf_htab *htab = l->htab; - /* must increment bpf_prog_active to avoid kprobe+bpf triggering while - * we're calling kfree, otherwise deadlock is possible if kprobes - * are placed somewhere inside of slub - */ - preempt_disable(); - __this_cpu_inc(bpf_prog_active); htab_elem_free(htab, l); - __this_cpu_dec(bpf_prog_active); - preempt_enable(); } static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) @@ -884,8 +966,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, */ } - /* bpf_map_update_elem() can be called in_irq() */ - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); l_old = lookup_elem_raw(head, hash, key, key_size); @@ -926,7 +1007,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, } ret = 0; err: - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); return ret; } @@ -964,8 +1045,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, return -ENOMEM; memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size); - /* bpf_map_update_elem() can be called in_irq() */ - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); l_old = lookup_elem_raw(head, hash, key, key_size); @@ -984,7 +1064,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, ret = 0; err: - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); if (ret) bpf_lru_push_free(&htab->lru, &l_new->lru_node); @@ -1019,8 +1099,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, b = __select_bucket(htab, hash); head = &b->head; - /* bpf_map_update_elem() can be called in_irq() */ - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1043,7 +1122,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, } ret = 0; err: - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); return ret; } @@ -1083,8 +1162,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, return -ENOMEM; } - /* bpf_map_update_elem() can be called in_irq() */ - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1106,7 +1184,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, } ret = 0; err: - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); if (l_new) bpf_lru_push_free(&htab->lru, &l_new->lru_node); return ret; @@ -1144,7 +1222,7 @@ static int 
htab_map_delete_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); l = lookup_elem_raw(head, hash, key, key_size); @@ -1154,7 +1232,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) ret = 0; } - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); return ret; } @@ -1176,7 +1254,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); l = lookup_elem_raw(head, hash, key, key_size); @@ -1185,7 +1263,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) ret = 0; } - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); if (l) bpf_lru_push_free(&htab->lru, &l->lru_node); return ret; @@ -1325,8 +1403,7 @@ alloc: } again: - preempt_disable(); - this_cpu_inc(bpf_prog_active); + bpf_disable_instrumentation(); rcu_read_lock(); again_nocopy: dst_key = keys; @@ -1335,7 +1412,7 @@ again_nocopy: head = &b->head; /* do not grab the lock unless need it (bucket_cnt > 0). */ if (locked) - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); bucket_cnt = 0; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) @@ -1352,10 +1429,9 @@ again_nocopy: /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); rcu_read_unlock(); - this_cpu_dec(bpf_prog_active); - preempt_enable(); + bpf_enable_instrumentation(); goto after_loop; } @@ -1364,10 +1440,9 @@ again_nocopy: /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. 
*/ - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); rcu_read_unlock(); - this_cpu_dec(bpf_prog_active); - preempt_enable(); + bpf_enable_instrumentation(); kvfree(keys); kvfree(values); goto alloc; @@ -1418,7 +1493,7 @@ again_nocopy: dst_val += value_size; } - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); locked = false; while (node_to_free) { @@ -1437,8 +1512,7 @@ next_batch: } rcu_read_unlock(); - this_cpu_dec(bpf_prog_active); - preempt_enable(); + bpf_enable_instrumentation(); if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys, key_size * bucket_cnt) || copy_to_user(uvalues + total * value_size, values, diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 56e6c75d354d..65c236cf341e 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -25,7 +25,7 @@ struct lpm_trie_node { struct lpm_trie_node __rcu *child[2]; u32 prefixlen; u32 flags; - u8 data[0]; + u8 data[]; }; struct lpm_trie { @@ -34,7 +34,7 @@ struct lpm_trie { size_t n_entries; size_t max_prefixlen; size_t data_size; - raw_spinlock_t lock; + spinlock_t lock; }; /* This trie implements a longest prefix match algorithm that can be used to @@ -315,7 +315,7 @@ static int trie_update_elem(struct bpf_map *map, if (key->prefixlen > trie->max_prefixlen) return -EINVAL; - raw_spin_lock_irqsave(&trie->lock, irq_flags); + spin_lock_irqsave(&trie->lock, irq_flags); /* Allocate and fill a new node */ @@ -422,7 +422,7 @@ out: kfree(im_node); } - raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + spin_unlock_irqrestore(&trie->lock, irq_flags); return ret; } @@ -442,7 +442,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) if (key->prefixlen > trie->max_prefixlen) return -EINVAL; - raw_spin_lock_irqsave(&trie->lock, irq_flags); + spin_lock_irqsave(&trie->lock, irq_flags); /* Walk the tree looking for an exact key/length match and keeping * track of the path we traverse. 
We will need to know the node @@ -518,7 +518,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) kfree_rcu(node, rcu); out: - raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + spin_unlock_irqrestore(&trie->lock, irq_flags); return ret; } @@ -575,7 +575,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) if (ret) goto out_err; - raw_spin_lock_init(&trie->lock); + spin_lock_init(&trie->lock); return &trie->map; out_err: diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 6e090140b924..b367430e611c 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -25,12 +25,18 @@ void pcpu_freelist_destroy(struct pcpu_freelist *s) free_percpu(s->freelist); } +static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head, + struct pcpu_freelist_node *node) +{ + node->next = head->first; + head->first = node; +} + static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, struct pcpu_freelist_node *node) { raw_spin_lock(&head->lock); - node->next = head->first; - head->first = node; + pcpu_freelist_push_node(head, node); raw_spin_unlock(&head->lock); } @@ -56,21 +62,16 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, u32 nr_elems) { struct pcpu_freelist_head *head; - unsigned long flags; int i, cpu, pcpu_entries; pcpu_entries = nr_elems / num_possible_cpus() + 1; i = 0; - /* disable irq to workaround lockdep false positive - * in bpf usage pcpu_freelist_populate() will never race - * with pcpu_freelist_push() - */ - local_irq_save(flags); for_each_possible_cpu(cpu) { again: head = per_cpu_ptr(s->freelist, cpu); - ___pcpu_freelist_push(head, buf); + /* No locking required as this is not visible yet. */ + pcpu_freelist_push_node(head, buf); i++; buf += elem_size; if (i == nr_elems) @@ -78,7 +79,6 @@ again: if (i % pcpu_entries) goto again; } - local_irq_restore(flags); } struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 3f958b90d914..db76339fe358 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -40,6 +40,9 @@ static void do_up_read(struct irq_work *entry) { struct stack_map_irq_work *work; + if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT))) + return; + work = container_of(entry, struct stack_map_irq_work, irq_work); up_read_non_owner(work->sem); work->sem = NULL; @@ -288,10 +291,19 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, struct stack_map_irq_work *work = NULL; if (irqs_disabled()) { - work = this_cpu_ptr(&up_read_work); - if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) - /* cannot queue more up_read, fallback */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + work = this_cpu_ptr(&up_read_work); + if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) { + /* cannot queue more up_read, fallback */ + irq_work_busy = true; + } + } else { + /* + * PREEMPT_RT does not allow to trylock mmap sem in + * interrupt disabled context. Force the fallback code. 
+ */ irq_work_busy = true; + } } /* diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a91ad518c050..c536c65256ad 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -171,11 +171,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, flags); } - /* must increment bpf_prog_active to avoid kprobe+bpf triggering from - * inside bpf map update or delete otherwise deadlocks are possible - */ - preempt_disable(); - __this_cpu_inc(bpf_prog_active); + bpf_disable_instrumentation(); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_update(map, key, value, flags); @@ -206,8 +202,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, err = map->ops->map_update_elem(map, key, value, flags); rcu_read_unlock(); } - __this_cpu_dec(bpf_prog_active); - preempt_enable(); + bpf_enable_instrumentation(); maybe_wait_bpf_programs(map); return err; @@ -222,8 +217,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, if (bpf_map_is_dev_bound(map)) return bpf_map_offload_lookup_elem(map, key, value); - preempt_disable(); - this_cpu_inc(bpf_prog_active); + bpf_disable_instrumentation(); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); @@ -268,8 +262,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, rcu_read_unlock(); } - this_cpu_dec(bpf_prog_active); - preempt_enable(); + bpf_enable_instrumentation(); maybe_wait_bpf_programs(map); return err; @@ -909,6 +902,21 @@ void bpf_map_inc_with_uref(struct bpf_map *map) } EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref); +struct bpf_map *bpf_map_get(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_map *map; + + map = __bpf_map_get(f); + if (IS_ERR(map)) + return map; + + bpf_map_inc(map); + fdput(f); + + return map; +} + struct bpf_map *bpf_map_get_with_uref(u32 ufd) { struct fd f = fdget(ufd); @@ -1136,13 +1144,11 @@ static int map_delete_elem(union bpf_attr *attr) goto out; } - preempt_disable(); - __this_cpu_inc(bpf_prog_active); + bpf_disable_instrumentation(); rcu_read_lock(); err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); - __this_cpu_dec(bpf_prog_active); - preempt_enable(); + bpf_enable_instrumentation(); maybe_wait_bpf_programs(map); out: kfree(key); @@ -1254,13 +1260,11 @@ int generic_map_delete_batch(struct bpf_map *map, break; } - preempt_disable(); - __this_cpu_inc(bpf_prog_active); + bpf_disable_instrumentation(); rcu_read_lock(); err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); - __this_cpu_dec(bpf_prog_active); - preempt_enable(); + bpf_enable_instrumentation(); maybe_wait_bpf_programs(map); if (err) break; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 6b264a92064b..704fa787fec0 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -367,8 +367,9 @@ out: mutex_unlock(&trampoline_mutex); } -/* The logic is similar to BPF_PROG_RUN, but with explicit rcu and preempt that - * are needed for trampoline. The macro is split into +/* The logic is similar to BPF_PROG_RUN, but with an explicit + * rcu_read_lock() and migrate_disable() which are required + * for the trampoline. 
The macro is split into * call _bpf_prog_enter * call prog->bpf_func * call __bpf_prog_exit */ @@ -378,7 +379,7 @@ u64 notrace __bpf_prog_enter(void) { u64 start = 0; rcu_read_lock(); - preempt_disable(); + migrate_disable(); if (static_branch_unlikely(&bpf_stats_enabled_key)) start = sched_clock(); return start; @@ -401,7 +402,7 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) stats->nsecs += sched_clock() - start; u64_stats_update_end(&stats->syncp); } - preempt_enable(); + migrate_enable(); rcu_read_unlock(); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6d15dfbd4b88..289383edfc8c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8143,26 +8143,48 @@ static bool is_tracing_prog_type(enum bpf_prog_type type) } } +static bool is_preallocated_map(struct bpf_map *map) +{ + if (!check_map_prealloc(map)) + return false; + if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta)) + return false; + return true; +} + static int check_map_prog_compatibility(struct bpf_verifier_env *env, struct bpf_map *map, struct bpf_prog *prog) { - /* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use - * preallocated hash maps, since doing memory allocation - * in overflow_handler can crash depending on where nmi got - * triggered. + /* + * Validate that trace type programs use preallocated hash maps. + * + * For programs attached to PERF events this is mandatory as the + * perf NMI can hit any arbitrary code sequence. + * + * All other trace types using non-preallocated hash maps are unsafe + * as well because tracepoint or kprobes can be inside locked regions + * of the memory allocator or at a place where a recursion into the + * memory allocator would see inconsistent state. + * + * On RT enabled kernels, run-time allocation in all trace type + * programs is strictly prohibited due to lock type constraints. On + * !RT kernels it is allowed for backwards compatibility reasons for + * now, but warnings are emitted so developers are made aware of + * the unsafety and can fix their programs before this is enforced. */ - if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { - if (!check_map_prealloc(map)) { + if (is_tracing_prog_type(prog->type) && !is_preallocated_map(map)) { + if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { verbose(env, "perf_event programs can only use preallocated hash map\n"); return -EINVAL; } - if (map->inner_map_meta && - !check_map_prealloc(map->inner_map_meta)) { - verbose(env, "perf_event programs can only use preallocated inner hash map\n"); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + verbose(env, "trace type programs can only use preallocated hash map\n"); return -EINVAL; } + WARN_ONCE(1, "trace type BPF program uses run-time allocation\n"); + verbose(env, "trace type programs with run-time allocated hash maps are unsafe.
Switch to preallocated hash maps.\n"); } if ((is_tracing_prog_type(prog->type) || diff --git a/kernel/events/core.c b/kernel/events/core.c index e453589da97c..bbdfac0182f4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9206,7 +9206,6 @@ static void bpf_overflow_handler(struct perf_event *event, int ret = 0; ctx.regs = perf_arch_bpf_user_pt_regs(regs); - preempt_disable(); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) goto out; rcu_read_lock(); @@ -9214,7 +9213,6 @@ static void bpf_overflow_handler(struct perf_event *event, rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); - preempt_enable(); if (!ret) return; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b6ea3dcb57bf..787041eb011b 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -268,16 +268,14 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). */ - preempt_disable(); for (; f; f = f->prev) { - u32 cur_ret = BPF_PROG_RUN(f->prog, sd); + u32 cur_ret = bpf_prog_run_pin_on_cpu(f->prog, sd); if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) { ret = cur_ret; *match = f; } } - preempt_enable(); return ret; } #endif /* CONFIG_SECCOMP_FILTER */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b8661bd0d028..07764c761073 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -83,7 +83,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) if (in_nmi()) /* not supported yet */ return 1; - preempt_disable(); + cant_sleep(); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { /* @@ -115,11 +115,9 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) out: __this_cpu_dec(bpf_prog_active); - preempt_enable(); return ret; } -EXPORT_SYMBOL_GPL(trace_call_bpf); #ifdef CONFIG_BPF_KPROBE_OVERRIDE BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) @@ -1516,10 +1514,9 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) static __always_inline void __bpf_trace_run(struct bpf_prog *prog, u64 *args) { + cant_sleep(); rcu_read_lock(); - preempt_disable(); (void) BPF_PROG_RUN(prog, args); - preempt_enable(); rcu_read_unlock(); } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 18d16f3ef980..2a8e8e9c1c75 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1333,8 +1333,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, int size, esize; int rctx; - if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) - return; + if (bpf_prog_array_valid(call)) { + u32 ret; + + preempt_disable(); + ret = trace_call_bpf(call, regs); + preempt_enable(); + if (!ret) + return; + } esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); diff --git a/lib/test_bpf.c b/lib/test_bpf.c index cecb230833be..a5fddf9ebcb7 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -6660,14 +6660,14 @@ static int __run_one(const struct bpf_prog *fp, const void *data, u64 start, finish; int ret = 0, i; - preempt_disable(); + migrate_disable(); start = ktime_get_ns(); for (i = 0; i < runs; i++) ret = BPF_PROG_RUN(fp, data); finish = ktime_get_ns(); - preempt_enable(); + migrate_enable(); *duration = finish - start; do_div(*duration, runs); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index d555c0d8657d..562443f94133 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -37,7 +37,7 @@ static int bpf_test_run(struct bpf_prog *prog, void 
*ctx, u32 repeat, repeat = 1; rcu_read_lock(); - preempt_disable(); + migrate_disable(); time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { bpf_cgroup_storage_set(storage); @@ -54,18 +54,18 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, if (need_resched()) { time_spent += ktime_get_ns() - time_start; - preempt_enable(); + migrate_enable(); rcu_read_unlock(); cond_resched(); rcu_read_lock(); - preempt_disable(); + migrate_disable(); time_start = ktime_get_ns(); } } time_spent += ktime_get_ns() - time_start; - preempt_enable(); + migrate_enable(); rcu_read_unlock(); do_div(time_spent, repeat); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 427cfbc0d50d..756b63b6f7b3 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -8,6 +8,7 @@ #include #include #include +#include #include static atomic_t cache_idx; @@ -606,6 +607,14 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) kfree(map); } +/* U16_MAX is much more than enough for sk local storage + * considering a tcp_sock is ~2k. + */ +#define MAX_VALUE_SIZE \ + min_t(u32, \ + (KMALLOC_MAX_SIZE - MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem)), \ + (U16_MAX - sizeof(struct bpf_sk_storage_elem))) + static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) { if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK || @@ -619,12 +628,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (attr->value_size >= KMALLOC_MAX_SIZE - - MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem) || - /* U16_MAX is much more than enough for sk local storage - * considering a tcp_sock is ~2k. - */ - attr->value_size > U16_MAX - sizeof(struct bpf_sk_storage_elem)) + if (attr->value_size > MAX_VALUE_SIZE) return -E2BIG; return 0; @@ -910,3 +914,270 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_SOCKET, }; + +struct bpf_sk_storage_diag { + u32 nr_maps; + struct bpf_map *maps[]; +}; + +/* The reply will be like: + * INET_DIAG_BPF_SK_STORAGES (nla_nest) + * SK_DIAG_BPF_STORAGE (nla_nest) + * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) + * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) + * SK_DIAG_BPF_STORAGE (nla_nest) + * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) + * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) + * .... + */ +static int nla_value_size(u32 value_size) +{ + /* SK_DIAG_BPF_STORAGE (nla_nest) + * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) + * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) + */ + return nla_total_size(0) + nla_total_size(sizeof(u32)) + + nla_total_size_64bit(value_size); +} + +void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag) +{ + u32 i; + + if (!diag) + return; + + for (i = 0; i < diag->nr_maps; i++) + bpf_map_put(diag->maps[i]); + + kfree(diag); +} +EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_free); + +static bool diag_check_dup(const struct bpf_sk_storage_diag *diag, + const struct bpf_map *map) +{ + u32 i; + + for (i = 0; i < diag->nr_maps; i++) { + if (diag->maps[i] == map) + return true; + } + + return false; +} + +struct bpf_sk_storage_diag * +bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) +{ + struct bpf_sk_storage_diag *diag; + struct nlattr *nla; + u32 nr_maps = 0; + int rem, err; + + /* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as + * the map_alloc_check() side also does. 
+ */ + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + nla_for_each_nested(nla, nla_stgs, rem) { + if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD) + nr_maps++; + } + + diag = kzalloc(sizeof(*diag) + sizeof(diag->maps[0]) * nr_maps, + GFP_KERNEL); + if (!diag) + return ERR_PTR(-ENOMEM); + + nla_for_each_nested(nla, nla_stgs, rem) { + struct bpf_map *map; + int map_fd; + + if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD) + continue; + + map_fd = nla_get_u32(nla); + map = bpf_map_get(map_fd); + if (IS_ERR(map)) { + err = PTR_ERR(map); + goto err_free; + } + if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) { + bpf_map_put(map); + err = -EINVAL; + goto err_free; + } + if (diag_check_dup(diag, map)) { + bpf_map_put(map); + err = -EEXIST; + goto err_free; + } + diag->maps[diag->nr_maps++] = map; + } + + return diag; + +err_free: + bpf_sk_storage_diag_free(diag); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc); + +static int diag_get(struct bpf_sk_storage_data *sdata, struct sk_buff *skb) +{ + struct nlattr *nla_stg, *nla_value; + struct bpf_sk_storage_map *smap; + + /* It cannot exceed max nlattr's payload */ + BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < MAX_VALUE_SIZE); + + nla_stg = nla_nest_start(skb, SK_DIAG_BPF_STORAGE); + if (!nla_stg) + return -EMSGSIZE; + + smap = rcu_dereference(sdata->smap); + if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id)) + goto errout; + + nla_value = nla_reserve_64bit(skb, SK_DIAG_BPF_STORAGE_MAP_VALUE, + smap->map.value_size, + SK_DIAG_BPF_STORAGE_PAD); + if (!nla_value) + goto errout; + + if (map_value_has_spin_lock(&smap->map)) + copy_map_value_locked(&smap->map, nla_data(nla_value), + sdata->data, true); + else + copy_map_value(&smap->map, nla_data(nla_value), sdata->data); + + nla_nest_end(skb, nla_stg); + return 0; + +errout: + nla_nest_cancel(skb, nla_stg); + return -EMSGSIZE; +} + +static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb, + int stg_array_type, + unsigned int *res_diag_size) +{ + /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */ + unsigned int diag_size = nla_total_size(0); + struct bpf_sk_storage *sk_storage; + struct bpf_sk_storage_elem *selem; + struct bpf_sk_storage_map *smap; + struct nlattr *nla_stgs; + unsigned int saved_len; + int err = 0; + + rcu_read_lock(); + + sk_storage = rcu_dereference(sk->sk_bpf_storage); + if (!sk_storage || hlist_empty(&sk_storage->list)) { + rcu_read_unlock(); + return 0; + } + + nla_stgs = nla_nest_start(skb, stg_array_type); + if (!nla_stgs) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + + saved_len = skb->len; + hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { + smap = rcu_dereference(SDATA(selem)->smap); + diag_size += nla_value_size(smap->map.value_size); + + if (nla_stgs && diag_get(SDATA(selem), skb)) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + } + + rcu_read_unlock(); + + if (nla_stgs) { + if (saved_len == skb->len) + nla_nest_cancel(skb, nla_stgs); + else + nla_nest_end(skb, nla_stgs); + } + + if (diag_size == nla_total_size(0)) { + *res_diag_size = 0; + return 0; + } + + *res_diag_size = diag_size; + return err; +} + +int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, + struct sock *sk, struct sk_buff *skb, + int stg_array_type, + unsigned int *res_diag_size) +{ + /* stg_array_type (e.g. 
INET_DIAG_BPF_SK_STORAGES) */ + unsigned int diag_size = nla_total_size(0); + struct bpf_sk_storage *sk_storage; + struct bpf_sk_storage_data *sdata; + struct nlattr *nla_stgs; + unsigned int saved_len; + int err = 0; + u32 i; + + *res_diag_size = 0; + + /* No map has been specified. Dump all. */ + if (!diag->nr_maps) + return bpf_sk_storage_diag_put_all(sk, skb, stg_array_type, + res_diag_size); + + rcu_read_lock(); + sk_storage = rcu_dereference(sk->sk_bpf_storage); + if (!sk_storage || hlist_empty(&sk_storage->list)) { + rcu_read_unlock(); + return 0; + } + + nla_stgs = nla_nest_start(skb, stg_array_type); + if (!nla_stgs) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + + saved_len = skb->len; + for (i = 0; i < diag->nr_maps; i++) { + sdata = __sk_storage_lookup(sk_storage, + (struct bpf_sk_storage_map *)diag->maps[i], + false); + + if (!sdata) + continue; + + diag_size += nla_value_size(diag->maps[i]->value_size); + + if (nla_stgs && diag_get(sdata, skb)) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + } + rcu_read_unlock(); + + if (nla_stgs) { + if (saved_len == skb->len) + nla_nest_cancel(skb, nla_stgs); + else + nla_nest_end(skb, nla_stgs); + } + + if (diag_size == nla_total_size(0)) { + *res_diag_size = 0; + return 0; + } + + *res_diag_size = diag_size; + return err; +} +EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_put); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index a1670dff0629..3eff84824c8b 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -920,9 +920,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP); flow_keys->flags = flags; - preempt_disable(); - result = BPF_PROG_RUN(prog, ctx); - preempt_enable(); + result = bpf_prog_run_pin_on_cpu(prog, ctx); flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen); flow_keys->thoff = clamp_t(u16, flow_keys->thoff, diff --git a/net/core/skmsg.c b/net/core/skmsg.c index eeb28cb85664..c479372f2cd2 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -628,7 +628,6 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, struct bpf_prog *prog; int ret; - preempt_disable(); rcu_read_lock(); prog = READ_ONCE(psock->progs.msg_parser); if (unlikely(!prog)) { @@ -638,7 +637,7 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, sk_msg_compute_data_pointers(msg); msg->sk = sk; - ret = BPF_PROG_RUN(prog, msg); + ret = bpf_prog_run_pin_on_cpu(prog, msg); ret = sk_psock_map_verd(ret, msg->sk_redir); psock->apply_bytes = msg->apply_bytes; if (ret == __SK_REDIRECT) { @@ -653,7 +652,6 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, } out: rcu_read_unlock(); - preempt_enable(); return ret; } EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); @@ -665,9 +663,7 @@ static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, skb->sk = psock->sk; bpf_compute_data_end_sk_skb(skb); - preempt_disable(); - ret = BPF_PROG_RUN(prog, skb); - preempt_enable(); + ret = bpf_prog_run_pin_on_cpu(prog, skb); /* strparser clones the skb before handing it to a upper layer, * meaning skb_orphan has been called. 
We NULL sk on the way out * to ensure we don't trigger a BUG_ON() in skb/sk operations diff --git a/net/dccp/diag.c b/net/dccp/diag.c index 73ef73a218ff..8a82c5a2c5a8 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -46,16 +46,15 @@ static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, } static void dccp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { - inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r, bc); + inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r); } -static int dccp_diag_dump_one(struct sk_buff *in_skb, - const struct nlmsghdr *nlh, +static int dccp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { - return inet_diag_dump_one_icsk(&dccp_hashinfo, in_skb, nlh, req); + return inet_diag_dump_one_icsk(&dccp_hashinfo, cb, req); } static const struct inet_diag_handler dccp_diag_handler = { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index f11e997e517b..e1cad25909df 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -156,26 +157,28 @@ errout: } EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill); +#define MAX_DUMP_ALLOC_SIZE (KMALLOC_MAX_SIZE - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) + int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, - struct sk_buff *skb, const struct inet_diag_req_v2 *req, - struct user_namespace *user_ns, - u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh, - bool net_admin) + struct sk_buff *skb, struct netlink_callback *cb, + const struct inet_diag_req_v2 *req, + u16 nlmsg_flags, bool net_admin) { const struct tcp_congestion_ops *ca_ops; const struct inet_diag_handler *handler; + struct inet_diag_dump_data *cb_data; int ext = req->idiag_ext; struct inet_diag_msg *r; struct nlmsghdr *nlh; struct nlattr *attr; void *info = NULL; + cb_data = cb->data; handler = inet_diag_table[req->sdiag_protocol]; BUG_ON(!handler); - nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), - nlmsg_flags); + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); if (!nlh) return -EMSGSIZE; @@ -187,7 +190,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, r->idiag_timer = 0; r->idiag_retrans = 0; - if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin)) + if (inet_diag_msg_attrs_fill(sk, skb, r, ext, + sk_user_ns(NETLINK_CB(cb->skb).sk), + net_admin)) goto errout; if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { @@ -302,6 +307,48 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, goto errout; } + /* Keep it at the end for potential retry with a larger skb, + * or else do best-effort fitting, which is only done for the + * first_nlmsg. 
+ */ + if (cb_data->bpf_stg_diag) { + bool first_nlmsg = ((unsigned char *)nlh == skb->data); + unsigned int prev_min_dump_alloc; + unsigned int total_nla_size = 0; + unsigned int msg_len; + int err; + + msg_len = skb_tail_pointer(skb) - (unsigned char *)nlh; + err = bpf_sk_storage_diag_put(cb_data->bpf_stg_diag, sk, skb, + INET_DIAG_SK_BPF_STORAGES, + &total_nla_size); + + if (!err) + goto out; + + total_nla_size += msg_len; + prev_min_dump_alloc = cb->min_dump_alloc; + if (total_nla_size > prev_min_dump_alloc) + cb->min_dump_alloc = min_t(u32, total_nla_size, + MAX_DUMP_ALLOC_SIZE); + + if (!first_nlmsg) + goto errout; + + if (cb->min_dump_alloc > prev_min_dump_alloc) + /* Retry with pskb_expand_head() with + * __GFP_DIRECT_RECLAIM + */ + goto errout; + + WARN_ON_ONCE(total_nla_size <= prev_min_dump_alloc); + + /* Send what we have for this sk + * and move on to the next sk in the following + * dump() + */ + } + out: nlmsg_end(skb, nlh); return 0; @@ -312,30 +359,19 @@ errout: } EXPORT_SYMBOL_GPL(inet_sk_diag_fill); -static int inet_csk_diag_fill(struct sock *sk, - struct sk_buff *skb, - const struct inet_diag_req_v2 *req, - struct user_namespace *user_ns, - u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh, - bool net_admin) -{ - return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, user_ns, - portid, seq, nlmsg_flags, unlh, net_admin); -} - static int inet_twsk_diag_fill(struct sock *sk, struct sk_buff *skb, - u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + struct netlink_callback *cb, + u16 nlmsg_flags) { struct inet_timewait_sock *tw = inet_twsk(sk); struct inet_diag_msg *r; struct nlmsghdr *nlh; long tmo; - nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), - nlmsg_flags); + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, + sizeof(*r), nlmsg_flags); if (!nlh) return -EMSGSIZE; @@ -359,16 +395,16 @@ static int inet_twsk_diag_fill(struct sock *sk, } static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, - u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh, bool net_admin) + struct netlink_callback *cb, + u16 nlmsg_flags, bool net_admin) { struct request_sock *reqsk = inet_reqsk(sk); struct inet_diag_msg *r; struct nlmsghdr *nlh; long tmo; - nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), - nlmsg_flags); + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); if (!nlh) return -EMSGSIZE; @@ -397,21 +433,18 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, } static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, const struct inet_diag_req_v2 *r, - struct user_namespace *user_ns, - u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh, bool net_admin) + u16 nlmsg_flags, bool net_admin) { if (sk->sk_state == TCP_TIME_WAIT) - return inet_twsk_diag_fill(sk, skb, portid, seq, - nlmsg_flags, unlh); + return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags); if (sk->sk_state == TCP_NEW_SYN_RECV) - return inet_req_diag_fill(sk, skb, portid, seq, - nlmsg_flags, unlh, net_admin); + return inet_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); - return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, - nlmsg_flags, unlh, net_admin); + return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags, + net_admin); } struct sock *inet_diag_find_one_icsk(struct net *net, @@ -459,10 +492,10 @@ struct sock 
*inet_diag_find_one_icsk(struct net *net, EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk); int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, - struct sk_buff *in_skb, - const struct nlmsghdr *nlh, + struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { + struct sk_buff *in_skb = cb->skb; bool net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN); struct net *net = sock_net(in_skb->sk); struct sk_buff *rep; @@ -479,10 +512,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, goto out; } - err = sk_diag_fill(sk, rep, req, - sk_user_ns(NETLINK_CB(in_skb).sk), - NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh, net_admin); + err = sk_diag_fill(sk, rep, cb, req, 0, net_admin); if (err < 0) { WARN_ON(err == -EMSGSIZE); nlmsg_free(rep); @@ -509,14 +539,21 @@ static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, int err; handler = inet_diag_lock_handler(req->sdiag_protocol); - if (IS_ERR(handler)) + if (IS_ERR(handler)) { err = PTR_ERR(handler); - else if (cmd == SOCK_DIAG_BY_FAMILY) - err = handler->dump_one(in_skb, nlh, req); - else if (cmd == SOCK_DESTROY && handler->destroy) + } else if (cmd == SOCK_DIAG_BY_FAMILY) { + struct inet_diag_dump_data empty_dump_data = {}; + struct netlink_callback cb = { + .nlh = nlh, + .skb = in_skb, + .data = &empty_dump_data, + }; + err = handler->dump_one(&cb, req); + } else if (cmd == SOCK_DESTROY && handler->destroy) { err = handler->destroy(in_skb, req); - else + } else { err = -EOPNOTSUPP; + } inet_diag_unlock_handler(handler); return err; @@ -847,23 +884,6 @@ static int inet_diag_bc_audit(const struct nlattr *attr, return len == 0 ? 0 : -EINVAL; } -static int inet_csk_diag_dump(struct sock *sk, - struct sk_buff *skb, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - const struct nlattr *bc, - bool net_admin) -{ - if (!inet_diag_bc_sk(bc, sk)) - return 0; - - return inet_csk_diag_fill(sk, skb, r, - sk_user_ns(NETLINK_CB(cb->skb).sk), - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, - net_admin); -} - static void twsk_build_assert(void) { BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) != @@ -892,14 +912,17 @@ static void twsk_build_assert(void) void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + struct inet_diag_dump_data *cb_data = cb->data; struct net *net = sock_net(skb->sk); u32 idiag_states = r->idiag_states; int i, num, s_i, s_num; + struct nlattr *bc; struct sock *sk; + bc = cb_data->inet_diag_nla_bc; if (idiag_states & TCPF_SYN_RECV) idiag_states |= TCPF_NEW_SYN_RECV; s_i = cb->args[1]; @@ -935,8 +958,12 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, r->id.idiag_sport) goto next_listen; - if (inet_csk_diag_dump(sk, skb, cb, r, - bc, net_admin) < 0) { + if (!inet_diag_bc_sk(bc, sk)) + goto next_listen; + + if (inet_sk_diag_fill(sk, inet_csk(sk), skb, + cb, r, NLM_F_MULTI, + net_admin) < 0) { spin_unlock(&ilb->lock); goto done; } @@ -1014,11 +1041,8 @@ next_normal: res = 0; for (idx = 0; idx < accum; idx++) { if (res >= 0) { - res = sk_diag_fill(sk_arr[idx], skb, r, - sk_user_ns(NETLINK_CB(cb->skb).sk), - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->nlh, net_admin); + res = sk_diag_fill(sk_arr[idx], skb, cb, r, + NLM_F_MULTI, net_admin); if (res < 0) num = num_arr[idx]; } @@ -1042,31 
+1066,101 @@ out: EXPORT_SYMBOL_GPL(inet_diag_dump_icsk); static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - struct nlattr *bc) + const struct inet_diag_req_v2 *r) { const struct inet_diag_handler *handler; + u32 prev_min_dump_alloc; int err = 0; +again: + prev_min_dump_alloc = cb->min_dump_alloc; handler = inet_diag_lock_handler(r->sdiag_protocol); if (!IS_ERR(handler)) - handler->dump(skb, cb, r, bc); + handler->dump(skb, cb, r); else err = PTR_ERR(handler); inet_diag_unlock_handler(handler); + /* The skb is not large enough to fit one sk info and + * inet_sk_diag_fill() has requested for a larger skb. + */ + if (!skb->len && cb->min_dump_alloc > prev_min_dump_alloc) { + err = pskb_expand_head(skb, 0, cb->min_dump_alloc, GFP_KERNEL); + if (!err) + goto again; + } + return err ? : skb->len; } static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { - int hdrlen = sizeof(struct inet_diag_req_v2); - struct nlattr *bc = NULL; + return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh)); +} - if (nlmsg_attrlen(cb->nlh, hdrlen)) - bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE); +static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen) +{ + const struct nlmsghdr *nlh = cb->nlh; + struct inet_diag_dump_data *cb_data; + struct sk_buff *skb = cb->skb; + struct nlattr *nla; + int rem, err; - return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc); + cb_data = kzalloc(sizeof(*cb_data), GFP_KERNEL); + if (!cb_data) + return -ENOMEM; + + nla_for_each_attr(nla, nlmsg_attrdata(nlh, hdrlen), + nlmsg_attrlen(nlh, hdrlen), rem) { + int type = nla_type(nla); + + if (type < __INET_DIAG_REQ_MAX) + cb_data->req_nlas[type] = nla; + } + + nla = cb_data->inet_diag_nla_bc; + if (nla) { + err = inet_diag_bc_audit(nla, skb); + if (err) { + kfree(cb_data); + return err; + } + } + + nla = cb_data->inet_diag_nla_bpf_stgs; + if (nla) { + struct bpf_sk_storage_diag *bpf_stg_diag; + + bpf_stg_diag = bpf_sk_storage_diag_alloc(nla); + if (IS_ERR(bpf_stg_diag)) { + kfree(cb_data); + return PTR_ERR(bpf_stg_diag); + } + cb_data->bpf_stg_diag = bpf_stg_diag; + } + + cb->data = cb_data; + return 0; +} + +static int inet_diag_dump_start(struct netlink_callback *cb) +{ + return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req_v2)); +} + +static int inet_diag_dump_start_compat(struct netlink_callback *cb) +{ + return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req)); +} + +static int inet_diag_dump_done(struct netlink_callback *cb) +{ + struct inet_diag_dump_data *cb_data = cb->data; + + bpf_sk_storage_diag_free(cb_data->bpf_stg_diag); + kfree(cb->data); + + return 0; } static int inet_diag_type2proto(int type) @@ -1085,9 +1179,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb) { struct inet_diag_req *rc = nlmsg_data(cb->nlh); - int hdrlen = sizeof(struct inet_diag_req); struct inet_diag_req_v2 req; - struct nlattr *bc = NULL; req.sdiag_family = AF_UNSPEC; /* compatibility */ req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type); @@ -1095,10 +1187,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, req.idiag_states = rc->idiag_states; req.id = rc->id; - if (nlmsg_attrlen(cb->nlh, hdrlen)) - bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE); - - return __inet_diag_dump(skb, cb, &req, bc); + return __inet_diag_dump(skb, cb, &req); } static int inet_diag_get_exact_compat(struct sk_buff *in_skb, @@ -1126,22 +1215,12 @@ static int 
inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; if (nlh->nlmsg_flags & NLM_F_DUMP) { - if (nlmsg_attrlen(nlh, hdrlen)) { - struct nlattr *attr; - int err; - - attr = nlmsg_find_attr(nlh, hdrlen, - INET_DIAG_REQ_BYTECODE); - err = inet_diag_bc_audit(attr, skb); - if (err) - return err; - } - { - struct netlink_dump_control c = { - .dump = inet_diag_dump_compat, - }; - return netlink_dump_start(net->diag_nlsk, skb, nlh, &c); - } + struct netlink_dump_control c = { + .start = inet_diag_dump_start_compat, + .done = inet_diag_dump_done, + .dump = inet_diag_dump_compat, + }; + return netlink_dump_start(net->diag_nlsk, skb, nlh, &c); } return inet_diag_get_exact_compat(skb, nlh); @@ -1157,22 +1236,12 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h) if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY && h->nlmsg_flags & NLM_F_DUMP) { - if (nlmsg_attrlen(h, hdrlen)) { - struct nlattr *attr; - int err; - - attr = nlmsg_find_attr(h, hdrlen, - INET_DIAG_REQ_BYTECODE); - err = inet_diag_bc_audit(attr, skb); - if (err) - return err; - } - { - struct netlink_dump_control c = { - .dump = inet_diag_dump, - }; - return netlink_dump_start(net->diag_nlsk, skb, h, &c); - } + struct netlink_dump_control c = { + .start = inet_diag_dump_start, + .done = inet_diag_dump_done, + .dump = inet_diag_dump, + }; + return netlink_dump_start(net->diag_nlsk, skb, h, &c); } return inet_diag_cmd_exact(h->nlmsg_type, skb, h, nlmsg_data(h)); diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index e35736b99300..d19cce39be1b 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -87,15 +87,16 @@ out_unlock: return sk ? sk : ERR_PTR(-ENOENT); } -static int raw_diag_dump_one(struct sk_buff *in_skb, - const struct nlmsghdr *nlh, +static int raw_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { - struct net *net = sock_net(in_skb->sk); + struct sk_buff *in_skb = cb->skb; struct sk_buff *rep; struct sock *sk; + struct net *net; int err; + net = sock_net(in_skb->sk); sk = raw_sock_get(net, r); if (IS_ERR(sk)) return PTR_ERR(sk); @@ -108,10 +109,7 @@ static int raw_diag_dump_one(struct sk_buff *in_skb, return -ENOMEM; } - err = inet_sk_diag_fill(sk, NULL, rep, r, - sk_user_ns(NETLINK_CB(in_skb).sk), - NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh, + err = inet_sk_diag_fill(sk, NULL, rep, cb, r, 0, netlink_net_capable(in_skb, CAP_NET_ADMIN)); sock_put(sk); @@ -136,25 +134,25 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, if (!inet_diag_bc_sk(bc, sk)) return 0; - return inet_sk_diag_fill(sk, NULL, skb, r, - sk_user_ns(NETLINK_CB(cb->skb).sk), - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->nlh, net_admin); + return inet_sk_diag_fill(sk, NULL, skb, cb, r, NLM_F_MULTI, net_admin); } static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); struct net *net = sock_net(skb->sk); + struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; struct sock *sk = NULL; + struct nlattr *bc; if (IS_ERR(hashinfo)) return; + cb_data = cb->data; + bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 0d08f9e2d8d0..75a1c985f49a 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -179,15 
+179,15 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) } static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { - inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc); + inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r); } -static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, +static int tcp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { - return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req); + return inet_diag_dump_one_icsk(&tcp_hashinfo, cb, req); } #ifdef CONFIG_INET_DIAG_DESTROY diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 910555a4d9fe..93884696abdd 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -21,16 +21,15 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, if (!inet_diag_bc_sk(bc, sk)) return 0; - return inet_sk_diag_fill(sk, NULL, skb, req, - sk_user_ns(NETLINK_CB(cb->skb).sk), - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, net_admin); + return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI, + net_admin); } -static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, - const struct nlmsghdr *nlh, +static int udp_dump_one(struct udp_table *tbl, + struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { + struct sk_buff *in_skb = cb->skb; int err = -EINVAL; struct sock *sk = NULL; struct sk_buff *rep; @@ -70,11 +69,8 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, if (!rep) goto out; - err = inet_sk_diag_fill(sk, NULL, rep, req, - sk_user_ns(NETLINK_CB(in_skb).sk), - NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh, - netlink_net_capable(in_skb, CAP_NET_ADMIN)); + err = inet_sk_diag_fill(sk, NULL, rep, cb, req, 0, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(rep); @@ -93,12 +89,16 @@ out_nosk: static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct net *net = sock_net(skb->sk); + struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; + struct nlattr *bc; + cb_data = cb->data; + bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; @@ -146,15 +146,15 @@ done: } static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { - udp_dump(&udp_table, skb, cb, r, bc); + udp_dump(&udp_table, skb, cb, r); } -static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, +static int udp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { - return udp_dump_one(&udp_table, in_skb, nlh, req); + return udp_dump_one(&udp_table, cb, req); } static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, @@ -249,16 +249,15 @@ static const struct inet_diag_handler udp_diag_handler = { }; static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - struct nlattr *bc) + const struct inet_diag_req_v2 *r) { - udp_dump(&udplite_table, skb, cb, r, bc); + udp_dump(&udplite_table, skb, cb, r); } -static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr 
*nlh, +static int udplite_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { - return udp_dump_one(&udplite_table, in_skb, nlh, req); + return udp_dump_one(&udplite_table, cb, req); } static const struct inet_diag_handler udplite_diag_handler = { diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index ea9e73428ed9..56fac24a627a 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -380,9 +380,7 @@ static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb) struct bpf_prog *prog = psock->bpf_prog; int res; - preempt_disable(); - res = BPF_PROG_RUN(prog, skb); - preempt_enable(); + res = bpf_prog_run_pin_on_cpu(prog, skb); return res; } diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 8a15146faaeb..69743a6aaf6f 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -432,11 +432,12 @@ static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, sctp_get_sctp_info(sk, infox->asoc, infox->sctpinfo); } -static int sctp_diag_dump_one(struct sk_buff *in_skb, - const struct nlmsghdr *nlh, +static int sctp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { + struct sk_buff *in_skb = cb->skb; struct net *net = sock_net(in_skb->sk); + const struct nlmsghdr *nlh = cb->nlh; union sctp_addr laddr, paddr; struct sctp_comm_param commp = { .skb = in_skb, @@ -470,7 +471,7 @@ static int sctp_diag_dump_one(struct sk_buff *in_skb, } static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { u32 idiag_states = r->idiag_states; struct net *net = sock_net(skb->sk); diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 90baf7d70911..cebed6fb5bbb 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0-only # # Copyright (C) 2018-2019 Netronome Systems, Inc. diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index 4d08f35034a2..b04156cfd7a3 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -19,19 +19,24 @@ SYNOPSIS FEATURE COMMANDS ================ -| **bpftool** **feature probe** [*COMPONENT*] [**macros** [**prefix** *PREFIX*]] +| **bpftool** **feature probe** [*COMPONENT*] [**full**] [**macros** [**prefix** *PREFIX*]] | **bpftool** **feature help** | | *COMPONENT* := { **kernel** | **dev** *NAME* } DESCRIPTION =========== - **bpftool feature probe** [**kernel**] [**macros** [**prefix** *PREFIX*]] + **bpftool feature probe** [**kernel**] [**full**] [**macros** [**prefix** *PREFIX*]] Probe the running kernel and dump a number of eBPF-related parameters, such as availability of the **bpf()** system call, JIT status, eBPF program types availability, eBPF helper functions availability, and more. + By default, bpftool **does not run probes** for + **bpf_probe_write_user**\ () and **bpf_trace_printk**\() + helpers which print warnings to kernel logs. To enable them + and run all probes, the **full** keyword should be used. + If the **macros** keyword (but not the **-j** option) is passed, a subset of the output is dumped as a list of **#define** macros that are ready to be included in a C @@ -44,16 +49,12 @@ DESCRIPTION Keyword **kernel** can be omitted. If no probe target is specified, probing the kernel is the default behaviour. 
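
The inet_diag and kcm hunks above lean on two definitions introduced by other patches in this series (in include/linux/inet_diag.h and include/linux/filter.h) that are not visible in this diff. They are sketched below for context; treat exact field order and signatures as approximate. The first explains why cb_data->inet_diag_nla_bc above reads like a plain struct member: it is a macro over an nlattr array filled in by the dump ->start() callback. The second is the helper the kcm_parse_func_strparser() hunk uses in place of the open-coded preempt_disable()/BPF_PROG_RUN()/preempt_enable(), pinning the program to one CPU with migrate_disable() instead of disabling preemption.

/* Approximate shape of the per-dump state carried in cb->data
 * (include/linux/inet_diag.h in this series):
 */
struct inet_diag_dump_data {
	struct nlattr *req_nlas[__INET_DIAG_REQ_MAX];
#define inet_diag_nla_bc	req_nlas[INET_DIAG_REQ_BYTECODE]
#define inet_diag_nla_bpf_stgs	req_nlas[INET_DIAG_REQ_SK_BPF_STORAGES]

	struct bpf_sk_storage_diag *bpf_stg_diag;
};

/* Approximate definition of the helper called from the kcm hunk
 * (include/linux/filter.h in this series):
 */
static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog,
					  const void *ctx)
{
	u32 ret;

	migrate_disable();
	ret = BPF_PROG_RUN(prog, ctx);
	migrate_enable();
	return ret;
}
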
- Note that when probed, some eBPF helpers (e.g. - **bpf_trace_printk**\ () or **bpf_probe_write_user**\ ()) may - print warnings to kernel logs. - - **bpftool feature probe dev** *NAME* [**macros** [**prefix** *PREFIX*]] + **bpftool feature probe dev** *NAME* [**full**] [**macros** [**prefix** *PREFIX*]] Probe network device for supported eBPF features and dump results to the console. - The two keywords **macros** and **prefix** have the same - role as when probing the kernel. + The keywords **full**, **macros** and **prefix** have the + same role as when probing the kernel. **bpftool feature help** Print short help message. diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 64ddf8a4c518..46862e85fed2 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -42,7 +42,8 @@ PROG COMMANDS | **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** | | **cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** | | **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/sysctl** | -| **cgroup/getsockopt** | **cgroup/setsockopt** +| **cgroup/getsockopt** | **cgroup/setsockopt** | +| **struct_ops** | **fentry** | **fexit** | **freplace** | } | *ATTACH_TYPE* := { | **msg_verdict** | **stream_verdict** | **stream_parser** | **flow_dissector** diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 754d8395e451..f2838a658339 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -469,7 +469,8 @@ _bpftool() cgroup/recvmsg4 cgroup/recvmsg6 \ cgroup/post_bind4 cgroup/post_bind6 \ cgroup/sysctl cgroup/getsockopt \ - cgroup/setsockopt" -- \ + cgroup/setsockopt struct_ops \ + fentry fexit freplace" -- \ "$cur" ) ) return 0 ;; @@ -983,11 +984,12 @@ _bpftool() probe) [[ $prev == "prefix" ]] && return 0 if _bpftool_search_list 'macros'; then - COMPREPLY+=( $( compgen -W 'prefix' -- "$cur" ) ) + _bpftool_once_attr 'prefix' else COMPREPLY+=( $( compgen -W 'macros' -- "$cur" ) ) fi _bpftool_one_of_list 'kernel dev' + _bpftool_once_attr 'full' return 0 ;; *) diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 941873d778d8..88718ee6a438 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -112,18 +112,12 @@ print_start_section(const char *json_title, const char *plain_title, } } -static void -print_end_then_start_section(const char *json_title, const char *plain_title, - const char *define_comment, - const char *define_prefix) +static void print_end_section(void) { if (json_output) jsonw_end_object(json_wtr); else printf("\n"); - - print_start_section(json_title, plain_title, define_comment, - define_prefix); } /* Probing functions */ @@ -519,14 +513,39 @@ probe_map_type(enum bpf_map_type map_type, const char *define_prefix, define_prefix); } +static void +probe_helper_for_progtype(enum bpf_prog_type prog_type, bool supported_type, + const char *define_prefix, unsigned int id, + const char *ptype_name, __u32 ifindex) +{ + bool res; + + if (!supported_type) + res = false; + else + res = bpf_probe_helper(id, prog_type, ifindex); + + if (json_output) { + if (res) + jsonw_string(json_wtr, helper_name[id]); + } else if (define_prefix) { + printf("#define %sBPF__PROG_TYPE_%s__HELPER_%s %s\n", + define_prefix, ptype_name, helper_name[id], + res ? 
"1" : "0"); + } else { + if (res) + printf("\n\t- %s", helper_name[id]); + } +} + static void probe_helpers_for_progtype(enum bpf_prog_type prog_type, bool supported_type, - const char *define_prefix, __u32 ifindex) + const char *define_prefix, bool full_mode, + __u32 ifindex) { const char *ptype_name = prog_type_name[prog_type]; char feat_name[128]; unsigned int id; - bool res; if (ifindex) /* Only test helpers for offload-able program types */ @@ -548,21 +567,19 @@ probe_helpers_for_progtype(enum bpf_prog_type prog_type, bool supported_type, } for (id = 1; id < ARRAY_SIZE(helper_name); id++) { - if (!supported_type) - res = false; - else - res = bpf_probe_helper(id, prog_type, ifindex); - - if (json_output) { - if (res) - jsonw_string(json_wtr, helper_name[id]); - } else if (define_prefix) { - printf("#define %sBPF__PROG_TYPE_%s__HELPER_%s %s\n", - define_prefix, ptype_name, helper_name[id], - res ? "1" : "0"); - } else { - if (res) - printf("\n\t- %s", helper_name[id]); + /* Skip helper functions which emit dmesg messages when not in + * the full mode. + */ + switch (id) { + case BPF_FUNC_trace_printk: + case BPF_FUNC_probe_write_user: + if (!full_mode) + continue; + /* fallthrough */ + default: + probe_helper_for_progtype(prog_type, supported_type, + define_prefix, id, ptype_name, + ifindex); } } @@ -584,13 +601,132 @@ probe_large_insn_limit(const char *define_prefix, __u32 ifindex) res, define_prefix); } +static void +section_system_config(enum probe_component target, const char *define_prefix) +{ + switch (target) { + case COMPONENT_KERNEL: + case COMPONENT_UNSPEC: + if (define_prefix) + break; + + print_start_section("system_config", + "Scanning system configuration...", + NULL, /* define_comment never used here */ + NULL); /* define_prefix always NULL here */ + if (check_procfs()) { + probe_unprivileged_disabled(); + probe_jit_enable(); + probe_jit_harden(); + probe_jit_kallsyms(); + probe_jit_limit(); + } else { + p_info("/* procfs not mounted, skipping related probes */"); + } + probe_kernel_image_config(); + print_end_section(); + break; + default: + break; + } +} + +static bool section_syscall_config(const char *define_prefix) +{ + bool res; + + print_start_section("syscall_config", + "Scanning system call availability...", + "/*** System call availability ***/", + define_prefix); + res = probe_bpf_syscall(define_prefix); + print_end_section(); + + return res; +} + +static void +section_program_types(bool *supported_types, const char *define_prefix, + __u32 ifindex) +{ + unsigned int i; + + print_start_section("program_types", + "Scanning eBPF program types...", + "/*** eBPF program types ***/", + define_prefix); + + for (i = BPF_PROG_TYPE_UNSPEC + 1; i < ARRAY_SIZE(prog_type_name); i++) + probe_prog_type(i, supported_types, define_prefix, ifindex); + + print_end_section(); +} + +static void section_map_types(const char *define_prefix, __u32 ifindex) +{ + unsigned int i; + + print_start_section("map_types", + "Scanning eBPF map types...", + "/*** eBPF map types ***/", + define_prefix); + + for (i = BPF_MAP_TYPE_UNSPEC + 1; i < map_type_name_size; i++) + probe_map_type(i, define_prefix, ifindex); + + print_end_section(); +} + +static void +section_helpers(bool *supported_types, const char *define_prefix, + bool full_mode, __u32 ifindex) +{ + unsigned int i; + + print_start_section("helpers", + "Scanning eBPF helper functions...", + "/*** eBPF helper functions ***/", + define_prefix); + + if (define_prefix) + printf("/*\n" + " * Use %sHAVE_PROG_TYPE_HELPER(prog_type_name, 
helper_name)\n" + " * to determine if is available for ,\n" + " * e.g.\n" + " * #if %sHAVE_PROG_TYPE_HELPER(xdp, bpf_redirect)\n" + " * // do stuff with this helper\n" + " * #elif\n" + " * // use a workaround\n" + " * #endif\n" + " */\n" + "#define %sHAVE_PROG_TYPE_HELPER(prog_type, helper) \\\n" + " %sBPF__PROG_TYPE_ ## prog_type ## __HELPER_ ## helper\n", + define_prefix, define_prefix, define_prefix, + define_prefix); + for (i = BPF_PROG_TYPE_UNSPEC + 1; i < ARRAY_SIZE(prog_type_name); i++) + probe_helpers_for_progtype(i, supported_types[i], + define_prefix, full_mode, ifindex); + + print_end_section(); +} + +static void section_misc(const char *define_prefix, __u32 ifindex) +{ + print_start_section("misc", + "Scanning miscellaneous eBPF features...", + "/*** eBPF misc features ***/", + define_prefix); + probe_large_insn_limit(define_prefix, ifindex); + print_end_section(); +} + static int do_probe(int argc, char **argv) { enum probe_component target = COMPONENT_UNSPEC; const char *define_prefix = NULL; bool supported_types[128] = {}; + bool full_mode = false; __u32 ifindex = 0; - unsigned int i; char *ifname; /* Detection assumes user has sufficient privileges (CAP_SYS_ADMIN). @@ -629,6 +765,9 @@ static int do_probe(int argc, char **argv) strerror(errno)); return -1; } + } else if (is_prefix(*argv, "full")) { + full_mode = true; + NEXT_ARG(); } else if (is_prefix(*argv, "macros") && !define_prefix) { define_prefix = ""; NEXT_ARG(); @@ -658,97 +797,19 @@ static int do_probe(int argc, char **argv) jsonw_start_object(json_wtr); } - switch (target) { - case COMPONENT_KERNEL: - case COMPONENT_UNSPEC: - if (define_prefix) - break; - - print_start_section("system_config", - "Scanning system configuration...", - NULL, /* define_comment never used here */ - NULL); /* define_prefix always NULL here */ - if (check_procfs()) { - probe_unprivileged_disabled(); - probe_jit_enable(); - probe_jit_harden(); - probe_jit_kallsyms(); - probe_jit_limit(); - } else { - p_info("/* procfs not mounted, skipping related probes */"); - } - probe_kernel_image_config(); - if (json_output) - jsonw_end_object(json_wtr); - else - printf("\n"); - break; - default: - break; - } - - print_start_section("syscall_config", - "Scanning system call availability...", - "/*** System call availability ***/", - define_prefix); - - if (!probe_bpf_syscall(define_prefix)) + section_system_config(target, define_prefix); + if (!section_syscall_config(define_prefix)) /* bpf() syscall unavailable, don't probe other BPF features */ goto exit_close_json; - - print_end_then_start_section("program_types", - "Scanning eBPF program types...", - "/*** eBPF program types ***/", - define_prefix); - - for (i = BPF_PROG_TYPE_UNSPEC + 1; i < ARRAY_SIZE(prog_type_name); i++) - probe_prog_type(i, supported_types, define_prefix, ifindex); - - print_end_then_start_section("map_types", - "Scanning eBPF map types...", - "/*** eBPF map types ***/", - define_prefix); - - for (i = BPF_MAP_TYPE_UNSPEC + 1; i < map_type_name_size; i++) - probe_map_type(i, define_prefix, ifindex); - - print_end_then_start_section("helpers", - "Scanning eBPF helper functions...", - "/*** eBPF helper functions ***/", - define_prefix); - - if (define_prefix) - printf("/*\n" - " * Use %sHAVE_PROG_TYPE_HELPER(prog_type_name, helper_name)\n" - " * to determine if is available for ,\n" - " * e.g.\n" - " * #if %sHAVE_PROG_TYPE_HELPER(xdp, bpf_redirect)\n" - " * // do stuff with this helper\n" - " * #elif\n" - " * // use a workaround\n" - " * #endif\n" - " */\n" - "#define 
%sHAVE_PROG_TYPE_HELPER(prog_type, helper) \\\n" - " %sBPF__PROG_TYPE_ ## prog_type ## __HELPER_ ## helper\n", - define_prefix, define_prefix, define_prefix, - define_prefix); - for (i = BPF_PROG_TYPE_UNSPEC + 1; i < ARRAY_SIZE(prog_type_name); i++) - probe_helpers_for_progtype(i, supported_types[i], - define_prefix, ifindex); - - print_end_then_start_section("misc", - "Scanning miscellaneous eBPF features...", - "/*** eBPF misc features ***/", - define_prefix); - probe_large_insn_limit(define_prefix, ifindex); + section_program_types(supported_types, define_prefix, ifindex); + section_map_types(define_prefix, ifindex); + section_helpers(supported_types, define_prefix, full_mode, ifindex); + section_misc(define_prefix, ifindex); exit_close_json: - if (json_output) { - /* End current "section" of probes */ - jsonw_end_object(json_wtr); + if (json_output) /* End root object */ jsonw_end_object(json_wtr); - } return 0; } @@ -761,7 +822,7 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %s %s probe [COMPONENT] [macros [prefix PREFIX]]\n" + "Usage: %s %s probe [COMPONENT] [full] [macros [prefix PREFIX]]\n" " %s %s help\n" "\n" " COMPONENT := { kernel | dev NAME }\n" diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 4e75b58d3989..724ef9d941d3 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -76,6 +76,9 @@ static const char * const prog_type_name[] = { [BPF_PROG_TYPE_CGROUP_SYSCTL] = "cgroup_sysctl", [BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE] = "raw_tracepoint_writable", [BPF_PROG_TYPE_CGROUP_SOCKOPT] = "cgroup_sockopt", + [BPF_PROG_TYPE_TRACING] = "tracing", + [BPF_PROG_TYPE_STRUCT_OPS] = "struct_ops", + [BPF_PROG_TYPE_EXT] = "ext", }; extern const char * const map_type_name[]; diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index b352ab041160..1996e67a2f00 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1573,8 +1573,8 @@ static int do_help(int argc, char **argv) " cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n" " cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n" " cgroup/sendmsg4 | cgroup/sendmsg6 | cgroup/recvmsg4 |\n" - " cgroup/recvmsg6 | cgroup/getsockopt |\n" - " cgroup/setsockopt }\n" + " cgroup/recvmsg6 | cgroup/getsockopt | cgroup/setsockopt |\n" + " struct_ops | fentry | fexit | freplace }\n" " ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n" " flow_dissector }\n" " " HELP_SPEC_OPTIONS "\n" diff --git a/tools/testing/selftests/.gitignore b/tools/testing/selftests/.gitignore index 61df01cdf0b2..304fdf1a21dc 100644 --- a/tools/testing/selftests/.gitignore +++ b/tools/testing/selftests/.gitignore @@ -3,4 +3,7 @@ gpiogpio-hammer gpioinclude/ gpiolsgpio tpm2/SpaceTest.log -tpm2/*.pyc + +# Python bytecode and cache +__pycache__/ +*.py[cod] diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 2a583196fa51..2d7f5df33f04 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -20,7 +20,7 @@ CLANG ?= clang LLC ?= llc LLVM_OBJCOPY ?= llvm-objcopy BPF_GCC ?= $(shell command -v bpf-gcc;) -CFLAGS += -g -Wall -O2 $(GENFLAGS) -I$(CURDIR) -I$(APIDIR) \ +CFLAGS += -g -rdynamic -Wall -O2 $(GENFLAGS) -I$(CURDIR) -I$(APIDIR) \ -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) -I$(TOOLSINCDIR) \ -Dbpf_prog_load=bpf_prog_test_load \ -Dbpf_load_program=bpf_test_load_program @@ -62,7 +62,8 @@ TEST_PROGS := test_kmod.sh \ test_tc_tunnel.sh \ test_tc_edt.sh \ test_xdping.sh \ - test_bpftool_build.sh + 
test_bpftool_build.sh \ + test_bpftool.sh TEST_PROGS_EXTENDED := with_addr.sh \ with_tunnels.sh \ diff --git a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c index 68d452bb9fd9..a1dd13b34d4b 100644 --- a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c +++ b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c @@ -509,11 +509,6 @@ static void test_syncookie(int type, sa_family_t family) .pass_on_failure = 0, }; - if (type != SOCK_STREAM) { - test__skip(); - return; - } - /* * +1 for TCP-SYN and * +1 for the TCP-ACK (ack the syncookie) @@ -787,7 +782,7 @@ static const char *sotype_str(int sotype) } } -#define TEST_INIT(fn, ...) { fn, #fn, __VA_ARGS__ } +#define TEST_INIT(fn_, ...) { .fn = fn_, .name = #fn_, __VA_ARGS__ } static void test_config(int sotype, sa_family_t family, bool inany) { @@ -795,19 +790,31 @@ static void test_config(int sotype, sa_family_t family, bool inany) void (*fn)(int sotype, sa_family_t family); const char *name; bool no_inner_map; + int need_sotype; } tests[] = { - TEST_INIT(test_err_inner_map, true /* no_inner_map */), + TEST_INIT(test_err_inner_map, + .no_inner_map = true), TEST_INIT(test_err_skb_data), TEST_INIT(test_err_sk_select_port), TEST_INIT(test_pass), - TEST_INIT(test_syncookie), + TEST_INIT(test_syncookie, + .need_sotype = SOCK_STREAM), TEST_INIT(test_pass_on_err), TEST_INIT(test_detach_bpf), }; char s[MAX_TEST_NAME]; const struct test *t; + /* SOCKMAP/SOCKHASH don't support UDP yet */ + if (sotype == SOCK_DGRAM && + (inner_map_type == BPF_MAP_TYPE_SOCKMAP || + inner_map_type == BPF_MAP_TYPE_SOCKHASH)) + return; + for (t = tests; t < tests + ARRAY_SIZE(tests); t++) { + if (t->need_sotype && t->need_sotype != sotype) + continue; /* test not compatible with socket type */ + snprintf(s, sizeof(s), "%s %s/%s %s %s", maptype_str(inner_map_type), family_str(family), sotype_str(sotype), @@ -816,13 +823,6 @@ static void test_config(int sotype, sa_family_t family, bool inany) if (!test__start_subtest(s)) continue; - if (sotype == SOCK_DGRAM && - inner_map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { - /* SOCKMAP/SOCKHASH don't support UDP yet */ - test__skip(); - continue; - } - setup_per_test(sotype, family, inany, t->no_inner_map); t->fn(sotype, family); cleanup_per_test(t->no_inner_map); diff --git a/tools/testing/selftests/bpf/test_bpftool.py b/tools/testing/selftests/bpf/test_bpftool.py new file mode 100644 index 000000000000..4fed2dc25c0a --- /dev/null +++ b/tools/testing/selftests/bpf/test_bpftool.py @@ -0,0 +1,178 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2020 SUSE LLC. 
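
The TEST_INIT() rework in select_reuseport.c above deserves a standalone illustration before the new Python suite continues below: forwarding __VA_ARGS__ into a designated-initializer list lets each table entry override only the optional fields it needs, which is what makes the .need_sotype gate cheap to add. A minimal, self-contained sketch (struct and names hypothetical; builds with GCC/Clang, which accept the empty-__VA_ARGS__ form the kernel relies on):

#include <stdio.h>

struct test {
	const char *name;
	int need_sotype;	/* 0 means "any socket type" */
};

/* Same pattern as the TEST_INIT() macro in the hunk above */
#define TEST_INIT(fn_, ...) { .name = #fn_, __VA_ARGS__ }

static const struct test tests[] = {
	TEST_INIT(test_pass),				/* uses defaults */
	TEST_INIT(test_syncookie, .need_sotype = 1),	/* overrides one field */
};

int main(void)
{
	unsigned int i;

	for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++)
		printf("%s need_sotype=%d\n",
		       tests[i].name, tests[i].need_sotype);
	return 0;
}
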
+ +import collections +import functools +import json +import os +import socket +import subprocess +import unittest + + +# Add the source tree of bpftool and /usr/local/sbin to PATH +cur_dir = os.path.dirname(os.path.realpath(__file__)) +bpftool_dir = os.path.abspath(os.path.join(cur_dir, "..", "..", "..", "..", + "tools", "bpf", "bpftool")) +os.environ["PATH"] = bpftool_dir + ":/usr/local/sbin:" + os.environ["PATH"] + + +class IfaceNotFoundError(Exception): + pass + + +class UnprivilegedUserError(Exception): + pass + + +def _bpftool(args, json=True): + _args = ["bpftool"] + if json: + _args.append("-j") + _args.extend(args) + + return subprocess.check_output(_args) + + +def bpftool(args): + return _bpftool(args, json=False).decode("utf-8") + + +def bpftool_json(args): + res = _bpftool(args) + return json.loads(res) + + +def get_default_iface(): + for iface in socket.if_nameindex(): + if iface[1] != "lo": + return iface[1] + raise IfaceNotFoundError("Could not find any network interface to probe") + + +def default_iface(f): + @functools.wraps(f) + def wrapper(*args, **kwargs): + iface = get_default_iface() + return f(*args, iface, **kwargs) + return wrapper + + +class TestBpftool(unittest.TestCase): + @classmethod + def setUpClass(cls): + if os.getuid() != 0: + raise UnprivilegedUserError( + "This test suite needs root privileges") + + @default_iface + def test_feature_dev_json(self, iface): + unexpected_helpers = [ + "bpf_probe_write_user", + "bpf_trace_printk", + ] + expected_keys = [ + "syscall_config", + "program_types", + "map_types", + "helpers", + "misc", + ] + + res = bpftool_json(["feature", "probe", "dev", iface]) + # Check if the result has all expected keys. + self.assertCountEqual(res.keys(), expected_keys) + # Check if unexpected helpers are not included in helpers probes + # result. + for helpers in res["helpers"].values(): + for unexpected_helper in unexpected_helpers: + self.assertNotIn(unexpected_helper, helpers) + + def test_feature_kernel(self): + test_cases = [ + bpftool_json(["feature", "probe", "kernel"]), + bpftool_json(["feature", "probe"]), + bpftool_json(["feature"]), + ] + unexpected_helpers = [ + "bpf_probe_write_user", + "bpf_trace_printk", + ] + expected_keys = [ + "syscall_config", + "system_config", + "program_types", + "map_types", + "helpers", + "misc", + ] + + for tc in test_cases: + # Check if the result has all expected keys. + self.assertCountEqual(tc.keys(), expected_keys) + # Check if unexpected helpers are not included in helpers probes + # result. + for helpers in tc["helpers"].values(): + for unexpected_helper in unexpected_helpers: + self.assertNotIn(unexpected_helper, helpers) + + def test_feature_kernel_full(self): + test_cases = [ + bpftool_json(["feature", "probe", "kernel", "full"]), + bpftool_json(["feature", "probe", "full"]), + ] + expected_helpers = [ + "bpf_probe_write_user", + "bpf_trace_printk", + ] + + for tc in test_cases: + # Check if expected helpers are included at least once in any + # helpers list for any program type. Unfortunately we cannot assume + # that they will be included in all program types or a specific + # subset of programs. It depends on the kernel version and + # configuration. 
+            found_helpers = False
+
+            for helpers in tc["helpers"].values():
+                if all(expected_helper in helpers
+                       for expected_helper in expected_helpers):
+                    found_helpers = True
+                    break
+
+            self.assertTrue(found_helpers)
+
+    def test_feature_kernel_full_vs_not_full(self):
+        full_res = bpftool_json(["feature", "probe", "full"])
+        not_full_res = bpftool_json(["feature", "probe"])
+        not_full_set = set()
+        full_set = set()
+
+        for helpers in full_res["helpers"].values():
+            for helper in helpers:
+                full_set.add(helper)
+
+        for helpers in not_full_res["helpers"].values():
+            for helper in helpers:
+                not_full_set.add(helper)
+
+        self.assertCountEqual(full_set - not_full_set,
+                              {"bpf_probe_write_user", "bpf_trace_printk"})
+        self.assertCountEqual(not_full_set - full_set, set())
+
+    def test_feature_macros(self):
+        expected_patterns = [
+            r"/\*\*\* System call availability \*\*\*/",
+            r"#define HAVE_BPF_SYSCALL",
+            r"/\*\*\* eBPF program types \*\*\*/",
+            r"#define HAVE.*PROG_TYPE",
+            r"/\*\*\* eBPF map types \*\*\*/",
+            r"#define HAVE.*MAP_TYPE",
+            r"/\*\*\* eBPF helper functions \*\*\*/",
+            r"#define HAVE.*HELPER",
+            r"/\*\*\* eBPF misc features \*\*\*/",
+        ]
+
+        res = bpftool(["feature", "probe", "macros"])
+        for pattern in expected_patterns:
+            self.assertRegex(res, pattern)
diff --git a/tools/testing/selftests/bpf/test_bpftool.sh b/tools/testing/selftests/bpf/test_bpftool.sh
new file mode 100755
index 000000000000..66690778e36d
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_bpftool.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2020 SUSE LLC.
+
+python3 -m unittest -v test_bpftool.TestBpftool
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index bab1e6f1d8f1..a969c77e9456 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -6,6 +6,8 @@
 #include "bpf_rlimit.h"
 #include <argp.h>
 #include <string.h>
+#include <signal.h>
+#include <execinfo.h> /* backtrace */
 
 /* defined in test_progs.h */
 struct test_env env = {};
@@ -617,6 +619,23 @@ int cd_flavor_subdir(const char *exec_name)
 	return chdir(flavor);
 }
 
+#define MAX_BACKTRACE_SZ 128
+void crash_handler(int signum)
+{
+	void *bt[MAX_BACKTRACE_SZ];
+	size_t sz;
+
+	sz = backtrace(bt, ARRAY_SIZE(bt));
+
+	if (env.test)
+		dump_test_log(env.test, true);
+	if (env.stdout)
+		stdio_restore();
+
+	fprintf(stderr, "Caught signal #%d!\nStack trace:\n", signum);
+	backtrace_symbols_fd(bt, sz, STDERR_FILENO);
+}
+
 int main(int argc, char **argv)
 {
 	static const struct argp argp = {
@@ -624,8 +643,14 @@ int main(int argc, char **argv)
 		.parser = parse_arg,
 		.doc = argp_program_doc,
 	};
+	struct sigaction sigact = {
+		.sa_handler = crash_handler,
+		.sa_flags = SA_RESETHAND,
+	};
 	int err, i;
 
+	sigaction(SIGSEGV, &sigact, NULL);
+
 	err = argp_parse(&argp, argc, argv, 0, NULL, &env);
 	if (err)
 		return err;
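
The test_progs.c crash handler above works together with the -rdynamic added to CFLAGS in the Makefile hunk earlier in this series: without -rdynamic, backtrace_symbols_fd() typically prints raw addresses instead of function names for symbols in the main binary. A minimal stand-alone version of the same pattern, stripped of the test-harness specifics (the build line and forced fault are illustrative only):

/* Build: gcc -g -rdynamic crash_demo.c -o crash_demo */
#include <execinfo.h> /* backtrace */
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

#define MAX_BACKTRACE_SZ 128

static void crash_handler(int signum)
{
	void *bt[MAX_BACKTRACE_SZ];
	int sz;

	sz = backtrace(bt, MAX_BACKTRACE_SZ);
	fprintf(stderr, "Caught signal #%d!\nStack trace:\n", signum);
	backtrace_symbols_fd(bt, sz, STDERR_FILENO);
	/* SA_RESETHAND restored the default disposition on entry, so the
	 * re-raised signal terminates the process with the original signal
	 * number once the handler returns.
	 */
	raise(signum);
}

int main(void)
{
	struct sigaction sigact = {
		.sa_handler = crash_handler,
		.sa_flags = SA_RESETHAND,
	};

	sigaction(SIGSEGV, &sigact, NULL);
	*(volatile int *)0 = 0;	/* deliberately fault to demonstrate */
	return 0;
}
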