diff --git a/MAINTAINERS b/MAINTAINERS index 7f4615336fa5..b585e6092a74 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3384,9 +3384,11 @@ F: Documentation/networking/filter.rst F: Documentation/userspace-api/ebpf/ F: arch/*/net/* F: include/linux/bpf* +F: include/linux/btf* F: include/linux/filter.h F: include/trace/events/xdp.h F: include/uapi/linux/bpf* +F: include/uapi/linux/btf* F: include/uapi/linux/filter.h F: kernel/bpf/ F: kernel/trace/bpf_trace.c diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index 0af88622c619..cb6d22439f71 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -662,6 +662,11 @@ static void build_epilogue(struct jit_ctx *ctx) ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative : func) : \ func##_positive) +static bool is_bad_offset(int b_off) +{ + return b_off > 0x1ffff || b_off < -0x20000; +} + static int build_body(struct jit_ctx *ctx) { const struct bpf_prog *prog = ctx->skf; @@ -728,7 +733,10 @@ load_common: /* Load return register on DS for failures */ emit_reg_move(r_ret, r_zero, ctx); /* Return with error */ - emit_b(b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_b(b_off, ctx); emit_nop(ctx); break; case BPF_LD | BPF_W | BPF_IND: @@ -775,8 +783,10 @@ load_ind: emit_jalr(MIPS_R_RA, r_s0, ctx); emit_reg_move(MIPS_R_A0, r_skb, ctx); /* delay slot */ /* Check the error value */ - emit_bcond(MIPS_COND_NE, r_ret, 0, - b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_bcond(MIPS_COND_NE, r_ret, 0, b_off, ctx); emit_reg_move(r_ret, r_zero, ctx); /* We are good */ /* X <- P[1:K] & 0xf */ @@ -855,8 +865,10 @@ load_ind: /* A /= X */ ctx->flags |= SEEN_X | SEEN_A; /* Check if r_X is zero */ - emit_bcond(MIPS_COND_EQ, r_X, r_zero, - b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_bcond(MIPS_COND_EQ, r_X, r_zero, b_off, ctx); emit_load_imm(r_ret, 0, ctx); /* delay slot */ emit_div(r_A, r_X, ctx); break; @@ -864,8 +876,10 @@ load_ind: /* A %= X */ ctx->flags |= SEEN_X | SEEN_A; /* Check if r_X is zero */ - emit_bcond(MIPS_COND_EQ, r_X, r_zero, - b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_bcond(MIPS_COND_EQ, r_X, r_zero, b_off, ctx); emit_load_imm(r_ret, 0, ctx); /* delay slot */ emit_mod(r_A, r_X, ctx); break; @@ -926,7 +940,10 @@ load_ind: break; case BPF_JMP | BPF_JA: /* pc += K */ - emit_b(b_imm(i + k + 1, ctx), ctx); + b_off = b_imm(i + k + 1, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_b(b_off, ctx); emit_nop(ctx); break; case BPF_JMP | BPF_JEQ | BPF_K: @@ -1056,12 +1073,16 @@ jmp_cmp: break; case BPF_RET | BPF_A: ctx->flags |= SEEN_A; - if (i != prog->len - 1) + if (i != prog->len - 1) { /* * If this is not the last instruction * then jump to the epilogue */ - emit_b(b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_b(b_off, ctx); + } emit_reg_move(r_ret, r_A, ctx); /* delay slot */ break; case BPF_RET | BPF_K: @@ -1075,7 +1096,10 @@ jmp_cmp: * If this is not the last instruction * then jump to the epilogue */ - emit_b(b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_b(b_off, ctx); emit_nop(ctx); } break; @@ -1133,8 +1157,10 @@ jmp_cmp: /* Load *dev pointer */ emit_load_ptr(r_s0, r_skb, off, ctx); /* error (0) in the delay slot */ - emit_bcond(MIPS_COND_EQ, r_s0, r_zero, - b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_bcond(MIPS_COND_EQ, r_s0, r_zero, b_off, ctx); emit_reg_move(r_ret, r_zero, ctx); if (code == (BPF_ANC | SKF_AD_IFINDEX)) { BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4); @@ -1244,7 +1270,10 @@ void bpf_jit_compile(struct bpf_prog *fp) /* Generate the actual JIT code */ build_prologue(&ctx); - build_body(&ctx); + if (build_body(&ctx)) { + module_memfree(ctx.target); + goto out; + } build_epilogue(&ctx); /* Update the icache */ diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 0fe6aacef3db..9ea57389c554 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1341,9 +1341,10 @@ st: if (is_imm8(insn->off)) if (insn->imm == (BPF_AND | BPF_FETCH) || insn->imm == (BPF_OR | BPF_FETCH) || insn->imm == (BPF_XOR | BPF_FETCH)) { - u8 *branch_target; bool is64 = BPF_SIZE(insn->code) == BPF_DW; u32 real_src_reg = src_reg; + u32 real_dst_reg = dst_reg; + u8 *branch_target; /* * Can't be implemented with a single x86 insn. @@ -1354,11 +1355,13 @@ st: if (is_imm8(insn->off)) emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0); if (src_reg == BPF_REG_0) real_src_reg = BPF_REG_AX; + if (dst_reg == BPF_REG_0) + real_dst_reg = BPF_REG_AX; branch_target = prog; /* Load old value */ emit_ldx(&prog, BPF_SIZE(insn->code), - BPF_REG_0, dst_reg, insn->off); + BPF_REG_0, real_dst_reg, insn->off); /* * Perform the (commutative) operation locally, * put the result in the AUX_REG. @@ -1369,7 +1372,8 @@ st: if (is_imm8(insn->off)) add_2reg(0xC0, AUX_REG, real_src_reg)); /* Attempt to swap in new value */ err = emit_atomic(&prog, BPF_CMPXCHG, - dst_reg, AUX_REG, insn->off, + real_dst_reg, AUX_REG, + insn->off, BPF_SIZE(insn->code)); if (WARN_ON(err)) return err; @@ -1383,11 +1387,10 @@ st: if (is_imm8(insn->off)) /* Restore R0 after clobbering RAX */ emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX); break; - } err = emit_atomic(&prog, insn->imm, dst_reg, src_reg, - insn->off, BPF_SIZE(insn->code)); + insn->off, BPF_SIZE(insn->code)); if (err) return err; break; @@ -1744,7 +1747,7 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args, } static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, - struct bpf_prog *p, int stack_size, bool mod_ret) + struct bpf_prog *p, int stack_size, bool save_ret) { u8 *prog = *pprog; u8 *jmp_insn; @@ -1777,11 +1780,15 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (emit_call(&prog, p->bpf_func, prog)) return -EINVAL; - /* BPF_TRAMP_MODIFY_RETURN trampolines can modify the return + /* + * BPF_TRAMP_MODIFY_RETURN trampolines can modify the return * of the previous call which is then passed on the stack to * the next BPF program. + * + * BPF_TRAMP_FENTRY trampoline may need to return the return + * value of BPF_PROG_TYPE_STRUCT_OPS prog. */ - if (mod_ret) + if (save_ret) emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); /* replace 2 nops with JE insn, since jmp target is known */ @@ -1828,13 +1835,15 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) } static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, - struct bpf_tramp_progs *tp, int stack_size) + struct bpf_tramp_progs *tp, int stack_size, + bool save_ret) { int i; u8 *prog = *pprog; for (i = 0; i < tp->nr_progs; i++) { - if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, false)) + if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, + save_ret)) return -EINVAL; } *pprog = prog; @@ -1877,6 +1886,23 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, return 0; } +static bool is_valid_bpf_tramp_flags(unsigned int flags) +{ + if ((flags & BPF_TRAMP_F_RESTORE_REGS) && + (flags & BPF_TRAMP_F_SKIP_FRAME)) + return false; + + /* + * BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops, + * and it must be used alone. + */ + if ((flags & BPF_TRAMP_F_RET_FENTRY_RET) && + (flags & ~BPF_TRAMP_F_RET_FENTRY_RET)) + return false; + + return true; +} + /* Example: * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); * its 'struct btf_func_model' will be nr_args=2 @@ -1949,17 +1975,19 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i struct bpf_tramp_progs *fmod_ret = &tprogs[BPF_TRAMP_MODIFY_RETURN]; u8 **branches = NULL; u8 *prog; + bool save_ret; /* x86-64 supports up to 6 arguments. 7+ can be added in the future */ if (nr_args > 6) return -ENOTSUPP; - if ((flags & BPF_TRAMP_F_RESTORE_REGS) && - (flags & BPF_TRAMP_F_SKIP_FRAME)) + if (!is_valid_bpf_tramp_flags(flags)) return -EINVAL; - if (flags & BPF_TRAMP_F_CALL_ORIG) - stack_size += 8; /* room for return value of orig_call */ + /* room for return value of orig_call or fentry prog */ + save_ret = flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET); + if (save_ret) + stack_size += 8; if (flags & BPF_TRAMP_F_IP_ARG) stack_size += 8; /* room for IP address argument */ @@ -2005,7 +2033,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } if (fentry->nr_progs) - if (invoke_bpf(m, &prog, fentry, stack_size)) + if (invoke_bpf(m, &prog, fentry, stack_size, + flags & BPF_TRAMP_F_RET_FENTRY_RET)) return -EINVAL; if (fmod_ret->nr_progs) { @@ -2052,7 +2081,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } if (fexit->nr_progs) - if (invoke_bpf(m, &prog, fexit, stack_size)) { + if (invoke_bpf(m, &prog, fexit, stack_size, false)) { ret = -EINVAL; goto cleanup; } @@ -2072,9 +2101,10 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i ret = -EINVAL; goto cleanup; } - /* restore original return value back into RAX */ - emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); } + /* restore return value of orig_call or fentry prog back into RAX */ + if (save_ret) + emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); EMIT1(0x5B); /* pop rbx */ EMIT1(0xC9); /* leave */ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f4c16f19f83e..020a7d5bf470 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -578,11 +578,12 @@ struct btf_func_model { * programs only. Should not be used with normal calls and indirect calls. */ #define BPF_TRAMP_F_SKIP_FRAME BIT(2) - /* Store IP address of the caller on the trampoline stack, * so it's available for trampoline's programs. */ #define BPF_TRAMP_F_IP_ARG BIT(3) +/* Return the return value of fentry prog. Only used by bpf_struct_ops. */ +#define BPF_TRAMP_F_RET_FENTRY_RET BIT(4) /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2 diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index d6731c32864e..9abcc33f02cf 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -368,6 +368,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, const struct btf_type *mtype, *ptype; struct bpf_prog *prog; u32 moff; + u32 flags; moff = btf_member_bit_offset(t, member) / 8; ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL); @@ -431,10 +432,12 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, tprogs[BPF_TRAMP_FENTRY].progs[0] = prog; tprogs[BPF_TRAMP_FENTRY].nr_progs = 1; + flags = st_ops->func_models[i].ret_size > 0 ? + BPF_TRAMP_F_RET_FENTRY_RET : 0; err = arch_prepare_bpf_trampoline(NULL, image, st_map->image + PAGE_SIZE, - &st_ops->func_models[i], 0, - tprogs, NULL); + &st_ops->func_models[i], + flags, tprogs, NULL); if (err < 0) goto reset_unlock; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9f4636d021b1..d6b7dfdd8066 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -827,7 +827,7 @@ int bpf_jit_charge_modmem(u32 pages) { if (atomic_long_add_return(pages, &bpf_jit_current) > (bpf_jit_limit >> PAGE_SHIFT)) { - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { atomic_long_sub(pages, &bpf_jit_current); return -EPERM; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8afa8690d288..570b0c97392a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6574,22 +6574,29 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { - /* Don't associate the sock with unrelated interrupted task's cgroup. */ - if (in_interrupt()) - return; + struct cgroup *cgroup; rcu_read_lock(); + /* Don't associate the sock with unrelated interrupted task's cgroup. */ + if (in_interrupt()) { + cgroup = &cgrp_dfl_root.cgrp; + cgroup_get(cgroup); + goto out; + } + while (true) { struct css_set *cset; cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { - skcd->cgroup = cset->dfl_cgrp; - cgroup_bpf_get(cset->dfl_cgrp); + cgroup = cset->dfl_cgrp; break; } cpu_relax(); } +out: + skcd->cgroup = cgroup; + cgroup_bpf_get(cgroup); rcu_read_unlock(); } diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 2eb0e55ef54d..b5f4ef35357c 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -552,6 +552,12 @@ static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) __skb->gso_segs = skb_shinfo(skb)->gso_segs; } +static struct proto bpf_dummy_proto = { + .name = "bpf_dummy", + .owner = THIS_MODULE, + .obj_size = sizeof(struct sock), +}; + int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { @@ -596,20 +602,19 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, break; } - sk = kzalloc(sizeof(struct sock), GFP_USER); + sk = sk_alloc(net, AF_UNSPEC, GFP_USER, &bpf_dummy_proto, 1); if (!sk) { kfree(data); kfree(ctx); return -ENOMEM; } - sock_net_set(sk, net); sock_init_data(NULL, sk); skb = build_skb(data, 0); if (!skb) { kfree(data); kfree(ctx); - kfree(sk); + sk_free(sk); return -ENOMEM; } skb->sk = sk; @@ -682,8 +687,7 @@ out: if (dev && dev != net->loopback_dev) dev_put(dev); kfree_skb(skb); - bpf_sk_storage_free(sk); - kfree(sk); + sk_free(sk); kfree(ctx); return ret; } diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 10911a8cad0f..2df880cefdae 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -1649,11 +1649,17 @@ static bool btf_is_non_static(const struct btf_type *t) static int find_glob_sym_btf(struct src_obj *obj, Elf64_Sym *sym, const char *sym_name, int *out_btf_sec_id, int *out_btf_id) { - int i, j, n = btf__get_nr_types(obj->btf), m, btf_id = 0; + int i, j, n, m, btf_id = 0; const struct btf_type *t; const struct btf_var_secinfo *vi; const char *name; + if (!obj->btf) { + pr_warn("failed to find BTF info for object '%s'\n", obj->filename); + return -EINVAL; + } + + n = btf__get_nr_types(obj->btf); for (i = 1; i <= n; i++) { t = btf__type_by_id(obj->btf, i); diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 866531c08e4f..799b88152e9e 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -375,7 +375,8 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $(TRUNNER_BPF_PROGS_DIR)/%.c \ $(TRUNNER_BPF_PROGS_DIR)/*.h \ $$(INCLUDE_DIR)/vmlinux.h \ - $(wildcard $(BPFDIR)/bpf_*.h) | $(TRUNNER_OUTPUT) + $(wildcard $(BPFDIR)/bpf_*.h) \ + | $(TRUNNER_OUTPUT) $$(BPFOBJ) $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS)) diff --git a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh index 59ea56945e6c..b497bb85b667 100755 --- a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh +++ b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh @@ -112,6 +112,14 @@ setup() ip netns add "${NS2}" ip netns add "${NS3}" + # rp_filter gets confused by what these tests are doing, so disable it + ip netns exec ${NS1} sysctl -wq net.ipv4.conf.all.rp_filter=0 + ip netns exec ${NS2} sysctl -wq net.ipv4.conf.all.rp_filter=0 + ip netns exec ${NS3} sysctl -wq net.ipv4.conf.all.rp_filter=0 + ip netns exec ${NS1} sysctl -wq net.ipv4.conf.default.rp_filter=0 + ip netns exec ${NS2} sysctl -wq net.ipv4.conf.default.rp_filter=0 + ip netns exec ${NS3} sysctl -wq net.ipv4.conf.default.rp_filter=0 + ip link add veth1 type veth peer name veth2 ip link add veth3 type veth peer name veth4 ip link add veth5 type veth peer name veth6 @@ -236,11 +244,6 @@ setup() ip -netns ${NS1} -6 route add ${IPv6_GRE}/128 dev veth5 via ${IPv6_6} ${VRF} ip -netns ${NS2} -6 route add ${IPv6_GRE}/128 dev veth7 via ${IPv6_8} ${VRF} - # rp_filter gets confused by what these tests are doing, so disable it - ip netns exec ${NS1} sysctl -wq net.ipv4.conf.all.rp_filter=0 - ip netns exec ${NS2} sysctl -wq net.ipv4.conf.all.rp_filter=0 - ip netns exec ${NS3} sysctl -wq net.ipv4.conf.all.rp_filter=0 - TMPFILE=$(mktemp /tmp/test_lwt_ip_encap.XXXXXX) sleep 1 # reduce flakiness