diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 2e586f579945..15615c94804f 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -203,8 +203,9 @@ struct jit_context { /* Maximum number of bytes emitted while JITing one eBPF insn */ #define BPF_MAX_INSN_SIZE 128 #define BPF_INSN_SAFETY 64 -/* number of bytes emit_call() needs to generate call instruction */ -#define X86_CALL_SIZE 5 + +/* Number of bytes emit_patch() needs to generate instructions */ +#define X86_PATCH_SIZE 5 #define PROLOGUE_SIZE 25 @@ -215,7 +216,7 @@ struct jit_context { static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) { u8 *prog = *pprog; - int cnt = X86_CALL_SIZE; + int cnt = X86_PATCH_SIZE; /* BPF trampoline can be made to work without these nops, * but let's waste 5 bytes for now and optimize later @@ -238,6 +239,123 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) *pprog = prog; } +static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode) +{ + u8 *prog = *pprog; + int cnt = 0; + s64 offset; + + offset = func - (ip + X86_PATCH_SIZE); + if (!is_simm32(offset)) { + pr_err("Target call %p is out of range\n", func); + return -ERANGE; + } + EMIT1_off32(opcode, offset); + *pprog = prog; + return 0; +} + +static int emit_call(u8 **pprog, void *func, void *ip) +{ + return emit_patch(pprog, func, ip, 0xE8); +} + +static int emit_jump(u8 **pprog, void *func, void *ip) +{ + return emit_patch(pprog, func, ip, 0xE9); +} + +static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, + void *old_addr, void *new_addr, + const bool text_live) +{ + int (*emit_patch_fn)(u8 **pprog, void *func, void *ip); + const u8 *nop_insn = ideal_nops[NOP_ATOMIC5]; + u8 old_insn[X86_PATCH_SIZE] = {}; + u8 new_insn[X86_PATCH_SIZE] = {}; + u8 *prog; + int ret; + + switch (t) { + case BPF_MOD_NOP_TO_CALL ... BPF_MOD_CALL_TO_NOP: + emit_patch_fn = emit_call; + break; + case BPF_MOD_NOP_TO_JUMP ... BPF_MOD_JUMP_TO_NOP: + emit_patch_fn = emit_jump; + break; + default: + return -ENOTSUPP; + } + + switch (t) { + case BPF_MOD_NOP_TO_CALL: + case BPF_MOD_NOP_TO_JUMP: + if (!old_addr && new_addr) { + memcpy(old_insn, nop_insn, X86_PATCH_SIZE); + + prog = new_insn; + ret = emit_patch_fn(&prog, new_addr, ip); + if (ret) + return ret; + break; + } + return -ENXIO; + case BPF_MOD_CALL_TO_CALL: + case BPF_MOD_JUMP_TO_JUMP: + if (old_addr && new_addr) { + prog = old_insn; + ret = emit_patch_fn(&prog, old_addr, ip); + if (ret) + return ret; + + prog = new_insn; + ret = emit_patch_fn(&prog, new_addr, ip); + if (ret) + return ret; + break; + } + return -ENXIO; + case BPF_MOD_CALL_TO_NOP: + case BPF_MOD_JUMP_TO_NOP: + if (old_addr && !new_addr) { + memcpy(new_insn, nop_insn, X86_PATCH_SIZE); + + prog = old_insn; + ret = emit_patch_fn(&prog, old_addr, ip); + if (ret) + return ret; + break; + } + return -ENXIO; + default: + return -ENOTSUPP; + } + + ret = -EBUSY; + mutex_lock(&text_mutex); + if (memcmp(ip, old_insn, X86_PATCH_SIZE)) + goto out; + if (text_live) + text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); + else + memcpy(ip, new_insn, X86_PATCH_SIZE); + ret = 0; +out: + mutex_unlock(&text_mutex); + return ret; +} + +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, + void *old_addr, void *new_addr) +{ + if (!is_kernel_text((long)ip) && + !is_bpf_text_address((long)ip)) + /* BPF poking in modules is not supported */ + return -EINVAL; + + return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true); +} + /* * Generate the following code: * @@ -252,7 +370,7 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) * goto *(prog->bpf_func + prologue_size); * out: */ -static void emit_bpf_tail_call(u8 **pprog) +static void emit_bpf_tail_call_indirect(u8 **pprog) { u8 *prog = *pprog; int label1, label2, label3; @@ -319,6 +437,69 @@ static void emit_bpf_tail_call(u8 **pprog) *pprog = prog; } +static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, + u8 **pprog, int addr, u8 *image) +{ + u8 *prog = *pprog; + int cnt = 0; + + /* + * if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * goto out; + */ + EMIT2_off32(0x8B, 0x85, -36 - MAX_BPF_STACK); /* mov eax, dword ptr [rbp - 548] */ + EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ + EMIT2(X86_JA, 14); /* ja out */ + EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ + EMIT2_off32(0x89, 0x85, -36 - MAX_BPF_STACK); /* mov dword ptr [rbp -548], eax */ + + poke->ip = image + (addr - X86_PATCH_SIZE); + poke->adj_off = PROLOGUE_SIZE; + + memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE); + prog += X86_PATCH_SIZE; + /* out: */ + + *pprog = prog; +} + +static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) +{ + static const enum bpf_text_poke_type type = BPF_MOD_NOP_TO_JUMP; + struct bpf_jit_poke_descriptor *poke; + struct bpf_array *array; + struct bpf_prog *target; + int i, ret; + + for (i = 0; i < prog->aux->size_poke_tab; i++) { + poke = &prog->aux->poke_tab[i]; + WARN_ON_ONCE(READ_ONCE(poke->ip_stable)); + + if (poke->reason != BPF_POKE_REASON_TAIL_CALL) + continue; + + array = container_of(poke->tail_call.map, struct bpf_array, map); + mutex_lock(&array->aux->poke_mutex); + target = array->ptrs[poke->tail_call.key]; + if (target) { + /* Plain memcpy is used when image is not live yet + * and still not locked as read-only. Once poke + * location is active (poke->ip_stable), any parallel + * bpf_arch_text_poke() might occur still on the + * read-write image until we finally locked it as + * read-only. Both modifications on the given image + * are under text_mutex to avoid interference. + */ + ret = __bpf_arch_text_poke(poke->ip, type, NULL, + (u8 *)target->bpf_func + + poke->adj_off, false); + BUG_ON(ret < 0); + } + WRITE_ONCE(poke->ip_stable, true); + mutex_unlock(&array->aux->poke_mutex); + } +} + static void emit_mov_imm32(u8 **pprog, bool sign_propagate, u32 dst_reg, const u32 imm32) { @@ -480,72 +661,6 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) *pprog = prog; } -static int emit_call(u8 **pprog, void *func, void *ip) -{ - u8 *prog = *pprog; - int cnt = 0; - s64 offset; - - offset = func - (ip + X86_CALL_SIZE); - if (!is_simm32(offset)) { - pr_err("Target call %p is out of range\n", func); - return -EINVAL; - } - EMIT1_off32(0xE8, offset); - *pprog = prog; - return 0; -} - -int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, - void *old_addr, void *new_addr) -{ - u8 old_insn[X86_CALL_SIZE] = {}; - u8 new_insn[X86_CALL_SIZE] = {}; - u8 *prog; - int ret; - - if (!is_kernel_text((long)ip) && - !is_bpf_text_address((long)ip)) - /* BPF trampoline in modules is not supported */ - return -EINVAL; - - if (old_addr) { - prog = old_insn; - ret = emit_call(&prog, old_addr, (void *)ip); - if (ret) - return ret; - } - if (new_addr) { - prog = new_insn; - ret = emit_call(&prog, new_addr, (void *)ip); - if (ret) - return ret; - } - ret = -EBUSY; - mutex_lock(&text_mutex); - switch (t) { - case BPF_MOD_NOP_TO_CALL: - if (memcmp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE)) - goto out; - text_poke_bp(ip, new_insn, X86_CALL_SIZE, NULL); - break; - case BPF_MOD_CALL_TO_CALL: - if (memcmp(ip, old_insn, X86_CALL_SIZE)) - goto out; - text_poke_bp(ip, new_insn, X86_CALL_SIZE, NULL); - break; - case BPF_MOD_CALL_TO_NOP: - if (memcmp(ip, old_insn, X86_CALL_SIZE)) - goto out; - text_poke_bp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE, NULL); - break; - } - ret = 0; -out: - mutex_unlock(&text_mutex); - return ret; -} - static bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr) @@ -1013,7 +1128,11 @@ xadd: if (is_imm8(insn->off)) break; case BPF_JMP | BPF_TAIL_CALL: - emit_bpf_tail_call(&prog); + if (imm32) + emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1], + &prog, addrs[i], image); + else + emit_bpf_tail_call_indirect(&prog); break; /* cond jump */ @@ -1394,7 +1513,7 @@ int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags /* skip patched call instruction and point orig_call to actual * body of the kernel function. */ - orig_call += X86_CALL_SIZE; + orig_call += X86_PATCH_SIZE; prog = image; @@ -1571,6 +1690,7 @@ out_image: if (image) { if (!prog->is_func || extra_pass) { + bpf_tail_call_direct_fixup(prog); bpf_jit_binary_lock_ro(header); } else { jit_data->addrs = addrs; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e89e86122233..c2f07fd410c1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -22,6 +22,7 @@ struct bpf_verifier_env; struct bpf_verifier_log; struct perf_event; struct bpf_prog; +struct bpf_prog_aux; struct bpf_map; struct sock; struct seq_file; @@ -64,6 +65,12 @@ struct bpf_map_ops { const struct btf_type *key_type, const struct btf_type *value_type); + /* Prog poke tracking helpers. */ + int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux); + void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux); + void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old, + struct bpf_prog *new); + /* Direct value access helpers. */ int (*map_direct_value_addr)(const struct bpf_map *map, u64 *imm, u32 off); @@ -488,6 +495,24 @@ struct bpf_func_info_aux { bool unreliable; }; +enum bpf_jit_poke_reason { + BPF_POKE_REASON_TAIL_CALL, +}; + +/* Descriptor of pokes pointing /into/ the JITed image. */ +struct bpf_jit_poke_descriptor { + void *ip; + union { + struct { + struct bpf_map *map; + u32 key; + } tail_call; + }; + bool ip_stable; + u8 adj_off; + u16 reason; +}; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; @@ -513,6 +538,8 @@ struct bpf_prog_aux { const char *attach_func_name; struct bpf_prog **func; void *jit_data; /* JIT specific data. arch dependent */ + struct bpf_jit_poke_descriptor *poke_tab; + u32 size_poke_tab; struct latch_tree_node ksym_tnode; struct list_head ksym_lnode; const struct bpf_prog_ops *ops; @@ -560,17 +587,26 @@ struct bpf_prog_aux { }; }; +struct bpf_array_aux { + /* 'Ownership' of prog array is claimed by the first program that + * is going to use this map or by the first program which FD is + * stored in the map to make sure that all callers and callees have + * the same prog type and JITed flag. + */ + enum bpf_prog_type type; + bool jited; + /* Programs with direct jumps into programs part of this array. */ + struct list_head poke_progs; + struct bpf_map *map; + struct mutex poke_mutex; + struct work_struct work; +}; + struct bpf_array { struct bpf_map map; u32 elem_size; u32 index_mask; - /* 'ownership' of prog_array is claimed by the first program that - * is going to use this map or by the first program which FD is stored - * in the map to make sure that all callers and callees have the same - * prog_type and JITed flag - */ - enum bpf_prog_type owner_prog_type; - bool owner_jited; + struct bpf_array_aux *aux; union { char value[0] __aligned(8); void *ptrs[0] __aligned(8); @@ -1031,6 +1067,10 @@ static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, { return -ENOTSUPP; } + +static inline void bpf_map_put(struct bpf_map *map) +{ +} #endif /* CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, @@ -1284,10 +1324,16 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, #endif /* CONFIG_INET */ enum bpf_text_poke_type { + /* All call-related pokes. */ BPF_MOD_NOP_TO_CALL, BPF_MOD_CALL_TO_CALL, BPF_MOD_CALL_TO_NOP, + /* All jump-related pokes. */ + BPF_MOD_NOP_TO_JUMP, + BPF_MOD_JUMP_TO_JUMP, + BPF_MOD_JUMP_TO_NOP, }; + int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index cdd08bf0ec06..26e40de9ef55 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -293,7 +293,7 @@ struct bpf_verifier_state_list { struct bpf_insn_aux_data { union { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ - unsigned long map_state; /* pointer/poison value for maps */ + unsigned long map_ptr_state; /* pointer/poison value for maps */ s32 call_imm; /* saved imm field of call insn */ u32 alu_limit; /* limit for add/sub register with pointer */ struct { @@ -301,6 +301,7 @@ struct bpf_insn_aux_data { u32 map_off; /* offset from value base address */ }; }; + u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ int sanitize_stack_off; /* stack slot to be cleared */ bool seen; /* this insn was processed by the verifier */ diff --git a/include/linux/filter.h b/include/linux/filter.h index ad80e9c6111c..796b60d8cc6c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -952,6 +952,9 @@ void *bpf_jit_alloc_exec(unsigned long size); void bpf_jit_free_exec(void *addr); void bpf_jit_free(struct bpf_prog *fp); +int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, + struct bpf_jit_poke_descriptor *poke); + int bpf_jit_get_func_addr(const struct bpf_prog *prog, const struct bpf_insn *insn, bool extra_pass, u64 *func_addr, bool *func_addr_fixed); @@ -1055,6 +1058,13 @@ static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) return false; } +static inline int +bpf_jit_add_poke_descriptor(struct bpf_prog *prog, + struct bpf_jit_poke_descriptor *poke) +{ + return -ENOTSUPP; +} + static inline void bpf_jit_free(struct bpf_prog *fp) { bpf_prog_unlock_free(fp); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 633c8c701ff6..58bdf5fd24cc 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -586,10 +586,17 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, if (IS_ERR(new_ptr)) return PTR_ERR(new_ptr); - old_ptr = xchg(array->ptrs + index, new_ptr); + if (map->ops->map_poke_run) { + mutex_lock(&array->aux->poke_mutex); + old_ptr = xchg(array->ptrs + index, new_ptr); + map->ops->map_poke_run(map, index, old_ptr, new_ptr); + mutex_unlock(&array->aux->poke_mutex); + } else { + old_ptr = xchg(array->ptrs + index, new_ptr); + } + if (old_ptr) map->ops->map_fd_put_ptr(old_ptr); - return 0; } @@ -602,7 +609,15 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key) if (index >= array->map.max_entries) return -E2BIG; - old_ptr = xchg(array->ptrs + index, NULL); + if (map->ops->map_poke_run) { + mutex_lock(&array->aux->poke_mutex); + old_ptr = xchg(array->ptrs + index, NULL); + map->ops->map_poke_run(map, index, old_ptr, NULL); + mutex_unlock(&array->aux->poke_mutex); + } else { + old_ptr = xchg(array->ptrs + index, NULL); + } + if (old_ptr) { map->ops->map_fd_put_ptr(old_ptr); return 0; @@ -671,17 +686,205 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +struct prog_poke_elem { + struct list_head list; + struct bpf_prog_aux *aux; +}; + +static int prog_array_map_poke_track(struct bpf_map *map, + struct bpf_prog_aux *prog_aux) +{ + struct prog_poke_elem *elem; + struct bpf_array_aux *aux; + int ret = 0; + + aux = container_of(map, struct bpf_array, map)->aux; + mutex_lock(&aux->poke_mutex); + list_for_each_entry(elem, &aux->poke_progs, list) { + if (elem->aux == prog_aux) + goto out; + } + + elem = kmalloc(sizeof(*elem), GFP_KERNEL); + if (!elem) { + ret = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&elem->list); + /* We must track the program's aux info at this point in time + * since the program pointer itself may not be stable yet, see + * also comment in prog_array_map_poke_run(). + */ + elem->aux = prog_aux; + + list_add_tail(&elem->list, &aux->poke_progs); +out: + mutex_unlock(&aux->poke_mutex); + return ret; +} + +static void prog_array_map_poke_untrack(struct bpf_map *map, + struct bpf_prog_aux *prog_aux) +{ + struct prog_poke_elem *elem, *tmp; + struct bpf_array_aux *aux; + + aux = container_of(map, struct bpf_array, map)->aux; + mutex_lock(&aux->poke_mutex); + list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { + if (elem->aux == prog_aux) { + list_del_init(&elem->list); + kfree(elem); + break; + } + } + mutex_unlock(&aux->poke_mutex); +} + +static void prog_array_map_poke_run(struct bpf_map *map, u32 key, + struct bpf_prog *old, + struct bpf_prog *new) +{ + enum bpf_text_poke_type type; + struct prog_poke_elem *elem; + struct bpf_array_aux *aux; + + if (!old && new) + type = BPF_MOD_NOP_TO_JUMP; + else if (old && !new) + type = BPF_MOD_JUMP_TO_NOP; + else if (old && new) + type = BPF_MOD_JUMP_TO_JUMP; + else + return; + + aux = container_of(map, struct bpf_array, map)->aux; + WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex)); + + list_for_each_entry(elem, &aux->poke_progs, list) { + struct bpf_jit_poke_descriptor *poke; + int i, ret; + + for (i = 0; i < elem->aux->size_poke_tab; i++) { + poke = &elem->aux->poke_tab[i]; + + /* Few things to be aware of: + * + * 1) We can only ever access aux in this context, but + * not aux->prog since it might not be stable yet and + * there could be danger of use after free otherwise. + * 2) Initially when we start tracking aux, the program + * is not JITed yet and also does not have a kallsyms + * entry. We skip these as poke->ip_stable is not + * active yet. The JIT will do the final fixup before + * setting it stable. The various poke->ip_stable are + * successively activated, so tail call updates can + * arrive from here while JIT is still finishing its + * final fixup for non-activated poke entries. + * 3) On program teardown, the program's kallsym entry gets + * removed out of RCU callback, but we can only untrack + * from sleepable context, therefore bpf_arch_text_poke() + * might not see that this is in BPF text section and + * bails out with -EINVAL. As these are unreachable since + * RCU grace period already passed, we simply skip them. + * 4) Also programs reaching refcount of zero while patching + * is in progress is okay since we're protected under + * poke_mutex and untrack the programs before the JIT + * buffer is freed. When we're still in the middle of + * patching and suddenly kallsyms entry of the program + * gets evicted, we just skip the rest which is fine due + * to point 3). + * 5) Any other error happening below from bpf_arch_text_poke() + * is a unexpected bug. + */ + if (!READ_ONCE(poke->ip_stable)) + continue; + if (poke->reason != BPF_POKE_REASON_TAIL_CALL) + continue; + if (poke->tail_call.map != map || + poke->tail_call.key != key) + continue; + + ret = bpf_arch_text_poke(poke->ip, type, + old ? (u8 *)old->bpf_func + + poke->adj_off : NULL, + new ? (u8 *)new->bpf_func + + poke->adj_off : NULL); + BUG_ON(ret < 0 && ret != -EINVAL); + } + } +} + +static void prog_array_map_clear_deferred(struct work_struct *work) +{ + struct bpf_map *map = container_of(work, struct bpf_array_aux, + work)->map; + bpf_fd_array_map_clear(map); + bpf_map_put(map); +} + +static void prog_array_map_clear(struct bpf_map *map) +{ + struct bpf_array_aux *aux = container_of(map, struct bpf_array, + map)->aux; + bpf_map_inc(map); + schedule_work(&aux->work); +} + +static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) +{ + struct bpf_array_aux *aux; + struct bpf_map *map; + + aux = kzalloc(sizeof(*aux), GFP_KERNEL); + if (!aux) + return ERR_PTR(-ENOMEM); + + INIT_WORK(&aux->work, prog_array_map_clear_deferred); + INIT_LIST_HEAD(&aux->poke_progs); + mutex_init(&aux->poke_mutex); + + map = array_map_alloc(attr); + if (IS_ERR(map)) { + kfree(aux); + return map; + } + + container_of(map, struct bpf_array, map)->aux = aux; + aux->map = map; + + return map; +} + +static void prog_array_map_free(struct bpf_map *map) +{ + struct prog_poke_elem *elem, *tmp; + struct bpf_array_aux *aux; + + aux = container_of(map, struct bpf_array, map)->aux; + list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { + list_del_init(&elem->list); + kfree(elem); + } + kfree(aux); + fd_array_map_free(map); +} + const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, - .map_alloc = array_map_alloc, - .map_free = fd_array_map_free, + .map_alloc = prog_array_map_alloc, + .map_free = prog_array_map_free, + .map_poke_track = prog_array_map_poke_track, + .map_poke_untrack = prog_array_map_poke_untrack, + .map_poke_run = prog_array_map_poke_run, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, - .map_release_uref = bpf_fd_array_map_clear, + .map_release_uref = prog_array_map_clear, .map_seq_show_elem = prog_array_map_seq_show_elem, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b5945c3aaa8e..49e32acad7d8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -256,6 +256,7 @@ void __bpf_prog_free(struct bpf_prog *fp) { if (fp->aux) { free_percpu(fp->aux->stats); + kfree(fp->aux->poke_tab); kfree(fp->aux); } vfree(fp); @@ -756,6 +757,39 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, return ret; } +int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, + struct bpf_jit_poke_descriptor *poke) +{ + struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; + static const u32 poke_tab_max = 1024; + u32 slot = prog->aux->size_poke_tab; + u32 size = slot + 1; + + if (size > poke_tab_max) + return -ENOSPC; + if (poke->ip || poke->ip_stable || poke->adj_off) + return -EINVAL; + + switch (poke->reason) { + case BPF_POKE_REASON_TAIL_CALL: + if (!poke->tail_call.map) + return -EINVAL; + break; + default: + return -EINVAL; + } + + tab = krealloc(tab, size * sizeof(*poke), GFP_KERNEL); + if (!tab) + return -ENOMEM; + + memcpy(&tab[slot], poke, sizeof(*poke)); + prog->aux->size_poke_tab = size; + prog->aux->poke_tab = tab; + + return slot; +} + static atomic_long_t bpf_jit_current; /* Can be overridden by an arch's JIT compiler if it has a custom, @@ -1691,18 +1725,17 @@ bool bpf_prog_array_compatible(struct bpf_array *array, if (fp->kprobe_override) return false; - if (!array->owner_prog_type) { + if (!array->aux->type) { /* There's no owner yet where we could check for * compatibility. */ - array->owner_prog_type = fp->type; - array->owner_jited = fp->jited; - + array->aux->type = fp->type; + array->aux->jited = fp->jited; return true; } - return array->owner_prog_type == fp->type && - array->owner_jited == fp->jited; + return array->aux->type == fp->type && + array->aux->jited == fp->jited; } static int bpf_check_tail_call(const struct bpf_prog *fp) @@ -2003,12 +2036,40 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array, : 0; } +static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux) +{ + enum bpf_cgroup_storage_type stype; + + for_each_cgroup_storage_type(stype) { + if (!aux->cgroup_storage[stype]) + continue; + bpf_cgroup_storage_release(aux->prog, + aux->cgroup_storage[stype]); + } +} + +static void bpf_free_used_maps(struct bpf_prog_aux *aux) +{ + struct bpf_map *map; + int i; + + bpf_free_cgroup_storage(aux); + for (i = 0; i < aux->used_map_cnt; i++) { + map = aux->used_maps[i]; + if (map->ops->map_poke_untrack) + map->ops->map_poke_untrack(map, aux); + bpf_map_put(map); + } + kfree(aux->used_maps); +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; int i; aux = container_of(work, struct bpf_prog_aux, work); + bpf_free_used_maps(aux); if (bpf_prog_is_dev_bound(aux)) bpf_prog_offload_destroy(aux->prog); #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 4cbe987be35b..5e9366b33f0f 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -17,9 +17,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) if (IS_ERR(inner_map)) return inner_map; - /* prog_array->owner_prog_type and owner_jited - * is a runtime binding. Doing static check alone - * in the verifier is not enough. + /* prog_array->aux->{type,jited} is a runtime binding. + * Doing static check alone in the verifier is not enough. */ if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4ae52eb05f41..e3461ec59570 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -25,12 +25,13 @@ #include #include -#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ - (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ - (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ - (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) +#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ + (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ + (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) +#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY) #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) -#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map)) +#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \ + IS_FD_HASH(map)) #define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) @@ -389,13 +390,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_map *map = filp->private_data; const struct bpf_array *array; - u32 owner_prog_type = 0; - u32 owner_jited = 0; + u32 type = 0, jited = 0; if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { array = container_of(map, struct bpf_array, map); - owner_prog_type = array->owner_prog_type; - owner_jited = array->owner_jited; + type = array->aux->type; + jited = array->aux->jited; } seq_printf(m, @@ -415,12 +415,9 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) map->memory.pages * 1ULL << PAGE_SHIFT, map->id, READ_ONCE(map->frozen)); - - if (owner_prog_type) { - seq_printf(m, "owner_prog_type:\t%u\n", - owner_prog_type); - seq_printf(m, "owner_jited:\t%u\n", - owner_jited); + if (type) { + seq_printf(m, "owner_prog_type:\t%u\n", type); + seq_printf(m, "owner_jited:\t%u\n", jited); } } #endif @@ -881,7 +878,7 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_cgroup_storage_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); - } else if (IS_FD_ARRAY(map)) { + } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { err = bpf_fd_array_map_lookup_elem(map, key, value); } else if (IS_FD_HASH(map)) { err = bpf_fd_htab_map_lookup_elem(map, key, value); @@ -1008,6 +1005,10 @@ static int map_update_elem(union bpf_attr *attr) map->map_type == BPF_MAP_TYPE_SOCKMAP) { err = map->ops->map_update_elem(map, key, value, attr->flags); goto out; + } else if (IS_FD_PROG_ARRAY(map)) { + err = bpf_fd_array_map_update_elem(map, f.file, key, value, + attr->flags); + goto out; } /* must increment bpf_prog_active to avoid kprobe+bpf triggering from @@ -1090,6 +1091,9 @@ static int map_delete_elem(union bpf_attr *attr) if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_delete_elem(map, key); goto out; + } else if (IS_FD_PROG_ARRAY(map)) { + err = map->ops->map_delete_elem(map, key); + goto out; } preempt_disable(); @@ -1302,25 +1306,6 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) return 0; } -/* drop refcnt on maps used by eBPF program and free auxilary data */ -static void free_used_maps(struct bpf_prog_aux *aux) -{ - enum bpf_cgroup_storage_type stype; - int i; - - for_each_cgroup_storage_type(stype) { - if (!aux->cgroup_storage[stype]) - continue; - bpf_cgroup_storage_release(aux->prog, - aux->cgroup_storage[stype]); - } - - for (i = 0; i < aux->used_map_cnt; i++) - bpf_map_put(aux->used_maps[i]); - - kfree(aux->used_maps); -} - int __bpf_prog_charge(struct user_struct *user, u32 pages) { unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -1415,7 +1400,6 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) kvfree(aux->func_info); kfree(aux->func_info_aux); - free_used_maps(aux); bpf_prog_uncharge_memlock(aux->prog); security_bpf_prog_free(aux); bpf_prog_free(aux->prog); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fc85714428c7..a0482e1c4a77 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -171,6 +171,9 @@ struct bpf_verifier_stack_elem { #define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192 #define BPF_COMPLEXITY_LIMIT_STATES 64 +#define BPF_MAP_KEY_POISON (1ULL << 63) +#define BPF_MAP_KEY_SEEN (1ULL << 62) + #define BPF_MAP_PTR_UNPRIV 1UL #define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \ POISON_POINTER_DELTA)) @@ -178,12 +181,12 @@ struct bpf_verifier_stack_elem { static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) { - return BPF_MAP_PTR(aux->map_state) == BPF_MAP_PTR_POISON; + return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON; } static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux) { - return aux->map_state & BPF_MAP_PTR_UNPRIV; + return aux->map_ptr_state & BPF_MAP_PTR_UNPRIV; } static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux, @@ -191,8 +194,31 @@ static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux, { BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV); unpriv |= bpf_map_ptr_unpriv(aux); - aux->map_state = (unsigned long)map | - (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL); + aux->map_ptr_state = (unsigned long)map | + (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL); +} + +static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux) +{ + return aux->map_key_state & BPF_MAP_KEY_POISON; +} + +static bool bpf_map_key_unseen(const struct bpf_insn_aux_data *aux) +{ + return !(aux->map_key_state & BPF_MAP_KEY_SEEN); +} + +static u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux) +{ + return aux->map_key_state & ~(BPF_MAP_KEY_SEEN | BPF_MAP_KEY_POISON); +} + +static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state) +{ + bool poisoned = bpf_map_key_poisoned(aux); + + aux->map_key_state = state | BPF_MAP_KEY_SEEN | + (poisoned ? BPF_MAP_KEY_POISON : 0ULL); } struct bpf_call_arg_meta { @@ -4090,15 +4116,49 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return -EACCES; } - if (!BPF_MAP_PTR(aux->map_state)) + if (!BPF_MAP_PTR(aux->map_ptr_state)) bpf_map_ptr_store(aux, meta->map_ptr, meta->map_ptr->unpriv_array); - else if (BPF_MAP_PTR(aux->map_state) != meta->map_ptr) + else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr) bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, meta->map_ptr->unpriv_array); return 0; } +static int +record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, + int func_id, int insn_idx) +{ + struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; + struct bpf_reg_state *regs = cur_regs(env), *reg; + struct bpf_map *map = meta->map_ptr; + struct tnum range; + u64 val; + + if (func_id != BPF_FUNC_tail_call) + return 0; + if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) { + verbose(env, "kernel subsystem misconfigured verifier\n"); + return -EINVAL; + } + + range = tnum_range(0, map->max_entries - 1); + reg = ®s[BPF_REG_3]; + + if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) { + bpf_map_key_store(aux, BPF_MAP_KEY_POISON); + return 0; + } + + val = reg->var_off.value; + if (bpf_map_key_unseen(aux)) + bpf_map_key_store(aux, val); + else if (!bpf_map_key_poisoned(aux) && + bpf_map_key_immediate(aux) != val) + bpf_map_key_store(aux, BPF_MAP_KEY_POISON); + return 0; +} + static int check_reference_leak(struct bpf_verifier_env *env) { struct bpf_func_state *state = cur_func(env); @@ -4173,6 +4233,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (err) return err; + err = record_func_key(env, &meta, func_id, insn_idx); + if (err) + return err; + /* Mark slots with STACK_MISC in case of raw mode, stack offset * is inferred from register state. */ @@ -9065,6 +9129,7 @@ static int fixup_call_args(struct bpf_verifier_env *env) static int fixup_bpf_calls(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; + bool expect_blinding = bpf_jit_blinding_enabled(prog); struct bpf_insn *insn = prog->insnsi; const struct bpf_func_proto *fn; const int insn_cnt = prog->len; @@ -9073,7 +9138,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) struct bpf_insn insn_buf[16]; struct bpf_prog *new_prog; struct bpf_map *map_ptr; - int i, cnt, delta = 0; + int i, ret, cnt, delta = 0; for (i = 0; i < insn_cnt; i++, insn++) { if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || @@ -9217,6 +9282,26 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->code = BPF_JMP | BPF_TAIL_CALL; aux = &env->insn_aux_data[i + delta]; + if (prog->jit_requested && !expect_blinding && + !bpf_map_key_poisoned(aux) && + !bpf_map_ptr_poisoned(aux) && + !bpf_map_ptr_unpriv(aux)) { + struct bpf_jit_poke_descriptor desc = { + .reason = BPF_POKE_REASON_TAIL_CALL, + .tail_call.map = BPF_MAP_PTR(aux->map_ptr_state), + .tail_call.key = bpf_map_key_immediate(aux), + }; + + ret = bpf_jit_add_poke_descriptor(prog, &desc); + if (ret < 0) { + verbose(env, "adding tail call poke descriptor failed\n"); + return ret; + } + + insn->imm = ret + 1; + continue; + } + if (!bpf_map_ptr_unpriv(aux)) continue; @@ -9231,7 +9316,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) return -EINVAL; } - map_ptr = BPF_MAP_PTR(aux->map_state); + map_ptr = BPF_MAP_PTR(aux->map_ptr_state); insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, map_ptr->max_entries, 2); insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, @@ -9265,7 +9350,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; - map_ptr = BPF_MAP_PTR(aux->map_state); + map_ptr = BPF_MAP_PTR(aux->map_ptr_state); ops = map_ptr->ops; if (insn->imm == BPF_FUNC_map_lookup_elem && ops->map_gen_lookup) { @@ -9345,6 +9430,23 @@ patch_call_imm: insn->imm = fn->func - __bpf_call_base; } + /* Since poke tab is now finalized, publish aux to tracker. */ + for (i = 0; i < prog->aux->size_poke_tab; i++) { + map_ptr = prog->aux->poke_tab[i].tail_call.map; + if (!map_ptr->ops->map_poke_track || + !map_ptr->ops->map_poke_untrack || + !map_ptr->ops->map_poke_run) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } + + ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux); + if (ret < 0) { + verbose(env, "tracking tail call prog failed\n"); + return ret; + } + } + return 0; } diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c new file mode 100644 index 000000000000..bb8fe646dd9f --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c @@ -0,0 +1,487 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +/* test_tailcall_1 checks basic functionality by patching multiple locations + * in a single program for a single tail call slot with nop->jmp, jmp->nop + * and jmp->jmp rewrites. Also checks for nop->nop. + */ +static void test_tailcall_1(void) +{ + int err, map_fd, prog_fd, main_fd, i, j; + struct bpf_map *prog_array; + struct bpf_program *prog; + struct bpf_object *obj; + __u32 retval, duration; + char prog_name[32]; + char buff[128] = {}; + + err = bpf_prog_load("tailcall1.o", BPF_PROG_TYPE_SCHED_CLS, &obj, + &prog_fd); + if (CHECK_FAIL(err)) + return; + + prog = bpf_object__find_program_by_title(obj, "classifier"); + if (CHECK_FAIL(!prog)) + goto out; + + main_fd = bpf_program__fd(prog); + if (CHECK_FAIL(main_fd < 0)) + goto out; + + prog_array = bpf_object__find_map_by_name(obj, "jmp_table"); + if (CHECK_FAIL(!prog_array)) + goto out; + + map_fd = bpf_map__fd(prog_array); + if (CHECK_FAIL(map_fd < 0)) + goto out; + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + snprintf(prog_name, sizeof(prog_name), "classifier/%i", i); + + prog = bpf_object__find_program_by_title(obj, prog_name); + if (CHECK_FAIL(!prog)) + goto out; + + prog_fd = bpf_program__fd(prog); + if (CHECK_FAIL(prog_fd < 0)) + goto out; + + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + } + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != i, "tailcall", + "err %d errno %d retval %d\n", err, errno, retval); + + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err)) + goto out; + } + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != 3, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + snprintf(prog_name, sizeof(prog_name), "classifier/%i", i); + + prog = bpf_object__find_program_by_title(obj, prog_name); + if (CHECK_FAIL(!prog)) + goto out; + + prog_fd = bpf_program__fd(prog); + if (CHECK_FAIL(prog_fd < 0)) + goto out; + + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + } + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != 0, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + j = bpf_map__def(prog_array)->max_entries - 1 - i; + snprintf(prog_name, sizeof(prog_name), "classifier/%i", j); + + prog = bpf_object__find_program_by_title(obj, prog_name); + if (CHECK_FAIL(!prog)) + goto out; + + prog_fd = bpf_program__fd(prog); + if (CHECK_FAIL(prog_fd < 0)) + goto out; + + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + } + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + j = bpf_map__def(prog_array)->max_entries - 1 - i; + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != j, "tailcall", + "err %d errno %d retval %d\n", err, errno, retval); + + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err)) + goto out; + } + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != 3, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err >= 0 || errno != ENOENT)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != 3, "tailcall", + "err %d errno %d retval %d\n", err, errno, retval); + } + +out: + bpf_object__close(obj); +} + +/* test_tailcall_2 checks that patching multiple programs for a single + * tail call slot works. It also jumps through several programs and tests + * the tail call limit counter. + */ +static void test_tailcall_2(void) +{ + int err, map_fd, prog_fd, main_fd, i; + struct bpf_map *prog_array; + struct bpf_program *prog; + struct bpf_object *obj; + __u32 retval, duration; + char prog_name[32]; + char buff[128] = {}; + + err = bpf_prog_load("tailcall2.o", BPF_PROG_TYPE_SCHED_CLS, &obj, + &prog_fd); + if (CHECK_FAIL(err)) + return; + + prog = bpf_object__find_program_by_title(obj, "classifier"); + if (CHECK_FAIL(!prog)) + goto out; + + main_fd = bpf_program__fd(prog); + if (CHECK_FAIL(main_fd < 0)) + goto out; + + prog_array = bpf_object__find_map_by_name(obj, "jmp_table"); + if (CHECK_FAIL(!prog_array)) + goto out; + + map_fd = bpf_map__fd(prog_array); + if (CHECK_FAIL(map_fd < 0)) + goto out; + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + snprintf(prog_name, sizeof(prog_name), "classifier/%i", i); + + prog = bpf_object__find_program_by_title(obj, prog_name); + if (CHECK_FAIL(!prog)) + goto out; + + prog_fd = bpf_program__fd(prog); + if (CHECK_FAIL(prog_fd < 0)) + goto out; + + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + } + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != 2, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); + + i = 2; + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != 1, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); + + i = 0; + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != 3, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); +out: + bpf_object__close(obj); +} + +/* test_tailcall_3 checks that the count value of the tail call limit + * enforcement matches with expectations. + */ +static void test_tailcall_3(void) +{ + int err, map_fd, prog_fd, main_fd, data_fd, i, val; + struct bpf_map *prog_array, *data_map; + struct bpf_program *prog; + struct bpf_object *obj; + __u32 retval, duration; + char buff[128] = {}; + + err = bpf_prog_load("tailcall3.o", BPF_PROG_TYPE_SCHED_CLS, &obj, + &prog_fd); + if (CHECK_FAIL(err)) + return; + + prog = bpf_object__find_program_by_title(obj, "classifier"); + if (CHECK_FAIL(!prog)) + goto out; + + main_fd = bpf_program__fd(prog); + if (CHECK_FAIL(main_fd < 0)) + goto out; + + prog_array = bpf_object__find_map_by_name(obj, "jmp_table"); + if (CHECK_FAIL(!prog_array)) + goto out; + + map_fd = bpf_map__fd(prog_array); + if (CHECK_FAIL(map_fd < 0)) + goto out; + + prog = bpf_object__find_program_by_title(obj, "classifier/0"); + if (CHECK_FAIL(!prog)) + goto out; + + prog_fd = bpf_program__fd(prog); + if (CHECK_FAIL(prog_fd < 0)) + goto out; + + i = 0; + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != 1, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); + + data_map = bpf_object__find_map_by_name(obj, "tailcall.bss"); + if (CHECK_FAIL(!data_map || !bpf_map__is_internal(data_map))) + return; + + data_fd = bpf_map__fd(data_map); + if (CHECK_FAIL(map_fd < 0)) + return; + + i = 0; + err = bpf_map_lookup_elem(data_fd, &i, &val); + CHECK(err || val != 33, "tailcall count", "err %d errno %d count %d\n", + err, errno, val); + + i = 0; + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != 0, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); +out: + bpf_object__close(obj); +} + +/* test_tailcall_4 checks that the kernel properly selects indirect jump + * for the case where the key is not known. Latter is passed via global + * data to select different targets we can compare return value of. + */ +static void test_tailcall_4(void) +{ + int err, map_fd, prog_fd, main_fd, data_fd, i; + struct bpf_map *prog_array, *data_map; + struct bpf_program *prog; + struct bpf_object *obj; + __u32 retval, duration; + static const int zero = 0; + char buff[128] = {}; + char prog_name[32]; + + err = bpf_prog_load("tailcall4.o", BPF_PROG_TYPE_SCHED_CLS, &obj, + &prog_fd); + if (CHECK_FAIL(err)) + return; + + prog = bpf_object__find_program_by_title(obj, "classifier"); + if (CHECK_FAIL(!prog)) + goto out; + + main_fd = bpf_program__fd(prog); + if (CHECK_FAIL(main_fd < 0)) + goto out; + + prog_array = bpf_object__find_map_by_name(obj, "jmp_table"); + if (CHECK_FAIL(!prog_array)) + goto out; + + map_fd = bpf_map__fd(prog_array); + if (CHECK_FAIL(map_fd < 0)) + goto out; + + data_map = bpf_object__find_map_by_name(obj, "tailcall.bss"); + if (CHECK_FAIL(!data_map || !bpf_map__is_internal(data_map))) + return; + + data_fd = bpf_map__fd(data_map); + if (CHECK_FAIL(map_fd < 0)) + return; + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + snprintf(prog_name, sizeof(prog_name), "classifier/%i", i); + + prog = bpf_object__find_program_by_title(obj, prog_name); + if (CHECK_FAIL(!prog)) + goto out; + + prog_fd = bpf_program__fd(prog); + if (CHECK_FAIL(prog_fd < 0)) + goto out; + + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + } + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + err = bpf_map_update_elem(data_fd, &zero, &i, BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != i, "tailcall", + "err %d errno %d retval %d\n", err, errno, retval); + } + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + err = bpf_map_update_elem(data_fd, &zero, &i, BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != 3, "tailcall", + "err %d errno %d retval %d\n", err, errno, retval); + } +out: + bpf_object__close(obj); +} + +/* test_tailcall_5 probes similarly to test_tailcall_4 that the kernel generates + * an indirect jump when the keys are const but different from different branches. + */ +static void test_tailcall_5(void) +{ + int err, map_fd, prog_fd, main_fd, data_fd, i, key[] = { 1111, 1234, 5678 }; + struct bpf_map *prog_array, *data_map; + struct bpf_program *prog; + struct bpf_object *obj; + __u32 retval, duration; + static const int zero = 0; + char buff[128] = {}; + char prog_name[32]; + + err = bpf_prog_load("tailcall5.o", BPF_PROG_TYPE_SCHED_CLS, &obj, + &prog_fd); + if (CHECK_FAIL(err)) + return; + + prog = bpf_object__find_program_by_title(obj, "classifier"); + if (CHECK_FAIL(!prog)) + goto out; + + main_fd = bpf_program__fd(prog); + if (CHECK_FAIL(main_fd < 0)) + goto out; + + prog_array = bpf_object__find_map_by_name(obj, "jmp_table"); + if (CHECK_FAIL(!prog_array)) + goto out; + + map_fd = bpf_map__fd(prog_array); + if (CHECK_FAIL(map_fd < 0)) + goto out; + + data_map = bpf_object__find_map_by_name(obj, "tailcall.bss"); + if (CHECK_FAIL(!data_map || !bpf_map__is_internal(data_map))) + return; + + data_fd = bpf_map__fd(data_map); + if (CHECK_FAIL(map_fd < 0)) + return; + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + snprintf(prog_name, sizeof(prog_name), "classifier/%i", i); + + prog = bpf_object__find_program_by_title(obj, prog_name); + if (CHECK_FAIL(!prog)) + goto out; + + prog_fd = bpf_program__fd(prog); + if (CHECK_FAIL(prog_fd < 0)) + goto out; + + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + } + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + err = bpf_map_update_elem(data_fd, &zero, &key[i], BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != i, "tailcall", + "err %d errno %d retval %d\n", err, errno, retval); + } + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + err = bpf_map_update_elem(data_fd, &zero, &key[i], BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != 3, "tailcall", + "err %d errno %d retval %d\n", err, errno, retval); + } +out: + bpf_object__close(obj); +} + +void test_tailcalls(void) +{ + if (test__start_subtest("tailcall_1")) + test_tailcall_1(); + if (test__start_subtest("tailcall_2")) + test_tailcall_2(); + if (test__start_subtest("tailcall_3")) + test_tailcall_3(); + if (test__start_subtest("tailcall_4")) + test_tailcall_4(); + if (test__start_subtest("tailcall_5")) + test_tailcall_5(); +} diff --git a/tools/testing/selftests/bpf/progs/tailcall1.c b/tools/testing/selftests/bpf/progs/tailcall1.c new file mode 100644 index 000000000000..63531e1a9fa4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall1.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include "bpf_helpers.h" + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 3); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +#define TAIL_FUNC(x) \ + SEC("classifier/" #x) \ + int bpf_func_##x(struct __sk_buff *skb) \ + { \ + return x; \ + } +TAIL_FUNC(0) +TAIL_FUNC(1) +TAIL_FUNC(2) + +SEC("classifier") +int entry(struct __sk_buff *skb) +{ + /* Multiple locations to make sure we patch + * all of them. + */ + bpf_tail_call(skb, &jmp_table, 0); + bpf_tail_call(skb, &jmp_table, 0); + bpf_tail_call(skb, &jmp_table, 0); + bpf_tail_call(skb, &jmp_table, 0); + + bpf_tail_call(skb, &jmp_table, 1); + bpf_tail_call(skb, &jmp_table, 1); + bpf_tail_call(skb, &jmp_table, 1); + bpf_tail_call(skb, &jmp_table, 1); + + bpf_tail_call(skb, &jmp_table, 2); + bpf_tail_call(skb, &jmp_table, 2); + bpf_tail_call(skb, &jmp_table, 2); + bpf_tail_call(skb, &jmp_table, 2); + + return 3; +} + +char __license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/tailcall2.c b/tools/testing/selftests/bpf/progs/tailcall2.c new file mode 100644 index 000000000000..21c85c477210 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall2.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include "bpf_helpers.h" + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 5); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +SEC("classifier/0") +int bpf_func_0(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 1); + return 0; +} + +SEC("classifier/1") +int bpf_func_1(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 2); + return 1; +} + +SEC("classifier/2") +int bpf_func_2(struct __sk_buff *skb) +{ + return 2; +} + +SEC("classifier/3") +int bpf_func_3(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 4); + return 3; +} + +SEC("classifier/4") +int bpf_func_4(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 3); + return 4; +} + +SEC("classifier") +int entry(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 0); + /* Check multi-prog update. */ + bpf_tail_call(skb, &jmp_table, 2); + /* Check tail call limit. */ + bpf_tail_call(skb, &jmp_table, 3); + return 3; +} + +char __license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/tailcall3.c b/tools/testing/selftests/bpf/progs/tailcall3.c new file mode 100644 index 000000000000..1ecae198b8c1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall3.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include "bpf_helpers.h" + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +static volatile int count; + +SEC("classifier/0") +int bpf_func_0(struct __sk_buff *skb) +{ + count++; + bpf_tail_call(skb, &jmp_table, 0); + return 1; +} + +SEC("classifier") +int entry(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 0); + return 0; +} + +char __license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/tailcall4.c b/tools/testing/selftests/bpf/progs/tailcall4.c new file mode 100644 index 000000000000..499388758119 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall4.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include "bpf_helpers.h" + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 3); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +static volatile int selector; + +#define TAIL_FUNC(x) \ + SEC("classifier/" #x) \ + int bpf_func_##x(struct __sk_buff *skb) \ + { \ + return x; \ + } +TAIL_FUNC(0) +TAIL_FUNC(1) +TAIL_FUNC(2) + +SEC("classifier") +int entry(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, selector); + return 3; +} + +char __license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/tailcall5.c b/tools/testing/selftests/bpf/progs/tailcall5.c new file mode 100644 index 000000000000..49c64eb53f19 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall5.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include "bpf_helpers.h" + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 3); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +static volatile int selector; + +#define TAIL_FUNC(x) \ + SEC("classifier/" #x) \ + int bpf_func_##x(struct __sk_buff *skb) \ + { \ + return x; \ + } +TAIL_FUNC(0) +TAIL_FUNC(1) +TAIL_FUNC(2) + +SEC("classifier") +int entry(struct __sk_buff *skb) +{ + int idx = 0; + + if (selector == 1234) + idx = 1; + else if (selector == 5678) + idx = 2; + + bpf_tail_call(skb, &jmp_table, idx); + return 3; +} + +char __license[] SEC("license") = "GPL"; +int _version SEC("version") = 1;