2014-07-23 14:01:58 +08:00
|
|
|
/*
|
|
|
|
* Linux Socket Filter - Kernel level socket filtering
|
|
|
|
*
|
|
|
|
* Based on the design of the Berkeley Packet Filter. The new
|
|
|
|
* internal format has been designed by PLUMgrid:
|
|
|
|
*
|
|
|
|
* Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
|
|
|
|
*
|
|
|
|
* Authors:
|
|
|
|
*
|
|
|
|
* Jay Schulist <jschlst@samba.org>
|
|
|
|
* Alexei Starovoitov <ast@plumgrid.com>
|
|
|
|
* Daniel Borkmann <dborkman@redhat.com>
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
* as published by the Free Software Foundation; either version
|
|
|
|
* 2 of the License, or (at your option) any later version.
|
|
|
|
*
|
|
|
|
* Andi Kleen - Fix a few bad bugs and races.
|
2014-07-31 11:34:14 +08:00
|
|
|
* Kris Katterjohn - Added many additional checks in bpf_check_classic()
|
2014-07-23 14:01:58 +08:00
|
|
|
*/
|
2014-09-08 14:04:47 +08:00
|
|
|
|
2014-07-23 14:01:58 +08:00
|
|
|
#include <linux/filter.h>
|
|
|
|
#include <linux/skbuff.h>
|
2014-09-03 04:53:44 +08:00
|
|
|
#include <linux/vmalloc.h>
|
2014-09-08 14:04:47 +08:00
|
|
|
#include <linux/random.h>
|
|
|
|
#include <linux/moduleloader.h>
|
2014-09-26 15:17:00 +08:00
|
|
|
#include <linux/bpf.h>
|
2014-07-23 14:01:58 +08:00
|
|
|
|
2015-05-30 05:23:07 +08:00
|
|
|
#include <asm/unaligned.h>
|
|
|
|
|
2014-07-23 14:01:58 +08:00
|
|
|
/*
 * Registers: shorthand for the numbered eBPF registers. Each macro expands
 * to one slot of an array named 'regs', so they are only usable where such
 * an array is in scope.
 */
#define BPF_R0		(regs[BPF_REG_0])
#define BPF_R1		(regs[BPF_REG_1])
#define BPF_R2		(regs[BPF_REG_2])
#define BPF_R3		(regs[BPF_REG_3])
#define BPF_R4		(regs[BPF_REG_4])
#define BPF_R5		(regs[BPF_REG_5])
#define BPF_R6		(regs[BPF_REG_6])
#define BPF_R7		(regs[BPF_REG_7])
#define BPF_R8		(regs[BPF_REG_8])
#define BPF_R9		(regs[BPF_REG_9])
#define BPF_R10		(regs[BPF_REG_10])
|
|
|
|
|
|
|
|
/*
 * Named registers: DST and SRC select the 'regs' slot named by the current
 * instruction's dst_reg / src_reg field; FP, ARG1 and CTX select the slots
 * indexed by BPF_REG_FP, BPF_REG_ARG1 and BPF_REG_CTX; IMM is the current
 * instruction's immediate. All of them expect 'regs' and/or 'insn' to be
 * in scope at the point of use.
 */
#define DST		(regs[insn->dst_reg])
#define SRC		(regs[insn->src_reg])
#define FP		(regs[BPF_REG_FP])
#define ARG1		(regs[BPF_REG_ARG1])
#define CTX		(regs[BPF_REG_CTX])
#define IMM		(insn->imm)
|
|
|
|
|
|
|
|
/* No hurry in this branch
|
|
|
|
*
|
|
|
|
* Exported for the bpf jit load helper.
|
|
|
|
*/
|
|
|
|
void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
|
|
|
|
{
|
|
|
|
u8 *ptr = NULL;
|
|
|
|
|
|
|
|
if (k >= SKF_NET_OFF)
|
|
|
|
ptr = skb_network_header(skb) + k - SKF_NET_OFF;
|
|
|
|
else if (k >= SKF_LL_OFF)
|
|
|
|
ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
|
2015-05-30 05:23:07 +08:00
|
|
|
|
2014-07-23 14:01:58 +08:00
|
|
|
if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
|
|
|
|
return ptr;
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2014-09-03 04:53:44 +08:00
|
|
|
/*
 * Allocate a zero-filled struct bpf_prog together with its bpf_prog_aux.
 *
 * @size:            requested size in bytes; rounded up to a multiple of
 *                   PAGE_SIZE before allocating.
 * @gfp_extra_flags: flags OR-ed into the GFP mask of both allocations.
 *
 * On success the program's ->pages holds the rounded size in pages and
 * ->aux points at the freshly zeroed aux structure. Returns NULL when
 * either allocation fails; nothing stays allocated in that case.
 */
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
{
	const gfp_t prog_gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
			       gfp_extra_flags;
	struct bpf_prog *prog;

	size = round_up(size, PAGE_SIZE);

	prog = __vmalloc(size, prog_gfp, PAGE_KERNEL);
	if (!prog)
		return NULL;

	prog->aux = kzalloc(sizeof(*prog->aux), GFP_KERNEL | gfp_extra_flags);
	if (!prog->aux) {
		/* Do not hand out a program without its aux data. */
		vfree(prog);
		return NULL;
	}

	prog->pages = size / PAGE_SIZE;

	return prog;
}
EXPORT_SYMBOL_GPL(bpf_prog_alloc);
|
|
|
|
|
|
|
|
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
|
|
|
|
gfp_t gfp_extra_flags)
|
|
|
|
{
|
|
|
|
gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
|
|
|
|
gfp_extra_flags;
|
|
|
|
struct bpf_prog *fp;
|
|
|
|
|
|
|
|
BUG_ON(fp_old == NULL);
|
|
|
|
|
|
|
|
size = round_up(size, PAGE_SIZE);
|
|
|
|
if (size <= fp_old->pages * PAGE_SIZE)
|
|
|
|
return fp_old;
|
|
|
|
|
|
|
|
fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
|
|
|
|
if (fp != NULL) {
|
|
|
|
memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
|
|
|
|
fp->pages = size / PAGE_SIZE;
|
|
|
|
|
2014-09-26 15:17:00 +08:00
|
|
|
/* We keep fp->aux from fp_old around in the new
|
2014-09-03 04:53:44 +08:00
|
|
|
* reallocated structure.
|
|
|
|
*/
|
2014-09-26 15:17:00 +08:00
|
|
|
fp_old->aux = NULL;
|
2014-09-03 04:53:44 +08:00
|
|
|
__bpf_prog_free(fp_old);
|
|
|
|
}
|
|
|
|
|
|
|
|
return fp;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(bpf_prog_realloc);
|
|
|
|
|
|
|
|
void __bpf_prog_free(struct bpf_prog *fp)
|
|
|
|
{
|
2014-09-26 15:17:00 +08:00
|
|
|
kfree(fp->aux);
|
2014-09-03 04:53:44 +08:00
|
|
|
vfree(fp);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(__bpf_prog_free);
|
|
|
|
|
2014-09-10 21:01:02 +08:00
|
|
|
#ifdef CONFIG_BPF_JIT
|
2014-09-08 14:04:47 +08:00
|
|
|
struct bpf_binary_header *
|
|
|
|
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
|
|
|
|
unsigned int alignment,
|
|
|
|
bpf_jit_fill_hole_t bpf_fill_ill_insns)
|
|
|
|
{
|
|
|
|
struct bpf_binary_header *hdr;
|
|
|
|
unsigned int size, hole, start;
|
|
|
|
|
|
|
|
/* Most of BPF filters are really small, but if some of them
|
|
|
|
* fill a page, allow at least 128 extra bytes to insert a
|
|
|
|
* random section of illegal instructions.
|
|
|
|
*/
|
|
|
|
size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
|
|
|
|
hdr = module_alloc(size);
|
|
|
|
if (hdr == NULL)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
/* Fill space with illegal/arch-dep instructions. */
|
|
|
|
bpf_fill_ill_insns(hdr, size);
|
|
|
|
|
|
|
|
hdr->pages = size / PAGE_SIZE;
|
|
|
|
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
|
|
|
|
PAGE_SIZE - sizeof(*hdr));
|
|
|
|
start = (prandom_u32() % hole) & ~(alignment - 1);
|
|
|
|
|
|
|
|
/* Leave a random number of instructions before BPF code. */
|
|
|
|
*image_ptr = &hdr->image[start];
|
|
|
|
|
|
|
|
return hdr;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Release a binary image; @hdr is handed straight to module_memfree().
 */
void bpf_jit_binary_free(struct bpf_binary_header *hdr)
{
	module_memfree(hdr);
}
|
2014-09-10 21:01:02 +08:00
|
|
|
#endif /* CONFIG_BPF_JIT */
|
2014-09-08 14:04:47 +08:00
|
|
|
|
2014-07-23 14:01:58 +08:00
|
|
|
/* Base function for offset calculation. Needs to go into .text section,
|
|
|
|
* therefore keeping it non-static as well; will also be used by JITs
|
|
|
|
* anyway later on, so do not let the compiler omit it.
|
|
|
|
*/
|
|
|
|
noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
net: filter: split 'struct sk_filter' into socket and bpf parts
clean up names related to socket filtering and bpf in the following way:
- everything that deals with sockets keeps 'sk_*' prefix
- everything that is pure BPF is changed to 'bpf_*' prefix
split 'struct sk_filter' into
struct sk_filter {
atomic_t refcnt;
struct rcu_head rcu;
struct bpf_prog *prog;
};
and
struct bpf_prog {
u32 jited:1,
len:31;
struct sock_fprog_kern *orig_prog;
unsigned int (*bpf_func)(const struct sk_buff *skb,
const struct bpf_insn *filter);
union {
struct sock_filter insns[0];
struct bpf_insn insnsi[0];
struct work_struct work;
};
};
so that 'struct bpf_prog' can be used independent of sockets and cleans up
'unattached' bpf use cases
split SK_RUN_FILTER macro into:
SK_RUN_FILTER to be used with 'struct sk_filter *' and
BPF_PROG_RUN to be used with 'struct bpf_prog *'
__sk_filter_release(struct sk_filter *) gains
__bpf_prog_release(struct bpf_prog *) helper function
also perform related renames for the functions that work
with 'struct bpf_prog *', since they're on the same lines:
sk_filter_size -> bpf_prog_size
sk_filter_select_runtime -> bpf_prog_select_runtime
sk_filter_free -> bpf_prog_free
sk_unattached_filter_create -> bpf_prog_create
sk_unattached_filter_destroy -> bpf_prog_destroy
sk_store_orig_filter -> bpf_prog_store_orig_filter
sk_release_orig_filter -> bpf_release_orig_filter
__sk_migrate_filter -> bpf_migrate_filter
__sk_prepare_filter -> bpf_prepare_filter
API for attaching classic BPF to a socket stays the same:
sk_attach_filter(prog, struct sock *)/sk_detach_filter(struct sock *)
and SK_RUN_FILTER(struct sk_filter *, ctx) to execute a program
which is used by sockets, tun, af_packet
API for 'unattached' BPF programs becomes:
bpf_prog_create(struct bpf_prog **)/bpf_prog_destroy(struct bpf_prog *)
and BPF_PROG_RUN(struct bpf_prog *, ctx) to execute a program
which is used by isdn, ppp, team, seccomp, ptp, xt_bpf, cls_bpf, test_bpf
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-31 11:34:16 +08:00
|
|
|
* __bpf_prog_run - run eBPF program on a given context
|
|
|
|
* @ctx: is the data we are operating on
|
|
|
|
* @insn: is the array of eBPF instructions
|
2014-07-23 14:01:58 +08:00
|
|
|
*
|
net: filter: split 'struct sk_filter' into socket and bpf parts
clean up names related to socket filtering and bpf in the following way:
- everything that deals with sockets keeps 'sk_*' prefix
- everything that is pure BPF is changed to 'bpf_*' prefix
split 'struct sk_filter' into
struct sk_filter {
atomic_t refcnt;
struct rcu_head rcu;
struct bpf_prog *prog;
};
and
struct bpf_prog {
u32 jited:1,
len:31;
struct sock_fprog_kern *orig_prog;
unsigned int (*bpf_func)(const struct sk_buff *skb,
const struct bpf_insn *filter);
union {
struct sock_filter insns[0];
struct bpf_insn insnsi[0];
struct work_struct work;
};
};
so that 'struct bpf_prog' can be used independent of sockets and cleans up
'unattached' bpf use cases
split SK_RUN_FILTER macro into:
SK_RUN_FILTER to be used with 'struct sk_filter *' and
BPF_PROG_RUN to be used with 'struct bpf_prog *'
__sk_filter_release(struct sk_filter *) gains
__bpf_prog_release(struct bpf_prog *) helper function
also perform related renames for the functions that work
with 'struct bpf_prog *', since they're on the same lines:
sk_filter_size -> bpf_prog_size
sk_filter_select_runtime -> bpf_prog_select_runtime
sk_filter_free -> bpf_prog_free
sk_unattached_filter_create -> bpf_prog_create
sk_unattached_filter_destroy -> bpf_prog_destroy
sk_store_orig_filter -> bpf_prog_store_orig_filter
sk_release_orig_filter -> bpf_release_orig_filter
__sk_migrate_filter -> bpf_migrate_filter
__sk_prepare_filter -> bpf_prepare_filter
API for attaching classic BPF to a socket stays the same:
sk_attach_filter(prog, struct sock *)/sk_detach_filter(struct sock *)
and SK_RUN_FILTER(struct sk_filter *, ctx) to execute a program
which is used by sockets, tun, af_packet
API for 'unattached' BPF programs becomes:
bpf_prog_create(struct bpf_prog **)/bpf_prog_destroy(struct bpf_prog *)
and BPF_PROG_RUN(struct bpf_prog *, ctx) to execute a program
which is used by isdn, ppp, team, seccomp, ptp, xt_bpf, cls_bpf, test_bpf
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-31 11:34:16 +08:00
|
|
|
* Decode and execute eBPF instructions.
|
2014-07-23 14:01:58 +08:00
|
|
|
*/
|
net: filter: split 'struct sk_filter' into socket and bpf parts
clean up names related to socket filtering and bpf in the following way:
- everything that deals with sockets keeps 'sk_*' prefix
- everything that is pure BPF is changed to 'bpf_*' prefix
split 'struct sk_filter' into
struct sk_filter {
atomic_t refcnt;
struct rcu_head rcu;
struct bpf_prog *prog;
};
and
struct bpf_prog {
u32 jited:1,
len:31;
struct sock_fprog_kern *orig_prog;
unsigned int (*bpf_func)(const struct sk_buff *skb,
const struct bpf_insn *filter);
union {
struct sock_filter insns[0];
struct bpf_insn insnsi[0];
struct work_struct work;
};
};
so that 'struct bpf_prog' can be used independent of sockets and cleans up
'unattached' bpf use cases
split SK_RUN_FILTER macro into:
SK_RUN_FILTER to be used with 'struct sk_filter *' and
BPF_PROG_RUN to be used with 'struct bpf_prog *'
__sk_filter_release(struct sk_filter *) gains
__bpf_prog_release(struct bpf_prog *) helper function
also perform related renames for the functions that work
with 'struct bpf_prog *', since they're on the same lines:
sk_filter_size -> bpf_prog_size
sk_filter_select_runtime -> bpf_prog_select_runtime
sk_filter_free -> bpf_prog_free
sk_unattached_filter_create -> bpf_prog_create
sk_unattached_filter_destroy -> bpf_prog_destroy
sk_store_orig_filter -> bpf_prog_store_orig_filter
sk_release_orig_filter -> bpf_release_orig_filter
__sk_migrate_filter -> bpf_migrate_filter
__sk_prepare_filter -> bpf_prepare_filter
API for attaching classic BPF to a socket stays the same:
sk_attach_filter(prog, struct sock *)/sk_detach_filter(struct sock *)
and SK_RUN_FILTER(struct sk_filter *, ctx) to execute a program
which is used by sockets, tun, af_packet
API for 'unattached' BPF programs becomes:
bpf_prog_create(struct bpf_prog **)/bpf_prog_destroy(struct bpf_prog *)
and BPF_PROG_RUN(struct bpf_prog *, ctx) to execute a program
which is used by isdn, ppp, team, seccomp, ptp, xt_bpf, cls_bpf, test_bpf
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-31 11:34:16 +08:00
|
|
|
static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
|
2014-07-23 14:01:58 +08:00
|
|
|
{
|
|
|
|
u64 stack[MAX_BPF_STACK / sizeof(u64)];
|
|
|
|
u64 regs[MAX_BPF_REG], tmp;
|
|
|
|
static const void *jumptable[256] = {
|
|
|
|
[0 ... 255] = &&default_label,
|
|
|
|
/* Now overwrite non-defaults ... */
|
|
|
|
/* 32 bit ALU operations */
|
|
|
|
[BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X,
|
|
|
|
[BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K,
|
|
|
|
[BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X,
|
|
|
|
[BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K,
|
|
|
|
[BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X,
|
|
|
|
[BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K,
|
|
|
|
[BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X,
|
|
|
|
[BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K,
|
|
|
|
[BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X,
|
|
|
|
[BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K,
|
|
|
|
[BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X,
|
|
|
|
[BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K,
|
|
|
|
[BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X,
|
|
|
|
[BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K,
|
|
|
|
[BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X,
|
|
|
|
[BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K,
|
|
|
|
[BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X,
|
|
|
|
[BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K,
|
|
|
|
[BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X,
|
|
|
|
[BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K,
|
|
|
|
[BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X,
|
|
|
|
[BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K,
|
|
|
|
[BPF_ALU | BPF_NEG] = &&ALU_NEG,
|
|
|
|
[BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE,
|
|
|
|
[BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE,
|
|
|
|
/* 64 bit ALU operations */
|
|
|
|
[BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X,
|
|
|
|
[BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K,
|
|
|
|
[BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X,
|
|
|
|
[BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K,
|
|
|
|
[BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X,
|
|
|
|
[BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K,
|
|
|
|
[BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X,
|
|
|
|
[BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K,
|
|
|
|
[BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X,
|
|
|
|
[BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K,
|
|
|
|
[BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X,
|
|
|
|
[BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K,
|
|
|
|
[BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X,
|
|
|
|
[BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K,
|
|
|
|
[BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X,
|
|
|
|
[BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K,
|
|
|
|
[BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X,
|
|
|
|
[BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K,
|
|
|
|
[BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X,
|
|
|
|
[BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K,
|
|
|
|
[BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X,
|
|
|
|
[BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K,
|
|
|
|
[BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X,
|
|
|
|
[BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K,
|
|
|
|
[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
|
|
|
|
/* Call instruction */
|
|
|
|
[BPF_JMP | BPF_CALL] = &&JMP_CALL,
|
bpf: allow bpf programs to tail-call other bpf programs
introduce bpf_tail_call(ctx, &jmp_table, index) helper function
which can be used from BPF programs like:
int bpf_prog(struct pt_regs *ctx)
{
...
bpf_tail_call(ctx, &jmp_table, index);
...
}
that is roughly equivalent to:
int bpf_prog(struct pt_regs *ctx)
{
...
if (jmp_table[index])
return (*jmp_table[index])(ctx);
...
}
The important detail that it's not a normal call, but a tail call.
The kernel stack is precious, so this helper reuses the current
stack frame and jumps into another BPF program without adding
extra call frame.
It's trivially done in interpreter and a bit trickier in JITs.
In case of x64 JIT the bigger part of generated assembler prologue
is common for all programs, so it is simply skipped while jumping.
Other JITs can do similar prologue-skipping optimization or
do stack unwind before jumping into the next program.
bpf_tail_call() arguments:
ctx - context pointer
jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table
index - index in the jump table
Since all BPF programs are identified by file descriptor, user space
needs to populate the jmp_table with FDs of other BPF programs.
If jmp_table[index] is empty the bpf_tail_call() doesn't jump anywhere
and program execution continues as normal.
New BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can
populate this jmp_table array with FDs of other bpf programs.
Programs can share the same jmp_table array or use multiple jmp_tables.
The chain of tail calls can form unpredictable dynamic loops therefore
tail_call_cnt is used to limit the number of calls and currently is set to 32.
Use cases:
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
==========
- simplify complex programs by splitting them into a sequence of small programs
- dispatch routine
For tracing and future seccomp the program may be triggered on all system
calls, but processing of syscall arguments will be different. It's more
efficient to implement them as:
int syscall_entry(struct seccomp_data *ctx)
{
bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */);
... default: process unknown syscall ...
}
int sys_write_event(struct seccomp_data *ctx) {...}
int sys_read_event(struct seccomp_data *ctx) {...}
syscall_jmp_table[__NR_write] = sys_write_event;
syscall_jmp_table[__NR_read] = sys_read_event;
For networking the program may call into different parsers depending on
packet format, like:
int packet_parser(struct __sk_buff *skb)
{
... parse L2, L3 here ...
__u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol));
bpf_tail_call(skb, &ipproto_jmp_table, ipproto);
... default: process unknown protocol ...
}
int parse_tcp(struct __sk_buff *skb) {...}
int parse_udp(struct __sk_buff *skb) {...}
ipproto_jmp_table[IPPROTO_TCP] = parse_tcp;
ipproto_jmp_table[IPPROTO_UDP] = parse_udp;
- for TC use case, bpf_tail_call() allows to implement reclassify-like logic
- bpf_map_update_elem/delete calls into BPF_MAP_TYPE_PROG_ARRAY jump table
are atomic, so user space can build chains of BPF programs on the fly
Implementation details:
=======================
- high performance of bpf_tail_call() is the goal.
It could have been implemented without JIT changes as a wrapper on top of
BPF_PROG_RUN() macro, but with two downsides:
. all programs would have to pay performance penalty for this feature and
tail call itself would be slower, since mandatory stack unwind, return,
stack allocate would be done for every tailcall.
. tailcall would be limited to programs running preempt_disabled, since
generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would
need to be either global per_cpu variable accessed by helper and by wrapper
or global variable protected by locks.
In this implementation x64 JIT bypasses stack unwind and jumps into the
callee program after prologue.
- bpf_prog_array_compatible() ensures that prog_type of callee and caller
are the same and JITed/non-JITed flag is the same, since calling JITed
program from non-JITed is invalid, since stack frames are different.
Similarly calling kprobe type program from socket type program is invalid.
- jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse 'map'
abstraction, its user space API and all of verifier logic.
It's in the existing arraymap.c file, since several functions are
shared with regular array map.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:03 +08:00
|
|
|
[BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL,
|
2014-07-23 14:01:58 +08:00
|
|
|
/* Jumps */
|
|
|
|
[BPF_JMP | BPF_JA] = &&JMP_JA,
|
|
|
|
[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
|
|
|
|
[BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K,
|
|
|
|
[BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X,
|
|
|
|
[BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
|
|
|
|
[BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
|
|
|
|
[BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
|
|
|
|
[BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
|
|
|
|
[BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
|
|
|
|
[BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
|
|
|
|
[BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
|
|
|
|
[BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
|
|
|
|
[BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
|
|
|
|
[BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
|
|
|
|
[BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
|
|
|
|
/* Program return */
|
|
|
|
[BPF_JMP | BPF_EXIT] = &&JMP_EXIT,
|
|
|
|
/* Store instructions */
|
|
|
|
[BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B,
|
|
|
|
[BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H,
|
|
|
|
[BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W,
|
|
|
|
[BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW,
|
|
|
|
[BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W,
|
|
|
|
[BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW,
|
|
|
|
[BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B,
|
|
|
|
[BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H,
|
|
|
|
[BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W,
|
|
|
|
[BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW,
|
|
|
|
/* Load instructions */
|
|
|
|
[BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B,
|
|
|
|
[BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H,
|
|
|
|
[BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W,
|
|
|
|
[BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW,
|
|
|
|
[BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W,
|
|
|
|
[BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H,
|
|
|
|
[BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B,
|
|
|
|
[BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
|
|
|
|
[BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
|
|
|
|
[BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
|
net: filter: add "load 64-bit immediate" eBPF instruction
add BPF_LD_IMM64 instruction to load 64-bit immediate value into a register.
All previous instructions were 8-byte. This is first 16-byte instruction.
Two consecutive 'struct bpf_insn' blocks are interpreted as single instruction:
insn[0].code = BPF_LD | BPF_DW | BPF_IMM
insn[0].dst_reg = destination register
insn[0].imm = lower 32-bit
insn[1].code = 0
insn[1].imm = upper 32-bit
All unused fields must be zero.
Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM
which loads 32-bit immediate value into a register.
x64 JITs it as single 'movabsq %rax, imm64'
arm64 may JIT as sequence of four 'movk x0, #imm16, lsl #shift' insn
Note that old eBPF programs are binary compatible with new interpreter.
It helps eBPF programs load 64-bit constant into a register with one
instruction instead of using two registers and 4 instructions:
BPF_MOV32_IMM(R1, imm32)
BPF_ALU64_IMM(BPF_LSH, R1, 32)
BPF_MOV32_IMM(R2, imm32)
BPF_ALU64_REG(BPF_OR, R1, R2)
User space generated programs will use this instruction to load constants only.
To tell kernel that user space needs a pointer the _pseudo_ variant of
this instruction may be added later, which will use extra bits of encoding
to indicate what type of pointer user space is asking kernel to provide.
For example 'off' or 'src_reg' fields can be used for such purpose.
src_reg = 1 could mean that user space is asking kernel to validate and
load in-kernel map pointer.
src_reg = 2 could mean that user space needs readonly data section pointer
src_reg = 3 could mean that user space needs a pointer to per-cpu local data
All such future pseudo instructions will not be carrying the actual pointer
as part of the instruction, but rather will be treated as a request to kernel
to provide one. The kernel will verify the request_for_a_pointer, then
will drop _pseudo_ marking and will store actual internal pointer inside
the instruction, so the end result is the interpreter and JITs never
see pseudo BPF_LD_IMM64 insns and only operate on generic BPF_LD_IMM64 that
loads 64-bit immediate into a register. User space never operates on direct
pointers and verifier can easily recognize request_for_pointer vs other
instructions.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-09-05 13:17:17 +08:00
|
|
|
[BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
|
2014-07-23 14:01:58 +08:00
|
|
|
};
|
bpf: allow bpf programs to tail-call other bpf programs
introduce bpf_tail_call(ctx, &jmp_table, index) helper function
which can be used from BPF programs like:
int bpf_prog(struct pt_regs *ctx)
{
...
bpf_tail_call(ctx, &jmp_table, index);
...
}
that is roughly equivalent to:
int bpf_prog(struct pt_regs *ctx)
{
...
if (jmp_table[index])
return (*jmp_table[index])(ctx);
...
}
The important detail that it's not a normal call, but a tail call.
The kernel stack is precious, so this helper reuses the current
stack frame and jumps into another BPF program without adding
extra call frame.
It's trivially done in interpreter and a bit trickier in JITs.
In case of x64 JIT the bigger part of generated assembler prologue
is common for all programs, so it is simply skipped while jumping.
Other JITs can do similar prologue-skipping optimization or
do stack unwind before jumping into the next program.
bpf_tail_call() arguments:
ctx - context pointer
jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table
index - index in the jump table
Since all BPF programs are identified by file descriptor, user space
needs to populate the jmp_table with FDs of other BPF programs.
If jmp_table[index] is empty the bpf_tail_call() doesn't jump anywhere
and program execution continues as normal.
New BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can
populate this jmp_table array with FDs of other bpf programs.
Programs can share the same jmp_table array or use multiple jmp_tables.
The chain of tail calls can form unpredictable dynamic loops therefore
tail_call_cnt is used to limit the number of calls and currently is set to 32.
Use cases:
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
==========
- simplify complex programs by splitting them into a sequence of small programs
- dispatch routine
For tracing and future seccomp the program may be triggered on all system
calls, but processing of syscall arguments will be different. It's more
efficient to implement them as:
int syscall_entry(struct seccomp_data *ctx)
{
bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */);
... default: process unknown syscall ...
}
int sys_write_event(struct seccomp_data *ctx) {...}
int sys_read_event(struct seccomp_data *ctx) {...}
syscall_jmp_table[__NR_write] = sys_write_event;
syscall_jmp_table[__NR_read] = sys_read_event;
For networking the program may call into different parsers depending on
packet format, like:
int packet_parser(struct __sk_buff *skb)
{
... parse L2, L3 here ...
__u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol));
bpf_tail_call(skb, &ipproto_jmp_table, ipproto);
... default: process unknown protocol ...
}
int parse_tcp(struct __sk_buff *skb) {...}
int parse_udp(struct __sk_buff *skb) {...}
ipproto_jmp_table[IPPROTO_TCP] = parse_tcp;
ipproto_jmp_table[IPPROTO_UDP] = parse_udp;
- for TC use case, bpf_tail_call() allows to implement reclassify-like logic
- bpf_map_update_elem/delete calls into BPF_MAP_TYPE_PROG_ARRAY jump table
are atomic, so user space can build chains of BPF programs on the fly
Implementation details:
=======================
- high performance of bpf_tail_call() is the goal.
It could have been implemented without JIT changes as a wrapper on top of
BPF_PROG_RUN() macro, but with two downsides:
. all programs would have to pay performance penalty for this feature and
tail call itself would be slower, since mandatory stack unwind, return,
stack allocate would be done for every tailcall.
. tailcall would be limited to programs running preempt_disabled, since
generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would
need to be either global per_cpu variable accessed by helper and by wrapper
or global variable protected by locks.
In this implementation x64 JIT bypasses stack unwind and jumps into the
callee program after prologue.
- bpf_prog_array_compatible() ensures that prog_type of callee and caller
are the same and JITed/non-JITed flag is the same, since calling JITed
program from non-JITed is invalid, since stack frames are different.
Similarly calling kprobe type program from socket type program is invalid.
- jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse 'map'
abstraction, its user space API and all of verifier logic.
It's in the existing arraymap.c file, since several functions are
shared with regular array map.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:03 +08:00
|
|
|
u32 tail_call_cnt = 0;
|
2014-07-23 14:01:58 +08:00
|
|
|
void *ptr;
|
|
|
|
int off;
|
|
|
|
|
|
|
|
#define CONT ({ insn++; goto select_insn; })
|
|
|
|
#define CONT_JMP ({ insn++; goto select_insn; })
|
|
|
|
|
|
|
|
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
|
|
|
|
ARG1 = (u64) (unsigned long) ctx;
|
|
|
|
|
|
|
|
/* Registers used in classic BPF programs need to be reset first. */
|
|
|
|
regs[BPF_REG_A] = 0;
|
|
|
|
regs[BPF_REG_X] = 0;
|
|
|
|
|
|
|
|
select_insn:
|
|
|
|
goto *jumptable[insn->code];
|
|
|
|
|
|
|
|
/* ALU */
|
|
|
|
#define ALU(OPCODE, OP) \
|
|
|
|
ALU64_##OPCODE##_X: \
|
|
|
|
DST = DST OP SRC; \
|
|
|
|
CONT; \
|
|
|
|
ALU_##OPCODE##_X: \
|
|
|
|
DST = (u32) DST OP (u32) SRC; \
|
|
|
|
CONT; \
|
|
|
|
ALU64_##OPCODE##_K: \
|
|
|
|
DST = DST OP IMM; \
|
|
|
|
CONT; \
|
|
|
|
ALU_##OPCODE##_K: \
|
|
|
|
DST = (u32) DST OP (u32) IMM; \
|
|
|
|
CONT;
|
|
|
|
|
|
|
|
ALU(ADD, +)
|
|
|
|
ALU(SUB, -)
|
|
|
|
ALU(AND, &)
|
|
|
|
ALU(OR, |)
|
|
|
|
ALU(LSH, <<)
|
|
|
|
ALU(RSH, >>)
|
|
|
|
ALU(XOR, ^)
|
|
|
|
ALU(MUL, *)
|
|
|
|
#undef ALU
|
|
|
|
ALU_NEG:
|
|
|
|
DST = (u32) -DST;
|
|
|
|
CONT;
|
|
|
|
ALU64_NEG:
|
|
|
|
DST = -DST;
|
|
|
|
CONT;
|
|
|
|
ALU_MOV_X:
|
|
|
|
DST = (u32) SRC;
|
|
|
|
CONT;
|
|
|
|
ALU_MOV_K:
|
|
|
|
DST = (u32) IMM;
|
|
|
|
CONT;
|
|
|
|
ALU64_MOV_X:
|
|
|
|
DST = SRC;
|
|
|
|
CONT;
|
|
|
|
ALU64_MOV_K:
|
|
|
|
DST = IMM;
|
|
|
|
CONT;
|
net: filter: add "load 64-bit immediate" eBPF instruction
add BPF_LD_IMM64 instruction to load 64-bit immediate value into a register.
All previous instructions were 8-byte. This is first 16-byte instruction.
Two consecutive 'struct bpf_insn' blocks are interpreted as single instruction:
insn[0].code = BPF_LD | BPF_DW | BPF_IMM
insn[0].dst_reg = destination register
insn[0].imm = lower 32-bit
insn[1].code = 0
insn[1].imm = upper 32-bit
All unused fields must be zero.
Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM
which loads 32-bit immediate value into a register.
x64 JITs it as single 'movabsq %rax, imm64'
arm64 may JIT as sequence of four 'movk x0, #imm16, lsl #shift' insn
Note that old eBPF programs are binary compatible with new interpreter.
It helps eBPF programs load 64-bit constant into a register with one
instruction instead of using two registers and 4 instructions:
BPF_MOV32_IMM(R1, imm32)
BPF_ALU64_IMM(BPF_LSH, R1, 32)
BPF_MOV32_IMM(R2, imm32)
BPF_ALU64_REG(BPF_OR, R1, R2)
User space generated programs will use this instruction to load constants only.
To tell kernel that user space needs a pointer the _pseudo_ variant of
this instruction may be added later, which will use extra bits of encoding
to indicate what type of pointer user space is asking kernel to provide.
For example 'off' or 'src_reg' fields can be used for such purpose.
src_reg = 1 could mean that user space is asking kernel to validate and
load in-kernel map pointer.
src_reg = 2 could mean that user space needs readonly data section pointer
src_reg = 3 could mean that user space needs a pointer to per-cpu local data
All such future pseudo instructions will not be carrying the actual pointer
as part of the instruction, but rather will be treated as a request to kernel
to provide one. The kernel will verify the request_for_a_pointer, then
will drop _pseudo_ marking and will store actual internal pointer inside
the instruction, so the end result is the interpreter and JITs never
see pseudo BPF_LD_IMM64 insns and only operate on generic BPF_LD_IMM64 that
loads 64-bit immediate into a register. User space never operates on direct
pointers and verifier can easily recognize request_for_pointer vs other
instructions.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-09-05 13:17:17 +08:00
|
|
|
LD_IMM_DW:
|
|
|
|
DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
|
|
|
|
insn++;
|
|
|
|
CONT;
|
2014-07-23 14:01:58 +08:00
|
|
|
ALU64_ARSH_X:
|
|
|
|
(*(s64 *) &DST) >>= SRC;
|
|
|
|
CONT;
|
|
|
|
ALU64_ARSH_K:
|
|
|
|
(*(s64 *) &DST) >>= IMM;
|
|
|
|
CONT;
|
|
|
|
ALU64_MOD_X:
|
|
|
|
if (unlikely(SRC == 0))
|
|
|
|
return 0;
|
2015-04-28 05:40:37 +08:00
|
|
|
div64_u64_rem(DST, SRC, &tmp);
|
|
|
|
DST = tmp;
|
2014-07-23 14:01:58 +08:00
|
|
|
CONT;
|
|
|
|
ALU_MOD_X:
|
|
|
|
if (unlikely(SRC == 0))
|
|
|
|
return 0;
|
|
|
|
tmp = (u32) DST;
|
|
|
|
DST = do_div(tmp, (u32) SRC);
|
|
|
|
CONT;
|
|
|
|
ALU64_MOD_K:
|
2015-04-28 05:40:37 +08:00
|
|
|
div64_u64_rem(DST, IMM, &tmp);
|
|
|
|
DST = tmp;
|
2014-07-23 14:01:58 +08:00
|
|
|
CONT;
|
|
|
|
ALU_MOD_K:
|
|
|
|
tmp = (u32) DST;
|
|
|
|
DST = do_div(tmp, (u32) IMM);
|
|
|
|
CONT;
|
|
|
|
ALU64_DIV_X:
|
|
|
|
if (unlikely(SRC == 0))
|
|
|
|
return 0;
|
2015-04-28 05:40:37 +08:00
|
|
|
DST = div64_u64(DST, SRC);
|
2014-07-23 14:01:58 +08:00
|
|
|
CONT;
|
|
|
|
ALU_DIV_X:
|
|
|
|
if (unlikely(SRC == 0))
|
|
|
|
return 0;
|
|
|
|
tmp = (u32) DST;
|
|
|
|
do_div(tmp, (u32) SRC);
|
|
|
|
DST = (u32) tmp;
|
|
|
|
CONT;
|
|
|
|
ALU64_DIV_K:
|
2015-04-28 05:40:37 +08:00
|
|
|
DST = div64_u64(DST, IMM);
|
2014-07-23 14:01:58 +08:00
|
|
|
CONT;
|
|
|
|
ALU_DIV_K:
|
|
|
|
tmp = (u32) DST;
|
|
|
|
do_div(tmp, (u32) IMM);
|
|
|
|
DST = (u32) tmp;
|
|
|
|
CONT;
|
|
|
|
ALU_END_TO_BE:
|
|
|
|
switch (IMM) {
|
|
|
|
case 16:
|
|
|
|
DST = (__force u16) cpu_to_be16(DST);
|
|
|
|
break;
|
|
|
|
case 32:
|
|
|
|
DST = (__force u32) cpu_to_be32(DST);
|
|
|
|
break;
|
|
|
|
case 64:
|
|
|
|
DST = (__force u64) cpu_to_be64(DST);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
CONT;
|
|
|
|
ALU_END_TO_LE:
|
|
|
|
switch (IMM) {
|
|
|
|
case 16:
|
|
|
|
DST = (__force u16) cpu_to_le16(DST);
|
|
|
|
break;
|
|
|
|
case 32:
|
|
|
|
DST = (__force u32) cpu_to_le32(DST);
|
|
|
|
break;
|
|
|
|
case 64:
|
|
|
|
DST = (__force u64) cpu_to_le64(DST);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
CONT;
|
|
|
|
|
|
|
|
/* CALL */
|
|
|
|
JMP_CALL:
|
|
|
|
/* Function call scratches BPF_R1-BPF_R5 registers,
|
|
|
|
* preserves BPF_R6-BPF_R9, and stores return value
|
|
|
|
* into BPF_R0.
|
|
|
|
*/
|
|
|
|
BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
|
|
|
|
BPF_R4, BPF_R5);
|
|
|
|
CONT;
|
|
|
|
|
bpf: allow bpf programs to tail-call other bpf programs
introduce bpf_tail_call(ctx, &jmp_table, index) helper function
which can be used from BPF programs like:
int bpf_prog(struct pt_regs *ctx)
{
...
bpf_tail_call(ctx, &jmp_table, index);
...
}
that is roughly equivalent to:
int bpf_prog(struct pt_regs *ctx)
{
...
if (jmp_table[index])
return (*jmp_table[index])(ctx);
...
}
The important detail is that it's not a normal call, but a tail call.
The kernel stack is precious, so this helper reuses the current
stack frame and jumps into another BPF program without adding
extra call frame.
It's trivially done in interpreter and a bit trickier in JITs.
In case of x64 JIT the bigger part of generated assembler prologue
is common for all programs, so it is simply skipped while jumping.
Other JITs can do similar prologue-skipping optimization or
do stack unwind before jumping into the next program.
bpf_tail_call() arguments:
ctx - context pointer
jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table
index - index in the jump table
Since all BPF programs are identified by file descriptor, user space
needs to populate the jmp_table with FDs of other BPF programs.
If jmp_table[index] is empty the bpf_tail_call() doesn't jump anywhere
and program execution continues as normal.
New BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can
populate this jmp_table array with FDs of other bpf programs.
Programs can share the same jmp_table array or use multiple jmp_tables.
The chain of tail calls can form unpredictable dynamic loops therefore
tail_call_cnt is used to limit the number of calls and currently is set to 32.
Use cases:
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
==========
- simplify complex programs by splitting them into a sequence of small programs
- dispatch routine
For tracing and future seccomp the program may be triggered on all system
calls, but processing of syscall arguments will be different. It's more
efficient to implement them as:
int syscall_entry(struct seccomp_data *ctx)
{
bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */);
... default: process unknown syscall ...
}
int sys_write_event(struct seccomp_data *ctx) {...}
int sys_read_event(struct seccomp_data *ctx) {...}
syscall_jmp_table[__NR_write] = sys_write_event;
syscall_jmp_table[__NR_read] = sys_read_event;
For networking the program may call into different parsers depending on
packet format, like:
int packet_parser(struct __sk_buff *skb)
{
... parse L2, L3 here ...
__u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol));
bpf_tail_call(skb, &ipproto_jmp_table, ipproto);
... default: process unknown protocol ...
}
int parse_tcp(struct __sk_buff *skb) {...}
int parse_udp(struct __sk_buff *skb) {...}
ipproto_jmp_table[IPPROTO_TCP] = parse_tcp;
ipproto_jmp_table[IPPROTO_UDP] = parse_udp;
- for TC use case, bpf_tail_call() allows to implement reclassify-like logic
- bpf_map_update_elem/delete calls into BPF_MAP_TYPE_PROG_ARRAY jump table
are atomic, so user space can build chains of BPF programs on the fly
Implementation details:
=======================
- high performance of bpf_tail_call() is the goal.
It could have been implemented without JIT changes as a wrapper on top of
BPF_PROG_RUN() macro, but with two downsides:
. all programs would have to pay performance penalty for this feature and
tail call itself would be slower, since mandatory stack unwind, return,
stack allocate would be done for every tailcall.
. tailcall would be limited to programs running preempt_disabled, since
generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would
need to be either global per_cpu variable accessed by helper and by wrapper
or global variable protected by locks.
In this implementation x64 JIT bypasses stack unwind and jumps into the
callee program after prologue.
- bpf_prog_array_compatible() ensures that prog_type of callee and caller
are the same and JITed/non-JITed flag is the same, since calling JITed
program from non-JITed is invalid, since stack frames are different.
Similarly calling kprobe type program from socket type program is invalid.
- jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse 'map'
abstraction, its user space API and all of verifier logic.
It's in the existing arraymap.c file, since several functions are
shared with regular array map.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:03 +08:00
|
|
|
JMP_TAIL_CALL: {
|
|
|
|
struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
|
|
|
|
struct bpf_array *array = container_of(map, struct bpf_array, map);
|
|
|
|
struct bpf_prog *prog;
|
|
|
|
u64 index = BPF_R3;
|
|
|
|
|
|
|
|
if (unlikely(index >= array->map.max_entries))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
tail_call_cnt++;
|
|
|
|
|
|
|
|
prog = READ_ONCE(array->prog[index]);
|
|
|
|
if (unlikely(!prog))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ARG1 = BPF_R1;
|
|
|
|
insn = prog->insnsi;
|
|
|
|
goto select_insn;
|
|
|
|
out:
|
|
|
|
CONT;
|
|
|
|
}
|
2014-07-23 14:01:58 +08:00
|
|
|
/* JMP */
|
|
|
|
JMP_JA:
|
|
|
|
insn += insn->off;
|
|
|
|
CONT;
|
|
|
|
JMP_JEQ_X:
|
|
|
|
if (DST == SRC) {
|
|
|
|
insn += insn->off;
|
|
|
|
CONT_JMP;
|
|
|
|
}
|
|
|
|
CONT;
|
|
|
|
JMP_JEQ_K:
|
|
|
|
if (DST == IMM) {
|
|
|
|
insn += insn->off;
|
|
|
|
CONT_JMP;
|
|
|
|
}
|
|
|
|
CONT;
|
|
|
|
JMP_JNE_X:
|
|
|
|
if (DST != SRC) {
|
|
|
|
insn += insn->off;
|
|
|
|
CONT_JMP;
|
|
|
|
}
|
|
|
|
CONT;
|
|
|
|
JMP_JNE_K:
|
|
|
|
if (DST != IMM) {
|
|
|
|
insn += insn->off;
|
|
|
|
CONT_JMP;
|
|
|
|
}
|
|
|
|
CONT;
|
|
|
|
JMP_JGT_X:
|
|
|
|
if (DST > SRC) {
|
|
|
|
insn += insn->off;
|
|
|
|
CONT_JMP;
|
|
|
|
}
|
|
|
|
CONT;
|
|
|
|
JMP_JGT_K:
|
|
|
|
if (DST > IMM) {
|
|
|
|
insn += insn->off;
|
|
|
|
CONT_JMP;
|
|
|
|
}
|
|
|
|
CONT;
|
|
|
|
JMP_JGE_X:
|
|
|
|
if (DST >= SRC) {
|
|
|
|
insn += insn->off;
|
|
|
|
CONT_JMP;
|
|
|
|
}
|
|
|
|
CONT;
|
|
|
|
JMP_JGE_K:
|
|
|
|
if (DST >= IMM) {
|
|
|
|
insn += insn->off;
|
|
|
|
CONT_JMP;
|
|
|
|
}
|
|
|
|
CONT;
|
|
|
|
JMP_JSGT_X:
|
|
|
|
if (((s64) DST) > ((s64) SRC)) {
|
|
|
|
insn += insn->off;
|
|
|
|
CONT_JMP;
|
|
|
|
}
|
|
|
|
CONT;
|
|
|
|
JMP_JSGT_K:
|
|
|
|
if (((s64) DST) > ((s64) IMM)) {
|
|
|
|
insn += insn->off;
|
|
|
|
CONT_JMP;
|
|
|
|
}
|
|
|
|
CONT;
|
|
|
|
JMP_JSGE_X:
|
|
|
|
if (((s64) DST) >= ((s64) SRC)) {
|
|
|
|
insn += insn->off;
|
|
|
|
CONT_JMP;
|
|
|
|
}
|
|
|
|
CONT;
|
|
|
|
JMP_JSGE_K:
|
|
|
|
if (((s64) DST) >= ((s64) IMM)) {
|
|
|
|
insn += insn->off;
|
|
|
|
CONT_JMP;
|
|
|
|
}
|
|
|
|
CONT;
|
|
|
|
JMP_JSET_X:
|
|
|
|
if (DST & SRC) {
|
|
|
|
insn += insn->off;
|
|
|
|
CONT_JMP;
|
|
|
|
}
|
|
|
|
CONT;
|
|
|
|
JMP_JSET_K:
|
|
|
|
if (DST & IMM) {
|
|
|
|
insn += insn->off;
|
|
|
|
CONT_JMP;
|
|
|
|
}
|
|
|
|
CONT;
|
|
|
|
JMP_EXIT:
|
|
|
|
return BPF_R0;
|
|
|
|
|
|
|
|
/* STX and ST and LDX */
|
|
|
|
#define LDST(SIZEOP, SIZE) \
|
|
|
|
STX_MEM_##SIZEOP: \
|
|
|
|
*(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
|
|
|
|
CONT; \
|
|
|
|
ST_MEM_##SIZEOP: \
|
|
|
|
*(SIZE *)(unsigned long) (DST + insn->off) = IMM; \
|
|
|
|
CONT; \
|
|
|
|
LDX_MEM_##SIZEOP: \
|
|
|
|
DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
|
|
|
|
CONT;
|
|
|
|
|
|
|
|
LDST(B, u8)
|
|
|
|
LDST(H, u16)
|
|
|
|
LDST(W, u32)
|
|
|
|
LDST(DW, u64)
|
|
|
|
#undef LDST
|
|
|
|
STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
|
|
|
|
atomic_add((u32) SRC, (atomic_t *)(unsigned long)
|
|
|
|
(DST + insn->off));
|
|
|
|
CONT;
|
|
|
|
STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
|
|
|
|
atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
|
|
|
|
(DST + insn->off));
|
|
|
|
CONT;
|
|
|
|
LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
|
|
|
|
off = IMM;
|
|
|
|
load_word:
|
|
|
|
/* BPF_LD + BPF_ABS and BPF_LD + BPF_IND insns are
|
|
|
|
* only appearing in the programs where ctx ==
|
|
|
|
* skb. All programs keep 'ctx' in regs[BPF_REG_CTX]
|
2014-07-31 11:34:15 +08:00
|
|
|
* == BPF_R6, bpf_convert_filter() saves it in BPF_R6,
|
2014-07-23 14:01:58 +08:00
|
|
|
* internal BPF verifier will check that BPF_R6 ==
|
|
|
|
* ctx.
|
|
|
|
*
|
|
|
|
* BPF_ABS and BPF_IND are wrappers of function calls,
|
|
|
|
* so they scratch BPF_R1-BPF_R5 registers, preserve
|
|
|
|
* BPF_R6-BPF_R9, and store return value into BPF_R0.
|
|
|
|
*
|
|
|
|
* Implicit input:
|
|
|
|
* ctx == skb == BPF_R6 == CTX
|
|
|
|
*
|
|
|
|
* Explicit input:
|
|
|
|
* SRC == any register
|
|
|
|
* IMM == 32-bit immediate
|
|
|
|
*
|
|
|
|
* Output:
|
|
|
|
* BPF_R0 - 8/16/32-bit skb data converted to cpu endianness
|
|
|
|
*/
|
|
|
|
|
|
|
|
ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp);
|
|
|
|
if (likely(ptr != NULL)) {
|
|
|
|
BPF_R0 = get_unaligned_be32(ptr);
|
|
|
|
CONT;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */
|
|
|
|
off = IMM;
|
|
|
|
load_half:
|
|
|
|
ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp);
|
|
|
|
if (likely(ptr != NULL)) {
|
|
|
|
BPF_R0 = get_unaligned_be16(ptr);
|
|
|
|
CONT;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */
|
|
|
|
off = IMM;
|
|
|
|
load_byte:
|
|
|
|
ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp);
|
|
|
|
if (likely(ptr != NULL)) {
|
|
|
|
BPF_R0 = *(u8 *)ptr;
|
|
|
|
CONT;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */
|
|
|
|
off = IMM + SRC;
|
|
|
|
goto load_word;
|
|
|
|
LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */
|
|
|
|
off = IMM + SRC;
|
|
|
|
goto load_half;
|
|
|
|
LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */
|
|
|
|
off = IMM + SRC;
|
|
|
|
goto load_byte;
|
|
|
|
|
|
|
|
default_label:
|
|
|
|
/* If we ever reach this, we have a bug somewhere. */
|
|
|
|
WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-05-30 05:23:07 +08:00
|
|
|
bool bpf_prog_array_compatible(struct bpf_array *array,
|
|
|
|
const struct bpf_prog *fp)
|
bpf: allow bpf programs to tail-call other bpf programs
introduce bpf_tail_call(ctx, &jmp_table, index) helper function
which can be used from BPF programs like:
int bpf_prog(struct pt_regs *ctx)
{
...
bpf_tail_call(ctx, &jmp_table, index);
...
}
that is roughly equivalent to:
int bpf_prog(struct pt_regs *ctx)
{
...
if (jmp_table[index])
return (*jmp_table[index])(ctx);
...
}
The important detail that it's not a normal call, but a tail call.
The kernel stack is precious, so this helper reuses the current
stack frame and jumps into another BPF program without adding
extra call frame.
It's trivially done in interpreter and a bit trickier in JITs.
In case of x64 JIT the bigger part of generated assembler prologue
is common for all programs, so it is simply skipped while jumping.
Other JITs can do similar prologue-skipping optimization or
do stack unwind before jumping into the next program.
bpf_tail_call() arguments:
ctx - context pointer
jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table
index - index in the jump table
Since all BPF programs are idenitified by file descriptor, user space
need to populate the jmp_table with FDs of other BPF programs.
If jmp_table[index] is empty the bpf_tail_call() doesn't jump anywhere
and program execution continues as normal.
New BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can
populate this jmp_table array with FDs of other bpf programs.
Programs can share the same jmp_table array or use multiple jmp_tables.
The chain of tail calls can form unpredictable dynamic loops therefore
tail_call_cnt is used to limit the number of calls and currently is set to 32.
Use cases:
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
==========
- simplify complex programs by splitting them into a sequence of small programs
- dispatch routine
For tracing and future seccomp the program may be triggered on all system
calls, but processing of syscall arguments will be different. It's more
efficient to implement them as:
int syscall_entry(struct seccomp_data *ctx)
{
bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */);
... default: process unknown syscall ...
}
int sys_write_event(struct seccomp_data *ctx) {...}
int sys_read_event(struct seccomp_data *ctx) {...}
syscall_jmp_table[__NR_write] = sys_write_event;
syscall_jmp_table[__NR_read] = sys_read_event;
For networking the program may call into different parsers depending on
packet format, like:
int packet_parser(struct __sk_buff *skb)
{
... parse L2, L3 here ...
__u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol));
bpf_tail_call(skb, &ipproto_jmp_table, ipproto);
... default: process unknown protocol ...
}
int parse_tcp(struct __sk_buff *skb) {...}
int parse_udp(struct __sk_buff *skb) {...}
ipproto_jmp_table[IPPROTO_TCP] = parse_tcp;
ipproto_jmp_table[IPPROTO_UDP] = parse_udp;
- for TC use case, bpf_tail_call() allows to implement reclassify-like logic
- bpf_map_update_elem/delete calls into BPF_MAP_TYPE_PROG_ARRAY jump table
are atomic, so user space can build chains of BPF programs on the fly
Implementation details:
=======================
- high performance of bpf_tail_call() is the goal.
It could have been implemented without JIT changes as a wrapper on top of
BPF_PROG_RUN() macro, but with two downsides:
. all programs would have to pay performance penalty for this feature and
tail call itself would be slower, since mandatory stack unwind, return,
stack allocate would be done for every tailcall.
. tailcall would be limited to programs running preempt_disabled, since
generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would
need to be either global per_cpu variable accessed by helper and by wrapper
or global variable protected by locks.
In this implementation x64 JIT bypasses stack unwind and jumps into the
callee program after prologue.
- bpf_prog_array_compatible() ensures that prog_type of callee and caller
are the same and JITed/non-JITed flag is the same, since calling JITed
program from non-JITed is invalid, since stack frames are different.
Similarly calling kprobe type program from socket type program is invalid.
- jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse 'map'
abstraction, its user space API and all of verifier logic.
It's in the existing arraymap.c file, since several functions are
shared with regular array map.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:03 +08:00
|
|
|
{
|
2015-05-30 05:23:07 +08:00
|
|
|
if (!array->owner_prog_type) {
|
|
|
|
/* There's no owner yet where we could check for
|
|
|
|
* compatibility.
|
|
|
|
*/
|
bpf: allow bpf programs to tail-call other bpf programs
introduce bpf_tail_call(ctx, &jmp_table, index) helper function
which can be used from BPF programs like:
int bpf_prog(struct pt_regs *ctx)
{
...
bpf_tail_call(ctx, &jmp_table, index);
...
}
that is roughly equivalent to:
int bpf_prog(struct pt_regs *ctx)
{
...
if (jmp_table[index])
return (*jmp_table[index])(ctx);
...
}
The important detail that it's not a normal call, but a tail call.
The kernel stack is precious, so this helper reuses the current
stack frame and jumps into another BPF program without adding
extra call frame.
It's trivially done in interpreter and a bit trickier in JITs.
In case of x64 JIT the bigger part of generated assembler prologue
is common for all programs, so it is simply skipped while jumping.
Other JITs can do similar prologue-skipping optimization or
do stack unwind before jumping into the next program.
bpf_tail_call() arguments:
ctx - context pointer
jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table
index - index in the jump table
Since all BPF programs are idenitified by file descriptor, user space
need to populate the jmp_table with FDs of other BPF programs.
If jmp_table[index] is empty the bpf_tail_call() doesn't jump anywhere
and program execution continues as normal.
New BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can
populate this jmp_table array with FDs of other bpf programs.
Programs can share the same jmp_table array or use multiple jmp_tables.
The chain of tail calls can form unpredictable dynamic loops therefore
tail_call_cnt is used to limit the number of calls and currently is set to 32.
Use cases:
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
==========
- simplify complex programs by splitting them into a sequence of small programs
- dispatch routine
For tracing and future seccomp the program may be triggered on all system
calls, but processing of syscall arguments will be different. It's more
efficient to implement them as:
int syscall_entry(struct seccomp_data *ctx)
{
bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */);
... default: process unknown syscall ...
}
int sys_write_event(struct seccomp_data *ctx) {...}
int sys_read_event(struct seccomp_data *ctx) {...}
syscall_jmp_table[__NR_write] = sys_write_event;
syscall_jmp_table[__NR_read] = sys_read_event;
For networking the program may call into different parsers depending on
packet format, like:
int packet_parser(struct __sk_buff *skb)
{
... parse L2, L3 here ...
__u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol));
bpf_tail_call(skb, &ipproto_jmp_table, ipproto);
... default: process unknown protocol ...
}
int parse_tcp(struct __sk_buff *skb) {...}
int parse_udp(struct __sk_buff *skb) {...}
ipproto_jmp_table[IPPROTO_TCP] = parse_tcp;
ipproto_jmp_table[IPPROTO_UDP] = parse_udp;
- for TC use case, bpf_tail_call() allows to implement reclassify-like logic
- bpf_map_update_elem/delete calls into BPF_MAP_TYPE_PROG_ARRAY jump table
are atomic, so user space can build chains of BPF programs on the fly
Implementation details:
=======================
- high performance of bpf_tail_call() is the goal.
It could have been implemented without JIT changes as a wrapper on top of
BPF_PROG_RUN() macro, but with two downsides:
. all programs would have to pay performance penalty for this feature and
tail call itself would be slower, since mandatory stack unwind, return,
stack allocate would be done for every tailcall.
. tailcall would be limited to programs running preempt_disabled, since
generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would
need to be either global per_cpu variable accessed by helper and by wrapper
or global variable protected by locks.
In this implementation x64 JIT bypasses stack unwind and jumps into the
callee program after prologue.
- bpf_prog_array_compatible() ensures that prog_type of callee and caller
are the same and JITed/non-JITed flag is the same, since calling JITed
program from non-JITed is invalid, since stack frames are different.
Similarly calling kprobe type program from socket type program is invalid.
- jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse 'map'
abstraction, its user space API and all of verifier logic.
It's in the existing arraymap.c file, since several functions are
shared with regular array map.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:03 +08:00
|
|
|
array->owner_prog_type = fp->type;
|
|
|
|
array->owner_jited = fp->jited;
|
2015-05-30 05:23:07 +08:00
|
|
|
|
|
|
|
return true;
|
bpf: allow bpf programs to tail-call other bpf programs
introduce bpf_tail_call(ctx, &jmp_table, index) helper function
which can be used from BPF programs like:
int bpf_prog(struct pt_regs *ctx)
{
...
bpf_tail_call(ctx, &jmp_table, index);
...
}
that is roughly equivalent to:
int bpf_prog(struct pt_regs *ctx)
{
...
if (jmp_table[index])
return (*jmp_table[index])(ctx);
...
}
The important detail that it's not a normal call, but a tail call.
The kernel stack is precious, so this helper reuses the current
stack frame and jumps into another BPF program without adding
extra call frame.
It's trivially done in interpreter and a bit trickier in JITs.
In case of x64 JIT the bigger part of generated assembler prologue
is common for all programs, so it is simply skipped while jumping.
Other JITs can do similar prologue-skipping optimization or
do stack unwind before jumping into the next program.
bpf_tail_call() arguments:
ctx - context pointer
jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table
index - index in the jump table
Since all BPF programs are idenitified by file descriptor, user space
need to populate the jmp_table with FDs of other BPF programs.
If jmp_table[index] is empty the bpf_tail_call() doesn't jump anywhere
and program execution continues as normal.
New BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can
populate this jmp_table array with FDs of other bpf programs.
Programs can share the same jmp_table array or use multiple jmp_tables.
The chain of tail calls can form unpredictable dynamic loops therefore
tail_call_cnt is used to limit the number of calls and currently is set to 32.
Use cases:
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
==========
- simplify complex programs by splitting them into a sequence of small programs
- dispatch routine
For tracing and future seccomp the program may be triggered on all system
calls, but processing of syscall arguments will be different. It's more
efficient to implement them as:
int syscall_entry(struct seccomp_data *ctx)
{
bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */);
... default: process unknown syscall ...
}
int sys_write_event(struct seccomp_data *ctx) {...}
int sys_read_event(struct seccomp_data *ctx) {...}
syscall_jmp_table[__NR_write] = sys_write_event;
syscall_jmp_table[__NR_read] = sys_read_event;
For networking the program may call into different parsers depending on
packet format, like:
int packet_parser(struct __sk_buff *skb)
{
... parse L2, L3 here ...
__u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol));
bpf_tail_call(skb, &ipproto_jmp_table, ipproto);
... default: process unknown protocol ...
}
int parse_tcp(struct __sk_buff *skb) {...}
int parse_udp(struct __sk_buff *skb) {...}
ipproto_jmp_table[IPPROTO_TCP] = parse_tcp;
ipproto_jmp_table[IPPROTO_UDP] = parse_udp;
- for TC use case, bpf_tail_call() allows to implement reclassify-like logic
- bpf_map_update_elem/delete calls into BPF_MAP_TYPE_PROG_ARRAY jump table
are atomic, so user space can build chains of BPF programs on the fly
Implementation details:
=======================
- high performance of bpf_tail_call() is the goal.
It could have been implemented without JIT changes as a wrapper on top of
BPF_PROG_RUN() macro, but with two downsides:
. all programs would have to pay performance penalty for this feature and
tail call itself would be slower, since mandatory stack unwind, return,
stack allocate would be done for every tailcall.
. tailcall would be limited to programs running preempt_disabled, since
generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would
need to be either global per_cpu variable accessed by helper and by wrapper
or global variable protected by locks.
In this implementation x64 JIT bypasses stack unwind and jumps into the
callee program after prologue.
- bpf_prog_array_compatible() ensures that prog_type of callee and caller
are the same and JITed/non-JITed flag is the same, since calling JITed
program from non-JITed is invalid, since stack frames are different.
Similarly calling kprobe type program from socket type program is invalid.
- jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse 'map'
abstraction, its user space API and all of verifier logic.
It's in the existing arraymap.c file, since several functions are
shared with regular array map.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:03 +08:00
|
|
|
}
|
2015-05-30 05:23:07 +08:00
|
|
|
|
|
|
|
return array->owner_prog_type == fp->type &&
|
|
|
|
array->owner_jited == fp->jited;
|
bpf: allow bpf programs to tail-call other bpf programs
introduce bpf_tail_call(ctx, &jmp_table, index) helper function
which can be used from BPF programs like:
int bpf_prog(struct pt_regs *ctx)
{
...
bpf_tail_call(ctx, &jmp_table, index);
...
}
that is roughly equivalent to:
int bpf_prog(struct pt_regs *ctx)
{
...
if (jmp_table[index])
return (*jmp_table[index])(ctx);
...
}
The important detail that it's not a normal call, but a tail call.
The kernel stack is precious, so this helper reuses the current
stack frame and jumps into another BPF program without adding
extra call frame.
It's trivially done in interpreter and a bit trickier in JITs.
In case of x64 JIT the bigger part of generated assembler prologue
is common for all programs, so it is simply skipped while jumping.
Other JITs can do similar prologue-skipping optimization or
do stack unwind before jumping into the next program.
bpf_tail_call() arguments:
ctx - context pointer
jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table
index - index in the jump table
Since all BPF programs are idenitified by file descriptor, user space
need to populate the jmp_table with FDs of other BPF programs.
If jmp_table[index] is empty the bpf_tail_call() doesn't jump anywhere
and program execution continues as normal.
New BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can
populate this jmp_table array with FDs of other bpf programs.
Programs can share the same jmp_table array or use multiple jmp_tables.
The chain of tail calls can form unpredictable dynamic loops therefore
tail_call_cnt is used to limit the number of calls and currently is set to 32.
Use cases:
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
==========
- simplify complex programs by splitting them into a sequence of small programs
- dispatch routine
For tracing and future seccomp the program may be triggered on all system
calls, but processing of syscall arguments will be different. It's more
efficient to implement them as:
int syscall_entry(struct seccomp_data *ctx)
{
bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */);
... default: process unknown syscall ...
}
int sys_write_event(struct seccomp_data *ctx) {...}
int sys_read_event(struct seccomp_data *ctx) {...}
syscall_jmp_table[__NR_write] = sys_write_event;
syscall_jmp_table[__NR_read] = sys_read_event;
For networking the program may call into different parsers depending on
packet format, like:
int packet_parser(struct __sk_buff *skb)
{
... parse L2, L3 here ...
__u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol));
bpf_tail_call(skb, &ipproto_jmp_table, ipproto);
... default: process unknown protocol ...
}
int parse_tcp(struct __sk_buff *skb) {...}
int parse_udp(struct __sk_buff *skb) {...}
ipproto_jmp_table[IPPROTO_TCP] = parse_tcp;
ipproto_jmp_table[IPPROTO_UDP] = parse_udp;
- for TC use case, bpf_tail_call() allows to implement reclassify-like logic
- bpf_map_update_elem/delete calls into BPF_MAP_TYPE_PROG_ARRAY jump table
are atomic, so user space can build chains of BPF programs on the fly
Implementation details:
=======================
- high performance of bpf_tail_call() is the goal.
It could have been implemented without JIT changes as a wrapper on top of
BPF_PROG_RUN() macro, but with two downsides:
. all programs would have to pay performance penalty for this feature and
tail call itself would be slower, since mandatory stack unwind, return,
stack allocate would be done for every tailcall.
. tailcall would be limited to programs running preempt_disabled, since
generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would
need to be either global per_cpu variable accessed by helper and by wrapper
or global variable protected by locks.
In this implementation x64 JIT bypasses stack unwind and jumps into the
callee program after prologue.
- bpf_prog_array_compatible() ensures that prog_type of callee and caller
are the same and JITed/non-JITed flag is the same, since calling JITed
program from non-JITed is invalid, since stack frames are different.
Similarly calling kprobe type program from socket type program is invalid.
- jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse 'map'
abstraction, its user space API and all of verifier logic.
It's in the existing arraymap.c file, since several functions are
shared with regular array map.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:03 +08:00
|
|
|
}
|
|
|
|
|
2015-05-30 05:23:07 +08:00
|
|
|
static int bpf_check_tail_call(const struct bpf_prog *fp)
|
bpf: allow bpf programs to tail-call other bpf programs
introduce bpf_tail_call(ctx, &jmp_table, index) helper function
which can be used from BPF programs like:
int bpf_prog(struct pt_regs *ctx)
{
...
bpf_tail_call(ctx, &jmp_table, index);
...
}
that is roughly equivalent to:
int bpf_prog(struct pt_regs *ctx)
{
...
if (jmp_table[index])
return (*jmp_table[index])(ctx);
...
}
The important detail that it's not a normal call, but a tail call.
The kernel stack is precious, so this helper reuses the current
stack frame and jumps into another BPF program without adding
extra call frame.
It's trivially done in interpreter and a bit trickier in JITs.
In case of x64 JIT the bigger part of generated assembler prologue
is common for all programs, so it is simply skipped while jumping.
Other JITs can do similar prologue-skipping optimization or
do stack unwind before jumping into the next program.
bpf_tail_call() arguments:
ctx - context pointer
jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table
index - index in the jump table
Since all BPF programs are idenitified by file descriptor, user space
need to populate the jmp_table with FDs of other BPF programs.
If jmp_table[index] is empty the bpf_tail_call() doesn't jump anywhere
and program execution continues as normal.
New BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can
populate this jmp_table array with FDs of other bpf programs.
Programs can share the same jmp_table array or use multiple jmp_tables.
The chain of tail calls can form unpredictable dynamic loops therefore
tail_call_cnt is used to limit the number of calls and currently is set to 32.
Use cases:
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
==========
- simplify complex programs by splitting them into a sequence of small programs
- dispatch routine
For tracing and future seccomp the program may be triggered on all system
calls, but processing of syscall arguments will be different. It's more
efficient to implement them as:
int syscall_entry(struct seccomp_data *ctx)
{
bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */);
... default: process unknown syscall ...
}
int sys_write_event(struct seccomp_data *ctx) {...}
int sys_read_event(struct seccomp_data *ctx) {...}
syscall_jmp_table[__NR_write] = sys_write_event;
syscall_jmp_table[__NR_read] = sys_read_event;
For networking the program may call into different parsers depending on
packet format, like:
int packet_parser(struct __sk_buff *skb)
{
... parse L2, L3 here ...
__u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol));
bpf_tail_call(skb, &ipproto_jmp_table, ipproto);
... default: process unknown protocol ...
}
int parse_tcp(struct __sk_buff *skb) {...}
int parse_udp(struct __sk_buff *skb) {...}
ipproto_jmp_table[IPPROTO_TCP] = parse_tcp;
ipproto_jmp_table[IPPROTO_UDP] = parse_udp;
- for TC use case, bpf_tail_call() allows to implement reclassify-like logic
- bpf_map_update_elem/delete calls into BPF_MAP_TYPE_PROG_ARRAY jump table
are atomic, so user space can build chains of BPF programs on the fly
Implementation details:
=======================
- high performance of bpf_tail_call() is the goal.
It could have been implemented without JIT changes as a wrapper on top of
BPF_PROG_RUN() macro, but with two downsides:
. all programs would have to pay performance penalty for this feature and
tail call itself would be slower, since mandatory stack unwind, return,
stack allocate would be done for every tailcall.
. tailcall would be limited to programs running preempt_disabled, since
generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would
need to be either global per_cpu variable accessed by helper and by wrapper
or global variable protected by locks.
In this implementation x64 JIT bypasses stack unwind and jumps into the
callee program after prologue.
- bpf_prog_array_compatible() ensures that prog_type of callee and caller
are the same and JITed/non-JITed flag is the same, since calling JITed
program from non-JITed is invalid, since stack frames are different.
Similarly calling kprobe type program from socket type program is invalid.
- jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse 'map'
abstraction, its user space API and all of verifier logic.
It's in the existing arraymap.c file, since several functions are
shared with regular array map.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:03 +08:00
|
|
|
{
|
|
|
|
struct bpf_prog_aux *aux = fp->aux;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < aux->used_map_cnt; i++) {
|
2015-05-30 05:23:07 +08:00
|
|
|
struct bpf_map *map = aux->used_maps[i];
|
bpf: allow bpf programs to tail-call other bpf programs
introduce bpf_tail_call(ctx, &jmp_table, index) helper function
which can be used from BPF programs like:
int bpf_prog(struct pt_regs *ctx)
{
...
bpf_tail_call(ctx, &jmp_table, index);
...
}
that is roughly equivalent to:
int bpf_prog(struct pt_regs *ctx)
{
...
if (jmp_table[index])
return (*jmp_table[index])(ctx);
...
}
The important detail that it's not a normal call, but a tail call.
The kernel stack is precious, so this helper reuses the current
stack frame and jumps into another BPF program without adding
extra call frame.
It's trivially done in interpreter and a bit trickier in JITs.
In case of x64 JIT the bigger part of generated assembler prologue
is common for all programs, so it is simply skipped while jumping.
Other JITs can do similar prologue-skipping optimization or
do stack unwind before jumping into the next program.
bpf_tail_call() arguments:
ctx - context pointer
jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table
index - index in the jump table
Since all BPF programs are idenitified by file descriptor, user space
need to populate the jmp_table with FDs of other BPF programs.
If jmp_table[index] is empty the bpf_tail_call() doesn't jump anywhere
and program execution continues as normal.
New BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can
populate this jmp_table array with FDs of other bpf programs.
Programs can share the same jmp_table array or use multiple jmp_tables.
The chain of tail calls can form unpredictable dynamic loops therefore
tail_call_cnt is used to limit the number of calls and currently is set to 32.
Use cases:
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
==========
- simplify complex programs by splitting them into a sequence of small programs
- dispatch routine
For tracing and future seccomp the program may be triggered on all system
calls, but processing of syscall arguments will be different. It's more
efficient to implement them as:
int syscall_entry(struct seccomp_data *ctx)
{
bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */);
... default: process unknown syscall ...
}
int sys_write_event(struct seccomp_data *ctx) {...}
int sys_read_event(struct seccomp_data *ctx) {...}
syscall_jmp_table[__NR_write] = sys_write_event;
syscall_jmp_table[__NR_read] = sys_read_event;
For networking the program may call into different parsers depending on
packet format, like:
int packet_parser(struct __sk_buff *skb)
{
... parse L2, L3 here ...
__u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol));
bpf_tail_call(skb, &ipproto_jmp_table, ipproto);
... default: process unknown protocol ...
}
int parse_tcp(struct __sk_buff *skb) {...}
int parse_udp(struct __sk_buff *skb) {...}
ipproto_jmp_table[IPPROTO_TCP] = parse_tcp;
ipproto_jmp_table[IPPROTO_UDP] = parse_udp;
- for TC use case, bpf_tail_call() allows to implement reclassify-like logic
- bpf_map_update_elem/delete calls into BPF_MAP_TYPE_PROG_ARRAY jump table
are atomic, so user space can build chains of BPF programs on the fly
Implementation details:
=======================
- high performance of bpf_tail_call() is the goal.
It could have been implemented without JIT changes as a wrapper on top of
BPF_PROG_RUN() macro, but with two downsides:
. all programs would have to pay performance penalty for this feature and
tail call itself would be slower, since mandatory stack unwind, return,
stack allocate would be done for every tailcall.
. tailcall would be limited to programs running preempt_disabled, since
generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would
need to be either global per_cpu variable accessed by helper and by wrapper
or global variable protected by locks.
In this implementation x64 JIT bypasses stack unwind and jumps into the
callee program after prologue.
- bpf_prog_array_compatible() ensures that prog_type of callee and caller
are the same and JITed/non-JITed flag is the same, since calling JITed
program from non-JITed is invalid, since stack frames are different.
Similarly calling kprobe type program from socket type program is invalid.
- jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse 'map'
abstraction, its user space API and all of verifier logic.
It's in the existing arraymap.c file, since several functions are
shared with regular array map.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:03 +08:00
|
|
|
struct bpf_array *array;
|
|
|
|
|
|
|
|
if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
|
|
|
|
continue;
|
2015-05-30 05:23:07 +08:00
|
|
|
|
bpf: allow bpf programs to tail-call other bpf programs
introduce bpf_tail_call(ctx, &jmp_table, index) helper function
which can be used from BPF programs like:
int bpf_prog(struct pt_regs *ctx)
{
...
bpf_tail_call(ctx, &jmp_table, index);
...
}
that is roughly equivalent to:
int bpf_prog(struct pt_regs *ctx)
{
...
if (jmp_table[index])
return (*jmp_table[index])(ctx);
...
}
The important detail that it's not a normal call, but a tail call.
The kernel stack is precious, so this helper reuses the current
stack frame and jumps into another BPF program without adding
extra call frame.
It's trivially done in interpreter and a bit trickier in JITs.
In case of x64 JIT the bigger part of generated assembler prologue
is common for all programs, so it is simply skipped while jumping.
Other JITs can do similar prologue-skipping optimization or
do stack unwind before jumping into the next program.
bpf_tail_call() arguments:
ctx - context pointer
jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table
index - index in the jump table
Since all BPF programs are idenitified by file descriptor, user space
need to populate the jmp_table with FDs of other BPF programs.
If jmp_table[index] is empty the bpf_tail_call() doesn't jump anywhere
and program execution continues as normal.
New BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can
populate this jmp_table array with FDs of other bpf programs.
Programs can share the same jmp_table array or use multiple jmp_tables.
The chain of tail calls can form unpredictable dynamic loops therefore
tail_call_cnt is used to limit the number of calls and currently is set to 32.
Use cases:
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
==========
- simplify complex programs by splitting them into a sequence of small programs
- dispatch routine
For tracing and future seccomp the program may be triggered on all system
calls, but processing of syscall arguments will be different. It's more
efficient to implement them as:
int syscall_entry(struct seccomp_data *ctx)
{
bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */);
... default: process unknown syscall ...
}
int sys_write_event(struct seccomp_data *ctx) {...}
int sys_read_event(struct seccomp_data *ctx) {...}
syscall_jmp_table[__NR_write] = sys_write_event;
syscall_jmp_table[__NR_read] = sys_read_event;
For networking the program may call into different parsers depending on
packet format, like:
int packet_parser(struct __sk_buff *skb)
{
... parse L2, L3 here ...
__u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol));
bpf_tail_call(skb, &ipproto_jmp_table, ipproto);
... default: process unknown protocol ...
}
int parse_tcp(struct __sk_buff *skb) {...}
int parse_udp(struct __sk_buff *skb) {...}
ipproto_jmp_table[IPPROTO_TCP] = parse_tcp;
ipproto_jmp_table[IPPROTO_UDP] = parse_udp;
- for TC use case, bpf_tail_call() allows to implement reclassify-like logic
- bpf_map_update_elem/delete calls into BPF_MAP_TYPE_PROG_ARRAY jump table
are atomic, so user space can build chains of BPF programs on the fly
Implementation details:
=======================
- high performance of bpf_tail_call() is the goal.
It could have been implemented without JIT changes as a wrapper on top of
BPF_PROG_RUN() macro, but with two downsides:
. all programs would have to pay performance penalty for this feature and
tail call itself would be slower, since mandatory stack unwind, return,
stack allocate would be done for every tailcall.
. tailcall would be limited to programs running preempt_disabled, since
generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would
need to be either global per_cpu variable accessed by helper and by wrapper
or global variable protected by locks.
In this implementation x64 JIT bypasses stack unwind and jumps into the
callee program after prologue.
- bpf_prog_array_compatible() ensures that prog_type of callee and caller
are the same and JITed/non-JITed flag is the same, since calling JITed
program from non-JITed is invalid, since stack frames are different.
Similarly calling kprobe type program from socket type program is invalid.
- jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse 'map'
abstraction, its user space API and all of verifier logic.
It's in the existing arraymap.c file, since several functions are
shared with regular array map.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:03 +08:00
|
|
|
array = container_of(map, struct bpf_array, map);
|
|
|
|
if (!bpf_prog_array_compatible(array, fp))
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-07-23 14:01:58 +08:00
|
|
|
/**
|
2015-05-30 05:23:07 +08:00
|
|
|
* bpf_prog_select_runtime - select exec runtime for BPF program
|
net: filter: split 'struct sk_filter' into socket and bpf parts
clean up names related to socket filtering and bpf in the following way:
- everything that deals with sockets keeps 'sk_*' prefix
- everything that is pure BPF is changed to 'bpf_*' prefix
split 'struct sk_filter' into
struct sk_filter {
atomic_t refcnt;
struct rcu_head rcu;
struct bpf_prog *prog;
};
and
struct bpf_prog {
u32 jited:1,
len:31;
struct sock_fprog_kern *orig_prog;
unsigned int (*bpf_func)(const struct sk_buff *skb,
const struct bpf_insn *filter);
union {
struct sock_filter insns[0];
struct bpf_insn insnsi[0];
struct work_struct work;
};
};
so that 'struct bpf_prog' can be used independent of sockets and cleans up
'unattached' bpf use cases
split SK_RUN_FILTER macro into:
SK_RUN_FILTER to be used with 'struct sk_filter *' and
BPF_PROG_RUN to be used with 'struct bpf_prog *'
__sk_filter_release(struct sk_filter *) gains
__bpf_prog_release(struct bpf_prog *) helper function
also perform related renames for the functions that work
with 'struct bpf_prog *', since they're on the same lines:
sk_filter_size -> bpf_prog_size
sk_filter_select_runtime -> bpf_prog_select_runtime
sk_filter_free -> bpf_prog_free
sk_unattached_filter_create -> bpf_prog_create
sk_unattached_filter_destroy -> bpf_prog_destroy
sk_store_orig_filter -> bpf_prog_store_orig_filter
sk_release_orig_filter -> bpf_release_orig_filter
__sk_migrate_filter -> bpf_migrate_filter
__sk_prepare_filter -> bpf_prepare_filter
API for attaching classic BPF to a socket stays the same:
sk_attach_filter(prog, struct sock *)/sk_detach_filter(struct sock *)
and SK_RUN_FILTER(struct sk_filter *, ctx) to execute a program
which is used by sockets, tun, af_packet
API for 'unattached' BPF programs becomes:
bpf_prog_create(struct bpf_prog **)/bpf_prog_destroy(struct bpf_prog *)
and BPF_PROG_RUN(struct bpf_prog *, ctx) to execute a program
which is used by isdn, ppp, team, seccomp, ptp, xt_bpf, cls_bpf, test_bpf
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-31 11:34:16 +08:00
|
|
|
* @fp: bpf_prog populated with internal BPF program
|
2014-07-23 14:01:58 +08:00
|
|
|
*
|
2015-05-30 05:23:07 +08:00
|
|
|
* Try to JIT eBPF program, if JIT is not available, use interpreter.
|
|
|
|
* The BPF program will be executed via BPF_PROG_RUN() macro.
|
2014-07-23 14:01:58 +08:00
|
|
|
*/
|
bpf: allow bpf programs to tail-call other bpf programs
introduce bpf_tail_call(ctx, &jmp_table, index) helper function
which can be used from BPF programs like:
int bpf_prog(struct pt_regs *ctx)
{
...
bpf_tail_call(ctx, &jmp_table, index);
...
}
that is roughly equivalent to:
int bpf_prog(struct pt_regs *ctx)
{
...
if (jmp_table[index])
return (*jmp_table[index])(ctx);
...
}
The important detail that it's not a normal call, but a tail call.
The kernel stack is precious, so this helper reuses the current
stack frame and jumps into another BPF program without adding
extra call frame.
It's trivially done in interpreter and a bit trickier in JITs.
In case of x64 JIT the bigger part of generated assembler prologue
is common for all programs, so it is simply skipped while jumping.
Other JITs can do similar prologue-skipping optimization or
do stack unwind before jumping into the next program.
bpf_tail_call() arguments:
ctx - context pointer
jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table
index - index in the jump table
Since all BPF programs are idenitified by file descriptor, user space
need to populate the jmp_table with FDs of other BPF programs.
If jmp_table[index] is empty the bpf_tail_call() doesn't jump anywhere
and program execution continues as normal.
New BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can
populate this jmp_table array with FDs of other bpf programs.
Programs can share the same jmp_table array or use multiple jmp_tables.
The chain of tail calls can form unpredictable dynamic loops therefore
tail_call_cnt is used to limit the number of calls and currently is set to 32.
Use cases:
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
==========
- simplify complex programs by splitting them into a sequence of small programs
- dispatch routine
For tracing and future seccomp the program may be triggered on all system
calls, but processing of syscall arguments will be different. It's more
efficient to implement them as:
int syscall_entry(struct seccomp_data *ctx)
{
bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */);
... default: process unknown syscall ...
}
int sys_write_event(struct seccomp_data *ctx) {...}
int sys_read_event(struct seccomp_data *ctx) {...}
syscall_jmp_table[__NR_write] = sys_write_event;
syscall_jmp_table[__NR_read] = sys_read_event;
For networking the program may call into different parsers depending on
packet format, like:
int packet_parser(struct __sk_buff *skb)
{
... parse L2, L3 here ...
__u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol));
bpf_tail_call(skb, &ipproto_jmp_table, ipproto);
... default: process unknown protocol ...
}
int parse_tcp(struct __sk_buff *skb) {...}
int parse_udp(struct __sk_buff *skb) {...}
ipproto_jmp_table[IPPROTO_TCP] = parse_tcp;
ipproto_jmp_table[IPPROTO_UDP] = parse_udp;
- for TC use case, bpf_tail_call() allows to implement reclassify-like logic
- bpf_map_update_elem/delete calls into BPF_MAP_TYPE_PROG_ARRAY jump table
are atomic, so user space can build chains of BPF programs on the fly
Implementation details:
=======================
- high performance of bpf_tail_call() is the goal.
It could have been implemented without JIT changes as a wrapper on top of
BPF_PROG_RUN() macro, but with two downsides:
. all programs would have to pay performance penalty for this feature and
tail call itself would be slower, since mandatory stack unwind, return,
stack allocate would be done for every tailcall.
. tailcall would be limited to programs running preempt_disabled, since
generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would
need to be either global per_cpu variable accessed by helper and by wrapper
or global variable protected by locks.
In this implementation x64 JIT bypasses stack unwind and jumps into the
callee program after prologue.
- bpf_prog_array_compatible() ensures that prog_type of callee and caller
are the same and JITed/non-JITed flag is the same, since calling JITed
program from non-JITed is invalid, since stack frames are different.
Similarly calling kprobe type program from socket type program is invalid.
- jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse 'map'
abstraction, its user space API and all of verifier logic.
It's in the existing arraymap.c file, since several functions are
shared with regular array map.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:03 +08:00
|
|
|
int bpf_prog_select_runtime(struct bpf_prog *fp)
|
2014-07-23 14:01:58 +08:00
|
|
|
{
|
net: filter: split 'struct sk_filter' into socket and bpf parts
clean up names related to socket filtering and bpf in the following way:
- everything that deals with sockets keeps 'sk_*' prefix
- everything that is pure BPF is changed to 'bpf_*' prefix
split 'struct sk_filter' into
struct sk_filter {
atomic_t refcnt;
struct rcu_head rcu;
struct bpf_prog *prog;
};
and
struct bpf_prog {
u32 jited:1,
len:31;
struct sock_fprog_kern *orig_prog;
unsigned int (*bpf_func)(const struct sk_buff *skb,
const struct bpf_insn *filter);
union {
struct sock_filter insns[0];
struct bpf_insn insnsi[0];
struct work_struct work;
};
};
so that 'struct bpf_prog' can be used independent of sockets and cleans up
'unattached' bpf use cases
split SK_RUN_FILTER macro into:
SK_RUN_FILTER to be used with 'struct sk_filter *' and
BPF_PROG_RUN to be used with 'struct bpf_prog *'
__sk_filter_release(struct sk_filter *) gains
__bpf_prog_release(struct bpf_prog *) helper function
also perform related renames for the functions that work
with 'struct bpf_prog *', since they're on the same lines:
sk_filter_size -> bpf_prog_size
sk_filter_select_runtime -> bpf_prog_select_runtime
sk_filter_free -> bpf_prog_free
sk_unattached_filter_create -> bpf_prog_create
sk_unattached_filter_destroy -> bpf_prog_destroy
sk_store_orig_filter -> bpf_prog_store_orig_filter
sk_release_orig_filter -> bpf_release_orig_filter
__sk_migrate_filter -> bpf_migrate_filter
__sk_prepare_filter -> bpf_prepare_filter
API for attaching classic BPF to a socket stays the same:
sk_attach_filter(prog, struct sock *)/sk_detach_filter(struct sock *)
and SK_RUN_FILTER(struct sk_filter *, ctx) to execute a program
which is used by sockets, tun, af_packet
API for 'unattached' BPF programs becomes:
bpf_prog_create(struct bpf_prog **)/bpf_prog_destroy(struct bpf_prog *)
and BPF_PROG_RUN(struct bpf_prog *, ctx) to execute a program
which is used by isdn, ppp, team, seccomp, ptp, xt_bpf, cls_bpf, test_bpf
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-31 11:34:16 +08:00
|
|
|
fp->bpf_func = (void *) __bpf_prog_run;
|
2014-07-23 14:01:58 +08:00
|
|
|
|
|
|
|
bpf_int_jit_compile(fp);
|
2014-09-03 04:53:44 +08:00
|
|
|
bpf_prog_lock_ro(fp);
|
bpf: allow bpf programs to tail-call other bpf programs
introduce bpf_tail_call(ctx, &jmp_table, index) helper function
which can be used from BPF programs like:
int bpf_prog(struct pt_regs *ctx)
{
...
bpf_tail_call(ctx, &jmp_table, index);
...
}
that is roughly equivalent to:
int bpf_prog(struct pt_regs *ctx)
{
...
if (jmp_table[index])
return (*jmp_table[index])(ctx);
...
}
The important detail that it's not a normal call, but a tail call.
The kernel stack is precious, so this helper reuses the current
stack frame and jumps into another BPF program without adding
extra call frame.
It's trivially done in interpreter and a bit trickier in JITs.
In case of x64 JIT the bigger part of generated assembler prologue
is common for all programs, so it is simply skipped while jumping.
Other JITs can do similar prologue-skipping optimization or
do stack unwind before jumping into the next program.
bpf_tail_call() arguments:
ctx - context pointer
jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table
index - index in the jump table
Since all BPF programs are idenitified by file descriptor, user space
need to populate the jmp_table with FDs of other BPF programs.
If jmp_table[index] is empty the bpf_tail_call() doesn't jump anywhere
and program execution continues as normal.
New BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can
populate this jmp_table array with FDs of other bpf programs.
Programs can share the same jmp_table array or use multiple jmp_tables.
The chain of tail calls can form unpredictable dynamic loops therefore
tail_call_cnt is used to limit the number of calls and currently is set to 32.
Use cases:
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
==========
- simplify complex programs by splitting them into a sequence of small programs
- dispatch routine
For tracing and future seccomp the program may be triggered on all system
calls, but processing of syscall arguments will be different. It's more
efficient to implement them as:
int syscall_entry(struct seccomp_data *ctx)
{
bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */);
... default: process unknown syscall ...
}
int sys_write_event(struct seccomp_data *ctx) {...}
int sys_read_event(struct seccomp_data *ctx) {...}
syscall_jmp_table[__NR_write] = sys_write_event;
syscall_jmp_table[__NR_read] = sys_read_event;
For networking the program may call into different parsers depending on
packet format, like:
int packet_parser(struct __sk_buff *skb)
{
... parse L2, L3 here ...
__u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol));
bpf_tail_call(skb, &ipproto_jmp_table, ipproto);
... default: process unknown protocol ...
}
int parse_tcp(struct __sk_buff *skb) {...}
int parse_udp(struct __sk_buff *skb) {...}
ipproto_jmp_table[IPPROTO_TCP] = parse_tcp;
ipproto_jmp_table[IPPROTO_UDP] = parse_udp;
- for TC use case, bpf_tail_call() allows to implement reclassify-like logic
- bpf_map_update_elem/delete calls into BPF_MAP_TYPE_PROG_ARRAY jump table
are atomic, so user space can build chains of BPF programs on the fly
Implementation details:
=======================
- high performance of bpf_tail_call() is the goal.
It could have been implemented without JIT changes as a wrapper on top of
BPF_PROG_RUN() macro, but with two downsides:
. all programs would have to pay performance penalty for this feature and
tail call itself would be slower, since mandatory stack unwind, return,
stack allocate would be done for every tailcall.
. tailcall would be limited to programs running preempt_disabled, since
generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would
need to be either global per_cpu variable accessed by helper and by wrapper
or global variable protected by locks.
In this implementation x64 JIT bypasses stack unwind and jumps into the
callee program after prologue.
- bpf_prog_array_compatible() ensures that prog_type of callee and caller
are the same and JITed/non-JITed flag is the same, since calling JITed
program from non-JITed is invalid, since stack frames are different.
Similarly calling kprobe type program from socket type program is invalid.
- jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse 'map'
abstraction, its user space API and all of verifier logic.
It's in the existing arraymap.c file, since several functions are
shared with regular array map.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 07:59:03 +08:00
|
|
|
|
2015-05-30 05:23:07 +08:00
|
|
|
/* The tail call compatibility check can only be done at
|
|
|
|
* this late stage as we need to determine, if we deal
|
|
|
|
* with JITed or non JITed program concatenations and not
|
|
|
|
* all eBPF JITs might immediately support all features.
|
|
|
|
*/
|
|
|
|
return bpf_check_tail_call(fp);
|
2014-07-23 14:01:58 +08:00
|
|
|
}
|
net: filter: split 'struct sk_filter' into socket and bpf parts
clean up names related to socket filtering and bpf in the following way:
- everything that deals with sockets keeps 'sk_*' prefix
- everything that is pure BPF is changed to 'bpf_*' prefix
split 'struct sk_filter' into
struct sk_filter {
atomic_t refcnt;
struct rcu_head rcu;
struct bpf_prog *prog;
};
and
struct bpf_prog {
u32 jited:1,
len:31;
struct sock_fprog_kern *orig_prog;
unsigned int (*bpf_func)(const struct sk_buff *skb,
const struct bpf_insn *filter);
union {
struct sock_filter insns[0];
struct bpf_insn insnsi[0];
struct work_struct work;
};
};
so that 'struct bpf_prog' can be used independent of sockets and cleans up
'unattached' bpf use cases
split SK_RUN_FILTER macro into:
SK_RUN_FILTER to be used with 'struct sk_filter *' and
BPF_PROG_RUN to be used with 'struct bpf_prog *'
__sk_filter_release(struct sk_filter *) gains
__bpf_prog_release(struct bpf_prog *) helper function
also perform related renames for the functions that work
with 'struct bpf_prog *', since they're on the same lines:
sk_filter_size -> bpf_prog_size
sk_filter_select_runtime -> bpf_prog_select_runtime
sk_filter_free -> bpf_prog_free
sk_unattached_filter_create -> bpf_prog_create
sk_unattached_filter_destroy -> bpf_prog_destroy
sk_store_orig_filter -> bpf_prog_store_orig_filter
sk_release_orig_filter -> bpf_release_orig_filter
__sk_migrate_filter -> bpf_migrate_filter
__sk_prepare_filter -> bpf_prepare_filter
API for attaching classic BPF to a socket stays the same:
sk_attach_filter(prog, struct sock *)/sk_detach_filter(struct sock *)
and SK_RUN_FILTER(struct sk_filter *, ctx) to execute a program
which is used by sockets, tun, af_packet
API for 'unattached' BPF programs becomes:
bpf_prog_create(struct bpf_prog **)/bpf_prog_destroy(struct bpf_prog *)
and BPF_PROG_RUN(struct bpf_prog *, ctx) to execute a program
which is used by isdn, ppp, team, seccomp, ptp, xt_bpf, cls_bpf, test_bpf
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-31 11:34:16 +08:00
|
|
|
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
|
2014-07-23 14:01:58 +08:00
|
|
|
|
2014-09-03 04:53:44 +08:00
|
|
|
static void bpf_prog_free_deferred(struct work_struct *work)
|
|
|
|
{
|
2014-09-26 15:17:00 +08:00
|
|
|
struct bpf_prog_aux *aux;
|
2014-09-03 04:53:44 +08:00
|
|
|
|
2014-09-26 15:17:00 +08:00
|
|
|
aux = container_of(work, struct bpf_prog_aux, work);
|
|
|
|
bpf_jit_free(aux->prog);
|
2014-09-03 04:53:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Free internal BPF program */
|
net: filter: split 'struct sk_filter' into socket and bpf parts
clean up names related to socket filtering and bpf in the following way:
- everything that deals with sockets keeps 'sk_*' prefix
- everything that is pure BPF is changed to 'bpf_*' prefix
split 'struct sk_filter' into
struct sk_filter {
atomic_t refcnt;
struct rcu_head rcu;
struct bpf_prog *prog;
};
and
struct bpf_prog {
u32 jited:1,
len:31;
struct sock_fprog_kern *orig_prog;
unsigned int (*bpf_func)(const struct sk_buff *skb,
const struct bpf_insn *filter);
union {
struct sock_filter insns[0];
struct bpf_insn insnsi[0];
struct work_struct work;
};
};
so that 'struct bpf_prog' can be used independent of sockets and cleans up
'unattached' bpf use cases
split SK_RUN_FILTER macro into:
SK_RUN_FILTER to be used with 'struct sk_filter *' and
BPF_PROG_RUN to be used with 'struct bpf_prog *'
__sk_filter_release(struct sk_filter *) gains
__bpf_prog_release(struct bpf_prog *) helper function
also perform related renames for the functions that work
with 'struct bpf_prog *', since they're on the same lines:
sk_filter_size -> bpf_prog_size
sk_filter_select_runtime -> bpf_prog_select_runtime
sk_filter_free -> bpf_prog_free
sk_unattached_filter_create -> bpf_prog_create
sk_unattached_filter_destroy -> bpf_prog_destroy
sk_store_orig_filter -> bpf_prog_store_orig_filter
sk_release_orig_filter -> bpf_release_orig_filter
__sk_migrate_filter -> bpf_migrate_filter
__sk_prepare_filter -> bpf_prepare_filter
API for attaching classic BPF to a socket stays the same:
sk_attach_filter(prog, struct sock *)/sk_detach_filter(struct sock *)
and SK_RUN_FILTER(struct sk_filter *, ctx) to execute a program
which is used by sockets, tun, af_packet
API for 'unattached' BPF programs becomes:
bpf_prog_create(struct bpf_prog **)/bpf_prog_destroy(struct bpf_prog *)
and BPF_PROG_RUN(struct bpf_prog *, ctx) to execute a program
which is used by isdn, ppp, team, seccomp, ptp, xt_bpf, cls_bpf, test_bpf
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-31 11:34:16 +08:00
|
|
|
void bpf_prog_free(struct bpf_prog *fp)
|
2014-07-23 14:01:58 +08:00
|
|
|
{
|
2014-09-26 15:17:00 +08:00
|
|
|
struct bpf_prog_aux *aux = fp->aux;
|
2014-09-03 04:53:44 +08:00
|
|
|
|
2014-09-26 15:17:00 +08:00
|
|
|
INIT_WORK(&aux->work, bpf_prog_free_deferred);
|
|
|
|
aux->prog = fp;
|
|
|
|
schedule_work(&aux->work);
|
2014-07-23 14:01:58 +08:00
|
|
|
}
|
net: filter: split 'struct sk_filter' into socket and bpf parts
clean up names related to socket filtering and bpf in the following way:
- everything that deals with sockets keeps 'sk_*' prefix
- everything that is pure BPF is changed to 'bpf_*' prefix
split 'struct sk_filter' into
struct sk_filter {
atomic_t refcnt;
struct rcu_head rcu;
struct bpf_prog *prog;
};
and
struct bpf_prog {
u32 jited:1,
len:31;
struct sock_fprog_kern *orig_prog;
unsigned int (*bpf_func)(const struct sk_buff *skb,
const struct bpf_insn *filter);
union {
struct sock_filter insns[0];
struct bpf_insn insnsi[0];
struct work_struct work;
};
};
so that 'struct bpf_prog' can be used independent of sockets and cleans up
'unattached' bpf use cases
split SK_RUN_FILTER macro into:
SK_RUN_FILTER to be used with 'struct sk_filter *' and
BPF_PROG_RUN to be used with 'struct bpf_prog *'
__sk_filter_release(struct sk_filter *) gains
__bpf_prog_release(struct bpf_prog *) helper function
also perform related renames for the functions that work
with 'struct bpf_prog *', since they're on the same lines:
sk_filter_size -> bpf_prog_size
sk_filter_select_runtime -> bpf_prog_select_runtime
sk_filter_free -> bpf_prog_free
sk_unattached_filter_create -> bpf_prog_create
sk_unattached_filter_destroy -> bpf_prog_destroy
sk_store_orig_filter -> bpf_prog_store_orig_filter
sk_release_orig_filter -> bpf_release_orig_filter
__sk_migrate_filter -> bpf_migrate_filter
__sk_prepare_filter -> bpf_prepare_filter
API for attaching classic BPF to a socket stays the same:
sk_attach_filter(prog, struct sock *)/sk_detach_filter(struct sock *)
and SK_RUN_FILTER(struct sk_filter *, ctx) to execute a program
which is used by sockets, tun, af_packet
API for 'unattached' BPF programs becomes:
bpf_prog_create(struct bpf_prog **)/bpf_prog_destroy(struct bpf_prog *)
and BPF_PROG_RUN(struct bpf_prog *, ctx) to execute a program
which is used by isdn, ppp, team, seccomp, ptp, xt_bpf, cls_bpf, test_bpf
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-31 11:34:16 +08:00
|
|
|
EXPORT_SYMBOL_GPL(bpf_prog_free);
|
2014-10-24 09:41:08 +08:00
|
|
|
|
2015-03-06 06:27:51 +08:00
|
|
|
/* Weak definitions of helper functions in case we don't have bpf syscall. */
|
|
|
|
const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
|
|
|
|
const struct bpf_func_proto bpf_map_update_elem_proto __weak;
|
|
|
|
const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
|
|
|
|
|
2015-03-14 09:27:16 +08:00
|
|
|
const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
|
2015-03-14 09:27:17 +08:00
|
|
|
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
|
2015-05-30 05:23:06 +08:00
|
|
|
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
|
2015-03-14 09:27:16 +08:00
|
|
|
|
2015-05-30 05:23:07 +08:00
|
|
|
/* Always built-in helper functions. */
|
|
|
|
const struct bpf_func_proto bpf_tail_call_proto = {
|
|
|
|
.func = NULL,
|
|
|
|
.gpl_only = false,
|
|
|
|
.ret_type = RET_VOID,
|
|
|
|
.arg1_type = ARG_PTR_TO_CTX,
|
|
|
|
.arg2_type = ARG_CONST_MAP_PTR,
|
|
|
|
.arg3_type = ARG_ANYTHING,
|
|
|
|
};
|
|
|
|
|
|
|
|
/* For classic BPF JITs that don't implement bpf_int_jit_compile(). */
|
|
|
|
void __weak bpf_int_jit_compile(struct bpf_prog *prog)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2014-10-24 09:41:08 +08:00
|
|
|
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
|
|
|
|
* skb_copy_bits(), so provide a weak definition of it for NET-less config.
|
|
|
|
*/
|
|
|
|
int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
|
|
|
|
int len)
|
|
|
|
{
|
|
|
|
return -EFAULT;
|
|
|
|
}
|