Merge branch 'bpf_avoid_clone'

Alexei Starovoitov says:

====================
bpf: performance improvements

v1->v2: dropped redundant iff_up check in patch 2

At plumbers we discussed different options on how to get rid of skb_clone
from bpf_clone_redirect(), the patch 2 implements the best option.
Patch 1 adds 'integrated exts' to cls_bpf to improve performance by
combining simple actions into bpf classifier.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2015-09-17 21:09:07 -07:00
commit 41a9802fd8
9 changed files with 159 additions and 13 deletions

View File

@ -251,7 +251,7 @@ struct tcf_proto {
struct qdisc_skb_cb { struct qdisc_skb_cb {
unsigned int pkt_len; unsigned int pkt_len;
u16 slave_dev_queue_mapping; u16 slave_dev_queue_mapping;
u16 _pad; u16 tc_classid;
#define QDISC_CB_PRIV_LEN 20 #define QDISC_CB_PRIV_LEN 20
unsigned char data[QDISC_CB_PRIV_LEN]; unsigned char data[QDISC_CB_PRIV_LEN];
}; };
@ -402,6 +402,7 @@ void __qdisc_calculate_pkt_len(struct sk_buff *skb,
const struct qdisc_size_table *stab); const struct qdisc_size_table *stab);
bool tcf_destroy(struct tcf_proto *tp, bool force); bool tcf_destroy(struct tcf_proto *tp, bool force);
void tcf_destroy_chain(struct tcf_proto __rcu **fl); void tcf_destroy_chain(struct tcf_proto __rcu **fl);
int skb_do_redirect(struct sk_buff *);
/* Reset all TX qdiscs greater then index of a device. */ /* Reset all TX qdiscs greater then index of a device. */
static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i) static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i)

View File

@ -272,6 +272,14 @@ enum bpf_func_id {
BPF_FUNC_skb_get_tunnel_key, BPF_FUNC_skb_get_tunnel_key,
BPF_FUNC_skb_set_tunnel_key, BPF_FUNC_skb_set_tunnel_key,
BPF_FUNC_perf_event_read, /* u64 bpf_perf_event_read(&map, index) */ BPF_FUNC_perf_event_read, /* u64 bpf_perf_event_read(&map, index) */
/**
* bpf_redirect(ifindex, flags) - redirect to another netdev
* @ifindex: ifindex of the net device
* @flags: bit 0 - if set, redirect to ingress instead of egress
* other bits - reserved
* Return: TC_ACT_REDIRECT
*/
BPF_FUNC_redirect,
__BPF_FUNC_MAX_ID, __BPF_FUNC_MAX_ID,
}; };
@ -293,6 +301,7 @@ struct __sk_buff {
__u32 tc_index; __u32 tc_index;
__u32 cb[5]; __u32 cb[5];
__u32 hash; __u32 hash;
__u32 tc_classid;
}; };
struct bpf_tunnel_key { struct bpf_tunnel_key {

View File

@ -87,6 +87,7 @@ enum {
#define TC_ACT_STOLEN 4 #define TC_ACT_STOLEN 4
#define TC_ACT_QUEUED 5 #define TC_ACT_QUEUED 5
#define TC_ACT_REPEAT 6 #define TC_ACT_REPEAT 6
#define TC_ACT_REDIRECT 7
#define TC_ACT_JUMP 0x10000000 #define TC_ACT_JUMP 0x10000000
/* Action type identifiers*/ /* Action type identifiers*/
@ -373,6 +374,8 @@ enum {
/* BPF classifier */ /* BPF classifier */
#define TCA_BPF_FLAG_ACT_DIRECT (1 << 0)
enum { enum {
TCA_BPF_UNSPEC, TCA_BPF_UNSPEC,
TCA_BPF_ACT, TCA_BPF_ACT,
@ -382,6 +385,7 @@ enum {
TCA_BPF_OPS, TCA_BPF_OPS,
TCA_BPF_FD, TCA_BPF_FD,
TCA_BPF_NAME, TCA_BPF_NAME,
TCA_BPF_FLAGS,
__TCA_BPF_MAX, __TCA_BPF_MAX,
}; };

View File

@ -3670,6 +3670,14 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
case TC_ACT_QUEUED: case TC_ACT_QUEUED:
kfree_skb(skb); kfree_skb(skb);
return NULL; return NULL;
case TC_ACT_REDIRECT:
/* skb_mac_header check was done by cls/act_bpf, so
* we can safely push the L2 header back before
* redirecting to another netdev
*/
__skb_push(skb, skb->mac_len);
skb_do_redirect(skb);
return NULL;
default: default:
break; break;
} }

View File

@ -1427,6 +1427,48 @@ const struct bpf_func_proto bpf_clone_redirect_proto = {
.arg3_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING,
}; };
/* Per-cpu scratch area used to hand the redirect target from the
 * bpf_redirect() helper (runs inside the BPF program) to
 * skb_do_redirect() (runs afterwards in the caller's context).
 */
struct redirect_info {
	u32 ifindex;	/* target netdev; cleared to 0 once consumed */
	u32 flags;	/* bit 0: redirect to ingress instead of egress */
};

static DEFINE_PER_CPU(struct redirect_info, redirect_info);

/* eBPF helper BPF_FUNC_redirect: record the redirect target and
 * return TC_ACT_REDIRECT so the caller performs the actual redirect
 * later — no skb clone is taken, unlike bpf_clone_redirect().
 */
static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5)
{
	struct redirect_info *ri = this_cpu_ptr(&redirect_info);

	ri->ifindex = ifindex;
	ri->flags = flags;
	return TC_ACT_REDIRECT;
}
/* Perform the redirect recorded by a preceding bpf_redirect() call on
 * this cpu.  Consumes the skb on every path.
 *
 * Returns the dev_forward_skb()/dev_queue_xmit() result, or -EINVAL
 * when the saved ifindex no longer resolves to a device.
 *
 * NOTE(review): dev_get_by_index_rcu() relies on the caller holding
 * rcu_read_lock() — confirm against the call site (handle_ing).
 */
int skb_do_redirect(struct sk_buff *skb)
{
	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
	struct net_device *dev;

	dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex);
	ri->ifindex = 0;	/* one-shot: clear the pending redirect */
	if (unlikely(!dev)) {
		kfree_skb(skb);
		return -EINVAL;
	}

	if (BPF_IS_REDIRECT_INGRESS(ri->flags))
		return dev_forward_skb(dev, skb);

	skb->dev = dev;
	return dev_queue_xmit(skb);
}
/* Verifier-visible prototype for BPF_FUNC_redirect: two scalar
 * arguments (ifindex, flags), integer return (TC_ACT_REDIRECT).
 */
const struct bpf_func_proto bpf_redirect_proto = {
	.func		= bpf_redirect,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_ANYTHING,
	.arg2_type	= ARG_ANYTHING,
};
static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{ {
return task_get_classid((struct sk_buff *) (unsigned long) r1); return task_get_classid((struct sk_buff *) (unsigned long) r1);
@ -1607,6 +1649,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_skb_get_tunnel_key_proto; return &bpf_skb_get_tunnel_key_proto;
case BPF_FUNC_skb_set_tunnel_key: case BPF_FUNC_skb_set_tunnel_key:
return bpf_get_skb_set_tunnel_key_proto(); return bpf_get_skb_set_tunnel_key_proto();
case BPF_FUNC_redirect:
return &bpf_redirect_proto;
default: default:
return sk_filter_func_proto(func_id); return sk_filter_func_proto(func_id);
} }
@ -1632,6 +1676,9 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type)
static bool sk_filter_is_valid_access(int off, int size, static bool sk_filter_is_valid_access(int off, int size,
enum bpf_access_type type) enum bpf_access_type type)
{ {
if (off == offsetof(struct __sk_buff, tc_classid))
return false;
if (type == BPF_WRITE) { if (type == BPF_WRITE) {
switch (off) { switch (off) {
case offsetof(struct __sk_buff, cb[0]) ... case offsetof(struct __sk_buff, cb[0]) ...
@ -1648,6 +1695,9 @@ static bool sk_filter_is_valid_access(int off, int size,
static bool tc_cls_act_is_valid_access(int off, int size, static bool tc_cls_act_is_valid_access(int off, int size,
enum bpf_access_type type) enum bpf_access_type type)
{ {
if (off == offsetof(struct __sk_buff, tc_classid))
return type == BPF_WRITE ? true : false;
if (type == BPF_WRITE) { if (type == BPF_WRITE) {
switch (off) { switch (off) {
case offsetof(struct __sk_buff, mark): case offsetof(struct __sk_buff, mark):
@ -1760,6 +1810,14 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off); *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
break; break;
case offsetof(struct __sk_buff, tc_classid):
ctx_off -= offsetof(struct __sk_buff, tc_classid);
ctx_off += offsetof(struct sk_buff, cb);
ctx_off += offsetof(struct qdisc_skb_cb, tc_classid);
WARN_ON(type != BPF_WRITE);
*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
break;
case offsetof(struct __sk_buff, tc_index): case offsetof(struct __sk_buff, tc_index):
#ifdef CONFIG_NET_SCHED #ifdef CONFIG_NET_SCHED
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2); BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);

View File

@ -72,6 +72,7 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
case TC_ACT_PIPE: case TC_ACT_PIPE:
case TC_ACT_RECLASSIFY: case TC_ACT_RECLASSIFY:
case TC_ACT_OK: case TC_ACT_OK:
case TC_ACT_REDIRECT:
action = filter_res; action = filter_res;
break; break;
case TC_ACT_SHOT: case TC_ACT_SHOT:

View File

@ -38,6 +38,7 @@ struct cls_bpf_prog {
struct bpf_prog *filter; struct bpf_prog *filter;
struct list_head link; struct list_head link;
struct tcf_result res; struct tcf_result res;
bool exts_integrated;
struct tcf_exts exts; struct tcf_exts exts;
u32 handle; u32 handle;
union { union {
@ -52,6 +53,7 @@ struct cls_bpf_prog {
static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
[TCA_BPF_CLASSID] = { .type = NLA_U32 }, [TCA_BPF_CLASSID] = { .type = NLA_U32 },
[TCA_BPF_FLAGS] = { .type = NLA_U32 },
[TCA_BPF_FD] = { .type = NLA_U32 }, [TCA_BPF_FD] = { .type = NLA_U32 },
[TCA_BPF_NAME] = { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN }, [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN },
[TCA_BPF_OPS_LEN] = { .type = NLA_U16 }, [TCA_BPF_OPS_LEN] = { .type = NLA_U16 },
@ -59,6 +61,23 @@ static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
.len = sizeof(struct sock_filter) * BPF_MAXINSNS }, .len = sizeof(struct sock_filter) * BPF_MAXINSNS },
}; };
static int cls_bpf_exec_opcode(int code)
{
switch (code) {
case TC_ACT_OK:
case TC_ACT_RECLASSIFY:
case TC_ACT_SHOT:
case TC_ACT_PIPE:
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
case TC_ACT_REDIRECT:
case TC_ACT_UNSPEC:
return code;
default:
return TC_ACT_UNSPEC;
}
}
static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res) struct tcf_result *res)
{ {
@ -79,6 +98,8 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
list_for_each_entry_rcu(prog, &head->plist, link) { list_for_each_entry_rcu(prog, &head->plist, link) {
int filter_res; int filter_res;
qdisc_skb_cb(skb)->tc_classid = prog->res.classid;
if (at_ingress) { if (at_ingress) {
/* It is safe to push/pull even if skb_shared() */ /* It is safe to push/pull even if skb_shared() */
__skb_push(skb, skb->mac_len); __skb_push(skb, skb->mac_len);
@ -88,6 +109,16 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
filter_res = BPF_PROG_RUN(prog->filter, skb); filter_res = BPF_PROG_RUN(prog->filter, skb);
} }
if (prog->exts_integrated) {
res->class = prog->res.class;
res->classid = qdisc_skb_cb(skb)->tc_classid;
ret = cls_bpf_exec_opcode(filter_res);
if (ret == TC_ACT_UNSPEC)
continue;
break;
}
if (filter_res == 0) if (filter_res == 0)
continue; continue;
@ -195,8 +226,7 @@ static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle)
return ret; return ret;
} }
static int cls_bpf_prog_from_ops(struct nlattr **tb, static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog)
struct cls_bpf_prog *prog, u32 classid)
{ {
struct sock_filter *bpf_ops; struct sock_filter *bpf_ops;
struct sock_fprog_kern fprog_tmp; struct sock_fprog_kern fprog_tmp;
@ -230,15 +260,13 @@ static int cls_bpf_prog_from_ops(struct nlattr **tb,
prog->bpf_ops = bpf_ops; prog->bpf_ops = bpf_ops;
prog->bpf_num_ops = bpf_num_ops; prog->bpf_num_ops = bpf_num_ops;
prog->bpf_name = NULL; prog->bpf_name = NULL;
prog->filter = fp; prog->filter = fp;
prog->res.classid = classid;
return 0; return 0;
} }
static int cls_bpf_prog_from_efd(struct nlattr **tb, static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
struct cls_bpf_prog *prog, u32 classid) const struct tcf_proto *tp)
{ {
struct bpf_prog *fp; struct bpf_prog *fp;
char *name = NULL; char *name = NULL;
@ -268,9 +296,7 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb,
prog->bpf_ops = NULL; prog->bpf_ops = NULL;
prog->bpf_fd = bpf_fd; prog->bpf_fd = bpf_fd;
prog->bpf_name = name; prog->bpf_name = name;
prog->filter = fp; prog->filter = fp;
prog->res.classid = classid;
return 0; return 0;
} }
@ -280,8 +306,8 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
unsigned long base, struct nlattr **tb, unsigned long base, struct nlattr **tb,
struct nlattr *est, bool ovr) struct nlattr *est, bool ovr)
{ {
bool is_bpf, is_ebpf, have_exts = false;
struct tcf_exts exts; struct tcf_exts exts;
bool is_bpf, is_ebpf;
u32 classid; u32 classid;
int ret; int ret;
@ -298,9 +324,22 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
return ret; return ret;
classid = nla_get_u32(tb[TCA_BPF_CLASSID]); classid = nla_get_u32(tb[TCA_BPF_CLASSID]);
if (tb[TCA_BPF_FLAGS]) {
u32 bpf_flags = nla_get_u32(tb[TCA_BPF_FLAGS]);
ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog, classid) : if (bpf_flags & ~TCA_BPF_FLAG_ACT_DIRECT) {
cls_bpf_prog_from_efd(tb, prog, classid); tcf_exts_destroy(&exts);
return -EINVAL;
}
have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT;
}
prog->res.classid = classid;
prog->exts_integrated = have_exts;
ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) :
cls_bpf_prog_from_efd(tb, prog, tp);
if (ret < 0) { if (ret < 0) {
tcf_exts_destroy(&exts); tcf_exts_destroy(&exts);
return ret; return ret;

View File

@ -33,6 +33,10 @@ static int (*bpf_get_current_comm)(void *buf, int buf_size) =
(void *) BPF_FUNC_get_current_comm; (void *) BPF_FUNC_get_current_comm;
static int (*bpf_perf_event_read)(void *map, int index) = static int (*bpf_perf_event_read)(void *map, int index) =
(void *) BPF_FUNC_perf_event_read; (void *) BPF_FUNC_perf_event_read;
/* Sample-program helper stubs: the function-pointer value is the
 * BPF_FUNC_* id, which the BPF machinery resolves to the in-kernel
 * helper when the program is loaded (same pattern as the stubs above).
 */
static int (*bpf_clone_redirect)(void *ctx, int ifindex, int flags) =
	(void *) BPF_FUNC_clone_redirect;
static int (*bpf_redirect)(int ifindex, int flags) =
	(void *) BPF_FUNC_redirect;
/* llvm builtin functions that eBPF C program may use to /* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions * emit BPF_LD_ABS and BPF_LD_IND instructions

View File

@ -5,7 +5,7 @@
#include <uapi/linux/in.h> #include <uapi/linux/in.h>
#include <uapi/linux/tcp.h> #include <uapi/linux/tcp.h>
#include <uapi/linux/filter.h> #include <uapi/linux/filter.h>
#include <uapi/linux/pkt_cls.h>
#include "bpf_helpers.h" #include "bpf_helpers.h"
/* compiler workaround */ /* compiler workaround */
@ -64,4 +64,26 @@ int bpf_prog1(struct __sk_buff *skb)
return 0; return 0;
} }
/* Redirect the packet to the egress path of ifindex + 1
 * (flags bit 0 clear => egress); no skb clone is taken. */
SEC("redirect_xmit")
int _redirect_xmit(struct __sk_buff *skb)
{
	return bpf_redirect(skb->ifindex + 1, 0);
}
/* Redirect the packet to the ingress path of ifindex + 1
 * (flags bit 0 set => ingress); no skb clone is taken. */
SEC("redirect_recv")
int _redirect_recv(struct __sk_buff *skb)
{
	return bpf_redirect(skb->ifindex + 1, 1);
}
/* Clone the skb, send the clone out the egress of ifindex + 1,
 * then drop the original (TC_ACT_SHOT). */
SEC("clone_redirect_xmit")
int _clone_redirect_xmit(struct __sk_buff *skb)
{
	bpf_clone_redirect(skb, skb->ifindex + 1, 0);
	return TC_ACT_SHOT;
}
/* Clone the skb, inject the clone into the ingress of ifindex + 1,
 * then drop the original (TC_ACT_SHOT). */
SEC("clone_redirect_recv")
int _clone_redirect_recv(struct __sk_buff *skb)
{
	bpf_clone_redirect(skb, skb->ifindex + 1, 1);
	return TC_ACT_SHOT;
}
char _license[] SEC("license") = "GPL"; char _license[] SEC("license") = "GPL";