diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 625c29329372..992652856fe8 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -154,8 +154,10 @@ struct ip_tunnel { #define TUNNEL_GENEVE_OPT __cpu_to_be16(0x0800) #define TUNNEL_VXLAN_OPT __cpu_to_be16(0x1000) #define TUNNEL_NOCACHE __cpu_to_be16(0x2000) +#define TUNNEL_ERSPAN_OPT __cpu_to_be16(0x4000) -#define TUNNEL_OPTIONS_PRESENT (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT) +#define TUNNEL_OPTIONS_PRESENT \ + (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT | TUNNEL_ERSPAN_OPT) struct tnl_ptk_info { __be16 flags; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f70674799fdd..0162fb955b33 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -113,6 +113,8 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static struct rtnl_link_ops ipgre_link_ops __read_mostly; static int ipgre_tunnel_init(struct net_device *dev); +static void erspan_build_header(struct sk_buff *skb, + __be32 id, u32 index, bool truncate); static unsigned int ipgre_net_id __read_mostly; static unsigned int gre_tap_net_id __read_mostly; @@ -287,7 +289,33 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, false, false) < 0) goto drop; - tunnel->index = ntohl(index); + if (tunnel->collect_md) { + struct ip_tunnel_info *info; + struct erspan_metadata *md; + __be64 tun_id; + __be16 flags; + + tpi->flags |= TUNNEL_KEY; + flags = tpi->flags; + tun_id = key32_to_tunnel_id(tpi->key); + + tun_dst = ip_tun_rx_dst(skb, flags, + tun_id, sizeof(*md)); + if (!tun_dst) + return PACKET_REJECT; + + md = ip_tunnel_info_opts(&tun_dst->u.tun_info); + if (!md) + return PACKET_REJECT; + + md->index = index; + info = &tun_dst->u.tun_info; + info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + info->options_len = sizeof(*md); + } else { + tunnel->index = ntohl(index); + } + skb_reset_mac_header(skb); ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); return PACKET_RCVD; @@ -432,39 +460,33 @@ static struct rtable *gre_get_rt(struct sk_buff *skb, return ip_route_output_key(net, fl); } -static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, - __be16 proto) +static struct rtable *prepare_fb_xmit(struct sk_buff *skb, + struct net_device *dev, + struct flowi4 *fl, + int tunnel_hlen) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; struct rtable *rt = NULL; - struct flowi4 fl; int min_headroom; - int tunnel_hlen; - __be16 df, flags; bool use_cache; int err; tun_info = skb_tunnel_info(skb); - if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || - ip_tunnel_info_af(tun_info) != AF_INET)) - goto err_free_skb; - key = &tun_info->key; use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); + if (use_cache) - rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr); + rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl->saddr); if (!rt) { - rt = gre_get_rt(skb, dev, &fl, key); + rt = gre_get_rt(skb, dev, fl, key); if (IS_ERR(rt)) - goto err_free_skb; + goto err_free_skb; if (use_cache) dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, - fl.saddr); + fl->saddr); } - tunnel_hlen = gre_calc_hlen(key->tun_flags); - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + tunnel_hlen + sizeof(struct iphdr); if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { @@ -476,6 +498,37 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, if (unlikely(err)) goto err_free_rt; } + return rt; + +err_free_rt: + ip_rt_put(rt); +err_free_skb: + kfree_skb(skb); + dev->stats.tx_dropped++; + return NULL; +} + +static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, + __be16 proto) +{ + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + struct rtable *rt = NULL; + struct flowi4 fl; + int tunnel_hlen; + __be16 df, flags; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET)) + goto err_free_skb; + + key = &tun_info->key; + tunnel_hlen = gre_calc_hlen(key->tun_flags); + + rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen); + if (!rt) + return; /* Push Tunnel header. */ if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM))) @@ -498,6 +551,64 @@ err_free_skb: dev->stats.tx_dropped++; } +static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, + __be16 proto) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + struct erspan_metadata *md; + struct rtable *rt = NULL; + bool truncate = false; + struct flowi4 fl; + int tunnel_hlen; + __be16 df; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET)) + goto err_free_skb; + + key = &tun_info->key; + + /* ERSPAN has fixed 8 byte GRE header */ + tunnel_hlen = 8 + sizeof(struct erspanhdr); + + rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen); + if (!rt) + return; + + if (gre_handle_offloads(skb, false)) + goto err_free_rt; + + if (skb->len > dev->mtu) { + pskb_trim(skb, dev->mtu); + truncate = true; + } + + md = ip_tunnel_info_opts(tun_info); + if (!md) + goto err_free_rt; + + erspan_build_header(skb, tunnel_id_to_key32(key->tun_id), + ntohl(md->index), truncate); + + gre_build_header(skb, 8, TUNNEL_SEQ, + htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++)); + + df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; + + iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE, + key->tos, key->ttl, df, false); + return; + +err_free_rt: + ip_rt_put(rt); +err_free_skb: + kfree_skb(skb); + dev->stats.tx_dropped++; +} + static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct ip_tunnel_info *info = skb_tunnel_info(skb); @@ -611,6 +722,11 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, struct ip_tunnel *tunnel = netdev_priv(dev); bool truncate = false; + if (tunnel->collect_md) { + erspan_fb_xmit(skb, dev, skb->protocol); + return NETDEV_TX_OK; + } + if (gre_handle_offloads(skb, false)) goto free_skb; @@ -973,9 +1089,12 @@ static int erspan_validate(struct nlattr *tb[], struct nlattr *data[], return ret; /* ERSPAN should only have GRE sequence and key flag */ - flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); - flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); - if (flags != (GRE_SEQ | GRE_KEY)) + if (data[IFLA_GRE_OFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); + if (data[IFLA_GRE_IFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); + if (!data[IFLA_GRE_COLLECT_METADATA] && + flags != (GRE_SEQ | GRE_KEY)) return -EINVAL; /* ERSPAN Session ID only has 10-bit. Since we reuse diff --git a/samples/bpf/tcbpf2_kern.c b/samples/bpf/tcbpf2_kern.c index 270edcc149a1..370b749f5ee6 100644 --- a/samples/bpf/tcbpf2_kern.c +++ b/samples/bpf/tcbpf2_kern.c @@ -17,6 +17,7 @@ #include #include #include "bpf_helpers.h" +#include "bpf_endian.h" #define _htonl __builtin_bswap32 #define ERROR(ret) do {\ @@ -38,6 +39,10 @@ struct vxlan_metadata { u32 gbp; }; +struct erspan_metadata { + __be32 index; +}; + SEC("gre_set_tunnel") int _gre_set_tunnel(struct __sk_buff *skb) { @@ -76,6 +81,63 @@ int _gre_get_tunnel(struct __sk_buff *skb) return TC_ACT_OK; } +SEC("erspan_set_tunnel") +int _erspan_set_tunnel(struct __sk_buff *skb) +{ + struct bpf_tunnel_key key; + struct erspan_metadata md; + int ret; + + __builtin_memset(&key, 0x0, sizeof(key)); + key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */ + key.tunnel_id = 2; + key.tunnel_tos = 0; + key.tunnel_ttl = 64; + + ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + md.index = htonl(123); + ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md)); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + return TC_ACT_OK; +} + +SEC("erspan_get_tunnel") +int _erspan_get_tunnel(struct __sk_buff *skb) +{ + char fmt[] = "key %d remote ip 0x%x erspan index 0x%x\n"; + struct bpf_tunnel_key key; + struct erspan_metadata md; + u32 index; + int ret; + + ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + ret = bpf_skb_get_tunnel_opt(skb, &md, sizeof(md)); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + index = bpf_ntohl(md.index); + bpf_trace_printk(fmt, sizeof(fmt), + key.tunnel_id, key.remote_ipv4, index); + + return TC_ACT_OK; +} + SEC("vxlan_set_tunnel") int _vxlan_set_tunnel(struct __sk_buff *skb) { @@ -378,5 +440,4 @@ int _ip6ip6_get_tunnel(struct __sk_buff *skb) return TC_ACT_OK; } - char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/test_tunnel_bpf.sh b/samples/bpf/test_tunnel_bpf.sh index a70d2ea90313..410052d9fc37 100755 --- a/samples/bpf/test_tunnel_bpf.sh +++ b/samples/bpf/test_tunnel_bpf.sh @@ -32,6 +32,19 @@ function add_gre_tunnel { ip addr add dev $DEV 10.1.1.200/24 } +function add_erspan_tunnel { + # in namespace + ip netns exec at_ns0 \ + ip link add dev $DEV_NS type $TYPE seq key 2 local 172.16.1.100 remote 172.16.1.200 erspan 123 + ip netns exec at_ns0 ip link set dev $DEV_NS up + ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 + + # out of namespace + ip link add dev $DEV type $TYPE external + ip link set dev $DEV up + ip addr add dev $DEV 10.1.1.200/24 +} + function add_vxlan_tunnel { # Set static ARP entry here because iptables set-mark works # on L3 packet, as a result not applying to ARP packets, @@ -99,6 +112,18 @@ function test_gre { cleanup } +function test_erspan { + TYPE=erspan + DEV_NS=erspan00 + DEV=erspan11 + config_device + add_erspan_tunnel + attach_bpf $DEV erspan_set_tunnel erspan_get_tunnel + ping -c 1 10.1.1.100 + ip netns exec at_ns0 ping -c 1 10.1.1.200 + cleanup +} + function test_vxlan { TYPE=vxlan DEV_NS=vxlan00 @@ -151,14 +176,18 @@ function cleanup { ip link del gretap11 ip link del vxlan11 ip link del geneve11 + ip link del erspan11 pkill tcpdump pkill cat set -ex } +trap cleanup 0 2 3 6 9 cleanup echo "Testing GRE tunnel..." test_gre +echo "Testing ERSPAN tunnel..." +test_erspan echo "Testing VXLAN tunnel..." test_vxlan echo "Testing GENEVE tunnel..."