add DOVE extensions for VXLAN
This patch provides extensions to VXLAN for supporting Distributed Overlay Virtual Ethernet (DOVE) networks. The patch includes:
+ a dove flag per VXLAN device to enable DOVE extensions
+ ARP reduction, whereby a bridge-connected VXLAN tunnel endpoint answers ARP requests from the local bridge on behalf of remote DOVE clients
+ route short-circuiting (aka L3 switching). Known destination IP addresses use the corresponding destination MAC address for switching rather than going to a (possibly remote) router first.
+ netlink notification messages for forwarding table and L3 switching misses

Changes since v2
- combined bools into "u32 flags"
- replaced loop with !is_zero_ether_addr()

Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
ff33c0e188
commit
e4f67addf1
|
@ -29,6 +29,8 @@
|
|||
#include <linux/etherdevice.h>
|
||||
#include <linux/if_ether.h>
|
||||
#include <linux/hash.h>
|
||||
#include <net/arp.h>
|
||||
#include <net/ndisc.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/icmp.h>
|
||||
#include <net/udp.h>
|
||||
|
@ -110,7 +112,7 @@ struct vxlan_dev {
|
|||
__u16 port_max;
|
||||
__u8 tos; /* TOS override */
|
||||
__u8 ttl;
|
||||
bool learn;
|
||||
u32 flags; /* VXLAN_F_* below */
|
||||
|
||||
unsigned long age_interval;
|
||||
struct timer_list age_timer;
|
||||
|
@ -121,6 +123,12 @@ struct vxlan_dev {
|
|||
struct hlist_head fdb_head[FDB_HASH_SIZE];
|
||||
};
|
||||
|
||||
#define VXLAN_F_LEARN 0x01
|
||||
#define VXLAN_F_PROXY 0x02
|
||||
#define VXLAN_F_RSC 0x04
|
||||
#define VXLAN_F_L2MISS 0x08
|
||||
#define VXLAN_F_L3MISS 0x10
|
||||
|
||||
/* salt for hash table */
|
||||
static u32 vxlan_salt __read_mostly;
|
||||
|
||||
|
@ -154,6 +162,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
|
|||
struct nda_cacheinfo ci;
|
||||
struct nlmsghdr *nlh;
|
||||
struct ndmsg *ndm;
|
||||
bool send_ip, send_eth;
|
||||
|
||||
nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
|
||||
if (nlh == NULL)
|
||||
|
@ -161,16 +170,24 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
|
|||
|
||||
ndm = nlmsg_data(nlh);
|
||||
memset(ndm, 0, sizeof(*ndm));
|
||||
ndm->ndm_family = AF_BRIDGE;
|
||||
|
||||
send_eth = send_ip = true;
|
||||
|
||||
if (type == RTM_GETNEIGH) {
|
||||
ndm->ndm_family = AF_INET;
|
||||
send_ip = fdb->remote_ip != 0;
|
||||
send_eth = !is_zero_ether_addr(fdb->eth_addr);
|
||||
} else
|
||||
ndm->ndm_family = AF_BRIDGE;
|
||||
ndm->ndm_state = fdb->state;
|
||||
ndm->ndm_ifindex = vxlan->dev->ifindex;
|
||||
ndm->ndm_flags = NTF_SELF;
|
||||
ndm->ndm_type = NDA_DST;
|
||||
|
||||
if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
|
||||
if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
|
||||
goto nla_put_failure;
|
||||
|
||||
if (nla_put_be32(skb, NDA_DST, fdb->remote_ip))
|
||||
if (send_ip && nla_put_be32(skb, NDA_DST, fdb->remote_ip))
|
||||
goto nla_put_failure;
|
||||
|
||||
ci.ndm_used = jiffies_to_clock_t(now - fdb->used);
|
||||
|
@ -222,6 +239,29 @@ errout:
|
|||
rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
|
||||
}
|
||||
|
||||
/*
 * Report an L3 miss: build a zeroed, on-stack fdb entry that carries only
 * the unresolved IPv4 address @ipa and hand it to vxlan_fdb_notify() as an
 * RTM_GETNEIGH message for @dev's VXLAN instance.
 */
static void vxlan_ip_miss(struct net_device *dev, __be32 ipa)
{
	struct vxlan_fdb miss;

	/* Everything except state and remote_ip stays zero (no MAC address). */
	memset(&miss, 0, sizeof(miss));
	miss.remote_ip = ipa;	/* goes to NDA_DST */
	miss.state = NUD_STALE;

	vxlan_fdb_notify(netdev_priv(dev), &miss, RTM_GETNEIGH);
}
|
||||
|
||||
static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN])
|
||||
{
|
||||
struct vxlan_fdb f;
|
||||
|
||||
memset(&f, 0, sizeof f);
|
||||
f.state = NUD_STALE;
|
||||
memcpy(f.eth_addr, eth_addr, ETH_ALEN);
|
||||
|
||||
vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH);
|
||||
}
|
||||
|
||||
/* Hash Ethernet address */
|
||||
static u32 eth_hash(const unsigned char *addr)
|
||||
{
|
||||
|
@ -551,6 +591,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
|
|||
goto drop;
|
||||
}
|
||||
|
||||
skb_reset_mac_header(skb);
|
||||
|
||||
/* Re-examine inner Ethernet packet */
|
||||
oip = ip_hdr(skb);
|
||||
skb->protocol = eth_type_trans(skb, vxlan->dev);
|
||||
|
@ -560,7 +602,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
|
|||
vxlan->dev->dev_addr) == 0)
|
||||
goto drop;
|
||||
|
||||
if (vxlan->learn)
|
||||
if (vxlan->flags & VXLAN_F_LEARN)
|
||||
vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source);
|
||||
|
||||
__skb_tunnel_rx(skb, vxlan->dev);
|
||||
|
@ -599,6 +641,117 @@ drop:
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int arp_reduce(struct net_device *dev, struct sk_buff *skb)
|
||||
{
|
||||
struct vxlan_dev *vxlan = netdev_priv(dev);
|
||||
struct arphdr *parp;
|
||||
u8 *arpptr, *sha;
|
||||
__be32 sip, tip;
|
||||
struct neighbour *n;
|
||||
|
||||
if (dev->flags & IFF_NOARP)
|
||||
goto out;
|
||||
|
||||
if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
|
||||
dev->stats.tx_dropped++;
|
||||
goto out;
|
||||
}
|
||||
parp = arp_hdr(skb);
|
||||
|
||||
if ((parp->ar_hrd != htons(ARPHRD_ETHER) &&
|
||||
parp->ar_hrd != htons(ARPHRD_IEEE802)) ||
|
||||
parp->ar_pro != htons(ETH_P_IP) ||
|
||||
parp->ar_op != htons(ARPOP_REQUEST) ||
|
||||
parp->ar_hln != dev->addr_len ||
|
||||
parp->ar_pln != 4)
|
||||
goto out;
|
||||
arpptr = (u8 *)parp + sizeof(struct arphdr);
|
||||
sha = arpptr;
|
||||
arpptr += dev->addr_len; /* sha */
|
||||
memcpy(&sip, arpptr, sizeof(sip));
|
||||
arpptr += sizeof(sip);
|
||||
arpptr += dev->addr_len; /* tha */
|
||||
memcpy(&tip, arpptr, sizeof(tip));
|
||||
|
||||
if (ipv4_is_loopback(tip) ||
|
||||
ipv4_is_multicast(tip))
|
||||
goto out;
|
||||
|
||||
n = neigh_lookup(&arp_tbl, &tip, dev);
|
||||
|
||||
if (n) {
|
||||
struct vxlan_dev *vxlan = netdev_priv(dev);
|
||||
struct vxlan_fdb *f;
|
||||
struct sk_buff *reply;
|
||||
|
||||
if (!(n->nud_state & NUD_CONNECTED)) {
|
||||
neigh_release(n);
|
||||
goto out;
|
||||
}
|
||||
|
||||
f = vxlan_find_mac(vxlan, n->ha);
|
||||
if (f && f->remote_ip == 0) {
|
||||
/* bridge-local neighbor */
|
||||
neigh_release(n);
|
||||
goto out;
|
||||
}
|
||||
|
||||
reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
|
||||
n->ha, sha);
|
||||
|
||||
neigh_release(n);
|
||||
|
||||
skb_reset_mac_header(reply);
|
||||
__skb_pull(reply, skb_network_offset(reply));
|
||||
reply->ip_summed = CHECKSUM_UNNECESSARY;
|
||||
reply->pkt_type = PACKET_HOST;
|
||||
|
||||
if (netif_rx_ni(reply) == NET_RX_DROP)
|
||||
dev->stats.rx_dropped++;
|
||||
} else if (vxlan->flags & VXLAN_F_L3MISS)
|
||||
vxlan_ip_miss(dev, tip);
|
||||
out:
|
||||
consume_skb(skb);
|
||||
return NETDEV_TX_OK;
|
||||
}
|
||||
|
||||
static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
|
||||
{
|
||||
struct vxlan_dev *vxlan = netdev_priv(dev);
|
||||
struct neighbour *n;
|
||||
struct iphdr *pip;
|
||||
|
||||
if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
|
||||
return false;
|
||||
|
||||
n = NULL;
|
||||
switch (ntohs(eth_hdr(skb)->h_proto)) {
|
||||
case ETH_P_IP:
|
||||
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
|
||||
return false;
|
||||
pip = ip_hdr(skb);
|
||||
n = neigh_lookup(&arp_tbl, &pip->daddr, dev);
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
if (n) {
|
||||
bool diff;
|
||||
|
||||
diff = compare_ether_addr(eth_hdr(skb)->h_dest, n->ha) != 0;
|
||||
if (diff) {
|
||||
memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
|
||||
dev->addr_len);
|
||||
memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len);
|
||||
}
|
||||
neigh_release(n);
|
||||
return diff;
|
||||
} else if (vxlan->flags & VXLAN_F_L3MISS)
|
||||
vxlan_ip_miss(dev, pip->daddr);
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Extract dsfield from inner protocol */
|
||||
static inline u8 vxlan_get_dsfield(const struct iphdr *iph,
|
||||
const struct sk_buff *skb)
|
||||
|
@ -621,22 +774,6 @@ static inline u8 vxlan_ecn_encap(u8 tos,
|
|||
return INET_ECN_encapsulate(tos, inner);
|
||||
}
|
||||
|
||||
static __be32 vxlan_find_dst(struct vxlan_dev *vxlan, struct sk_buff *skb)
|
||||
{
|
||||
const struct ethhdr *eth = (struct ethhdr *) skb->data;
|
||||
const struct vxlan_fdb *f;
|
||||
|
||||
if (is_multicast_ether_addr(eth->h_dest))
|
||||
return vxlan->gaddr;
|
||||
|
||||
f = vxlan_find_mac(vxlan, eth->h_dest);
|
||||
if (f)
|
||||
return f->remote_ip;
|
||||
else
|
||||
return vxlan->gaddr;
|
||||
|
||||
}
|
||||
|
||||
static void vxlan_sock_free(struct sk_buff *skb)
|
||||
{
|
||||
sock_put(skb->sk);
|
||||
|
@ -683,6 +820,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
|
|||
struct vxlan_dev *vxlan = netdev_priv(dev);
|
||||
struct rtable *rt;
|
||||
const struct iphdr *old_iph;
|
||||
struct ethhdr *eth;
|
||||
struct iphdr *iph;
|
||||
struct vxlanhdr *vxh;
|
||||
struct udphdr *uh;
|
||||
|
@ -693,10 +831,50 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
|
|||
__be16 df = 0;
|
||||
__u8 tos, ttl;
|
||||
int err;
|
||||
bool did_rsc = false;
|
||||
const struct vxlan_fdb *f;
|
||||
|
||||
dst = vxlan_find_dst(vxlan, skb);
|
||||
if (!dst)
|
||||
skb_reset_mac_header(skb);
|
||||
eth = eth_hdr(skb);
|
||||
|
||||
if ((vxlan->flags & VXLAN_F_PROXY) && ntohs(eth->h_proto) == ETH_P_ARP)
|
||||
return arp_reduce(dev, skb);
|
||||
else if ((vxlan->flags&VXLAN_F_RSC) && ntohs(eth->h_proto) == ETH_P_IP)
|
||||
did_rsc = route_shortcircuit(dev, skb);
|
||||
|
||||
f = vxlan_find_mac(vxlan, eth->h_dest);
|
||||
if (f == NULL) {
|
||||
did_rsc = false;
|
||||
dst = vxlan->gaddr;
|
||||
if (!dst && (vxlan->flags & VXLAN_F_L2MISS) &&
|
||||
!is_multicast_ether_addr(eth->h_dest))
|
||||
vxlan_fdb_miss(vxlan, eth->h_dest);
|
||||
} else
|
||||
dst = f->remote_ip;
|
||||
|
||||
if (!dst) {
|
||||
if (did_rsc) {
|
||||
__skb_pull(skb, skb_network_offset(skb));
|
||||
skb->ip_summed = CHECKSUM_NONE;
|
||||
skb->pkt_type = PACKET_HOST;
|
||||
|
||||
/* short-circuited back to local bridge */
|
||||
if (netif_rx(skb) == NET_RX_SUCCESS) {
|
||||
struct vxlan_stats *stats =
|
||||
this_cpu_ptr(vxlan->stats);
|
||||
|
||||
u64_stats_update_begin(&stats->syncp);
|
||||
stats->tx_packets++;
|
||||
stats->tx_bytes += pkt_len;
|
||||
u64_stats_update_end(&stats->syncp);
|
||||
} else {
|
||||
dev->stats.tx_errors++;
|
||||
dev->stats.tx_aborted_errors++;
|
||||
}
|
||||
return NETDEV_TX_OK;
|
||||
}
|
||||
goto drop;
|
||||
}
|
||||
|
||||
/* Need space for new headers (invalidates iph ptr) */
|
||||
if (skb_cow_head(skb, VXLAN_HEADROOM))
|
||||
|
@ -1019,6 +1197,10 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
|
|||
[IFLA_VXLAN_AGEING] = { .type = NLA_U32 },
|
||||
[IFLA_VXLAN_LIMIT] = { .type = NLA_U32 },
|
||||
[IFLA_VXLAN_PORT_RANGE] = { .len = sizeof(struct ifla_vxlan_port_range) },
|
||||
[IFLA_VXLAN_PROXY] = { .type = NLA_U8 },
|
||||
[IFLA_VXLAN_RSC] = { .type = NLA_U8 },
|
||||
[IFLA_VXLAN_L2MISS] = { .type = NLA_U8 },
|
||||
[IFLA_VXLAN_L3MISS] = { .type = NLA_U8 },
|
||||
};
|
||||
|
||||
static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
|
||||
|
@ -1114,13 +1296,25 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
|
|||
vxlan->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);
|
||||
|
||||
if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
|
||||
vxlan->learn = true;
|
||||
vxlan->flags |= VXLAN_F_LEARN;
|
||||
|
||||
if (data[IFLA_VXLAN_AGEING])
|
||||
vxlan->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
|
||||
else
|
||||
vxlan->age_interval = FDB_AGE_DEFAULT;
|
||||
|
||||
if (data[IFLA_VXLAN_PROXY] && nla_get_u8(data[IFLA_VXLAN_PROXY]))
|
||||
vxlan->flags |= VXLAN_F_PROXY;
|
||||
|
||||
if (data[IFLA_VXLAN_RSC] && nla_get_u8(data[IFLA_VXLAN_RSC]))
|
||||
vxlan->flags |= VXLAN_F_RSC;
|
||||
|
||||
if (data[IFLA_VXLAN_L2MISS] && nla_get_u8(data[IFLA_VXLAN_L2MISS]))
|
||||
vxlan->flags |= VXLAN_F_L2MISS;
|
||||
|
||||
if (data[IFLA_VXLAN_L3MISS] && nla_get_u8(data[IFLA_VXLAN_L3MISS]))
|
||||
vxlan->flags |= VXLAN_F_L3MISS;
|
||||
|
||||
if (data[IFLA_VXLAN_LIMIT])
|
||||
vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
|
||||
|
||||
|
@ -1157,6 +1351,10 @@ static size_t vxlan_get_size(const struct net_device *dev)
|
|||
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */
|
||||
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */
|
||||
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */
|
||||
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_PROXY */
|
||||
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */
|
||||
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */
|
||||
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */
|
||||
nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */
|
||||
nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */
|
||||
nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
|
||||
|
@ -1185,7 +1383,15 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
|
|||
|
||||
if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) ||
|
||||
nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) ||
|
||||
nla_put_u8(skb, IFLA_VXLAN_LEARNING, vxlan->learn) ||
|
||||
nla_put_u8(skb, IFLA_VXLAN_LEARNING,
|
||||
!!(vxlan->flags & VXLAN_F_LEARN)) ||
|
||||
nla_put_u8(skb, IFLA_VXLAN_PROXY,
|
||||
!!(vxlan->flags & VXLAN_F_PROXY)) ||
|
||||
nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->flags & VXLAN_F_RSC)) ||
|
||||
nla_put_u8(skb, IFLA_VXLAN_L2MISS,
|
||||
!!(vxlan->flags & VXLAN_F_L2MISS)) ||
|
||||
nla_put_u8(skb, IFLA_VXLAN_L3MISS,
|
||||
!!(vxlan->flags & VXLAN_F_L3MISS)) ||
|
||||
nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) ||
|
||||
nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax))
|
||||
goto nla_put_failure;
|
||||
|
|
|
@ -302,6 +302,10 @@ enum {
|
|||
IFLA_VXLAN_AGEING,
|
||||
IFLA_VXLAN_LIMIT,
|
||||
IFLA_VXLAN_PORT_RANGE,
|
||||
IFLA_VXLAN_PROXY,
|
||||
IFLA_VXLAN_RSC,
|
||||
IFLA_VXLAN_L2MISS,
|
||||
IFLA_VXLAN_L3MISS,
|
||||
__IFLA_VXLAN_MAX
|
||||
};
|
||||
#define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1)
|
||||
|
|
Loading…
Reference in New Issue