2019-05-27 14:55:01 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
2016-11-08 21:57:41 +08:00
|
|
|
/*
|
|
|
|
* SR-IPv6 implementation
|
|
|
|
*
|
|
|
|
* Author:
|
|
|
|
* David Lebrun <david.lebrun@uclouvain.be>
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/skbuff.h>
|
|
|
|
#include <linux/net.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <net/ip.h>
|
2018-03-30 00:59:36 +08:00
|
|
|
#include <net/ip_tunnels.h>
|
2016-11-08 21:57:41 +08:00
|
|
|
#include <net/lwtunnel.h>
|
|
|
|
#include <net/netevent.h>
|
|
|
|
#include <net/netns/generic.h>
|
|
|
|
#include <net/ip6_fib.h>
|
|
|
|
#include <net/route.h>
|
|
|
|
#include <net/seg6.h>
|
|
|
|
#include <linux/seg6.h>
|
|
|
|
#include <linux/seg6_iptunnel.h>
|
|
|
|
#include <net/addrconf.h>
|
|
|
|
#include <net/ip6_route.h>
|
|
|
|
#include <net/dst_cache.h>
|
2016-11-08 21:59:19 +08:00
|
|
|
#ifdef CONFIG_IPV6_SEG6_HMAC
|
|
|
|
#include <net/seg6_hmac.h>
|
|
|
|
#endif
|
2016-11-08 21:57:41 +08:00
|
|
|
|
|
|
|
struct seg6_lwt {
|
|
|
|
struct dst_cache cache;
|
|
|
|
struct seg6_iptunnel_encap tuninfo[0];
|
|
|
|
};
|
|
|
|
|
|
|
|
static inline struct seg6_lwt *seg6_lwt_lwtunnel(struct lwtunnel_state *lwt)
|
|
|
|
{
|
|
|
|
return (struct seg6_lwt *)lwt->data;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline struct seg6_iptunnel_encap *
|
|
|
|
seg6_encap_lwtunnel(struct lwtunnel_state *lwt)
|
|
|
|
{
|
|
|
|
return seg6_lwt_lwtunnel(lwt)->tuninfo;
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct nla_policy seg6_iptunnel_policy[SEG6_IPTUNNEL_MAX + 1] = {
|
|
|
|
[SEG6_IPTUNNEL_SRH] = { .type = NLA_BINARY },
|
|
|
|
};
|
|
|
|
|
2017-02-07 00:15:05 +08:00
|
|
|
static int nla_put_srh(struct sk_buff *skb, int attrtype,
|
|
|
|
struct seg6_iptunnel_encap *tuninfo)
|
2016-11-08 21:57:41 +08:00
|
|
|
{
|
|
|
|
struct seg6_iptunnel_encap *data;
|
|
|
|
struct nlattr *nla;
|
|
|
|
int len;
|
|
|
|
|
|
|
|
len = SEG6_IPTUN_ENCAP_SIZE(tuninfo);
|
|
|
|
|
|
|
|
nla = nla_reserve(skb, attrtype, len);
|
|
|
|
if (!nla)
|
|
|
|
return -EMSGSIZE;
|
|
|
|
|
|
|
|
data = nla_data(nla);
|
|
|
|
memcpy(data, tuninfo, len);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void set_tun_src(struct net *net, struct net_device *dev,
|
|
|
|
struct in6_addr *daddr, struct in6_addr *saddr)
|
|
|
|
{
|
|
|
|
struct seg6_pernet_data *sdata = seg6_pernet(net);
|
|
|
|
struct in6_addr *tun_src;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
|
|
|
|
tun_src = rcu_dereference(sdata->tun_src);
|
|
|
|
|
|
|
|
if (!ipv6_addr_any(tun_src)) {
|
|
|
|
memcpy(saddr, tun_src, sizeof(struct in6_addr));
|
|
|
|
} else {
|
|
|
|
ipv6_dev_get_saddr(net, dev, daddr, IPV6_PREFER_SRC_PUBLIC,
|
|
|
|
saddr);
|
|
|
|
}
|
|
|
|
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
ipv6: sr: Compute flowlabel for outer IPv6 header of seg6 encap mode
ECMP (equal-cost multipath) hashes are typically computed on the packets'
5-tuple(src IP, dst IP, src port, dst port, L4 proto).
For encapsulated packets, the L4 data is not readily available and ECMP
hashing will often revert to (src IP, dst IP). This will lead to traffic
polarization on a single ECMP path, causing congestion and waste of network
capacity.
In IPv6, the 20-bit flow label field is also used as part of the ECMP hash.
In the lack of L4 data, the hashing will be on (src IP, dst IP, flow
label). Having a non-zero flow label is thus important for proper traffic
load balancing when L4 data is unavailable (i.e., when packets are
encapsulated).
Currently, the seg6_do_srh_encap() function extracts the original packet's
flow label and set it as the outer IPv6 flow label. There are two issues
with this behaviour:
a) There is no guarantee that the inner flow label is set by the source.
b) If the original packet is not IPv6, the flow label will be set to
zero (e.g., IPv4 or L2 encap).
This patch adds a function, named seg6_make_flowlabel(), that computes a
flow label from a given skb. It supports IPv6, IPv4 and L2 payloads, and
leverages the per-namespace 'seg6_flowlabel' sysctl value.
The currently supported behaviours are as follows:
-1 set flowlabel to zero.
0 copy flowlabel from the inner packet in case of inner IPv6
(Set flowlabel to 0 in case of IPv4/L2)
1 Compute the flowlabel using seg6_make_flowlabel()
This patch has been tested for IPv6, IPv4, and L2 traffic.
Signed-off-by: Ahmed Abdelsalam <amsalam20@gmail.com>
Acked-by: David Lebrun <dlebrun@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-25 02:23:16 +08:00
|
|
|
/* Compute flowlabel for outer IPv6 header */
|
|
|
|
static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb,
|
|
|
|
struct ipv6hdr *inner_hdr)
|
|
|
|
{
|
|
|
|
int do_flowlabel = net->ipv6.sysctl.seg6_flowlabel;
|
|
|
|
__be32 flowlabel = 0;
|
|
|
|
u32 hash;
|
|
|
|
|
|
|
|
if (do_flowlabel > 0) {
|
|
|
|
hash = skb_get_hash(skb);
|
2018-07-17 23:52:54 +08:00
|
|
|
hash = rol32(hash, 16);
|
ipv6: sr: Compute flowlabel for outer IPv6 header of seg6 encap mode
ECMP (equal-cost multipath) hashes are typically computed on the packets'
5-tuple(src IP, dst IP, src port, dst port, L4 proto).
For encapsulated packets, the L4 data is not readily available and ECMP
hashing will often revert to (src IP, dst IP). This will lead to traffic
polarization on a single ECMP path, causing congestion and waste of network
capacity.
In IPv6, the 20-bit flow label field is also used as part of the ECMP hash.
In the lack of L4 data, the hashing will be on (src IP, dst IP, flow
label). Having a non-zero flow label is thus important for proper traffic
load balancing when L4 data is unavailable (i.e., when packets are
encapsulated).
Currently, the seg6_do_srh_encap() function extracts the original packet's
flow label and set it as the outer IPv6 flow label. There are two issues
with this behaviour:
a) There is no guarantee that the inner flow label is set by the source.
b) If the original packet is not IPv6, the flow label will be set to
zero (e.g., IPv4 or L2 encap).
This patch adds a function, named seg6_make_flowlabel(), that computes a
flow label from a given skb. It supports IPv6, IPv4 and L2 payloads, and
leverages the per namespace 'seg6_flowlabel" sysctl value.
The currently support behaviours are as follows:
-1 set flowlabel to zero.
0 copy flowlabel from Inner paceket in case of Inner IPv6
(Set flowlabel to 0 in case IPv4/L2)
1 Compute the flowlabel using seg6_make_flowlabel()
This patch has been tested for IPv6, IPv4, and L2 traffic.
Signed-off-by: Ahmed Abdelsalam <amsalam20@gmail.com>
Acked-by: David Lebrun <dlebrun@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-25 02:23:16 +08:00
|
|
|
flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK;
|
|
|
|
} else if (!do_flowlabel && skb->protocol == htons(ETH_P_IPV6)) {
|
|
|
|
flowlabel = ip6_flowlabel(inner_hdr);
|
|
|
|
}
|
|
|
|
return flowlabel;
|
|
|
|
}
|
|
|
|
|
2016-11-08 21:57:41 +08:00
|
|
|
/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */
|
2017-08-25 15:56:44 +08:00
|
|
|
int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
|
2016-11-08 21:57:41 +08:00
|
|
|
{
|
2018-03-20 22:44:56 +08:00
|
|
|
struct dst_entry *dst = skb_dst(skb);
|
|
|
|
struct net *net = dev_net(dst->dev);
|
2016-11-08 21:57:41 +08:00
|
|
|
struct ipv6hdr *hdr, *inner_hdr;
|
|
|
|
struct ipv6_sr_hdr *isrh;
|
|
|
|
int hdrlen, tot_len, err;
|
ipv6: sr: Compute flowlabel for outer IPv6 header of seg6 encap mode
ECMP (equal-cost multipath) hashes are typically computed on the packets'
5-tuple(src IP, dst IP, src port, dst port, L4 proto).
For encapsulated packets, the L4 data is not readily available and ECMP
hashing will often revert to (src IP, dst IP). This will lead to traffic
polarization on a single ECMP path, causing congestion and waste of network
capacity.
In IPv6, the 20-bit flow label field is also used as part of the ECMP hash.
In the lack of L4 data, the hashing will be on (src IP, dst IP, flow
label). Having a non-zero flow label is thus important for proper traffic
load balancing when L4 data is unavailable (i.e., when packets are
encapsulated).
Currently, the seg6_do_srh_encap() function extracts the original packet's
flow label and set it as the outer IPv6 flow label. There are two issues
with this behaviour:
a) There is no guarantee that the inner flow label is set by the source.
b) If the original packet is not IPv6, the flow label will be set to
zero (e.g., IPv4 or L2 encap).
This patch adds a function, named seg6_make_flowlabel(), that computes a
flow label from a given skb. It supports IPv6, IPv4 and L2 payloads, and
leverages the per namespace 'seg6_flowlabel" sysctl value.
The currently support behaviours are as follows:
-1 set flowlabel to zero.
0 copy flowlabel from Inner paceket in case of Inner IPv6
(Set flowlabel to 0 in case IPv4/L2)
1 Compute the flowlabel using seg6_make_flowlabel()
This patch has been tested for IPv6, IPv4, and L2 traffic.
Signed-off-by: Ahmed Abdelsalam <amsalam20@gmail.com>
Acked-by: David Lebrun <dlebrun@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-25 02:23:16 +08:00
|
|
|
__be32 flowlabel;
|
2016-11-08 21:57:41 +08:00
|
|
|
|
|
|
|
hdrlen = (osrh->hdrlen + 1) << 3;
|
|
|
|
tot_len = hdrlen + sizeof(*hdr);
|
|
|
|
|
2018-05-25 20:29:41 +08:00
|
|
|
err = skb_cow_head(skb, tot_len + skb->mac_len);
|
2016-11-08 21:57:41 +08:00
|
|
|
if (unlikely(err))
|
|
|
|
return err;
|
|
|
|
|
|
|
|
inner_hdr = ipv6_hdr(skb);
|
ipv6: sr: extract the right key values for "seg6_make_flowlabel"
The seg6_make_flowlabel() is used by seg6_do_srh_encap() to compute the
flowlabel from a given skb. It relies on skb_get_hash() which eventually
calls __skb_flow_dissect() to extract the flow_keys struct values from
the skb.
In case of IPv4 traffic, calling seg6_make_flowlabel() after skb_push(),
skb_reset_network_header(), and skb_mac_header_rebuild() will results in
flow_keys struct of all key values set to zero.
This patch calls seg6_make_flowlabel() before resetting the headers of skb
to get the right key values.
Extracted Key values are based on the type inner packet as follows:
1) IPv6 traffic: src_IP, dst_IP, L4 proto, and flowlabel of inner packet.
2) IPv4 traffic: src_IP, dst_IP, L4 proto, src_port, and dst_port
3) L2 traffic: depends on what kind of traffic carried into the L2
frame. IPv6 and IPv4 traffic works as discussed 1) and 2)
Here a hex_dump of struct flow_keys for IPv4 and IPv6 traffic
10.100.1.100: 47302 > 30.0.0.2: 5001
00000000: 14 00 02 00 00 00 00 00 08 00 11 00 00 00 00 00
00000010: 00 00 00 00 00 00 00 00 13 89 b8 c6 1e 00 00 02
00000020: 0a 64 01 64
fc00:a1:a > b2::2
00000000: 28 00 03 00 00 00 00 00 86 dd 11 00 99 f9 02 00
00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 b2 00 00
00000020: 00 00 00 00 00 00 00 00 00 00 00 02 fc 00 00 a1
00000030: 00 00 00 00 00 00 00 00 00 00 00 0a
Signed-off-by: Ahmed Abdelsalam <amsalam20@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-28 18:18:35 +08:00
|
|
|
flowlabel = seg6_make_flowlabel(net, skb, inner_hdr);
|
2016-11-08 21:57:41 +08:00
|
|
|
|
|
|
|
skb_push(skb, tot_len);
|
|
|
|
skb_reset_network_header(skb);
|
|
|
|
skb_mac_header_rebuild(skb);
|
|
|
|
hdr = ipv6_hdr(skb);
|
|
|
|
|
|
|
|
/* inherit tc, flowlabel and hlim
|
|
|
|
* hlim will be decremented in ip6_forward() afterwards and
|
|
|
|
* decapsulation will overwrite inner hlim with outer hlim
|
|
|
|
*/
|
2017-08-25 15:56:44 +08:00
|
|
|
|
|
|
|
if (skb->protocol == htons(ETH_P_IPV6)) {
|
|
|
|
ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)),
|
ipv6: sr: Compute flowlabel for outer IPv6 header of seg6 encap mode
ECMP (equal-cost multipath) hashes are typically computed on the packets'
5-tuple(src IP, dst IP, src port, dst port, L4 proto).
For encapsulated packets, the L4 data is not readily available and ECMP
hashing will often revert to (src IP, dst IP). This will lead to traffic
polarization on a single ECMP path, causing congestion and waste of network
capacity.
In IPv6, the 20-bit flow label field is also used as part of the ECMP hash.
In the lack of L4 data, the hashing will be on (src IP, dst IP, flow
label). Having a non-zero flow label is thus important for proper traffic
load balancing when L4 data is unavailable (i.e., when packets are
encapsulated).
Currently, the seg6_do_srh_encap() function extracts the original packet's
flow label and set it as the outer IPv6 flow label. There are two issues
with this behaviour:
a) There is no guarantee that the inner flow label is set by the source.
b) If the original packet is not IPv6, the flow label will be set to
zero (e.g., IPv4 or L2 encap).
This patch adds a function, named seg6_make_flowlabel(), that computes a
flow label from a given skb. It supports IPv6, IPv4 and L2 payloads, and
leverages the per namespace 'seg6_flowlabel" sysctl value.
The currently support behaviours are as follows:
-1 set flowlabel to zero.
0 copy flowlabel from Inner paceket in case of Inner IPv6
(Set flowlabel to 0 in case IPv4/L2)
1 Compute the flowlabel using seg6_make_flowlabel()
This patch has been tested for IPv6, IPv4, and L2 traffic.
Signed-off-by: Ahmed Abdelsalam <amsalam20@gmail.com>
Acked-by: David Lebrun <dlebrun@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-25 02:23:16 +08:00
|
|
|
flowlabel);
|
2017-08-25 15:56:44 +08:00
|
|
|
hdr->hop_limit = inner_hdr->hop_limit;
|
|
|
|
} else {
|
ipv6: sr: Compute flowlabel for outer IPv6 header of seg6 encap mode
ECMP (equal-cost multipath) hashes are typically computed on the packets'
5-tuple(src IP, dst IP, src port, dst port, L4 proto).
For encapsulated packets, the L4 data is not readily available and ECMP
hashing will often revert to (src IP, dst IP). This will lead to traffic
polarization on a single ECMP path, causing congestion and waste of network
capacity.
In IPv6, the 20-bit flow label field is also used as part of the ECMP hash.
In the lack of L4 data, the hashing will be on (src IP, dst IP, flow
label). Having a non-zero flow label is thus important for proper traffic
load balancing when L4 data is unavailable (i.e., when packets are
encapsulated).
Currently, the seg6_do_srh_encap() function extracts the original packet's
flow label and set it as the outer IPv6 flow label. There are two issues
with this behaviour:
a) There is no guarantee that the inner flow label is set by the source.
b) If the original packet is not IPv6, the flow label will be set to
zero (e.g., IPv4 or L2 encap).
This patch adds a function, named seg6_make_flowlabel(), that computes a
flow label from a given skb. It supports IPv6, IPv4 and L2 payloads, and
leverages the per namespace 'seg6_flowlabel" sysctl value.
The currently support behaviours are as follows:
-1 set flowlabel to zero.
0 copy flowlabel from Inner paceket in case of Inner IPv6
(Set flowlabel to 0 in case IPv4/L2)
1 Compute the flowlabel using seg6_make_flowlabel()
This patch has been tested for IPv6, IPv4, and L2 traffic.
Signed-off-by: Ahmed Abdelsalam <amsalam20@gmail.com>
Acked-by: David Lebrun <dlebrun@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-25 02:23:16 +08:00
|
|
|
ip6_flow_hdr(hdr, 0, flowlabel);
|
2017-08-25 15:56:44 +08:00
|
|
|
hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb));
|
2019-01-29 14:52:34 +08:00
|
|
|
|
|
|
|
memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
|
2024-06-12 13:13:20 +08:00
|
|
|
|
|
|
|
/* the control block has been erased, so we have to set the
|
|
|
|
* iif once again.
|
|
|
|
* We read the receiving interface index directly from the
|
|
|
|
* skb->skb_iif as it is done in the IPv4 receiving path (i.e.:
|
|
|
|
* ip_rcv_core(...)).
|
|
|
|
*/
|
|
|
|
IP6CB(skb)->iif = skb->skb_iif;
|
2017-08-25 15:56:44 +08:00
|
|
|
}
|
|
|
|
|
2016-11-08 21:57:41 +08:00
|
|
|
hdr->nexthdr = NEXTHDR_ROUTING;
|
|
|
|
|
|
|
|
isrh = (void *)hdr + sizeof(*hdr);
|
|
|
|
memcpy(isrh, osrh, hdrlen);
|
|
|
|
|
2017-08-25 15:56:44 +08:00
|
|
|
isrh->nexthdr = proto;
|
2016-11-08 21:57:41 +08:00
|
|
|
|
|
|
|
hdr->daddr = isrh->segments[isrh->first_segment];
|
2018-04-20 21:58:05 +08:00
|
|
|
set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr);
|
2016-11-08 21:57:41 +08:00
|
|
|
|
2016-11-08 21:59:19 +08:00
|
|
|
#ifdef CONFIG_IPV6_SEG6_HMAC
|
|
|
|
if (sr_has_hmac(isrh)) {
|
|
|
|
err = seg6_push_hmac(net, &hdr->saddr, isrh);
|
|
|
|
if (unlikely(err))
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2024-06-12 13:13:20 +08:00
|
|
|
hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
|
|
|
|
|
2016-11-08 21:57:41 +08:00
|
|
|
skb_postpush_rcsum(skb, hdr, tot_len);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2017-08-05 18:38:25 +08:00
|
|
|
EXPORT_SYMBOL_GPL(seg6_do_srh_encap);
|
2016-11-08 21:57:41 +08:00
|
|
|
|
|
|
|
/* insert an SRH within an IPv6 packet, just after the IPv6 header */
|
2017-08-05 18:38:25 +08:00
|
|
|
int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
|
2016-11-08 21:57:41 +08:00
|
|
|
{
|
|
|
|
struct ipv6hdr *hdr, *oldhdr;
|
|
|
|
struct ipv6_sr_hdr *isrh;
|
|
|
|
int hdrlen, err;
|
|
|
|
|
|
|
|
hdrlen = (osrh->hdrlen + 1) << 3;
|
|
|
|
|
2018-05-25 20:29:41 +08:00
|
|
|
err = skb_cow_head(skb, hdrlen + skb->mac_len);
|
2016-11-08 21:57:41 +08:00
|
|
|
if (unlikely(err))
|
|
|
|
return err;
|
|
|
|
|
|
|
|
oldhdr = ipv6_hdr(skb);
|
|
|
|
|
|
|
|
skb_pull(skb, sizeof(struct ipv6hdr));
|
|
|
|
skb_postpull_rcsum(skb, skb_network_header(skb),
|
|
|
|
sizeof(struct ipv6hdr));
|
|
|
|
|
|
|
|
skb_push(skb, sizeof(struct ipv6hdr) + hdrlen);
|
|
|
|
skb_reset_network_header(skb);
|
|
|
|
skb_mac_header_rebuild(skb);
|
|
|
|
|
|
|
|
hdr = ipv6_hdr(skb);
|
|
|
|
|
|
|
|
memmove(hdr, oldhdr, sizeof(*hdr));
|
|
|
|
|
|
|
|
isrh = (void *)hdr + sizeof(*hdr);
|
|
|
|
memcpy(isrh, osrh, hdrlen);
|
|
|
|
|
|
|
|
isrh->nexthdr = hdr->nexthdr;
|
|
|
|
hdr->nexthdr = NEXTHDR_ROUTING;
|
|
|
|
|
|
|
|
isrh->segments[0] = hdr->daddr;
|
|
|
|
hdr->daddr = isrh->segments[isrh->first_segment];
|
|
|
|
|
2016-11-08 21:59:19 +08:00
|
|
|
#ifdef CONFIG_IPV6_SEG6_HMAC
|
|
|
|
if (sr_has_hmac(isrh)) {
|
|
|
|
struct net *net = dev_net(skb_dst(skb)->dev);
|
|
|
|
|
|
|
|
err = seg6_push_hmac(net, &hdr->saddr, isrh);
|
|
|
|
if (unlikely(err))
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2024-06-12 13:13:20 +08:00
|
|
|
hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
|
|
|
|
|
2016-11-08 21:57:41 +08:00
|
|
|
skb_postpush_rcsum(skb, hdr, sizeof(struct ipv6hdr) + hdrlen);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2017-08-05 18:38:25 +08:00
|
|
|
EXPORT_SYMBOL_GPL(seg6_do_srh_inline);
|
2016-11-08 21:57:41 +08:00
|
|
|
|
|
|
|
static int seg6_do_srh(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
struct dst_entry *dst = skb_dst(skb);
|
|
|
|
struct seg6_iptunnel_encap *tinfo;
|
2017-08-25 15:56:44 +08:00
|
|
|
int proto, err = 0;
|
2016-11-08 21:57:41 +08:00
|
|
|
|
|
|
|
tinfo = seg6_encap_lwtunnel(dst->lwtstate);
|
|
|
|
|
|
|
|
switch (tinfo->mode) {
|
|
|
|
case SEG6_IPTUN_MODE_INLINE:
|
2017-08-25 15:56:44 +08:00
|
|
|
if (skb->protocol != htons(ETH_P_IPV6))
|
|
|
|
return -EINVAL;
|
|
|
|
|
2016-11-08 21:57:41 +08:00
|
|
|
err = seg6_do_srh_inline(skb, tinfo->srh);
|
2017-08-25 15:56:44 +08:00
|
|
|
if (err)
|
|
|
|
return err;
|
2016-11-08 21:57:41 +08:00
|
|
|
break;
|
|
|
|
case SEG6_IPTUN_MODE_ENCAP:
|
2018-03-30 00:59:36 +08:00
|
|
|
err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
2017-08-25 15:56:44 +08:00
|
|
|
if (skb->protocol == htons(ETH_P_IPV6))
|
|
|
|
proto = IPPROTO_IPV6;
|
|
|
|
else if (skb->protocol == htons(ETH_P_IP))
|
|
|
|
proto = IPPROTO_IPIP;
|
|
|
|
else
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
err = seg6_do_srh_encap(skb, tinfo->srh, proto);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
2018-03-30 00:59:36 +08:00
|
|
|
skb_set_inner_transport_header(skb, skb_transport_offset(skb));
|
|
|
|
skb_set_inner_protocol(skb, skb->protocol);
|
2017-08-25 15:56:45 +08:00
|
|
|
skb->protocol = htons(ETH_P_IPV6);
|
|
|
|
break;
|
|
|
|
case SEG6_IPTUN_MODE_L2ENCAP:
|
|
|
|
if (!skb_mac_header_was_set(skb))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (pskb_expand_head(skb, skb->mac_len, 0, GFP_ATOMIC) < 0)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
skb_mac_header_rebuild(skb);
|
|
|
|
skb_push(skb, skb->mac_len);
|
|
|
|
|
|
|
|
err = seg6_do_srh_encap(skb, tinfo->srh, NEXTHDR_NONE);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
2017-08-25 15:56:44 +08:00
|
|
|
skb->protocol = htons(ETH_P_IPV6);
|
2016-11-08 21:57:41 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
skb_set_transport_header(skb, sizeof(struct ipv6hdr));
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-02-07 00:15:05 +08:00
|
|
|
static int seg6_input(struct sk_buff *skb)
|
2016-11-08 21:57:41 +08:00
|
|
|
{
|
2017-03-24 17:46:27 +08:00
|
|
|
struct dst_entry *orig_dst = skb_dst(skb);
|
|
|
|
struct dst_entry *dst = NULL;
|
|
|
|
struct seg6_lwt *slwt;
|
2016-11-08 21:57:41 +08:00
|
|
|
int err;
|
|
|
|
|
|
|
|
err = seg6_do_srh(skb);
|
|
|
|
if (unlikely(err)) {
|
|
|
|
kfree_skb(skb);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2017-03-24 17:46:27 +08:00
|
|
|
slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate);
|
|
|
|
|
|
|
|
preempt_disable();
|
|
|
|
dst = dst_cache_get(&slwt->cache);
|
|
|
|
preempt_enable();
|
|
|
|
|
2016-11-08 21:57:41 +08:00
|
|
|
skb_dst_drop(skb);
|
2017-03-24 17:46:27 +08:00
|
|
|
|
|
|
|
if (!dst) {
|
|
|
|
ip6_route_input(skb);
|
|
|
|
dst = skb_dst(skb);
|
|
|
|
if (!dst->error) {
|
|
|
|
preempt_disable();
|
|
|
|
dst_cache_set_ip6(&slwt->cache, dst,
|
|
|
|
&ipv6_hdr(skb)->saddr);
|
|
|
|
preempt_enable();
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
skb_dst_set(skb, dst);
|
|
|
|
}
|
2016-11-08 21:57:41 +08:00
|
|
|
|
2017-04-16 18:27:14 +08:00
|
|
|
err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
|
|
|
|
if (unlikely(err))
|
|
|
|
return err;
|
|
|
|
|
2016-11-08 21:57:41 +08:00
|
|
|
return dst_input(skb);
|
|
|
|
}
|
|
|
|
|
2017-02-07 00:15:05 +08:00
|
|
|
static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
|
2016-11-08 21:57:41 +08:00
|
|
|
{
|
|
|
|
struct dst_entry *orig_dst = skb_dst(skb);
|
|
|
|
struct dst_entry *dst = NULL;
|
|
|
|
struct seg6_lwt *slwt;
|
|
|
|
int err = -EINVAL;
|
|
|
|
|
|
|
|
err = seg6_do_srh(skb);
|
|
|
|
if (unlikely(err))
|
|
|
|
goto drop;
|
|
|
|
|
|
|
|
slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate);
|
|
|
|
|
2017-01-13 04:30:01 +08:00
|
|
|
preempt_disable();
|
2016-11-08 21:57:41 +08:00
|
|
|
dst = dst_cache_get(&slwt->cache);
|
2017-01-13 04:30:01 +08:00
|
|
|
preempt_enable();
|
2016-11-08 21:57:41 +08:00
|
|
|
|
|
|
|
if (unlikely(!dst)) {
|
|
|
|
struct ipv6hdr *hdr = ipv6_hdr(skb);
|
|
|
|
struct flowi6 fl6;
|
|
|
|
|
2018-12-07 15:50:17 +08:00
|
|
|
memset(&fl6, 0, sizeof(fl6));
|
2016-11-08 21:57:41 +08:00
|
|
|
fl6.daddr = hdr->daddr;
|
|
|
|
fl6.saddr = hdr->saddr;
|
|
|
|
fl6.flowlabel = ip6_flowinfo(hdr);
|
|
|
|
fl6.flowi6_mark = skb->mark;
|
|
|
|
fl6.flowi6_proto = hdr->nexthdr;
|
|
|
|
|
|
|
|
dst = ip6_route_output(net, NULL, &fl6);
|
|
|
|
if (dst->error) {
|
|
|
|
err = dst->error;
|
|
|
|
dst_release(dst);
|
|
|
|
goto drop;
|
|
|
|
}
|
|
|
|
|
2017-01-13 04:30:01 +08:00
|
|
|
preempt_disable();
|
2016-11-08 21:57:41 +08:00
|
|
|
dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr);
|
2017-01-13 04:30:01 +08:00
|
|
|
preempt_enable();
|
2016-11-08 21:57:41 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
skb_dst_drop(skb);
|
|
|
|
skb_dst_set(skb, dst);
|
|
|
|
|
2017-04-16 18:27:14 +08:00
|
|
|
err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
|
|
|
|
if (unlikely(err))
|
|
|
|
goto drop;
|
|
|
|
|
2016-11-08 21:57:41 +08:00
|
|
|
return dst_output(net, sk, skb);
|
|
|
|
drop:
|
|
|
|
kfree_skb(skb);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2017-01-31 04:07:37 +08:00
|
|
|
static int seg6_build_state(struct nlattr *nla,
|
2016-11-08 21:57:41 +08:00
|
|
|
unsigned int family, const void *cfg,
|
2017-05-28 06:19:28 +08:00
|
|
|
struct lwtunnel_state **ts,
|
|
|
|
struct netlink_ext_ack *extack)
|
2016-11-08 21:57:41 +08:00
|
|
|
{
|
|
|
|
struct nlattr *tb[SEG6_IPTUNNEL_MAX + 1];
|
|
|
|
struct seg6_iptunnel_encap *tuninfo;
|
|
|
|
struct lwtunnel_state *newts;
|
|
|
|
int tuninfo_len, min_size;
|
|
|
|
struct seg6_lwt *slwt;
|
|
|
|
int err;
|
|
|
|
|
2017-08-25 15:56:44 +08:00
|
|
|
if (family != AF_INET && family != AF_INET6)
|
|
|
|
return -EINVAL;
|
|
|
|
|
netlink: make validation more configurable for future strictness
We currently have two levels of strict validation:
1) liberal (default)
- undefined (type >= max) & NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
- garbage at end of message accepted
2) strict (opt-in)
- NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
Split out parsing strictness into four different options:
* TRAILING - check that there's no trailing data after parsing
attributes (in message or nested)
* MAXTYPE - reject attrs > max known type
* UNSPEC - reject attributes with NLA_UNSPEC policy entries
* STRICT_ATTRS - strictly validate attribute size
The default for future things should be *everything*.
The current *_strict() is a combination of TRAILING and MAXTYPE,
and is renamed to _deprecated_strict().
The current regular parsing has none of this, and is renamed to
*_parse_deprecated().
Additionally it allows us to selectively set one of the new flags
even on old policies. Notably, the UNSPEC flag could be useful in
this case, since it can be arranged (by filling in the policy) to
not be an incompatible userspace ABI change, but would then going
forward prevent forgetting attribute entries. Similar can apply
to the POLICY flag.
We end up with the following renames:
* nla_parse -> nla_parse_deprecated
* nla_parse_strict -> nla_parse_deprecated_strict
* nlmsg_parse -> nlmsg_parse_deprecated
* nlmsg_parse_strict -> nlmsg_parse_deprecated_strict
* nla_parse_nested -> nla_parse_nested_deprecated
* nla_validate_nested -> nla_validate_nested_deprecated
Using spatch, of course:
@@
expression TB, MAX, HEAD, LEN, POL, EXT;
@@
-nla_parse(TB, MAX, HEAD, LEN, POL, EXT)
+nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression TB, MAX, NLA, POL, EXT;
@@
-nla_parse_nested(TB, MAX, NLA, POL, EXT)
+nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT)
@@
expression START, MAX, POL, EXT;
@@
-nla_validate_nested(START, MAX, POL, EXT)
+nla_validate_nested_deprecated(START, MAX, POL, EXT)
@@
expression NLH, HDRLEN, MAX, POL, EXT;
@@
-nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT)
+nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT)
For this patch, don't actually add the strict, non-renamed versions
yet so that it breaks compile if I get it wrong.
Also, while at it, make nla_validate and nla_parse go down to a
common __nla_validate_parse() function to avoid code duplication.
Ultimately, this allows us to have very strict validation for every
new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the
next patch, while existing things will continue to work as is.
In effect then, this adds fully strict validation for any new command.
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-26 20:07:28 +08:00
|
|
|
err = nla_parse_nested_deprecated(tb, SEG6_IPTUNNEL_MAX, nla,
|
|
|
|
seg6_iptunnel_policy, extack);
|
2016-11-08 21:57:41 +08:00
|
|
|
|
|
|
|
if (err < 0)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
if (!tb[SEG6_IPTUNNEL_SRH])
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
tuninfo = nla_data(tb[SEG6_IPTUNNEL_SRH]);
|
|
|
|
tuninfo_len = nla_len(tb[SEG6_IPTUNNEL_SRH]);
|
|
|
|
|
|
|
|
/* tuninfo must contain at least the iptunnel encap structure,
|
|
|
|
* the SRH and one segment
|
|
|
|
*/
|
|
|
|
min_size = sizeof(*tuninfo) + sizeof(struct ipv6_sr_hdr) +
|
|
|
|
sizeof(struct in6_addr);
|
|
|
|
if (tuninfo_len < min_size)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
switch (tuninfo->mode) {
|
|
|
|
case SEG6_IPTUN_MODE_INLINE:
|
2017-08-25 15:56:44 +08:00
|
|
|
if (family != AF_INET6)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2016-11-08 21:57:41 +08:00
|
|
|
break;
|
|
|
|
case SEG6_IPTUN_MODE_ENCAP:
|
|
|
|
break;
|
2017-08-25 15:56:45 +08:00
|
|
|
case SEG6_IPTUN_MODE_L2ENCAP:
|
|
|
|
break;
|
2016-11-08 21:57:41 +08:00
|
|
|
default:
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* verify that SRH is consistent */
|
|
|
|
if (!seg6_validate_srh(tuninfo->srh, tuninfo_len - sizeof(*tuninfo)))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
newts = lwtunnel_state_alloc(tuninfo_len + sizeof(*slwt));
|
|
|
|
if (!newts)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
slwt = seg6_lwt_lwtunnel(newts);
|
|
|
|
|
2018-03-20 22:44:55 +08:00
|
|
|
err = dst_cache_init(&slwt->cache, GFP_ATOMIC);
|
2016-11-08 21:57:41 +08:00
|
|
|
if (err) {
|
|
|
|
kfree(newts);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
memcpy(&slwt->tuninfo, tuninfo, tuninfo_len);
|
|
|
|
|
|
|
|
newts->type = LWTUNNEL_ENCAP_SEG6;
|
2017-08-25 15:56:45 +08:00
|
|
|
newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
|
|
|
|
|
|
|
|
if (tuninfo->mode != SEG6_IPTUN_MODE_L2ENCAP)
|
|
|
|
newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
|
|
|
|
|
2016-11-08 21:57:41 +08:00
|
|
|
newts->headroom = seg6_lwt_headroom(tuninfo);
|
|
|
|
|
|
|
|
*ts = newts;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void seg6_destroy_state(struct lwtunnel_state *lwt)
|
|
|
|
{
|
|
|
|
dst_cache_destroy(&seg6_lwt_lwtunnel(lwt)->cache);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int seg6_fill_encap_info(struct sk_buff *skb,
|
|
|
|
struct lwtunnel_state *lwtstate)
|
|
|
|
{
|
|
|
|
struct seg6_iptunnel_encap *tuninfo = seg6_encap_lwtunnel(lwtstate);
|
|
|
|
|
|
|
|
if (nla_put_srh(skb, SEG6_IPTUNNEL_SRH, tuninfo))
|
|
|
|
return -EMSGSIZE;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Return the netlink attribute size needed to dump the encap info. */
static int seg6_encap_nlsize(struct lwtunnel_state *lwtstate)
{
	struct seg6_iptunnel_encap *encap = seg6_encap_lwtunnel(lwtstate);
	int payload_len = SEG6_IPTUN_ENCAP_SIZE(encap);

	return nla_total_size(payload_len);
}
|
|
|
|
|
|
|
|
/* Compare the encap info of two lwtunnel states.
 *
 * Returns 0 when both have the same size and identical bytes, 1 when the
 * sizes differ, or the memcmp() result otherwise.
 */
static int seg6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
	struct seg6_iptunnel_encap *lhs = seg6_encap_lwtunnel(a);
	struct seg6_iptunnel_encap *rhs = seg6_encap_lwtunnel(b);
	int lhs_len = SEG6_IPTUN_ENCAP_SIZE(lhs);

	if (lhs_len == SEG6_IPTUN_ENCAP_SIZE(rhs))
		return memcmp(lhs, rhs, lhs_len);

	return 1;
}
|
|
|
|
|
|
|
|
static const struct lwtunnel_encap_ops seg6_iptun_ops = {
|
|
|
|
.build_state = seg6_build_state,
|
|
|
|
.destroy_state = seg6_destroy_state,
|
|
|
|
.output = seg6_output,
|
|
|
|
.input = seg6_input,
|
|
|
|
.fill_encap = seg6_fill_encap_info,
|
|
|
|
.get_encap_size = seg6_encap_nlsize,
|
|
|
|
.cmp_encap = seg6_encap_cmp,
|
2017-01-25 00:26:47 +08:00
|
|
|
.owner = THIS_MODULE,
|
2016-11-08 21:57:41 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
/* Register the seg6 encap ops; returns lwtunnel_encap_add_ops()'s result. */
int __init seg6_iptunnel_init(void)
{
	int ret = lwtunnel_encap_add_ops(&seg6_iptun_ops, LWTUNNEL_ENCAP_SEG6);

	return ret;
}
|
|
|
|
|
|
|
|
void seg6_iptunnel_exit(void)
|
|
|
|
{
|
|
|
|
lwtunnel_encap_del_ops(&seg6_iptun_ops, LWTUNNEL_ENCAP_SEG6);
|
|
|
|
}
|