OpenCloudOS-Kernel/include/net/route.h

333 lines
9.2 KiB
C
Raw Normal View History

/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* Definitions for the IP router.
*
* Version: @(#)route.h 1.0.4 05/27/93
*
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Fixes:
* Alan Cox : Reformatted. Added ip_rt_local()
* Alan Cox : Support for TCP parameters.
* Alexey Kuznetsov: Major changes for new routing code.
* Mike McLagan : Routing by source
* Robert Olsson : Added rt_cache statistics
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef _ROUTE_H
#define _ROUTE_H
#include <net/dst.h>
#include <net/inetpeer.h>
#include <net/flow.h>
#include <net/inet_sock.h>
#include <linux/in_route.h>
#include <linux/rtnetlink.h>
#include <linux/rcupdate.h>
#include <linux/route.h>
#include <linux/ip.h>
#include <linux/cache.h>
#include <linux/security.h>
#define RTO_ONLINK 0x01
#define RT_CONN_FLAGS(sk) (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE))
#define RT_CONN_FLAGS_TOS(sk,tos) (RT_TOS(tos) | sock_flag(sk, SOCK_LOCALROUTE))
struct fib_nh;
net: Implement read-only protection and COW'ing of metrics. Routing metrics are now copy-on-write. Initially a route entry points it's metrics at a read-only location. If a routing table entry exists, it will point there. Else it will point at the all zero metric place-holder called 'dst_default_metrics'. The writeability state of the metrics is stored in the low bits of the metrics pointer, we have two bits left to spare if we want to store more states. For the initial implementation, COW is implemented simply via kmalloc. However future enhancements will change this to place the writable metrics somewhere else, in order to increase sharing. Very likely this "somewhere else" will be the inetpeer cache. Note also that this means that metrics updates may transiently fail if we cannot COW the metrics successfully. But even by itself, this patch should decrease memory usage and increase cache locality especially for routing workloads. In those cases the read-only metric copies stay in place and never get written to. TCP workloads where metrics get updated, and those rare cases where PMTU triggers occur, will take a very slight performance hit. But that hit will be alleviated when the long-term writable metrics move to a more sharable location. Since the metrics storage went from a u32 array of RTAX_MAX entries to what is essentially a pointer, some retooling of the dst_entry layout was necessary. Most importantly, we need to preserve the alignment of the reference count so that it doesn't share cache lines with the read-mostly state, as per Eric Dumazet's alignment assertion checks. The only non-trivial bit here is the move of the 'flags' member into the writeable cacheline. This is OK since we are always accessing the flags around the same moment when we made a modification to the reference count. Signed-off-by: David S. Miller <davem@davemloft.net>
2011-01-27 12:51:05 +08:00
struct fib_info;
struct rtable {
struct dst_entry dst;
int rt_genid;
unsigned int rt_flags;
__u16 rt_type;
__u8 rt_is_input;
__u8 rt_uses_gateway;
int rt_iif;
/* Info on neighbour */
__be32 rt_gateway;
/* Miscellaneous cached information */
u32 rt_pmtu;
struct list_head rt_uncached;
};
static inline bool rt_is_input_route(const struct rtable *rt)
{
return rt->rt_is_input != 0;
}
static inline bool rt_is_output_route(const struct rtable *rt)
{
return rt->rt_is_input == 0;
}
static inline __be32 rt_nexthop(const struct rtable *rt, __be32 daddr)
{
if (rt->rt_gateway)
return rt->rt_gateway;
return daddr;
}
struct ip_rt_acct {
__u32 o_bytes;
__u32 o_packets;
__u32 i_bytes;
__u32 i_packets;
};
struct rt_cache_stat {
unsigned int in_slow_tot;
unsigned int in_slow_mc;
unsigned int in_no_route;
unsigned int in_brd;
unsigned int in_martian_dst;
unsigned int in_martian_src;
unsigned int out_slow_tot;
unsigned int out_slow_mc;
};
extern struct ip_rt_acct __percpu *ip_rt_acct;
struct in_device;
int ip_rt_init(void);
void rt_cache_flush(struct net *net);
void rt_flush_dev(struct net_device *dev);
struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp);
struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
struct sock *sk);
struct dst_entry *ipv4_blackhole_route(struct net *net,
struct dst_entry *dst_orig);
static inline struct rtable *ip_route_output_key(struct net *net, struct flowi4 *flp)
{
return ip_route_output_flow(net, flp, NULL);
}
static inline struct rtable *ip_route_output(struct net *net, __be32 daddr,
__be32 saddr, u8 tos, int oif)
{
struct flowi4 fl4 = {
.flowi4_oif = oif,
.flowi4_tos = tos,
.daddr = daddr,
.saddr = saddr,
};
return ip_route_output_key(net, &fl4);
}
static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi4 *fl4,
struct sock *sk,
__be32 daddr, __be32 saddr,
__be16 dport, __be16 sport,
__u8 proto, __u8 tos, int oif)
{
flowi4_init_output(fl4, oif, sk ? sk->sk_mark : 0, tos,
RT_SCOPE_UNIVERSE, proto,
sk ? inet_sk_flowi_flags(sk) : 0,
daddr, saddr, dport, sport);
if (sk)
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
return ip_route_output_flow(net, fl4, sk);
}
static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 *fl4,
__be32 daddr, __be32 saddr,
__be32 gre_key, __u8 tos, int oif)
{
memset(fl4, 0, sizeof(*fl4));
fl4->flowi4_oif = oif;
fl4->daddr = daddr;
fl4->saddr = saddr;
fl4->flowi4_tos = tos;
fl4->flowi4_proto = IPPROTO_GRE;
fl4->fl4_gre_key = gre_key;
return ip_route_output_key(net, fl4);
}
int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src,
u8 tos, struct net_device *devin);
static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
u8 tos, struct net_device *devin)
{
int err;
rcu_read_lock();
err = ip_route_input_noref(skb, dst, src, tos, devin);
if (!err)
skb_dst_force(skb);
rcu_read_unlock();
return err;
}
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif,
u32 mark, u8 protocol, int flow_flags);
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu);
void ipv4_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
u8 protocol, int flow_flags);
void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk);
void ip_rt_send_redirect(struct sk_buff *skb);
unsigned int inet_addr_type(struct net *net, __be32 addr);
unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
__be32 addr);
void ip_rt_multicast_event(struct in_device *);
int ip_rt_ioctl(struct net *, unsigned int cmd, void __user *arg);
void ip_rt_get_source(u8 *src, struct sk_buff *skb, struct rtable *rt);
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb);
struct in_ifaddr;
void fib_add_ifaddr(struct in_ifaddr *);
void fib_del_ifaddr(struct in_ifaddr *, struct in_ifaddr *);
static inline void ip_rt_put(struct rtable *rt)
{
/* dst_release() accepts a NULL parameter.
* We rely on dst being first structure in struct rtable
*/
BUILD_BUG_ON(offsetof(struct rtable, dst) != 0);
dst_release(&rt->dst);
}
#define IPTOS_RT_MASK (IPTOS_TOS_MASK & ~3)
extern const __u8 ip_tos2prio[16];
static inline char rt_tos2priority(u8 tos)
{
return ip_tos2prio[IPTOS_TOS(tos)>>1];
}
/* ip_route_connect() and ip_route_newports() work in tandem whilst
* binding a socket for a new outgoing connection.
*
* In order to use IPSEC properly, we must, in the end, have a
* route that was looked up using all available keys including source
* and destination ports.
*
* However, if a source port needs to be allocated (the user specified
* a wildcard source port) we need to obtain addressing information
* in order to perform that allocation.
*
* So ip_route_connect() looks up a route using wildcarded source and
* destination ports in the key, simply so that we can get a pair of
* addresses to use for port allocation.
*
* Later, once the ports are allocated, ip_route_newports() will make
* another route lookup if needed to make sure we catch any IPSEC
* rules keyed on the port information.
*
* The callers allocate the flow key on their stack, and must pass in
* the same flowi4 object to both the ip_route_connect() and the
* ip_route_newports() calls.
*/
static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 src,
u32 tos, int oif, u8 protocol,
__be16 sport, __be16 dport,
struct sock *sk, bool can_sleep)
{
__u8 flow_flags = 0;
if (inet_sk(sk)->transparent)
flow_flags |= FLOWI_FLAG_ANYSRC;
if (can_sleep)
flow_flags |= FLOWI_FLAG_CAN_SLEEP;
flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
protocol, flow_flags, dst, src, dport, sport);
}
static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
__be32 dst, __be32 src, u32 tos,
int oif, u8 protocol,
__be16 sport, __be16 dport,
struct sock *sk, bool can_sleep)
{
struct net *net = sock_net(sk);
struct rtable *rt;
ip_route_connect_init(fl4, dst, src, tos, oif, protocol,
sport, dport, sk, can_sleep);
if (!dst || !src) {
rt = __ip_route_output_key(net, fl4);
if (IS_ERR(rt))
return rt;
ip_rt_put(rt);
flowi4_update_output(fl4, oif, tos, fl4->daddr, fl4->saddr);
}
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
return ip_route_output_flow(net, fl4, sk);
}
static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable *rt,
__be16 orig_sport, __be16 orig_dport,
__be16 sport, __be16 dport,
struct sock *sk)
{
if (sport != orig_sport || dport != orig_dport) {
fl4->fl4_dport = dport;
fl4->fl4_sport = sport;
ip_rt_put(rt);
flowi4_update_output(fl4, sk->sk_bound_dev_if,
RT_CONN_FLAGS(sk), fl4->daddr,
fl4->saddr);
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
return ip_route_output_flow(sock_net(sk), fl4, sk);
}
return rt;
}
static inline int inet_iif(const struct sk_buff *skb)
{
int iif = skb_rtable(skb)->rt_iif;
if (iif)
return iif;
return skb->skb_iif;
}
extern int sysctl_ip_default_ttl;
static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
{
int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
if (hoplimit == 0)
hoplimit = sysctl_ip_default_ttl;
return hoplimit;
}
ipv4: introduce new IP_MTU_DISCOVER mode IP_PMTUDISC_INTERFACE Sockets marked with IP_PMTUDISC_INTERFACE won't do path mtu discovery, their sockets won't accept and install new path mtu information and they will always use the interface mtu for outgoing packets. It is guaranteed that the packet is not fragmented locally. But we won't set the DF-Flag on the outgoing frames. Florian Weimer had the idea to use this flag to ensure DNS servers are never generating outgoing fragments. They may well be fragmented on the path, but the server never stores or usees path mtu values, which could well be forged in an attack. (The root of the problem with path MTU discovery is that there is no reliable way to authenticate ICMP Fragmentation Needed But DF Set messages because they are sent from intermediate routers with their source addresses, and the IMCP payload will not always contain sufficient information to identify a flow.) Recent research in the DNS community showed that it is possible to implement an attack where DNS cache poisoning is feasible by spoofing fragments. This work was done by Amir Herzberg and Haya Shulman: <https://sites.google.com/site/hayashulman/files/fragmentation-poisoning.pdf> This issue was previously discussed among the DNS community, e.g. <http://www.ietf.org/mail-archive/web/dnsext/current/msg01204.html>, without leading to fixes. This patch depends on the patch "ipv4: fix DO and PROBE pmtu mode regarding local fragmentation with UFO/CORK" for the enforcement of the non-fragmentable checks. If other users than ip_append_page/data should use this semantic too, we have to add a new flag to IPCB(skb)->flags to suppress local fragmentation and check for this in ip_finish_output. Many thanks to Florian Weimer for the idea and feedback while implementing this patch. Cc: David S. Miller <davem@davemloft.net> Suggested-by: Florian Weimer <fweimer@redhat.com> Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-11-05 09:24:17 +08:00
static inline bool ip_sk_accept_pmtu(const struct sock *sk)
{
ipv4: introduce new IP_MTU_DISCOVER mode IP_PMTUDISC_INTERFACE Sockets marked with IP_PMTUDISC_INTERFACE won't do path mtu discovery, their sockets won't accept and install new path mtu information and they will always use the interface mtu for outgoing packets. It is guaranteed that the packet is not fragmented locally. But we won't set the DF-Flag on the outgoing frames. Florian Weimer had the idea to use this flag to ensure DNS servers are never generating outgoing fragments. They may well be fragmented on the path, but the server never stores or usees path mtu values, which could well be forged in an attack. (The root of the problem with path MTU discovery is that there is no reliable way to authenticate ICMP Fragmentation Needed But DF Set messages because they are sent from intermediate routers with their source addresses, and the IMCP payload will not always contain sufficient information to identify a flow.) Recent research in the DNS community showed that it is possible to implement an attack where DNS cache poisoning is feasible by spoofing fragments. This work was done by Amir Herzberg and Haya Shulman: <https://sites.google.com/site/hayashulman/files/fragmentation-poisoning.pdf> This issue was previously discussed among the DNS community, e.g. <http://www.ietf.org/mail-archive/web/dnsext/current/msg01204.html>, without leading to fixes. This patch depends on the patch "ipv4: fix DO and PROBE pmtu mode regarding local fragmentation with UFO/CORK" for the enforcement of the non-fragmentable checks. If other users than ip_append_page/data should use this semantic too, we have to add a new flag to IPCB(skb)->flags to suppress local fragmentation and check for this in ip_finish_output. Many thanks to Florian Weimer for the idea and feedback while implementing this patch. Cc: David S. Miller <davem@davemloft.net> Suggested-by: Florian Weimer <fweimer@redhat.com> Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-11-05 09:24:17 +08:00
return inet_sk(sk)->pmtudisc != IP_PMTUDISC_INTERFACE;
}
ipv4: introduce new IP_MTU_DISCOVER mode IP_PMTUDISC_INTERFACE Sockets marked with IP_PMTUDISC_INTERFACE won't do path mtu discovery, their sockets won't accept and install new path mtu information and they will always use the interface mtu for outgoing packets. It is guaranteed that the packet is not fragmented locally. But we won't set the DF-Flag on the outgoing frames. Florian Weimer had the idea to use this flag to ensure DNS servers are never generating outgoing fragments. They may well be fragmented on the path, but the server never stores or usees path mtu values, which could well be forged in an attack. (The root of the problem with path MTU discovery is that there is no reliable way to authenticate ICMP Fragmentation Needed But DF Set messages because they are sent from intermediate routers with their source addresses, and the IMCP payload will not always contain sufficient information to identify a flow.) Recent research in the DNS community showed that it is possible to implement an attack where DNS cache poisoning is feasible by spoofing fragments. This work was done by Amir Herzberg and Haya Shulman: <https://sites.google.com/site/hayashulman/files/fragmentation-poisoning.pdf> This issue was previously discussed among the DNS community, e.g. <http://www.ietf.org/mail-archive/web/dnsext/current/msg01204.html>, without leading to fixes. This patch depends on the patch "ipv4: fix DO and PROBE pmtu mode regarding local fragmentation with UFO/CORK" for the enforcement of the non-fragmentable checks. If other users than ip_append_page/data should use this semantic too, we have to add a new flag to IPCB(skb)->flags to suppress local fragmentation and check for this in ip_finish_output. Many thanks to Florian Weimer for the idea and feedback while implementing this patch. Cc: David S. Miller <davem@davemloft.net> Suggested-by: Florian Weimer <fweimer@redhat.com> Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-11-05 09:24:17 +08:00
static inline bool ip_sk_use_pmtu(const struct sock *sk)
{
return inet_sk(sk)->pmtudisc < IP_PMTUDISC_PROBE;
}
static inline int ip_skb_dst_mtu(const struct sk_buff *skb)
{
return (!skb->sk || ip_sk_use_pmtu(skb->sk)) ?
dst_mtu(skb_dst(skb)) : skb_dst(skb)->dev->mtu;
}
#endif /* _ROUTE_H */