diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index ea60b0854109..db71c4ad8624 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -619,8 +619,7 @@ extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, extern void rtnl_set_sk_err(struct net *net, u32 group, int error); extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics); extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, - u32 id, u32 ts, u32 tsage, long expires, - u32 error); + u32 id, long expires, u32 error); extern void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change); diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 7d3bcedc062a..2de9cf46f9fc 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -506,7 +506,6 @@ struct tcp_timewait_sock { u32 tw_rcv_wnd; u32 tw_ts_recent; long tw_ts_recent_stamp; - struct inet_peer *tw_peer; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *tw_md5_key; #endif diff --git a/include/net/dst.h b/include/net/dst.h index b2634e446613..51610468c63d 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -209,12 +209,6 @@ static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metr return msecs_to_jiffies(dst_metric(dst, metric)); } -static inline void set_dst_metric_rtt(struct dst_entry *dst, int metric, - unsigned long rtt) -{ - dst_metric_set(dst, metric, jiffies_to_msecs(rtt)); -} - static inline u32 dst_allfrag(const struct dst_entry *dst) { diff --git a/include/net/flow.h b/include/net/flow.h index bd524f598561..ce9cb7656b47 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -20,9 +20,8 @@ struct flowi_common { __u8 flowic_proto; __u8 flowic_flags; #define FLOWI_FLAG_ANYSRC 0x01 -#define FLOWI_FLAG_PRECOW_METRICS 0x02 -#define FLOWI_FLAG_CAN_SLEEP 0x04 -#define FLOWI_FLAG_RT_NOCACHE 0x08 +#define FLOWI_FLAG_CAN_SLEEP 0x02 +#define FLOWI_FLAG_RT_NOCACHE 0x04 __u32 flowic_secid; }; diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index af3c743a40e4..291e7cee14e7 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -43,7 +43,6 @@ struct inet_connection_sock_af_ops { struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst); - struct inet_peer *(*get_peer)(struct sock *sk); u16 net_header_len; u16 net_frag_header_len; u16 sockaddr_len; diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index ae17e1352d7e..924d7b98ab60 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -245,8 +245,6 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk) if (inet_sk(sk)->transparent || inet_sk(sk)->hdrincl) flags |= FLOWI_FLAG_ANYSRC; - if (sk->sk_protocol == IPPROTO_TCP) - flags |= FLOWI_FLAG_PRECOW_METRICS; return flags; } diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index c27c8f10ebdc..53f464d7cddc 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -36,25 +36,19 @@ struct inet_peer { u32 metrics[RTAX_MAX]; u32 rate_tokens; /* rate limiting for ICMP */ unsigned long rate_last; - unsigned long pmtu_expires; - u32 pmtu_orig; - u32 pmtu_learned; - struct inetpeer_addr_base redirect_learned; union { struct list_head gc_list; struct rcu_head gc_rcu; }; /* * Once inet_peer is queued for deletion (refcnt == -1), following fields - * are not available: rid, ip_id_count, tcp_ts, tcp_ts_stamp + * are not available: rid, ip_id_count * We can share memory with rcu_head to 
help keep inet_peer small. */ union { struct { atomic_t rid; /* Frag reception counter */ atomic_t ip_id_count; /* IP ID for the next packet */ - __u32 tcp_ts; - __u32 tcp_ts_stamp; }; struct rcu_head rcu; struct inet_peer *gc_next; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 599e48fa97cb..2e089a99d603 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -7,6 +7,7 @@ #include +struct tcpm_hash_bucket; struct ctl_table_header; struct ipv4_devconf; struct fib_rules_ops; @@ -39,6 +40,8 @@ struct netns_ipv4 { struct sock **icmp_sk; struct sock *tcp_sock; struct inet_peer_base *peers; + struct tcpm_hash_bucket *tcp_metrics_hash; + unsigned int tcp_metrics_hash_mask; struct netns_frags frags; #ifdef CONFIG_NETFILTER struct xt_table *iptable_filter; diff --git a/include/net/route.h b/include/net/route.h index 211e2665139b..52362368af09 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -40,7 +40,6 @@ #define RT_CONN_FLAGS(sk) (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE)) struct fib_nh; -struct inet_peer; struct fib_info; struct rtable { struct dst_entry dst; @@ -65,45 +64,10 @@ struct rtable { __be32 rt_gateway; /* Miscellaneous cached information */ - u32 rt_peer_genid; - unsigned long _peer; /* long-living peer info */ + u32 rt_pmtu; struct fib_info *fi; /* for client ref to shared metrics */ }; -static inline struct inet_peer *rt_peer_ptr(struct rtable *rt) -{ - return inetpeer_ptr(rt->_peer); -} - -static inline bool rt_has_peer(struct rtable *rt) -{ - return inetpeer_ptr_is_peer(rt->_peer); -} - -static inline void __rt_set_peer(struct rtable *rt, struct inet_peer *peer) -{ - __inetpeer_ptr_set_peer(&rt->_peer, peer); -} - -static inline bool rt_set_peer(struct rtable *rt, struct inet_peer *peer) -{ - return inetpeer_ptr_set_peer(&rt->_peer, peer); -} - -static inline void rt_init_peer(struct rtable *rt, struct inet_peer_base *base) -{ - inetpeer_init_ptr(&rt->_peer, base); -} - -static inline void rt_transfer_peer(struct rtable *rt, struct rtable *ort) -{ - rt->_peer = ort->_peer; - if (rt_has_peer(ort)) { - struct inet_peer *peer = rt_peer_ptr(ort); - atomic_inc(&peer->refcnt); - } -} - static inline bool rt_is_input_route(const struct rtable *rt) { return rt->rt_route_iif != 0; @@ -278,8 +242,6 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 if (inet_sk(sk)->transparent) flow_flags |= FLOWI_FLAG_ANYSRC; - if (protocol == IPPROTO_TCP) - flow_flags |= FLOWI_FLAG_PRECOW_METRICS; if (can_sleep) flow_flags |= FLOWI_FLAG_CAN_SLEEP; @@ -328,27 +290,6 @@ static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable return rt; } -extern void rt_bind_peer(struct rtable *rt, __be32 daddr, int create); - -static inline struct inet_peer *__rt_get_peer(struct rtable *rt, __be32 daddr, int create) -{ - if (rt_has_peer(rt)) - return rt_peer_ptr(rt); - - rt_bind_peer(rt, daddr, create); - return (rt_has_peer(rt) ? 
rt_peer_ptr(rt) : NULL); -} - -static inline struct inet_peer *rt_get_peer(struct rtable *rt, __be32 daddr) -{ - return __rt_get_peer(rt, daddr, 0); -} - -static inline struct inet_peer *rt_get_peer_create(struct rtable *rt, __be32 daddr) -{ - return __rt_get_peer(rt, daddr, 1); -} - static inline int inet_iif(const struct sk_buff *skb) { return skb_rtable(skb)->rt_iif; diff --git a/include/net/tcp.h b/include/net/tcp.h index 53fb7d814170..3618fefae049 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -388,6 +388,13 @@ extern void tcp_enter_frto(struct sock *sk); extern void tcp_enter_loss(struct sock *sk, int how); extern void tcp_clear_retrans(struct tcp_sock *tp); extern void tcp_update_metrics(struct sock *sk); +extern void tcp_init_metrics(struct sock *sk); +extern void tcp_metrics_init(void); +extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check); +extern bool tcp_remember_stamp(struct sock *sk); +extern bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw); +extern void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst); +extern void tcp_disable_fack(struct tcp_sock *tp); extern void tcp_close(struct sock *sk, long timeout); extern void tcp_init_sock(struct sock *sk); extern unsigned int tcp_poll(struct file * file, struct socket *sock, @@ -556,6 +563,8 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp) return (tp->srtt >> 3) + tp->rttvar; } +extern void tcp_set_rto(struct sock *sk); + static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) { tp->pred_flags = htonl((tp->tcp_header_len << 26) | diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2b325c340b44..64127eee786d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -615,7 +615,7 @@ nla_put_failure: EXPORT_SYMBOL(rtnetlink_put_metrics); int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, - u32 ts, u32 tsage, long expires, u32 error) + long expires, u32 error) { struct rta_cacheinfo ci = { .rta_lastuse = jiffies_to_clock_t(jiffies - dst->lastuse), @@ -623,8 +623,6 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, .rta_clntref = atomic_read(&(dst->__refcnt)), .rta_error = error, .rta_id = id, - .rta_ts = ts, - .rta_tsage = tsage, }; if (expires) diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 6e74b3f110bc..b5594cc73ee1 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1590,7 +1590,7 @@ static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, goto errout; expires = rt->dst.expires ? rt->dst.expires - jiffies : 0; - if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0, expires, + if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) goto errout; @@ -1812,12 +1812,11 @@ static int dn_rt_cache_seq_show(struct seq_file *seq, void *v) char buf1[DN_ASCBUF_LEN], buf2[DN_ASCBUF_LEN]; seq_printf(seq, "%-8s %-7s %-7s %04d %04d %04d\n", - rt->dst.dev ? rt->dst.dev->name : "*", - dn_addr2asc(le16_to_cpu(rt->rt_daddr), buf1), - dn_addr2asc(le16_to_cpu(rt->rt_saddr), buf2), - atomic_read(&rt->dst.__refcnt), - rt->dst.__use, - (int) dst_metric(&rt->dst, RTAX_RTT)); + rt->dst.dev ? 
rt->dst.dev->name : "*", + dn_addr2asc(le16_to_cpu(rt->rt_daddr), buf1), + dn_addr2asc(le16_to_cpu(rt->rt_saddr), buf2), + atomic_read(&rt->dst.__refcnt), + rt->dst.__use, 0); return 0; } diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index ff75d3bbcd6a..5a23e8b37106 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -7,7 +7,7 @@ obj-y := route.o inetpeer.o protocol.o \ ip_output.o ip_sockglue.o inet_hashtables.o \ inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ - tcp_minisocks.o tcp_cong.o \ + tcp_minisocks.o tcp_cong.o tcp_metrics.o \ datagram.o raw.o udp.o udplite.o \ arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o \ diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index ae301c897a19..d71bfbdc0bf4 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -794,6 +794,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) val = nla_get_u32(nla); if (type == RTAX_ADVMSS && val > 65535 - 40) val = 65535 - 40; + if (type == RTAX_MTU && val > 65535 - 15) + val = 65535 - 15; fi->fib_metrics[type - 1] = val; } } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 4bce5a2830aa..4a049449305f 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -254,9 +254,10 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, /* Limit if icmp type is enabled in ratemask. */ if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { - struct inet_peer *peer = rt_get_peer_create(rt, fl4->daddr); + struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit); + inet_putpeer(peer); } out: return rc; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 034ddbe42adf..76825be3b643 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -375,7 +375,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, const struct inet_request_sock *ireq = inet_rsk(req); struct ip_options_rcu *opt = inet_rsk(req)->opt; struct net *net = sock_net(sk); - int flags = inet_sk_flowi_flags(sk) & ~FLOWI_FLAG_PRECOW_METRICS; + int flags = inet_sk_flowi_flags(sk); if (nocache) flags |= FLOWI_FLAG_RT_NOCACHE; diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index da90a8cab614..e1e0a4e8fd34 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -508,13 +508,9 @@ relookup: (daddr->family == AF_INET) ? secure_ip_id(daddr->addr.a4) : secure_ipv6_id(daddr->addr.a6)); - p->tcp_ts_stamp = 0; p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; p->rate_last = 0; - p->pmtu_expires = 0; - p->pmtu_orig = 0; - memset(&p->redirect_learned, 0, sizeof(p->redirect_learned)); INIT_LIST_HEAD(&p->gc_list); /* Link the node. 
*/ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 72e88c208025..95bfa1ba5b28 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -158,34 +158,8 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) { - struct rtable *rt = (struct rtable *) dst; - struct inet_peer *peer; - u32 *p = NULL; - - peer = rt_get_peer_create(rt, rt->rt_dst); - if (peer) { - u32 *old_p = __DST_METRICS_PTR(old); - unsigned long prev, new; - - p = peer->metrics; - if (inet_metrics_new(peer)) - memcpy(p, old_p, sizeof(u32) * RTAX_MAX); - - new = (unsigned long) p; - prev = cmpxchg(&dst->_metrics, old, new); - - if (prev != old) { - p = __DST_METRICS_PTR(prev); - if (prev & DST_METRICS_READ_ONLY) - p = NULL; - } else { - if (rt->fi) { - fib_info_put(rt->fi); - rt->fi = NULL; - } - } - } - return p; + WARN_ON(1); + return NULL; } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, @@ -423,18 +397,16 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) int len; seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" - "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", - r->dst.dev ? r->dst.dev->name : "*", - (__force u32)r->rt_dst, - (__force u32)r->rt_gateway, - r->rt_flags, atomic_read(&r->dst.__refcnt), - r->dst.__use, 0, (__force u32)r->rt_src, - dst_metric_advmss(&r->dst) + 40, - dst_metric(&r->dst, RTAX_WINDOW), - (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + - dst_metric(&r->dst, RTAX_RTTVAR)), - r->rt_key_tos, - -1, 0, 0, &len); + "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", + r->dst.dev ? r->dst.dev->name : "*", + (__force u32)r->rt_dst, + (__force u32)r->rt_gateway, + r->rt_flags, atomic_read(&r->dst.__refcnt), + r->dst.__use, 0, (__force u32)r->rt_src, + dst_metric_advmss(&r->dst) + 40, + dst_metric(&r->dst, RTAX_WINDOW), 0, + r->rt_key_tos, + -1, 0, 0, &len); seq_printf(seq, "%*s\n", 127 - len, ""); } @@ -671,7 +643,7 @@ static inline int rt_fast_clean(struct rtable *rth) static inline int rt_valuable(struct rtable *rth) { return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || - (rt_has_peer(rth) && rt_peer_ptr(rth)->pmtu_expires); + rth->dst.expires; } static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) @@ -917,7 +889,6 @@ static void rt_cache_invalidate(struct net *net) get_random_bytes(&shuffle, sizeof(shuffle)); atomic_add(shuffle + 1U, &net->ipv4.rt_genid); - inetpeer_invalidate_family(AF_INET); } /* @@ -1244,31 +1215,6 @@ skip_hashing: return rt; } -static atomic_t __rt_peer_genid = ATOMIC_INIT(0); - -static u32 rt_peer_genid(void) -{ - return atomic_read(&__rt_peer_genid); -} - -void rt_bind_peer(struct rtable *rt, __be32 daddr, int create) -{ - struct inet_peer_base *base; - struct inet_peer *peer; - - base = inetpeer_base_ptr(rt->_peer); - if (!base) - return; - - peer = inet_getpeer_v4(base, daddr, create); - if (peer) { - if (!rt_set_peer(rt, peer)) - inet_putpeer(peer); - else - rt->rt_peer_genid = rt_peer_genid(); - } -} - /* * Peer allocation may fail only in serious out-of-memory conditions. However * we still can generate some output. 
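
The icmp.c hunk earlier and the route.c hunks above replace the peer pointer cached in struct rtable with a fresh lookup against the per-namespace base at each use, so every inet_getpeer_v4() is now balanced by an inet_putpeer(). The following is a minimal user-space sketch of that lookup/limit/release discipline, assuming simplified stand-ins throughout: the peer struct, get_peer(), put_peer(), and xrlim_allow() here are illustrative models, not the kernel's inetpeer API.

/* Sketch: balanced get/put around a rate-limit check, modeled on the
 * new icmpv4_xrlim_allow() flow.  All names are illustrative. */
#include <stdio.h>
#include <stdbool.h>
#include <time.h>

struct peer {
	int	refcnt;
	double	tokens;		/* token bucket for ICMP rate limiting */
	time_t	last;		/* last refill time */
};

static struct peer the_peer = { .refcnt = 1, .tokens = 5.0 };

static struct peer *get_peer(unsigned int daddr)
{
	(void) daddr;		/* the real code hashes daddr into a tree */
	the_peer.refcnt++;	/* lookup returns a referenced entry */
	return &the_peer;
}

static void put_peer(struct peer *p)
{
	p->refcnt--;		/* caller must release what it looked up */
}

static bool xrlim_allow(struct peer *p, double rate)
{
	time_t now = time(NULL);

	p->tokens += (double) (now - p->last) * rate;
	if (p->tokens > 5.0)
		p->tokens = 5.0;	/* burst cap */
	p->last = now;
	if (p->tokens < 1.0)
		return false;
	p->tokens -= 1.0;
	return true;
}

int main(void)
{
	struct peer *p = get_peer(0x7f000001);
	bool ok = xrlim_allow(p, 1.0);

	printf("send ICMP: %s (refcnt now %d)\n", ok ? "yes" : "no", p->refcnt);
	put_peer(p);		/* balanced release, as in the new icmp.c */
	return 0;
}

The point of the conversion is that nothing holds a peer reference across packets anymore; each user pays for exactly one lookup and one release.
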
@@ -1291,20 +1237,15 @@ static void ip_select_fb_ident(struct iphdr *iph) void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) { - struct rtable *rt = (struct rtable *) dst; + struct net *net = dev_net(dst->dev); + struct inet_peer *peer; - if (rt && !(rt->dst.flags & DST_NOPEER)) { - struct inet_peer *peer = rt_get_peer_create(rt, rt->rt_dst); - - /* If peer is attached to destination, it is never detached, - so that we need not to grab a lock to dereference it. - */ - if (peer) { - iph->id = htons(inet_getid(peer, more)); - return; - } - } else if (!rt) - pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0)); + peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1); + if (peer) { + iph->id = htons(inet_getid(peer, more)); + inet_putpeer(peer); + return; + } ip_select_fb_ident(iph); } @@ -1330,30 +1271,6 @@ static void rt_del(unsigned int hash, struct rtable *rt) spin_unlock_bh(rt_hash_lock_addr(hash)); } -static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) -{ - struct rtable *rt = (struct rtable *) dst; - __be32 orig_gw = rt->rt_gateway; - struct neighbour *n; - - dst_confirm(&rt->dst); - - rt->rt_gateway = peer->redirect_learned.a4; - - n = ipv4_neigh_lookup(&rt->dst, NULL, &rt->rt_gateway); - if (!n) { - rt->rt_gateway = orig_gw; - return; - } - if (!(n->nud_state & NUD_VALID)) { - neigh_event_send(n, NULL); - } else { - rt->rt_flags |= RTCF_REDIRECTED; - call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); - } - neigh_release(n); -} - /* called in rcu_read_lock() section */ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, __be32 saddr, struct net_device *dev) @@ -1362,7 +1279,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, struct in_device *in_dev = __in_dev_get_rcu(dev); __be32 skeys[2] = { saddr, 0 }; int ikeys[2] = { dev->ifindex, 0 }; - struct inet_peer *peer; struct net *net; if (!in_dev) @@ -1395,6 +1311,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, rthp = &rt_hash_table[hash].chain; while ((rt = rcu_dereference(*rthp)) != NULL) { + struct neighbour *n; + rthp = &rt->dst.rt_next; if (rt->rt_key_dst != daddr || @@ -1408,13 +1326,16 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, rt->rt_gateway != old_gw) continue; - peer = rt_get_peer_create(rt, rt->rt_dst); - if (peer) { - if (peer->redirect_learned.a4 != new_gw) { - peer->redirect_learned.a4 = new_gw; - atomic_inc(&__rt_peer_genid); + n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw); + if (n) { + if (!(n->nud_state & NUD_VALID)) { + neigh_event_send(n, NULL); + } else { + rt->rt_gateway = new_gw; + rt->rt_flags |= RTCF_REDIRECTED; + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); } - check_peer_redir(&rt->dst, peer); + neigh_release(n); } } } @@ -1432,23 +1353,6 @@ reject_redirect: ; } -static bool peer_pmtu_expired(struct inet_peer *peer) -{ - unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); - - return orig && - time_after_eq(jiffies, orig) && - cmpxchg(&peer->pmtu_expires, orig, 0) == orig; -} - -static bool peer_pmtu_cleaned(struct inet_peer *peer) -{ - unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); - - return orig && - cmpxchg(&peer->pmtu_expires, orig, 0) == orig; -} - static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) { struct rtable *rt = (struct rtable *)dst; @@ -1458,16 +1362,13 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) if (dst->obsolete > 0) { ip_rt_put(rt); ret = NULL; - } else if (rt->rt_flags & RTCF_REDIRECTED) { + } else if 
((rt->rt_flags & RTCF_REDIRECTED) || + rt->dst.expires) { unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, rt->rt_oif, rt_genid(dev_net(dst->dev))); rt_del(hash, rt); ret = NULL; - } else if (rt_has_peer(rt)) { - struct inet_peer *peer = rt_peer_ptr(rt); - if (peer_pmtu_expired(peer)) - dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig); } } return ret; @@ -1494,6 +1395,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) struct rtable *rt = skb_rtable(skb); struct in_device *in_dev; struct inet_peer *peer; + struct net *net; int log_martians; rcu_read_lock(); @@ -1505,7 +1407,8 @@ void ip_rt_send_redirect(struct sk_buff *skb) log_martians = IN_DEV_LOG_MARTIANS(in_dev); rcu_read_unlock(); - peer = rt_get_peer_create(rt, rt->rt_dst); + net = dev_net(rt->dst.dev); + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); if (!peer) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); return; @@ -1522,7 +1425,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) */ if (peer->rate_tokens >= ip_rt_redirect_number) { peer->rate_last = jiffies; - return; + goto out_put_peer; } /* Check for load limit; set rate_last to the latest sent @@ -1543,6 +1446,8 @@ void ip_rt_send_redirect(struct sk_buff *skb) &rt->rt_dst, &rt->rt_gateway); #endif } +out_put_peer: + inet_putpeer(peer); } static int ip_error(struct sk_buff *skb) @@ -1585,7 +1490,7 @@ static int ip_error(struct sk_buff *skb) break; } - peer = rt_get_peer_create(rt, rt->rt_dst); + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); send = true; if (peer) { @@ -1598,6 +1503,7 @@ static int ip_error(struct sk_buff *skb) peer->rate_tokens -= ip_rt_error_cost; else send = false; + inet_putpeer(peer); } if (send) icmp_send(skb, ICMP_DEST_UNREACH, code, 0); @@ -1606,50 +1512,17 @@ out: kfree_skb(skb); return 0; } -static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer) -{ - unsigned long expires = ACCESS_ONCE(peer->pmtu_expires); - - if (!expires) - return; - if (time_before(jiffies, expires)) { - u32 orig_dst_mtu = dst_mtu(dst); - if (peer->pmtu_learned < orig_dst_mtu) { - if (!peer->pmtu_orig) - peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU); - dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned); - } - } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires) - dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig); -} - static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) { struct rtable *rt = (struct rtable *) dst; - struct inet_peer *peer; dst_confirm(dst); - peer = rt_get_peer_create(rt, rt->rt_dst); - if (peer) { - unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires); + if (mtu < ip_rt_min_pmtu) + mtu = ip_rt_min_pmtu; - if (mtu < ip_rt_min_pmtu) - mtu = ip_rt_min_pmtu; - if (!pmtu_expires || mtu < peer->pmtu_learned) { - - pmtu_expires = jiffies + ip_rt_mtu_expires; - if (!pmtu_expires) - pmtu_expires = 1UL; - - peer->pmtu_learned = mtu; - peer->pmtu_expires = pmtu_expires; - - atomic_inc(&__rt_peer_genid); - rt->rt_peer_genid = rt_peer_genid(); - } - check_peer_pmtu(dst, peer); - } + rt->rt_pmtu = mtu; + dst_set_expires(&rt->dst, ip_rt_mtu_expires); } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, @@ -1660,7 +1533,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, struct rtable *rt; flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE, - protocol, flow_flags | FLOWI_FLAG_PRECOW_METRICS, + protocol, flow_flags, iph->daddr, iph->saddr, 0, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { @@ -1681,30 
+1554,12 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) } EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); -static void ipv4_validate_peer(struct rtable *rt) -{ - if (rt->rt_peer_genid != rt_peer_genid()) { - struct inet_peer *peer = rt_get_peer(rt, rt->rt_dst); - - if (peer) { - check_peer_pmtu(&rt->dst, peer); - - if (peer->redirect_learned.a4 && - peer->redirect_learned.a4 != rt->rt_gateway) - check_peer_redir(&rt->dst, peer); - } - - rt->rt_peer_genid = rt_peer_genid(); - } -} - static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { struct rtable *rt = (struct rtable *) dst; if (rt_is_expired(rt)) return NULL; - ipv4_validate_peer(rt); return dst; } @@ -1716,10 +1571,6 @@ static void ipv4_dst_destroy(struct dst_entry *dst) fib_info_put(rt->fi); rt->fi = NULL; } - if (rt_has_peer(rt)) { - struct inet_peer *peer = rt_peer_ptr(rt); - inet_putpeer(peer); - } } @@ -1730,11 +1581,8 @@ static void ipv4_link_failure(struct sk_buff *skb) icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); rt = skb_rtable(skb); - if (rt && rt_has_peer(rt)) { - struct inet_peer *peer = rt_peer_ptr(rt); - if (peer_pmtu_cleaned(peer)) - dst_metric_set(&rt->dst, RTAX_MTU, peer->pmtu_orig); - } + if (rt) + dst_set_expires(&rt->dst, 0); } static int ip_rt_bug(struct sk_buff *skb) @@ -1814,7 +1662,13 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst) static unsigned int ipv4_mtu(const struct dst_entry *dst) { const struct rtable *rt = (const struct rtable *) dst; - unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); + unsigned int mtu = rt->rt_pmtu; + + if (mtu && time_after_eq(jiffies, rt->dst.expires)) + mtu = 0; + + if (!mtu) + mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu && rt_is_output_route(rt)) return mtu; @@ -1836,63 +1690,27 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, struct fib_info *fi) { - struct inet_peer_base *base; - struct inet_peer *peer; - int create = 0; - - /* If a peer entry exists for this destination, we must hook - * it up in order to get at cached metrics. 
- */ - if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS)) - create = 1; - - base = inetpeer_base_ptr(rt->_peer); - BUG_ON(!base); - - peer = inet_getpeer_v4(base, rt->rt_dst, create); - if (peer) { - __rt_set_peer(rt, peer); - rt->rt_peer_genid = rt_peer_genid(); - if (inet_metrics_new(peer)) - memcpy(peer->metrics, fi->fib_metrics, - sizeof(u32) * RTAX_MAX); - dst_init_metrics(&rt->dst, peer->metrics, false); - - check_peer_pmtu(&rt->dst, peer); - - if (peer->redirect_learned.a4 && - peer->redirect_learned.a4 != rt->rt_gateway) { - rt->rt_gateway = peer->redirect_learned.a4; - rt->rt_flags |= RTCF_REDIRECTED; - } - } else { - if (fi->fib_metrics != (u32 *) dst_default_metrics) { - rt->fi = fi; - atomic_inc(&fi->fib_clntref); - } - dst_init_metrics(&rt->dst, fi->fib_metrics, true); + if (fi->fib_metrics != (u32 *) dst_default_metrics) { + rt->fi = fi; + atomic_inc(&fi->fib_clntref); } + dst_init_metrics(&rt->dst, fi->fib_metrics, true); } static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, const struct fib_result *res, struct fib_info *fi, u16 type, u32 itag) { - struct dst_entry *dst = &rt->dst; - if (fi) { if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) rt->rt_gateway = FIB_RES_GW(*res); rt_init_metrics(rt, fl4, fi); #ifdef CONFIG_IP_ROUTE_CLASSID - dst->tclassid = FIB_RES_NH(*res).nh_tclassid; + rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid; #endif } - if (dst_mtu(dst) > IP_MAX_MTU) - dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU); - #ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES set_class_tag(rt, fib_rules_tclass(res)); @@ -1964,9 +1782,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_iif = dev->ifindex; rth->rt_oif = 0; rth->rt_mark = skb->mark; + rth->rt_pmtu = 0; rth->rt_gateway = daddr; - rth->rt_peer_genid = 0; - rt_init_peer(rth, dev_net(dev)->ipv4.peers); rth->fi = NULL; if (our) { rth->dst.input= ip_local_deliver; @@ -2090,9 +1907,8 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_iif = in_dev->dev->ifindex; rth->rt_oif = 0; rth->rt_mark = skb->mark; + rth->rt_pmtu = 0; rth->rt_gateway = daddr; - rth->rt_peer_genid = 0; - rt_init_peer(rth, &res->table->tb_peers); rth->fi = NULL; rth->dst.input = ip_forward; @@ -2269,9 +2085,8 @@ local_input: rth->rt_iif = dev->ifindex; rth->rt_oif = 0; rth->rt_mark = skb->mark; + rth->rt_pmtu = 0; rth->rt_gateway = daddr; - rth->rt_peer_genid = 0; - rt_init_peer(rth, net->ipv4.peers); rth->fi = NULL; if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; @@ -2346,7 +2161,6 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_mark == skb->mark && net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { - ipv4_validate_peer(rth); if (noref) { dst_use_noref(&rth->dst, jiffies); skb_dst_set_noref(skb, &rth->dst); @@ -2468,11 +2282,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rth->rt_iif = orig_oif ? : dev_out->ifindex; rth->rt_oif = orig_oif; rth->rt_mark = fl4->flowi4_mark; + rth->rt_pmtu = 0; rth->rt_gateway = fl4->daddr; - rth->rt_peer_genid = 0; - rt_init_peer(rth, (res->table ? 
- &res->table->tb_peers : - dev_net(dev_out)->ipv4.peers)); rth->fi = NULL; RT_CACHE_STAT_INC(out_slow_tot); @@ -2726,7 +2537,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4) (IPTOS_RT_MASK | RTO_ONLINK)) && net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { - ipv4_validate_peer(rth); dst_use(&rth->dst, jiffies); RT_CACHE_STAT_INC(out_hit); rcu_read_unlock_bh(); @@ -2790,7 +2600,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or new->__use = 1; new->input = dst_discard; new->output = dst_discard; - dst_copy_metrics(new, &ort->dst); new->dev = ort->dst.dev; if (new->dev) @@ -2803,6 +2612,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_iif = ort->rt_iif; rt->rt_oif = ort->rt_oif; rt->rt_mark = ort->rt_mark; + rt->rt_pmtu = ort->rt_pmtu; rt->rt_genid = rt_genid(net); rt->rt_flags = ort->rt_flags; @@ -2810,7 +2620,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_dst = ort->rt_dst; rt->rt_src = ort->rt_src; rt->rt_gateway = ort->rt_gateway; - rt_transfer_peer(rt, ort); rt->fi = ort->fi; if (rt->fi) atomic_inc(&rt->fi->fib_clntref); @@ -2848,7 +2657,7 @@ static int rt_fill_info(struct net *net, struct rtmsg *r; struct nlmsghdr *nlh; unsigned long expires = 0; - u32 id = 0, ts = 0, tsage = 0, error; + u32 error; nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); if (nlh == NULL) @@ -2901,21 +2710,12 @@ static int rt_fill_info(struct net *net, goto nla_put_failure; error = rt->dst.error; - if (rt_has_peer(rt)) { - const struct inet_peer *peer = rt_peer_ptr(rt); - inet_peer_refcheck(peer); - id = atomic_read(&peer->ip_id_count) & 0xffff; - if (peer->tcp_ts_stamp) { - ts = peer->tcp_ts; - tsage = get_seconds() - peer->tcp_ts_stamp; - } - expires = ACCESS_ONCE(peer->pmtu_expires); - if (expires) { - if (time_before(jiffies, expires)) - expires -= jiffies; - else - expires = 0; - } + expires = rt->dst.expires; + if (expires) { + if (time_before(jiffies, expires)) + expires -= jiffies; + else + expires = 0; } if (rt_is_input_route(rt)) { @@ -2944,8 +2744,7 @@ static int rt_fill_info(struct net *net, goto nla_put_failure; } - if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, - expires, error) < 0) + if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) goto nla_put_failure; return nlmsg_end(skb, nlh); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3ba605f60e4e..29aa0c800cd0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3563,6 +3563,8 @@ void __init tcp_init(void) pr_info("Hash tables configured (established %u bind %u)\n", tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); + tcp_metrics_init(); + tcp_register_congestion_control(&tcp_reno); memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets)); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ca0d0e7c9778..055ac49b8b40 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -93,7 +93,6 @@ int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_frto_response __read_mostly; -int sysctl_tcp_nometrics_save __read_mostly; int sysctl_tcp_thin_dupack __read_mostly; @@ -701,7 +700,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. 
*/ -static inline void tcp_set_rto(struct sock *sk) +void tcp_set_rto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); /* Old crap is replaced with new one. 8) @@ -728,109 +727,6 @@ static inline void tcp_set_rto(struct sock *sk) tcp_bound_rto(sk); } -/* Save metrics learned by this TCP session. - This function is called only, when TCP finishes successfully - i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE. - */ -void tcp_update_metrics(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); - - if (sysctl_tcp_nometrics_save) - return; - - if (dst && (dst->flags & DST_HOST)) { - const struct inet_connection_sock *icsk = inet_csk(sk); - int m; - unsigned long rtt; - - dst_confirm(dst); - - if (icsk->icsk_backoff || !tp->srtt) { - /* This session failed to estimate rtt. Why? - * Probably, no packets returned in time. - * Reset our results. - */ - if (!(dst_metric_locked(dst, RTAX_RTT))) - dst_metric_set(dst, RTAX_RTT, 0); - return; - } - - rtt = dst_metric_rtt(dst, RTAX_RTT); - m = rtt - tp->srtt; - - /* If newly calculated rtt larger than stored one, - * store new one. Otherwise, use EWMA. Remember, - * rtt overestimation is always better than underestimation. - */ - if (!(dst_metric_locked(dst, RTAX_RTT))) { - if (m <= 0) - set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt); - else - set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3)); - } - - if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { - unsigned long var; - if (m < 0) - m = -m; - - /* Scale deviation to rttvar fixed point */ - m >>= 1; - if (m < tp->mdev) - m = tp->mdev; - - var = dst_metric_rtt(dst, RTAX_RTTVAR); - if (m >= var) - var = m; - else - var -= (var - m) >> 2; - - set_dst_metric_rtt(dst, RTAX_RTTVAR, var); - } - - if (tcp_in_initial_slowstart(tp)) { - /* Slow start still did not finish. */ - if (dst_metric(dst, RTAX_SSTHRESH) && - !dst_metric_locked(dst, RTAX_SSTHRESH) && - (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) - dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1); - if (!dst_metric_locked(dst, RTAX_CWND) && - tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) - dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd); - } else if (tp->snd_cwnd > tp->snd_ssthresh && - icsk->icsk_ca_state == TCP_CA_Open) { - /* Cong. avoidance phase, cwnd is reliable. */ - if (!dst_metric_locked(dst, RTAX_SSTHRESH)) - dst_metric_set(dst, RTAX_SSTHRESH, - max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); - if (!dst_metric_locked(dst, RTAX_CWND)) - dst_metric_set(dst, RTAX_CWND, - (dst_metric(dst, RTAX_CWND) + - tp->snd_cwnd) >> 1); - } else { - /* Else slow start did not finish, cwnd is non-sense, - ssthresh may be also invalid. - */ - if (!dst_metric_locked(dst, RTAX_CWND)) - dst_metric_set(dst, RTAX_CWND, - (dst_metric(dst, RTAX_CWND) + - tp->snd_ssthresh) >> 1); - if (dst_metric(dst, RTAX_SSTHRESH) && - !dst_metric_locked(dst, RTAX_SSTHRESH) && - tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) - dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh); - } - - if (!dst_metric_locked(dst, RTAX_REORDERING)) { - if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && - tp->reordering != sysctl_tcp_reordering) - dst_metric_set(dst, RTAX_REORDERING, tp->reordering); - } - } -} - __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) { __u32 cwnd = (dst ? 
dst_metric(dst, RTAX_INITCWND) : 0); @@ -867,7 +763,7 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) * Packet counting of FACK is based on in-order assumptions, therefore TCP * disables it when reordering is detected */ -static void tcp_disable_fack(struct tcp_sock *tp) +void tcp_disable_fack(struct tcp_sock *tp) { /* RFC3517 uses different metric in lost marker => reset on change */ if (tcp_is_fack(tp)) @@ -881,86 +777,6 @@ static void tcp_dsack_seen(struct tcp_sock *tp) tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; } -/* Initialize metrics on socket. */ - -static void tcp_init_metrics(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); - - if (dst == NULL) - goto reset; - - dst_confirm(dst); - - if (dst_metric_locked(dst, RTAX_CWND)) - tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND); - if (dst_metric(dst, RTAX_SSTHRESH)) { - tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); - if (tp->snd_ssthresh > tp->snd_cwnd_clamp) - tp->snd_ssthresh = tp->snd_cwnd_clamp; - } else { - /* ssthresh may have been reduced unnecessarily during. - * 3WHS. Restore it back to its initial default. - */ - tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; - } - if (dst_metric(dst, RTAX_REORDERING) && - tp->reordering != dst_metric(dst, RTAX_REORDERING)) { - tcp_disable_fack(tp); - tcp_disable_early_retrans(tp); - tp->reordering = dst_metric(dst, RTAX_REORDERING); - } - - if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0) - goto reset; - - /* Initial rtt is determined from SYN,SYN-ACK. - * The segment is small and rtt may appear much - * less than real one. Use per-dst memory - * to make it more realistic. - * - * A bit of theory. RTT is time passed after "normal" sized packet - * is sent until it is ACKed. In normal circumstances sending small - * packets force peer to delay ACKs and calculation is correct too. - * The algorithm is adaptive and, provided we follow specs, it - * NEVER underestimate RTT. BUT! If peer tries to make some clever - * tricks sort of "quick acks" for time long enough to decrease RTT - * to low value, and then abruptly stops to do it and starts to delay - * ACKs, wait for troubles. - */ - if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) { - tp->srtt = dst_metric_rtt(dst, RTAX_RTT); - tp->rtt_seq = tp->snd_nxt; - } - if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) { - tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR); - tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); - } - tcp_set_rto(sk); -reset: - if (tp->srtt == 0) { - /* RFC6298: 5.7 We've failed to get a valid RTT sample from - * 3WHS. This is most likely due to retransmission, - * including spurious one. Reset the RTO back to 3secs - * from the more aggressive 1sec to avoid more spurious - * retransmission. - */ - tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; - inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; - } - /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been - * retransmitted. In light of RFC6298 more aggressive 1sec - * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK - * retransmission has occurred. 
- */ - if (tp->total_retrans > 1) - tp->snd_cwnd = 1; - else - tp->snd_cwnd = tcp_init_cwnd(tp, dst); - tp->snd_cwnd_stamp = tcp_time_stamp; -} - static void tcp_update_reordering(struct sock *sk, const int metric, const int ts) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 64568fa21d05..ddefd39ac0cf 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -209,22 +209,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) } if (tcp_death_row.sysctl_tw_recycle && - !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) { - struct inet_peer *peer = rt_get_peer(rt, fl4->daddr); - /* - * VJ's idea. We save last timestamp seen from - * the destination in peer table, when entering state - * TIME-WAIT * and initialize rx_opt.ts_recent from it, - * when trying new connection. - */ - if (peer) { - inet_peer_refcheck(peer); - if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { - tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; - tp->rx_opt.ts_recent = peer->tcp_ts; - } - } - } + !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) + tcp_fetch_timewait_stamp(sk, &rt->dst); inet->inet_dport = usin->sin_port; inet->inet_daddr = daddr; @@ -1375,7 +1361,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) isn = cookie_v4_init_sequence(sk, skb, &req->mss); req->cookie_ts = tmp_opt.tstamp_ok; } else if (!isn) { - struct inet_peer *peer = NULL; struct flowi4 fl4; /* VJ's idea. We save last timestamp seen @@ -1390,12 +1375,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle && (dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL && - fl4.daddr == saddr && - (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) { - inet_peer_refcheck(peer); - if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && - (s32)(peer->tcp_ts - req->ts_recent) > - TCP_PAWS_WINDOW) { + fl4.daddr == saddr) { + if (!tcp_peer_is_proven(req, dst, true)) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); goto drop_and_release; } @@ -1404,8 +1385,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) else if (!sysctl_tcp_syncookies && (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (sysctl_max_syn_backlog >> 2)) && - (!peer || !peer->tcp_ts_stamp) && - (!dst || !dst_metric(dst, RTAX_RTT))) { + !tcp_peer_is_proven(req, dst, false)) { /* Without syncookies last quarter of * backlog is filled with destinations, * proven to be alive. @@ -1867,21 +1847,6 @@ do_time_wait: goto discard_it; } -struct inet_peer *tcp_v4_get_peer(struct sock *sk) -{ - struct rtable *rt = (struct rtable *) __sk_dst_get(sk); - struct inet_sock *inet = inet_sk(sk); - - /* If we don't have a valid cached route, or we're doing IP - * options which make the IPv4 header destination address - * different from our peer's, do not bother with this. 
- */
-	if (!rt || inet->cork.fl.u.ip4.daddr != inet->inet_daddr)
-		return NULL;
-	return rt_get_peer_create(rt, inet->inet_daddr);
-}
-EXPORT_SYMBOL(tcp_v4_get_peer);
-
 static struct timewait_sock_ops tcp_timewait_sock_ops = {
 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
 	.twsk_unique	= tcp_twsk_unique,
@@ -1894,7 +1859,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
 	.rebuild_header	   = inet_sk_rebuild_header,
 	.conn_request	   = tcp_v4_conn_request,
 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
-	.get_peer	   = tcp_v4_get_peer,
 	.net_header_len	   = sizeof(struct iphdr),
 	.setsockopt	   = ip_setsockopt,
 	.getsockopt	   = ip_getsockopt,
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
new file mode 100644
index 000000000000..1fd83d3118fe
--- /dev/null
+++ b/net/ipv4/tcp_metrics.c
@@ -0,0 +1,697 @@
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/cache.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/tcp.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/net_namespace.h>
+#include <net/request_sock.h>
+#include <net/inetpeer.h>
+#include <net/sock.h>
+#include <net/ipv6.h>
+#include <net/dst.h>
+#include <net/tcp.h>
+
+int sysctl_tcp_nometrics_save __read_mostly;
+
+enum tcp_metric_index {
+	TCP_METRIC_RTT,
+	TCP_METRIC_RTTVAR,
+	TCP_METRIC_SSTHRESH,
+	TCP_METRIC_CWND,
+	TCP_METRIC_REORDERING,
+
+	/* Always last. */
+	TCP_METRIC_MAX,
+};
+
+struct tcp_metrics_block {
+	struct tcp_metrics_block __rcu	*tcpm_next;
+	struct inetpeer_addr		tcpm_addr;
+	unsigned long			tcpm_stamp;
+	u32				tcpm_ts;
+	u32				tcpm_ts_stamp;
+	u32				tcpm_lock;
+	u32				tcpm_vals[TCP_METRIC_MAX];
+};
+
+static bool tcp_metric_locked(struct tcp_metrics_block *tm,
+			      enum tcp_metric_index idx)
+{
+	return tm->tcpm_lock & (1 << idx);
+}
+
+static u32 tcp_metric_get(struct tcp_metrics_block *tm,
+			  enum tcp_metric_index idx)
+{
+	return tm->tcpm_vals[idx];
+}
+
+static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
+				  enum tcp_metric_index idx)
+{
+	return msecs_to_jiffies(tm->tcpm_vals[idx]);
+}
+
+static void tcp_metric_set(struct tcp_metrics_block *tm,
+			   enum tcp_metric_index idx,
+			   u32 val)
+{
+	tm->tcpm_vals[idx] = val;
+}
+
+static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
+				 enum tcp_metric_index idx,
+				 u32 val)
+{
+	tm->tcpm_vals[idx] = jiffies_to_msecs(val);
+}
+
+static bool addr_same(const struct inetpeer_addr *a,
+		      const struct inetpeer_addr *b)
+{
+	const struct in6_addr *a6, *b6;
+
+	if (a->family != b->family)
+		return false;
+	if (a->family == AF_INET)
+		return a->addr.a4 == b->addr.a4;
+
+	a6 = (const struct in6_addr *) &a->addr.a6[0];
+	b6 = (const struct in6_addr *) &b->addr.a6[0];
+
+	return ipv6_addr_equal(a6, b6);
+}
+
+struct tcpm_hash_bucket {
+	struct tcp_metrics_block __rcu	*chain;
+};
+
+static DEFINE_SPINLOCK(tcp_metrics_lock);
+
+static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
+{
+	u32 val;
+
+	val = 0;
+	if (dst_metric_locked(dst, RTAX_RTT))
+		val |= 1 << TCP_METRIC_RTT;
+	if (dst_metric_locked(dst, RTAX_RTTVAR))
+		val |= 1 << TCP_METRIC_RTTVAR;
+	if (dst_metric_locked(dst, RTAX_SSTHRESH))
+		val |= 1 << TCP_METRIC_SSTHRESH;
+	if (dst_metric_locked(dst, RTAX_CWND))
+		val |= 1 << TCP_METRIC_CWND;
+	if (dst_metric_locked(dst, RTAX_REORDERING))
+		val |= 1 << TCP_METRIC_REORDERING;
+	tm->tcpm_lock = val;
+
+	tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT);
+	tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR);
+	tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
+	tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
+	tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
+	tm->tcpm_ts = 0;
+	tm->tcpm_ts_stamp = 0;
+}
+
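
tcpm_suck_dst() above snapshots the route's metrics into tcpm_vals[] and condenses the RTAX_LOCK bits into the tcpm_lock bitmask, which tcp_metric_locked() then tests with a single AND. Below is a self-contained sketch of that bitmask pattern, assuming user-space scaffolding; the enum and field names mirror the patch, everything else is illustrative.

#include <stdio.h>
#include <stdbool.h>

enum tcp_metric_index {
	TCP_METRIC_RTT,
	TCP_METRIC_RTTVAR,
	TCP_METRIC_SSTHRESH,
	TCP_METRIC_CWND,
	TCP_METRIC_REORDERING,
	TCP_METRIC_MAX,			/* always last */
};

struct tcp_metrics_block {
	unsigned int tcpm_lock;		/* one bit per metric index */
	unsigned int tcpm_vals[TCP_METRIC_MAX];
};

static bool tcp_metric_locked(const struct tcp_metrics_block *tm,
			      enum tcp_metric_index idx)
{
	return tm->tcpm_lock & (1u << idx);
}

int main(void)
{
	struct tcp_metrics_block tm = { 0 };

	/* What tcpm_suck_dst() records when the route locks RTT and CWND. */
	tm.tcpm_lock |= 1u << TCP_METRIC_RTT;
	tm.tcpm_lock |= 1u << TCP_METRIC_CWND;

	printf("RTT locked: %d, SSTHRESH locked: %d\n",
	       tcp_metric_locked(&tm, TCP_METRIC_RTT),
	       tcp_metric_locked(&tm, TCP_METRIC_SSTHRESH));
	return 0;
}

Note that TCP_METRIC_MAX bounds tcpm_vals[], so only these five indices may ever reach the accessors; the larger RTAX_* constants index a different array, which is why mixing the two namespaces corrupts memory.
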
+static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, + struct inetpeer_addr *addr, + unsigned int hash, + bool reclaim) +{ + struct tcp_metrics_block *tm; + struct net *net; + + spin_lock_bh(&tcp_metrics_lock); + net = dev_net(dst->dev); + if (unlikely(reclaim)) { + struct tcp_metrics_block *oldest; + + oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); + for (tm = rcu_dereference(oldest->tcpm_next); tm; + tm = rcu_dereference(tm->tcpm_next)) { + if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp)) + oldest = tm; + } + tm = oldest; + } else { + tm = kmalloc(sizeof(*tm), GFP_ATOMIC); + if (!tm) + goto out_unlock; + } + tm->tcpm_addr = *addr; + tm->tcpm_stamp = jiffies; + + tcpm_suck_dst(tm, dst); + + if (likely(!reclaim)) { + tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain; + rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm); + } + +out_unlock: + spin_unlock_bh(&tcp_metrics_lock); + return tm; +} + +#define TCP_METRICS_TIMEOUT (60 * 60 * HZ) + +static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) +{ + if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) + tcpm_suck_dst(tm, dst); +} + +#define TCP_METRICS_RECLAIM_DEPTH 5 +#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL + +static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth) +{ + if (tm) + return tm; + if (depth > TCP_METRICS_RECLAIM_DEPTH) + return TCP_METRICS_RECLAIM_PTR; + return NULL; +} + +static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr, + struct net *net, unsigned int hash) +{ + struct tcp_metrics_block *tm; + int depth = 0; + + for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + tm = rcu_dereference(tm->tcpm_next)) { + if (addr_same(&tm->tcpm_addr, addr)) + break; + depth++; + } + return tcp_get_encode(tm, depth); +} + +static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, + struct dst_entry *dst) +{ + struct tcp_metrics_block *tm; + struct inetpeer_addr addr; + unsigned int hash; + struct net *net; + + addr.family = req->rsk_ops->family; + switch (addr.family) { + case AF_INET: + addr.addr.a4 = inet_rsk(req)->rmt_addr; + hash = (__force unsigned int) addr.addr.a4; + break; + case AF_INET6: + *(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr; + hash = ((__force unsigned int) addr.addr.a6[0] ^ + (__force unsigned int) addr.addr.a6[1] ^ + (__force unsigned int) addr.addr.a6[2] ^ + (__force unsigned int) addr.addr.a6[3]); + break; + default: + return NULL; + } + + hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8); + + net = dev_net(dst->dev); + hash &= net->ipv4.tcp_metrics_hash_mask; + + for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + tm = rcu_dereference(tm->tcpm_next)) { + if (addr_same(&tm->tcpm_addr, &addr)) + break; + } + tcpm_check_stamp(tm, dst); + return tm; +} + +static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw) +{ + struct inet6_timewait_sock *tw6; + struct tcp_metrics_block *tm; + struct inetpeer_addr addr; + unsigned int hash; + struct net *net; + + addr.family = tw->tw_family; + switch (addr.family) { + case AF_INET: + addr.addr.a4 = tw->tw_daddr; + hash = (__force unsigned int) addr.addr.a4; + break; + case AF_INET6: + tw6 = inet6_twsk((struct sock *)tw); + *(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr; + hash = ((__force unsigned int) addr.addr.a6[0] ^ + (__force unsigned int) addr.addr.a6[1] ^ + (__force 
unsigned int) addr.addr.a6[2] ^ + (__force unsigned int) addr.addr.a6[3]); + break; + default: + return NULL; + } + + hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8); + + net = twsk_net(tw); + hash &= net->ipv4.tcp_metrics_hash_mask; + + for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + tm = rcu_dereference(tm->tcpm_next)) { + if (addr_same(&tm->tcpm_addr, &addr)) + break; + } + return tm; +} + +static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, + struct dst_entry *dst, + bool create) +{ + struct tcp_metrics_block *tm; + struct inetpeer_addr addr; + unsigned int hash; + struct net *net; + bool reclaim; + + addr.family = sk->sk_family; + switch (addr.family) { + case AF_INET: + addr.addr.a4 = inet_sk(sk)->inet_daddr; + hash = (__force unsigned int) addr.addr.a4; + break; + case AF_INET6: + *(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr; + hash = ((__force unsigned int) addr.addr.a6[0] ^ + (__force unsigned int) addr.addr.a6[1] ^ + (__force unsigned int) addr.addr.a6[2] ^ + (__force unsigned int) addr.addr.a6[3]); + break; + default: + return NULL; + } + + hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8); + + net = dev_net(dst->dev); + hash &= net->ipv4.tcp_metrics_hash_mask; + + tm = __tcp_get_metrics(&addr, net, hash); + reclaim = false; + if (tm == TCP_METRICS_RECLAIM_PTR) { + reclaim = true; + tm = NULL; + } + if (!tm && create) + tm = tcpm_new(dst, &addr, hash, reclaim); + else + tcpm_check_stamp(tm, dst); + + return tm; +} + +/* Save metrics learned by this TCP session. This function is called + * only, when TCP finishes successfully i.e. when it enters TIME-WAIT + * or goes from LAST-ACK to CLOSE. + */ +void tcp_update_metrics(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct dst_entry *dst = __sk_dst_get(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_metrics_block *tm; + unsigned long rtt; + u32 val; + int m; + + if (sysctl_tcp_nometrics_save || !dst) + return; + + if (dst->flags & DST_HOST) + dst_confirm(dst); + + rcu_read_lock(); + if (icsk->icsk_backoff || !tp->srtt) { + /* This session failed to estimate rtt. Why? + * Probably, no packets returned in time. Reset our + * results. + */ + tm = tcp_get_metrics(sk, dst, false); + if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT)) + tcp_metric_set(tm, TCP_METRIC_RTT, 0); + goto out_unlock; + } else + tm = tcp_get_metrics(sk, dst, true); + + if (!tm) + goto out_unlock; + + rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); + m = rtt - tp->srtt; + + /* If newly calculated rtt larger than stored one, store new + * one. Otherwise, use EWMA. Remember, rtt overestimation is + * always better than underestimation. + */ + if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) { + if (m <= 0) + rtt = tp->srtt; + else + rtt -= (m >> 3); + tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt); + } + + if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) { + unsigned long var; + + if (m < 0) + m = -m; + + /* Scale deviation to rttvar fixed point */ + m >>= 1; + if (m < tp->mdev) + m = tp->mdev; + + var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); + if (m >= var) + var = m; + else + var -= (var - m) >> 2; + + tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var); + } + + if (tcp_in_initial_slowstart(tp)) { + /* Slow start still did not finish. 
*/
+		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+			if (val && (tp->snd_cwnd >> 1) > val)
+				tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+					       tp->snd_cwnd >> 1);
+		}
+		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+			val = tcp_metric_get(tm, TCP_METRIC_CWND);
+			if (tp->snd_cwnd > val)
+				tcp_metric_set(tm, TCP_METRIC_CWND,
+					       tp->snd_cwnd);
+		}
+	} else if (tp->snd_cwnd > tp->snd_ssthresh &&
+		   icsk->icsk_ca_state == TCP_CA_Open) {
+		/* Cong. avoidance phase, cwnd is reliable. */
+		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
+			tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+				       max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
+		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+			val = tcp_metric_get(tm, TCP_METRIC_CWND);
+			tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1);
+		}
+	} else {
+		/* Else slow start did not finish, cwnd is non-sense,
+		 * ssthresh may be also invalid.
+		 */
+		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+			val = tcp_metric_get(tm, TCP_METRIC_CWND);
+			tcp_metric_set(tm, TCP_METRIC_CWND,
+				       (val + tp->snd_ssthresh) >> 1);
+		}
+		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+			if (val && tp->snd_ssthresh > val)
+				tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+					       tp->snd_ssthresh);
+		}
+		if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
+			val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
+			if (val < tp->reordering &&
+			    tp->reordering != sysctl_tcp_reordering)
+				tcp_metric_set(tm, TCP_METRIC_REORDERING,
+					       tp->reordering);
+		}
+	}
+	tm->tcpm_stamp = jiffies;
+out_unlock:
+	rcu_read_unlock();
+}
+
+/* Initialize metrics on socket. */
+
+void tcp_init_metrics(struct sock *sk)
+{
+	struct dst_entry *dst = __sk_dst_get(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_metrics_block *tm;
+	u32 val;
+
+	if (dst == NULL)
+		goto reset;
+
+	dst_confirm(dst);
+
+	rcu_read_lock();
+	tm = tcp_get_metrics(sk, dst, true);
+	if (!tm) {
+		rcu_read_unlock();
+		goto reset;
+	}
+
+	if (tcp_metric_locked(tm, TCP_METRIC_CWND))
+		tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
+
+	val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+	if (val) {
+		tp->snd_ssthresh = val;
+		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
+			tp->snd_ssthresh = tp->snd_cwnd_clamp;
+	} else {
+		/* ssthresh may have been reduced unnecessarily during
+		 * 3WHS. Restore it back to its initial default.
+		 */
+		tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+	}
+	val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
+	if (val && tp->reordering != val) {
+		tcp_disable_fack(tp);
+		tcp_disable_early_retrans(tp);
+		tp->reordering = val;
+	}
+
+	val = tcp_metric_get(tm, TCP_METRIC_RTT);
+	if (val == 0 || tp->srtt == 0) {
+		rcu_read_unlock();
+		goto reset;
+	}
+	/* Initial rtt is determined from SYN,SYN-ACK.
+	 * The segment is small and rtt may appear much
+	 * less than real one. Use per-dst memory
+	 * to make it more realistic.
+	 *
+	 * A bit of theory. RTT is time passed after "normal" sized packet
+	 * is sent until it is ACKed. In normal circumstances sending small
+	 * packets force peer to delay ACKs and calculation is correct too.
+	 * The algorithm is adaptive and, provided we follow specs, it
+	 * NEVER underestimates RTT. BUT! If peer tries to make some clever
+	 * tricks sort of "quick acks" for time long enough to decrease RTT
+	 * to low value, and then abruptly stops to do it and starts to delay
+	 * ACKs, wait for troubles.
+ */ + val = msecs_to_jiffies(val); + if (val > tp->srtt) { + tp->srtt = val; + tp->rtt_seq = tp->snd_nxt; + } + val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); + if (val > tp->mdev) { + tp->mdev = val; + tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); + } + rcu_read_unlock(); + + tcp_set_rto(sk); +reset: + if (tp->srtt == 0) { + /* RFC6298: 5.7 We've failed to get a valid RTT sample from + * 3WHS. This is most likely due to retransmission, + * including spurious one. Reset the RTO back to 3secs + * from the more aggressive 1sec to avoid more spurious + * retransmission. + */ + tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; + } + /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been + * retransmitted. In light of RFC6298 more aggressive 1sec + * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK + * retransmission has occurred. + */ + if (tp->total_retrans > 1) + tp->snd_cwnd = 1; + else + tp->snd_cwnd = tcp_init_cwnd(tp, dst); + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check) +{ + struct tcp_metrics_block *tm; + bool ret; + + if (!dst) + return false; + + rcu_read_lock(); + tm = __tcp_get_metrics_req(req, dst); + if (paws_check) { + if (tm && + (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL && + (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW) + ret = false; + else + ret = true; + } else { + if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp) + ret = true; + else + ret = false; + } + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL_GPL(tcp_peer_is_proven); + +void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst) +{ + struct tcp_metrics_block *tm; + + rcu_read_lock(); + tm = tcp_get_metrics(sk, dst, true); + if (tm) { + struct tcp_sock *tp = tcp_sk(sk); + + if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) { + tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp; + tp->rx_opt.ts_recent = tm->tcpm_ts; + } + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp); + +/* VJ's idea. Save last timestamp seen from this destination and hold + * it at least for normal timewait interval to use for duplicate + * segment detection in subsequent connections, before they enter + * synchronized state. 
+
+/* VJ's idea. Save the last timestamp seen from this destination and hold
+ * it at least for the normal timewait interval, to use for duplicate
+ * segment detection in subsequent connections, before they enter the
+ * synchronized state.
+ */
+bool tcp_remember_stamp(struct sock *sk)
+{
+	struct dst_entry *dst = __sk_dst_get(sk);
+	bool ret = false;
+
+	if (dst) {
+		struct tcp_metrics_block *tm;
+
+		rcu_read_lock();
+		tm = tcp_get_metrics(sk, dst, true);
+		if (tm) {
+			struct tcp_sock *tp = tcp_sk(sk);
+
+			if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 ||
+			    ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
+			     tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
+				tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
+				tm->tcpm_ts = tp->rx_opt.ts_recent;
+			}
+			ret = true;
+		}
+		rcu_read_unlock();
+	}
+	return ret;
+}
+
+bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
+{
+	struct tcp_metrics_block *tm;
+	bool ret = false;
+
+	rcu_read_lock();
+	tm = __tcp_get_metrics_tw(tw);
+	if (tm) {
+		const struct tcp_timewait_sock *tcptw;
+		struct sock *sk = (struct sock *) tw;
+
+		tcptw = tcp_twsk(sk);
+		if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 ||
+		    ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
+		     tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
+			tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
+			tm->tcpm_ts = tcptw->tw_ts_recent;
+		}
+		ret = true;
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static unsigned long tcpmhash_entries;
+static int __init set_tcpmhash_entries(char *str)
+{
+	ssize_t ret;
+
+	if (!str)
+		return 0;
+
+	ret = kstrtoul(str, 0, &tcpmhash_entries);
+	if (ret)
+		return 0;
+
+	return 1;
+}
+__setup("tcpmhash_entries=", set_tcpmhash_entries);
+
+static int __net_init tcp_net_metrics_init(struct net *net)
+{
+	int slots, size;
+
+	slots = tcpmhash_entries;
+	if (!slots) {
+		if (totalram_pages >= 128 * 1024)
+			slots = 16 * 1024;
+		else
+			slots = 8 * 1024;
+	}
+
+	size = slots * sizeof(struct tcpm_hash_bucket);
+
+	net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL);
+	if (!net->ipv4.tcp_metrics_hash)
+		return -ENOMEM;
+
+	net->ipv4.tcp_metrics_hash_mask = (slots - 1);
+
+	return 0;
+}
+
+static void __net_exit tcp_net_metrics_exit(struct net *net)
+{
+	kfree(net->ipv4.tcp_metrics_hash);
+}
+
+static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
+	.init	= tcp_net_metrics_init,
+	.exit	= tcp_net_metrics_exit,
+};
+
+void __init tcp_metrics_init(void)
+{
+	register_pernet_subsys(&tcp_net_metrics_ops);
+}
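
tcp_net_metrics_init() above picks a power-of-two slot count from available RAM (16K slots when totalram_pages is at least 128K, else 8K), with the tcpmhash_entries= boot parameter as an override, and stores slots - 1 as the mask so a bucket lookup is a single AND. A small userspace illustration of that sizing and masking; the hash value is arbitrary.

#include <stdio.h>

int main(void)
{
	unsigned long totalram_pages = 256 * 1024;	/* pretend 1GB of 4K pages */
	unsigned int slots, mask;

	/* Same policy as above, minus the boot-parameter override. */
	slots = (totalram_pages >= 128 * 1024) ? 16 * 1024 : 8 * 1024;
	mask = slots - 1;

	printf("slots=%u mask=%#x bucket(0xdeadbeef)=%u\n",
	       slots, mask, 0xdeadbeefU & mask);
	return 0;
}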
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 72b7c63b1a39..65608863fdee 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -49,52 +49,6 @@ struct inet_timewait_death_row tcp_death_row = {
 };
 EXPORT_SYMBOL_GPL(tcp_death_row);
 
-/* VJ's idea. Save last timestamp seen from this destination
- * and hold it at least for normal timewait interval to use for duplicate
- * segment detection in subsequent connections, before they enter synchronized
- * state.
- */
-
-static bool tcp_remember_stamp(struct sock *sk)
-{
-	const struct inet_connection_sock *icsk = inet_csk(sk);
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct inet_peer *peer;
-
-	peer = icsk->icsk_af_ops->get_peer(sk);
-	if (peer) {
-		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
-		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
-		     peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
-			peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
-			peer->tcp_ts = tp->rx_opt.ts_recent;
-		}
-		return true;
-	}
-
-	return false;
-}
-
-static bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
-{
-	const struct tcp_timewait_sock *tcptw;
-	struct sock *sk = (struct sock *) tw;
-	struct inet_peer *peer;
-
-	tcptw = tcp_twsk(sk);
-	peer = tcptw->tw_peer;
-	if (peer) {
-		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
-		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
-		     peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
-			peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
-			peer->tcp_ts = tcptw->tw_ts_recent;
-		}
-		return true;
-	}
-	return false;
-}
-
 static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
 {
 	if (seq == s_win)
@@ -313,12 +267,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct tcp_sock *tp = tcp_sk(sk);
 	bool recycle_ok = false;
-	bool recycle_on = false;
 
-	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) {
+	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
 		recycle_ok = tcp_remember_stamp(sk);
-		recycle_on = true;
-	}
 
 	if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
 		tw = inet_twsk_alloc(sk, state);
@@ -327,7 +278,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
 		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
 		struct inet_sock *inet = inet_sk(sk);
-		struct inet_peer *peer = NULL;
 
 		tw->tw_transparent	= inet->transparent;
 		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
@@ -351,12 +301,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		}
 #endif
 
-		if (recycle_on)
-			peer = icsk->icsk_af_ops->get_peer(sk);
-		tcptw->tw_peer = peer;
-		if (peer)
-			atomic_inc(&peer->refcnt);
-
 #ifdef CONFIG_TCP_MD5SIG
 		/*
 		 * The timewait bucket does not have the key DB from the
@@ -408,11 +352,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 
 void tcp_twsk_destructor(struct sock *sk)
 {
+#ifdef CONFIG_TCP_MD5SIG
 	struct tcp_timewait_sock *twsk = tcp_twsk(sk);
 
-	if (twsk->tw_peer)
-		inet_putpeer(twsk->tw_peer);
-
-#ifdef CONFIG_TCP_MD5SIG
 	if (twsk->tw_md5_key) {
 		tcp_free_md5sig_pool();
 		kfree_rcu(twsk->tw_md5_key, rcu);
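
The tcp_time_wait() hunk keeps the context line that derives the timewait timeout from the RTO with shifts: (rto << 2) - (rto >> 1) is 4*rto - rto/2, i.e. 3.5*rto, with no multiply. A two-line sanity check:

#include <stdio.h>

int main(void)
{
	int rto = 200;	/* illustrative RTO value */

	printf("%d\n", (rto << 2) - (rto >> 1));	/* 700 == 3.5 * 200 */
	return 0;
}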
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 9815ea0bca7f..87d3fcc302d4 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -90,8 +90,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.dst.dev = dev;
 	dev_hold(dev);
 
-	rt_transfer_peer(&xdst->u.rt, rt);
-
 	/* Sheit... I remember I did this right. Apparently,
 	 * it was magically lost, so this code needs audit */
 	xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
@@ -100,6 +98,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.rt.rt_src = rt->rt_src;
 	xdst->u.rt.rt_dst = rt->rt_dst;
 	xdst->u.rt.rt_gateway = rt->rt_gateway;
+	xdst->u.rt.rt_pmtu = rt->rt_pmtu;
 
 	return 0;
 }
@@ -209,11 +208,6 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
 
 	dst_destroy_metrics_generic(dst);
 
-	if (rt_has_peer(&xdst->u.rt)) {
-		struct inet_peer *peer = rt_peer_ptr(&xdst->u.rt);
-		inet_putpeer(peer);
-	}
-
 	xfrm_dst_destroy(xdst);
 }
 
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index c7da1422cbde..a113f7d7e938 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -194,8 +194,10 @@ static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
 		if (rt->rt6i_dst.plen < 128)
 			tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
 
-		peer = rt6_get_peer_create(rt);
+		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
 		res = inet_peer_xrlim_allow(peer, tmo);
+		if (peer)
+			inet_putpeer(peer);
 	}
 	dst_release(dst);
 	return res;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index c6af5963a202..5b2d63ed793e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -466,13 +466,15 @@ int ip6_forward(struct sk_buff *skb)
 		else
 			target = &hdr->daddr;
 
-		peer = rt6_get_peer_create(rt);
+		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
 
 		/* Limit redirects both by destination (here)
 		   and by source (inside ndisc_send_redirect)
 		 */
 		if (inet_peer_xrlim_allow(peer, 1*HZ))
 			ndisc_send_redirect(skb, target);
+		if (peer)
+			inet_putpeer(peer);
 	} else {
 		int addrtype = ipv6_addr_type(&hdr->saddr);
 
@@ -592,10 +594,14 @@ void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
 	int old, new;
 
 	if (rt && !(rt->dst.flags & DST_NOPEER)) {
-		struct inet_peer *peer = rt6_get_peer_create(rt);
+		struct inet_peer *peer;
+		struct net *net;
 
+		net = dev_net(rt->dst.dev);
+		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
 		if (peer) {
 			fhdr->identification = htonl(inet_getid(peer, 0));
+			inet_putpeer(peer);
 			return;
 		}
 	}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 69a6330dea91..0fddd571400d 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1486,6 +1486,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
 	int rd_len;
 	int err;
 	u8 ha_buf[MAX_ADDR_LEN], *ha = NULL;
+	bool ret;
 
 	if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) {
 		ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n",
@@ -1519,8 +1520,11 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
 			  "Redirect: destination is not a neighbour\n");
 		goto release;
 	}
-	peer = rt6_get_peer_create(rt);
-	if (!inet_peer_xrlim_allow(peer, 1*HZ))
+	peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
+	ret = inet_peer_xrlim_allow(peer, 1*HZ);
+	if (peer)
+		inet_putpeer(peer);
+	if (!ret)
 		goto release;
 
 	if (dev->addr_len) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 6cc6c881f54f..563f12c1c99c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1093,7 +1093,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
 	memset(&fl6, 0, sizeof(fl6));
 	fl6.flowi6_oif = oif;
 	fl6.flowi6_mark = mark;
-	fl6.flowi6_flags = FLOWI_FLAG_PRECOW_METRICS;
+	fl6.flowi6_flags = 0;
 	fl6.daddr = iph->daddr;
 	fl6.saddr = iph->saddr;
 	fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;
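
Each IPv6 call site above now follows the same discipline: look the peer up, ask inet_peer_xrlim_allow() for permission, and drop the reference right away (a NULL peer counts as allowed). The allow test itself is a token bucket over elapsed time; here is a freestanding sketch of that shape, with the burst of 6 mirroring the kernel's XRLIM_BURST_FACTOR and everything else illustrative.

#include <stdbool.h>
#include <stdio.h>

#define XRLIM_BURST_FACTOR 6

struct peer_tokens {
	unsigned long rate_last;	/* time of last refill */
	unsigned long rate_tokens;	/* accumulated credit */
};

static bool xrlim_allow(struct peer_tokens *p, unsigned long now,
			unsigned long timeout)
{
	bool ok = false;

	if (!p)				/* no peer entry: do not rate-limit */
		return true;

	p->rate_tokens += now - p->rate_last;	/* accrue elapsed time */
	p->rate_last = now;
	if (p->rate_tokens > XRLIM_BURST_FACTOR * timeout)
		p->rate_tokens = XRLIM_BURST_FACTOR * timeout;
	if (p->rate_tokens >= timeout) {
		p->rate_tokens -= timeout;	/* spend one "send" */
		ok = true;
	}
	return ok;
}

int main(void)
{
	struct peer_tokens p = { .rate_last = 0, .rate_tokens = 0 };

	printf("%d\n", xrlim_allow(&p, 100, 100));	/* 1: first send ok */
	printf("%d\n", xrlim_allow(&p, 110, 100));	/* 0: too soon */
	return 0;
}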
@@ -2348,13 +2348,11 @@ static int rt6_fill_node(struct net *net,
			 int iif, int type, u32 pid, u32 seq,
			 int prefix, int nowait, unsigned int flags)
 {
-	const struct inet_peer *peer;
 	struct rtmsg *rtm;
 	struct nlmsghdr *nlh;
 	long expires;
 	u32 table;
 	struct neighbour *n;
-	u32 ts, tsage;
 
 	if (prefix) {	/* user wants prefix routes only */
 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
@@ -2473,17 +2471,7 @@ static int rt6_fill_node(struct net *net,
 	else
 		expires = INT_MAX;
 
-	peer = NULL;
-	if (rt6_has_peer(rt))
-		peer = rt6_peer_ptr(rt);
-	ts = tsage = 0;
-	if (peer && peer->tcp_ts_stamp) {
-		ts = peer->tcp_ts;
-		tsage = get_seconds() - peer->tcp_ts_stamp;
-	}
-
-	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
-			       expires, rt->dst.error) < 0)
+	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
 		goto nla_put_failure;
 
 	return nlmsg_end(skb, nlh);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6cc67ed6c2e6..61175cb2478f 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -277,22 +277,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	rt = (struct rt6_info *) dst;
 	if (tcp_death_row.sysctl_tw_recycle &&
 	    !tp->rx_opt.ts_recent_stamp &&
-	    ipv6_addr_equal(&rt->rt6i_dst.addr, &np->daddr)) {
-		struct inet_peer *peer = rt6_get_peer(rt);
-		/*
-		 * VJ's idea. We save last timestamp seen from
-		 * the destination in peer table, when entering state
-		 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
-		 * when trying new connection.
-		 */
-		if (peer) {
-			inet_peer_refcheck(peer);
-			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
-				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
-				tp->rx_opt.ts_recent = peer->tcp_ts;
-			}
-		}
-	}
+	    ipv6_addr_equal(&rt->rt6i_dst.addr, &np->daddr))
+		tcp_fetch_timewait_stamp(sk, dst);
 
 	icsk->icsk_ext_hdr_len = 0;
 	if (np->opt)
@@ -1134,8 +1120,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	treq->iif = inet6_iif(skb);
 
 	if (!isn) {
-		struct inet_peer *peer = NULL;
-
 		if (ipv6_opt_accepted(sk, skb) ||
 		    np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
 		    np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
@@ -1160,14 +1144,8 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 		 */
 		if (tmp_opt.saw_tstamp &&
 		    tcp_death_row.sysctl_tw_recycle &&
-		    (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL &&
-		    (peer = rt6_get_peer((struct rt6_info *)dst)) != NULL &&
-		    ipv6_addr_equal((struct in6_addr *)peer->daddr.addr.a6,
-				    &treq->rmt_addr)) {
-			inet_peer_refcheck(peer);
-			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
-			    (s32)(peer->tcp_ts - req->ts_recent) >
-							TCP_PAWS_WINDOW) {
+		    (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL) {
+			if (!tcp_peer_is_proven(req, dst, true)) {
 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
 				goto drop_and_release;
 			}
@@ -1176,8 +1154,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 		else if (!sysctl_tcp_syncookies &&
 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
 			  (sysctl_max_syn_backlog >> 2)) &&
-			 (!peer || !peer->tcp_ts_stamp) &&
-			 (!dst || !dst_metric(dst, RTAX_RTT))) {
+			 !tcp_peer_is_proven(req, dst, false)) {
 			/* Without syncookies last quarter of
 			 * backlog is filled with destinations,
 			 * proven to be alive.
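
The paws_check branch of tcp_peer_is_proven() used in the hunk above rejects a request only when the cached stamp is still fresh (younger than TCP_PAWS_MSL) and the cached timestamp runs ahead of the one the peer just sent by more than TCP_PAWS_WINDOW. A userspace sketch of that wraparound-safe test; the 60 s / 1 constants mirror the kernel's, the rest is illustrative.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TCP_PAWS_MSL	60	/* seconds a cached stamp stays fresh */
#define TCP_PAWS_WINDOW	1	/* allowed backward step in ticks */

/* Reject only when the cached stamp is fresh AND the cached timestamp
 * is newer than the one the peer sent; both comparisons survive u32
 * wraparound thanks to the unsigned subtraction and the s32 cast.
 */
static bool paws_reject(uint32_t cached_ts, uint32_t cached_stamp,
			uint32_t req_ts, uint32_t now)
{
	return (uint32_t)(now - cached_stamp) < TCP_PAWS_MSL &&
	       (int32_t)(cached_ts - req_ts) > TCP_PAWS_WINDOW;
}

int main(void)
{
	/* Cached stamp 10 s old, cached ts well ahead of the request's. */
	printf("%d\n", paws_reject(1000, 90, 500, 100));	/* 1: reject */
	/* Stamp too old to trust: the request passes. */
	printf("%d\n", paws_reject(1000, 10, 500, 100));	/* 0: allow */
	return 0;
}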
@@ -1712,20 +1689,6 @@ do_time_wait:
 	goto discard_it;
 }
 
-static struct inet_peer *tcp_v6_get_peer(struct sock *sk)
-{
-	struct rt6_info *rt = (struct rt6_info *) __sk_dst_get(sk);
-	struct ipv6_pinfo *np = inet6_sk(sk);
-
-	/* If we don't have a valid cached route, or we're doing IP
-	 * options which make the IPv6 header destination address
-	 * different from our peer's, do not bother with this.
-	 */
-	if (!rt || !ipv6_addr_equal(&np->daddr, &rt->rt6i_dst.addr))
-		return NULL;
-	return rt6_get_peer_create(rt);
-}
-
 static struct timewait_sock_ops tcp6_timewait_sock_ops = {
 	.twsk_obj_size	= sizeof(struct tcp6_timewait_sock),
 	.twsk_unique	= tcp_twsk_unique,
@@ -1738,7 +1701,6 @@ static const struct inet_connection_sock_af_ops ipv6_specific = {
 	.rebuild_header	   = inet6_sk_rebuild_header,
 	.conn_request	   = tcp_v6_conn_request,
 	.syn_recv_sock	   = tcp_v6_syn_recv_sock,
-	.get_peer	   = tcp_v6_get_peer,
 	.net_header_len	   = sizeof(struct ipv6hdr),
 	.net_frag_header_len = sizeof(struct frag_hdr),
 	.setsockopt	   = ipv6_setsockopt,
@@ -1770,7 +1732,6 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = {
 	.rebuild_header	   = inet_sk_rebuild_header,
 	.conn_request	   = tcp_v6_conn_request,
 	.syn_recv_sock	   = tcp_v6_syn_recv_sock,
-	.get_peer	   = tcp_v4_get_peer,
 	.net_header_len	   = sizeof(struct iphdr),
 	.setsockopt	   = ipv6_setsockopt,
 	.getsockopt	   = ipv6_getsockopt,
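
The other tcp_peer_is_proven() call above (paws_check == false) feeds the last-quarter backlog rule: with syncookies disabled and the SYN backlog more than three-quarters full, a request is only admitted if the destination has a remembered RTT and timestamp. An illustrative sketch of that admission rule with made-up names:

#include <stdbool.h>
#include <stdio.h>

/* Under backlog pressure (less than a quarter of the slots free) and
 * without syncookies, only destinations already proven alive may take
 * a request slot.
 */
static bool admit_request(bool syncookies, int backlog_max, int queue_len,
			  bool peer_proven)
{
	if (!syncookies &&
	    backlog_max - queue_len < (backlog_max >> 2) &&
	    !peer_proven)
		return false;	/* drop: unproven peer under pressure */
	return true;
}

int main(void)
{
	printf("%d\n", admit_request(false, 1024, 900, false));	/* 0: drop */
	printf("%d\n", admit_request(false, 1024, 900, true));	/* 1: keep */
	printf("%d\n", admit_request(false, 1024, 100, false));	/* 1: keep */
	return 0;
}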