OpenCloudOS-Kernel/net/ipv6/tcp_ipv6.c

2127 lines
54 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* TCP over IPv6
* Linux INET6 implementation
*
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
* Based on:
* linux/net/ipv4/tcp.c
* linux/net/ipv4/tcp_input.c
* linux/net/ipv4/tcp_output.c
*
* Fixes:
* Hideaki YOSHIFUJI : sin6_scope_id support
* YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
* Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
* a single port at the same time.
* YOSHIFUJI Hideaki @USAGI: convert /proc/net/tcp6 to seq_file.
*/
#include <linux/bottom_half.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/jiffies.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
#include <linux/init.h>
#include <linux/jhash.h>
#include <linux/ipsec.h>
#include <linux/times.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/ipv6.h>
#include <linux/icmpv6.h>
#include <linux/random.h>
#include <linux/indirect_call_wrapper.h>
#include <net/tcp.h>
#include <net/ndisc.h>
#include <net/inet6_hashtables.h>
#include <net/inet6_connection_sock.h>
#include <net/ipv6.h>
#include <net/transp_v6.h>
#include <net/addrconf.h>
#include <net/ip6_route.h>
#include <net/ip6_checksum.h>
#include <net/inet_ecn.h>
#include <net/protocol.h>
#include <net/xfrm.h>
#include <net/snmp.h>
#include <net/dsfield.h>
#include <net/timewait_sock.h>
#include <net/inet_common.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <crypto/hash.h>
#include <linux/scatterlist.h>
#include <trace/events/tcp.h>
static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb);
static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
tcp: Fix kernel panic when calling tcp_v(4/6)_md5_do_lookup If the following packet flow happen, kernel will panic. MathineA MathineB SYN ----------------------> SYN+ACK <---------------------- ACK(bad seq) ----------------------> When a bad seq ACK is received, tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr)) is finally called by tcp_v4_reqsk_send_ack(), but the first parameter(skb->sk) is NULL at that moment, so kernel panic happens. This patch fixes this bug. OOPS output is as following: [ 302.812793] IP: [<c05cfaa6>] tcp_v4_md5_do_lookup+0x12/0x42 [ 302.817075] Oops: 0000 [#1] SMP [ 302.819815] Modules linked in: ipv6 loop dm_multipath rtc_cmos rtc_core rtc_lib pcspkr pcnet32 mii i2c_piix4 parport_pc i2c_core parport ac button ata_piix libata dm_mod mptspi mptscsih mptbase scsi_transport_spi sd_mod scsi_mod crc_t10dif ext3 jbd mbcache uhci_hcd ohci_hcd ehci_hcd [last unloaded: scsi_wait_scan] [ 302.849946] [ 302.851198] Pid: 0, comm: swapper Not tainted (2.6.27-rc1-guijf #5) [ 302.855184] EIP: 0060:[<c05cfaa6>] EFLAGS: 00010296 CPU: 0 [ 302.858296] EIP is at tcp_v4_md5_do_lookup+0x12/0x42 [ 302.861027] EAX: 0000001e EBX: 00000000 ECX: 00000046 EDX: 00000046 [ 302.864867] ESI: ceb69e00 EDI: 1467a8c0 EBP: cf75f180 ESP: c0792e54 [ 302.868333] DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 [ 302.871287] Process swapper (pid: 0, ti=c0792000 task=c0712340 task.ti=c0746000) [ 302.875592] Stack: c06f413a 00000000 cf75f180 ceb69e00 00000000 c05d0d86 000016d0 ceac5400 [ 302.883275] c05d28f8 000016d0 ceb69e00 ceb69e20 681bf6e3 00001000 00000000 0a67a8c0 [ 302.890971] ceac5400 c04250a3 c06f413a c0792eb0 c0792edc cf59a620 cf59a620 cf59a634 [ 302.900140] Call Trace: [ 302.902392] [<c05d0d86>] tcp_v4_reqsk_send_ack+0x17/0x35 [ 302.907060] [<c05d28f8>] tcp_check_req+0x156/0x372 [ 302.910082] [<c04250a3>] printk+0x14/0x18 [ 302.912868] [<c05d0aa1>] tcp_v4_do_rcv+0x1d3/0x2bf [ 302.917423] [<c05d26be>] tcp_v4_rcv+0x563/0x5b9 [ 302.920453] [<c05bb20f>] ip_local_deliver_finish+0xe8/0x183 [ 302.923865] [<c05bb10a>] ip_rcv_finish+0x286/0x2a3 [ 302.928569] [<c059e438>] dev_alloc_skb+0x11/0x25 [ 302.931563] [<c05a211f>] netif_receive_skb+0x2d6/0x33a [ 302.934914] [<d0917941>] pcnet32_poll+0x333/0x680 [pcnet32] [ 302.938735] [<c05a3b48>] net_rx_action+0x5c/0xfe [ 302.941792] [<c042856b>] __do_softirq+0x5d/0xc1 [ 302.944788] [<c042850e>] __do_softirq+0x0/0xc1 [ 302.948999] [<c040564b>] do_softirq+0x55/0x88 [ 302.951870] [<c04501b1>] handle_fasteoi_irq+0x0/0xa4 [ 302.954986] [<c04284da>] irq_exit+0x35/0x69 [ 302.959081] [<c0405717>] do_IRQ+0x99/0xae [ 302.961896] [<c040422b>] common_interrupt+0x23/0x28 [ 302.966279] [<c040819d>] default_idle+0x2a/0x3d [ 302.969212] [<c0402552>] cpu_idle+0xb2/0xd2 [ 302.972169] ======================= [ 302.974274] Code: fc ff 84 d2 0f 84 df fd ff ff e9 34 fe ff ff 83 c4 0c 5b 5e 5f 5d c3 90 90 57 89 d7 56 53 89 c3 50 68 3a 41 6f c0 e8 e9 55 e5 ff <8b> 93 9c 04 00 00 58 85 d2 59 74 1e 8b 72 10 31 db 31 c9 85 f6 [ 303.011610] EIP: [<c05cfaa6>] tcp_v4_md5_do_lookup+0x12/0x42 SS:ESP 0068:c0792e54 [ 303.018360] Kernel panic - not syncing: Fatal exception in interrupt Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-08-07 14:50:04 +08:00
struct request_sock *req);
static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
static const struct inet_connection_sock_af_ops ipv6_mapped;
static const struct inet_connection_sock_af_ops ipv6_specific;
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
#else
static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk,
const struct in6_addr *addr)
{
return NULL;
}
#endif
/* Helper returning the inet6 address from a given tcp socket.
* It can be used in TCP stack instead of inet6_sk(sk).
* This avoids a dereference and allow compiler optimizations.
* It is a specialized version of inet6_sk_generic().
*/
static struct ipv6_pinfo *tcp_inet6_sk(const struct sock *sk)
{
unsigned int offset = sizeof(struct tcp6_sock) - sizeof(struct ipv6_pinfo);
return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
}
static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
net: fix IP early demux races David Wilder reported crashes caused by dst reuse. <quote David> I am seeing a crash on a distro V4.2.3 kernel caused by a double release of a dst_entry. In ipv4_dst_destroy() the call to list_empty() finds a poisoned next pointer, indicating the dst_entry has already been removed from the list and freed. The crash occurs 18 to 24 hours into a run of a network stress exerciser. </quote> Thanks to his detailed report and analysis, we were able to understand the core issue. IP early demux can associate a dst to skb, after a lookup in TCP/UDP sockets. When socket cache is not properly set, we want to store into sk->sk_dst_cache the dst for future IP early demux lookups, by acquiring a stable refcount on the dst. Problem is this acquisition is simply using an atomic_inc(), which works well, unless the dst was queued for destruction from dst_release() noticing dst refcount went to zero, if DST_NOCACHE was set on dst. We need to make sure current refcount is not zero before incrementing it, or risk double free as David reported. This patch, being a stable candidate, adds two new helpers, and use them only from IP early demux problematic paths. It might be possible to merge in net-next skb_dst_force() and skb_dst_force_safe(), but I prefer having the smallest patch for stable kernels : Maybe some skb_dst_force() callers do not expect skb->dst can suddenly be cleared. Can probably be backported back to linux-3.6 kernels Reported-by: David J. Wilder <dwilder@us.ibm.com> Tested-by: David J. Wilder <dwilder@us.ibm.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-12-15 06:08:53 +08:00
if (dst && dst_hold_safe(dst)) {
tcp: remove dst refcount false sharing for prequeue mode Alexander Duyck reported high false sharing on dst refcount in tcp stack when prequeue is used. prequeue is the mechanism used when a thread is blocked in recvmsg()/read() on a TCP socket, using a blocking model rather than select()/poll()/epoll() non blocking one. We already try to use RCU in input path as much as possible, but we were forced to take a refcount on the dst when skb escaped RCU protected region. When/if the user thread runs on different cpu, dst_release() will then touch dst refcount again. Commit 093162553c33 (tcp: force a dst refcount when prequeue packet) was an example of a race fix. It turns out the only remaining usage of skb->dst for a packet stored in a TCP socket prequeue is IP early demux. We can add a logic to detect when IP early demux is probably going to use skb->dst. Because we do an optimistic check rather than duplicate existing logic, we need to guard inet_sk_rx_dst_set() and inet6_sk_rx_dst_set() from using a NULL dst. Many thanks to Alexander for providing a nice bug report, git bisection, and reproducer. Tested using Alexander script on a 40Gb NIC, 8 RX queues. Hosts have 24 cores, 48 hyper threads. echo 0 >/proc/sys/net/ipv4/tcp_autocorking for i in `seq 0 47` do for j in `seq 0 2` do netperf -H $DEST -t TCP_STREAM -l 1000 \ -c -C -T $i,$i -P 0 -- \ -m 64 -s 64K -D & done done Before patch : ~6Mpps and ~95% cpu usage on receiver After patch : ~9Mpps and ~35% cpu usage on receiver. Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: Alexander Duyck <alexander.h.duyck@intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-09-08 23:06:07 +08:00
const struct rt6_info *rt = (const struct rt6_info *)dst;
sk->sk_rx_dst = dst;
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
tcp_inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
tcp: remove dst refcount false sharing for prequeue mode Alexander Duyck reported high false sharing on dst refcount in tcp stack when prequeue is used. prequeue is the mechanism used when a thread is blocked in recvmsg()/read() on a TCP socket, using a blocking model rather than select()/poll()/epoll() non blocking one. We already try to use RCU in input path as much as possible, but we were forced to take a refcount on the dst when skb escaped RCU protected region. When/if the user thread runs on different cpu, dst_release() will then touch dst refcount again. Commit 093162553c33 (tcp: force a dst refcount when prequeue packet) was an example of a race fix. It turns out the only remaining usage of skb->dst for a packet stored in a TCP socket prequeue is IP early demux. We can add a logic to detect when IP early demux is probably going to use skb->dst. Because we do an optimistic check rather than duplicate existing logic, we need to guard inet_sk_rx_dst_set() and inet6_sk_rx_dst_set() from using a NULL dst. Many thanks to Alexander for providing a nice bug report, git bisection, and reproducer. Tested using Alexander script on a 40Gb NIC, 8 RX queues. Hosts have 24 cores, 48 hyper threads. echo 0 >/proc/sys/net/ipv4/tcp_autocorking for i in `seq 0 47` do for j in `seq 0 2` do netperf -H $DEST -t TCP_STREAM -l 1000 \ -c -C -T $i,$i -P 0 -- \ -m 64 -s 64K -D & done done Before patch : ~6Mpps and ~95% cpu usage on receiver After patch : ~9Mpps and ~35% cpu usage on receiver. Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: Alexander Duyck <alexander.h.duyck@intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-09-08 23:06:07 +08:00
}
}
static u32 tcp_v6_init_seq(const struct sk_buff *skb)
{
return secure_tcpv6_seq(ipv6_hdr(skb)->daddr.s6_addr32,
ipv6_hdr(skb)->saddr.s6_addr32,
tcp_hdr(skb)->dest,
tcp_hdr(skb)->source);
}
static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
return secure_tcpv6_ts_off(net, ipv6_hdr(skb)->daddr.s6_addr32,
ipv6_hdr(skb)->saddr.s6_addr32);
}
bpf: Hooks for sys_connect == The problem == See description of the problem in the initial patch of this patch set. == The solution == The patch provides much more reliable in-kernel solution for the 2nd part of the problem: making outgoing connecttion from desired IP. It adds new attach types `BPF_CGROUP_INET4_CONNECT` and `BPF_CGROUP_INET6_CONNECT` for program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` that can be used to override both source and destination of a connection at connect(2) time. Local end of connection can be bound to desired IP using newly introduced BPF-helper `bpf_bind()`. It allows to bind to only IP though, and doesn't support binding to port, i.e. leverages `IP_BIND_ADDRESS_NO_PORT` socket option. There are two reasons for this: * looking for a free port is expensive and can affect performance significantly; * there is no use-case for port. As for remote end (`struct sockaddr *` passed by user), both parts of it can be overridden, remote IP and remote port. It's useful if an application inside cgroup wants to connect to another application inside same cgroup or to itself, but knows nothing about IP assigned to the cgroup. Support is added for IPv4 and IPv6, for TCP and UDP. IPv4 and IPv6 have separate attach types for same reason as sys_bind hooks, i.e. to prevent reading from / writing to e.g. user_ip6 fields when user passes sockaddr_in since it'd be out-of-bound. == Implementation notes == The patch introduces new field in `struct proto`: `pre_connect` that is a pointer to a function with same signature as `connect` but is called before it. The reason is in some cases BPF hooks should be called way before control is passed to `sk->sk_prot->connect`. Specifically `inet_dgram_connect` autobinds socket before calling `sk->sk_prot->connect` and there is no way to call `bpf_bind()` from hooks from e.g. `ip4_datagram_connect` or `ip6_datagram_connect` since it'd cause double-bind. On the other hand `proto.pre_connect` provides a flexible way to add BPF hooks for connect only for necessary `proto` and call them at desired time before `connect`. Since `bpf_bind()` is allowed to bind only to IP and autobind in `inet_dgram_connect` binds only port there is no chance of double-bind. bpf_bind() sets `force_bind_address_no_port` to bind to only IP despite of value of `bind_address_no_port` socket field. bpf_bind() sets `with_lock` to `false` when calling to __inet_bind() and __inet6_bind() since all call-sites, where bpf_bind() is called, already hold socket lock. Signed-off-by: Andrey Ignatov <rdna@fb.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-03-31 06:08:05 +08:00
static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
int addr_len)
{
/* This check is replicated from tcp_v6_connect() and intended to
* prevent BPF program called below from accessing bytes that are out
* of the bound specified by user in addr_len.
*/
if (addr_len < SIN6_LEN_RFC2133)
return -EINVAL;
sock_owned_by_me(sk);
return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr);
}
static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
int addr_len)
{
struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
struct inet_sock *inet = inet_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
struct ipv6_pinfo *np = tcp_inet6_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct in6_addr *saddr = NULL, *final_p, final;
struct ipv6_txoptions *opt;
struct flowi6 fl6;
struct dst_entry *dst;
int addr_type;
int err;
struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
if (addr_len < SIN6_LEN_RFC2133)
return -EINVAL;
if (usin->sin6_family != AF_INET6)
return -EAFNOSUPPORT;
memset(&fl6, 0, sizeof(fl6));
if (np->sndflow) {
fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK;
IP6_ECN_flow_init(fl6.flowlabel);
if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
struct ip6_flowlabel *flowlabel;
flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
if (IS_ERR(flowlabel))
return -EINVAL;
fl6_sock_release(flowlabel);
}
}
/*
* connect() to INADDR_ANY means loopback (BSD'ism).
*/
if (ipv6_addr_any(&usin->sin6_addr)) {
if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
&usin->sin6_addr);
else
usin->sin6_addr = in6addr_loopback;
}
addr_type = ipv6_addr_type(&usin->sin6_addr);
if (addr_type & IPV6_ADDR_MULTICAST)
return -ENETUNREACH;
if (addr_type&IPV6_ADDR_LINKLOCAL) {
if (addr_len >= sizeof(struct sockaddr_in6) &&
usin->sin6_scope_id) {
/* If interface is set while binding, indices
* must coincide.
*/
if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id))
return -EINVAL;
sk->sk_bound_dev_if = usin->sin6_scope_id;
}
/* Connect to link-local address requires an interface */
if (!sk->sk_bound_dev_if)
return -EINVAL;
}
if (tp->rx_opt.ts_recent_stamp &&
!ipv6_addr_equal(&sk->sk_v6_daddr, &usin->sin6_addr)) {
tp->rx_opt.ts_recent = 0;
tp->rx_opt.ts_recent_stamp = 0;
WRITE_ONCE(tp->write_seq, 0);
}
sk->sk_v6_daddr = usin->sin6_addr;
np->flow_label = fl6.flowlabel;
/*
* TCP over IPv4
*/
if (addr_type & IPV6_ADDR_MAPPED) {
u32 exthdrlen = icsk->icsk_ext_hdr_len;
struct sockaddr_in sin;
if (__ipv6_only_sock(sk))
return -ENETUNREACH;
sin.sin_family = AF_INET;
sin.sin_port = usin->sin6_port;
sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
icsk->icsk_af_ops = &ipv6_mapped;
sk->sk_backlog_rcv = tcp_v4_do_rcv;
#ifdef CONFIG_TCP_MD5SIG
tp->af_specific = &tcp_sock_ipv6_mapped_specific;
#endif
err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
if (err) {
icsk->icsk_ext_hdr_len = exthdrlen;
icsk->icsk_af_ops = &ipv6_specific;
sk->sk_backlog_rcv = tcp_v6_do_rcv;
#ifdef CONFIG_TCP_MD5SIG
tp->af_specific = &tcp_sock_ipv6_specific;
#endif
goto failure;
}
np->saddr = sk->sk_v6_rcv_saddr;
return err;
}
if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr))
saddr = &sk->sk_v6_rcv_saddr;
fl6.flowi6_proto = IPPROTO_TCP;
fl6.daddr = sk->sk_v6_daddr;
fl6.saddr = saddr ? *saddr : np->saddr;
fl6.flowi6_oif = sk->sk_bound_dev_if;
fl6.flowi6_mark = sk->sk_mark;
fl6.fl6_dport = usin->sin6_port;
fl6.fl6_sport = inet->inet_sport;
fl6.flowi6_uid = sk->sk_uid;
opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
final_p = fl6_update_dst(&fl6, opt, &final);
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
goto failure;
}
if (!saddr) {
saddr = &fl6.saddr;
sk->sk_v6_rcv_saddr = *saddr;
}
/* set the source address */
np->saddr = *saddr;
inet->inet_rcv_saddr = LOOPBACK4_IPV6;
[IPV6]: Added GSO support for TCPv6 This patch adds GSO support for IPv6 and TCPv6. This is based on a patch by Ananda Raju <Ananda.Raju@neterion.com>. His original description is: This patch enables TSO over IPv6. Currently Linux network stacks restricts TSO over IPv6 by clearing of the NETIF_F_TSO bit from "dev->features". This patch will remove this restriction. This patch will introduce a new flag NETIF_F_TSO6 which will be used to check whether device supports TSO over IPv6. If device support TSO over IPv6 then we don't clear of NETIF_F_TSO and which will make the TCP layer to create TSO packets. Any device supporting TSO over IPv6 will set NETIF_F_TSO6 flag in "dev->features" along with NETIF_F_TSO. In case when user disables TSO using ethtool, NETIF_F_TSO will get cleared from "dev->features". So even if we have NETIF_F_TSO6 we don't get TSO packets created by TCP layer. SKB_GSO_TCPV4 renamed to SKB_GSO_TCP to make it generic GSO packet. SKB_GSO_UDPV4 renamed to SKB_GSO_UDP as UFO is not a IPv4 feature. UFO is supported over IPv6 also The following table shows there is significant improvement in throughput with normal frames and CPU usage for both normal and jumbo. -------------------------------------------------- | | 1500 | 9600 | | ------------------|-------------------| | | thru CPU | thru CPU | -------------------------------------------------- | TSO OFF | 2.00 5.5% id | 5.66 20.0% id | -------------------------------------------------- | TSO ON | 2.63 78.0 id | 5.67 39.0% id | -------------------------------------------------- Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: David S. Miller <davem@davemloft.net>
2006-07-01 04:37:03 +08:00
sk->sk_gso_type = SKB_GSO_TCPV6;
ip6_dst_store(sk, dst, NULL, NULL);
icsk->icsk_ext_hdr_len = 0;
if (opt)
icsk->icsk_ext_hdr_len = opt->opt_flen +
opt->opt_nflen;
tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
inet->inet_dport = usin->sin6_port;
tcp_set_state(sk, TCP_SYN_SENT);
err = inet6_hash_connect(tcp_death_row, sk);
if (err)
goto late_failure;
sk_set_txhash(sk);
if (likely(!tp->repair)) {
if (!tp->write_seq)
WRITE_ONCE(tp->write_seq,
secure_tcpv6_seq(np->saddr.s6_addr32,
sk->sk_v6_daddr.s6_addr32,
inet->inet_sport,
inet->inet_dport));
tp->tsoffset = secure_tcpv6_ts_off(sock_net(sk),
np->saddr.s6_addr32,
sk->sk_v6_daddr.s6_addr32);
}
net/tcp-fastopen: Add new API support This patch adds a new socket option, TCP_FASTOPEN_CONNECT, as an alternative way to perform Fast Open on the active side (client). Prior to this patch, a client needs to replace the connect() call with sendto(MSG_FASTOPEN). This can be cumbersome for applications who want to use Fast Open: these socket operations are often done in lower layer libraries used by many other applications. Changing these libraries and/or the socket call sequences are not trivial. A more convenient approach is to perform Fast Open by simply enabling a socket option when the socket is created w/o changing other socket calls sequence: s = socket() create a new socket setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN_CONNECT …); newly introduced sockopt If set, new functionality described below will be used. Return ENOTSUPP if TFO is not supported or not enabled in the kernel. connect() With cookie present, return 0 immediately. With no cookie, initiate 3WHS with TFO cookie-request option and return -1 with errno = EINPROGRESS. write()/sendmsg() With cookie present, send out SYN with data and return the number of bytes buffered. With no cookie, and 3WHS not yet completed, return -1 with errno = EINPROGRESS. No MSG_FASTOPEN flag is needed. read() Return -1 with errno = EWOULDBLOCK/EAGAIN if connect() is called but write() is not called yet. Return -1 with errno = EWOULDBLOCK/EAGAIN if connection is established but no msg is received yet. Return number of bytes read if socket is established and there is msg received. The new API simplifies life for applications that always perform a write() immediately after a successful connect(). Such applications can now take advantage of Fast Open by merely making one new setsockopt() call at the time of creating the socket. Nothing else about the application's socket call sequence needs to change. Signed-off-by: Wei Wang <weiwan@google.com> Acked-by: Eric Dumazet <edumazet@google.com> Acked-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-01-24 02:59:22 +08:00
if (tcp_fastopen_defer_connect(sk, &err))
return err;
if (err)
goto late_failure;
err = tcp_connect(sk);
if (err)
goto late_failure;
return 0;
late_failure:
tcp_set_state(sk, TCP_CLOSE);
failure:
inet->inet_dport = 0;
sk->sk_route_caps = 0;
return err;
}
static void tcp_v6_mtu_reduced(struct sock *sk)
{
struct dst_entry *dst;
if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
return;
dst = inet6_csk_update_pmtu(sk, tcp_sk(sk)->mtu_info);
if (!dst)
return;
if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
tcp_sync_mss(sk, dst_mtu(dst));
tcp_simple_retransmit(sk);
}
}
static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
const struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
struct net *net = dev_net(skb->dev);
struct request_sock *fastopen;
struct ipv6_pinfo *np;
struct tcp_sock *tp;
__u32 seq, snd_una;
struct sock *sk;
bool fatal;
int err;
sk = __inet6_lookup_established(net, &tcp_hashinfo,
&hdr->daddr, th->dest,
&hdr->saddr, ntohs(th->source),
skb->dev->ifindex, inet6_sdif(skb));
if (!sk) {
__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
ICMP6_MIB_INERRORS);
return -ENOENT;
}
if (sk->sk_state == TCP_TIME_WAIT) {
inet_twsk_put(inet_twsk(sk));
return 0;
}
seq = ntohl(th->seq);
fatal = icmpv6_err_convert(type, code, &err);
if (sk->sk_state == TCP_NEW_SYN_RECV) {
tcp_req_err(sk, seq, fatal);
return 0;
}
bh_lock_sock(sk);
if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
if (sk->sk_state == TCP_CLOSE)
goto out;
if (ipv6_hdr(skb)->hop_limit < tcp_inet6_sk(sk)->min_hopcount) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto out;
}
tp = tcp_sk(sk);
/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
fastopen = rcu_dereference(tp->fastopen_rsk);
snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
if (sk->sk_state != TCP_LISTEN &&
!between(seq, snd_una, tp->snd_nxt)) {
__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
goto out;
}
np = tcp_inet6_sk(sk);
if (type == NDISC_REDIRECT) {
dccp/tcp: fix routing redirect race As Eric Dumazet pointed out this also needs to be fixed in IPv6. v2: Contains the IPv6 tcp/Ipv6 dccp patches as well. We have seen a few incidents lately where a dst_enty has been freed with a dangling TCP socket reference (sk->sk_dst_cache) pointing to that dst_entry. If the conditions/timings are right a crash then ensues when the freed dst_entry is referenced later on. A Common crashing back trace is: #8 [] page_fault at ffffffff8163e648 [exception RIP: __tcp_ack_snd_check+74] . . #9 [] tcp_rcv_established at ffffffff81580b64 #10 [] tcp_v4_do_rcv at ffffffff8158b54a #11 [] tcp_v4_rcv at ffffffff8158cd02 #12 [] ip_local_deliver_finish at ffffffff815668f4 #13 [] ip_local_deliver at ffffffff81566bd9 #14 [] ip_rcv_finish at ffffffff8156656d #15 [] ip_rcv at ffffffff81566f06 #16 [] __netif_receive_skb_core at ffffffff8152b3a2 #17 [] __netif_receive_skb at ffffffff8152b608 #18 [] netif_receive_skb at ffffffff8152b690 #19 [] vmxnet3_rq_rx_complete at ffffffffa015eeaf [vmxnet3] #20 [] vmxnet3_poll_rx_only at ffffffffa015f32a [vmxnet3] #21 [] net_rx_action at ffffffff8152bac2 #22 [] __do_softirq at ffffffff81084b4f #23 [] call_softirq at ffffffff8164845c #24 [] do_softirq at ffffffff81016fc5 #25 [] irq_exit at ffffffff81084ee5 #26 [] do_IRQ at ffffffff81648ff8 Of course it may happen with other NIC drivers as well. It's found the freed dst_entry here: 224 static bool tcp_in_quickack_mode(struct sock *sk)↩ 225 {↩ 226 ▹ const struct inet_connection_sock *icsk = inet_csk(sk);↩ 227 ▹ const struct dst_entry *dst = __sk_dst_get(sk);↩ 228 ↩ 229 ▹ return (dst && dst_metric(dst, RTAX_QUICKACK)) ||↩ 230 ▹ ▹ (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);↩ 231 }↩ But there are other backtraces attributed to the same freed dst_entry in netfilter code as well. All the vmcores showed 2 significant clues: - Remote hosts behind the default gateway had always been redirected to a different gateway. A rtable/dst_entry will be added for that host. Making more dst_entrys with lower reference counts. Making this more probable. - All vmcores showed a postitive LockDroppedIcmps value, e.g: LockDroppedIcmps 267 A closer look at the tcp_v4_err() handler revealed that do_redirect() will run regardless of whether user space has the socket locked. This can result in a race condition where the same dst_entry cached in sk->sk_dst_entry can be decremented twice for the same socket via: do_redirect()->__sk_dst_check()-> dst_release(). Which leads to the dst_entry being prematurely freed with another socket pointing to it via sk->sk_dst_cache and a subsequent crash. To fix this skip do_redirect() if usespace has the socket locked. Instead let the redirect take place later when user space does not have the socket locked. The dccp/IPv6 code is very similar in this respect, so fixing it there too. As Eric Garver pointed out the following commit now invalidates routes. Which can set the dst->obsolete flag so that ipv4_dst_check() returns null and triggers the dst_release(). Fixes: ceb3320610d6 ("ipv4: Kill routes during PMTU/redirect updates.") Cc: Eric Garver <egarver@redhat.com> Cc: Hannes Sowa <hsowa@redhat.com> Signed-off-by: Jon Maxwell <jmaxwell37@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-10 13:40:33 +08:00
if (!sock_owned_by_user(sk)) {
struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
dccp/tcp: fix routing redirect race As Eric Dumazet pointed out this also needs to be fixed in IPv6. v2: Contains the IPv6 tcp/Ipv6 dccp patches as well. We have seen a few incidents lately where a dst_enty has been freed with a dangling TCP socket reference (sk->sk_dst_cache) pointing to that dst_entry. If the conditions/timings are right a crash then ensues when the freed dst_entry is referenced later on. A Common crashing back trace is: #8 [] page_fault at ffffffff8163e648 [exception RIP: __tcp_ack_snd_check+74] . . #9 [] tcp_rcv_established at ffffffff81580b64 #10 [] tcp_v4_do_rcv at ffffffff8158b54a #11 [] tcp_v4_rcv at ffffffff8158cd02 #12 [] ip_local_deliver_finish at ffffffff815668f4 #13 [] ip_local_deliver at ffffffff81566bd9 #14 [] ip_rcv_finish at ffffffff8156656d #15 [] ip_rcv at ffffffff81566f06 #16 [] __netif_receive_skb_core at ffffffff8152b3a2 #17 [] __netif_receive_skb at ffffffff8152b608 #18 [] netif_receive_skb at ffffffff8152b690 #19 [] vmxnet3_rq_rx_complete at ffffffffa015eeaf [vmxnet3] #20 [] vmxnet3_poll_rx_only at ffffffffa015f32a [vmxnet3] #21 [] net_rx_action at ffffffff8152bac2 #22 [] __do_softirq at ffffffff81084b4f #23 [] call_softirq at ffffffff8164845c #24 [] do_softirq at ffffffff81016fc5 #25 [] irq_exit at ffffffff81084ee5 #26 [] do_IRQ at ffffffff81648ff8 Of course it may happen with other NIC drivers as well. It's found the freed dst_entry here: 224 static bool tcp_in_quickack_mode(struct sock *sk)↩ 225 {↩ 226 ▹ const struct inet_connection_sock *icsk = inet_csk(sk);↩ 227 ▹ const struct dst_entry *dst = __sk_dst_get(sk);↩ 228 ↩ 229 ▹ return (dst && dst_metric(dst, RTAX_QUICKACK)) ||↩ 230 ▹ ▹ (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);↩ 231 }↩ But there are other backtraces attributed to the same freed dst_entry in netfilter code as well. All the vmcores showed 2 significant clues: - Remote hosts behind the default gateway had always been redirected to a different gateway. A rtable/dst_entry will be added for that host. Making more dst_entrys with lower reference counts. Making this more probable. - All vmcores showed a postitive LockDroppedIcmps value, e.g: LockDroppedIcmps 267 A closer look at the tcp_v4_err() handler revealed that do_redirect() will run regardless of whether user space has the socket locked. This can result in a race condition where the same dst_entry cached in sk->sk_dst_entry can be decremented twice for the same socket via: do_redirect()->__sk_dst_check()-> dst_release(). Which leads to the dst_entry being prematurely freed with another socket pointing to it via sk->sk_dst_cache and a subsequent crash. To fix this skip do_redirect() if usespace has the socket locked. Instead let the redirect take place later when user space does not have the socket locked. The dccp/IPv6 code is very similar in this respect, so fixing it there too. As Eric Garver pointed out the following commit now invalidates routes. Which can set the dst->obsolete flag so that ipv4_dst_check() returns null and triggers the dst_release(). Fixes: ceb3320610d6 ("ipv4: Kill routes during PMTU/redirect updates.") Cc: Eric Garver <egarver@redhat.com> Cc: Hannes Sowa <hsowa@redhat.com> Signed-off-by: Jon Maxwell <jmaxwell37@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-10 13:40:33 +08:00
if (dst)
dst->ops->redirect(dst, sk, skb);
}
goto out;
}
if (type == ICMPV6_PKT_TOOBIG) {
/* We are not interested in TCP_LISTEN and open_requests
* (SYN-ACKs send out by Linux are always <576bytes so
* they should go through unfragmented).
*/
if (sk->sk_state == TCP_LISTEN)
goto out;
if (!ip6_sk_accept_pmtu(sk))
goto out;
tp->mtu_info = ntohl(info);
if (!sock_owned_by_user(sk))
tcp_v6_mtu_reduced(sk);
else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
&sk->sk_tsq_flags))
sock_hold(sk);
goto out;
}
/* Might be for an request_sock */
switch (sk->sk_state) {
case TCP_SYN_SENT:
case TCP_SYN_RECV:
/* Only in fast or simultaneous open. If a fast open socket is
* is already accepted it is treated as a connected one below.
*/
if (fastopen && !fastopen->sk)
break;
if (!sock_owned_by_user(sk)) {
sk->sk_err = err;
sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
tcp_done(sk);
} else
sk->sk_err_soft = err;
goto out;
}
if (!sock_owned_by_user(sk) && np->recverr) {
sk->sk_err = err;
sk->sk_error_report(sk);
} else
sk->sk_err_soft = err;
out:
bh_unlock_sock(sk);
sock_put(sk);
return 0;
}
static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
enum tcp_synack_type synack_type)
{
struct inet_request_sock *ireq = inet_rsk(req);
struct ipv6_pinfo *np = tcp_inet6_sk(sk);
struct ipv6_txoptions *opt;
struct flowi6 *fl6 = &fl->u.ip6;
struct sk_buff *skb;
int err = -ENOMEM;
/* First, grab a route. */
if (!dst && (dst = inet6_csk_route_req(sk, fl6, req,
IPPROTO_TCP)) == NULL)
goto done;
skb = tcp_make_synack(sk, dst, req, foc, synack_type);
if (skb) {
__tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr,
&ireq->ir_v6_rmt_addr);
fl6->daddr = ireq->ir_v6_rmt_addr;
if (np->repflow && ireq->pktopts)
fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts));
rcu_read_lock();
opt = ireq->ipv6_opt;
if (!opt)
opt = rcu_dereference(np->opt);
err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass,
sk->sk_priority);
rcu_read_unlock();
err = net_xmit_eval(err);
}
done:
return err;
}
static void tcp_v6_reqsk_destructor(struct request_sock *req)
{
kfree(inet_rsk(req)->ipv6_opt);
kfree_skb(inet_rsk(req)->pktopts);
}
#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk,
const struct in6_addr *addr)
{
return tcp_md5_do_lookup(sk, (union tcp_md5_addr *)addr, AF_INET6);
}
static struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk,
const struct sock *addr_sk)
{
return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr);
}
static int tcp_v6_parse_md5_keys(struct sock *sk, int optname,
char __user *optval, int optlen)
{
struct tcp_md5sig cmd;
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr;
u8 prefixlen;
if (optlen < sizeof(cmd))
return -EINVAL;
if (copy_from_user(&cmd, optval, sizeof(cmd)))
return -EFAULT;
if (sin6->sin6_family != AF_INET6)
return -EINVAL;
if (optname == TCP_MD5SIG_EXT &&
cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
prefixlen = cmd.tcpm_prefixlen;
if (prefixlen > 128 || (ipv6_addr_v4mapped(&sin6->sin6_addr) &&
prefixlen > 32))
return -EINVAL;
} else {
prefixlen = ipv6_addr_v4mapped(&sin6->sin6_addr) ? 32 : 128;
}
if (!cmd.tcpm_keylen) {
if (ipv6_addr_v4mapped(&sin6->sin6_addr))
return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3],
AF_INET, prefixlen);
return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr,
AF_INET6, prefixlen);
}
if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
return -EINVAL;
if (ipv6_addr_v4mapped(&sin6->sin6_addr))
return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3],
AF_INET, prefixlen, cmd.tcpm_key,
cmd.tcpm_keylen, GFP_KERNEL);
return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr,
AF_INET6, prefixlen, cmd.tcpm_key,
cmd.tcpm_keylen, GFP_KERNEL);
}
static int tcp_v6_md5_hash_headers(struct tcp_md5sig_pool *hp,
const struct in6_addr *daddr,
const struct in6_addr *saddr,
const struct tcphdr *th, int nbytes)
{
struct tcp6_pseudohdr *bp;
struct scatterlist sg;
struct tcphdr *_th;
bp = hp->scratch;
/* 1. TCP pseudo-header (RFC2460) */
bp->saddr = *saddr;
bp->daddr = *daddr;
bp->protocol = cpu_to_be32(IPPROTO_TCP);
bp->len = cpu_to_be32(nbytes);
_th = (struct tcphdr *)(bp + 1);
memcpy(_th, th, sizeof(*th));
_th->check = 0;
sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
ahash_request_set_crypt(hp->md5_req, &sg, NULL,
sizeof(*bp) + sizeof(*th));
return crypto_ahash_update(hp->md5_req);
}
static int tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
const struct in6_addr *daddr, struct in6_addr *saddr,
const struct tcphdr *th)
{
struct tcp_md5sig_pool *hp;
struct ahash_request *req;
hp = tcp_get_md5sig_pool();
if (!hp)
goto clear_hash_noput;
req = hp->md5_req;
if (crypto_ahash_init(req))
goto clear_hash;
if (tcp_v6_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
goto clear_hash;
if (tcp_md5_hash_key(hp, key))
goto clear_hash;
ahash_request_set_crypt(req, NULL, md5_hash, 0);
if (crypto_ahash_final(req))
goto clear_hash;
tcp_put_md5sig_pool();
return 0;
clear_hash:
tcp_put_md5sig_pool();
clear_hash_noput:
memset(md5_hash, 0, 16);
return 1;
}
static int tcp_v6_md5_hash_skb(char *md5_hash,
const struct tcp_md5sig_key *key,
const struct sock *sk,
const struct sk_buff *skb)
{
const struct in6_addr *saddr, *daddr;
struct tcp_md5sig_pool *hp;
struct ahash_request *req;
const struct tcphdr *th = tcp_hdr(skb);
if (sk) { /* valid for establish/request sockets */
saddr = &sk->sk_v6_rcv_saddr;
daddr = &sk->sk_v6_daddr;
} else {
const struct ipv6hdr *ip6h = ipv6_hdr(skb);
saddr = &ip6h->saddr;
daddr = &ip6h->daddr;
}
hp = tcp_get_md5sig_pool();
if (!hp)
goto clear_hash_noput;
req = hp->md5_req;
if (crypto_ahash_init(req))
goto clear_hash;
if (tcp_v6_md5_hash_headers(hp, daddr, saddr, th, skb->len))
goto clear_hash;
if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
goto clear_hash;
if (tcp_md5_hash_key(hp, key))
goto clear_hash;
ahash_request_set_crypt(req, NULL, md5_hash, 0);
if (crypto_ahash_final(req))
goto clear_hash;
tcp_put_md5sig_pool();
return 0;
clear_hash:
tcp_put_md5sig_pool();
clear_hash_noput:
memset(md5_hash, 0, 16);
return 1;
}
#endif
static bool tcp_v6_inbound_md5_hash(const struct sock *sk,
const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
const __u8 *hash_location = NULL;
struct tcp_md5sig_key *hash_expected;
const struct ipv6hdr *ip6h = ipv6_hdr(skb);
const struct tcphdr *th = tcp_hdr(skb);
int genhash;
u8 newhash[16];
hash_expected = tcp_v6_md5_do_lookup(sk, &ip6h->saddr);
hash_location = tcp_parse_md5sig_option(th);
/* We've parsed the options - do we have a hash? */
if (!hash_expected && !hash_location)
return false;
if (hash_expected && !hash_location) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
return true;
}
if (!hash_expected && hash_location) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
return true;
}
/* check the signature */
genhash = tcp_v6_md5_hash_skb(newhash,
hash_expected,
NULL, skb);
if (genhash || memcmp(hash_location, newhash, 16) != 0) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u\n",
genhash ? "failed" : "mismatch",
&ip6h->saddr, ntohs(th->source),
&ip6h->daddr, ntohs(th->dest));
return true;
}
#endif
return false;
}
static void tcp_v6_init_req(struct request_sock *req,
const struct sock *sk_listener,
struct sk_buff *skb)
{
bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags);
struct inet_request_sock *ireq = inet_rsk(req);
const struct ipv6_pinfo *np = tcp_inet6_sk(sk_listener);
ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
/* So that link locals have meaning */
if ((!sk_listener->sk_bound_dev_if || l3_slave) &&
ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
ireq->ir_iif = tcp_v6_iif(skb);
if (!TCP_SKB_CB(skb)->tcp_tw_isn &&
(ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) ||
np->rxopt.bits.rxinfo ||
np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim ||
np->rxopt.bits.rxohlim || np->repflow)) {
refcount_inc(&skb->users);
ireq->pktopts = skb;
}
}
static struct dst_entry *tcp_v6_route_req(const struct sock *sk,
struct flowi *fl,
const struct request_sock *req)
{
return inet6_csk_route_req(sk, &fl->u.ip6, req, IPPROTO_TCP);
}
struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
.family = AF_INET6,
.obj_size = sizeof(struct tcp6_request_sock),
.rtx_syn_ack = tcp_rtx_synack,
.send_ack = tcp_v6_reqsk_send_ack,
.destructor = tcp_v6_reqsk_destructor,
.send_reset = tcp_v6_send_reset,
.syn_ack_timeout = tcp_syn_ack_timeout,
};
static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) -
sizeof(struct ipv6hdr),
#ifdef CONFIG_TCP_MD5SIG
.req_md5_lookup = tcp_v6_md5_lookup,
.calc_md5_hash = tcp_v6_md5_hash_skb,
#endif
.init_req = tcp_v6_init_req,
#ifdef CONFIG_SYN_COOKIES
.cookie_init_seq = cookie_v6_init_sequence,
#endif
.route_req = tcp_v6_route_req,
.init_seq = tcp_v6_init_seq,
.init_ts_off = tcp_v6_init_ts_off,
.send_synack = tcp_v6_send_synack,
};
static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq,
u32 ack, u32 win, u32 tsval, u32 tsecr,
int oif, struct tcp_md5sig_key *key, int rst,
u8 tclass, __be32 label, u32 priority)
{
const struct tcphdr *th = tcp_hdr(skb);
struct tcphdr *t1;
struct sk_buff *buff;
struct flowi6 fl6;
struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
struct sock *ctl_sk = net->ipv6.tcp_sk;
unsigned int tot_len = sizeof(struct tcphdr);
struct dst_entry *dst;
tcpv6: convert opt[] -> topt in tcp_v6_send_reset after this I get: $ diff-funcs tcp_v6_send_reset tcp_ipv6.c tcp_ipv6.c tcp_v6_send_ack --- tcp_ipv6.c:tcp_v6_send_reset() +++ tcp_ipv6.c:tcp_v6_send_ack() @@ -1,4 +1,5 @@ -static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) +static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts, + struct tcp_md5sig_key *key) { struct tcphdr *th = tcp_hdr(skb), *t1; struct sk_buff *buff; @@ -7,31 +8,14 @@ struct sock *ctl_sk = net->ipv6.tcp_sk; unsigned int tot_len = sizeof(struct tcphdr); __be32 *topt; -#ifdef CONFIG_TCP_MD5SIG - struct tcp_md5sig_key *key; -#endif - - if (th->rst) - return; - - if (!ipv6_unicast_destination(skb)) - return; + if (ts) + tot_len += TCPOLEN_TSTAMP_ALIGNED; #ifdef CONFIG_TCP_MD5SIG - if (sk) - key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr); - else - key = NULL; - if (key) tot_len += TCPOLEN_MD5SIG_ALIGNED; #endif - /* - * We need to grab some memory, and put together an RST, - * and then put it into the queue to be sent. - */ - buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len, GFP_ATOMIC); if (buff == NULL) @@ -46,18 +30,20 @@ t1->dest = th->source; t1->source = th->dest; t1->doff = tot_len / 4; - t1->rst = 1; - - if(th->ack) { - t1->seq = th->ack_seq; - } else { - t1->ack = 1; - t1->ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin - + skb->len - (th->doff<<2)); - } + t1->seq = htonl(seq); + t1->ack_seq = htonl(ack); + t1->ack = 1; + t1->window = htons(win); topt = (__be32 *)(t1 + 1); + if (ts) { + *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + *topt++ = htonl(tcp_time_stamp); + *topt++ = htonl(ts); + } + #ifdef CONFIG_TCP_MD5SIG if (key) { *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | @@ -84,15 +70,10 @@ fl.fl_ip_sport = t1->source; security_skb_classify_flow(skb, &fl); - /* Pass a socket to ip6_dst_lookup either it is for RST - * Underlying function will use this to retrieve the network - * namespace - */ if (!ip6_dst_lookup(ctl_sk, &buff->dst, &fl)) { if (xfrm_lookup(&buff->dst, &fl, NULL, 0) >= 0) { ip6_xmit(ctl_sk, buff, &fl, NULL, 0); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); - TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); return; } } ...which starts to be trivial to combine. Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-10-10 05:42:01 +08:00
__be32 *topt;
__u32 mark = 0;
if (tsecr)
tot_len += TCPOLEN_TSTAMP_ALIGNED;
#ifdef CONFIG_TCP_MD5SIG
if (key)
tot_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
GFP_ATOMIC);
if (!buff)
return;
skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len);
t1 = skb_push(buff, tot_len);
skb_reset_transport_header(buff);
/* Swap the send and the receive. */
memset(t1, 0, sizeof(*t1));
t1->dest = th->source;
t1->source = th->dest;
t1->doff = tot_len / 4;
t1->seq = htonl(seq);
t1->ack_seq = htonl(ack);
t1->ack = !rst || !th->ack;
t1->rst = rst;
t1->window = htons(win);
tcpv6: convert opt[] -> topt in tcp_v6_send_reset after this I get: $ diff-funcs tcp_v6_send_reset tcp_ipv6.c tcp_ipv6.c tcp_v6_send_ack --- tcp_ipv6.c:tcp_v6_send_reset() +++ tcp_ipv6.c:tcp_v6_send_ack() @@ -1,4 +1,5 @@ -static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) +static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts, + struct tcp_md5sig_key *key) { struct tcphdr *th = tcp_hdr(skb), *t1; struct sk_buff *buff; @@ -7,31 +8,14 @@ struct sock *ctl_sk = net->ipv6.tcp_sk; unsigned int tot_len = sizeof(struct tcphdr); __be32 *topt; -#ifdef CONFIG_TCP_MD5SIG - struct tcp_md5sig_key *key; -#endif - - if (th->rst) - return; - - if (!ipv6_unicast_destination(skb)) - return; + if (ts) + tot_len += TCPOLEN_TSTAMP_ALIGNED; #ifdef CONFIG_TCP_MD5SIG - if (sk) - key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr); - else - key = NULL; - if (key) tot_len += TCPOLEN_MD5SIG_ALIGNED; #endif - /* - * We need to grab some memory, and put together an RST, - * and then put it into the queue to be sent. - */ - buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len, GFP_ATOMIC); if (buff == NULL) @@ -46,18 +30,20 @@ t1->dest = th->source; t1->source = th->dest; t1->doff = tot_len / 4; - t1->rst = 1; - - if(th->ack) { - t1->seq = th->ack_seq; - } else { - t1->ack = 1; - t1->ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin - + skb->len - (th->doff<<2)); - } + t1->seq = htonl(seq); + t1->ack_seq = htonl(ack); + t1->ack = 1; + t1->window = htons(win); topt = (__be32 *)(t1 + 1); + if (ts) { + *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + *topt++ = htonl(tcp_time_stamp); + *topt++ = htonl(ts); + } + #ifdef CONFIG_TCP_MD5SIG if (key) { *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | @@ -84,15 +70,10 @@ fl.fl_ip_sport = t1->source; security_skb_classify_flow(skb, &fl); - /* Pass a socket to ip6_dst_lookup either it is for RST - * Underlying function will use this to retrieve the network - * namespace - */ if (!ip6_dst_lookup(ctl_sk, &buff->dst, &fl)) { if (xfrm_lookup(&buff->dst, &fl, NULL, 0) >= 0) { ip6_xmit(ctl_sk, buff, &fl, NULL, 0); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); - TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); return; } } ...which starts to be trivial to combine. Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-10-10 05:42:01 +08:00
topt = (__be32 *)(t1 + 1);
if (tsecr) {
*topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
(TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
*topt++ = htonl(tsval);
*topt++ = htonl(tsecr);
}
#ifdef CONFIG_TCP_MD5SIG
if (key) {
tcpv6: convert opt[] -> topt in tcp_v6_send_reset after this I get: $ diff-funcs tcp_v6_send_reset tcp_ipv6.c tcp_ipv6.c tcp_v6_send_ack --- tcp_ipv6.c:tcp_v6_send_reset() +++ tcp_ipv6.c:tcp_v6_send_ack() @@ -1,4 +1,5 @@ -static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) +static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts, + struct tcp_md5sig_key *key) { struct tcphdr *th = tcp_hdr(skb), *t1; struct sk_buff *buff; @@ -7,31 +8,14 @@ struct sock *ctl_sk = net->ipv6.tcp_sk; unsigned int tot_len = sizeof(struct tcphdr); __be32 *topt; -#ifdef CONFIG_TCP_MD5SIG - struct tcp_md5sig_key *key; -#endif - - if (th->rst) - return; - - if (!ipv6_unicast_destination(skb)) - return; + if (ts) + tot_len += TCPOLEN_TSTAMP_ALIGNED; #ifdef CONFIG_TCP_MD5SIG - if (sk) - key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr); - else - key = NULL; - if (key) tot_len += TCPOLEN_MD5SIG_ALIGNED; #endif - /* - * We need to grab some memory, and put together an RST, - * and then put it into the queue to be sent. - */ - buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len, GFP_ATOMIC); if (buff == NULL) @@ -46,18 +30,20 @@ t1->dest = th->source; t1->source = th->dest; t1->doff = tot_len / 4; - t1->rst = 1; - - if(th->ack) { - t1->seq = th->ack_seq; - } else { - t1->ack = 1; - t1->ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin - + skb->len - (th->doff<<2)); - } + t1->seq = htonl(seq); + t1->ack_seq = htonl(ack); + t1->ack = 1; + t1->window = htons(win); topt = (__be32 *)(t1 + 1); + if (ts) { + *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + *topt++ = htonl(tcp_time_stamp); + *topt++ = htonl(ts); + } + #ifdef CONFIG_TCP_MD5SIG if (key) { *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | @@ -84,15 +70,10 @@ fl.fl_ip_sport = t1->source; security_skb_classify_flow(skb, &fl); - /* Pass a socket to ip6_dst_lookup either it is for RST - * Underlying function will use this to retrieve the network - * namespace - */ if (!ip6_dst_lookup(ctl_sk, &buff->dst, &fl)) { if (xfrm_lookup(&buff->dst, &fl, NULL, 0) >= 0) { ip6_xmit(ctl_sk, buff, &fl, NULL, 0); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); - TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); return; } } ...which starts to be trivial to combine. Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-10-10 05:42:01 +08:00
*topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
(TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
tcp_v6_md5_hash_hdr((__u8 *)topt, key,
&ipv6_hdr(skb)->saddr,
&ipv6_hdr(skb)->daddr, t1);
}
#endif
memset(&fl6, 0, sizeof(fl6));
fl6.daddr = ipv6_hdr(skb)->saddr;
fl6.saddr = ipv6_hdr(skb)->daddr;
fl6.flowlabel = label;
buff->ip_summed = CHECKSUM_PARTIAL;
buff->csum = 0;
__tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr);
fl6.flowi6_proto = IPPROTO_TCP;
if (rt6_need_strict(&fl6.daddr) && !oif)
fl6.flowi6_oif = tcp_v6_iif(skb);
else {
if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
oif = skb->skb_iif;
fl6.flowi6_oif = oif;
}
if (sk) {
if (sk->sk_state == TCP_TIME_WAIT) {
mark = inet_twsk(sk)->tw_mark;
/* autoflowlabel relies on buff->hash */
skb_set_hash(buff, inet_twsk(sk)->tw_txhash,
PKT_HASH_TYPE_L4);
} else {
mark = sk->sk_mark;
}
buff->tstamp = tcp_transmit_time(sk);
}
fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark;
fl6.fl6_dport = t1->dest;
fl6.fl6_sport = t1->source;
fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
/* Pass a socket to ip6_dst_lookup either it is for RST
* Underlying function will use this to retrieve the network
* namespace
*/
dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL);
if (!IS_ERR(dst)) {
skb_dst_set(buff, dst);
ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass,
priority);
TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
if (rst)
TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
return;
}
kfree_skb(buff);
}
static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
{
const struct tcphdr *th = tcp_hdr(skb);
struct ipv6hdr *ipv6h = ipv6_hdr(skb);
u32 seq = 0, ack_seq = 0;
struct tcp_md5sig_key *key = NULL;
#ifdef CONFIG_TCP_MD5SIG
const __u8 *hash_location = NULL;
unsigned char newhash[16];
int genhash;
struct sock *sk1 = NULL;
#endif
__be32 label = 0;
u32 priority = 0;
struct net *net;
int oif = 0;
if (th->rst)
return;
/* If sk not NULL, it means we did a successful lookup and incoming
* route had to be correct. prequeue might have dropped our dst.
*/
if (!sk && !ipv6_unicast_destination(skb))
return;
net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
rcu_read_lock();
hash_location = tcp_parse_md5sig_option(th);
if (sk && sk_fullsock(sk)) {
key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr);
} else if (hash_location) {
/*
* active side is lost. Try to find listening socket through
* source port, and then find md5 key through listening socket.
* we are not loose security here:
* Incoming packet is checked with md5 hash with finding key,
* no RST generated if md5 hash doesn't match.
*/
sk1 = inet6_lookup_listener(net,
&tcp_hashinfo, NULL, 0,
&ipv6h->saddr,
th->source, &ipv6h->daddr,
ntohs(th->source),
tcp_v6_iif_l3_slave(skb),
tcp_v6_sdif(skb));
if (!sk1)
goto out;
key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr);
if (!key)
goto out;
genhash = tcp_v6_md5_hash_skb(newhash, key, NULL, skb);
if (genhash || memcmp(hash_location, newhash, 16) != 0)
goto out;
}
#endif
if (th->ack)
seq = ntohl(th->ack_seq);
else
ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len -
(th->doff << 2);
if (sk) {
oif = sk->sk_bound_dev_if;
if (sk_fullsock(sk)) {
const struct ipv6_pinfo *np = tcp_inet6_sk(sk);
trace_tcp_send_reset(sk, skb);
if (np->repflow)
label = ip6_flowlabel(ipv6h);
priority = sk->sk_priority;
}
if (sk->sk_state == TCP_TIME_WAIT) {
label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel);
priority = inet_twsk(sk)->tw_priority;
}
} else {
if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET)
label = ip6_flowlabel(ipv6h);
}
tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0,
label, priority);
#ifdef CONFIG_TCP_MD5SIG
out:
rcu_read_unlock();
#endif
}
static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq,
u32 ack, u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key, u8 tclass,
__be32 label, u32 priority)
{
tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0,
tclass, label, priority);
}
static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
struct inet_timewait_sock *tw = inet_twsk(sk);
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcp_time_stamp_raw() + tcptw->tw_ts_offset,
tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw),
tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority);
inet_twsk_put(tw);
}
static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
tcp: Fix kernel panic when calling tcp_v(4/6)_md5_do_lookup If the following packet flow happen, kernel will panic. MathineA MathineB SYN ----------------------> SYN+ACK <---------------------- ACK(bad seq) ----------------------> When a bad seq ACK is received, tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr)) is finally called by tcp_v4_reqsk_send_ack(), but the first parameter(skb->sk) is NULL at that moment, so kernel panic happens. This patch fixes this bug. OOPS output is as following: [ 302.812793] IP: [<c05cfaa6>] tcp_v4_md5_do_lookup+0x12/0x42 [ 302.817075] Oops: 0000 [#1] SMP [ 302.819815] Modules linked in: ipv6 loop dm_multipath rtc_cmos rtc_core rtc_lib pcspkr pcnet32 mii i2c_piix4 parport_pc i2c_core parport ac button ata_piix libata dm_mod mptspi mptscsih mptbase scsi_transport_spi sd_mod scsi_mod crc_t10dif ext3 jbd mbcache uhci_hcd ohci_hcd ehci_hcd [last unloaded: scsi_wait_scan] [ 302.849946] [ 302.851198] Pid: 0, comm: swapper Not tainted (2.6.27-rc1-guijf #5) [ 302.855184] EIP: 0060:[<c05cfaa6>] EFLAGS: 00010296 CPU: 0 [ 302.858296] EIP is at tcp_v4_md5_do_lookup+0x12/0x42 [ 302.861027] EAX: 0000001e EBX: 00000000 ECX: 00000046 EDX: 00000046 [ 302.864867] ESI: ceb69e00 EDI: 1467a8c0 EBP: cf75f180 ESP: c0792e54 [ 302.868333] DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 [ 302.871287] Process swapper (pid: 0, ti=c0792000 task=c0712340 task.ti=c0746000) [ 302.875592] Stack: c06f413a 00000000 cf75f180 ceb69e00 00000000 c05d0d86 000016d0 ceac5400 [ 302.883275] c05d28f8 000016d0 ceb69e00 ceb69e20 681bf6e3 00001000 00000000 0a67a8c0 [ 302.890971] ceac5400 c04250a3 c06f413a c0792eb0 c0792edc cf59a620 cf59a620 cf59a634 [ 302.900140] Call Trace: [ 302.902392] [<c05d0d86>] tcp_v4_reqsk_send_ack+0x17/0x35 [ 302.907060] [<c05d28f8>] tcp_check_req+0x156/0x372 [ 302.910082] [<c04250a3>] printk+0x14/0x18 [ 302.912868] [<c05d0aa1>] tcp_v4_do_rcv+0x1d3/0x2bf [ 302.917423] [<c05d26be>] tcp_v4_rcv+0x563/0x5b9 [ 302.920453] [<c05bb20f>] ip_local_deliver_finish+0xe8/0x183 [ 302.923865] [<c05bb10a>] ip_rcv_finish+0x286/0x2a3 [ 302.928569] [<c059e438>] dev_alloc_skb+0x11/0x25 [ 302.931563] [<c05a211f>] netif_receive_skb+0x2d6/0x33a [ 302.934914] [<d0917941>] pcnet32_poll+0x333/0x680 [pcnet32] [ 302.938735] [<c05a3b48>] net_rx_action+0x5c/0xfe [ 302.941792] [<c042856b>] __do_softirq+0x5d/0xc1 [ 302.944788] [<c042850e>] __do_softirq+0x0/0xc1 [ 302.948999] [<c040564b>] do_softirq+0x55/0x88 [ 302.951870] [<c04501b1>] handle_fasteoi_irq+0x0/0xa4 [ 302.954986] [<c04284da>] irq_exit+0x35/0x69 [ 302.959081] [<c0405717>] do_IRQ+0x99/0xae [ 302.961896] [<c040422b>] common_interrupt+0x23/0x28 [ 302.966279] [<c040819d>] default_idle+0x2a/0x3d [ 302.969212] [<c0402552>] cpu_idle+0xb2/0xd2 [ 302.972169] ======================= [ 302.974274] Code: fc ff 84 d2 0f 84 df fd ff ff e9 34 fe ff ff 83 c4 0c 5b 5e 5f 5d c3 90 90 57 89 d7 56 53 89 c3 50 68 3a 41 6f c0 e8 e9 55 e5 ff <8b> 93 9c 04 00 00 58 85 d2 59 74 1e 8b 72 10 31 db 31 c9 85 f6 [ 303.011610] EIP: [<c05cfaa6>] tcp_v4_md5_do_lookup+0x12/0x42 SS:ESP 0068:c0792e54 [ 303.018360] Kernel panic - not syncing: Fatal exception in interrupt Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-08-07 14:50:04 +08:00
struct request_sock *req)
{
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
* sk->sk_state == TCP_SYN_RECV -> for Fast Open.
*/
/* RFC 7323 2.3
* The window field (SEG.WND) of every outgoing segment, with the
* exception of <SYN> segments, MUST be right-shifted by
* Rcv.Wind.Shift bits:
*/
tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ?
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
tcp_rsk(req)->rcv_nxt,
req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
req->ts_recent, sk->sk_bound_dev_if,
tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr),
0, 0, sk->sk_priority);
}
static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
const struct tcphdr *th = tcp_hdr(skb);
if (!th->syn)
sk = cookie_v6_check(sk, skb);
#endif
return sk;
}
u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph,
struct tcphdr *th, u32 *cookie)
{
u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
mss = tcp_get_syncookie_mss(&tcp6_request_sock_ops,
&tcp_request_sock_ipv6_ops, sk, th);
if (mss) {
*cookie = __cookie_v6_init_sequence(iph, th, &mss);
tcp_synq_overflow(sk);
}
#endif
return mss;
}
static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
if (skb->protocol == htons(ETH_P_IP))
return tcp_v4_conn_request(sk, skb);
if (!ipv6_unicast_destination(skb))
goto drop;
return tcp_conn_request(&tcp6_request_sock_ops,
&tcp_request_sock_ipv6_ops, sk, skb);
drop:
tcp_listendrop(sk);
return 0; /* don't send reset */
}
static void tcp_v6_restore_cb(struct sk_buff *skb)
{
/* We need to move header back to the beginning if xfrm6_policy_check()
* and tcp_v6_fill_cb() are going to be called again.
* ip6_datagram_recv_specific_ctl() also expects IP6CB to be there.
*/
memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6,
sizeof(struct inet6_skb_parm));
}
static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst,
struct request_sock *req_unhash,
bool *own_req)
{
struct inet_request_sock *ireq;
struct ipv6_pinfo *newnp;
const struct ipv6_pinfo *np = tcp_inet6_sk(sk);
struct ipv6_txoptions *opt;
struct inet_sock *newinet;
struct tcp_sock *newtp;
struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *key;
#endif
struct flowi6 fl6;
if (skb->protocol == htons(ETH_P_IP)) {
/*
* v6 mapped
*/
newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst,
req_unhash, own_req);
if (!newsk)
return NULL;
inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk);
newinet = inet_sk(newsk);
newnp = tcp_inet6_sk(newsk);
newtp = tcp_sk(newsk);
memcpy(newnp, np, sizeof(struct ipv6_pinfo));
newnp->saddr = newsk->sk_v6_rcv_saddr;
inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
newsk->sk_backlog_rcv = tcp_v4_do_rcv;
#ifdef CONFIG_TCP_MD5SIG
newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
#endif
newnp->ipv6_mc_list = NULL;
newnp->ipv6_ac_list = NULL;
newnp->ipv6_fl_list = NULL;
newnp->pktoptions = NULL;
newnp->opt = NULL;
newnp->mcast_oif = inet_iif(skb);
newnp->mcast_hops = ip_hdr(skb)->ttl;
newnp->rcv_flowinfo = 0;
if (np->repflow)
newnp->flow_label = 0;
/*
* No need to charge this sock to the relevant IPv6 refcnt debug socks count
* here, tcp_create_openreq_child now does this for us, see the comment in
* that function for the gory details. -acme
*/
/* It is tricky place. Until this moment IPv4 tcp
worked with IPv6 icsk.icsk_af_ops.
Sync it now.
*/
tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie);
return newsk;
}
ireq = inet_rsk(req);
if (sk_acceptq_is_full(sk))
goto out_overflow;
if (!dst) {
dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_TCP);
if (!dst)
goto out;
}
newsk = tcp_create_openreq_child(sk, req, skb);
if (!newsk)
goto out_nonewsk;
/*
* No need to charge this sock to the relevant IPv6 refcnt debug socks
* count here, tcp_create_openreq_child now does this for us, see the
* comment in that function for the gory details. -acme
*/
newsk->sk_gso_type = SKB_GSO_TCPV6;
ip6_dst_store(newsk, dst, NULL, NULL);
inet6_sk_rx_dst_set(newsk, skb);
inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk);
newtp = tcp_sk(newsk);
newinet = inet_sk(newsk);
newnp = tcp_inet6_sk(newsk);
memcpy(newnp, np, sizeof(struct ipv6_pinfo));
newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr;
newnp->saddr = ireq->ir_v6_loc_addr;
newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr;
newsk->sk_bound_dev_if = ireq->ir_iif;
/* Now IPv6 options...
First: no IPv4 options.
*/
newinet->inet_opt = NULL;
newnp->ipv6_mc_list = NULL;
newnp->ipv6_ac_list = NULL;
newnp->ipv6_fl_list = NULL;
/* Clone RX bits */
newnp->rxopt.all = np->rxopt.all;
newnp->pktoptions = NULL;
newnp->opt = NULL;
newnp->mcast_oif = tcp_v6_iif(skb);
newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb));
if (np->repflow)
newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb));
/* Clone native IPv6 options from listening socket (if any)
Yes, keeping reference count would be much more clever,
but we make one more one thing there: reattach optmem
to newsk.
*/
opt = ireq->ipv6_opt;
if (!opt)
opt = rcu_dereference(np->opt);
if (opt) {
opt = ipv6_dup_options(newsk, opt);
RCU_INIT_POINTER(newnp->opt, opt);
}
inet_csk(newsk)->icsk_ext_hdr_len = 0;
if (opt)
inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen +
opt->opt_flen;
net: tcp: add per route congestion control This work adds the possibility to define a per route/destination congestion control algorithm. Generally, this opens up the possibility for a machine with different links to enforce specific congestion control algorithms with optimal strategies for each of them based on their network characteristics, even transparently for a single application listening on all links. For our specific use case, this additionally facilitates deployment of DCTCP, for example, applications can easily serve internal traffic/dsts in DCTCP and external one with CUBIC. Other scenarios would also allow for utilizing e.g. long living, low priority background flows for certain destinations/routes while still being able for normal traffic to utilize the default congestion control algorithm. We also thought about a per netns setting (where different defaults are possible), but given its actually a link specific property, we argue that a per route/destination setting is the most natural and flexible. The administrator can utilize this through ip-route(8) by appending "congctl [lock] <name>", where <name> denotes the name of a congestion control algorithm and the optional lock parameter allows to enforce the given algorithm so that applications in user space would not be allowed to overwrite that algorithm for that destination. The dst metric lookups are being done when a dst entry is already available in order to avoid a costly lookup and still before the algorithms are being initialized, thus overhead is very low when the feature is not being used. While the client side would need to drop the current reference on the module, on server side this can actually even be avoided as we just got a flat-copied socket clone. Joint work with Florian Westphal. Suggested-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: Florian Westphal <fw@strlen.de> Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-06 06:57:48 +08:00
tcp_ca_openreq_child(newsk, dst);
tcp_sync_mss(newsk, dst_mtu(dst));
newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
tcp_initialize_rcv_mss(newsk);
newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
#ifdef CONFIG_TCP_MD5SIG
/* Copy over the MD5 key from the original socket */
key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr);
if (key) {
/* We're using one, so create a matching key
* on the newsk structure. If we fail to get
* memory, then we end up not copying the key
* across. Shucks.
*/
tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newsk->sk_v6_daddr,
AF_INET6, 128, key->key, key->keylen,
sk_gfp_mask(sk, GFP_ATOMIC));
}
#endif
if (__inet_inherit_port(sk, newsk) < 0) {
inet: Fix kmemleak in tcp_v4/6_syn_recv_sock and dccp_v4/6_request_recv_sock If in either of the above functions inet_csk_route_child_sock() or __inet_inherit_port() fails, the newsk will not be freed: unreferenced object 0xffff88022e8a92c0 (size 1592): comm "softirq", pid 0, jiffies 4294946244 (age 726.160s) hex dump (first 32 bytes): 0a 01 01 01 0a 01 01 02 00 00 00 00 a7 cc 16 00 ................ 02 00 03 01 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<ffffffff8153d190>] kmemleak_alloc+0x21/0x3e [<ffffffff810ab3e7>] kmem_cache_alloc+0xb5/0xc5 [<ffffffff8149b65b>] sk_prot_alloc.isra.53+0x2b/0xcd [<ffffffff8149b784>] sk_clone_lock+0x16/0x21e [<ffffffff814d711a>] inet_csk_clone_lock+0x10/0x7b [<ffffffff814ebbc3>] tcp_create_openreq_child+0x21/0x481 [<ffffffff814e8fa5>] tcp_v4_syn_recv_sock+0x3a/0x23b [<ffffffff814ec5ba>] tcp_check_req+0x29f/0x416 [<ffffffff814e8e10>] tcp_v4_do_rcv+0x161/0x2bc [<ffffffff814eb917>] tcp_v4_rcv+0x6c9/0x701 [<ffffffff814cea9f>] ip_local_deliver_finish+0x70/0xc4 [<ffffffff814cec20>] ip_local_deliver+0x4e/0x7f [<ffffffff814ce9f8>] ip_rcv_finish+0x1fc/0x233 [<ffffffff814cee68>] ip_rcv+0x217/0x267 [<ffffffff814a7bbe>] __netif_receive_skb+0x49e/0x553 [<ffffffff814a7cc3>] netif_receive_skb+0x50/0x82 This happens, because sk_clone_lock initializes sk_refcnt to 2, and thus a single sock_put() is not enough to free the memory. Additionally, things like xfrm, memcg, cookie_values,... may have been initialized. We have to free them properly. This is fixed by forcing a call to tcp_done(), ending up in inet_csk_destroy_sock, doing the final sock_put(). tcp_done() is necessary, because it ends up doing all the cleanup on xfrm, memcg, cookie_values, xfrm,... Before calling tcp_done, we have to set the socket to SOCK_DEAD, to force it entering inet_csk_destroy_sock. To avoid the warning in inet_csk_destroy_sock, inet_num has to be set to 0. As inet_csk_destroy_sock does a dec on orphan_count, we first have to increase it. Calling tcp_done() allows us to remove the calls to tcp_clear_xmit_timer() and tcp_cleanup_congestion_control(). A similar approach is taken for dccp by calling dccp_done(). This is in the kernel since 093d282321 (tproxy: fix hash locking issue when using port redirection in __inet_inherit_port()), thus since version >= 2.6.37. Signed-off-by: Christoph Paasch <christoph.paasch@uclouvain.be> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-12-14 12:07:58 +08:00
inet_csk_prepare_forced_close(newsk);
tcp_done(newsk);
goto out;
}
*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
if (*own_req) {
tcp_move_syn(newtp, req);
/* Clone pktoptions received with SYN, if we own the req */
if (ireq->pktopts) {
newnp->pktoptions = skb_clone(ireq->pktopts,
sk_gfp_mask(sk, GFP_ATOMIC));
consume_skb(ireq->pktopts);
ireq->pktopts = NULL;
if (newnp->pktoptions) {
tcp_v6_restore_cb(newnp->pktoptions);
skb_set_owner_r(newnp->pktoptions, newsk);
}
}
}
return newsk;
out_overflow:
__NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
out_nonewsk:
dst_release(dst);
out:
tcp_listendrop(sk);
return NULL;
}
/* The socket must have it's spinlock held when we get
* here, unless it is a TCP_LISTEN socket.
*
* We have a potential double-lock case here, so even when
* doing backlog processing we use the BH locking scheme.
* This is because we cannot sleep with the original spinlock
* held.
*/
static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
{
struct ipv6_pinfo *np = tcp_inet6_sk(sk);
struct sk_buff *opt_skb = NULL;
struct tcp_sock *tp;
/* Imagine: socket is IPv6. IPv4 packet arrives,
goes to IPv4 receive handler and backlogged.
From backlog it always goes here. Kerboom...
Fortunately, tcp_rcv_established and rcv_established
handle them correctly, but it is not case with
tcp_v6_hnd_req and tcp_v6_send_reset(). --ANK
*/
if (skb->protocol == htons(ETH_P_IP))
return tcp_v4_do_rcv(sk, skb);
/*
* socket locking is here for SMP purposes as backlog rcv
* is currently called with bh processing disabled.
*/
/* Do Stevens' IPV6_PKTOPTIONS.
Yes, guys, it is the only place in our code, where we
may make it not affecting IPv4.
The rest of code is protocol independent,
and I do not like idea to uglify IPv4.
Actually, all the idea behind IPV6_PKTOPTIONS
looks not very well thought. For now we latch
options, received in the last packet, enqueued
by tcp. Feel free to propose better solution.
--ANK (980728)
*/
if (np->rxopt.all)
opt_skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC));
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
struct dst_entry *dst = sk->sk_rx_dst;
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
if (dst) {
if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
dst->ops->check(dst, np->rx_dst_cookie) == NULL) {
dst_release(dst);
sk->sk_rx_dst = NULL;
}
}
tcp_rcv_established(sk, skb);
if (opt_skb)
goto ipv6_pktoptions;
return 0;
}
if (tcp_checksum_complete(skb))
goto csum_err;
if (sk->sk_state == TCP_LISTEN) {
struct sock *nsk = tcp_v6_cookie_check(sk, skb);
if (!nsk)
goto discard;
if (nsk != sk) {
if (tcp_child_process(sk, nsk, skb))
goto reset;
if (opt_skb)
__kfree_skb(opt_skb);
return 0;
}
} else
sock_rps_save_rxhash(sk, skb);
if (tcp_rcv_state_process(sk, skb))
goto reset;
if (opt_skb)
goto ipv6_pktoptions;
return 0;
reset:
tcp_v6_send_reset(sk, skb);
discard:
if (opt_skb)
__kfree_skb(opt_skb);
kfree_skb(skb);
return 0;
csum_err:
TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
ipv6_pktoptions:
/* Do you ask, what is it?
1. skb was enqueued by tcp.
2. skb is added to tail of read queue, rather than out of order.
3. socket is not in passive state.
4. Finally, it really contains options, which user wants to receive.
*/
tp = tcp_sk(sk);
if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt &&
!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo)
np->mcast_oif = tcp_v6_iif(opt_skb);
if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)
np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit;
if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass)
np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb));
if (np->repflow)
np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb));
if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) {
skb_set_owner_r(opt_skb, sk);
ipv6: tcp: restore IP6CB for pktoptions skbs Baozeng Ding reported following KASAN splat : BUG: KASAN: use-after-free in ip6_datagram_recv_specific_ctl+0x13f1/0x15c0 at addr ffff880029c84ec8 Read of size 1 by task poc/25548 Call Trace: [<ffffffff82cf43c9>] dump_stack+0x12e/0x185 /lib/dump_stack.c:15 [< inline >] print_address_description /mm/kasan/report.c:204 [<ffffffff817ced3b>] kasan_report_error+0x48b/0x4b0 /mm/kasan/report.c:283 [< inline >] kasan_report /mm/kasan/report.c:303 [<ffffffff817ced9e>] __asan_report_load1_noabort+0x3e/0x40 /mm/kasan/report.c:321 [<ffffffff85c71da1>] ip6_datagram_recv_specific_ctl+0x13f1/0x15c0 /net/ipv6/datagram.c:687 [<ffffffff85c734c3>] ip6_datagram_recv_ctl+0x33/0x40 [<ffffffff85c0b07c>] do_ipv6_getsockopt.isra.4+0xaec/0x2150 [<ffffffff85c0c7f6>] ipv6_getsockopt+0x116/0x230 [<ffffffff859b5a12>] tcp_getsockopt+0x82/0xd0 /net/ipv4/tcp.c:3035 [<ffffffff855fb385>] sock_common_getsockopt+0x95/0xd0 /net/core/sock.c:2647 [< inline >] SYSC_getsockopt /net/socket.c:1776 [<ffffffff855f8ba2>] SyS_getsockopt+0x142/0x230 /net/socket.c:1758 [<ffffffff8685cdc5>] entry_SYSCALL_64_fastpath+0x23/0xc6 Memory state around the buggy address: ffff880029c84d80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff880029c84e00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff > ffff880029c84e80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ ffff880029c84f00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff880029c84f80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff He also provided a syzkaller reproducer. Issue is that ip6_datagram_recv_specific_ctl() expects to find IP6CB data that was moved at a different place in tcp_v6_rcv() This patch moves tcp_v6_restore_cb() up and calls it from tcp_v6_do_rcv() when np->pktoptions is set. Fixes: 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache line misses") Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: Baozeng Ding <sploving1@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-13 01:01:45 +08:00
tcp_v6_restore_cb(opt_skb);
opt_skb = xchg(&np->pktoptions, opt_skb);
} else {
__kfree_skb(opt_skb);
opt_skb = xchg(&np->pktoptions, NULL);
}
}
kfree_skb(opt_skb);
return 0;
}
static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
const struct tcphdr *th)
{
/* This is tricky: we move IP6CB at its correct location into
* TCP_SKB_CB(). It must be done after xfrm6_policy_check(), because
* _decode_session6() uses IP6CB().
* barrier() makes sure compiler won't play aliasing games.
*/
memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb),
sizeof(struct inet6_skb_parm));
barrier();
TCP_SKB_CB(skb)->seq = ntohl(th->seq);
TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
skb->len - th->doff*4);
TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
TCP_SKB_CB(skb)->tcp_tw_isn = 0;
TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr);
TCP_SKB_CB(skb)->sacked = 0;
TCP_SKB_CB(skb)->has_rxtstamp =
skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
{
tcp: add one skb cache for rx Often times, recvmsg() system calls and BH handling for a particular TCP socket are done on different cpus. This means the incoming skb had to be allocated on a cpu, but freed on another. This incurs a high spinlock contention in slab layer for small rpc, but also a high number of cache line ping pongs for larger packets. A full size GRO packet might use 45 page fragments, meaning that up to 45 put_page() can be involved. More over performing the __kfree_skb() in the recvmsg() context adds a latency for user applications, and increase probability of trapping them in backlog processing, since the BH handler might found the socket owned by the user. This patch, combined with the prior one increases the rpc performance by about 10 % on servers with large number of cores. (tcp_rr workload with 10,000 flows and 112 threads reach 9 Mpps instead of 8 Mpps) This also increases single bulk flow performance on 40Gbit+ links, since in this case there are often two cpus working in tandem : - CPU handling the NIC rx interrupts, feeding the receive queue, and (after this patch) freeing the skbs that were consumed. - CPU in recvmsg() system call, essentially 100 % busy copying out data to user space. Having at most one skb in a per-socket cache has very little risk of memory exhaustion, and since it is protected by socket lock, its management is essentially free. Note that if rps/rfs is used, we do not enable this feature, because there is high chance that the same cpu is handling both the recvmsg() system call and the TCP rx path, but that another cpu did the skb allocations in the device driver right before the RPS/RFS logic. To properly handle this case, it seems we would need to record on which cpu skb was allocated, and use a different channel to give skbs back to this cpu. Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: Soheil Hassas Yeganeh <soheil@google.com> Acked-by: Willem de Bruijn <willemb@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-03-22 23:56:40 +08:00
struct sk_buff *skb_to_free;
int sdif = inet6_sdif(skb);
const struct tcphdr *th;
const struct ipv6hdr *hdr;
bool refcounted;
struct sock *sk;
int ret;
struct net *net = dev_net(skb->dev);
if (skb->pkt_type != PACKET_HOST)
goto discard_it;
/*
* Count it even if it's bad.
*/
__TCP_INC_STATS(net, TCP_MIB_INSEGS);
if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
goto discard_it;
th = (const struct tcphdr *)skb->data;
if (unlikely(th->doff < sizeof(struct tcphdr)/4))
goto bad_packet;
if (!pskb_may_pull(skb, th->doff*4))
goto discard_it;
if (skb_checksum_init(skb, IPPROTO_TCP, ip6_compute_pseudo))
goto csum_error;
th = (const struct tcphdr *)skb->data;
hdr = ipv6_hdr(skb);
lookup:
sk = __inet6_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th),
th->source, th->dest, inet6_iif(skb), sdif,
&refcounted);
if (!sk)
goto no_tcp_socket;
process:
if (sk->sk_state == TCP_TIME_WAIT)
goto do_time_wait;
if (sk->sk_state == TCP_NEW_SYN_RECV) {
struct request_sock *req = inet_reqsk(sk);
bool req_stolen = false;
struct sock *nsk;
sk = req->rsk_listener;
if (tcp_v6_inbound_md5_hash(sk, skb)) {
sk_drops_add(sk, skb);
reqsk_put(req);
goto discard_it;
}
if (tcp_checksum_complete(skb)) {
reqsk_put(req);
goto csum_error;
}
if (unlikely(sk->sk_state != TCP_LISTEN)) {
inet_csk_reqsk_queue_drop_and_put(sk, req);
goto lookup;
}
sock_hold(sk);
refcounted = true;
nsk = NULL;
tcp: add tcp_v4_fill_cb()/tcp_v4_restore_cb() James Morris reported kernel stack corruption bug [1] while running the SELinux testsuite, and bisected to a recent commit bffa72cf7f9d ("net: sk_buff rbnode reorg") We believe this commit is fine, but exposes an older bug. SELinux code runs from tcp_filter() and might send an ICMP, expecting IP options to be found in skb->cb[] using regular IPCB placement. We need to defer TCP mangling of skb->cb[] after tcp_filter() calls. This patch adds tcp_v4_fill_cb()/tcp_v4_restore_cb() in a very similar way we added them for IPv6. [1] [ 339.806024] SELinux: failure in selinux_parse_skb(), unable to parse packet [ 339.822505] Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: ffffffff81745af5 [ 339.822505] [ 339.852250] CPU: 4 PID: 3642 Comm: client Not tainted 4.15.0-rc1-test #15 [ 339.868498] Hardware name: LENOVO 10FGS0VA1L/30BC, BIOS FWKT68A 01/19/2017 [ 339.885060] Call Trace: [ 339.896875] <IRQ> [ 339.908103] dump_stack+0x63/0x87 [ 339.920645] panic+0xe8/0x248 [ 339.932668] ? ip_push_pending_frames+0x33/0x40 [ 339.946328] ? icmp_send+0x525/0x530 [ 339.958861] ? kfree_skbmem+0x60/0x70 [ 339.971431] __stack_chk_fail+0x1b/0x20 [ 339.984049] icmp_send+0x525/0x530 [ 339.996205] ? netlbl_skbuff_err+0x36/0x40 [ 340.008997] ? selinux_netlbl_err+0x11/0x20 [ 340.021816] ? selinux_socket_sock_rcv_skb+0x211/0x230 [ 340.035529] ? security_sock_rcv_skb+0x3b/0x50 [ 340.048471] ? sk_filter_trim_cap+0x44/0x1c0 [ 340.061246] ? tcp_v4_inbound_md5_hash+0x69/0x1b0 [ 340.074562] ? tcp_filter+0x2c/0x40 [ 340.086400] ? tcp_v4_rcv+0x820/0xa20 [ 340.098329] ? ip_local_deliver_finish+0x71/0x1a0 [ 340.111279] ? ip_local_deliver+0x6f/0xe0 [ 340.123535] ? ip_rcv_finish+0x3a0/0x3a0 [ 340.135523] ? ip_rcv_finish+0xdb/0x3a0 [ 340.147442] ? ip_rcv+0x27c/0x3c0 [ 340.158668] ? inet_del_offload+0x40/0x40 [ 340.170580] ? __netif_receive_skb_core+0x4ac/0x900 [ 340.183285] ? rcu_accelerate_cbs+0x5b/0x80 [ 340.195282] ? __netif_receive_skb+0x18/0x60 [ 340.207288] ? process_backlog+0x95/0x140 [ 340.218948] ? net_rx_action+0x26c/0x3b0 [ 340.230416] ? __do_softirq+0xc9/0x26a [ 340.241625] ? do_softirq_own_stack+0x2a/0x40 [ 340.253368] </IRQ> [ 340.262673] ? do_softirq+0x50/0x60 [ 340.273450] ? __local_bh_enable_ip+0x57/0x60 [ 340.285045] ? ip_finish_output2+0x175/0x350 [ 340.296403] ? ip_finish_output+0x127/0x1d0 [ 340.307665] ? nf_hook_slow+0x3c/0xb0 [ 340.318230] ? ip_output+0x72/0xe0 [ 340.328524] ? ip_fragment.constprop.54+0x80/0x80 [ 340.340070] ? ip_local_out+0x35/0x40 [ 340.350497] ? ip_queue_xmit+0x15c/0x3f0 [ 340.361060] ? __kmalloc_reserve.isra.40+0x31/0x90 [ 340.372484] ? __skb_clone+0x2e/0x130 [ 340.382633] ? tcp_transmit_skb+0x558/0xa10 [ 340.393262] ? tcp_connect+0x938/0xad0 [ 340.403370] ? ktime_get_with_offset+0x4c/0xb0 [ 340.414206] ? tcp_v4_connect+0x457/0x4e0 [ 340.424471] ? __inet_stream_connect+0xb3/0x300 [ 340.435195] ? inet_stream_connect+0x3b/0x60 [ 340.445607] ? SYSC_connect+0xd9/0x110 [ 340.455455] ? __audit_syscall_entry+0xaf/0x100 [ 340.466112] ? syscall_trace_enter+0x1d0/0x2b0 [ 340.476636] ? __audit_syscall_exit+0x209/0x290 [ 340.487151] ? SyS_connect+0xe/0x10 [ 340.496453] ? do_syscall_64+0x67/0x1b0 [ 340.506078] ? entry_SYSCALL64_slow_path+0x25/0x25 Fixes: 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache line misses") Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: James Morris <james.l.morris@oracle.com> Tested-by: James Morris <james.l.morris@oracle.com> Tested-by: Casey Schaufler <casey@schaufler-ca.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-12-04 01:32:59 +08:00
if (!tcp_filter(sk, skb)) {
th = (const struct tcphdr *)skb->data;
hdr = ipv6_hdr(skb);
tcp_v6_fill_cb(skb, hdr, th);
nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
tcp: add tcp_v4_fill_cb()/tcp_v4_restore_cb() James Morris reported kernel stack corruption bug [1] while running the SELinux testsuite, and bisected to a recent commit bffa72cf7f9d ("net: sk_buff rbnode reorg") We believe this commit is fine, but exposes an older bug. SELinux code runs from tcp_filter() and might send an ICMP, expecting IP options to be found in skb->cb[] using regular IPCB placement. We need to defer TCP mangling of skb->cb[] after tcp_filter() calls. This patch adds tcp_v4_fill_cb()/tcp_v4_restore_cb() in a very similar way we added them for IPv6. [1] [ 339.806024] SELinux: failure in selinux_parse_skb(), unable to parse packet [ 339.822505] Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: ffffffff81745af5 [ 339.822505] [ 339.852250] CPU: 4 PID: 3642 Comm: client Not tainted 4.15.0-rc1-test #15 [ 339.868498] Hardware name: LENOVO 10FGS0VA1L/30BC, BIOS FWKT68A 01/19/2017 [ 339.885060] Call Trace: [ 339.896875] <IRQ> [ 339.908103] dump_stack+0x63/0x87 [ 339.920645] panic+0xe8/0x248 [ 339.932668] ? ip_push_pending_frames+0x33/0x40 [ 339.946328] ? icmp_send+0x525/0x530 [ 339.958861] ? kfree_skbmem+0x60/0x70 [ 339.971431] __stack_chk_fail+0x1b/0x20 [ 339.984049] icmp_send+0x525/0x530 [ 339.996205] ? netlbl_skbuff_err+0x36/0x40 [ 340.008997] ? selinux_netlbl_err+0x11/0x20 [ 340.021816] ? selinux_socket_sock_rcv_skb+0x211/0x230 [ 340.035529] ? security_sock_rcv_skb+0x3b/0x50 [ 340.048471] ? sk_filter_trim_cap+0x44/0x1c0 [ 340.061246] ? tcp_v4_inbound_md5_hash+0x69/0x1b0 [ 340.074562] ? tcp_filter+0x2c/0x40 [ 340.086400] ? tcp_v4_rcv+0x820/0xa20 [ 340.098329] ? ip_local_deliver_finish+0x71/0x1a0 [ 340.111279] ? ip_local_deliver+0x6f/0xe0 [ 340.123535] ? ip_rcv_finish+0x3a0/0x3a0 [ 340.135523] ? ip_rcv_finish+0xdb/0x3a0 [ 340.147442] ? ip_rcv+0x27c/0x3c0 [ 340.158668] ? inet_del_offload+0x40/0x40 [ 340.170580] ? __netif_receive_skb_core+0x4ac/0x900 [ 340.183285] ? rcu_accelerate_cbs+0x5b/0x80 [ 340.195282] ? __netif_receive_skb+0x18/0x60 [ 340.207288] ? process_backlog+0x95/0x140 [ 340.218948] ? net_rx_action+0x26c/0x3b0 [ 340.230416] ? __do_softirq+0xc9/0x26a [ 340.241625] ? do_softirq_own_stack+0x2a/0x40 [ 340.253368] </IRQ> [ 340.262673] ? do_softirq+0x50/0x60 [ 340.273450] ? __local_bh_enable_ip+0x57/0x60 [ 340.285045] ? ip_finish_output2+0x175/0x350 [ 340.296403] ? ip_finish_output+0x127/0x1d0 [ 340.307665] ? nf_hook_slow+0x3c/0xb0 [ 340.318230] ? ip_output+0x72/0xe0 [ 340.328524] ? ip_fragment.constprop.54+0x80/0x80 [ 340.340070] ? ip_local_out+0x35/0x40 [ 340.350497] ? ip_queue_xmit+0x15c/0x3f0 [ 340.361060] ? __kmalloc_reserve.isra.40+0x31/0x90 [ 340.372484] ? __skb_clone+0x2e/0x130 [ 340.382633] ? tcp_transmit_skb+0x558/0xa10 [ 340.393262] ? tcp_connect+0x938/0xad0 [ 340.403370] ? ktime_get_with_offset+0x4c/0xb0 [ 340.414206] ? tcp_v4_connect+0x457/0x4e0 [ 340.424471] ? __inet_stream_connect+0xb3/0x300 [ 340.435195] ? inet_stream_connect+0x3b/0x60 [ 340.445607] ? SYSC_connect+0xd9/0x110 [ 340.455455] ? __audit_syscall_entry+0xaf/0x100 [ 340.466112] ? syscall_trace_enter+0x1d0/0x2b0 [ 340.476636] ? __audit_syscall_exit+0x209/0x290 [ 340.487151] ? SyS_connect+0xe/0x10 [ 340.496453] ? do_syscall_64+0x67/0x1b0 [ 340.506078] ? entry_SYSCALL64_slow_path+0x25/0x25 Fixes: 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache line misses") Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: James Morris <james.l.morris@oracle.com> Tested-by: James Morris <james.l.morris@oracle.com> Tested-by: Casey Schaufler <casey@schaufler-ca.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-12-04 01:32:59 +08:00
}
if (!nsk) {
reqsk_put(req);
if (req_stolen) {
/* Another cpu got exclusive access to req
* and created a full blown socket.
* Try to feed this packet to this socket
* instead of discarding it.
*/
tcp_v6_restore_cb(skb);
sock_put(sk);
goto lookup;
}
goto discard_and_relse;
}
if (nsk == sk) {
reqsk_put(req);
tcp_v6_restore_cb(skb);
} else if (tcp_child_process(sk, nsk, skb)) {
tcp_v6_send_reset(nsk, skb);
goto discard_and_relse;
} else {
sock_put(sk);
return 0;
}
}
if (hdr->hop_limit < tcp_inet6_sk(sk)->min_hopcount) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse;
}
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
if (tcp_v6_inbound_md5_hash(sk, skb))
goto discard_and_relse;
if (tcp_filter(sk, skb))
goto discard_and_relse;
th = (const struct tcphdr *)skb->data;
hdr = ipv6_hdr(skb);
tcp: add tcp_v4_fill_cb()/tcp_v4_restore_cb() James Morris reported kernel stack corruption bug [1] while running the SELinux testsuite, and bisected to a recent commit bffa72cf7f9d ("net: sk_buff rbnode reorg") We believe this commit is fine, but exposes an older bug. SELinux code runs from tcp_filter() and might send an ICMP, expecting IP options to be found in skb->cb[] using regular IPCB placement. We need to defer TCP mangling of skb->cb[] after tcp_filter() calls. This patch adds tcp_v4_fill_cb()/tcp_v4_restore_cb() in a very similar way we added them for IPv6. [1] [ 339.806024] SELinux: failure in selinux_parse_skb(), unable to parse packet [ 339.822505] Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: ffffffff81745af5 [ 339.822505] [ 339.852250] CPU: 4 PID: 3642 Comm: client Not tainted 4.15.0-rc1-test #15 [ 339.868498] Hardware name: LENOVO 10FGS0VA1L/30BC, BIOS FWKT68A 01/19/2017 [ 339.885060] Call Trace: [ 339.896875] <IRQ> [ 339.908103] dump_stack+0x63/0x87 [ 339.920645] panic+0xe8/0x248 [ 339.932668] ? ip_push_pending_frames+0x33/0x40 [ 339.946328] ? icmp_send+0x525/0x530 [ 339.958861] ? kfree_skbmem+0x60/0x70 [ 339.971431] __stack_chk_fail+0x1b/0x20 [ 339.984049] icmp_send+0x525/0x530 [ 339.996205] ? netlbl_skbuff_err+0x36/0x40 [ 340.008997] ? selinux_netlbl_err+0x11/0x20 [ 340.021816] ? selinux_socket_sock_rcv_skb+0x211/0x230 [ 340.035529] ? security_sock_rcv_skb+0x3b/0x50 [ 340.048471] ? sk_filter_trim_cap+0x44/0x1c0 [ 340.061246] ? tcp_v4_inbound_md5_hash+0x69/0x1b0 [ 340.074562] ? tcp_filter+0x2c/0x40 [ 340.086400] ? tcp_v4_rcv+0x820/0xa20 [ 340.098329] ? ip_local_deliver_finish+0x71/0x1a0 [ 340.111279] ? ip_local_deliver+0x6f/0xe0 [ 340.123535] ? ip_rcv_finish+0x3a0/0x3a0 [ 340.135523] ? ip_rcv_finish+0xdb/0x3a0 [ 340.147442] ? ip_rcv+0x27c/0x3c0 [ 340.158668] ? inet_del_offload+0x40/0x40 [ 340.170580] ? __netif_receive_skb_core+0x4ac/0x900 [ 340.183285] ? rcu_accelerate_cbs+0x5b/0x80 [ 340.195282] ? __netif_receive_skb+0x18/0x60 [ 340.207288] ? process_backlog+0x95/0x140 [ 340.218948] ? net_rx_action+0x26c/0x3b0 [ 340.230416] ? __do_softirq+0xc9/0x26a [ 340.241625] ? do_softirq_own_stack+0x2a/0x40 [ 340.253368] </IRQ> [ 340.262673] ? do_softirq+0x50/0x60 [ 340.273450] ? __local_bh_enable_ip+0x57/0x60 [ 340.285045] ? ip_finish_output2+0x175/0x350 [ 340.296403] ? ip_finish_output+0x127/0x1d0 [ 340.307665] ? nf_hook_slow+0x3c/0xb0 [ 340.318230] ? ip_output+0x72/0xe0 [ 340.328524] ? ip_fragment.constprop.54+0x80/0x80 [ 340.340070] ? ip_local_out+0x35/0x40 [ 340.350497] ? ip_queue_xmit+0x15c/0x3f0 [ 340.361060] ? __kmalloc_reserve.isra.40+0x31/0x90 [ 340.372484] ? __skb_clone+0x2e/0x130 [ 340.382633] ? tcp_transmit_skb+0x558/0xa10 [ 340.393262] ? tcp_connect+0x938/0xad0 [ 340.403370] ? ktime_get_with_offset+0x4c/0xb0 [ 340.414206] ? tcp_v4_connect+0x457/0x4e0 [ 340.424471] ? __inet_stream_connect+0xb3/0x300 [ 340.435195] ? inet_stream_connect+0x3b/0x60 [ 340.445607] ? SYSC_connect+0xd9/0x110 [ 340.455455] ? __audit_syscall_entry+0xaf/0x100 [ 340.466112] ? syscall_trace_enter+0x1d0/0x2b0 [ 340.476636] ? __audit_syscall_exit+0x209/0x290 [ 340.487151] ? SyS_connect+0xe/0x10 [ 340.496453] ? do_syscall_64+0x67/0x1b0 [ 340.506078] ? entry_SYSCALL64_slow_path+0x25/0x25 Fixes: 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache line misses") Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: James Morris <james.l.morris@oracle.com> Tested-by: James Morris <james.l.morris@oracle.com> Tested-by: Casey Schaufler <casey@schaufler-ca.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-12-04 01:32:59 +08:00
tcp_v6_fill_cb(skb, hdr, th);
skb->dev = NULL;
if (sk->sk_state == TCP_LISTEN) {
ret = tcp_v6_do_rcv(sk, skb);
goto put_and_return;
}
sk_incoming_cpu_update(sk);
bh_lock_sock_nested(sk);
tcp: Add RFC4898 tcpEStatsPerfDataSegsOut/In Per RFC4898, they count segments sent/received containing a positive length data segment (that includes retransmission segments carrying data). Unlike tcpi_segs_out/in, tcpi_data_segs_out/in excludes segments carrying no data (e.g. pure ack). The patch also updates the segs_in in tcp_fastopen_add_skb() so that segs_in >= data_segs_in property is kept. Together with retransmission data, tcpi_data_segs_out gives a better signal on the rxmit rate. v6: Rebase on the latest net-next v5: Eric pointed out that checking skb->len is still needed in tcp_fastopen_add_skb() because skb can carry a FIN without data. Hence, instead of open coding segs_in and data_segs_in, tcp_segs_in() helper is used. Comment is added to the fastopen case to explain why segs_in has to be reset and tcp_segs_in() has to be called before __skb_pull(). v4: Add comment to the changes in tcp_fastopen_add_skb() and also add remark on this case in the commit message. v3: Add const modifier to the skb parameter in tcp_segs_in() v2: Rework based on recent fix by Eric: commit a9d99ce28ed3 ("tcp: fix tcpi_segs_in after connection establishment") Signed-off-by: Martin KaFai Lau <kafai@fb.com> Cc: Chris Rapier <rapier@psc.edu> Cc: Eric Dumazet <edumazet@google.com> Cc: Marcelo Ricardo Leitner <mleitner@redhat.com> Cc: Neal Cardwell <ncardwell@google.com> Cc: Yuchung Cheng <ycheng@google.com> Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-03-15 01:52:15 +08:00
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
if (!sock_owned_by_user(sk)) {
tcp: add one skb cache for rx Often times, recvmsg() system calls and BH handling for a particular TCP socket are done on different cpus. This means the incoming skb had to be allocated on a cpu, but freed on another. This incurs a high spinlock contention in slab layer for small rpc, but also a high number of cache line ping pongs for larger packets. A full size GRO packet might use 45 page fragments, meaning that up to 45 put_page() can be involved. More over performing the __kfree_skb() in the recvmsg() context adds a latency for user applications, and increase probability of trapping them in backlog processing, since the BH handler might found the socket owned by the user. This patch, combined with the prior one increases the rpc performance by about 10 % on servers with large number of cores. (tcp_rr workload with 10,000 flows and 112 threads reach 9 Mpps instead of 8 Mpps) This also increases single bulk flow performance on 40Gbit+ links, since in this case there are often two cpus working in tandem : - CPU handling the NIC rx interrupts, feeding the receive queue, and (after this patch) freeing the skbs that were consumed. - CPU in recvmsg() system call, essentially 100 % busy copying out data to user space. Having at most one skb in a per-socket cache has very little risk of memory exhaustion, and since it is protected by socket lock, its management is essentially free. Note that if rps/rfs is used, we do not enable this feature, because there is high chance that the same cpu is handling both the recvmsg() system call and the TCP rx path, but that another cpu did the skb allocations in the device driver right before the RPS/RFS logic. To properly handle this case, it seems we would need to record on which cpu skb was allocated, and use a different channel to give skbs back to this cpu. Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: Soheil Hassas Yeganeh <soheil@google.com> Acked-by: Willem de Bruijn <willemb@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-03-22 23:56:40 +08:00
skb_to_free = sk->sk_rx_skb_cache;
sk->sk_rx_skb_cache = NULL;
ret = tcp_v6_do_rcv(sk, skb);
tcp: add one skb cache for rx Often times, recvmsg() system calls and BH handling for a particular TCP socket are done on different cpus. This means the incoming skb had to be allocated on a cpu, but freed on another. This incurs a high spinlock contention in slab layer for small rpc, but also a high number of cache line ping pongs for larger packets. A full size GRO packet might use 45 page fragments, meaning that up to 45 put_page() can be involved. More over performing the __kfree_skb() in the recvmsg() context adds a latency for user applications, and increase probability of trapping them in backlog processing, since the BH handler might found the socket owned by the user. This patch, combined with the prior one increases the rpc performance by about 10 % on servers with large number of cores. (tcp_rr workload with 10,000 flows and 112 threads reach 9 Mpps instead of 8 Mpps) This also increases single bulk flow performance on 40Gbit+ links, since in this case there are often two cpus working in tandem : - CPU handling the NIC rx interrupts, feeding the receive queue, and (after this patch) freeing the skbs that were consumed. - CPU in recvmsg() system call, essentially 100 % busy copying out data to user space. Having at most one skb in a per-socket cache has very little risk of memory exhaustion, and since it is protected by socket lock, its management is essentially free. Note that if rps/rfs is used, we do not enable this feature, because there is high chance that the same cpu is handling both the recvmsg() system call and the TCP rx path, but that another cpu did the skb allocations in the device driver right before the RPS/RFS logic. To properly handle this case, it seems we would need to record on which cpu skb was allocated, and use a different channel to give skbs back to this cpu. Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: Soheil Hassas Yeganeh <soheil@google.com> Acked-by: Willem de Bruijn <willemb@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-03-22 23:56:40 +08:00
} else {
if (tcp_add_backlog(sk, skb))
goto discard_and_relse;
skb_to_free = NULL;
}
bh_unlock_sock(sk);
tcp: add one skb cache for rx Often times, recvmsg() system calls and BH handling for a particular TCP socket are done on different cpus. This means the incoming skb had to be allocated on a cpu, but freed on another. This incurs a high spinlock contention in slab layer for small rpc, but also a high number of cache line ping pongs for larger packets. A full size GRO packet might use 45 page fragments, meaning that up to 45 put_page() can be involved. More over performing the __kfree_skb() in the recvmsg() context adds a latency for user applications, and increase probability of trapping them in backlog processing, since the BH handler might found the socket owned by the user. This patch, combined with the prior one increases the rpc performance by about 10 % on servers with large number of cores. (tcp_rr workload with 10,000 flows and 112 threads reach 9 Mpps instead of 8 Mpps) This also increases single bulk flow performance on 40Gbit+ links, since in this case there are often two cpus working in tandem : - CPU handling the NIC rx interrupts, feeding the receive queue, and (after this patch) freeing the skbs that were consumed. - CPU in recvmsg() system call, essentially 100 % busy copying out data to user space. Having at most one skb in a per-socket cache has very little risk of memory exhaustion, and since it is protected by socket lock, its management is essentially free. Note that if rps/rfs is used, we do not enable this feature, because there is high chance that the same cpu is handling both the recvmsg() system call and the TCP rx path, but that another cpu did the skb allocations in the device driver right before the RPS/RFS logic. To properly handle this case, it seems we would need to record on which cpu skb was allocated, and use a different channel to give skbs back to this cpu. Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: Soheil Hassas Yeganeh <soheil@google.com> Acked-by: Willem de Bruijn <willemb@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-03-22 23:56:40 +08:00
if (skb_to_free)
__kfree_skb(skb_to_free);
put_and_return:
if (refcounted)
sock_put(sk);
return ret ? -1 : 0;
no_tcp_socket:
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard_it;
tcp_v6_fill_cb(skb, hdr, th);
if (tcp_checksum_complete(skb)) {
csum_error:
__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
tcp_v6_send_reset(NULL, skb);
}
discard_it:
kfree_skb(skb);
return 0;
discard_and_relse:
sk_drops_add(sk, skb);
if (refcounted)
sock_put(sk);
goto discard_it;
do_time_wait:
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
inet_twsk_put(inet_twsk(sk));
goto discard_it;
}
tcp_v6_fill_cb(skb, hdr, th);
if (tcp_checksum_complete(skb)) {
inet_twsk_put(inet_twsk(sk));
goto csum_error;
}
switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
case TCP_TW_SYN:
{
struct sock *sk2;
sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo,
skb, __tcp_hdrlen(th),
&ipv6_hdr(skb)->saddr, th->source,
&ipv6_hdr(skb)->daddr,
ntohs(th->dest),
tcp_v6_iif_l3_slave(skb),
sdif);
if (sk2) {
struct inet_timewait_sock *tw = inet_twsk(sk);
inet_twsk_deschedule_put(tw);
sk = sk2;
tcp_v6_restore_cb(skb);
refcounted = false;
goto process;
}
}
/* to ACK */
/* fall through */
case TCP_TW_ACK:
tcp_v6_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
tcp_v6_send_reset(sk, skb);
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
case TCP_TW_SUCCESS:
;
}
goto discard_it;
}
INDIRECT_CALLABLE_SCOPE void tcp_v6_early_demux(struct sk_buff *skb)
{
const struct ipv6hdr *hdr;
const struct tcphdr *th;
struct sock *sk;
if (skb->pkt_type != PACKET_HOST)
return;
if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
return;
hdr = ipv6_hdr(skb);
th = tcp_hdr(skb);
if (th->doff < sizeof(struct tcphdr) / 4)
return;
/* Note : We use inet6_iif() here, not tcp_v6_iif() */
sk = __inet6_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
&hdr->saddr, th->source,
&hdr->daddr, ntohs(th->dest),
inet6_iif(skb), inet6_sdif(skb));
if (sk) {
skb->sk = sk;
skb->destructor = sock_edemux;
if (sk_fullsock(sk)) {
struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
if (dst)
dst = dst_check(dst, tcp_inet6_sk(sk)->rx_dst_cookie);
if (dst &&
inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
skb_dst_set_noref(skb, dst);
}
}
}
static struct timewait_sock_ops tcp6_timewait_sock_ops = {
.twsk_obj_size = sizeof(struct tcp6_timewait_sock),
.twsk_unique = tcp_twsk_unique,
.twsk_destructor = tcp_twsk_destructor,
};
static const struct inet_connection_sock_af_ops ipv6_specific = {
.queue_xmit = inet6_csk_xmit,
.send_check = tcp_v6_send_check,
.rebuild_header = inet6_sk_rebuild_header,
.sk_rx_dst_set = inet6_sk_rx_dst_set,
.conn_request = tcp_v6_conn_request,
.syn_recv_sock = tcp_v6_syn_recv_sock,
.net_header_len = sizeof(struct ipv6hdr),
ipv6: RTAX_FEATURE_ALLFRAG causes inefficient TCP segment sizing Quoting Tore Anderson from : https://bugzilla.kernel.org/show_bug.cgi?id=42572 When RTAX_FEATURE_ALLFRAG is set on a route, the effective TCP segment size does not take into account the size of the IPv6 Fragmentation header that needs to be included in outbound packets, causing every transmitted TCP segment to be fragmented across two IPv6 packets, the latter of which will only contain 8 bytes of actual payload. RTAX_FEATURE_ALLFRAG is typically set on a route in response to receving a ICMPv6 Packet Too Big message indicating a Path MTU of less than 1280 bytes. 1280 bytes is the minimum IPv6 MTU, however ICMPv6 PTBs with MTU < 1280 are still valid, in particular when an IPv6 packet is sent to an IPv4 destination through a stateless translator. Any ICMPv4 Need To Fragment packets originated from the IPv4 part of the path will be translated to ICMPv6 PTB which may then indicate an MTU of less than 1280. The Linux kernel refuses to reduce the effective MTU to anything below 1280 bytes, instead it sets it to exactly 1280 bytes, and RTAX_FEATURE_ALLFRAG is also set. However, the TCP segment size appears to be set to 1240 bytes (1280 Path MTU - 40 bytes of IPv6 header), instead of 1232 (additionally taking into account the 8 bytes required by the IPv6 Fragmentation extension header). This in turn results in rather inefficient transmission, as every transmitted TCP segment now is split in two fragments containing 1232+8 bytes of payload. After this patch, all the outgoing packets that includes a Fragmentation header all are "atomic" or "non-fragmented" fragments, i.e., they both have Offset=0 and More Fragments=0. With help from David S. Miller Reported-by: Tore Anderson <tore@fud.no> Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Maciej Żenczykowski <maze@google.com> Cc: Tom Herbert <therbert@google.com> Tested-by: Tore Anderson <tore@fud.no> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-04-24 15:37:38 +08:00
.net_frag_header_len = sizeof(struct frag_hdr),
.setsockopt = ipv6_setsockopt,
.getsockopt = ipv6_getsockopt,
.addr2sockaddr = inet6_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in6),
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ipv6_setsockopt,
.compat_getsockopt = compat_ipv6_getsockopt,
#endif
.mtu_reduced = tcp_v6_mtu_reduced,
};
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = {
.md5_lookup = tcp_v6_md5_lookup,
.calc_md5_hash = tcp_v6_md5_hash_skb,
.md5_parse = tcp_v6_parse_md5_keys,
};
#endif
/*
* TCP over IPv4 via INET6 API
*/
static const struct inet_connection_sock_af_ops ipv6_mapped = {
.queue_xmit = ip_queue_xmit,
.send_check = tcp_v4_send_check,
.rebuild_header = inet_sk_rebuild_header,
net: tcp: ipv6_mapped needs sk_rx_dst_set method commit 5d299f3d3c8a2fb (net: ipv6: fix TCP early demux) added a regression for ipv6_mapped case. [ 67.422369] SELinux: initialized (dev autofs, type autofs), uses genfs_contexts [ 67.449678] SELinux: initialized (dev autofs, type autofs), uses genfs_contexts [ 92.631060] BUG: unable to handle kernel NULL pointer dereference at (null) [ 92.631435] IP: [< (null)>] (null) [ 92.631645] PGD 0 [ 92.631846] Oops: 0010 [#1] SMP [ 92.632095] Modules linked in: autofs4 sunrpc ipv6 dm_mirror dm_region_hash dm_log dm_multipath dm_mod video sbs sbshc battery ac lp parport sg snd_hda_intel snd_hda_codec snd_seq_oss snd_seq_midi_event snd_seq snd_seq_device pcspkr snd_pcm_oss snd_mixer_oss snd_pcm snd_timer serio_raw button floppy snd i2c_i801 i2c_core soundcore snd_page_alloc shpchp ide_cd_mod cdrom microcode ehci_hcd ohci_hcd uhci_hcd [ 92.634294] CPU 0 [ 92.634294] Pid: 4469, comm: sendmail Not tainted 3.6.0-rc1 #3 [ 92.634294] RIP: 0010:[<0000000000000000>] [< (null)>] (null) [ 92.634294] RSP: 0018:ffff880245fc7cb0 EFLAGS: 00010282 [ 92.634294] RAX: ffffffffa01985f0 RBX: ffff88024827ad00 RCX: 0000000000000000 [ 92.634294] RDX: 0000000000000218 RSI: ffff880254735380 RDI: ffff88024827ad00 [ 92.634294] RBP: ffff880245fc7cc8 R08: 0000000000000001 R09: 0000000000000000 [ 92.634294] R10: 0000000000000000 R11: ffff880245fc7bf8 R12: ffff880254735380 [ 92.634294] R13: ffff880254735380 R14: 0000000000000000 R15: 7fffffffffff0218 [ 92.634294] FS: 00007f4516ccd6f0(0000) GS:ffff880256600000(0000) knlGS:0000000000000000 [ 92.634294] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 92.634294] CR2: 0000000000000000 CR3: 0000000245ed1000 CR4: 00000000000007f0 [ 92.634294] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 92.634294] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 92.634294] Process sendmail (pid: 4469, threadinfo ffff880245fc6000, task ffff880254b8cac0) [ 92.634294] Stack: [ 92.634294] ffffffff813837a7 ffff88024827ad00 ffff880254b6b0e8 ffff880245fc7d68 [ 92.634294] ffffffff81385083 00000000001d2680 ffff8802547353a8 ffff880245fc7d18 [ 92.634294] ffffffff8105903a ffff88024827ad60 0000000000000002 00000000000000ff [ 92.634294] Call Trace: [ 92.634294] [<ffffffff813837a7>] ? tcp_finish_connect+0x2c/0xfa [ 92.634294] [<ffffffff81385083>] tcp_rcv_state_process+0x2b6/0x9c6 [ 92.634294] [<ffffffff8105903a>] ? sched_clock_cpu+0xc3/0xd1 [ 92.634294] [<ffffffff81059073>] ? local_clock+0x2b/0x3c [ 92.634294] [<ffffffff8138caf3>] tcp_v4_do_rcv+0x63a/0x670 [ 92.634294] [<ffffffff8133278e>] release_sock+0x128/0x1bd [ 92.634294] [<ffffffff8139f060>] __inet_stream_connect+0x1b1/0x352 [ 92.634294] [<ffffffff813325f5>] ? lock_sock_nested+0x74/0x7f [ 92.634294] [<ffffffff8104b333>] ? wake_up_bit+0x25/0x25 [ 92.634294] [<ffffffff813325f5>] ? lock_sock_nested+0x74/0x7f [ 92.634294] [<ffffffff8139f223>] ? inet_stream_connect+0x22/0x4b [ 92.634294] [<ffffffff8139f234>] inet_stream_connect+0x33/0x4b [ 92.634294] [<ffffffff8132e8cf>] sys_connect+0x78/0x9e [ 92.634294] [<ffffffff813fd407>] ? sysret_check+0x1b/0x56 [ 92.634294] [<ffffffff81088503>] ? __audit_syscall_entry+0x195/0x1c8 [ 92.634294] [<ffffffff811cc26e>] ? trace_hardirqs_on_thunk+0x3a/0x3f [ 92.634294] [<ffffffff813fd3e2>] system_call_fastpath+0x16/0x1b [ 92.634294] Code: Bad RIP value. [ 92.634294] RIP [< (null)>] (null) [ 92.634294] RSP <ffff880245fc7cb0> [ 92.634294] CR2: 0000000000000000 [ 92.648982] ---[ end trace 24e2bed94314c8d9 ]--- [ 92.649146] Kernel panic - not syncing: Fatal exception in interrupt Fix this using inet_sk_rx_dst_set(), and export this function in case IPv6 is modular. Reported-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-08-09 22:11:00 +08:00
.sk_rx_dst_set = inet_sk_rx_dst_set,
.conn_request = tcp_v6_conn_request,
.syn_recv_sock = tcp_v6_syn_recv_sock,
.net_header_len = sizeof(struct iphdr),
.setsockopt = ipv6_setsockopt,
.getsockopt = ipv6_getsockopt,
.addr2sockaddr = inet6_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in6),
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ipv6_setsockopt,
.compat_getsockopt = compat_ipv6_getsockopt,
#endif
.mtu_reduced = tcp_v4_mtu_reduced,
};
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = {
.md5_lookup = tcp_v4_md5_lookup,
.calc_md5_hash = tcp_v4_md5_hash_skb,
.md5_parse = tcp_v6_parse_md5_keys,
};
#endif
/* NOTE: A lot of things set to zero explicitly by call to
* sk_alloc() so need not be done here.
*/
static int tcp_v6_init_sock(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
tcp_init_sock(sk);
icsk->icsk_af_ops = &ipv6_specific;
#ifdef CONFIG_TCP_MD5SIG
tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific;
#endif
return 0;
}
static void tcp_v6_destroy_sock(struct sock *sk)
{
tcp_v4_destroy_sock(sk);
inet6_destroy_sock(sk);
}
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCPv6 sock list dumping. */
static void get_openreq6(struct seq_file *seq,
const struct request_sock *req, int i)
{
long ttd = req->rsk_timer.expires - jiffies;
const struct in6_addr *src = &inet_rsk(req)->ir_v6_loc_addr;
const struct in6_addr *dest = &inet_rsk(req)->ir_v6_rmt_addr;
if (ttd < 0)
ttd = 0;
seq_printf(seq,
"%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
"%02X %08X:%08X %02X:%08lX %08X %5u %8d %d %d %pK\n",
i,
src->s6_addr32[0], src->s6_addr32[1],
src->s6_addr32[2], src->s6_addr32[3],
inet_rsk(req)->ir_num,
dest->s6_addr32[0], dest->s6_addr32[1],
dest->s6_addr32[2], dest->s6_addr32[3],
ntohs(inet_rsk(req)->ir_rmt_port),
TCP_SYN_RECV,
0, 0, /* could print option size, but that is af dependent. */
1, /* timers active (only the expire timer) */
jiffies_to_clock_t(ttd),
tcp: better retrans tracking for defer-accept For passive TCP connections using TCP_DEFER_ACCEPT facility, we incorrectly increment req->retrans each time timeout triggers while no SYNACK is sent. SYNACK are not sent for TCP_DEFER_ACCEPT that were established (for which we received the ACK from client). Only the last SYNACK is sent so that we can receive again an ACK from client, to move the req into accept queue. We plan to change this later to avoid the useless retransmit (and potential problem as this SYNACK could be lost) TCP_INFO later gives wrong information to user, claiming imaginary retransmits. Decouple req->retrans field into two independent fields : num_retrans : number of retransmit num_timeout : number of timeouts num_timeout is the counter that is incremented at each timeout, regardless of actual SYNACK being sent or not, and used to compute the exponential timeout. Introduce inet_rtx_syn_ack() helper to increment num_retrans only if ->rtx_syn_ack() succeeded. Use inet_rtx_syn_ack() from tcp_check_req() to increment num_retrans when we re-send a SYNACK in answer to a (retransmitted) SYN. Prior to this patch, we were not counting these retransmits. Change tcp_v[46]_rtx_synack() to increment TCP_MIB_RETRANSSEGS only if a synack packet was successfully queued. Reported-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Julian Anastasov <ja@ssi.bg> Cc: Vijay Subramanian <subramanian.vijay@gmail.com> Cc: Elliott Hughes <enh@google.com> Cc: Neal Cardwell <ncardwell@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-10-28 07:16:46 +08:00
req->num_timeout,
from_kuid_munged(seq_user_ns(seq),
sock_i_uid(req->rsk_listener)),
0, /* non standard timer */
0, /* open_requests have no inode */
0, req);
}
static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
{
const struct in6_addr *dest, *src;
__u16 destp, srcp;
int timer_active;
unsigned long timer_expires;
const struct inet_sock *inet = inet_sk(sp);
const struct tcp_sock *tp = tcp_sk(sp);
const struct inet_connection_sock *icsk = inet_csk(sp);
const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
int rx_queue;
int state;
dest = &sp->sk_v6_daddr;
src = &sp->sk_v6_rcv_saddr;
destp = ntohs(inet->inet_dport);
srcp = ntohs(inet->inet_sport);
if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
tcp: add reordering timer in RACK loss detection This patch makes RACK install a reordering timer when it suspects some packets might be lost, but wants to delay the decision a little bit to accomodate reordering. It does not create a new timer but instead repurposes the existing RTO timer, because both are meant to retransmit packets. Specifically it arms a timer ICSK_TIME_REO_TIMEOUT when the RACK timing check fails. The wait time is set to RACK.RTT + RACK.reo_wnd - (NOW - Packet.xmit_time) + fudge This translates to expecting a packet (Packet) should take (RACK.RTT + RACK.reo_wnd + fudge) to deliver after it was sent. When there are multiple packets that need a timer, we use one timer with the maximum timeout. Therefore the timer conservatively uses the maximum window to expire N packets by one timeout, instead of N timeouts to expire N packets sent at different times. The fudge factor is 2 jiffies to ensure when the timer fires, all the suspected packets would exceed the deadline and be marked lost by tcp_rack_detect_loss(). It has to be at least 1 jiffy because the clock may tick between calling icsk_reset_xmit_timer(timeout) and actually hang the timer. The next jiffy is to lower-bound the timeout to 2 jiffies when reo_wnd is < 1ms. When the reordering timer fires (tcp_rack_reo_timeout): If we aren't in Recovery we'll enter fast recovery and force fast retransmit. This is very similar to the early retransmit (RFC5827) except RACK is not constrained to only enter recovery for small outstanding flights. Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-01-13 14:11:33 +08:00
icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1;
timer_expires = icsk->icsk_timeout;
} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
timer_active = 4;
timer_expires = icsk->icsk_timeout;
} else if (timer_pending(&sp->sk_timer)) {
timer_active = 2;
timer_expires = sp->sk_timer.expires;
} else {
timer_active = 0;
timer_expires = jiffies;
}
state = inet_sk_state_load(sp);
if (state == TCP_LISTEN)
rx_queue = READ_ONCE(sp->sk_ack_backlog);
else
/* Because we don't lock the socket,
* we might find a transient negative value.
*/
tcp: annotate tp->rcv_nxt lockless reads There are few places where we fetch tp->rcv_nxt while this field can change from IRQ or other cpu. We need to add READ_ONCE() annotations, and also make sure write sides use corresponding WRITE_ONCE() to avoid store-tearing. Note that tcp_inq_hint() was already using READ_ONCE(tp->rcv_nxt) syzbot reported : BUG: KCSAN: data-race in tcp_poll / tcp_queue_rcv write to 0xffff888120425770 of 4 bytes by interrupt on cpu 0: tcp_rcv_nxt_update net/ipv4/tcp_input.c:3365 [inline] tcp_queue_rcv+0x180/0x380 net/ipv4/tcp_input.c:4638 tcp_rcv_established+0xbf1/0xf50 net/ipv4/tcp_input.c:5616 tcp_v4_do_rcv+0x381/0x4e0 net/ipv4/tcp_ipv4.c:1542 tcp_v4_rcv+0x1a03/0x1bf0 net/ipv4/tcp_ipv4.c:1923 ip_protocol_deliver_rcu+0x51/0x470 net/ipv4/ip_input.c:204 ip_local_deliver_finish+0x110/0x140 net/ipv4/ip_input.c:231 NF_HOOK include/linux/netfilter.h:305 [inline] NF_HOOK include/linux/netfilter.h:299 [inline] ip_local_deliver+0x133/0x210 net/ipv4/ip_input.c:252 dst_input include/net/dst.h:442 [inline] ip_rcv_finish+0x121/0x160 net/ipv4/ip_input.c:413 NF_HOOK include/linux/netfilter.h:305 [inline] NF_HOOK include/linux/netfilter.h:299 [inline] ip_rcv+0x18f/0x1a0 net/ipv4/ip_input.c:523 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:5004 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5118 netif_receive_skb_internal+0x59/0x190 net/core/dev.c:5208 napi_skb_finish net/core/dev.c:5671 [inline] napi_gro_receive+0x28f/0x330 net/core/dev.c:5704 receive_buf+0x284/0x30b0 drivers/net/virtio_net.c:1061 read to 0xffff888120425770 of 4 bytes by task 7254 on cpu 1: tcp_stream_is_readable net/ipv4/tcp.c:480 [inline] tcp_poll+0x204/0x6b0 net/ipv4/tcp.c:554 sock_poll+0xed/0x250 net/socket.c:1256 vfs_poll include/linux/poll.h:90 [inline] ep_item_poll.isra.0+0x90/0x190 fs/eventpoll.c:892 ep_send_events_proc+0x113/0x5c0 fs/eventpoll.c:1749 ep_scan_ready_list.constprop.0+0x189/0x500 fs/eventpoll.c:704 ep_send_events fs/eventpoll.c:1793 [inline] ep_poll+0xe3/0x900 fs/eventpoll.c:1930 do_epoll_wait+0x162/0x180 fs/eventpoll.c:2294 __do_sys_epoll_pwait fs/eventpoll.c:2325 [inline] __se_sys_epoll_pwait fs/eventpoll.c:2311 [inline] __x64_sys_epoll_pwait+0xcd/0x170 fs/eventpoll.c:2311 do_syscall_64+0xcf/0x2f0 arch/x86/entry/common.c:296 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 7254 Comm: syz-fuzzer Not tainted 5.3.0+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-10-11 11:17:39 +08:00
rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
READ_ONCE(tp->copied_seq), 0);
seq_printf(seq,
"%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
"%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %lu %lu %u %u %d\n",
i,
src->s6_addr32[0], src->s6_addr32[1],
src->s6_addr32[2], src->s6_addr32[3], srcp,
dest->s6_addr32[0], dest->s6_addr32[1],
dest->s6_addr32[2], dest->s6_addr32[3], destp,
state,
READ_ONCE(tp->write_seq) - tp->snd_una,
rx_queue,
timer_active,
jiffies_delta_to_clock_t(timer_expires - jiffies),
icsk->icsk_retransmits,
from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
icsk->icsk_probes_out,
sock_i_ino(sp),
refcount_read(&sp->sk_refcnt), sp,
jiffies_to_clock_t(icsk->icsk_rto),
jiffies_to_clock_t(icsk->icsk_ack.ato),
(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sp),
tp->snd_cwnd,
state == TCP_LISTEN ?
fastopenq->max_qlen :
(tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)
);
}
static void get_timewait6_sock(struct seq_file *seq,
struct inet_timewait_sock *tw, int i)
{
tcp/dccp: get rid of central timewait timer Using a timer wheel for timewait sockets was nice ~15 years ago when memory was expensive and machines had a single processor. This does not scale, code is ugly and source of huge latencies (Typically 30 ms have been seen, cpus spinning on death_lock spinlock.) We can afford to use an extra 64 bytes per timewait sock and spread timewait load to all cpus to have better behavior. Tested: On following test, /proc/sys/net/ipv4/tcp_tw_recycle is set to 1 on the target (lpaa24) Before patch : lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0 419594 lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0 437171 While test is running, we can observe 25 or even 33 ms latencies. lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23 ... 1000 packets transmitted, 1000 received, 0% packet loss, time 20601ms rtt min/avg/max/mdev = 0.020/0.217/25.771/1.535 ms, pipe 2 lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23 ... 1000 packets transmitted, 1000 received, 0% packet loss, time 20702ms rtt min/avg/max/mdev = 0.019/0.183/33.761/1.441 ms, pipe 2 After patch : About 90% increase of throughput : lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0 810442 lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0 800992 And latencies are kept to minimal values during this load, even if network utilization is 90% higher : lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23 ... 1000 packets transmitted, 1000 received, 0% packet loss, time 19991ms rtt min/avg/max/mdev = 0.023/0.064/0.360/0.042 ms Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-04-13 09:51:09 +08:00
long delta = tw->tw_timer.expires - jiffies;
const struct in6_addr *dest, *src;
__u16 destp, srcp;
dest = &tw->tw_v6_daddr;
src = &tw->tw_v6_rcv_saddr;
destp = ntohs(tw->tw_dport);
srcp = ntohs(tw->tw_sport);
seq_printf(seq,
"%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
net: convert %p usage to %pK The %pK format specifier is designed to hide exposed kernel pointers, specifically via /proc interfaces. Exposing these pointers provides an easy target for kernel write vulnerabilities, since they reveal the locations of writable structures containing easily triggerable function pointers. The behavior of %pK depends on the kptr_restrict sysctl. If kptr_restrict is set to 0, no deviation from the standard %p behavior occurs. If kptr_restrict is set to 1, the default, if the current user (intended to be a reader via seq_printf(), etc.) does not have CAP_SYSLOG (currently in the LSM tree), kernel pointers using %pK are printed as 0's. If kptr_restrict is set to 2, kernel pointers using %pK are printed as 0's regardless of privileges. Replacing with 0's was chosen over the default "(null)", which cannot be parsed by userland %p, which expects "(nil)". The supporting code for kptr_restrict and %pK are currently in the -mm tree. This patch converts users of %p in net/ to %pK. Cases of printing pointers to the syslog are not covered, since this would eliminate useful information for postmortem debugging and the reading of the syslog is already optionally protected by the dmesg_restrict sysctl. Signed-off-by: Dan Rosenberg <drosenberg@vsecurity.com> Cc: James Morris <jmorris@namei.org> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Thomas Graf <tgraf@infradead.org> Cc: Eugene Teo <eugeneteo@kernel.org> Cc: Kees Cook <kees.cook@canonical.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: David S. Miller <davem@davemloft.net> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Eric Paris <eparis@parisplace.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-05-23 20:17:35 +08:00
"%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n",
i,
src->s6_addr32[0], src->s6_addr32[1],
src->s6_addr32[2], src->s6_addr32[3], srcp,
dest->s6_addr32[0], dest->s6_addr32[1],
dest->s6_addr32[2], dest->s6_addr32[3], destp,
tw->tw_substate, 0, 0,
3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
refcount_read(&tw->tw_refcnt), tw);
}
static int tcp6_seq_show(struct seq_file *seq, void *v)
{
struct tcp_iter_state *st;
tcp/dccp: remove twchain TCP listener refactoring, part 3 : Our goal is to hash SYN_RECV sockets into main ehash for fast lookup, and parallel SYN processing. Current inet_ehash_bucket contains two chains, one for ESTABLISH (and friend states) sockets, another for TIME_WAIT sockets only. As the hash table is sized to get at most one socket per bucket, it makes little sense to have separate twchain, as it makes the lookup slightly more complicated, and doubles hash table memory usage. If we make sure all socket types have the lookup keys at the same offsets, we can use a generic and faster lookup. It turns out TIME_WAIT and ESTABLISHED sockets already have common lookup fields for IPv4. [ INET_TW_MATCH() is no longer needed ] I'll provide a follow-up to factorize IPv6 lookup as well, to remove INET6_TW_MATCH() This way, SYN_RECV pseudo sockets will be supported the same. A new sock_gen_put() helper is added, doing either a sock_put() or inet_twsk_put() [ and will support SYN_RECV later ]. Note this helper should only be called in real slow path, when rcu lookup found a socket that was moved to another identity (freed/reused immediately), but could eventually be used in other contexts, like sock_edemux() Before patch : dmesg | grep "TCP established" TCP established hash table entries: 524288 (order: 11, 8388608 bytes) After patch : TCP established hash table entries: 524288 (order: 10, 4194304 bytes) Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 15:22:02 +08:00
struct sock *sk = v;
if (v == SEQ_START_TOKEN) {
seq_puts(seq,
" sl "
"local_address "
"remote_address "
"st tx_queue rx_queue tr tm->when retrnsmt"
" uid timeout inode\n");
goto out;
}
st = seq->private;
if (sk->sk_state == TCP_TIME_WAIT)
get_timewait6_sock(seq, v, st->num);
else if (sk->sk_state == TCP_NEW_SYN_RECV)
get_openreq6(seq, v, st->num);
else
get_tcp6_sock(seq, v, st->num);
out:
return 0;
}
static const struct seq_operations tcp6_seq_ops = {
.show = tcp6_seq_show,
.start = tcp_seq_start,
.next = tcp_seq_next,
.stop = tcp_seq_stop,
};
static struct tcp_seq_afinfo tcp6_seq_afinfo = {
.family = AF_INET6,
};
int __net_init tcp6_proc_init(struct net *net)
{
if (!proc_create_net_data("tcp6", 0444, net->proc_net, &tcp6_seq_ops,
sizeof(struct tcp_iter_state), &tcp6_seq_afinfo))
return -ENOMEM;
return 0;
}
void tcp6_proc_exit(struct net *net)
{
remove_proc_entry("tcp6", net->proc_net);
}
#endif
struct proto tcpv6_prot = {
.name = "TCPv6",
.owner = THIS_MODULE,
.close = tcp_close,
bpf: Hooks for sys_connect == The problem == See description of the problem in the initial patch of this patch set. == The solution == The patch provides much more reliable in-kernel solution for the 2nd part of the problem: making outgoing connecttion from desired IP. It adds new attach types `BPF_CGROUP_INET4_CONNECT` and `BPF_CGROUP_INET6_CONNECT` for program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` that can be used to override both source and destination of a connection at connect(2) time. Local end of connection can be bound to desired IP using newly introduced BPF-helper `bpf_bind()`. It allows to bind to only IP though, and doesn't support binding to port, i.e. leverages `IP_BIND_ADDRESS_NO_PORT` socket option. There are two reasons for this: * looking for a free port is expensive and can affect performance significantly; * there is no use-case for port. As for remote end (`struct sockaddr *` passed by user), both parts of it can be overridden, remote IP and remote port. It's useful if an application inside cgroup wants to connect to another application inside same cgroup or to itself, but knows nothing about IP assigned to the cgroup. Support is added for IPv4 and IPv6, for TCP and UDP. IPv4 and IPv6 have separate attach types for same reason as sys_bind hooks, i.e. to prevent reading from / writing to e.g. user_ip6 fields when user passes sockaddr_in since it'd be out-of-bound. == Implementation notes == The patch introduces new field in `struct proto`: `pre_connect` that is a pointer to a function with same signature as `connect` but is called before it. The reason is in some cases BPF hooks should be called way before control is passed to `sk->sk_prot->connect`. Specifically `inet_dgram_connect` autobinds socket before calling `sk->sk_prot->connect` and there is no way to call `bpf_bind()` from hooks from e.g. `ip4_datagram_connect` or `ip6_datagram_connect` since it'd cause double-bind. On the other hand `proto.pre_connect` provides a flexible way to add BPF hooks for connect only for necessary `proto` and call them at desired time before `connect`. Since `bpf_bind()` is allowed to bind only to IP and autobind in `inet_dgram_connect` binds only port there is no chance of double-bind. bpf_bind() sets `force_bind_address_no_port` to bind to only IP despite of value of `bind_address_no_port` socket field. bpf_bind() sets `with_lock` to `false` when calling to __inet_bind() and __inet6_bind() since all call-sites, where bpf_bind() is called, already hold socket lock. Signed-off-by: Andrey Ignatov <rdna@fb.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-03-31 06:08:05 +08:00
.pre_connect = tcp_v6_pre_connect,
.connect = tcp_v6_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v6_init_sock,
.destroy = tcp_v6_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v6_do_rcv,
tcp: TCP Small Queues This introduce TSQ (TCP Small Queues) TSQ goal is to reduce number of TCP packets in xmit queues (qdisc & device queues), to reduce RTT and cwnd bias, part of the bufferbloat problem. sk->sk_wmem_alloc not allowed to grow above a given limit, allowing no more than ~128KB [1] per tcp socket in qdisc/dev layers at a given time. TSO packets are sized/capped to half the limit, so that we have two TSO packets in flight, allowing better bandwidth use. As a side effect, setting the limit to 40000 automatically reduces the standard gso max limit (65536) to 40000/2 : It can help to reduce latencies of high prio packets, having smaller TSO packets. This means we divert sock_wfree() to a tcp_wfree() handler, to queue/send following frames when skb_orphan() [2] is called for the already queued skbs. Results on my dev machines (tg3/ixgbe nics) are really impressive, using standard pfifo_fast, and with or without TSO/GSO. Without reduction of nominal bandwidth, we have reduction of buffering per bulk sender : < 1ms on Gbit (instead of 50ms with TSO) < 8ms on 100Mbit (instead of 132 ms) I no longer have 4 MBytes backlogged in qdisc by a single netperf session, and both side socket autotuning no longer use 4 Mbytes. As skb destructor cannot restart xmit itself ( as qdisc lock might be taken at this point ), we delegate the work to a tasklet. We use one tasklest per cpu for performance reasons. If tasklet finds a socket owned by the user, it sets TSQ_OWNED flag. This flag is tested in a new protocol method called from release_sock(), to eventually send new segments. [1] New /proc/sys/net/ipv4/tcp_limit_output_bytes tunable [2] skb_orphan() is usually called at TX completion time, but some drivers call it in their start_xmit() handler. These drivers should at least use BQL, or else a single TCP session can still fill the whole NIC TX ring, since TSQ will have no effect. Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Dave Taht <dave.taht@bufferbloat.net> Cc: Tom Herbert <therbert@google.com> Cc: Matt Mathis <mattmathis@google.com> Cc: Yuchung Cheng <ycheng@google.com> Cc: Nandita Dukkipati <nanditad@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-07-11 13:50:31 +08:00
.release_cb = tcp_release_cb,
.hash = inet6_hash,
[SOCK] proto: Add hashinfo member to struct proto This way we can remove TCP and DCCP specific versions of sk->sk_prot->get_port: both v4 and v6 use inet_csk_get_port sk->sk_prot->hash: inet_hash is directly used, only v6 need a specific version to deal with mapped sockets sk->sk_prot->unhash: both v4 and v6 use inet_hash directly struct inet_connection_sock_af_ops also gets a new member, bind_conflict, so that inet_csk_get_port can find the per family routine. Now only the lookup routines receive as a parameter a struct inet_hashtable. With this we further reuse code, reducing the difference among INET transport protocols. Eventually work has to be done on UDP and SCTP to make them share this infrastructure and get as a bonus inet_diag interfaces so that iproute can be used with these protocols. net-2.6/net/ipv4/inet_hashtables.c: struct proto | +8 struct inet_connection_sock_af_ops | +8 2 structs changed __inet_hash_nolisten | +18 __inet_hash | -210 inet_put_port | +8 inet_bind_bucket_create | +1 __inet_hash_connect | -8 5 functions changed, 27 bytes added, 218 bytes removed, diff: -191 net-2.6/net/core/sock.c: proto_seq_show | +3 1 function changed, 3 bytes added, diff: +3 net-2.6/net/ipv4/inet_connection_sock.c: inet_csk_get_port | +15 1 function changed, 15 bytes added, diff: +15 net-2.6/net/ipv4/tcp.c: tcp_set_state | -7 1 function changed, 7 bytes removed, diff: -7 net-2.6/net/ipv4/tcp_ipv4.c: tcp_v4_get_port | -31 tcp_v4_hash | -48 tcp_v4_destroy_sock | -7 tcp_v4_syn_recv_sock | -2 tcp_unhash | -179 5 functions changed, 267 bytes removed, diff: -267 net-2.6/net/ipv6/inet6_hashtables.c: __inet6_hash | +8 1 function changed, 8 bytes added, diff: +8 net-2.6/net/ipv4/inet_hashtables.c: inet_unhash | +190 inet_hash | +242 2 functions changed, 432 bytes added, diff: +432 vmlinux: 16 functions changed, 485 bytes added, 492 bytes removed, diff: -7 /home/acme/git/net-2.6/net/ipv6/tcp_ipv6.c: tcp_v6_get_port | -31 tcp_v6_hash | -7 tcp_v6_syn_recv_sock | -9 3 functions changed, 47 bytes removed, diff: -47 /home/acme/git/net-2.6/net/dccp/proto.c: dccp_destroy_sock | -7 dccp_unhash | -179 dccp_hash | -49 dccp_set_state | -7 dccp_done | +1 5 functions changed, 1 bytes added, 242 bytes removed, diff: -241 /home/acme/git/net-2.6/net/dccp/ipv4.c: dccp_v4_get_port | -31 dccp_v4_request_recv_sock | -2 2 functions changed, 33 bytes removed, diff: -33 /home/acme/git/net-2.6/net/dccp/ipv6.c: dccp_v6_get_port | -31 dccp_v6_hash | -7 dccp_v6_request_recv_sock | +5 3 functions changed, 5 bytes added, 38 bytes removed, diff: -33 Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-02-03 20:06:04 +08:00
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.leave_memory_pressure = tcp_leave_memory_pressure,
tcp: TCP_NOTSENT_LOWAT socket option Idea of this patch is to add optional limitation of number of unsent bytes in TCP sockets, to reduce usage of kernel memory. TCP receiver might announce a big window, and TCP sender autotuning might allow a large amount of bytes in write queue, but this has little performance impact if a large part of this buffering is wasted : Write queue needs to be large only to deal with large BDP, not necessarily to cope with scheduling delays (incoming ACKS make room for the application to queue more bytes) For most workloads, using a value of 128 KB or less is OK to give applications enough time to react to POLLOUT events in time (or being awaken in a blocking sendmsg()) This patch adds two ways to set the limit : 1) Per socket option TCP_NOTSENT_LOWAT 2) A sysctl (/proc/sys/net/ipv4/tcp_notsent_lowat) for sockets not using TCP_NOTSENT_LOWAT socket option (or setting a zero value) Default value being UINT_MAX (0xFFFFFFFF), meaning this has no effect. This changes poll()/select()/epoll() to report POLLOUT only if number of unsent bytes is below tp->nosent_lowat Note this might increase number of sendmsg()/sendfile() calls when using non blocking sockets, and increase number of context switches for blocking sockets. Note this is not related to SO_SNDLOWAT (as SO_SNDLOWAT is defined as : Specify the minimum number of bytes in the buffer until the socket layer will pass the data to the protocol) Tested: netperf sessions, and watching /proc/net/protocols "memory" column for TCP With 200 concurrent netperf -t TCP_STREAM sessions, amount of kernel memory used by TCP buffers shrinks by ~55 % (20567 pages instead of 45458) lpq83:~# echo -1 >/proc/sys/net/ipv4/tcp_notsent_lowat lpq83:~# (super_netperf 200 -t TCP_STREAM -H remote -l 90 &); sleep 60 ; grep TCP /proc/net/protocols TCPv6 1880 2 45458 no 208 yes ipv6 y y y y y y y y y y y y y n y y y y y TCP 1696 508 45458 no 208 yes kernel y y y y y y y y y y y y y n y y y y y lpq83:~# echo 131072 >/proc/sys/net/ipv4/tcp_notsent_lowat lpq83:~# (super_netperf 200 -t TCP_STREAM -H remote -l 90 &); sleep 60 ; grep TCP /proc/net/protocols TCPv6 1880 2 20567 no 208 yes ipv6 y y y y y y y y y y y y y n y y y y y TCP 1696 508 20567 no 208 yes kernel y y y y y y y y y y y y y n y y y y y Using 128KB has no bad effect on the throughput or cpu usage of a single flow, although there is an increase of context switches. A bonus is that we hold socket lock for a shorter amount of time and should improve latencies of ACK processing. lpq83:~# echo -1 >/proc/sys/net/ipv4/tcp_notsent_lowat lpq83:~# perf stat -e context-switches ./netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3 OMNI Send TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : +/-2.500% @ 99% conf. Local Remote Local Elapsed Throughput Throughput Local Local Remote Remote Local Remote Service Send Socket Recv Socket Send Time Units CPU CPU CPU CPU Service Service Demand Size Size Size (sec) Util Util Util Util Demand Demand Units Final Final % Method % Method 1651584 6291456 16384 20.00 17447.90 10^6bits/s 3.13 S -1.00 U 0.353 -1.000 usec/KB Performance counter stats for './netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3': 412,514 context-switches 200.034645535 seconds time elapsed lpq83:~# echo 131072 >/proc/sys/net/ipv4/tcp_notsent_lowat lpq83:~# perf stat -e context-switches ./netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3 OMNI Send TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : +/-2.500% @ 99% conf. Local Remote Local Elapsed Throughput Throughput Local Local Remote Remote Local Remote Service Send Socket Recv Socket Send Time Units CPU CPU CPU CPU Service Service Demand Size Size Size (sec) Util Util Util Util Demand Demand Units Final Final % Method % Method 1593240 6291456 16384 20.00 17321.16 10^6bits/s 3.35 S -1.00 U 0.381 -1.000 usec/KB Performance counter stats for './netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3': 2,675,818 context-switches 200.029651391 seconds time elapsed Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Neal Cardwell <ncardwell@google.com> Cc: Yuchung Cheng <ycheng@google.com> Acked-By: Yuchung Cheng <ycheng@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-07-23 11:27:07 +08:00
.stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.orphan_count = &tcp_orphan_count,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp6_sock),
.slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp6_timewait_sock_ops,
.rsk_prot = &tcp6_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
.no_autobind = true,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
.diag_destroy = tcp_abort,
};
/* thinking of making this const? Don't.
* early_demux can change based on sysctl.
*/
static struct inet6_protocol tcpv6_protocol = {
.early_demux = tcp_v6_early_demux,
.early_demux_handler = tcp_v6_early_demux,
.handler = tcp_v6_rcv,
.err_handler = tcp_v6_err,
.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
};
static struct inet_protosw tcpv6_protosw = {
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcpv6_prot,
.ops = &inet6_stream_ops,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
};
static int __net_init tcpv6_net_init(struct net *net)
{
return inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6,
SOCK_RAW, IPPROTO_TCP, net);
}
static void __net_exit tcpv6_net_exit(struct net *net)
{
inet_ctl_sock_destroy(net->ipv6.tcp_sk);
}
static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list)
{
inet_twsk_purge(&tcp_hashinfo, AF_INET6);
}
static struct pernet_operations tcpv6_net_ops = {
.init = tcpv6_net_init,
.exit = tcpv6_net_exit,
.exit_batch = tcpv6_net_exit_batch,
};
int __init tcpv6_init(void)
{
int ret;
ret = inet6_add_protocol(&tcpv6_protocol, IPPROTO_TCP);
if (ret)
goto out;
/* register inet6 protocol */
ret = inet6_register_protosw(&tcpv6_protosw);
if (ret)
goto out_tcpv6_protocol;
ret = register_pernet_subsys(&tcpv6_net_ops);
if (ret)
goto out_tcpv6_protosw;
out:
return ret;
out_tcpv6_protosw:
inet6_unregister_protosw(&tcpv6_protosw);
out_tcpv6_protocol:
inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP);
goto out;
}
void tcpv6_exit(void)
{
unregister_pernet_subsys(&tcpv6_net_ops);
inet6_unregister_protosw(&tcpv6_protosw);
inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP);
}