2017-11-01 22:09:13 +08:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
|
2012-10-13 17:46:48 +08:00
|
|
|
/*
|
|
|
|
* INET An implementation of the TCP/IP protocol suite for the LINUX
|
|
|
|
* operating system. INET is implemented using the BSD Socket
|
|
|
|
* interface as the means of communication with the user level.
|
|
|
|
*
|
|
|
|
* Definitions for the TCP protocol.
|
|
|
|
*
|
|
|
|
* Version: @(#)tcp.h 1.0.2 04/28/93
|
|
|
|
*
|
|
|
|
* Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
* as published by the Free Software Foundation; either version
|
|
|
|
* 2 of the License, or (at your option) any later version.
|
|
|
|
*/
|
|
|
|
#ifndef _UAPI_LINUX_TCP_H
|
|
|
|
#define _UAPI_LINUX_TCP_H
|
|
|
|
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <asm/byteorder.h>
|
|
|
|
#include <linux/socket.h>
|
|
|
|
|
|
|
|
struct tcphdr {
|
|
|
|
__be16 source;
|
|
|
|
__be16 dest;
|
|
|
|
__be32 seq;
|
|
|
|
__be32 ack_seq;
|
|
|
|
#if defined(__LITTLE_ENDIAN_BITFIELD)
|
|
|
|
__u16 res1:4,
|
|
|
|
doff:4,
|
|
|
|
fin:1,
|
|
|
|
syn:1,
|
|
|
|
rst:1,
|
|
|
|
psh:1,
|
|
|
|
ack:1,
|
|
|
|
urg:1,
|
|
|
|
ece:1,
|
|
|
|
cwr:1;
|
|
|
|
#elif defined(__BIG_ENDIAN_BITFIELD)
|
|
|
|
__u16 doff:4,
|
|
|
|
res1:4,
|
|
|
|
cwr:1,
|
|
|
|
ece:1,
|
|
|
|
urg:1,
|
|
|
|
ack:1,
|
|
|
|
psh:1,
|
|
|
|
rst:1,
|
|
|
|
syn:1,
|
|
|
|
fin:1;
|
|
|
|
#else
|
|
|
|
#error "Adjust your <asm/byteorder.h> defines"
|
|
|
|
#endif
|
|
|
|
__be16 window;
|
|
|
|
__sum16 check;
|
|
|
|
__be16 urg_ptr;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The union cast uses a gcc extension to avoid aliasing problems
|
|
|
|
* (union is compatible to any of its members)
|
|
|
|
* This means this part of the code is -fstrict-aliasing safe now.
|
|
|
|
*/
|
|
|
|
union tcp_word_hdr {
|
|
|
|
struct tcphdr hdr;
|
|
|
|
__be32 words[5];
|
|
|
|
};
|
|
|
|
|
|
|
|
#define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3])
|
|
|
|
|
|
|
|
enum {
|
|
|
|
TCP_FLAG_CWR = __constant_cpu_to_be32(0x00800000),
|
|
|
|
TCP_FLAG_ECE = __constant_cpu_to_be32(0x00400000),
|
|
|
|
TCP_FLAG_URG = __constant_cpu_to_be32(0x00200000),
|
|
|
|
TCP_FLAG_ACK = __constant_cpu_to_be32(0x00100000),
|
|
|
|
TCP_FLAG_PSH = __constant_cpu_to_be32(0x00080000),
|
|
|
|
TCP_FLAG_RST = __constant_cpu_to_be32(0x00040000),
|
|
|
|
TCP_FLAG_SYN = __constant_cpu_to_be32(0x00020000),
|
|
|
|
TCP_FLAG_FIN = __constant_cpu_to_be32(0x00010000),
|
|
|
|
TCP_RESERVED_BITS = __constant_cpu_to_be32(0x0F000000),
|
|
|
|
TCP_DATA_OFFSET = __constant_cpu_to_be32(0xF0000000)
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * TCP general constants
 */
#define TCP_MSS_DEFAULT		 536U	/* IPv4 (RFC1122, RFC2581) */
#define TCP_MSS_DESIRED		1220U	/* IPv6 (tunneled), EDNS0 (RFC3226) */
|
|
|
|
|
|
|
|
/* TCP socket options */
#define TCP_NODELAY		1	/* Turn off Nagle's algorithm. */
#define TCP_MAXSEG		2	/* Limit MSS */
#define TCP_CORK		3	/* Never send partially complete segments */
#define TCP_KEEPIDLE		4	/* Start keepalives after this period */
#define TCP_KEEPINTVL		5	/* Interval between keepalives */
#define TCP_KEEPCNT		6	/* Number of keepalives before death */
#define TCP_SYNCNT		7	/* Number of SYN retransmits */
#define TCP_LINGER2		8	/* Life time of orphaned FIN-WAIT-2 state */
#define TCP_DEFER_ACCEPT	9	/* Wake up listener only when data arrive */
#define TCP_WINDOW_CLAMP	10	/* Bound advertised window */
#define TCP_INFO		11	/* Information about this connection. */
#define TCP_QUICKACK		12	/* Block/reenable quick acks */
#define TCP_CONGESTION		13	/* Congestion control algorithm */
#define TCP_MD5SIG		14	/* TCP MD5 Signature (RFC2385) */
/* NOTE(review): value 15 is not defined in this header. */
#define TCP_THIN_LINEAR_TIMEOUTS 16	/* Use linear timeouts for thin streams */
#define TCP_THIN_DUPACK		17	/* Fast retrans. after 1 dupack */
#define TCP_USER_TIMEOUT	18	/* How long for loss retry before timeout */
#define TCP_REPAIR		19	/* TCP sock is under repair right now */
#define TCP_REPAIR_QUEUE	20
#define TCP_QUEUE_SEQ		21
#define TCP_REPAIR_OPTIONS	22
#define TCP_FASTOPEN		23	/* Enable FastOpen on listeners */
#define TCP_TIMESTAMP		24
|
tcp: TCP_NOTSENT_LOWAT socket option
Idea of this patch is to add optional limitation of number of
unsent bytes in TCP sockets, to reduce usage of kernel memory.
TCP receiver might announce a big window, and TCP sender autotuning
might allow a large amount of bytes in write queue, but this has little
performance impact if a large part of this buffering is wasted :
Write queue needs to be large only to deal with large BDP, not
necessarily to cope with scheduling delays (incoming ACKS make room
for the application to queue more bytes)
For most workloads, using a value of 128 KB or less is OK to give
applications enough time to react to POLLOUT events in time
(or being awaken in a blocking sendmsg())
This patch adds two ways to set the limit :
1) Per socket option TCP_NOTSENT_LOWAT
2) A sysctl (/proc/sys/net/ipv4/tcp_notsent_lowat) for sockets
not using TCP_NOTSENT_LOWAT socket option (or setting a zero value)
Default value being UINT_MAX (0xFFFFFFFF), meaning this has no effect.
This changes poll()/select()/epoll() to report POLLOUT
only if number of unsent bytes is below tp->notsent_lowat
Note this might increase number of sendmsg()/sendfile() calls
when using non blocking sockets,
and increase number of context switches for blocking sockets.
Note this is not related to SO_SNDLOWAT (as SO_SNDLOWAT is
defined as :
Specify the minimum number of bytes in the buffer until
the socket layer will pass the data to the protocol)
Tested:
netperf sessions, and watching /proc/net/protocols "memory" column for TCP
With 200 concurrent netperf -t TCP_STREAM sessions, amount of kernel memory
used by TCP buffers shrinks by ~55 % (20567 pages instead of 45458)
lpq83:~# echo -1 >/proc/sys/net/ipv4/tcp_notsent_lowat
lpq83:~# (super_netperf 200 -t TCP_STREAM -H remote -l 90 &); sleep 60 ; grep TCP /proc/net/protocols
TCPv6 1880 2 45458 no 208 yes ipv6 y y y y y y y y y y y y y n y y y y y
TCP 1696 508 45458 no 208 yes kernel y y y y y y y y y y y y y n y y y y y
lpq83:~# echo 131072 >/proc/sys/net/ipv4/tcp_notsent_lowat
lpq83:~# (super_netperf 200 -t TCP_STREAM -H remote -l 90 &); sleep 60 ; grep TCP /proc/net/protocols
TCPv6 1880 2 20567 no 208 yes ipv6 y y y y y y y y y y y y y n y y y y y
TCP 1696 508 20567 no 208 yes kernel y y y y y y y y y y y y y n y y y y y
Using 128KB has no bad effect on the throughput or cpu usage
of a single flow, although there is an increase of context switches.
A bonus is that we hold socket lock for a shorter amount
of time and should improve latencies of ACK processing.
lpq83:~# echo -1 >/proc/sys/net/ipv4/tcp_notsent_lowat
lpq83:~# perf stat -e context-switches ./netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3
OMNI Send TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : +/-2.500% @ 99% conf.
Local Remote Local Elapsed Throughput Throughput Local Local Remote Remote Local Remote Service
Send Socket Recv Socket Send Time Units CPU CPU CPU CPU Service Service Demand
Size Size Size (sec) Util Util Util Util Demand Demand Units
Final Final % Method % Method
1651584 6291456 16384 20.00 17447.90 10^6bits/s 3.13 S -1.00 U 0.353 -1.000 usec/KB
Performance counter stats for './netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3':
412,514 context-switches
200.034645535 seconds time elapsed
lpq83:~# echo 131072 >/proc/sys/net/ipv4/tcp_notsent_lowat
lpq83:~# perf stat -e context-switches ./netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3
OMNI Send TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : +/-2.500% @ 99% conf.
Local Remote Local Elapsed Throughput Throughput Local Local Remote Remote Local Remote Service
Send Socket Recv Socket Send Time Units CPU CPU CPU CPU Service Service Demand
Size Size Size (sec) Util Util Util Util Demand Demand Units
Final Final % Method % Method
1593240 6291456 16384 20.00 17321.16 10^6bits/s 3.35 S -1.00 U 0.381 -1.000 usec/KB
Performance counter stats for './netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3':
2,675,818 context-switches
200.029651391 seconds time elapsed
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-By: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-07-23 11:27:07 +08:00
|
|
|
#define TCP_NOTSENT_LOWAT	25	/* limit number of unsent bytes in write queue */
#define TCP_CC_INFO		26	/* Get Congestion Control (optional) info */
#define TCP_SAVE_SYN		27	/* Record SYN headers for new connections */
#define TCP_SAVED_SYN		28	/* Get SYN headers recorded for connection */
#define TCP_REPAIR_WINDOW	29	/* Get/set window parameters */
|
net/tcp-fastopen: Add new API support
This patch adds a new socket option, TCP_FASTOPEN_CONNECT, as an
alternative way to perform Fast Open on the active side (client). Prior
to this patch, a client needs to replace the connect() call with
sendto(MSG_FASTOPEN). This can be cumbersome for applications who want
to use Fast Open: these socket operations are often done in lower layer
libraries used by many other applications. Changing these libraries
and/or the socket call sequences are not trivial. A more convenient
approach is to perform Fast Open by simply enabling a socket option when
the socket is created w/o changing other socket calls sequence:
s = socket()
create a new socket
setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN_CONNECT …);
newly introduced sockopt
If set, new functionality described below will be used.
Return ENOTSUPP if TFO is not supported or not enabled in the
kernel.
connect()
With cookie present, return 0 immediately.
With no cookie, initiate 3WHS with TFO cookie-request option and
return -1 with errno = EINPROGRESS.
write()/sendmsg()
With cookie present, send out SYN with data and return the number of
bytes buffered.
With no cookie, and 3WHS not yet completed, return -1 with errno =
EINPROGRESS.
No MSG_FASTOPEN flag is needed.
read()
Return -1 with errno = EWOULDBLOCK/EAGAIN if connect() is called but
write() is not called yet.
Return -1 with errno = EWOULDBLOCK/EAGAIN if connection is
established but no msg is received yet.
Return number of bytes read if socket is established and there is
msg received.
The new API simplifies life for applications that always perform a write()
immediately after a successful connect(). Such applications can now take
advantage of Fast Open by merely making one new setsockopt() call at the time
of creating the socket. Nothing else about the application's socket call
sequence needs to change.
Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-01-24 02:59:22 +08:00
|
|
|
#define TCP_FASTOPEN_CONNECT	30	/* Attempt FastOpen with connect */
#define TCP_ULP			31	/* Attach a ULP to a TCP connection */
#define TCP_MD5SIG_EXT		32	/* TCP MD5 Signature with extensions */
#define TCP_FASTOPEN_KEY	33	/* Set the key for Fast Open (cookie) */
#define TCP_FASTOPEN_NO_COOKIE	34	/* Enable TFO without a TFO cookie */
|
tcp: add TCP_ZEROCOPY_RECEIVE support for zerocopy receive
When adding tcp mmap() implementation, I forgot that socket lock
had to be taken before current->mm->mmap_sem. syzbot eventually caught
the bug.
Since we can not lock the socket in tcp mmap() handler we have to
split the operation in two phases.
1) mmap() on a tcp socket simply reserves VMA space, and nothing else.
This operation does not involve any TCP locking.
2) getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) implements
the transfer of pages from skbs to one VMA.
This operation only uses down_read(&current->mm->mmap_sem) after
holding TCP lock, thus solving the lockdep issue.
This new implementation was suggested by Andy Lutomirski with great details.
Benefits are :
- Better scalability, in case multiple threads reuse VMAS
(without mmap()/munmap() calls) since mmap_sem wont be write locked.
- Better error recovery.
The previous mmap() model had to provide the expected size of the
mapping. If for some reason one part could not be mapped (partial MSS),
the whole operation had to be aborted.
With the tcp_zerocopy_receive struct, kernel can report how
many bytes were successfully mapped, and how many bytes should
be read to skip the problematic sequence.
- No more memory allocation to hold an array of page pointers.
16 MB mappings needed 32 KB for this array, potentially using vmalloc() :/
- skbs are freed while mmap_sem has been released
Following patch makes the change in tcp_mmap tool to demonstrate
one possible use of mmap() and setsockopt(... TCP_ZEROCOPY_RECEIVE ...)
Note that memcg might require additional changes.
Fixes: 93ab6cc69162 ("tcp: implement mmap() for zero copy receive")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Suggested-by: Andy Lutomirski <luto@kernel.org>
Cc: linux-mm@kvack.org
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-27 23:58:08 +08:00
|
|
|
#define TCP_ZEROCOPY_RECEIVE	35	/* getsockopt(): map received pages into a VMA reserved by mmap() */
|
tcp: send in-queue bytes in cmsg upon read
Applications with many concurrent connections, high variance
in receive queue length and tight memory bounds cannot
allocate worst-case buffer size to drain sockets. Knowing
the size of receive queue length, applications can optimize
how they allocate buffers to read from the socket.
The number of bytes pending on the socket is directly
available through ioctl(FIONREAD/SIOCINQ) and can be
approximated using getsockopt(MEMINFO) (rmem_alloc includes
skb overheads in addition to application data). But, both of
these options add an extra syscall per recvmsg. Moreover,
ioctl(FIONREAD/SIOCINQ) takes the socket lock.
Add the TCP_INQ socket option to TCP. When this socket
option is set, recvmsg() relays the number of bytes available
on the socket for reading to the application via the
TCP_CM_INQ control message.
Calculate the number of bytes after releasing the socket lock
to include the processed backlog, if any. To avoid an extra
branch in the hot path of recvmsg() for this new control
message, move all cmsg processing inside an existing branch for
processing receive timestamps. Since the socket lock is not held
when calculating the size of receive queue, TCP_INQ is a hint.
For example, it can overestimate the queue size by one byte,
if FIN is received.
With this method, applications can start reading from the socket
using a small buffer, and then use larger buffers based on the
remaining data when needed.
V3 change-log:
As suggested by David Miller, added loads with barrier
to check whether we have multiple threads calling recvmsg
in parallel. When that happens we lock the socket to
calculate inq.
V4 change-log:
Removed inline from a static function.
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Suggested-by: David Miller <davem@davemloft.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-02 03:39:15 +08:00
|
|
|
#define TCP_INQ			36	/* Notify bytes available to read as a cmsg on read */

/* Control-message type carrying the TCP_INQ byte count; same value as TCP_INQ. */
#define TCP_CM_INQ		TCP_INQ
|
2012-10-13 17:46:48 +08:00
|
|
|
|
tcp: add optional per socket transmit delay
Adding delays to TCP flows is crucial for studying behavior
of TCP stacks, including congestion control modules.
Linux offers netem module, but it has unpractical constraints :
- Need root access to change qdisc
- Hard to setup on egress if combined with non trivial qdisc like FQ
- Single delay for all flows.
EDT (Earliest Departure Time) adoption in TCP stack allows us
to enable a per socket delay at a very small cost.
Networking tools can now establish thousands of flows, each of them
with a different delay, simulating real world conditions.
This requires FQ packet scheduler or a EDT-enabled NIC.
This patch adds TCP_TX_DELAY socket option, to set a delay in
usec units.
unsigned int tx_delay = 10000; /* 10 msec */
setsockopt(fd, SOL_TCP, TCP_TX_DELAY, &tx_delay, sizeof(tx_delay));
Note that FQ packet scheduler limits might need some tweaking :
man tc-fq
PARAMETERS
limit
Hard limit on the real queue size. When this limit is
reached, new packets are dropped. If the value is lowered,
packets are dropped so that the new limit is met. Default
is 10000 packets.
flow_limit
Hard limit on the maximum number of packets queued per
flow. Default value is 100.
Use of TCP_TX_DELAY option will increase number of skbs in FQ qdisc,
so packets would be dropped if any of the previous limit is hit.
Use of a jump label makes this support runtime-free, for hosts
never using the option.
Also note that TSQ (TCP Small Queues) limits are slightly changed
with this patch : we need to account that skbs artificially delayed
won't stop us providing more skbs to feed the pipe (netem uses
skb_orphan_partial() for this purpose, but FQ can not use this trick)
Because of that, using big delays might very well trigger
old bugs in TSO auto defer logic and/or sndbuf limited detection.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-13 02:57:25 +08:00
|
|
|
#define TCP_TX_DELAY		37	/* Delay outgoing packets; value is in usec */
|
|
|
|
|
|
|
|
|
2018-07-15 23:36:37 +08:00
|
|
|
/* Named values for the TCP_REPAIR_* modes defined below. */
#define TCP_REPAIR_ON		1
#define TCP_REPAIR_OFF		0
/* Parenthesized so the negative value stays a single operand in any expression. */
#define TCP_REPAIR_OFF_NO_WP	(-1)	/* Turn off without window probes */
|
|
|
|
|
2012-10-13 17:46:48 +08:00
|
|
|
struct tcp_repair_opt {
|
|
|
|
__u32 opt_code;
|
|
|
|
__u32 opt_val;
|
|
|
|
};
|
|
|
|
|
2016-06-28 06:33:56 +08:00
|
|
|
struct tcp_repair_window {
|
|
|
|
__u32 snd_wl1;
|
|
|
|
__u32 snd_wnd;
|
|
|
|
__u32 max_window;
|
|
|
|
|
|
|
|
__u32 rcv_wnd;
|
|
|
|
__u32 rcv_wup;
|
|
|
|
};
|
|
|
|
|
2012-10-13 17:46:48 +08:00
|
|
|
/*
 * Queue identifiers, numbered from 0; TCP_QUEUES_NR is the count of the
 * entries before it.
 * NOTE(review): presumably the values accepted by TCP_REPAIR_QUEUE — confirm.
 */
enum {
	TCP_NO_QUEUE,
	TCP_RECV_QUEUE,
	TCP_SEND_QUEUE,
	TCP_QUEUES_NR,
};
|
|
|
|
|
tcp: add TCP_INFO status for failed client TFO
The TCPI_OPT_SYN_DATA bit as part of tcpi_options currently reports whether
or not data-in-SYN was ack'd on both the client and server side. We'd like
to gather more information on the client-side in the failure case in order
to indicate the reason for the failure. This can be useful for not only
debugging TFO, but also for creating TFO socket policies. For example, if
a middle box removes the TFO option or drops a data-in-SYN, we can
detect this case, and turn off TFO for these connections saving the
extra retransmits.
The newly added tcpi_fastopen_client_fail status is 2 bits and has the
following 4 states:
1) TFO_STATUS_UNSPEC
Catch-all state which includes when TFO is disabled via black hole
detection, which is indicated via LINUX_MIB_TCPFASTOPENBLACKHOLE.
2) TFO_COOKIE_UNAVAILABLE
If TFO_CLIENT_NO_COOKIE mode is off, this state indicates that no cookie
is available in the cache.
3) TFO_DATA_NOT_ACKED
Data was sent with SYN, we received a SYN/ACK but it did not cover the data
portion. Cookie is not accepted by server because the cookie may be invalid
or the server may be overloaded.
4) TFO_SYN_RETRANSMITTED
Data was sent with SYN, we received a SYN/ACK which did not cover the data
after at least 1 additional SYN was sent (without data). It may be the case
that a middle-box is dropping data-in-SYN packets. Thus, it would be more
efficient to not use TFO on this connection to avoid extra retransmits
during connection establishment.
These new fields do not cover all the cases where TFO may fail, but other
failures, such as SYN/ACK + data being dropped, will result in the
connection not becoming established. And a connection blackhole after
session establishment shows up as a stalled connection.
Signed-off-by: Jason Baron <jbaron@akamai.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Christoph Paasch <cpaasch@apple.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-10-23 23:09:26 +08:00
|
|
|
/*
 * Why fastopen failed from the client's perspective.
 * Four states, so the value fits the 2-bit tcpi_fastopen_client_fail field
 * of struct tcp_info.
 */
enum tcp_fastopen_client_fail {
	TFO_STATUS_UNSPEC, /* catch-all */
	TFO_COOKIE_UNAVAILABLE, /* if not in TFO_CLIENT_NO_COOKIE mode */
	TFO_DATA_NOT_ACKED, /* SYN-ACK did not ack SYN data */
	TFO_SYN_RETRANSMITTED, /* SYN-ACK did not ack SYN data after timeout */
};
|
|
|
|
|
2012-10-13 17:46:48 +08:00
|
|
|
/* for TCP_INFO socket option: single-bit flag values */
#define TCPI_OPT_TIMESTAMPS	1
#define TCPI_OPT_SACK		2
#define TCPI_OPT_WSCALE		4
#define TCPI_OPT_ECN		8 /* ECN was negotiated at TCP session init */
#define TCPI_OPT_ECN_SEEN	16 /* we received at least one packet with ECT */
#define TCPI_OPT_SYN_DATA	32 /* SYN-ACK acked data in SYN sent or rcvd */
|
2012-10-13 17:46:48 +08:00
|
|
|
|
2019-03-22 22:59:47 +08:00
|
|
|
/*
 * Sender's congestion state indicating normal or abnormal situations
 * in the last round of packets sent. The state is driven by the ACK
 * information and timer events.
 *
 * Each state TCP_CA_x has a companion bit mask TCPF_CA_x == (1 << TCP_CA_x).
 */
enum tcp_ca_state {
	/*
	 * Nothing bad has been observed recently.
	 * No apparent reordering, packet loss, or ECN marks.
	 */
	TCP_CA_Open = 0,
#define TCPF_CA_Open	(1<<TCP_CA_Open)
	/*
	 * The sender enters disordered state when it has received DUPACKs or
	 * SACKs in the last round of packets sent. This could be due to packet
	 * loss or reordering but needs further information to confirm packets
	 * have been lost.
	 */
	TCP_CA_Disorder = 1,
#define TCPF_CA_Disorder (1<<TCP_CA_Disorder)
	/*
	 * The sender enters Congestion Window Reduction (CWR) state when it
	 * has received ACKs with ECN-ECE marks, or has experienced congestion
	 * or packet discard on the sender host (e.g. qdisc).
	 */
	TCP_CA_CWR = 2,
#define TCPF_CA_CWR	(1<<TCP_CA_CWR)
	/*
	 * The sender is in fast recovery and retransmitting lost packets,
	 * typically triggered by ACK events.
	 */
	TCP_CA_Recovery = 3,
#define TCPF_CA_Recovery (1<<TCP_CA_Recovery)
	/*
	 * The sender is in loss recovery triggered by retransmission timeout.
	 */
	TCP_CA_Loss = 4
#define TCPF_CA_Loss	(1<<TCP_CA_Loss)
};
|
|
|
|
|
|
|
|
struct tcp_info {
|
|
|
|
__u8 tcpi_state;
|
|
|
|
__u8 tcpi_ca_state;
|
|
|
|
__u8 tcpi_retransmits;
|
|
|
|
__u8 tcpi_probes;
|
|
|
|
__u8 tcpi_backoff;
|
|
|
|
__u8 tcpi_options;
|
|
|
|
__u8 tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
|
tcp: add TCP_INFO status for failed client TFO
The TCPI_OPT_SYN_DATA bit as part of tcpi_options currently reports whether
or not data-in-SYN was ack'd on both the client and server side. We'd like
to gather more information on the client-side in the failure case in order
to indicate the reason for the failure. This can be useful for not only
debugging TFO, but also for creating TFO socket policies. For example, if
a middle box removes the TFO option or drops a data-in-SYN, we can
can detect this case, and turn off TFO for these connections saving the
extra retransmits.
The newly added tcpi_fastopen_client_fail status is 2 bits and has the
following 4 states:
1) TFO_STATUS_UNSPEC
Catch-all state which includes when TFO is disabled via black hole
detection, which is indicated via LINUX_MIB_TCPFASTOPENBLACKHOLE.
2) TFO_COOKIE_UNAVAILABLE
If TFO_CLIENT_NO_COOKIE mode is off, this state indicates that no cookie
is available in the cache.
3) TFO_DATA_NOT_ACKED
Data was sent with SYN, we received a SYN/ACK but it did not cover the data
portion. Cookie is not accepted by server because the cookie may be invalid
or the server may be overloaded.
4) TFO_SYN_RETRANSMITTED
Data was sent with SYN, we received a SYN/ACK which did not cover the data
after at least 1 additional SYN was sent (without data). It may be the case
that a middle-box is dropping data-in-SYN packets. Thus, it would be more
efficient to not use TFO on this connection to avoid extra retransmits
during connection establishment.
These new fields do not cover all the cases where TFO may fail, but other
failures, such as SYN/ACK + data being dropped, will result in the
connection not becoming established. And a connection blackhole after
session establishment shows up as a stalled connection.
Signed-off-by: Jason Baron <jbaron@akamai.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Christoph Paasch <cpaasch@apple.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-10-23 23:09:26 +08:00
|
|
|
__u8 tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2;
|
2012-10-13 17:46:48 +08:00
|
|
|
|
|
|
|
__u32 tcpi_rto;
|
|
|
|
__u32 tcpi_ato;
|
|
|
|
__u32 tcpi_snd_mss;
|
|
|
|
__u32 tcpi_rcv_mss;
|
|
|
|
|
|
|
|
__u32 tcpi_unacked;
|
|
|
|
__u32 tcpi_sacked;
|
|
|
|
__u32 tcpi_lost;
|
|
|
|
__u32 tcpi_retrans;
|
|
|
|
__u32 tcpi_fackets;
|
|
|
|
|
|
|
|
/* Times. */
|
|
|
|
__u32 tcpi_last_data_sent;
|
|
|
|
__u32 tcpi_last_ack_sent; /* Not remembered, sorry. */
|
|
|
|
__u32 tcpi_last_data_recv;
|
|
|
|
__u32 tcpi_last_ack_recv;
|
|
|
|
|
|
|
|
/* Metrics. */
|
|
|
|
__u32 tcpi_pmtu;
|
|
|
|
__u32 tcpi_rcv_ssthresh;
|
|
|
|
__u32 tcpi_rtt;
|
|
|
|
__u32 tcpi_rttvar;
|
|
|
|
__u32 tcpi_snd_ssthresh;
|
|
|
|
__u32 tcpi_snd_cwnd;
|
|
|
|
__u32 tcpi_advmss;
|
|
|
|
__u32 tcpi_reordering;
|
|
|
|
|
|
|
|
__u32 tcpi_rcv_rtt;
|
|
|
|
__u32 tcpi_rcv_space;
|
|
|
|
|
|
|
|
__u32 tcpi_total_retrans;
|
tcp: add pacing_rate information into tcp_info
Add two new fields to struct tcp_info, to report sk_pacing_rate
and sk_max_pacing_rate to monitoring applications, as ss from iproute2.
User exported fields are 64bit, even if kernel is currently using 32bit
fields.
lpaa5:~# ss -i
..
skmem:(r0,rb357120,t0,tb2097152,f1584,w1980880,o0,bl0) ts sack cubic
wscale:6,6 rto:400 rtt:0.875/0.75 mss:1448 cwnd:1 ssthresh:12 send
13.2Mbps pacing_rate 3336.2Mbps unacked:15 retrans:1/5448 lost:15
rcv_space:29200
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-14 06:27:40 +08:00
|
|
|
|
|
|
|
__u64 tcpi_pacing_rate;
|
|
|
|
__u64 tcpi_max_pacing_rate;
|
2015-05-21 07:35:41 +08:00
|
|
|
__u64 tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
|
2015-04-29 06:28:18 +08:00
|
|
|
__u64 tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
|
2015-05-21 07:35:41 +08:00
|
|
|
__u32 tcpi_segs_out; /* RFC4898 tcpEStatsPerfSegsOut */
|
|
|
|
__u32 tcpi_segs_in; /* RFC4898 tcpEStatsPerfSegsIn */
|
2016-02-12 14:02:53 +08:00
|
|
|
|
|
|
|
__u32 tcpi_notsent_bytes;
|
|
|
|
__u32 tcpi_min_rtt;
|
2016-03-15 01:52:15 +08:00
|
|
|
__u32 tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */
|
|
|
|
__u32 tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */
|
2016-09-20 11:39:16 +08:00
|
|
|
|
|
|
|
__u64 tcpi_delivery_rate;
|
2016-11-28 15:07:17 +08:00
|
|
|
|
|
|
|
__u64 tcpi_busy_time; /* Time (usec) busy sending data */
|
|
|
|
__u64 tcpi_rwnd_limited; /* Time (usec) limited by receive window */
|
|
|
|
__u64 tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
|
2018-04-18 14:18:49 +08:00
|
|
|
|
|
|
|
__u32 tcpi_delivered;
|
|
|
|
__u32 tcpi_delivered_ce;
|
2018-08-01 08:46:21 +08:00
|
|
|
|
|
|
|
__u64 tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */
|
2018-08-01 08:46:22 +08:00
|
|
|
__u64 tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */
|
2018-08-01 08:46:23 +08:00
|
|
|
__u32 tcpi_dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups */
|
2018-08-01 08:46:24 +08:00
|
|
|
__u32 tcpi_reord_seen; /* reordering events seen */
|
2019-09-14 07:23:34 +08:00
|
|
|
|
|
|
|
__u32 tcpi_rcv_ooopack; /* Out-of-order packets received */
|
2019-09-14 07:23:35 +08:00
|
|
|
|
|
|
|
__u32 tcpi_snd_wnd; /* peer's advertised receive window after
|
|
|
|
* scaling (bytes)
|
|
|
|
*/
|
2012-10-13 17:46:48 +08:00
|
|
|
};
|
|
|
|
|
2016-11-28 15:07:18 +08:00
|
|
|
/*
 * netlink attributes types for SCM_TIMESTAMPING_OPT_STATS
 *
 * Values are assigned implicitly from 0 in declaration order.
 */
enum {
	TCP_NLA_PAD,
	TCP_NLA_BUSY,		/* Time (usec) busy sending data */
	TCP_NLA_RWND_LIMITED,	/* Time (usec) limited by receive window */
	TCP_NLA_SNDBUF_LIMITED,	/* Time (usec) limited by send buffer */
	TCP_NLA_DATA_SEGS_OUT,	/* Data pkts sent including retransmission */
	TCP_NLA_TOTAL_RETRANS,	/* Data pkts retransmitted */
	TCP_NLA_PACING_RATE,    /* Pacing rate in bytes per second */
	TCP_NLA_DELIVERY_RATE,  /* Delivery rate in bytes per second */
	TCP_NLA_SND_CWND,       /* Sending congestion window */
	TCP_NLA_REORDERING,     /* Reordering metric */
	TCP_NLA_MIN_RTT,        /* minimum RTT */
	TCP_NLA_RECUR_RETRANS,  /* Recurring retransmits for the current pkt */
	TCP_NLA_DELIVERY_RATE_APP_LMT, /* delivery rate application limited ? */
	TCP_NLA_SNDQ_SIZE,	/* Data (bytes) pending in send queue */
	TCP_NLA_CA_STATE,	/* ca_state of socket */
	TCP_NLA_SND_SSTHRESH,	/* Slow start size threshold */
	TCP_NLA_DELIVERED,	/* Data pkts delivered incl. out-of-order */
	TCP_NLA_DELIVERED_CE,	/* Like above but only ones w/ CE marks */
	TCP_NLA_BYTES_SENT,	/* Data bytes sent including retransmission */
	TCP_NLA_BYTES_RETRANS,	/* Data bytes retransmitted */
	TCP_NLA_DSACK_DUPS,	/* DSACK blocks received */
	TCP_NLA_REORD_SEEN,	/* reordering events seen */
	TCP_NLA_SRTT,		/* smoothed RTT in usecs */
	TCP_NLA_TIMEOUT_REHASH, /* Timeout-triggered rehash attempts */
	TCP_NLA_BYTES_NOTSENT,	/* Bytes in write queue not yet sent */
};
|
|
|
|
|
2012-10-13 17:46:48 +08:00
|
|
|
/* for TCP_MD5SIG socket option */
#define TCP_MD5SIG_MAXKEYLEN	80

/* tcp_md5sig extension flags for TCP_MD5SIG_EXT */
#define TCP_MD5SIG_FLAG_PREFIX		0x1	/* address prefix length */
#define TCP_MD5SIG_FLAG_IFINDEX		0x2	/* ifindex set */
|
2017-06-16 09:07:07 +08:00
|
|
|
|
2012-10-13 17:46:48 +08:00
|
|
|
struct tcp_md5sig {
|
|
|
|
struct __kernel_sockaddr_storage tcpm_addr; /* address associated */
|
2017-06-16 09:07:07 +08:00
|
|
|
__u8 tcpm_flags; /* extension flags */
|
|
|
|
__u8 tcpm_prefixlen; /* address prefix */
|
2012-10-13 17:46:48 +08:00
|
|
|
__u16 tcpm_keylen; /* key length */
|
2019-12-31 06:14:29 +08:00
|
|
|
int tcpm_ifindex; /* device index for scope */
|
2012-10-13 17:46:48 +08:00
|
|
|
__u8 tcpm_key[TCP_MD5SIG_MAXKEYLEN]; /* key (binary) */
|
|
|
|
};
|
|
|
|
|
2017-09-01 00:59:39 +08:00
|
|
|
/* INET_DIAG_MD5SIG */
|
|
|
|
struct tcp_diag_md5sig {
|
|
|
|
__u8 tcpm_family;
|
|
|
|
__u8 tcpm_prefixlen;
|
|
|
|
__u16 tcpm_keylen;
|
|
|
|
__be32 tcpm_addr[4];
|
|
|
|
__u8 tcpm_key[TCP_MD5SIG_MAXKEYLEN];
|
|
|
|
};
|
|
|
|
|
tcp: add TCP_ZEROCOPY_RECEIVE support for zerocopy receive
When adding tcp mmap() implementation, I forgot that socket lock
had to be taken before current->mm->mmap_sem. syzbot eventually caught
the bug.
Since we can not lock the socket in tcp mmap() handler we have to
split the operation in two phases.
1) mmap() on a tcp socket simply reserves VMA space, and nothing else.
This operation does not involve any TCP locking.
2) getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) implements
the transfer of pages from skbs to one VMA.
This operation only uses down_read(&current->mm->mmap_sem) after
holding TCP lock, thus solving the lockdep issue.
This new implementation was suggested by Andy Lutomirski with great details.
Benefits are :
- Better scalability, in case multiple threads reuse VMAS
(without mmap()/munmap() calls) since mmap_sem won't be write locked.
- Better error recovery.
The previous mmap() model had to provide the expected size of the
mapping. If for some reason one part could not be mapped (partial MSS),
the whole operation had to be aborted.
With the tcp_zerocopy_receive struct, kernel can report how
many bytes were successfully mapped, and how many bytes should
be read to skip the problematic sequence.
- No more memory allocation to hold an array of page pointers.
16 MB mappings needed 32 KB for this array, potentially using vmalloc() :/
- skbs are freed while mmap_sem has been released
Following patch makes the change in tcp_mmap tool to demonstrate
one possible use of mmap() and getsockopt(... TCP_ZEROCOPY_RECEIVE ...)
Note that memcg might require additional changes.
Fixes: 93ab6cc69162 ("tcp: implement mmap() for zero copy receive")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Suggested-by: Andy Lutomirski <luto@kernel.org>
Cc: linux-mm@kvack.org
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-27 23:58:08 +08:00
|
|
|
/* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */
|
|
|
|
|
|
|
|
struct tcp_zerocopy_receive {
|
|
|
|
__u64 address; /* in: address of mapping */
|
|
|
|
__u32 length; /* in/out: number of bytes to map/mapped */
|
|
|
|
__u32 recv_skip_hint; /* out: amount of bytes to skip */
|
2020-02-15 07:30:49 +08:00
|
|
|
__u32 inq; /* out: amount of bytes in read queue */
|
2020-02-15 07:30:50 +08:00
|
|
|
__s32 err; /* out: socket error */
|
tcp: add TCP_ZEROCOPY_RECEIVE support for zerocopy receive
When adding tcp mmap() implementation, I forgot that socket lock
had to be taken before current->mm->mmap_sem. syzbot eventually caught
the bug.
Since we can not lock the socket in tcp mmap() handler we have to
split the operation in two phases.
1) mmap() on a tcp socket simply reserves VMA space, and nothing else.
This operation does not involve any TCP locking.
2) getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) implements
the transfert of pages from skbs to one VMA.
This operation only uses down_read(¤t->mm->mmap_sem) after
holding TCP lock, thus solving the lockdep issue.
This new implementation was suggested by Andy Lutomirski with great details.
Benefits are :
- Better scalability, in case multiple threads reuse VMAS
(without mmap()/munmap() calls) since mmap_sem wont be write locked.
- Better error recovery.
The previous mmap() model had to provide the expected size of the
mapping. If for some reason one part could not be mapped (partial MSS),
the whole operation had to be aborted.
With the tcp_zerocopy_receive struct, kernel can report how
many bytes were successfuly mapped, and how many bytes should
be read to skip the problematic sequence.
- No more memory allocation to hold an array of page pointers.
16 MB mappings needed 32 KB for this array, potentially using vmalloc() :/
- skbs are freed while mmap_sem has been released
Following patch makes the change in tcp_mmap tool to demonstrate
one possible use of mmap() and setsockopt(... TCP_ZEROCOPY_RECEIVE ...)
Note that memcg might require additional changes.
Fixes: 93ab6cc69162 ("tcp: implement mmap() for zero copy receive")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Suggested-by: Andy Lutomirski <luto@kernel.org>
Cc: linux-mm@kvack.org
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-27 23:58:08 +08:00
|
|
|
};
|
2012-10-13 17:46:48 +08:00
|
|
|
#endif /* _UAPI_LINUX_TCP_H */
|