tcp: reduce POLLOUT events caused by TCP_NOTSENT_LOWAT

The TCP_NOTSENT_LOWAT socket option (and its sysctl) was added in
linux-3.12 as a step towards enabling bigger tcp sndbuf limits.

It works reasonably well, but it has the following drawback:

Once the limit is reached, the TCP stack generates
an [E]POLLOUT event for every incoming ACK packet.

This causes a high number of context switches.

This patch implements the strategy David Miller added
in sock_def_write_space() :

 - If a TCP socket has a notsent_lowat constraint of X bytes,
   allow sendmsg() to fill up to X bytes, but send [E]POLLOUT
   only if the number of notsent bytes is below X/2

This considerably reduces TCP_NOTSENT_LOWAT overhead,
while still allowing the pipe to be kept full.

Tested:
 100 ms RTT netem testbed between A and B, 100 concurrent TCP_STREAM

A:/# cat /proc/sys/net/ipv4/tcp_wmem
4096	262144	64000000
A:/# super_netperf 100 -H B -l 1000 -- -K bbr &

A:/# grep TCP /proc/net/sockstat
TCP: inuse 203 orphan 0 tw 19 alloc 414 mem 1364904 # This is about 54 MB of memory per flow :/

A:/# vmstat 5 5
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 0  0      0 256220672  13532 694976    0    0    10     0   28   14  0  1 99  0  0
 2  0      0 256320016  13532 698480    0    0   512     0 715901 5927  0 10 90  0  0
 0  0      0 256197232  13532 700992    0    0   735    13 771161 5849  0 11 89  0  0
 1  0      0 256233824  13532 703320    0    0   512    23 719650 6635  0 11 89  0  0
 2  0      0 256226880  13532 705780    0    0   642     4 775650 6009  0 12 88  0  0

A:/# echo 2097152 >/proc/sys/net/ipv4/tcp_notsent_lowat

A:/# grep TCP /proc/net/sockstat
TCP: inuse 203 orphan 0 tw 19 alloc 414 mem 86411 # 3.5 MB per flow

A:/# vmstat 5 5  # check that context switches have not inflated too much.
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 2  0      0 260386512  13592 662148    0    0    10     0   17   14  0  1 99  0  0
 0  0      0 260519680  13592 604184    0    0   512    13 726843 12424  0 10 90  0  0
 1  1      0 260435424  13592 598360    0    0   512    25 764645 12925  0 10 90  0  0
 1  0      0 260855392  13592 578380    0    0   512     7 722943 13624  0 11 88  0  0
 1  0      0 260445008  13592 601176    0    0   614    34 772288 14317  0 10 90  0  0

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Eric Dumazet 2018-12-04 07:58:17 -08:00 committed by David S. Miller
parent 4dc88ce672
commit a74f0fa082
3 changed files with 22 additions and 8 deletions

View File

@ -1110,7 +1110,7 @@ struct proto {
unsigned int inuse_idx; unsigned int inuse_idx;
#endif #endif
bool (*stream_memory_free)(const struct sock *sk); bool (*stream_memory_free)(const struct sock *sk, int wake);
bool (*stream_memory_read)(const struct sock *sk); bool (*stream_memory_read)(const struct sock *sk);
/* Memory pressure */ /* Memory pressure */
void (*enter_memory_pressure)(struct sock *sk); void (*enter_memory_pressure)(struct sock *sk);
@ -1192,19 +1192,29 @@ static inline void sk_refcnt_debug_release(const struct sock *sk)
#define sk_refcnt_debug_release(sk) do { } while (0) #define sk_refcnt_debug_release(sk) do { } while (0)
#endif /* SOCK_REFCNT_DEBUG */ #endif /* SOCK_REFCNT_DEBUG */
static inline bool sk_stream_memory_free(const struct sock *sk) static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
{ {
if (sk->sk_wmem_queued >= sk->sk_sndbuf) if (sk->sk_wmem_queued >= sk->sk_sndbuf)
return false; return false;
return sk->sk_prot->stream_memory_free ? return sk->sk_prot->stream_memory_free ?
sk->sk_prot->stream_memory_free(sk) : true; sk->sk_prot->stream_memory_free(sk, wake) : true;
}
static inline bool sk_stream_memory_free(const struct sock *sk)
{
return __sk_stream_memory_free(sk, 0);
}
static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake)
{
return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) &&
__sk_stream_memory_free(sk, wake);
} }
static inline bool sk_stream_is_writeable(const struct sock *sk) static inline bool sk_stream_is_writeable(const struct sock *sk)
{ {
return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && return __sk_stream_is_writeable(sk, 0);
sk_stream_memory_free(sk);
} }
static inline int sk_under_cgroup_hierarchy(struct sock *sk, static inline int sk_under_cgroup_hierarchy(struct sock *sk,

View File

@ -1870,12 +1870,16 @@ static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp)
return tp->notsent_lowat ?: net->ipv4.sysctl_tcp_notsent_lowat; return tp->notsent_lowat ?: net->ipv4.sysctl_tcp_notsent_lowat;
} }
static inline bool tcp_stream_memory_free(const struct sock *sk) /* @wake is one when sk_stream_write_space() calls us.
* This sends EPOLLOUT only if notsent_bytes is below half the limit.
* This mimics the strategy used in sock_def_write_space().
*/
static inline bool tcp_stream_memory_free(const struct sock *sk, int wake)
{ {
const struct tcp_sock *tp = tcp_sk(sk); const struct tcp_sock *tp = tcp_sk(sk);
u32 notsent_bytes = tp->write_seq - tp->snd_nxt; u32 notsent_bytes = tp->write_seq - tp->snd_nxt;
return notsent_bytes < tcp_notsent_lowat(tp); return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
} }
#ifdef CONFIG_PROC_FS #ifdef CONFIG_PROC_FS

View File

@ -32,7 +32,7 @@ void sk_stream_write_space(struct sock *sk)
struct socket *sock = sk->sk_socket; struct socket *sock = sk->sk_socket;
struct socket_wq *wq; struct socket_wq *wq;
if (sk_stream_is_writeable(sk) && sock) { if (__sk_stream_is_writeable(sk, 1) && sock) {
clear_bit(SOCK_NOSPACE, &sock->flags); clear_bit(SOCK_NOSPACE, &sock->flags);
rcu_read_lock(); rcu_read_lock();