tcp: schedule EPOLLOUT after a partial sendmsg
For EPOLLET, applications must call sendmsg until they get EAGAIN.
Otherwise, there is no guarantee that EPOLLOUT is sent if there was
a failure upon memory allocation. As a result, on high-speed NICs,
userspace observes multiple small sendmsgs after a partial sendmsg
until EAGAIN, since TCP can send 1-2 TSOs in between two sendmsg
syscalls:

// One large partial send due to memory allocation failure.
sendmsg(20MB) = 2MB

// Many small sends until EAGAIN.
sendmsg(18MB) = 64KB
sendmsg(17.9MB) = 128KB
sendmsg(17.8MB) = 64KB
...
sendmsg(...) = EAGAIN

// At this point, userspace can assume an EPOLLOUT.

To fix this, set the SOCK_NOSPACE flag in all partial-sendmsg
scenarios to guarantee that an EPOLLOUT is sent after a partial
sendmsg.

After this commit, userspace can assume that it will receive an
EPOLLOUT after the first partial sendmsg. This EPOLLOUT will benefit
from the sk_stream_write_space() logic, which delays the EPOLLOUT
until significant space is available in the write queue.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
commit afb83012cc
parent 8ba3c9d1c6
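To illustrate the userspace pattern this guarantee enables, here is a
minimal hypothetical sketch of an edge-triggered writer (the helper name
write_some, the buffer handling, and the flags are illustrative
assumptions, not part of the patch). After this change, a short
sendmsg() return can be treated like EAGAIN: a safe point to stop
writing and wait for the next EPOLLOUT.

/* Hypothetical EPOLLET writer sketch; not part of the kernel patch.
 * The socket is assumed nonblocking and registered with
 * EPOLLOUT | EPOLLET via epoll_ctl(). */
#include <errno.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Push as much of buf as the kernel accepts; returns bytes consumed.
 * The caller resumes from the returned offset on the next EPOLLOUT. */
static ssize_t write_some(int fd, const void *buf, size_t len)
{
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
	ssize_t sent = sendmsg(fd, &msg, MSG_DONTWAIT);

	if (sent < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
		return 0;	/* no space at all: EPOLLOUT will wake us */
	/* A partial return (0 < sent < len) is now also a safe stopping
	 * point: the kernel sets SOCK_NOSPACE, so an EPOLLOUT follows
	 * once the write queue drains; no need to retry until EAGAIN. */
	return sent;
}

Previously, stopping anywhere other than the EAGAIN branch risked a
missed wakeup under EPOLLET.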
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1004,12 +1004,12 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 		    !tcp_skb_can_collapse_to(skb)) {
 new_segment:
 			if (!sk_stream_memory_free(sk))
-				goto wait_for_sndbuf;
+				goto wait_for_space;
 
 			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
 					tcp_rtx_and_write_queues_empty(sk));
 			if (!skb)
-				goto wait_for_memory;
+				goto wait_for_space;
 
 #ifdef CONFIG_TLS_DEVICE
 			skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
@@ -1028,7 +1028,7 @@ new_segment:
 			goto new_segment;
 		}
 		if (!sk_wmem_schedule(sk, copy))
-			goto wait_for_memory;
+			goto wait_for_space;
 
 		if (can_coalesce) {
 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
@@ -1069,9 +1069,8 @@ new_segment:
 			tcp_push_one(sk, mss_now);
 		continue;
 
-wait_for_sndbuf:
+wait_for_space:
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
 		tcp_push(sk, flags & ~MSG_MORE, mss_now,
 			 TCP_NAGLE_PUSH, size_goal);
 
@@ -1282,7 +1281,7 @@ restart:
 
 new_segment:
 			if (!sk_stream_memory_free(sk))
-				goto wait_for_sndbuf;
+				goto wait_for_space;
 
 			if (unlikely(process_backlog >= 16)) {
 				process_backlog = 0;
@@ -1293,7 +1292,7 @@ new_segment:
 			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
 						  first_skb);
 			if (!skb)
-				goto wait_for_memory;
+				goto wait_for_space;
 
 			process_backlog++;
 			skb->ip_summed = CHECKSUM_PARTIAL;
@@ -1326,7 +1325,7 @@ new_segment:
 			struct page_frag *pfrag = sk_page_frag(sk);
 
 			if (!sk_page_frag_refill(sk, pfrag))
-				goto wait_for_memory;
+				goto wait_for_space;
 
 			if (!skb_can_coalesce(skb, i, pfrag->page,
 					      pfrag->offset)) {
@@ -1340,7 +1339,7 @@ new_segment:
 		copy = min_t(int, copy, pfrag->size - pfrag->offset);
 
 		if (!sk_wmem_schedule(sk, copy))
-			goto wait_for_memory;
+			goto wait_for_space;
 
 		err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
 					       pfrag->page,
@@ -1393,9 +1392,8 @@ new_segment:
 			tcp_push_one(sk, mss_now);
 		continue;
 
-wait_for_sndbuf:
+wait_for_space:
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
 		if (copied)
 			tcp_push(sk, flags & ~MSG_MORE, mss_now,
 				 TCP_NAGLE_PUSH, size_goal);
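The sk_stream_write_space() throttling mentioned in the commit message
can be modeled in a few lines. The following is a rough standalone
paraphrase, an assumption-laden simplification of the kernel's
sk_stream_wspace() and sk_stream_min_wspace() helpers (memory-accounting
details elided), showing why the EPOLLOUT arrives only once a
significant share of the send buffer has drained:

/* Simplified model, NOT kernel source: paraphrases the writeable test
 * behind sk_stream_write_space(). EPOLLOUT is held back until free
 * space in the send buffer is at least half of what is still queued. */
#include <stdbool.h>
#include <stdio.h>

static bool stream_is_writeable(long sndbuf, long wmem_queued)
{
	long wspace = sndbuf - wmem_queued;	/* free space in send buffer */
	long min_wspace = wmem_queued / 2;	/* half of the queued bytes */

	return wspace >= min_wspace;
}

int main(void)
{
	long sndbuf = 4L << 20;	/* assume a 4 MB send buffer */

	printf("%d\n", stream_is_writeable(sndbuf, sndbuf));	/* 0: full */
	printf("%d\n", stream_is_writeable(sndbuf, 3L << 20));	/* 0: 75% queued */
	printf("%d\n", stream_is_writeable(sndbuf, 2L << 20));	/* 1: 50% queued */
	return 0;
}

At the threshold, the queue has drained to roughly two thirds of the
send buffer, so a single wakeup leaves room for one large follow-up
sendmsg rather than the stream of small ones shown above.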