Merge branch 'tcp-policer-drops'

Yuchung Cheng says:

====================
tcp: reducing lost retransmits in recovery

This patch series reduces lost retransmits in recovery, in particular
when dealing with traffic policers. The main problem is that slow
start in recovery under policing can cause massive loss and
retransmit storms: any excess sending rate turns into drops. The
solution is to avoid slow start when a lost retransmit is detected
and to use packet conservation instead, as sketched below.

On networks with traffic policers, the patches have lowered TCP loss
rates from Google servers by ~20% without latency regressions.
====================
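As a rough illustration (not the in-tree code), here is a minimal, self-contained
sketch of the send-count decision that tcp_cwnd_reduction() makes after this
series. The helper name prr_sndcnt and its plain int/bool parameters are
hypothetical simplifications of the kernel's tcp_sock state and flag bits:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-alone model of the PRR send-count decision.
 * delta = ssthresh - packets_in_flight, as in tcp_cwnd_reduction().
 */
static int prr_sndcnt(int in_flight, int ssthresh, int prior_cwnd,
		      int prr_delivered, int prr_out,
		      int newly_acked_sacked,
		      bool retrans_data_acked, bool lost_retrans,
		      bool fast_rexmit)
{
	int delta = ssthresh - in_flight;
	int sndcnt;

	if (delta < 0) {
		/* Still above ssthresh: spread the cwnd reduction over a
		 * full RTT, proportionally to newly delivered packets. */
		sndcnt = (ssthresh * prr_delivered + prior_cwnd - 1) /
			 prior_cwnd - prr_out;
	} else if (retrans_data_acked && !lost_retrans) {
		/* Below ssthresh and retransmits are getting through:
		 * slow start back up toward ssthresh. */
		int ss = prr_delivered - prr_out;

		if (ss < newly_acked_sacked)
			ss = newly_acked_sacked;
		sndcnt = ss + 1;
		if (sndcnt > delta)
			sndcnt = delta;
	} else {
		/* A retransmit was lost (e.g. dropped by a policer):
		 * fall back to packet conservation. */
		sndcnt = delta < newly_acked_sacked ? delta : newly_acked_sacked;
	}
	if (fast_rexmit && sndcnt < 1)
		sndcnt = 1;
	return sndcnt;
}

int main(void)
{
	/* Example: below ssthresh but a retransmit was lost, so only as
	 * many packets as were newly delivered (3) may be sent. */
	printf("sndcnt = %d\n",
	       prr_sndcnt(10, 20, 40, 3, 1, 3, true, true, false));
	return 0;
}

The last branch is the behavior this series adds: once a lost retransmit is
detected (FLAG_LOST_RETRANS), recovery stays in packet-conservation mode
instead of slow starting into the policer.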

Signed-off-by: David S. Miller <davem@davemloft.net>
commit 71c3353775
Merged by David S. Miller on 2015-07-08 13:29:46 -07:00
1 changed file with 23 additions and 20 deletions

@@ -109,6 +109,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
 #define FLAG_DATA_SACKED 0x20 /* New SACK. */
 #define FLAG_ECE 0x40 /* ECE in this ACK */
+#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */
 #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
 #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
 #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
@@ -1037,7 +1038,7 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
  * highest SACK block). Also calculate the lowest snd_nxt among the remaining
  * retransmitted skbs to avoid some costly processing per ACKs.
  */
-static void tcp_mark_lost_retrans(struct sock *sk)
+static void tcp_mark_lost_retrans(struct sock *sk, int *flag)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -1078,7 +1079,7 @@ static void tcp_mark_lost_retrans(struct sock *sk)
 		if (after(received_upto, ack_seq)) {
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
 			tp->retrans_out -= tcp_skb_pcount(skb);
-
+			*flag |= FLAG_LOST_RETRANS;
 			tcp_skb_mark_lost_uncond_verify(tp, skb);
 			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
 		} else {
@@ -1818,7 +1819,7 @@ advance_sp:
 	    ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
 		tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);
 
-	tcp_mark_lost_retrans(sk);
+	tcp_mark_lost_retrans(sk, &state->flag);
 	tcp_verify_left_out(tp);
 
 out:
@ -2475,15 +2476,14 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
return false; return false;
} }
/* The cwnd reduction in CWR and Recovery use the PRR algorithm /* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937.
* https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/
* It computes the number of packets to send (sndcnt) based on packets newly * It computes the number of packets to send (sndcnt) based on packets newly
* delivered: * delivered:
* 1) If the packets in flight is larger than ssthresh, PRR spreads the * 1) If the packets in flight is larger than ssthresh, PRR spreads the
* cwnd reductions across a full RTT. * cwnd reductions across a full RTT.
* 2) If packets in flight is lower than ssthresh (such as due to excess * 2) Otherwise PRR uses packet conservation to send as much as delivered.
* losses and/or application stalls), do not perform any further cwnd * But when the retransmits are acked without further losses, PRR
* reductions, but instead slow start up to ssthresh. * slow starts cwnd up to ssthresh to speed up the recovery.
*/ */
static void tcp_init_cwnd_reduction(struct sock *sk) static void tcp_init_cwnd_reduction(struct sock *sk)
{ {
@@ -2500,7 +2500,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
 }
 
 static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
-			       int fast_rexmit)
+			       int fast_rexmit, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int sndcnt = 0;
@@ -2509,16 +2509,18 @@ static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
 				 (tp->packets_out - tp->sacked_out);
 
 	tp->prr_delivered += newly_acked_sacked;
-	if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
+	if (delta < 0) {
 		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
 			       tp->prior_cwnd - 1;
 		sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
-	} else {
+	} else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
+		   !(flag & FLAG_LOST_RETRANS)) {
 		sndcnt = min_t(int, delta,
 			       max_t(int, tp->prr_delivered - tp->prr_out,
 				     newly_acked_sacked) + 1);
+	} else {
+		sndcnt = min(delta, newly_acked_sacked);
 	}
-
 	sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
 	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
 }
@@ -2579,7 +2581,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
 	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
 		tcp_try_keep_open(sk);
 	} else {
-		tcp_cwnd_reduction(sk, prior_unsacked, 0);
+		tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
 	}
 }
 
@@ -2676,7 +2678,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 	tp->prior_ssthresh = 0;
 	tcp_init_undo(tp);
 
-	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+	if (!tcp_in_cwnd_reduction(sk)) {
 		if (!ece_ack)
 			tp->prior_ssthresh = tcp_current_ssthresh(sk);
 		tcp_init_cwnd_reduction(sk);
@@ -2736,7 +2738,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
 
 /* Undo during fast recovery after partial ACK. */
 static bool tcp_try_undo_partial(struct sock *sk, const int acked,
-				 const int prior_unsacked)
+				 const int prior_unsacked, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2752,7 +2754,7 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked,
 		 * mark more packets lost or retransmit more.
 		 */
 		if (tp->retrans_out) {
-			tcp_cwnd_reduction(sk, prior_unsacked, 0);
+			tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
 			return true;
 		}
 
@@ -2839,7 +2841,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 			if (tcp_is_reno(tp) && is_dupack)
 				tcp_add_reno_sack(sk);
 		} else {
-			if (tcp_try_undo_partial(sk, acked, prior_unsacked))
+			if (tcp_try_undo_partial(sk, acked, prior_unsacked, flag))
 				return;
 			/* Partial ACK arrived. Force fast retransmit. */
 			do_lost = tcp_is_reno(tp) ||
@@ -2852,9 +2854,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 		break;
 	case TCP_CA_Loss:
 		tcp_process_loss(sk, flag, is_dupack);
-		if (icsk->icsk_ca_state != TCP_CA_Open)
+		if (icsk->icsk_ca_state != TCP_CA_Open &&
+		    !(flag & FLAG_LOST_RETRANS))
 			return;
-		/* Fall through to processing in Open state. */
+		/* Change state if cwnd is undone or retransmits are lost */
 	default:
 		if (tcp_is_reno(tp)) {
 			if (flag & FLAG_SND_UNA_ADVANCED)
@@ -2889,7 +2892,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 
 	if (do_lost)
 		tcp_update_scoreboard(sk, fast_rexmit);
-	tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit);
+	tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit, flag);
 	tcp_xmit_retransmit_queue(sk);
 }
 