tcp: remove early retransmit
This patch removes support for RFC 5827 early retransmit (i.e., fast recovery on small inflight with fewer than 3 dupacks), because it is subsumed by the new RACK loss detection. More specifically, when RACK receives DUPACKs it arms a reordering timer to start fast recovery after a quarter of the (min)RTT, so it covers early retransmit, except that RACK does not limit itself to a specific inflight or dupack count.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
commit bec41a11dd
parent 840a3cbe89
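For context, the quarter-RTT pause mentioned above is essentially the same delay that the removed tcp_pause_early_retransmit() applied, max(RTT/4, 2 msec): tp->srtt_us is stored left-shifted by 3, so the removed "srtt_us >> 5" is RTT/4. The difference is that RACK arms the delay from its reordering timer and does not gate it on inflight or dupack counts. The C sketch below is illustrative only, not kernel code; the helper names, the 2 ms floor, and the example RTT values are assumptions carried over from the removed function, and RACK's window is based on the minimum RTT rather than the smoothed RTT.

/* Illustrative sketch only (not kernel code): compares the pause the removed
 * early-retransmit path applied with the reordering delay RACK uses.  All
 * values are in microseconds.  The 2 ms floor mirrors the removed
 * tcp_pause_early_retransmit(); helper names and example RTTs are made up.
 */
#include <stdio.h>

static unsigned long old_er_pause_us(unsigned long srtt_us)
{
        /* Removed behaviour: delay fast recovery by max(RTT/4, 2 msec). */
        unsigned long quarter_rtt = srtt_us / 4;

        return quarter_rtt > 2000 ? quarter_rtt : 2000;
}

static unsigned long rack_reo_wnd_us(unsigned long min_rtt_us)
{
        /* RACK arms its reordering timer for roughly a quarter of the
         * minimum RTT, with no inflight or dupack-count restrictions.
         */
        return min_rtt_us / 4;
}

int main(void)
{
        unsigned long srtt_us = 40000;      /* example smoothed RTT: 40 ms */
        unsigned long min_rtt_us = 32000;   /* example minimum RTT: 32 ms */

        printf("old early-retransmit pause: %lu us\n", old_er_pause_us(srtt_us));
        printf("RACK reordering window:     %lu us\n", rack_reo_wnd_us(min_rtt_us));
        return 0;
}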
@@ -246,21 +246,12 @@ tcp_dsack - BOOLEAN
         Allows TCP to send "duplicate" SACKs.
 
 tcp_early_retrans - INTEGER
-        Enable Early Retransmit (ER), per RFC 5827. ER lowers the threshold
-        for triggering fast retransmit when the amount of outstanding data is
-        small and when no previously unsent data can be transmitted (such
-        that limited transmit could be used). Also controls the use of
-        Tail loss probe (TLP) that converts RTOs occurring due to tail
-        losses into fast recovery (draft-dukkipati-tcpm-tcp-loss-probe-01).
+        Tail loss probe (TLP) converts RTOs occurring due to tail
+        losses into fast recovery (draft-ietf-tcpm-rack). Note that
+        TLP requires RACK to function properly (see tcp_recovery below)
         Possible values:
-                0 disables ER
-                1 enables ER
-                2 enables ER but delays fast recovery and fast retransmit
-                  by a fourth of RTT. This mitigates connection falsely
-                  recovers when network has a small degree of reordering
-                  (less than 3 packets).
-                3 enables delayed ER and TLP.
-                4 enables TLP only.
+                0 disables TLP
+                3 or 4 enables TLP
         Default: 3
 
 tcp_ecn - INTEGER
@@ -224,8 +224,7 @@ struct tcp_sock {
                 repair      : 1,
                 frto        : 1;        /* F-RTO (RFC5682) activated in CA_Loss */
         u8      repair_queue;
-        u8      do_early_retrans:1,    /* Enable RFC5827 early-retransmit */
-                syn_data:1,             /* SYN includes data */
+        u8      syn_data:1,             /* SYN includes data */
                 syn_fastopen:1,         /* SYN includes Fast Open option */
                 syn_fastopen_exp:1,     /* SYN includes Fast Open exp. option */
                 syn_data_acked:1,       /* data in SYN is acked by SYN-ACK */
@@ -565,7 +565,6 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
                              const struct sk_buff *next_skb);
 
 /* tcp_input.c */
-void tcp_resume_early_retransmit(struct sock *sk);
 void tcp_rearm_rto(struct sock *sk);
 void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
 void tcp_reset(struct sock *sk);
@@ -1037,24 +1036,6 @@ static inline void tcp_enable_fack(struct tcp_sock *tp)
         tp->rx_opt.sack_ok |= TCP_FACK_ENABLED;
 }
 
-/* TCP early-retransmit (ER) is similar to but more conservative than
- * the thin-dupack feature. Enable ER only if thin-dupack is disabled.
- */
-static inline void tcp_enable_early_retrans(struct tcp_sock *tp)
-{
-        struct net *net = sock_net((struct sock *)tp);
-
-        tp->do_early_retrans = sysctl_tcp_early_retrans &&
-                sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack &&
-                !(sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) &&
-                net->ipv4.sysctl_tcp_reordering == 3;
-}
-
-static inline void tcp_disable_early_retrans(struct tcp_sock *tp)
-{
-        tp->do_early_retrans = 0;
-}
-
 static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
 {
         return tp->sacked_out + tp->lost_out;
@@ -215,7 +215,6 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
         }
 
         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
-            icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
                 r->idiag_timer = 1;
@@ -406,7 +406,6 @@ void tcp_init_sock(struct sock *sk)
         tp->mss_cache = TCP_MSS_DEFAULT;
 
         tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
-        tcp_enable_early_retrans(tp);
         tcp_assign_congestion_control(sk);
 
         tp->tsoffset = 0;
@@ -2477,8 +2476,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
                         err = -EINVAL;
                 else {
                         tp->thin_dupack = val;
-                        if (tp->thin_dupack)
-                                tcp_disable_early_retrans(tp);
                 }
                 break;
 
@@ -904,8 +904,6 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
                 tcp_disable_fack(tp);
         }
 
-        if (metric > 0)
-                tcp_disable_early_retrans(tp);
         tp->rack.reord = 1;
 }
 
@@ -2054,30 +2052,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
         return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
 }
 
-static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
-{
-        struct tcp_sock *tp = tcp_sk(sk);
-        unsigned long delay;
-
-        /* Delay early retransmit and entering fast recovery for
-         * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
-         * available, or RTO is scheduled to fire first.
-         */
-        if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
-            (flag & FLAG_ECE) || !tp->srtt_us)
-                return false;
-
-        delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
-                    msecs_to_jiffies(2));
-
-        if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
-                return false;
-
-        inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
-                                  TCP_RTO_MAX);
-        return true;
-}
-
 /* Linux NewReno/SACK/FACK/ECN state machine.
  * --------------------------------------
  *
@@ -2221,16 +2195,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
             tcp_is_sack(tp) && !tcp_send_head(sk))
                 return true;
 
-        /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious
-         * retransmissions due to small network reorderings, we implement
-         * Mitigation A.3 in the RFC and delay the retransmission for a short
-         * interval if appropriate.
-         */
-        if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
-            (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
-            !tcp_may_send_now(sk))
-                return !tcp_pause_early_retransmit(sk, flag);
-
         return false;
 }
 
@@ -3050,8 +3014,7 @@ void tcp_rearm_rto(struct sock *sk)
         } else {
                 u32 rto = inet_csk(sk)->icsk_rto;
                 /* Offset the time elapsed after installing regular RTO */
-                if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
-                    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
+                if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
                     icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
                         struct sk_buff *skb = tcp_write_queue_head(sk);
                         const u32 rto_time_stamp =
@@ -3068,24 +3031,6 @@ void tcp_rearm_rto(struct sock *sk)
         }
 }
 
-/* This function is called when the delayed ER timer fires. TCP enters
- * fast recovery and performs fast-retransmit.
- */
-void tcp_resume_early_retransmit(struct sock *sk)
-{
-        struct tcp_sock *tp = tcp_sk(sk);
-
-        tcp_rearm_rto(sk);
-
-        /* Stop if ER is disabled after the delayed ER timer is scheduled */
-        if (!tp->do_early_retrans)
-                return;
-
-        tcp_enter_recovery(sk, false);
-        tcp_update_scoreboard(sk, 1);
-        tcp_xmit_retransmit_queue(sk);
-}
-
 /* If we get here, the whole TSO packet has not been acked. */
 static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
 {
@@ -3651,8 +3596,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
         skb_mstamp_get(&sack_state.ack_time);
 
-        if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
-            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
+        if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
                 tcp_rearm_rto(sk);
 
         if (after(ack, prior_snd_una)) {
@@ -2229,7 +2229,6 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
         int state;
 
         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
-            icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
                 timer_active = 1;
@@ -522,7 +522,6 @@ void tcp_init_metrics(struct sock *sk)
         val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
         if (val && tp->reordering != val) {
                 tcp_disable_fack(tp);
-                tcp_disable_early_retrans(tp);
                 tp->reordering = val;
         }
 
@@ -468,7 +468,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
         newtp->sacked_out = 0;
         newtp->fackets_out = 0;
         newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
-        tcp_enable_early_retrans(newtp);
         newtp->tlp_high_seq = 0;
         newtp->lsndtime = treq->snt_synack.stamp_jiffies;
         newsk->sk_txhash = treq->txhash;
@@ -76,10 +76,8 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
         tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
 
         tp->packets_out += tcp_skb_pcount(skb);
-        if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
-            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+        if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
                 tcp_rearm_rto(sk);
-        }
 
         NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
                       tcp_skb_pcount(skb));
@@ -2289,8 +2287,6 @@ bool tcp_schedule_loss_probe(struct sock *sk)
         u32 timeout, tlp_time_stamp, rto_time_stamp;
         u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
 
-        if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
-                return false;
         /* No consecutive loss probes. */
         if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
                 tcp_rearm_rto(sk);
@@ -2309,8 +2305,9 @@ bool tcp_schedule_loss_probe(struct sock *sk)
         /* Schedule a loss probe in 2*RTT for SACK capable connections
          * in Open state, that are either limited by cwnd or application.
          */
-        if (sysctl_tcp_early_retrans < 3 || !tp->packets_out ||
-            !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
+        if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) ||
+            !tp->packets_out || !tcp_is_sack(tp) ||
+            icsk->icsk_ca_state != TCP_CA_Open)
                 return false;
 
         if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
@@ -566,9 +566,6 @@ void tcp_write_timer_handler(struct sock *sk)
         case ICSK_TIME_REO_TIMEOUT:
                 tcp_rack_reo_timeout(sk);
                 break;
-        case ICSK_TIME_EARLY_RETRANS:
-                tcp_resume_early_retransmit(sk);
-                break;
         case ICSK_TIME_LOSS_PROBE:
                 tcp_send_loss_probe(sk);
                 break;
@@ -1745,7 +1745,6 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
         srcp = ntohs(inet->inet_sport);
 
         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
-            icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
                 timer_active = 1;