tcp: early retransmit

This patch implements RFC 5827 early retransmit (ER) for TCP.
It reduces DUPACK threshold (dupthresh) if outstanding packets are
less than 4 to recover losses by fast recovery instead of timeout.

While the algorithm is simple, small but frequent network reordering
makes this feature dangerous: the connection repeatedly enter
false recovery and degrade performance. Therefore we implement
a mitigation suggested in the appendix of the RFC that delays
entering fast recovery by a small interval, i.e., RTT/4. Currently
ER is conservative and is disabled for the rest of the connection
after the first reordering event. A large scale web server
experiment on the performance impact of ER is summarized in
section 6 of the paper "Proportional Rate Reduction for TCP”,
IMC 2011. http://conferences.sigcomm.org/imc/2011/docs/p155.pdf

Note that Linux has a similar feature called THIN_DUPACK. The
differences are THIN_DUPACK do not mitigate reorderings and is only
used after slow start. Currently ER is disabled if THIN_DUPACK is
enabled. I would be happy to merge THIN_DUPACK feature with ER if
people think it's a good idea.

ER is enabled by sysctl_tcp_early_retrans:
  0: Disables ER

  1: Reduce dupthresh to packets_out - 1 when outstanding packets < 4.

  2: (Default) reduce dupthresh like mode 1. In addition, delay
     entering fast recovery by RTT/4.

Note: mode 2 is implemented in the third part of this patch series.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Yuchung Cheng 2012-05-02 13:30:03 +00:00 committed by David S. Miller
parent 1fbc340514
commit eed530b6c6
7 changed files with 59 additions and 0 deletions

View File

@ -190,6 +190,20 @@ tcp_cookie_size - INTEGER
tcp_dsack - BOOLEAN tcp_dsack - BOOLEAN
Allows TCP to send "duplicate" SACKs. Allows TCP to send "duplicate" SACKs.
tcp_early_retrans - INTEGER
Enable Early Retransmit (ER), per RFC 5827. ER lowers the threshold
for triggering fast retransmit when the amount of outstanding data is
small and when no previously unsent data can be transmitted (such
that limited transmit could be used).
Possible values:
0 disables ER
1 enables ER
2 enables ER but delays fast recovery and fast retransmit
by a fourth of RTT. This mitigates connection falsely
recovers when network has a small degree of reordering
(less than 3 packets).
Default: 2
tcp_ecn - INTEGER tcp_ecn - INTEGER
Enable Explicit Congestion Notification (ECN) in TCP. ECN is only Enable Explicit Congestion Notification (ECN) in TCP. ECN is only
used when both ends of the TCP flow support it. It is useful to used when both ends of the TCP flow support it. It is useful to

View File

@ -372,6 +372,7 @@ struct tcp_sock {
repair : 1, repair : 1,
unused : 1; unused : 1;
u8 repair_queue; u8 repair_queue;
u8 do_early_retrans:1;/* Enable RFC5827 early-retransmit */
/* RTT measurement */ /* RTT measurement */
u32 srtt; /* smoothed round trip time << 3 */ u32 srtt; /* smoothed round trip time << 3 */

View File

@ -252,6 +252,7 @@ extern int sysctl_tcp_max_ssthresh;
extern int sysctl_tcp_cookie_size; extern int sysctl_tcp_cookie_size;
extern int sysctl_tcp_thin_linear_timeouts; extern int sysctl_tcp_thin_linear_timeouts;
extern int sysctl_tcp_thin_dupack; extern int sysctl_tcp_thin_dupack;
extern int sysctl_tcp_early_retrans;
extern atomic_long_t tcp_memory_allocated; extern atomic_long_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated; extern struct percpu_counter tcp_sockets_allocated;
@ -797,6 +798,20 @@ static inline void tcp_enable_fack(struct tcp_sock *tp)
tp->rx_opt.sack_ok |= TCP_FACK_ENABLED; tp->rx_opt.sack_ok |= TCP_FACK_ENABLED;
} }
/* TCP early-retransmit (ER) is similar to but more conservative than
* the thin-dupack feature. Enable ER only if thin-dupack is disabled.
*/
static inline void tcp_enable_early_retrans(struct tcp_sock *tp)
{
tp->do_early_retrans = sysctl_tcp_early_retrans &&
!sysctl_tcp_thin_dupack && sysctl_tcp_reordering == 3;
}
static inline void tcp_disable_early_retrans(struct tcp_sock *tp)
{
tp->do_early_retrans = 0;
}
static inline unsigned int tcp_left_out(const struct tcp_sock *tp) static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
{ {
return tp->sacked_out + tp->lost_out; return tp->sacked_out + tp->lost_out;

View File

@ -27,6 +27,7 @@
#include <net/tcp_memcontrol.h> #include <net/tcp_memcontrol.h>
static int zero; static int zero;
static int two = 2;
static int tcp_retr1_max = 255; static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 }; static int ip_local_port_range_max[] = { 65535, 65535 };
@ -676,6 +677,15 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec .proc_handler = proc_dointvec
}, },
{
.procname = "tcp_early_retrans",
.data = &sysctl_tcp_early_retrans,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &two,
},
{ {
.procname = "udp_mem", .procname = "udp_mem",
.data = &sysctl_udp_mem, .data = &sysctl_udp_mem,

View File

@ -395,6 +395,7 @@ void tcp_init_sock(struct sock *sk)
tp->mss_cache = TCP_MSS_DEFAULT; tp->mss_cache = TCP_MSS_DEFAULT;
tp->reordering = sysctl_tcp_reordering; tp->reordering = sysctl_tcp_reordering;
tcp_enable_early_retrans(tp);
icsk->icsk_ca_ops = &tcp_init_congestion_ops; icsk->icsk_ca_ops = &tcp_init_congestion_ops;
sk->sk_state = TCP_CLOSE; sk->sk_state = TCP_CLOSE;
@ -2495,6 +2496,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
err = -EINVAL; err = -EINVAL;
else else
tp->thin_dupack = val; tp->thin_dupack = val;
if (tp->thin_dupack)
tcp_disable_early_retrans(tp);
break; break;
case TCP_REPAIR: case TCP_REPAIR:

View File

@ -99,6 +99,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_abc __read_mostly; int sysctl_tcp_abc __read_mostly;
int sysctl_tcp_early_retrans __read_mostly = 2;
#define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@ -906,6 +907,7 @@ static void tcp_init_metrics(struct sock *sk)
if (dst_metric(dst, RTAX_REORDERING) && if (dst_metric(dst, RTAX_REORDERING) &&
tp->reordering != dst_metric(dst, RTAX_REORDERING)) { tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
tcp_disable_fack(tp); tcp_disable_fack(tp);
tcp_disable_early_retrans(tp);
tp->reordering = dst_metric(dst, RTAX_REORDERING); tp->reordering = dst_metric(dst, RTAX_REORDERING);
} }
@ -988,6 +990,9 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
#endif #endif
tcp_disable_fack(tp); tcp_disable_fack(tp);
} }
if (metric > 0)
tcp_disable_early_retrans(tp);
} }
/* This must be called before lost_out is incremented */ /* This must be called before lost_out is incremented */
@ -2492,6 +2497,16 @@ static int tcp_time_to_recover(struct sock *sk)
tcp_is_sack(tp) && !tcp_send_head(sk)) tcp_is_sack(tp) && !tcp_send_head(sk))
return 1; return 1;
/* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious
* retransmissions due to small network reorderings, we implement
* Mitigation A.3 in the RFC and delay the retransmission for a short
* interval if appropriate.
*/
if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
(tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) &&
!tcp_may_send_now(sk))
return 1;
return 0; return 0;
} }

View File

@ -482,6 +482,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->sacked_out = 0; newtp->sacked_out = 0;
newtp->fackets_out = 0; newtp->fackets_out = 0;
newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tcp_enable_early_retrans(newtp);
/* So many TCP implementations out there (incorrectly) count the /* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control * initial SYN frame in their delayed-ACK and congestion control