tcp: fix segment accounting when DSACK range covers multiple segments
Currently, while processing DSACK, we assume DSACK covers only one segment. This leads to significant underestimation of DSACKs with LRO/GRO. This patch fixes segment accounting with DSACK by estimating segment count from DSACK sequence range / MSS.

Signed-off-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Yousuk Seung <ysseung@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
dcc82bb072
commit
a71d77e6be
|
@ -871,12 +871,41 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
|
||||||
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
|
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct tcp_sacktag_state {
|
||||||
|
/* Timestamps for earliest and latest never-retransmitted segment
|
||||||
|
* that was SACKed. RTO needs the earliest RTT to stay conservative,
|
||||||
|
* but congestion control should still get an accurate delay signal.
|
||||||
|
*/
|
||||||
|
u64 first_sackt;
|
||||||
|
u64 last_sackt;
|
||||||
|
u32 reord;
|
||||||
|
u32 sack_delivered;
|
||||||
|
int flag;
|
||||||
|
unsigned int mss_now;
|
||||||
|
struct rate_sample *rate;
|
||||||
|
};
|
||||||
|
|
||||||
/* Take a notice that peer is sending D-SACKs */
|
/* Take a notice that peer is sending D-SACKs */
|
||||||
static void tcp_dsack_seen(struct tcp_sock *tp)
|
static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq,
|
||||||
|
u32 end_seq, struct tcp_sacktag_state *state)
|
||||||
{
|
{
|
||||||
|
u32 seq_len, dup_segs = 1;
|
||||||
|
|
||||||
|
if (before(start_seq, end_seq)) {
|
||||||
|
seq_len = end_seq - start_seq;
|
||||||
|
if (seq_len > tp->mss_cache)
|
||||||
|
dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache);
|
||||||
|
}
|
||||||
|
|
||||||
tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
|
tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
|
||||||
tp->rack.dsack_seen = 1;
|
tp->rack.dsack_seen = 1;
|
||||||
tp->dsack_dups++;
|
tp->dsack_dups += dup_segs;
|
||||||
|
|
||||||
|
state->flag |= FLAG_DSACKING_ACK;
|
||||||
|
/* A spurious retransmission is delivered */
|
||||||
|
state->sack_delivered += dup_segs;
|
||||||
|
|
||||||
|
return dup_segs;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* It's reordering when higher sequence was delivered (i.e. sacked) before
|
/* It's reordering when higher sequence was delivered (i.e. sacked) before
|
||||||
|
@ -1103,53 +1132,37 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
|
||||||
|
|
||||||
static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
|
static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
|
||||||
struct tcp_sack_block_wire *sp, int num_sacks,
|
struct tcp_sack_block_wire *sp, int num_sacks,
|
||||||
u32 prior_snd_una)
|
u32 prior_snd_una, struct tcp_sacktag_state *state)
|
||||||
{
|
{
|
||||||
struct tcp_sock *tp = tcp_sk(sk);
|
struct tcp_sock *tp = tcp_sk(sk);
|
||||||
u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
|
u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
|
||||||
u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
|
u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
|
||||||
bool dup_sack = false;
|
u32 dup_segs;
|
||||||
|
|
||||||
if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
|
if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
|
||||||
dup_sack = true;
|
|
||||||
tcp_dsack_seen(tp);
|
|
||||||
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
|
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
|
||||||
} else if (num_sacks > 1) {
|
} else if (num_sacks > 1) {
|
||||||
u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
|
u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
|
||||||
u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
|
u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
|
||||||
|
|
||||||
if (!after(end_seq_0, end_seq_1) &&
|
if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1))
|
||||||
!before(start_seq_0, start_seq_1)) {
|
return false;
|
||||||
dup_sack = true;
|
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV);
|
||||||
tcp_dsack_seen(tp);
|
} else {
|
||||||
NET_INC_STATS(sock_net(sk),
|
return false;
|
||||||
LINUX_MIB_TCPDSACKOFORECV);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state);
|
||||||
|
|
||||||
/* D-SACK for already forgotten data... Do dumb counting. */
|
/* D-SACK for already forgotten data... Do dumb counting. */
|
||||||
if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
|
if (tp->undo_marker && tp->undo_retrans > 0 &&
|
||||||
!after(end_seq_0, prior_snd_una) &&
|
!after(end_seq_0, prior_snd_una) &&
|
||||||
after(end_seq_0, tp->undo_marker))
|
after(end_seq_0, tp->undo_marker))
|
||||||
tp->undo_retrans--;
|
tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs);
|
||||||
|
|
||||||
return dup_sack;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct tcp_sacktag_state {
|
|
||||||
u32 reord;
|
|
||||||
/* Timestamps for earliest and latest never-retransmitted segment
|
|
||||||
* that was SACKed. RTO needs the earliest RTT to stay conservative,
|
|
||||||
* but congestion control should still get an accurate delay signal.
|
|
||||||
*/
|
|
||||||
u64 first_sackt;
|
|
||||||
u64 last_sackt;
|
|
||||||
struct rate_sample *rate;
|
|
||||||
int flag;
|
|
||||||
unsigned int mss_now;
|
|
||||||
u32 sack_delivered;
|
|
||||||
};
|
|
||||||
|
|
||||||
/* Check if skb is fully within the SACK block. In presence of GSO skbs,
|
/* Check if skb is fully within the SACK block. In presence of GSO skbs,
|
||||||
* the incoming SACK may not exactly match but we can find smaller MSS
|
* the incoming SACK may not exactly match but we can find smaller MSS
|
||||||
* aligned portion of it that matches. Therefore we might need to fragment
|
* aligned portion of it that matches. Therefore we might need to fragment
|
||||||
|
@ -1692,12 +1705,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
|
||||||
tcp_highest_sack_reset(sk);
|
tcp_highest_sack_reset(sk);
|
||||||
|
|
||||||
found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
|
found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
|
||||||
num_sacks, prior_snd_una);
|
num_sacks, prior_snd_una, state);
|
||||||
if (found_dup_sack) {
|
|
||||||
state->flag |= FLAG_DSACKING_ACK;
|
|
||||||
/* A spurious retransmission is delivered */
|
|
||||||
state->sack_delivered++;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Eliminate too old ACKs, but take into
|
/* Eliminate too old ACKs, but take into
|
||||||
* account more or less fresh ones, they can
|
* account more or less fresh ones, they can
|
||||||
|
|
Loading…
Reference in New Issue