inet: frags: re-introduce skb coalescing for local delivery
Before commit d4289fcc9b
("net: IP6 defrag: use rbtrees for IPv6 defrag"), a netperf UDP_STREAM test[0] using big IPv6 datagrams (thus generating many fragments) and running over an IPsec tunnel, reported more than 6Gbps throughput. After that patch, the same test gets only 9Mbps when receiving on a be2net nic (the driver can make a big difference here; for example, ixgbe doesn't seem to be affected). By reusing the IPv4 defragmentation code, IPv6 lost fragment coalescing (IPv4 fragment coalescing was dropped by commit 14fe22e334
("Revert "ipv4: use skb coalescing in defragmentation"")). Without fragment coalescing, be2net runs out of Rx ring entries and starts to drop frames (ethtool reports rx_drops_no_frags errors). Since the netperf traffic is only composed of UDP fragments, any lost packet prevents reassembly of the full datagram. Therefore, fragments which have no possibility to ever get reassembled pile up in the reassembly queue, until the memory accounting exceeds the threshold. At that point no fragment is accepted anymore, which effectively discards all netperf traffic. When the reassembly timeout expires, some stale fragments are removed from the reassembly queue, so a few packets can be received, reassembled and delivered to the netperf receiver. But the nic still drops frames and soon the reassembly queue gets filled again with stale fragments. These long time frames where no datagram can be received explain why the performance drop is so significant. Re-introducing fragment coalescing is enough to get the initial performance again (6.6Gbps with be2net): the driver doesn't drop frames anymore (no more rx_drops_no_frags errors) and the reassembly engine works at full speed. This patch is quite conservative and only coalesces skbs for local IPv4 and IPv6 delivery (in order to avoid changing skb geometry when forwarding). Coalescing could be extended in the future if need be, as more scenarios would probably benefit from it.
[0]: Test configuration

Sender:
ip xfrm policy flush
ip xfrm state flush
ip xfrm state add src fc00:1::1 dst fc00:2::1 proto esp spi 0x1000 aead 'rfc4106(gcm(aes))' 0x0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b 96 mode transport sel src fc00:1::1 dst fc00:2::1
ip xfrm policy add src fc00:1::1 dst fc00:2::1 dir in tmpl src fc00:1::1 dst fc00:2::1 proto esp mode transport action allow
ip xfrm state add src fc00:2::1 dst fc00:1::1 proto esp spi 0x1001 aead 'rfc4106(gcm(aes))' 0x0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b 96 mode transport sel src fc00:2::1 dst fc00:1::1
ip xfrm policy add src fc00:2::1 dst fc00:1::1 dir out tmpl src fc00:2::1 dst fc00:1::1 proto esp mode transport action allow
netserver -D -L fc00:2::1

Receiver:
ip xfrm policy flush
ip xfrm state flush
ip xfrm state add src fc00:2::1 dst fc00:1::1 proto esp spi 0x1001 aead 'rfc4106(gcm(aes))' 0x0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b 96 mode transport sel src fc00:2::1 dst fc00:1::1
ip xfrm policy add src fc00:2::1 dst fc00:1::1 dir in tmpl src fc00:2::1 dst fc00:1::1 proto esp mode transport action allow
ip xfrm state add src fc00:1::1 dst fc00:2::1 proto esp spi 0x1000 aead 'rfc4106(gcm(aes))' 0x0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b 96 mode transport sel src fc00:1::1 dst fc00:2::1
ip xfrm policy add src fc00:1::1 dst fc00:2::1 dir out tmpl src fc00:1::1 dst fc00:2::1 proto esp mode transport action allow
netperf -H fc00:2::1 -f k -P 0 -L fc00:1::1 -l 60 -t UDP_STREAM -I 99,5 -i 5,5 -T5,5 -6

Signed-off-by: Guillaume Nault <gnault@redhat.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
f6649feb26
commit
891584f48a
|
@ -171,7 +171,7 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
|
||||||
void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
|
void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
|
||||||
struct sk_buff *parent);
|
struct sk_buff *parent);
|
||||||
void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
|
void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
|
||||||
void *reasm_data);
|
void *reasm_data, bool try_coalesce);
|
||||||
struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q);
|
struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -170,7 +170,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb,
|
||||||
reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail);
|
reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail);
|
||||||
if (!reasm_data)
|
if (!reasm_data)
|
||||||
goto out_oom;
|
goto out_oom;
|
||||||
inet_frag_reasm_finish(&fq->q, skb, reasm_data);
|
inet_frag_reasm_finish(&fq->q, skb, reasm_data, false);
|
||||||
|
|
||||||
skb->dev = ldev;
|
skb->dev = ldev;
|
||||||
skb->tstamp = fq->q.stamp;
|
skb->tstamp = fq->q.stamp;
|
||||||
|
|
|
@ -475,11 +475,12 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
|
||||||
EXPORT_SYMBOL(inet_frag_reasm_prepare);
|
EXPORT_SYMBOL(inet_frag_reasm_prepare);
|
||||||
|
|
||||||
void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
|
void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
|
||||||
void *reasm_data)
|
void *reasm_data, bool try_coalesce)
|
||||||
{
|
{
|
||||||
struct sk_buff **nextp = (struct sk_buff **)reasm_data;
|
struct sk_buff **nextp = (struct sk_buff **)reasm_data;
|
||||||
struct rb_node *rbn;
|
struct rb_node *rbn;
|
||||||
struct sk_buff *fp;
|
struct sk_buff *fp;
|
||||||
|
int sum_truesize;
|
||||||
|
|
||||||
skb_push(head, head->data - skb_network_header(head));
|
skb_push(head, head->data - skb_network_header(head));
|
||||||
|
|
||||||
|
@ -487,25 +488,41 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
|
||||||
fp = FRAG_CB(head)->next_frag;
|
fp = FRAG_CB(head)->next_frag;
|
||||||
rbn = rb_next(&head->rbnode);
|
rbn = rb_next(&head->rbnode);
|
||||||
rb_erase(&head->rbnode, &q->rb_fragments);
|
rb_erase(&head->rbnode, &q->rb_fragments);
|
||||||
|
|
||||||
|
sum_truesize = head->truesize;
|
||||||
while (rbn || fp) {
|
while (rbn || fp) {
|
||||||
/* fp points to the next sk_buff in the current run;
|
/* fp points to the next sk_buff in the current run;
|
||||||
* rbn points to the next run.
|
* rbn points to the next run.
|
||||||
*/
|
*/
|
||||||
/* Go through the current run. */
|
/* Go through the current run. */
|
||||||
while (fp) {
|
while (fp) {
|
||||||
*nextp = fp;
|
struct sk_buff *next_frag = FRAG_CB(fp)->next_frag;
|
||||||
nextp = &fp->next;
|
bool stolen;
|
||||||
fp->prev = NULL;
|
int delta;
|
||||||
memset(&fp->rbnode, 0, sizeof(fp->rbnode));
|
|
||||||
fp->sk = NULL;
|
sum_truesize += fp->truesize;
|
||||||
head->data_len += fp->len;
|
|
||||||
head->len += fp->len;
|
|
||||||
if (head->ip_summed != fp->ip_summed)
|
if (head->ip_summed != fp->ip_summed)
|
||||||
head->ip_summed = CHECKSUM_NONE;
|
head->ip_summed = CHECKSUM_NONE;
|
||||||
else if (head->ip_summed == CHECKSUM_COMPLETE)
|
else if (head->ip_summed == CHECKSUM_COMPLETE)
|
||||||
head->csum = csum_add(head->csum, fp->csum);
|
head->csum = csum_add(head->csum, fp->csum);
|
||||||
head->truesize += fp->truesize;
|
|
||||||
fp = FRAG_CB(fp)->next_frag;
|
if (try_coalesce && skb_try_coalesce(head, fp, &stolen,
|
||||||
|
&delta)) {
|
||||||
|
kfree_skb_partial(fp, stolen);
|
||||||
|
} else {
|
||||||
|
fp->prev = NULL;
|
||||||
|
memset(&fp->rbnode, 0, sizeof(fp->rbnode));
|
||||||
|
fp->sk = NULL;
|
||||||
|
|
||||||
|
head->data_len += fp->len;
|
||||||
|
head->len += fp->len;
|
||||||
|
head->truesize += fp->truesize;
|
||||||
|
|
||||||
|
*nextp = fp;
|
||||||
|
nextp = &fp->next;
|
||||||
|
}
|
||||||
|
|
||||||
|
fp = next_frag;
|
||||||
}
|
}
|
||||||
/* Move to the next run. */
|
/* Move to the next run. */
|
||||||
if (rbn) {
|
if (rbn) {
|
||||||
|
@ -516,7 +533,7 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
|
||||||
rbn = rbnext;
|
rbn = rbnext;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
sub_frag_mem_limit(q->fqdir, head->truesize);
|
sub_frag_mem_limit(q->fqdir, sum_truesize);
|
||||||
|
|
||||||
*nextp = NULL;
|
*nextp = NULL;
|
||||||
skb_mark_not_on_list(head);
|
skb_mark_not_on_list(head);
|
||||||
|
|
|
@ -393,6 +393,11 @@ err:
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool ip_frag_coalesce_ok(const struct ipq *qp)
|
||||||
|
{
|
||||||
|
return qp->q.key.v4.user == IP_DEFRAG_LOCAL_DELIVER;
|
||||||
|
}
|
||||||
|
|
||||||
/* Build a new IP datagram from all its fragments. */
|
/* Build a new IP datagram from all its fragments. */
|
||||||
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
|
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
|
||||||
struct sk_buff *prev_tail, struct net_device *dev)
|
struct sk_buff *prev_tail, struct net_device *dev)
|
||||||
|
@ -421,7 +426,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
|
||||||
if (len > 65535)
|
if (len > 65535)
|
||||||
goto out_oversize;
|
goto out_oversize;
|
||||||
|
|
||||||
inet_frag_reasm_finish(&qp->q, skb, reasm_data);
|
inet_frag_reasm_finish(&qp->q, skb, reasm_data,
|
||||||
|
ip_frag_coalesce_ok(qp));
|
||||||
|
|
||||||
skb->dev = dev;
|
skb->dev = dev;
|
||||||
IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
|
IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
|
||||||
|
|
|
@ -348,7 +348,7 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
|
||||||
|
|
||||||
skb_reset_transport_header(skb);
|
skb_reset_transport_header(skb);
|
||||||
|
|
||||||
inet_frag_reasm_finish(&fq->q, skb, reasm_data);
|
inet_frag_reasm_finish(&fq->q, skb, reasm_data, false);
|
||||||
|
|
||||||
skb->ignore_df = 1;
|
skb->ignore_df = 1;
|
||||||
skb->dev = dev;
|
skb->dev = dev;
|
||||||
|
|
|
@ -282,7 +282,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
|
||||||
|
|
||||||
skb_reset_transport_header(skb);
|
skb_reset_transport_header(skb);
|
||||||
|
|
||||||
inet_frag_reasm_finish(&fq->q, skb, reasm_data);
|
inet_frag_reasm_finish(&fq->q, skb, reasm_data, true);
|
||||||
|
|
||||||
skb->dev = dev;
|
skb->dev = dev;
|
||||||
ipv6_hdr(skb)->payload_len = htons(payload_len);
|
ipv6_hdr(skb)->payload_len = htons(payload_len);
|
||||||
|
|
Loading…
Reference in New Issue