2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Routines having to do with the 'struct sk_buff' memory handlers.
|
|
|
|
*
|
2008-10-14 10:01:08 +08:00
|
|
|
* Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>
|
2005-04-17 06:20:36 +08:00
|
|
|
* Florian La Roche <rzsfl@rz.uni-sb.de>
|
|
|
|
*
|
|
|
|
* Fixes:
|
|
|
|
* Alan Cox : Fixed the worst of the load
|
|
|
|
* balancer bugs.
|
|
|
|
* Dave Platt : Interrupt stacking fix.
|
|
|
|
* Richard Kooijman : Timestamp fixes.
|
|
|
|
* Alan Cox : Changed buffer format.
|
|
|
|
* Alan Cox : destructor hook for AF_UNIX etc.
|
|
|
|
* Linus Torvalds : Better skb_clone.
|
|
|
|
* Alan Cox : Added skb_copy.
|
|
|
|
* Alan Cox : Added all the changed routines Linus
|
|
|
|
* only put in the headers
|
|
|
|
* Ray VanTassle : Fixed --skb->lock in free
|
|
|
|
* Alan Cox : skb_copy copy arp field
|
|
|
|
* Andi Kleen : slabified it.
|
|
|
|
* Robert Olsson : Removed skb_head_pool
|
|
|
|
*
|
|
|
|
* NOTE:
|
|
|
|
* The __skb_ routines should be called with interrupts
|
|
|
|
* disabled, or you better be *real* sure that the operation is atomic
|
|
|
|
* with respect to whatever list is being frobbed (e.g. via lock_sock()
|
|
|
|
* or via disabling bottom half handlers, etc).
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
* as published by the Free Software Foundation; either version
|
|
|
|
* 2 of the License, or (at your option) any later version.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The functions in this file will not compile correctly with gcc 2.4.x
|
|
|
|
*/
|
|
|
|
|
2012-05-17 03:58:40 +08:00
|
|
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/kernel.h>
|
2008-08-30 18:16:35 +08:00
|
|
|
#include <linux/kmemcheck.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/interrupt.h>
|
|
|
|
#include <linux/in.h>
|
|
|
|
#include <linux/inet.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/netdevice.h>
|
|
|
|
#ifdef CONFIG_NET_CLS_ACT
|
|
|
|
#include <net/pkt_sched.h>
|
|
|
|
#endif
|
|
|
|
#include <linux/string.h>
|
|
|
|
#include <linux/skbuff.h>
|
2007-11-07 15:30:13 +08:00
|
|
|
#include <linux/splice.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/cache.h>
|
|
|
|
#include <linux/rtnetlink.h>
|
|
|
|
#include <linux/init.h>
|
2007-04-03 11:19:53 +08:00
|
|
|
#include <linux/scatterlist.h>
|
2009-02-12 13:03:37 +08:00
|
|
|
#include <linux/errqueue.h>
|
2011-05-21 03:50:29 +08:00
|
|
|
#include <linux/prefetch.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#include <net/protocol.h>
|
|
|
|
#include <net/dst.h>
|
|
|
|
#include <net/sock.h>
|
|
|
|
#include <net/checksum.h>
|
|
|
|
#include <net/xfrm.h>
|
|
|
|
|
|
|
|
#include <asm/uaccess.h>
|
2009-04-15 07:39:12 +08:00
|
|
|
#include <trace/events/skb.h>
|
2012-04-05 17:35:15 +08:00
|
|
|
#include <linux/highmem.h>
|
2006-10-20 04:08:53 +08:00
|
|
|
|
2012-04-30 16:10:34 +08:00
|
|
|
struct kmem_cache *skbuff_head_cache __read_mostly;
|
2006-12-07 12:33:20 +08:00
|
|
|
static struct kmem_cache *skbuff_fclone_cache __read_mostly;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-11-07 15:30:13 +08:00
|
|
|
static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
|
|
|
|
struct pipe_buffer *buf)
|
|
|
|
{
|
2009-01-20 09:03:56 +08:00
|
|
|
put_page(buf->page);
|
2007-11-07 15:30:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void sock_pipe_buf_get(struct pipe_inode_info *pipe,
|
|
|
|
struct pipe_buffer *buf)
|
|
|
|
{
|
2009-01-20 09:03:56 +08:00
|
|
|
get_page(buf->page);
|
2007-11-07 15:30:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * ->steal() callback for socket pipe buffers.
 *
 * Unconditionally returns 1 without touching @pipe or @buf.
 * NOTE(review): presumably a non-zero return tells the pipe layer that
 * the page cannot be stolen -- confirm against the pipe_buf_operations
 * contract.
 */
static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
			       struct pipe_buffer *buf)
{
	return 1;
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Pipe buffer operations for a socket. */
|
2009-12-16 08:46:48 +08:00
|
|
|
static const struct pipe_buf_operations sock_pipe_buf_ops = {
|
2007-11-07 15:30:13 +08:00
|
|
|
.can_merge = 0,
|
|
|
|
.map = generic_pipe_buf_map,
|
|
|
|
.unmap = generic_pipe_buf_unmap,
|
|
|
|
.confirm = generic_pipe_buf_confirm,
|
|
|
|
.release = sock_pipe_buf_release,
|
|
|
|
.steal = sock_pipe_buf_steal,
|
|
|
|
.get = sock_pipe_buf_get,
|
|
|
|
};
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/**
|
2013-02-11 21:30:38 +08:00
|
|
|
* skb_panic - private function for out-of-line support
|
|
|
|
* @skb: buffer
|
|
|
|
* @sz: size
|
|
|
|
* @addr: address
|
2013-02-13 19:20:27 +08:00
|
|
|
* @msg: skb_over_panic or skb_under_panic
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
2013-02-11 21:30:38 +08:00
|
|
|
* Out-of-line support for skb_put() and skb_push().
|
|
|
|
* Called via the wrapper skb_over_panic() or skb_under_panic().
|
|
|
|
* Keep out of line to prevent kernel bloat.
|
|
|
|
* __builtin_return_address is not used because it is not always reliable.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2013-02-11 21:30:38 +08:00
|
|
|
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
|
2013-02-13 19:20:27 +08:00
|
|
|
const char msg[])
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2012-05-17 03:58:40 +08:00
|
|
|
pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n",
|
2013-02-13 19:20:27 +08:00
|
|
|
msg, addr, skb->len, sz, skb->head, skb->data,
|
2012-05-17 03:58:40 +08:00
|
|
|
(unsigned long)skb->tail, (unsigned long)skb->end,
|
|
|
|
skb->dev ? skb->dev->name : "<NULL>");
|
2005-04-17 06:20:36 +08:00
|
|
|
BUG();
|
|
|
|
}
|
|
|
|
|
2013-02-11 21:30:38 +08:00
|
|
|
/*
 * Thin wrapper around skb_panic() that passes this function's own name
 * (__func__) as the message tag, so the log line identifies which check
 * fired.
 * NOTE(review): presumably the "ran past the end of the buffer" case --
 * confirm against the callers.
 */
static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	/* __func__ must be evaluated here, not in skb_panic(). */
	skb_panic(skb, sz, addr, __func__);
}
|
|
|
|
|
2013-02-11 21:30:38 +08:00
|
|
|
/*
 * Thin wrapper around skb_panic() that passes this function's own name
 * (__func__) as the message tag, so the log line identifies which check
 * fired.
 * NOTE(review): presumably the "ran before the start of the buffer"
 * case -- confirm against the callers.
 */
static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	/* __func__ must be evaluated here, not in skb_panic(). */
	skb_panic(skb, sz, addr, __func__);
}
|
2012-08-01 07:44:19 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
|
|
|
|
* the caller if emergency pfmemalloc reserves are being used. If it is and
|
|
|
|
* the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
|
|
|
|
* may be used. Otherwise, the packet data may be discarded until enough
|
|
|
|
* memory is free
|
|
|
|
*/
|
|
|
|
#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
|
|
|
|
__kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)
|
2012-12-29 02:24:28 +08:00
|
|
|
|
|
|
|
static void *__kmalloc_reserve(size_t size, gfp_t flags, int node,
|
|
|
|
unsigned long ip, bool *pfmemalloc)
|
2012-08-01 07:44:19 +08:00
|
|
|
{
|
|
|
|
void *obj;
|
|
|
|
bool ret_pfmemalloc = false;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Try a regular allocation, when that fails and we're not entitled
|
|
|
|
* to the reserves, fail.
|
|
|
|
*/
|
|
|
|
obj = kmalloc_node_track_caller(size,
|
|
|
|
flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
|
|
|
|
node);
|
|
|
|
if (obj || !(gfp_pfmemalloc_allowed(flags)))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/* Try again but now we are using pfmemalloc reserves */
|
|
|
|
ret_pfmemalloc = true;
|
|
|
|
obj = kmalloc_node_track_caller(size, flags, node);
|
|
|
|
|
|
|
|
out:
|
|
|
|
if (pfmemalloc)
|
|
|
|
*pfmemalloc = ret_pfmemalloc;
|
|
|
|
|
|
|
|
return obj;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Allocate a new skbuff. We do this ourselves so we can fill in a few
|
|
|
|
* 'private' fields and also do memory statistics to find all the
|
|
|
|
* [BEEP] leaks.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
|
2005-08-18 05:57:30 +08:00
|
|
|
* __alloc_skb - allocate a network buffer
|
2005-04-17 06:20:36 +08:00
|
|
|
* @size: size to allocate
|
|
|
|
* @gfp_mask: allocation mask
|
2012-08-01 07:44:19 +08:00
|
|
|
* @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
|
|
|
|
* instead of head cache and allocate a cloned (child) skb.
|
|
|
|
* If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
|
|
|
|
* allocations in case the data is required for writeback
|
2006-12-07 12:32:36 +08:00
|
|
|
* @node: numa node to allocate memory on
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* Allocate a new &sk_buff. The returned buffer has no headroom and a
|
2012-06-06 23:23:37 +08:00
|
|
|
* tail room of at least size bytes. The object has a reference count
|
|
|
|
* of one. The return is the buffer. On a failure the return is %NULL.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* Buffers may only be allocated from interrupts using a @gfp_mask of
|
|
|
|
* %GFP_ATOMIC.
|
|
|
|
*/
|
2005-10-07 14:46:04 +08:00
|
|
|
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
|
2012-08-01 07:44:19 +08:00
|
|
|
int flags, int node)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2006-12-07 12:33:20 +08:00
|
|
|
struct kmem_cache *cache;
|
2006-01-04 06:06:50 +08:00
|
|
|
struct skb_shared_info *shinfo;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct sk_buff *skb;
|
|
|
|
u8 *data;
|
2012-08-01 07:44:19 +08:00
|
|
|
bool pfmemalloc;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2012-08-01 07:44:19 +08:00
|
|
|
cache = (flags & SKB_ALLOC_FCLONE)
|
|
|
|
? skbuff_fclone_cache : skbuff_head_cache;
|
|
|
|
|
|
|
|
if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
|
|
|
|
gfp_mask |= __GFP_MEMALLOC;
|
2006-01-24 08:32:45 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Get the HEAD */
|
2006-12-07 12:32:36 +08:00
|
|
|
skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!skb)
|
|
|
|
goto out;
|
2010-05-05 16:07:37 +08:00
|
|
|
prefetchw(skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-10-13 15:28:54 +08:00
|
|
|
/* We do our best to align skb_shared_info on a separate cache
|
|
|
|
* line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
|
|
|
|
* aligned memory blocks, unless SLUB/SLAB debug is enabled.
|
|
|
|
* Both skb->head and skb_shared_info are cache line aligned.
|
|
|
|
*/
|
2011-11-02 21:40:28 +08:00
|
|
|
size = SKB_DATA_ALIGN(size);
|
2011-10-13 15:28:54 +08:00
|
|
|
size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
|
2012-08-01 07:44:19 +08:00
|
|
|
data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!data)
|
|
|
|
goto nodata;
|
2011-10-13 15:28:54 +08:00
|
|
|
/* kmalloc(size) might give us more room than requested.
|
|
|
|
* Put skb_shared_info exactly at the end of allocated zone,
|
|
|
|
* to allow max possible filling before reallocation.
|
|
|
|
*/
|
|
|
|
size = SKB_WITH_OVERHEAD(ksize(data));
|
2010-05-05 16:07:37 +08:00
|
|
|
prefetchw(data + size);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-03-19 21:48:59 +08:00
|
|
|
/*
|
2008-05-04 11:56:42 +08:00
|
|
|
* Only clear those fields we need to clear, not those that we will
|
|
|
|
* actually initialise below. Hence, don't put any more fields after
|
|
|
|
* the tail pointer in struct sk_buff!
|
2007-03-19 21:48:59 +08:00
|
|
|
*/
|
|
|
|
memset(skb, 0, offsetof(struct sk_buff, tail));
|
2011-10-13 15:28:54 +08:00
|
|
|
/* Account for allocated memory : skb + skb->head */
|
|
|
|
skb->truesize = SKB_TRUESIZE(size);
|
2012-08-01 07:44:19 +08:00
|
|
|
skb->pfmemalloc = pfmemalloc;
|
2005-04-17 06:20:36 +08:00
|
|
|
atomic_set(&skb->users, 1);
|
|
|
|
skb->head = data;
|
|
|
|
skb->data = data;
|
2007-04-20 11:29:13 +08:00
|
|
|
skb_reset_tail_pointer(skb);
|
2007-04-20 11:43:29 +08:00
|
|
|
skb->end = skb->tail + size;
|
2009-06-17 13:23:27 +08:00
|
|
|
#ifdef NET_SKBUFF_DATA_USES_OFFSET
|
|
|
|
skb->mac_header = ~0U;
|
2013-01-07 17:28:21 +08:00
|
|
|
skb->transport_header = ~0U;
|
2009-06-17 13:23:27 +08:00
|
|
|
#endif
|
|
|
|
|
2006-01-04 06:06:50 +08:00
|
|
|
/* make sure we initialize shinfo sequentially */
|
|
|
|
shinfo = skb_shinfo(skb);
|
2010-05-05 16:07:37 +08:00
|
|
|
memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
|
2006-01-04 06:06:50 +08:00
|
|
|
atomic_set(&shinfo->dataref, 1);
|
2011-01-26 07:18:38 +08:00
|
|
|
kmemcheck_annotate_variable(shinfo->destructor_arg);
|
2006-01-04 06:06:50 +08:00
|
|
|
|
2012-08-01 07:44:19 +08:00
|
|
|
if (flags & SKB_ALLOC_FCLONE) {
|
2005-08-18 05:57:30 +08:00
|
|
|
struct sk_buff *child = skb + 1;
|
|
|
|
atomic_t *fclone_ref = (atomic_t *) (child + 1);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-08-30 18:16:35 +08:00
|
|
|
kmemcheck_annotate_bitfield(child, flags1);
|
|
|
|
kmemcheck_annotate_bitfield(child, flags2);
|
2005-08-18 05:57:30 +08:00
|
|
|
skb->fclone = SKB_FCLONE_ORIG;
|
|
|
|
atomic_set(fclone_ref, 1);
|
|
|
|
|
|
|
|
child->fclone = SKB_FCLONE_UNAVAILABLE;
|
2012-08-01 07:44:19 +08:00
|
|
|
child->pfmemalloc = pfmemalloc;
|
2005-08-18 05:57:30 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
out:
|
|
|
|
return skb;
|
|
|
|
nodata:
|
2006-01-24 08:32:45 +08:00
|
|
|
kmem_cache_free(cache, skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
skb = NULL;
|
|
|
|
goto out;
|
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(__alloc_skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
net: introduce build_skb()
One of the things we discussed during netdev 2011 conference was the idea
to change some network drivers to allocate/populate their skb at RX
completion time, right before feeding the skb to network stack.
In old days, we allocated skbs when populating the RX ring.
This means bringing into cpu cache sk_buff and skb_shared_info cache
lines (since we clear/initialize them), then 'queue' skb->data to NIC.
By the time NIC fills a frame in skb->data buffer and host can process
it, cpu probably threw away the cache lines from its caches, because lot
of things happened between the allocation and final use.
So the deal would be to allocate only the data buffer for the NIC to
populate its RX ring buffer. And use build_skb() at RX completion to
attach a data buffer (now filled with an ethernet frame) to a new skb,
initialize the skb_shared_info portion, and give the hot skb to network
stack.
build_skb() is the function to allocate an skb, caller providing the
data buffer that should be attached to it. Drivers are expected to call
skb_reserve() right after build_skb() to adjust skb->data to the
Ethernet frame (usually skipping NET_SKB_PAD and NET_IP_ALIGN, but some
drivers might add a hardware provided alignment)
Data provided to build_skb() MUST have been allocated by a prior
kmalloc() call, with enough room to add SKB_DATA_ALIGN(sizeof(struct
skb_shared_info)) bytes at the end of the data without corrupting
incoming frame.
data = kmalloc(NET_SKB_PAD + NET_IP_ALIGN + 1536 +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
GFP_ATOMIC);
...
skb = build_skb(data);
if (!skb) {
recycle_data(data);
} else {
skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
...
}
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Eilon Greenstein <eilong@broadcom.com>
CC: Ben Hutchings <bhutchings@solarflare.com>
CC: Tom Herbert <therbert@google.com>
CC: Jamal Hadi Salim <hadi@mojatatu.com>
CC: Stephen Hemminger <shemminger@vyatta.com>
CC: Thomas Graf <tgraf@infradead.org>
CC: Herbert Xu <herbert@gondor.apana.org.au>
CC: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-14 14:03:34 +08:00
|
|
|
/**
|
|
|
|
* build_skb - build a network buffer
|
|
|
|
* @data: data buffer provided by caller
|
net: allow skb->head to be a page fragment
skb->head is currently allocated from kmalloc(). This is convenient but
has the drawback the data cannot be converted to a page fragment if
needed.
We have three spots where it hurts:
1) GRO aggregation
When a linear skb must be appended to another skb, GRO uses the
frag_list fallback, very inefficient since we keep all struct sk_buff
around. So drivers enabling GRO but delivering linear skbs to network
stack aren't enabling full GRO power.
2) splice(socket -> pipe).
We must copy the linear part to a page fragment.
This kind of defeats splice() purpose (zero copy claim)
3) TCP coalescing.
Recently introduced, this permits to group several contiguous segments
into a single skb. This shortens queue lengths and save kernel memory,
and greatly reduce probabilities of TCP collapses. This coalescing
doesnt work on linear skbs (or we would need to copy data, this would be
too slow)
Given all these issues, the following patch introduces the possibility
of having skb->head be a fragment in itself. We use a new skb flag,
skb->head_frag to carry this information.
build_skb() is changed to accept a frag_size argument. Drivers willing
to provide a page fragment instead of kmalloc() data will set a non zero
value, set to the fragment size.
Then, on situations we need to convert the skb head to a frag in itself,
we can check if skb->head_frag is set and avoid the copies or various
fallbacks we have.
This means drivers currently using frags could be updated to avoid the
current skb->head allocation and reduce their memory footprint (aka skb
truesize). (thats 512 or 1024 bytes saved per skb). This also makes
bpf/netfilter faster since the 'first frag' will be part of skb linear
part, no need to copy data.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Maciej Żenczykowski <maze@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Matt Carlson <mcarlson@broadcom.com>
Cc: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-04-27 08:33:38 +08:00
|
|
|
* @frag_size: size of fragment, or 0 if head was kmalloced
|
net: introduce build_skb()
One of the things we discussed during netdev 2011 conference was the idea
to change some network drivers to allocate/populate their skb at RX
completion time, right before feeding the skb to network stack.
In old days, we allocated skbs when populating the RX ring.
This means bringing into cpu cache sk_buff and skb_shared_info cache
lines (since we clear/initialize them), then 'queue' skb->data to NIC.
By the time NIC fills a frame in skb->data buffer and host can process
it, cpu probably threw away the cache lines from its caches, because lot
of things happened between the allocation and final use.
So the deal would be to allocate only the data buffer for the NIC to
populate its RX ring buffer. And use build_skb() at RX completion to
attach a data buffer (now filled with an ethernet frame) to a new skb,
initialize the skb_shared_info portion, and give the hot skb to network
stack.
build_skb() is the function to allocate an skb, caller providing the
data buffer that should be attached to it. Drivers are expected to call
skb_reserve() right after build_skb() to adjust skb->data to the
Ethernet frame (usually skipping NET_SKB_PAD and NET_IP_ALIGN, but some
drivers might add a hardware provided alignment)
Data provided to build_skb() MUST have been allocated by a prior
kmalloc() call, with enough room to add SKB_DATA_ALIGN(sizeof(struct
skb_shared_info)) bytes at the end of the data without corrupting
incoming frame.
data = kmalloc(NET_SKB_PAD + NET_IP_ALIGN + 1536 +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
GFP_ATOMIC);
...
skb = build_skb(data);
if (!skb) {
recycle_data(data);
} else {
skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
...
}
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Eilon Greenstein <eilong@broadcom.com>
CC: Ben Hutchings <bhutchings@solarflare.com>
CC: Tom Herbert <therbert@google.com>
CC: Jamal Hadi Salim <hadi@mojatatu.com>
CC: Stephen Hemminger <shemminger@vyatta.com>
CC: Thomas Graf <tgraf@infradead.org>
CC: Herbert Xu <herbert@gondor.apana.org.au>
CC: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-14 14:03:34 +08:00
|
|
|
*
|
|
|
|
* Allocate a new &sk_buff. Caller provides space holding head and
|
|
|
|
* skb_shared_info. @data must have been allocated by kmalloc()
|
|
|
|
* The return is the new skb buffer.
|
|
|
|
* On a failure the return is %NULL, and @data is not freed.
|
|
|
|
* Notes :
|
|
|
|
* Before IO, driver allocates only data buffer where NIC put incoming frame
|
|
|
|
* Driver should add room at head (NET_SKB_PAD) and
|
|
|
|
* MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
|
|
|
|
* After IO, driver calls build_skb(), to allocate sk_buff and populate it
|
|
|
|
* before giving packet to stack.
|
|
|
|
* RX rings only contain data buffers, not full skbs.
|
|
|
|
*/
|
net: allow skb->head to be a page fragment
skb->head is currently allocated from kmalloc(). This is convenient but
has the drawback the data cannot be converted to a page fragment if
needed.
We have three spots where it hurts:
1) GRO aggregation
When a linear skb must be appended to another skb, GRO uses the
frag_list fallback, very inefficient since we keep all struct sk_buff
around. So drivers enabling GRO but delivering linear skbs to network
stack aren't enabling full GRO power.
2) splice(socket -> pipe).
We must copy the linear part to a page fragment.
This kind of defeats splice() purpose (zero copy claim)
3) TCP coalescing.
Recently introduced, this permits to group several contiguous segments
into a single skb. This shortens queue lengths and save kernel memory,
and greatly reduce probabilities of TCP collapses. This coalescing
doesnt work on linear skbs (or we would need to copy data, this would be
too slow)
Given all these issues, the following patch introduces the possibility
of having skb->head be a fragment in itself. We use a new skb flag,
skb->head_frag to carry this information.
build_skb() is changed to accept a frag_size argument. Drivers willing
to provide a page fragment instead of kmalloc() data will set a non zero
value, set to the fragment size.
Then, on situations we need to convert the skb head to a frag in itself,
we can check if skb->head_frag is set and avoid the copies or various
fallbacks we have.
This means drivers currently using frags could be updated to avoid the
current skb->head allocation and reduce their memory footprint (aka skb
truesize). (thats 512 or 1024 bytes saved per skb). This also makes
bpf/netfilter faster since the 'first frag' will be part of skb linear
part, no need to copy data.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Maciej Żenczykowski <maze@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Matt Carlson <mcarlson@broadcom.com>
Cc: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-04-27 08:33:38 +08:00
|
|
|
struct sk_buff *build_skb(void *data, unsigned int frag_size)
|
net: introduce build_skb()
One of the things we discussed during netdev 2011 conference was the idea
to change some network drivers to allocate/populate their skb at RX
completion time, right before feeding the skb to network stack.
In old days, we allocated skbs when populating the RX ring.
This means bringing into cpu cache sk_buff and skb_shared_info cache
lines (since we clear/initialize them), then 'queue' skb->data to NIC.
By the time NIC fills a frame in skb->data buffer and host can process
it, cpu probably threw away the cache lines from its caches, because lot
of things happened between the allocation and final use.
So the deal would be to allocate only the data buffer for the NIC to
populate its RX ring buffer. And use build_skb() at RX completion to
attach a data buffer (now filled with an ethernet frame) to a new skb,
initialize the skb_shared_info portion, and give the hot skb to network
stack.
build_skb() is the function to allocate an skb, caller providing the
data buffer that should be attached to it. Drivers are expected to call
skb_reserve() right after build_skb() to adjust skb->data to the
Ethernet frame (usually skipping NET_SKB_PAD and NET_IP_ALIGN, but some
drivers might add a hardware provided alignment)
Data provided to build_skb() MUST have been allocated by a prior
kmalloc() call, with enough room to add SKB_DATA_ALIGN(sizeof(struct
skb_shared_info)) bytes at the end of the data without corrupting
incoming frame.
data = kmalloc(NET_SKB_PAD + NET_IP_ALIGN + 1536 +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
GFP_ATOMIC);
...
skb = build_skb(data);
if (!skb) {
recycle_data(data);
} else {
skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
...
}
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Eilon Greenstein <eilong@broadcom.com>
CC: Ben Hutchings <bhutchings@solarflare.com>
CC: Tom Herbert <therbert@google.com>
CC: Jamal Hadi Salim <hadi@mojatatu.com>
CC: Stephen Hemminger <shemminger@vyatta.com>
CC: Thomas Graf <tgraf@infradead.org>
CC: Herbert Xu <herbert@gondor.apana.org.au>
CC: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-14 14:03:34 +08:00
|
|
|
{
|
|
|
|
struct skb_shared_info *shinfo;
|
|
|
|
struct sk_buff *skb;
|
net: allow skb->head to be a page fragment
skb->head is currently allocated from kmalloc(). This is convenient but
has the drawback the data cannot be converted to a page fragment if
needed.
We have three spots where it hurts:
1) GRO aggregation
When a linear skb must be appended to another skb, GRO uses the
frag_list fallback, very inefficient since we keep all struct sk_buff
around. So drivers enabling GRO but delivering linear skbs to network
stack aren't enabling full GRO power.
2) splice(socket -> pipe).
We must copy the linear part to a page fragment.
This kind of defeats splice() purpose (zero copy claim)
3) TCP coalescing.
Recently introduced, this permits to group several contiguous segments
into a single skb. This shortens queue lengths and save kernel memory,
and greatly reduce probabilities of TCP collapses. This coalescing
doesnt work on linear skbs (or we would need to copy data, this would be
too slow)
Given all these issues, the following patch introduces the possibility
of having skb->head be a fragment in itself. We use a new skb flag,
skb->head_frag to carry this information.
build_skb() is changed to accept a frag_size argument. Drivers willing
to provide a page fragment instead of kmalloc() data will set a non zero
value, set to the fragment size.
Then, on situations we need to convert the skb head to a frag in itself,
we can check if skb->head_frag is set and avoid the copies or various
fallbacks we have.
This means drivers currently using frags could be updated to avoid the
current skb->head allocation and reduce their memory footprint (aka skb
truesize). (thats 512 or 1024 bytes saved per skb). This also makes
bpf/netfilter faster since the 'first frag' will be part of skb linear
part, no need to copy data.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Maciej Żenczykowski <maze@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Matt Carlson <mcarlson@broadcom.com>
Cc: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-04-27 08:33:38 +08:00
|
|
|
unsigned int size = frag_size ? : ksize(data);
|
net: introduce build_skb()
One of the things we discussed during the netdev 2011 conference was the idea
to change some network drivers to allocate/populate their skb at RX
completion time, right before feeding the skb to network stack.
In old days, we allocated skbs when populating the RX ring.
This means bringing into cpu cache sk_buff and skb_shared_info cache
lines (since we clear/initialize them), then 'queue' skb->data to NIC.
By the time the NIC fills a frame in the skb->data buffer and the host can
process it, the cpu has probably evicted those cache lines, because a lot
of things happened between the allocation and the final use.
So the deal would be to allocate only the data buffer for the NIC to
populate its RX ring buffer. And use build_skb() at RX completion to
attach a data buffer (now filled with an ethernet frame) to a new skb,
initialize the skb_shared_info portion, and give the hot skb to network
stack.
build_skb() is the function to allocate an skb, caller providing the
data buffer that should be attached to it. Drivers are expected to call
skb_reserve() right after build_skb() to adjust skb->data to the
Ethernet frame (usually skipping NET_SKB_PAD and NET_IP_ALIGN, but some
drivers might add a hardware provided alignment)
Data provided to build_skb() MUST have been allocated by a prior
kmalloc() call, with enough room to add SKB_DATA_ALIGN(sizeof(struct
skb_shared_info)) bytes at the end of the data without corrupting
incoming frame.
data = kmalloc(NET_SKB_PAD + NET_IP_ALIGN + 1536 +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
GFP_ATOMIC);
...
skb = build_skb(data);
if (!skb) {
recycle_data(data);
} else {
skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
...
}
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Eilon Greenstein <eilong@broadcom.com>
CC: Ben Hutchings <bhutchings@solarflare.com>
CC: Tom Herbert <therbert@google.com>
CC: Jamal Hadi Salim <hadi@mojatatu.com>
CC: Stephen Hemminger <shemminger@vyatta.com>
CC: Thomas Graf <tgraf@infradead.org>
CC: Herbert Xu <herbert@gondor.apana.org.au>
CC: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-14 14:03:34 +08:00
|
|
|
|
|
|
|
skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
|
|
|
|
if (!skb)
|
|
|
|
return NULL;
|
|
|
|
|
net: allow skb->head to be a page fragment
skb->head is currently allocated from kmalloc(). This is convenient but
has the drawback the data cannot be converted to a page fragment if
needed.
We have three spots where it hurts:
1) GRO aggregation
When a linear skb must be appended to another skb, GRO uses the
frag_list fallback, very inefficient since we keep all struct sk_buff
around. So drivers enabling GRO but delivering linear skbs to network
stack aren't enabling full GRO power.
2) splice(socket -> pipe).
We must copy the linear part to a page fragment.
This kind of defeats splice() purpose (zero copy claim)
3) TCP coalescing.
Recently introduced, this allows several contiguous segments to be grouped
into a single skb. This shortens queue lengths, saves kernel memory,
and greatly reduces the probability of TCP collapses. This coalescing
doesn't work on linear skbs (we would need to copy data, which would be
too slow).
Given all these issues, the following patch introduces the possibility
of having skb->head be a fragment in itself. We use a new skb flag,
skb->head_frag to carry this information.
build_skb() is changed to accept a frag_size argument. Drivers willing
to provide a page fragment instead of kmalloc() data will set a non zero
value, set to the fragment size.
Then, on situations we need to convert the skb head to a frag in itself,
we can check if skb->head_frag is set and avoid the copies or various
fallbacks we have.
This means drivers currently using frags could be updated to avoid the
current skb->head allocation and reduce their memory footprint (aka skb
truesize). (thats 512 or 1024 bytes saved per skb). This also makes
bpf/netfilter faster since the 'first frag' will be part of skb linear
part, no need to copy data.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Maciej Żenczykowski <maze@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Matt Carlson <mcarlson@broadcom.com>
Cc: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-04-27 08:33:38 +08:00
|
|
|
size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
|
net: introduce build_skb()
One of the thing we discussed during netdev 2011 conference was the idea
to change some network drivers to allocate/populate their skb at RX
completion time, right before feeding the skb to network stack.
In old days, we allocated skbs when populating the RX ring.
This means bringing into cpu cache sk_buff and skb_shared_info cache
lines (since we clear/initialize them), then 'queue' skb->data to NIC.
By the time NIC fills a frame in skb->data buffer and host can process
it, cpu probably threw away the cache lines from its caches, because lot
of things happened between the allocation and final use.
So the deal would be to allocate only the data buffer for the NIC to
populate its RX ring buffer. And use build_skb() at RX completion to
attach a data buffer (now filled with an ethernet frame) to a new skb,
initialize the skb_shared_info portion, and give the hot skb to network
stack.
build_skb() is the function to allocate an skb, caller providing the
data buffer that should be attached to it. Drivers are expected to call
skb_reserve() right after build_skb() to adjust skb->data to the
Ethernet frame (usually skipping NET_SKB_PAD and NET_IP_ALIGN, but some
drivers might add a hardware provided alignment)
Data provided to build_skb() MUST have been allocated by a prior
kmalloc() call, with enough room to add SKB_DATA_ALIGN(sizeof(struct
skb_shared_info)) bytes at the end of the data without corrupting
incoming frame.
data = kmalloc(NET_SKB_PAD + NET_IP_ALIGN + 1536 +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
GFP_ATOMIC);
...
skb = build_skb(data);
if (!skb) {
recycle_data(data);
} else {
skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
...
}
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Eilon Greenstein <eilong@broadcom.com>
CC: Ben Hutchings <bhutchings@solarflare.com>
CC: Tom Herbert <therbert@google.com>
CC: Jamal Hadi Salim <hadi@mojatatu.com>
CC: Stephen Hemminger <shemminger@vyatta.com>
CC: Thomas Graf <tgraf@infradead.org>
CC: Herbert Xu <herbert@gondor.apana.org.au>
CC: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-14 14:03:34 +08:00
|
|
|
|
|
|
|
memset(skb, 0, offsetof(struct sk_buff, tail));
|
|
|
|
skb->truesize = SKB_TRUESIZE(size);
|
net: allow skb->head to be a page fragment
skb->head is currently allocated from kmalloc(). This is convenient but
has the drawback the data cannot be converted to a page fragment if
needed.
We have three spots where it hurts:
1) GRO aggregation
When a linear skb must be appended to another skb, GRO uses the
frag_list fallback, very inefficient since we keep all struct sk_buff
around. So drivers enabling GRO but delivering linear skbs to network
stack aren't enabling full GRO power.
2) splice(socket -> pipe).
We must copy the linear part to a page fragment.
This kind of defeats splice() purpose (zero copy claim)
3) TCP coalescing.
Recently introduced, this allows several contiguous segments to be grouped
into a single skb. This shortens queue lengths, saves kernel memory,
and greatly reduces the probability of TCP collapses. This coalescing
doesn't work on linear skbs (we would need to copy data, which would be
too slow).
Given all these issues, the following patch introduces the possibility
of having skb->head be a fragment in itself. We use a new skb flag,
skb->head_frag to carry this information.
build_skb() is changed to accept a frag_size argument. Drivers willing
to provide a page fragment instead of kmalloc() data will set a non zero
value, set to the fragment size.
Then, on situations we need to convert the skb head to a frag in itself,
we can check if skb->head_frag is set and avoid the copies or various
fallbacks we have.
This means drivers currently using frags could be updated to avoid the
current skb->head allocation and reduce their memory footprint (aka skb
truesize). (thats 512 or 1024 bytes saved per skb). This also makes
bpf/netfilter faster since the 'first frag' will be part of skb linear
part, no need to copy data.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Maciej Żenczykowski <maze@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Matt Carlson <mcarlson@broadcom.com>
Cc: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-04-27 08:33:38 +08:00
|
|
|
skb->head_frag = frag_size != 0;
|
net: introduce build_skb()
One of the thing we discussed during netdev 2011 conference was the idea
to change some network drivers to allocate/populate their skb at RX
completion time, right before feeding the skb to network stack.
In old days, we allocated skbs when populating the RX ring.
This means bringing into cpu cache sk_buff and skb_shared_info cache
lines (since we clear/initialize them), then 'queue' skb->data to NIC.
By the time NIC fills a frame in skb->data buffer and host can process
it, cpu probably threw away the cache lines from its caches, because lot
of things happened between the allocation and final use.
So the deal would be to allocate only the data buffer for the NIC to
populate its RX ring buffer. And use build_skb() at RX completion to
attach a data buffer (now filled with an ethernet frame) to a new skb,
initialize the skb_shared_info portion, and give the hot skb to network
stack.
build_skb() is the function to allocate an skb, caller providing the
data buffer that should be attached to it. Drivers are expected to call
skb_reserve() right after build_skb() to adjust skb->data to the
Ethernet frame (usually skipping NET_SKB_PAD and NET_IP_ALIGN, but some
drivers might add a hardware provided alignment)
Data provided to build_skb() MUST have been allocated by a prior
kmalloc() call, with enough room to add SKB_DATA_ALIGN(sizeof(struct
skb_shared_info)) bytes at the end of the data without corrupting
incoming frame.
data = kmalloc(NET_SKB_PAD + NET_IP_ALIGN + 1536 +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
GFP_ATOMIC);
...
skb = build_skb(data);
if (!skb) {
recycle_data(data);
} else {
skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
...
}
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Eilon Greenstein <eilong@broadcom.com>
CC: Ben Hutchings <bhutchings@solarflare.com>
CC: Tom Herbert <therbert@google.com>
CC: Jamal Hadi Salim <hadi@mojatatu.com>
CC: Stephen Hemminger <shemminger@vyatta.com>
CC: Thomas Graf <tgraf@infradead.org>
CC: Herbert Xu <herbert@gondor.apana.org.au>
CC: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-14 14:03:34 +08:00
|
|
|
atomic_set(&skb->users, 1);
|
|
|
|
skb->head = data;
|
|
|
|
skb->data = data;
|
|
|
|
skb_reset_tail_pointer(skb);
|
|
|
|
skb->end = skb->tail + size;
|
|
|
|
#ifdef NET_SKBUFF_DATA_USES_OFFSET
|
|
|
|
skb->mac_header = ~0U;
|
2013-01-07 17:28:21 +08:00
|
|
|
skb->transport_header = ~0U;
|
net: introduce build_skb()
One of the thing we discussed during netdev 2011 conference was the idea
to change some network drivers to allocate/populate their skb at RX
completion time, right before feeding the skb to network stack.
In old days, we allocated skbs when populating the RX ring.
This means bringing into cpu cache sk_buff and skb_shared_info cache
lines (since we clear/initialize them), then 'queue' skb->data to NIC.
By the time NIC fills a frame in skb->data buffer and host can process
it, cpu probably threw away the cache lines from its caches, because lot
of things happened between the allocation and final use.
So the deal would be to allocate only the data buffer for the NIC to
populate its RX ring buffer. And use build_skb() at RX completion to
attach a data buffer (now filled with an ethernet frame) to a new skb,
initialize the skb_shared_info portion, and give the hot skb to network
stack.
build_skb() is the function to allocate an skb, caller providing the
data buffer that should be attached to it. Drivers are expected to call
skb_reserve() right after build_skb() to adjust skb->data to the
Ethernet frame (usually skipping NET_SKB_PAD and NET_IP_ALIGN, but some
drivers might add a hardware provided alignment)
Data provided to build_skb() MUST have been allocated by a prior
kmalloc() call, with enough room to add SKB_DATA_ALIGN(sizeof(struct
skb_shared_info)) bytes at the end of the data without corrupting
incoming frame.
data = kmalloc(NET_SKB_PAD + NET_IP_ALIGN + 1536 +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
GFP_ATOMIC);
...
skb = build_skb(data);
if (!skb) {
recycle_data(data);
} else {
skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
...
}
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Eilon Greenstein <eilong@broadcom.com>
CC: Ben Hutchings <bhutchings@solarflare.com>
CC: Tom Herbert <therbert@google.com>
CC: Jamal Hadi Salim <hadi@mojatatu.com>
CC: Stephen Hemminger <shemminger@vyatta.com>
CC: Thomas Graf <tgraf@infradead.org>
CC: Herbert Xu <herbert@gondor.apana.org.au>
CC: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-14 14:03:34 +08:00
|
|
|
#endif
|
|
|
|
|
|
|
|
/* make sure we initialize shinfo sequentially */
|
|
|
|
shinfo = skb_shinfo(skb);
|
|
|
|
memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
|
|
|
|
atomic_set(&shinfo->dataref, 1);
|
|
|
|
kmemcheck_annotate_variable(shinfo->destructor_arg);
|
|
|
|
|
|
|
|
return skb;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(build_skb);
|
|
|
|
|
2012-05-17 15:34:16 +08:00
|
|
|
struct netdev_alloc_cache {
|
2012-09-26 14:46:57 +08:00
|
|
|
struct page_frag frag;
|
|
|
|
/* we maintain a pagecount bias, so that we dont dirty cache line
|
|
|
|
* containing page->_count every time we allocate a fragment.
|
|
|
|
*/
|
|
|
|
unsigned int pagecnt_bias;
|
2012-05-17 15:34:16 +08:00
|
|
|
};
|
|
|
|
static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
|
|
|
|
|
2012-08-01 07:44:19 +08:00
|
|
|
/*
 * __netdev_alloc_frag - carve a fragment out of this CPU's fragment page
 * @fragsz: number of bytes requested
 * @gfp_mask: allocation flags used when a fresh page must be obtained
 *
 * Fragments are handed out sequentially from nc->frag.page.  The page count
 * is preloaded with NETDEV_PAGECNT_MAX_BIAS and nc->pagecnt_bias tracks how
 * many of those references the cache still holds; it is decremented for
 * every fragment returned.  The whole operation runs with local interrupts
 * disabled.
 *
 * Returns a pointer to the fragment, or NULL when no page could be allocated
 * or when @fragsz is larger than the page the cache managed to obtain.
 */
static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
	struct netdev_alloc_cache *nc;
	void *data = NULL;
	int order;
	unsigned long flags;

	local_irq_save(flags);
	nc = &__get_cpu_var(netdev_alloc_cache);
	if (unlikely(!nc->frag.page)) {
refill:
		/* start with the largest order and fall back to smaller ones */
		for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) {
			gfp_t gfp = gfp_mask;

			if (order)
				gfp |= __GFP_COMP | __GFP_NOWARN;
			nc->frag.page = alloc_pages(gfp, order);
			if (likely(nc->frag.page))
				break;
			if (--order < 0)
				goto end;
		}
		nc->frag.size = PAGE_SIZE << order;
recycle:
		atomic_set(&nc->frag.page->_count, NETDEV_PAGECNT_MAX_BIAS);
		nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS;
		nc->frag.offset = 0;

		/* An empty page that still cannot hold the request never
		 * will.  Without this check the overflow test below would
		 * send us back to "recycle" forever, with interrupts off.
		 * The page is kept cached for smaller requests.
		 */
		if (unlikely(fragsz > nc->frag.size))
			goto end;
	}

	/* offset never exceeds size, so the subtraction cannot wrap, whereas
	 * "offset + fragsz" could for a very large fragsz.
	 */
	if (fragsz > nc->frag.size - nc->frag.offset) {
		/* avoid unnecessary locked operations if possible */
		if ((atomic_read(&nc->frag.page->_count) == nc->pagecnt_bias) ||
		    atomic_sub_and_test(nc->pagecnt_bias, &nc->frag.page->_count))
			goto recycle;
		goto refill;
	}

	data = page_address(nc->frag.page) + nc->frag.offset;
	nc->frag.offset += fragsz;
	nc->pagecnt_bias--;
end:
	local_irq_restore(flags);
	return data;
}
|
2012-08-01 07:44:19 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* netdev_alloc_frag - allocate a page fragment
|
|
|
|
* @fragsz: fragment size
|
|
|
|
*
|
|
|
|
* Allocates a frag from a page for receive buffer.
|
|
|
|
* Uses GFP_ATOMIC allocations.
|
|
|
|
*/
|
|
|
|
void *netdev_alloc_frag(unsigned int fragsz)
|
|
|
|
{
|
|
|
|
return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
|
|
|
|
}
|
2012-05-18 13:12:12 +08:00
|
|
|
EXPORT_SYMBOL(netdev_alloc_frag);
|
|
|
|
|
2006-08-01 13:35:23 +08:00
|
|
|
/**
|
|
|
|
* __netdev_alloc_skb - allocate an skbuff for rx on a specific device
|
|
|
|
* @dev: network device to receive on
|
|
|
|
* @length: length to allocate
|
|
|
|
* @gfp_mask: get_free_pages mask, passed to alloc_skb
|
|
|
|
*
|
|
|
|
* Allocate a new &sk_buff and assign it a usage count of one. The
|
|
|
|
* buffer has unspecified headroom built in. Users should allocate
|
|
|
|
* the headroom they think they need without accounting for the
|
|
|
|
* built in space. The built in space is used for optimisations.
|
|
|
|
*
|
|
|
|
* %NULL is returned if there is no free memory.
|
|
|
|
*/
|
|
|
|
struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
|
2012-05-18 13:12:12 +08:00
|
|
|
unsigned int length, gfp_t gfp_mask)
|
2006-08-01 13:35:23 +08:00
|
|
|
{
|
2012-05-18 13:12:12 +08:00
|
|
|
struct sk_buff *skb = NULL;
|
2012-05-17 15:34:16 +08:00
|
|
|
unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) +
|
|
|
|
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
|
|
|
|
|
2012-07-16 19:15:52 +08:00
|
|
|
if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) {
|
2012-08-01 07:44:19 +08:00
|
|
|
void *data;
|
|
|
|
|
|
|
|
if (sk_memalloc_socks())
|
|
|
|
gfp_mask |= __GFP_MEMALLOC;
|
|
|
|
|
|
|
|
data = __netdev_alloc_frag(fragsz, gfp_mask);
|
2012-05-17 15:34:16 +08:00
|
|
|
|
2012-05-18 13:12:12 +08:00
|
|
|
if (likely(data)) {
|
|
|
|
skb = build_skb(data, fragsz);
|
|
|
|
if (unlikely(!skb))
|
|
|
|
put_page(virt_to_head_page(data));
|
2012-05-17 15:34:16 +08:00
|
|
|
}
|
|
|
|
} else {
|
2012-08-01 07:44:19 +08:00
|
|
|
skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask,
|
|
|
|
SKB_ALLOC_RX, NUMA_NO_NODE);
|
2012-05-17 15:34:16 +08:00
|
|
|
}
|
2006-08-08 07:09:04 +08:00
|
|
|
if (likely(skb)) {
|
2006-08-01 13:35:23 +08:00
|
|
|
skb_reserve(skb, NET_SKB_PAD);
|
2006-08-08 07:09:04 +08:00
|
|
|
skb->dev = dev;
|
|
|
|
}
|
2006-08-01 13:35:23 +08:00
|
|
|
return skb;
|
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(__netdev_alloc_skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-10-08 05:22:33 +08:00
|
|
|
void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
|
2012-03-24 07:59:33 +08:00
|
|
|
int size, unsigned int truesize)
|
2008-10-08 05:22:33 +08:00
|
|
|
{
|
|
|
|
skb_fill_page_desc(skb, i, page, off, size);
|
|
|
|
skb->len += size;
|
|
|
|
skb->data_len += size;
|
2012-03-24 07:59:33 +08:00
|
|
|
skb->truesize += truesize;
|
2008-10-08 05:22:33 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(skb_add_rx_frag);
|
|
|
|
|
2006-07-14 10:26:39 +08:00
|
|
|
static void skb_drop_list(struct sk_buff **listp)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2006-07-14 10:26:39 +08:00
|
|
|
struct sk_buff *list = *listp;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-07-14 10:26:39 +08:00
|
|
|
*listp = NULL;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
do {
|
|
|
|
struct sk_buff *this = list;
|
|
|
|
list = list->next;
|
|
|
|
kfree_skb(this);
|
|
|
|
} while (list);
|
|
|
|
}
|
|
|
|
|
2006-07-14 10:26:39 +08:00
|
|
|
static inline void skb_drop_fraglist(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
skb_drop_list(&skb_shinfo(skb)->frag_list);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
static void skb_clone_fraglist(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
struct sk_buff *list;
|
|
|
|
|
2009-06-09 15:18:59 +08:00
|
|
|
skb_walk_frags(skb, list)
|
2005-04-17 06:20:36 +08:00
|
|
|
skb_get(list);
|
|
|
|
}
|
|
|
|
|
net: allow skb->head to be a page fragment
skb->head is currently allocated from kmalloc(). This is convenient but
has the drawback the data cannot be converted to a page fragment if
needed.
We have three spots where it hurts:
1) GRO aggregation
When a linear skb must be appended to another skb, GRO uses the
frag_list fallback, very inefficient since we keep all struct sk_buff
around. So drivers enabling GRO but delivering linear skbs to network
stack aren't enabling full GRO power.
2) splice(socket -> pipe).
We must copy the linear part to a page fragment.
This kind of defeats splice() purpose (zero copy claim)
3) TCP coalescing.
Recently introduced, this allows several contiguous segments to be grouped
into a single skb. This shortens queue lengths, saves kernel memory,
and greatly reduces the probability of TCP collapses. This coalescing
doesn't work on linear skbs (we would need to copy data, which would be
too slow).
Given all these issues, the following patch introduces the possibility
of having skb->head be a fragment in itself. We use a new skb flag,
skb->head_frag to carry this information.
build_skb() is changed to accept a frag_size argument. Drivers willing
to provide a page fragment instead of kmalloc() data will set a non zero
value, set to the fragment size.
Then, on situations we need to convert the skb head to a frag in itself,
we can check if skb->head_frag is set and avoid the copies or various
fallbacks we have.
This means drivers currently using frags could be updated to avoid the
current skb->head allocation and reduce their memory footprint (aka skb
truesize). (thats 512 or 1024 bytes saved per skb). This also makes
bpf/netfilter faster since the 'first frag' will be part of skb linear
part, no need to copy data.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Maciej Żenczykowski <maze@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Matt Carlson <mcarlson@broadcom.com>
Cc: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-04-27 08:33:38 +08:00
|
|
|
static void skb_free_head(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
if (skb->head_frag)
|
|
|
|
put_page(virt_to_head_page(skb->head));
|
|
|
|
else
|
|
|
|
kfree(skb->head);
|
|
|
|
}
|
|
|
|
|
2006-06-30 04:02:35 +08:00
|
|
|
static void skb_release_data(struct sk_buff *skb)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
if (!skb->cloned ||
|
|
|
|
!atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
|
|
|
|
&skb_shinfo(skb)->dataref)) {
|
|
|
|
if (skb_shinfo(skb)->nr_frags) {
|
|
|
|
int i;
|
|
|
|
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
|
2011-08-23 07:44:58 +08:00
|
|
|
skb_frag_unref(skb, i);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2011-07-06 20:22:12 +08:00
|
|
|
/*
|
|
|
|
* If skb buf is from userspace, we need to notify the caller
|
|
|
|
* the lower device DMA has done;
|
|
|
|
*/
|
|
|
|
if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
|
|
|
|
struct ubuf_info *uarg;
|
|
|
|
|
|
|
|
uarg = skb_shinfo(skb)->destructor_arg;
|
|
|
|
if (uarg->callback)
|
2012-11-01 17:16:22 +08:00
|
|
|
uarg->callback(uarg, true);
|
2011-07-06 20:22:12 +08:00
|
|
|
}
|
|
|
|
|
2010-08-23 15:13:46 +08:00
|
|
|
if (skb_has_frag_list(skb))
|
2005-04-17 06:20:36 +08:00
|
|
|
skb_drop_fraglist(skb);
|
|
|
|
|
net: allow skb->head to be a page fragment
skb->head is currently allocated from kmalloc(). This is convenient but
has the drawback the data cannot be converted to a page fragment if
needed.
We have three spots were it hurts :
1) GRO aggregation
When a linear skb must be appended to another skb, GRO uses the
frag_list fallback, very inefficient since we keep all struct sk_buff
around. So drivers enabling GRO but delivering linear skbs to network
stack aren't enabling full GRO power.
2) splice(socket -> pipe).
We must copy the linear part to a page fragment.
This kind of defeats splice() purpose (zero copy claim)
3) TCP coalescing.
Recently introduced, this permits to group several contiguous segments
into a single skb. This shortens queue lengths and save kernel memory,
and greatly reduce probabilities of TCP collapses. This coalescing
doesnt work on linear skbs (or we would need to copy data, this would be
too slow)
Given all these issues, the following patch introduces the possibility
of having skb->head be a fragment in itself. We use a new skb flag,
skb->head_frag to carry this information.
build_skb() is changed to accept a frag_size argument. Drivers willing
to provide a page fragment instead of kmalloc() data will set a non zero
value, set to the fragment size.
Then, on situations we need to convert the skb head to a frag in itself,
we can check if skb->head_frag is set and avoid the copies or various
fallbacks we have.
This means drivers currently using frags could be updated to avoid the
current skb->head allocation and reduce their memory footprint (aka skb
truesize). (thats 512 or 1024 bytes saved per skb). This also makes
bpf/netfilter faster since the 'first frag' will be part of skb linear
part, no need to copy data.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Maciej Żenczykowski <maze@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Matt Carlson <mcarlson@broadcom.com>
Cc: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-04-27 08:33:38 +08:00
|
|
|
skb_free_head(skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Free an skbuff by memory without cleaning the state.
|
|
|
|
*/
|
2007-11-26 23:11:19 +08:00
|
|
|
static void kfree_skbmem(struct sk_buff *skb)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2005-08-18 05:57:30 +08:00
|
|
|
struct sk_buff *other;
|
|
|
|
atomic_t *fclone_ref;
|
|
|
|
|
|
|
|
switch (skb->fclone) {
|
|
|
|
case SKB_FCLONE_UNAVAILABLE:
|
|
|
|
kmem_cache_free(skbuff_head_cache, skb);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case SKB_FCLONE_ORIG:
|
|
|
|
fclone_ref = (atomic_t *) (skb + 2);
|
|
|
|
if (atomic_dec_and_test(fclone_ref))
|
|
|
|
kmem_cache_free(skbuff_fclone_cache, skb);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case SKB_FCLONE_CLONE:
|
|
|
|
fclone_ref = (atomic_t *) (skb + 1);
|
|
|
|
other = skb - 1;
|
|
|
|
|
|
|
|
/* The clone portion is available for
|
|
|
|
* fast-cloning again.
|
|
|
|
*/
|
|
|
|
skb->fclone = SKB_FCLONE_UNAVAILABLE;
|
|
|
|
|
|
|
|
if (atomic_dec_and_test(fclone_ref))
|
|
|
|
kmem_cache_free(skbuff_fclone_cache, other);
|
|
|
|
break;
|
2007-04-21 08:09:22 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2008-10-01 17:33:12 +08:00
|
|
|
static void skb_release_head_state(struct sk_buff *skb)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2009-06-02 13:19:30 +08:00
|
|
|
skb_dst_drop(skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifdef CONFIG_XFRM
|
|
|
|
secpath_put(skb->sp);
|
|
|
|
#endif
|
2005-04-20 13:39:42 +08:00
|
|
|
if (skb->destructor) {
|
|
|
|
WARN_ON(in_irq());
|
2005-04-17 06:20:36 +08:00
|
|
|
skb->destructor(skb);
|
|
|
|
}
|
2011-12-12 10:58:22 +08:00
|
|
|
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
|
2007-03-24 02:17:07 +08:00
|
|
|
nf_conntrack_put(skb->nfct);
|
2011-01-13 03:25:08 +08:00
|
|
|
#endif
|
|
|
|
#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
|
[NETFILTER]: Add nf_conntrack subsystem.
The existing connection tracking subsystem in netfilter can only
handle ipv4. There were basically two choices present to add
connection tracking support for ipv6. We could either duplicate all
of the ipv4 connection tracking code into an ipv6 counterpart, or (the
choice taken by these patches) we could design a generic layer that
could handle both ipv4 and ipv6 and thus requiring only one sub-protocol
(TCP, UDP, etc.) connection tracking helper module to be written.
In fact nf_conntrack is capable of working with any layer 3
protocol.
The existing ipv4 specific conntrack code could also not deal
with the pecularities of doing connection tracking on ipv6,
which is also cured here. For example, these issues include:
1) ICMPv6 handling, which is used for neighbour discovery in
ipv6 thus some messages such as these should not participate
in connection tracking since effectively they are like ARP
messages
2) fragmentation must be handled differently in ipv6, because
the simplistic "defrag, connection track and NAT, refrag"
(which the existing ipv4 connection tracking does) approach simply
isn't feasible in ipv6
3) ipv6 extension header parsing must occur at the correct spots
before and after connection tracking decisions, and there were
no provisions for this in the existing connection tracking
design
4) ipv6 has no need for stateful NAT
The ipv4 specific conntrack layer is kept around, until all of
the ipv4 specific conntrack helpers are ported over to nf_conntrack
and it is feature complete. Once that occurs, the old conntrack
stuff will get placed into the feature-removal-schedule and we will
fully kill it off 6 months later.
Signed-off-by: Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
2005-11-10 08:38:16 +08:00
|
|
|
nf_conntrack_put_reasm(skb->nfct_reasm);
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifdef CONFIG_BRIDGE_NETFILTER
|
|
|
|
nf_bridge_put(skb->nf_bridge);
|
|
|
|
#endif
|
|
|
|
/* XXX: IS this still necessary? - JHS */
|
|
|
|
#ifdef CONFIG_NET_SCHED
|
|
|
|
skb->tc_index = 0;
|
|
|
|
#ifdef CONFIG_NET_CLS_ACT
|
|
|
|
skb->tc_verd = 0;
|
|
|
|
#endif
|
|
|
|
#endif
|
2008-10-01 17:33:12 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Tear down everything attached to @skb; only the sk_buff shell survives. */
static void skb_release_all(struct sk_buff *skb)
{
	/* header state first, then the data area */
	skb_release_head_state(skb);
	skb_release_data(skb);
}
|
|
|
|
|
|
|
|
/**
 *	__kfree_skb - private function
 *	@skb: buffer
 *
 *	Free an sk_buff unconditionally: everything attached to the buffer
 *	is released and its state cleaned, then the structure itself is
 *	returned to its cache. This is an internal helper function; users
 *	should always call kfree_skb() instead.
 */
void __kfree_skb(struct sk_buff *skb)
{
	/* attached state and data go first ... */
	skb_release_all(skb);
	/* ... then the sk_buff memory itself */
	kfree_skbmem(skb);
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(__kfree_skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-03-21 13:28:35 +08:00
|
|
|
/**
|
|
|
|
* kfree_skb - free an sk_buff
|
|
|
|
* @skb: buffer to free
|
|
|
|
*
|
|
|
|
* Drop a reference to the buffer and free it if the usage count has
|
|
|
|
* hit zero.
|
|
|
|
*/
|
|
|
|
void kfree_skb(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
if (unlikely(!skb))
|
|
|
|
return;
|
|
|
|
if (likely(atomic_read(&skb->users) == 1))
|
|
|
|
smp_rmb();
|
|
|
|
else if (likely(!atomic_dec_and_test(&skb->users)))
|
|
|
|
return;
|
2009-03-11 17:49:55 +08:00
|
|
|
trace_kfree_skb(skb, __builtin_return_address(0));
|
2006-03-21 13:28:35 +08:00
|
|
|
__kfree_skb(skb);
|
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(kfree_skb);
|
2006-03-21 13:28:35 +08:00
|
|
|
|
2012-11-01 17:16:28 +08:00
|
|
|
/**
|
|
|
|
* skb_tx_error - report an sk_buff xmit error
|
|
|
|
* @skb: buffer that triggered an error
|
|
|
|
*
|
|
|
|
* Report xmit error if a device callback is tracking this skb.
|
|
|
|
* skb must be freed afterwards.
|
|
|
|
*/
|
|
|
|
void skb_tx_error(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
|
|
|
|
struct ubuf_info *uarg;
|
|
|
|
|
|
|
|
uarg = skb_shinfo(skb)->destructor_arg;
|
|
|
|
if (uarg->callback)
|
|
|
|
uarg->callback(uarg, false);
|
|
|
|
skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(skb_tx_error);
|
|
|
|
|
2009-03-11 17:49:55 +08:00
|
|
|
/**
|
|
|
|
* consume_skb - free an skbuff
|
|
|
|
* @skb: buffer to free
|
|
|
|
*
|
|
|
|
* Drop a ref to the buffer and free it if the usage count has hit zero
|
|
|
|
* Functions identically to kfree_skb, but kfree_skb assumes that the frame
|
|
|
|
* is being dropped after a failure and notes that
|
|
|
|
*/
|
|
|
|
void consume_skb(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
if (unlikely(!skb))
|
|
|
|
return;
|
|
|
|
if (likely(atomic_read(&skb->users) == 1))
|
|
|
|
smp_rmb();
|
|
|
|
else if (likely(!atomic_dec_and_test(&skb->users)))
|
|
|
|
return;
|
2010-08-23 17:46:12 +08:00
|
|
|
trace_consume_skb(skb);
|
2009-03-11 17:49:55 +08:00
|
|
|
__kfree_skb(skb);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(consume_skb);
|
|
|
|
|
2007-10-14 15:37:30 +08:00
|
|
|
static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
|
|
|
|
{
|
|
|
|
new->tstamp = old->tstamp;
|
|
|
|
new->dev = old->dev;
|
|
|
|
new->transport_header = old->transport_header;
|
|
|
|
new->network_header = old->network_header;
|
|
|
|
new->mac_header = old->mac_header;
|
2012-12-07 22:14:14 +08:00
|
|
|
new->inner_transport_header = old->inner_transport_header;
|
2013-02-01 23:18:49 +08:00
|
|
|
new->inner_network_header = old->inner_network_header;
|
2010-05-12 07:19:48 +08:00
|
|
|
skb_dst_copy(new, old);
|
2010-03-16 16:03:29 +08:00
|
|
|
new->rxhash = old->rxhash;
|
2011-08-19 12:44:18 +08:00
|
|
|
new->ooo_okay = old->ooo_okay;
|
2011-08-15 03:45:55 +08:00
|
|
|
new->l4_rxhash = old->l4_rxhash;
|
2012-02-11 23:39:30 +08:00
|
|
|
new->no_fcs = old->no_fcs;
|
2012-12-07 22:14:14 +08:00
|
|
|
new->encapsulation = old->encapsulation;
|
2008-10-29 04:24:06 +08:00
|
|
|
#ifdef CONFIG_XFRM
|
2007-10-14 15:37:30 +08:00
|
|
|
new->sp = secpath_get(old->sp);
|
|
|
|
#endif
|
|
|
|
memcpy(new->cb, old->cb, sizeof(old->cb));
|
2009-05-23 06:20:02 +08:00
|
|
|
new->csum = old->csum;
|
2007-10-14 15:37:30 +08:00
|
|
|
new->local_df = old->local_df;
|
|
|
|
new->pkt_type = old->pkt_type;
|
|
|
|
new->ip_summed = old->ip_summed;
|
|
|
|
skb_copy_queue_mapping(new, old);
|
|
|
|
new->priority = old->priority;
|
2011-12-12 10:58:22 +08:00
|
|
|
#if IS_ENABLED(CONFIG_IP_VS)
|
2007-10-14 15:37:30 +08:00
|
|
|
new->ipvs_property = old->ipvs_property;
|
|
|
|
#endif
|
2012-08-01 07:44:19 +08:00
|
|
|
new->pfmemalloc = old->pfmemalloc;
|
2007-10-14 15:37:30 +08:00
|
|
|
new->protocol = old->protocol;
|
|
|
|
new->mark = old->mark;
|
2009-11-21 07:35:04 +08:00
|
|
|
new->skb_iif = old->skb_iif;
|
2007-10-14 15:37:30 +08:00
|
|
|
__nf_copy(new, old);
|
2011-12-12 10:58:22 +08:00
|
|
|
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
|
2007-10-14 15:37:30 +08:00
|
|
|
new->nf_trace = old->nf_trace;
|
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_NET_SCHED
|
|
|
|
new->tc_index = old->tc_index;
|
|
|
|
#ifdef CONFIG_NET_CLS_ACT
|
|
|
|
new->tc_verd = old->tc_verd;
|
|
|
|
#endif
|
|
|
|
#endif
|
2008-07-15 13:49:06 +08:00
|
|
|
new->vlan_tci = old->vlan_tci;
|
|
|
|
|
2007-10-14 15:37:30 +08:00
|
|
|
skb_copy_secmark(new, old);
|
|
|
|
}
|
|
|
|
|
2009-05-23 06:11:37 +08:00
|
|
|
/*
|
|
|
|
* You should not add any new code to this function. Add it to
|
|
|
|
* __copy_skb_header above instead.
|
|
|
|
*/
|
2007-10-14 15:37:52 +08:00
|
|
|
static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
#define C(x) n->x = skb->x
|
|
|
|
|
|
|
|
n->next = n->prev = NULL;
|
|
|
|
n->sk = NULL;
|
2007-10-14 15:37:30 +08:00
|
|
|
__copy_skb_header(n, skb);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
C(len);
|
|
|
|
C(data_len);
|
2007-03-17 06:00:46 +08:00
|
|
|
C(mac_len);
|
[SKBUFF]: Keep track of writable header len of headerless clones
Currently NAT (and others) that want to modify cloned skbs copy them,
even if in the vast majority of cases its not necessary because the
skb is a clone made by TCP and the portion NAT wants to modify is
actually writable because TCP release the header reference before
cloning.
The problem is that there is no clean way for NAT to find out how
long the writable header area is, so this patch introduces skb->hdr_len
to hold this length. When a headerless skb is cloned skb->hdr_len
is set to the current headroom, for regular clones it is copied from
the original. A new function skb_clone_writable(skb, len) returns
whether the skb is writable up to len bytes from skb->data. To avoid
enlarging the skb the mac_len field is reduced to 16 bit and the
new hdr_len field is put in the remaining 16 bit.
I've done a few rough benchmarks of NAT (not with this exact patch,
but a very similar one). As expected it saves huge amounts of system
time in case of sendfile, bringing it down to basically the same
amount as without NAT, with sendmsg it only helps on loopback,
probably because of the large MTU.
Transmit a 1GB file using sendfile/sendmsg over eth0/lo with and
without NAT:
- sendfile eth0, no NAT: sys 0m0.388s
- sendfile eth0, NAT: sys 0m1.835s
- sendfile eth0: NAT + path: sys 0m0.370s (~ -80%)
- sendfile lo, no NAT: sys 0m0.258s
- sendfile lo, NAT: sys 0m2.609s
- sendfile lo, NAT + patch: sys 0m0.260s (~ -90%)
- sendmsg eth0, no NAT: sys 0m2.508s
- sendmsg eth0, NAT: sys 0m2.539s
- sendmsg eth0, NAT + patch: sys 0m2.445s (no change)
- sendmsg lo, no NAT: sys 0m2.151s
- sendmsg lo, NAT: sys 0m3.557s
- sendmsg lo, NAT + patch: sys 0m2.159s (~ -40%)
I expect other users can see a similar performance improvement,
packet mangling iptables targets, ipip and ip_gre come to mind ..
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2007-06-25 19:35:20 +08:00
|
|
|
n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
|
2008-01-08 13:56:41 +08:00
|
|
|
n->cloned = 1;
|
2005-04-17 06:20:36 +08:00
|
|
|
n->nohdr = 0;
|
|
|
|
n->destructor = NULL;
|
|
|
|
C(tail);
|
|
|
|
C(end);
|
2008-01-08 13:56:41 +08:00
|
|
|
C(head);
|
net: allow skb->head to be a page fragment
skb->head is currently allocated from kmalloc(). This is convenient but
has the drawback the data cannot be converted to a page fragment if
needed.
We have three spots were it hurts :
1) GRO aggregation
When a linear skb must be appended to another skb, GRO uses the
frag_list fallback, very inefficient since we keep all struct sk_buff
around. So drivers enabling GRO but delivering linear skbs to network
stack aren't enabling full GRO power.
2) splice(socket -> pipe).
We must copy the linear part to a page fragment.
This kind of defeats splice() purpose (zero copy claim)
3) TCP coalescing.
Recently introduced, this permits to group several contiguous segments
into a single skb. This shortens queue lengths and save kernel memory,
and greatly reduce probabilities of TCP collapses. This coalescing
doesnt work on linear skbs (or we would need to copy data, this would be
too slow)
Given all these issues, the following patch introduces the possibility
of having skb->head be a fragment in itself. We use a new skb flag,
skb->head_frag to carry this information.
build_skb() is changed to accept a frag_size argument. Drivers willing
to provide a page fragment instead of kmalloc() data will set a non zero
value, set to the fragment size.
Then, on situations we need to convert the skb head to a frag in itself,
we can check if skb->head_frag is set and avoid the copies or various
fallbacks we have.
This means drivers currently using frags could be updated to avoid the
current skb->head allocation and reduce their memory footprint (aka skb
truesize). (thats 512 or 1024 bytes saved per skb). This also makes
bpf/netfilter faster since the 'first frag' will be part of skb linear
part, no need to copy data.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Maciej Żenczykowski <maze@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Matt Carlson <mcarlson@broadcom.com>
Cc: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-04-27 08:33:38 +08:00
|
|
|
C(head_frag);
|
2008-01-08 13:56:41 +08:00
|
|
|
C(data);
|
|
|
|
C(truesize);
|
|
|
|
atomic_set(&n->users, 1);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
atomic_inc(&(skb_shinfo(skb)->dataref));
|
|
|
|
skb->cloned = 1;
|
|
|
|
|
|
|
|
return n;
|
2007-10-14 15:37:52 +08:00
|
|
|
#undef C
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
 *	skb_morph - morph one skb into another
 *	@dst: the skb to receive the contents
 *	@src: the skb to supply the contents
 *
 *	Works like skb_clone() except that the caller supplies the target
 *	skb. Whatever @dst currently holds is released first, then @dst is
 *	made a clone of @src.
 *
 *	The target skb is returned upon exit.
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
	struct sk_buff *morphed;

	/* Drop what the target currently carries before reusing it. */
	skb_release_all(dst);

	morphed = __skb_clone(dst, src);
	return morphed;
}
EXPORT_SYMBOL_GPL(skb_morph);
|
|
|
|
|
2012-07-10 18:55:09 +08:00
|
|
|
/**
|
|
|
|
* skb_copy_ubufs - copy userspace skb frags buffers to kernel
|
2011-08-31 16:03:29 +08:00
|
|
|
* @skb: the skb to modify
|
|
|
|
* @gfp_mask: allocation priority
|
|
|
|
*
|
|
|
|
* This must be called on SKBTX_DEV_ZEROCOPY skb.
|
|
|
|
* It will copy all frags into kernel and drop the reference
|
|
|
|
* to userspace pages.
|
|
|
|
*
|
|
|
|
* If this function is called from an interrupt gfp_mask() must be
|
|
|
|
* %GFP_ATOMIC.
|
|
|
|
*
|
|
|
|
* Returns 0 on success or a negative error code on failure
|
|
|
|
* to allocate kernel memory to copy to.
|
|
|
|
*/
|
|
|
|
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
|
2011-07-06 20:22:12 +08:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
int num_frags = skb_shinfo(skb)->nr_frags;
|
|
|
|
struct page *page, *head = NULL;
|
|
|
|
struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;
|
|
|
|
|
|
|
|
for (i = 0; i < num_frags; i++) {
|
|
|
|
u8 *vaddr;
|
|
|
|
skb_frag_t *f = &skb_shinfo(skb)->frags[i];
|
|
|
|
|
2012-07-17 10:05:29 +08:00
|
|
|
page = alloc_page(gfp_mask);
|
2011-07-06 20:22:12 +08:00
|
|
|
if (!page) {
|
|
|
|
while (head) {
|
|
|
|
struct page *next = (struct page *)head->private;
|
|
|
|
put_page(head);
|
|
|
|
head = next;
|
|
|
|
}
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
2012-04-05 17:35:15 +08:00
|
|
|
vaddr = kmap_atomic(skb_frag_page(f));
|
2011-07-06 20:22:12 +08:00
|
|
|
memcpy(page_address(page),
|
2011-10-19 05:00:24 +08:00
|
|
|
vaddr + f->page_offset, skb_frag_size(f));
|
2012-04-05 17:35:15 +08:00
|
|
|
kunmap_atomic(vaddr);
|
2011-07-06 20:22:12 +08:00
|
|
|
page->private = (unsigned long)head;
|
|
|
|
head = page;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* skb frags release userspace buffers */
|
2012-07-17 10:05:29 +08:00
|
|
|
for (i = 0; i < num_frags; i++)
|
2011-10-20 07:01:49 +08:00
|
|
|
skb_frag_unref(skb, i);
|
2011-07-06 20:22:12 +08:00
|
|
|
|
2012-11-01 17:16:22 +08:00
|
|
|
uarg->callback(uarg, false);
|
2011-07-06 20:22:12 +08:00
|
|
|
|
|
|
|
/* skb frags point to kernel buffers */
|
2012-07-17 10:05:29 +08:00
|
|
|
for (i = num_frags - 1; i >= 0; i--) {
|
|
|
|
__skb_fill_page_desc(skb, i, head, 0,
|
|
|
|
skb_shinfo(skb)->frags[i].size);
|
2011-07-06 20:22:12 +08:00
|
|
|
head = (struct page *)head->private;
|
|
|
|
}
|
2011-08-31 16:03:29 +08:00
|
|
|
|
|
|
|
skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
|
2011-07-06 20:22:12 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2012-07-20 17:23:20 +08:00
|
|
|
EXPORT_SYMBOL_GPL(skb_copy_ubufs);
|
2011-07-06 20:22:12 +08:00
|
|
|
|
2007-10-14 15:37:52 +08:00
|
|
|
/**
|
|
|
|
* skb_clone - duplicate an sk_buff
|
|
|
|
* @skb: buffer to clone
|
|
|
|
* @gfp_mask: allocation priority
|
|
|
|
*
|
|
|
|
* Duplicate an &sk_buff. The new one is not owned by a socket. Both
|
|
|
|
* copies share the same packet data but not structure. The new
|
|
|
|
* buffer has a reference count of 1. If the allocation fails the
|
|
|
|
* function returns %NULL otherwise the new buffer is returned.
|
|
|
|
*
|
|
|
|
* If this function is called from an interrupt gfp_mask() must be
|
|
|
|
* %GFP_ATOMIC.
|
|
|
|
*/
|
|
|
|
|
|
|
|
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
|
|
|
|
{
|
|
|
|
struct sk_buff *n;
|
|
|
|
|
2012-07-20 17:23:10 +08:00
|
|
|
if (skb_orphan_frags(skb, gfp_mask))
|
|
|
|
return NULL;
|
2011-07-06 20:22:12 +08:00
|
|
|
|
2007-10-14 15:37:52 +08:00
|
|
|
n = skb + 1;
|
|
|
|
if (skb->fclone == SKB_FCLONE_ORIG &&
|
|
|
|
n->fclone == SKB_FCLONE_UNAVAILABLE) {
|
|
|
|
atomic_t *fclone_ref = (atomic_t *) (n + 1);
|
|
|
|
n->fclone = SKB_FCLONE_CLONE;
|
|
|
|
atomic_inc(fclone_ref);
|
|
|
|
} else {
|
2012-08-01 07:44:19 +08:00
|
|
|
if (skb_pfmemalloc(skb))
|
|
|
|
gfp_mask |= __GFP_MEMALLOC;
|
|
|
|
|
2007-10-14 15:37:52 +08:00
|
|
|
n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
|
|
|
|
if (!n)
|
|
|
|
return NULL;
|
2008-08-30 18:16:35 +08:00
|
|
|
|
|
|
|
kmemcheck_annotate_bitfield(n, flags1);
|
|
|
|
kmemcheck_annotate_bitfield(n, flags2);
|
2007-10-14 15:37:52 +08:00
|
|
|
n->fclone = SKB_FCLONE_UNAVAILABLE;
|
|
|
|
}
|
|
|
|
|
|
|
|
return __skb_clone(n, skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(skb_clone);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
|
|
|
|
{
|
[SK_BUFF]: Use offsets for skb->{mac,network,transport}_header on 64bit architectures
With this we save 8 bytes per network packet, leaving a 4 bytes hole to be used
in further shrinking work, likely with the offsetization of other pointers,
such as ->{data,tail,end}, at the cost of adds, that were minimized by the
usual practice of setting skb->{mac,nh,n}.raw to a local variable that is then
accessed multiple times in each function, it also is not more expensive than
before with regards to most of the handling of such headers, like setting one
of these headers to another (transport to network, etc), or subtracting, adding
to/from it, comparing them, etc.
Now we have this layout for sk_buff on a x86_64 machine:
[acme@mica net-2.6.22]$ pahole vmlinux sk_buff
struct sk_buff {
struct sk_buff * next; /* 0 8 */
struct sk_buff * prev; /* 8 8 */
struct rb_node rb; /* 16 24 */
struct sock * sk; /* 40 8 */
ktime_t tstamp; /* 48 8 */
struct net_device * dev; /* 56 8 */
/* --- cacheline 1 boundary (64 bytes) --- */
struct net_device * input_dev; /* 64 8 */
sk_buff_data_t transport_header; /* 72 4 */
sk_buff_data_t network_header; /* 76 4 */
sk_buff_data_t mac_header; /* 80 4 */
/* XXX 4 bytes hole, try to pack */
struct dst_entry * dst; /* 88 8 */
struct sec_path * sp; /* 96 8 */
char cb[48]; /* 104 48 */
/* cacheline 2 boundary (128 bytes) was 24 bytes ago*/
unsigned int len; /* 152 4 */
unsigned int data_len; /* 156 4 */
unsigned int mac_len; /* 160 4 */
union {
__wsum csum; /* 4 */
__u32 csum_offset; /* 4 */
}; /* 164 4 */
__u32 priority; /* 168 4 */
__u8 local_df:1; /* 172 1 */
__u8 cloned:1; /* 172 1 */
__u8 ip_summed:2; /* 172 1 */
__u8 nohdr:1; /* 172 1 */
__u8 nfctinfo:3; /* 172 1 */
__u8 pkt_type:3; /* 173 1 */
__u8 fclone:2; /* 173 1 */
__u8 ipvs_property:1; /* 173 1 */
/* XXX 2 bits hole, try to pack */
__be16 protocol; /* 174 2 */
void (*destructor)(struct sk_buff *); /* 176 8 */
struct nf_conntrack * nfct; /* 184 8 */
/* --- cacheline 3 boundary (192 bytes) --- */
struct sk_buff * nfct_reasm; /* 192 8 */
struct nf_bridge_info *nf_bridge; /* 200 8 */
__u16 tc_index; /* 208 2 */
__u16 tc_verd; /* 210 2 */
dma_cookie_t dma_cookie; /* 212 4 */
__u32 secmark; /* 216 4 */
__u32 mark; /* 220 4 */
unsigned int truesize; /* 224 4 */
atomic_t users; /* 228 4 */
unsigned char * head; /* 232 8 */
unsigned char * data; /* 240 8 */
unsigned char * tail; /* 248 8 */
/* --- cacheline 4 boundary (256 bytes) --- */
unsigned char * end; /* 256 8 */
}; /* size: 264, cachelines: 5 */
/* sum members: 260, holes: 1, sum holes: 4 */
/* bit holes: 1, sum bit holes: 2 bits */
/* last cacheline: 8 bytes */
On 32 bits nothing changes, and pointers continue to be used with the compiler
turning all this abstraction layer into dust. But there are some sk_buff
validation tricks that are now possible, humm... :-)
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2007-04-11 12:22:35 +08:00
|
|
|
#ifndef NET_SKBUFF_DATA_USES_OFFSET
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Shift between the two data areas in bytes
|
|
|
|
*/
|
|
|
|
unsigned long offset = new->data - old->data;
|
[SK_BUFF]: Use offsets for skb->{mac,network,transport}_header on 64bit architectures
With this we save 8 bytes per network packet, leaving a 4 bytes hole to be used
in further shrinking work, likely with the offsetization of other pointers,
such as ->{data,tail,end}, at the cost of adds, that were minimized by the
usual practice of setting skb->{mac,nh,n}.raw to a local variable that is then
accessed multiple times in each function, it also is not more expensive than
before with regards to most of the handling of such headers, like setting one
of these headers to another (transport to network, etc), or subtracting, adding
to/from it, comparing them, etc.
Now we have this layout for sk_buff on a x86_64 machine:
[acme@mica net-2.6.22]$ pahole vmlinux sk_buff
struct sk_buff {
struct sk_buff * next; /* 0 8 */
struct sk_buff * prev; /* 8 8 */
struct rb_node rb; /* 16 24 */
struct sock * sk; /* 40 8 */
ktime_t tstamp; /* 48 8 */
struct net_device * dev; /* 56 8 */
/* --- cacheline 1 boundary (64 bytes) --- */
struct net_device * input_dev; /* 64 8 */
sk_buff_data_t transport_header; /* 72 4 */
sk_buff_data_t network_header; /* 76 4 */
sk_buff_data_t mac_header; /* 80 4 */
/* XXX 4 bytes hole, try to pack */
struct dst_entry * dst; /* 88 8 */
struct sec_path * sp; /* 96 8 */
char cb[48]; /* 104 48 */
/* cacheline 2 boundary (128 bytes) was 24 bytes ago*/
unsigned int len; /* 152 4 */
unsigned int data_len; /* 156 4 */
unsigned int mac_len; /* 160 4 */
union {
__wsum csum; /* 4 */
__u32 csum_offset; /* 4 */
}; /* 164 4 */
__u32 priority; /* 168 4 */
__u8 local_df:1; /* 172 1 */
__u8 cloned:1; /* 172 1 */
__u8 ip_summed:2; /* 172 1 */
__u8 nohdr:1; /* 172 1 */
__u8 nfctinfo:3; /* 172 1 */
__u8 pkt_type:3; /* 173 1 */
__u8 fclone:2; /* 173 1 */
__u8 ipvs_property:1; /* 173 1 */
/* XXX 2 bits hole, try to pack */
__be16 protocol; /* 174 2 */
void (*destructor)(struct sk_buff *); /* 176 8 */
struct nf_conntrack * nfct; /* 184 8 */
/* --- cacheline 3 boundary (192 bytes) --- */
struct sk_buff * nfct_reasm; /* 192 8 */
struct nf_bridge_info *nf_bridge; /* 200 8 */
__u16 tc_index; /* 208 2 */
__u16 tc_verd; /* 210 2 */
dma_cookie_t dma_cookie; /* 212 4 */
__u32 secmark; /* 216 4 */
__u32 mark; /* 220 4 */
unsigned int truesize; /* 224 4 */
atomic_t users; /* 228 4 */
unsigned char * head; /* 232 8 */
unsigned char * data; /* 240 8 */
unsigned char * tail; /* 248 8 */
/* --- cacheline 4 boundary (256 bytes) --- */
unsigned char * end; /* 256 8 */
}; /* size: 264, cachelines: 5 */
/* sum members: 260, holes: 1, sum holes: 4 */
/* bit holes: 1, sum bit holes: 2 bits */
/* last cacheline: 8 bytes */
On 32 bits nothing changes, and pointers continue to be used with the compiler
turning all this abstraction layer into dust. But there are some sk_buff
validation tricks that are now possible, humm... :-)
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2007-04-11 12:22:35 +08:00
|
|
|
#endif
|
2007-10-14 15:37:30 +08:00
|
|
|
|
|
|
|
__copy_skb_header(new, old);
|
|
|
|
|
[SK_BUFF]: Use offsets for skb->{mac,network,transport}_header on 64bit architectures
With this we save 8 bytes per network packet, leaving a 4 bytes hole to be used
in further shrinking work, likely with the offsetization of other pointers,
such as ->{data,tail,end}, at the cost of adds, that were minimized by the
usual practice of setting skb->{mac,nh,n}.raw to a local variable that is then
accessed multiple times in each function, it also is not more expensive than
before with regards to most of the handling of such headers, like setting one
of these headers to another (transport to network, etc), or subtracting, adding
to/from it, comparing them, etc.
Now we have this layout for sk_buff on a x86_64 machine:
[acme@mica net-2.6.22]$ pahole vmlinux sk_buff
struct sk_buff {
struct sk_buff * next; /* 0 8 */
struct sk_buff * prev; /* 8 8 */
struct rb_node rb; /* 16 24 */
struct sock * sk; /* 40 8 */
ktime_t tstamp; /* 48 8 */
struct net_device * dev; /* 56 8 */
/* --- cacheline 1 boundary (64 bytes) --- */
struct net_device * input_dev; /* 64 8 */
sk_buff_data_t transport_header; /* 72 4 */
sk_buff_data_t network_header; /* 76 4 */
sk_buff_data_t mac_header; /* 80 4 */
/* XXX 4 bytes hole, try to pack */
struct dst_entry * dst; /* 88 8 */
struct sec_path * sp; /* 96 8 */
char cb[48]; /* 104 48 */
/* cacheline 2 boundary (128 bytes) was 24 bytes ago*/
unsigned int len; /* 152 4 */
unsigned int data_len; /* 156 4 */
unsigned int mac_len; /* 160 4 */
union {
__wsum csum; /* 4 */
__u32 csum_offset; /* 4 */
}; /* 164 4 */
__u32 priority; /* 168 4 */
__u8 local_df:1; /* 172 1 */
__u8 cloned:1; /* 172 1 */
__u8 ip_summed:2; /* 172 1 */
__u8 nohdr:1; /* 172 1 */
__u8 nfctinfo:3; /* 172 1 */
__u8 pkt_type:3; /* 173 1 */
__u8 fclone:2; /* 173 1 */
__u8 ipvs_property:1; /* 173 1 */
/* XXX 2 bits hole, try to pack */
__be16 protocol; /* 174 2 */
void (*destructor)(struct sk_buff *); /* 176 8 */
struct nf_conntrack * nfct; /* 184 8 */
/* --- cacheline 3 boundary (192 bytes) --- */
struct sk_buff * nfct_reasm; /* 192 8 */
struct nf_bridge_info *nf_bridge; /* 200 8 */
__u16 tc_index; /* 208 2 */
__u16 tc_verd; /* 210 2 */
dma_cookie_t dma_cookie; /* 212 4 */
__u32 secmark; /* 216 4 */
__u32 mark; /* 220 4 */
unsigned int truesize; /* 224 4 */
atomic_t users; /* 228 4 */
unsigned char * head; /* 232 8 */
unsigned char * data; /* 240 8 */
unsigned char * tail; /* 248 8 */
/* --- cacheline 4 boundary (256 bytes) --- */
unsigned char * end; /* 256 8 */
}; /* size: 264, cachelines: 5 */
/* sum members: 260, holes: 1, sum holes: 4 */
/* bit holes: 1, sum bit holes: 2 bits */
/* last cacheline: 8 bytes */
On 32 bits nothing changes, and pointers continue to be used with the compiler
turning all this abstraction layer into dust. But there are some sk_buff
validation tricks that are now possible, humm... :-)
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2007-04-11 12:22:35 +08:00
|
|
|
#ifndef NET_SKBUFF_DATA_USES_OFFSET
|
|
|
|
/* {transport,network,mac}_header are relative to skb->head */
|
|
|
|
new->transport_header += offset;
|
|
|
|
new->network_header += offset;
|
2009-06-17 20:17:34 +08:00
|
|
|
if (skb_mac_header_was_set(new))
|
|
|
|
new->mac_header += offset;
|
2012-12-07 22:14:14 +08:00
|
|
|
new->inner_transport_header += offset;
|
|
|
|
new->inner_network_header += offset;
|
[SK_BUFF]: Use offsets for skb->{mac,network,transport}_header on 64bit architectures
With this we save 8 bytes per network packet, leaving a 4 bytes hole to be used
in further shrinking work, likely with the offsetization of other pointers,
such as ->{data,tail,end}, at the cost of adds, that were minimized by the
usual practice of setting skb->{mac,nh,n}.raw to a local variable that is then
accessed multiple times in each function, it also is not more expensive than
before with regards to most of the handling of such headers, like setting one
of these headers to another (transport to network, etc), or subtracting, adding
to/from it, comparing them, etc.
Now we have this layout for sk_buff on a x86_64 machine:
[acme@mica net-2.6.22]$ pahole vmlinux sk_buff
struct sk_buff {
struct sk_buff * next; /* 0 8 */
struct sk_buff * prev; /* 8 8 */
struct rb_node rb; /* 16 24 */
struct sock * sk; /* 40 8 */
ktime_t tstamp; /* 48 8 */
struct net_device * dev; /* 56 8 */
/* --- cacheline 1 boundary (64 bytes) --- */
struct net_device * input_dev; /* 64 8 */
sk_buff_data_t transport_header; /* 72 4 */
sk_buff_data_t network_header; /* 76 4 */
sk_buff_data_t mac_header; /* 80 4 */
/* XXX 4 bytes hole, try to pack */
struct dst_entry * dst; /* 88 8 */
struct sec_path * sp; /* 96 8 */
char cb[48]; /* 104 48 */
/* cacheline 2 boundary (128 bytes) was 24 bytes ago*/
unsigned int len; /* 152 4 */
unsigned int data_len; /* 156 4 */
unsigned int mac_len; /* 160 4 */
union {
__wsum csum; /* 4 */
__u32 csum_offset; /* 4 */
}; /* 164 4 */
__u32 priority; /* 168 4 */
__u8 local_df:1; /* 172 1 */
__u8 cloned:1; /* 172 1 */
__u8 ip_summed:2; /* 172 1 */
__u8 nohdr:1; /* 172 1 */
__u8 nfctinfo:3; /* 172 1 */
__u8 pkt_type:3; /* 173 1 */
__u8 fclone:2; /* 173 1 */
__u8 ipvs_property:1; /* 173 1 */
/* XXX 2 bits hole, try to pack */
__be16 protocol; /* 174 2 */
void (*destructor)(struct sk_buff *); /* 176 8 */
struct nf_conntrack * nfct; /* 184 8 */
/* --- cacheline 3 boundary (192 bytes) --- */
struct sk_buff * nfct_reasm; /* 192 8 */
struct nf_bridge_info *nf_bridge; /* 200 8 */
__u16 tc_index; /* 208 2 */
__u16 tc_verd; /* 210 2 */
dma_cookie_t dma_cookie; /* 212 4 */
__u32 secmark; /* 216 4 */
__u32 mark; /* 220 4 */
unsigned int truesize; /* 224 4 */
atomic_t users; /* 228 4 */
unsigned char * head; /* 232 8 */
unsigned char * data; /* 240 8 */
unsigned char * tail; /* 248 8 */
/* --- cacheline 4 boundary (256 bytes) --- */
unsigned char * end; /* 256 8 */
}; /* size: 264, cachelines: 5 */
/* sum members: 260, holes: 1, sum holes: 4 */
/* bit holes: 1, sum bit holes: 2 bits */
/* last cacheline: 8 bytes */
On 32 bits nothing changes, and pointers continue to be used with the compiler
turning all this abstraction layer into dust. But there are some sk_buff
validation tricks that are now possible, humm... :-)
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2007-04-11 12:22:35 +08:00
|
|
|
#endif
|
2006-06-22 17:40:14 +08:00
|
|
|
skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
|
|
|
|
skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
|
|
|
|
skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2012-08-01 07:44:19 +08:00
|
|
|
static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
if (skb_pfmemalloc(skb))
|
|
|
|
return SKB_ALLOC_RX;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/**
|
|
|
|
* skb_copy - create private copy of an sk_buff
|
|
|
|
* @skb: buffer to copy
|
|
|
|
* @gfp_mask: allocation priority
|
|
|
|
*
|
|
|
|
* Make a copy of both an &sk_buff and its data. This is used when the
|
|
|
|
* caller wishes to modify the data and needs a private copy of the
|
|
|
|
* data to alter. Returns %NULL on failure or the pointer to the buffer
|
|
|
|
* on success. The returned buffer has a reference count of 1.
|
|
|
|
*
|
|
|
|
* As by-product this function converts non-linear &sk_buff to linear
|
|
|
|
* one, so that &sk_buff becomes completely private and caller is allowed
|
|
|
|
* to modify all the data of returned buffer. This means that this
|
|
|
|
* function is not recommended for use in circumstances when only
|
|
|
|
* header is going to be modified. Use pskb_copy() instead.
|
|
|
|
*/
|
|
|
|
|
2005-10-07 14:46:04 +08:00
|
|
|
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2010-09-01 13:25:10 +08:00
|
|
|
int headerlen = skb_headroom(skb);
|
2012-05-04 22:26:56 +08:00
|
|
|
unsigned int size = skb_end_offset(skb) + skb->data_len;
|
2012-08-01 07:44:19 +08:00
|
|
|
struct sk_buff *n = __alloc_skb(size, gfp_mask,
|
|
|
|
skb_alloc_rx_flag(skb), NUMA_NO_NODE);
|
2010-09-01 13:25:10 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!n)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
/* Set the data pointer */
|
|
|
|
skb_reserve(n, headerlen);
|
|
|
|
/* Set the tail pointer and length */
|
|
|
|
skb_put(n, skb->len);
|
|
|
|
|
|
|
|
if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len))
|
|
|
|
BUG();
|
|
|
|
|
|
|
|
copy_skb_header(n, skb);
|
|
|
|
return n;
|
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(skb_copy);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/**
|
2011-12-04 05:39:53 +08:00
|
|
|
* __pskb_copy - create copy of an sk_buff with private head.
|
2005-04-17 06:20:36 +08:00
|
|
|
* @skb: buffer to copy
|
2011-12-04 05:39:53 +08:00
|
|
|
* @headroom: headroom of new skb
|
2005-04-17 06:20:36 +08:00
|
|
|
* @gfp_mask: allocation priority
|
|
|
|
*
|
|
|
|
* Make a copy of both an &sk_buff and part of its data, located
|
|
|
|
* in header. Fragmented data remain shared. This is used when
|
|
|
|
* the caller wishes to modify only header of &sk_buff and needs
|
|
|
|
* private copy of the header to alter. Returns %NULL on failure
|
|
|
|
* or the pointer to the buffer on success.
|
|
|
|
* The returned buffer has a reference count of 1.
|
|
|
|
*/
|
|
|
|
|
2011-12-04 05:39:53 +08:00
|
|
|
struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2011-12-04 05:39:53 +08:00
|
|
|
unsigned int size = skb_headlen(skb) + headroom;
|
2012-08-01 07:44:19 +08:00
|
|
|
struct sk_buff *n = __alloc_skb(size, gfp_mask,
|
|
|
|
skb_alloc_rx_flag(skb), NUMA_NO_NODE);
|
2010-09-01 13:25:10 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!n)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/* Set the data pointer */
|
2011-12-04 05:39:53 +08:00
|
|
|
skb_reserve(n, headroom);
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Set the tail pointer and length */
|
|
|
|
skb_put(n, skb_headlen(skb));
|
|
|
|
/* Copy the bytes */
|
2007-03-28 05:55:52 +08:00
|
|
|
skb_copy_from_linear_data(skb, n->data, n->len);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-11-08 06:57:15 +08:00
|
|
|
n->truesize += skb->data_len;
|
2005-04-17 06:20:36 +08:00
|
|
|
n->data_len = skb->data_len;
|
|
|
|
n->len = skb->len;
|
|
|
|
|
|
|
|
if (skb_shinfo(skb)->nr_frags) {
|
|
|
|
int i;
|
|
|
|
|
2012-07-20 17:23:10 +08:00
|
|
|
if (skb_orphan_frags(skb, gfp_mask)) {
|
|
|
|
kfree_skb(n);
|
|
|
|
n = NULL;
|
|
|
|
goto out;
|
2011-07-06 20:22:12 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
|
|
|
|
skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
|
2011-08-23 07:44:58 +08:00
|
|
|
skb_frag_ref(skb, i);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
skb_shinfo(n)->nr_frags = i;
|
|
|
|
}
|
|
|
|
|
2010-08-23 15:13:46 +08:00
|
|
|
if (skb_has_frag_list(skb)) {
|
2005-04-17 06:20:36 +08:00
|
|
|
skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
|
|
|
|
skb_clone_fraglist(n);
|
|
|
|
}
|
|
|
|
|
|
|
|
copy_skb_header(n, skb);
|
|
|
|
out:
|
|
|
|
return n;
|
|
|
|
}
|
2011-12-04 05:39:53 +08:00
|
|
|
EXPORT_SYMBOL(__pskb_copy);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* pskb_expand_head - reallocate header of &sk_buff
|
|
|
|
* @skb: buffer to reallocate
|
|
|
|
* @nhead: room to add at head
|
|
|
|
* @ntail: room to add at tail
|
|
|
|
* @gfp_mask: allocation priority
|
|
|
|
*
|
|
|
|
* Expands (or creates identical copy, if &nhead and &ntail are zero)
|
|
|
|
* header of skb. &sk_buff itself is not changed. &sk_buff MUST have
|
|
|
|
* reference count of 1. Returns zero in the case of success or error,
|
|
|
|
* if expansion failed. In the last case, &sk_buff is not changed.
|
|
|
|
*
|
|
|
|
* All the pointers pointing into skb header may change and must be
|
|
|
|
* reloaded after call to this function.
|
|
|
|
*/
|
|
|
|
|
2005-07-09 05:57:47 +08:00
|
|
|
int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
|
2005-10-07 14:46:04 +08:00
|
|
|
gfp_t gfp_mask)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
u8 *data;
|
2012-05-04 22:26:56 +08:00
|
|
|
int size = nhead + skb_end_offset(skb) + ntail;
|
2005-04-17 06:20:36 +08:00
|
|
|
long off;
|
|
|
|
|
2008-10-01 22:09:38 +08:00
|
|
|
BUG_ON(nhead < 0);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
if (skb_shared(skb))
|
|
|
|
BUG();
|
|
|
|
|
|
|
|
size = SKB_DATA_ALIGN(size);
|
|
|
|
|
2012-08-01 07:44:19 +08:00
|
|
|
if (skb_pfmemalloc(skb))
|
|
|
|
gfp_mask |= __GFP_MEMALLOC;
|
|
|
|
data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
|
|
|
|
gfp_mask, NUMA_NO_NODE, NULL);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!data)
|
|
|
|
goto nodata;
|
2012-04-11 04:08:39 +08:00
|
|
|
size = SKB_WITH_OVERHEAD(ksize(data));
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Copy only real data... and, alas, header. This should be
|
2010-09-01 13:25:10 +08:00
|
|
|
* optimized for the cases when header is void.
|
|
|
|
*/
|
|
|
|
memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
|
|
|
|
|
|
|
|
memcpy((struct skb_shared_info *)(data + size),
|
|
|
|
skb_shinfo(skb),
|
2010-07-23 03:09:08 +08:00
|
|
|
offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2012-05-04 22:26:51 +08:00
|
|
|
/*
|
|
|
|
* if shinfo is shared we must drop the old head gracefully, but if it
|
|
|
|
* is not we can just drop the old head and let the existing refcount
|
|
|
|
* be since all we did is relocate the values
|
|
|
|
*/
|
|
|
|
if (skb_cloned(skb)) {
|
2011-07-06 20:22:12 +08:00
|
|
|
/* copy this zero copy skb frags */
|
2012-07-20 17:23:10 +08:00
|
|
|
if (skb_orphan_frags(skb, gfp_mask))
|
|
|
|
goto nofrags;
|
2010-09-03 07:09:32 +08:00
|
|
|
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
|
2011-08-23 07:44:58 +08:00
|
|
|
skb_frag_ref(skb, i);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-09-03 07:09:32 +08:00
|
|
|
if (skb_has_frag_list(skb))
|
|
|
|
skb_clone_fraglist(skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-09-03 07:09:32 +08:00
|
|
|
skb_release_data(skb);
|
2012-05-04 22:26:51 +08:00
|
|
|
} else {
|
|
|
|
skb_free_head(skb);
|
2010-09-03 07:09:32 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
off = (data + nhead) - skb->head;
|
|
|
|
|
|
|
|
skb->head = data;
|
net: allow skb->head to be a page fragment
skb->head is currently allocated from kmalloc(). This is convenient but
has the drawback the data cannot be converted to a page fragment if
needed.
We have three spots were it hurts :
1) GRO aggregation
When a linear skb must be appended to another skb, GRO uses the
frag_list fallback, very inefficient since we keep all struct sk_buff
around. So drivers enabling GRO but delivering linear skbs to network
stack aren't enabling full GRO power.
2) splice(socket -> pipe).
We must copy the linear part to a page fragment.
This kind of defeats splice() purpose (zero copy claim)
3) TCP coalescing.
Recently introduced, this permits to group several contiguous segments
into a single skb. This shortens queue lengths and save kernel memory,
and greatly reduce probabilities of TCP collapses. This coalescing
doesnt work on linear skbs (or we would need to copy data, this would be
too slow)
Given all these issues, the following patch introduces the possibility
of having skb->head be a fragment in itself. We use a new skb flag,
skb->head_frag to carry this information.
build_skb() is changed to accept a frag_size argument. Drivers willing
to provide a page fragment instead of kmalloc() data will set a non zero
value, set to the fragment size.
Then, on situations we need to convert the skb head to a frag in itself,
we can check if skb->head_frag is set and avoid the copies or various
fallbacks we have.
This means drivers currently using frags could be updated to avoid the
current skb->head allocation and reduce their memory footprint (aka skb
truesize). (thats 512 or 1024 bytes saved per skb). This also makes
bpf/netfilter faster since the 'first frag' will be part of skb linear
part, no need to copy data.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Maciej Żenczykowski <maze@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Matt Carlson <mcarlson@broadcom.com>
Cc: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-04-27 08:33:38 +08:00
|
|
|
skb->head_frag = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
skb->data += off;
|
2007-04-20 11:43:29 +08:00
|
|
|
#ifdef NET_SKBUFF_DATA_USES_OFFSET
|
|
|
|
skb->end = size;
|
2007-04-10 02:45:04 +08:00
|
|
|
off = nhead;
|
2007-04-20 11:43:29 +08:00
|
|
|
#else
|
|
|
|
skb->end = skb->head + size;
|
2007-04-10 02:45:04 +08:00
|
|
|
#endif
|
2007-04-20 11:29:13 +08:00
|
|
|
/* {transport,network,mac}_header and tail are relative to skb->head */
|
|
|
|
skb->tail += off;
|
2007-04-11 12:21:55 +08:00
|
|
|
skb->transport_header += off;
|
|
|
|
skb->network_header += off;
|
2009-06-17 20:17:34 +08:00
|
|
|
if (skb_mac_header_was_set(skb))
|
|
|
|
skb->mac_header += off;
|
2012-12-07 22:14:14 +08:00
|
|
|
skb->inner_transport_header += off;
|
|
|
|
skb->inner_network_header += off;
|
net: Fix corruption of skb csum field in pskb_expand_head() of net/core/skbuff.c
Make pskb_expand_head() check ip_summed to make sure csum_start is really
csum_start and not csum before adjusting it.
This fixes a bug I encountered using a Sun Quad-Fast Ethernet card and VLANs.
On my configuration, the sunhme driver produces skbs with differing amounts
of headroom on receive depending on the packet size. See line 2030 of
drivers/net/sunhme.c; packets smaller than RX_COPY_THRESHOLD have 52 bytes
of headroom but packets larger than that cutoff have only 20 bytes.
When these packets reach the VLAN driver, vlan_check_reorder_header()
calls skb_cow(), which, if the packet has less than NET_SKB_PAD (== 32) bytes
of headroom, uses pskb_expand_head() to make more.
Then, pskb_expand_head() needs to adjust a lot of offsets into the skb,
including csum_start. Since csum_start is a union with csum, if the packet
has a valid csum value this will corrupt it, which was the effect I observed.
The sunhme hardware computes receive checksums, so the skbs would be created
by the driver with ip_summed == CHECKSUM_COMPLETE and a valid csum field, and
then pskb_expand_head() would corrupt the csum field, leading to an "hw csum
error" message later on, for example in icmp_rcv() for pings larger than the
sunhme RX_COPY_THRESHOLD.
On the basis of the comment at the beginning of include/linux/skbuff.h,
I believe that the csum_start skb field is only meaningful if ip_csummed is
CSUM_PARTIAL, so this patch makes pskb_expand_head() adjust it only in that
case to avoid corrupting a valid csum value.
Please see my more in-depth disucssion of tracking down this bug for
more details if you like:
http://puellavulnerata.livejournal.com/112186.html
http://puellavulnerata.livejournal.com/112567.html
http://puellavulnerata.livejournal.com/112891.html
http://puellavulnerata.livejournal.com/113096.html
http://puellavulnerata.livejournal.com/113591.html
I am not subscribed to this list, so please CC me on replies.
Signed-off-by: Andrea Shepard <andrea@persephoneslair.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-07-22 17:12:35 +08:00
|
|
|
/* Only adjust this if it actually is csum_start rather than csum */
|
|
|
|
if (skb->ip_summed == CHECKSUM_PARTIAL)
|
|
|
|
skb->csum_start += nhead;
|
2005-04-17 06:20:36 +08:00
|
|
|
skb->cloned = 0;
|
[SKBUFF]: Keep track of writable header len of headerless clones
Currently NAT (and others) that want to modify cloned skbs copy them,
even if in the vast majority of cases its not necessary because the
skb is a clone made by TCP and the portion NAT wants to modify is
actually writable because TCP release the header reference before
cloning.
The problem is that there is no clean way for NAT to find out how
long the writable header area is, so this patch introduces skb->hdr_len
to hold this length. When a headerless skb is cloned skb->hdr_len
is set to the current headroom, for regular clones it is copied from
the original. A new function skb_clone_writable(skb, len) returns
whether the skb is writable up to len bytes from skb->data. To avoid
enlarging the skb the mac_len field is reduced to 16 bit and the
new hdr_len field is put in the remaining 16 bit.
I've done a few rough benchmarks of NAT (not with this exact patch,
but a very similar one). As expected it saves huge amounts of system
time in case of sendfile, bringing it down to basically the same
amount as without NAT, with sendmsg it only helps on loopback,
probably because of the large MTU.
Transmit a 1GB file using sendfile/sendmsg over eth0/lo with and
without NAT:
- sendfile eth0, no NAT: sys 0m0.388s
- sendfile eth0, NAT: sys 0m1.835s
- sendfile eth0: NAT + path: sys 0m0.370s (~ -80%)
- sendfile lo, no NAT: sys 0m0.258s
- sendfile lo, NAT: sys 0m2.609s
- sendfile lo, NAT + patch: sys 0m0.260s (~ -90%)
- sendmsg eth0, no NAT: sys 0m2.508s
- sendmsg eth0, NAT: sys 0m2.539s
- sendmsg eth0, NAT + patch: sys 0m2.445s (no change)
- sendmsg lo, no NAT: sys 0m2.151s
- sendmsg lo, NAT: sys 0m3.557s
- sendmsg lo, NAT + patch: sys 0m2.159s (~ -40%)
I expect other users can see a similar performance improvement,
packet mangling iptables targets, ipip and ip_gre come to mind ..
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2007-06-25 19:35:20 +08:00
|
|
|
skb->hdr_len = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
skb->nohdr = 0;
|
|
|
|
atomic_set(&skb_shinfo(skb)->dataref, 1);
|
|
|
|
return 0;
|
|
|
|
|
2011-07-06 20:22:12 +08:00
|
|
|
nofrags:
|
|
|
|
kfree(data);
|
2005-04-17 06:20:36 +08:00
|
|
|
nodata:
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(pskb_expand_head);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Make private copy of skb with writable head and some headroom */
|
|
|
|
|
|
|
|
struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
|
|
|
|
{
|
|
|
|
struct sk_buff *skb2;
|
|
|
|
int delta = headroom - skb_headroom(skb);
|
|
|
|
|
|
|
|
if (delta <= 0)
|
|
|
|
skb2 = pskb_copy(skb, GFP_ATOMIC);
|
|
|
|
else {
|
|
|
|
skb2 = skb_clone(skb, GFP_ATOMIC);
|
|
|
|
if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
|
|
|
|
GFP_ATOMIC)) {
|
|
|
|
kfree_skb(skb2);
|
|
|
|
skb2 = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return skb2;
|
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(skb_realloc_headroom);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* skb_copy_expand - copy and expand sk_buff
|
|
|
|
* @skb: buffer to copy
|
|
|
|
* @newheadroom: new free bytes at head
|
|
|
|
* @newtailroom: new free bytes at tail
|
|
|
|
* @gfp_mask: allocation priority
|
|
|
|
*
|
|
|
|
* Make a copy of both an &sk_buff and its data and while doing so
|
|
|
|
* allocate additional space.
|
|
|
|
*
|
|
|
|
* This is used when the caller wishes to modify the data and needs a
|
|
|
|
* private copy of the data to alter as well as more space for new fields.
|
|
|
|
* Returns %NULL on failure or the pointer to the buffer
|
|
|
|
* on success. The returned buffer has a reference count of 1.
|
|
|
|
*
|
|
|
|
* You must pass %GFP_ATOMIC as the allocation priority if this function
|
|
|
|
* is called from an interrupt.
|
|
|
|
*/
|
|
|
|
struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
|
2005-07-09 05:57:47 +08:00
|
|
|
int newheadroom, int newtailroom,
|
2005-10-07 14:46:04 +08:00
|
|
|
gfp_t gfp_mask)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Allocate the copy buffer
|
|
|
|
*/
|
2012-08-01 07:44:19 +08:00
|
|
|
struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
|
|
|
|
gfp_mask, skb_alloc_rx_flag(skb),
|
|
|
|
NUMA_NO_NODE);
|
2007-04-11 09:30:09 +08:00
|
|
|
int oldheadroom = skb_headroom(skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
int head_copy_len, head_copy_off;
|
2007-09-17 07:32:11 +08:00
|
|
|
int off;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (!n)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
skb_reserve(n, newheadroom);
|
|
|
|
|
|
|
|
/* Set the tail pointer and length */
|
|
|
|
skb_put(n, skb->len);
|
|
|
|
|
2007-04-11 09:30:09 +08:00
|
|
|
head_copy_len = oldheadroom;
|
2005-04-17 06:20:36 +08:00
|
|
|
head_copy_off = 0;
|
|
|
|
if (newheadroom <= head_copy_len)
|
|
|
|
head_copy_len = newheadroom;
|
|
|
|
else
|
|
|
|
head_copy_off = newheadroom - head_copy_len;
|
|
|
|
|
|
|
|
/* Copy the linear header and data. */
|
|
|
|
if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
|
|
|
|
skb->len + head_copy_len))
|
|
|
|
BUG();
|
|
|
|
|
|
|
|
copy_skb_header(n, skb);
|
|
|
|
|
2007-04-11 09:30:09 +08:00
|
|
|
off = newheadroom - oldheadroom;
|
2010-07-23 04:27:09 +08:00
|
|
|
if (n->ip_summed == CHECKSUM_PARTIAL)
|
|
|
|
n->csum_start += off;
|
2007-09-17 07:32:11 +08:00
|
|
|
#ifdef NET_SKBUFF_DATA_USES_OFFSET
|
2007-04-11 09:30:09 +08:00
|
|
|
n->transport_header += off;
|
|
|
|
n->network_header += off;
|
2009-06-17 20:17:34 +08:00
|
|
|
if (skb_mac_header_was_set(skb))
|
|
|
|
n->mac_header += off;
|
2012-12-07 22:14:14 +08:00
|
|
|
n->inner_transport_header += off;
|
|
|
|
n->inner_network_header += off;
|
2007-09-17 07:32:11 +08:00
|
|
|
#endif
|
2007-04-11 09:30:09 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
return n;
|
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(skb_copy_expand);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* skb_pad - zero pad the tail of an skb
|
|
|
|
* @skb: buffer to pad
|
|
|
|
* @pad: space to pad
|
|
|
|
*
|
|
|
|
* Ensure that a buffer is followed by a padding area that is zero
|
|
|
|
* filled. Used by network drivers which may DMA or transfer data
|
|
|
|
* beyond the buffer end onto the wire.
|
|
|
|
*
|
2006-06-23 17:06:41 +08:00
|
|
|
* May return error in out of memory cases. The skb is freed on error.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2007-02-09 22:24:36 +08:00
|
|
|
|
2006-06-23 17:06:41 +08:00
|
|
|
int skb_pad(struct sk_buff *skb, int pad)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2006-06-23 17:06:41 +08:00
|
|
|
int err;
|
|
|
|
int ntail;
|
2007-02-09 22:24:36 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* If the skbuff is non linear tailroom is always zero.. */
|
2006-06-23 17:06:41 +08:00
|
|
|
if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
|
2005-04-17 06:20:36 +08:00
|
|
|
memset(skb->data+skb->len, 0, pad);
|
2006-06-23 17:06:41 +08:00
|
|
|
return 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2006-06-23 17:06:41 +08:00
|
|
|
|
2007-04-20 11:43:29 +08:00
|
|
|
ntail = skb->data_len + pad - (skb->end - skb->tail);
|
2006-06-23 17:06:41 +08:00
|
|
|
if (likely(skb_cloned(skb) || ntail > 0)) {
|
|
|
|
err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
|
|
|
|
if (unlikely(err))
|
|
|
|
goto free_skb;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* FIXME: The use of this function with non-linear skb's really needs
|
|
|
|
* to be audited.
|
|
|
|
*/
|
|
|
|
err = skb_linearize(skb);
|
|
|
|
if (unlikely(err))
|
|
|
|
goto free_skb;
|
|
|
|
|
|
|
|
memset(skb->data + skb->len, 0, pad);
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
free_skb:
|
2005-04-17 06:20:36 +08:00
|
|
|
kfree_skb(skb);
|
2006-06-23 17:06:41 +08:00
|
|
|
return err;
|
2007-02-09 22:24:36 +08:00
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(skb_pad);
|
2007-02-09 22:24:36 +08:00
|
|
|
|
2008-03-28 08:43:41 +08:00
|
|
|
/**
|
|
|
|
* skb_put - add data to a buffer
|
|
|
|
* @skb: buffer to use
|
|
|
|
* @len: amount of data to add
|
|
|
|
*
|
|
|
|
* This function extends the used data area of the buffer. If this would
|
|
|
|
* exceed the total buffer size the kernel will panic. A pointer to the
|
|
|
|
* first byte of the extra data is returned.
|
|
|
|
*/
|
|
|
|
unsigned char *skb_put(struct sk_buff *skb, unsigned int len)
|
|
|
|
{
|
|
|
|
unsigned char *tmp = skb_tail_pointer(skb);
|
|
|
|
SKB_LINEAR_ASSERT(skb);
|
|
|
|
skb->tail += len;
|
|
|
|
skb->len += len;
|
|
|
|
if (unlikely(skb->tail > skb->end))
|
|
|
|
skb_over_panic(skb, len, __builtin_return_address(0));
|
|
|
|
return tmp;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(skb_put);
|
|
|
|
|
2008-03-28 08:52:40 +08:00
|
|
|
/**
|
|
|
|
* skb_push - add data to the start of a buffer
|
|
|
|
* @skb: buffer to use
|
|
|
|
* @len: amount of data to add
|
|
|
|
*
|
|
|
|
* This function extends the used data area of the buffer at the buffer
|
|
|
|
* start. If this would exceed the total buffer headroom the kernel will
|
|
|
|
* panic. A pointer to the first byte of the extra data is returned.
|
|
|
|
*/
|
|
|
|
unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
|
|
|
|
{
|
|
|
|
skb->data -= len;
|
|
|
|
skb->len += len;
|
|
|
|
if (unlikely(skb->data<skb->head))
|
|
|
|
skb_under_panic(skb, len, __builtin_return_address(0));
|
|
|
|
return skb->data;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(skb_push);
|
|
|
|
|
2008-03-28 08:47:24 +08:00
|
|
|
/**
 *	skb_pull - remove data from the start of a buffer
 *	@skb: buffer to use
 *	@len: amount of data to remove
 *
 *	This function removes data from the start of a buffer, returning
 *	the memory to the headroom. A pointer to the next data in the buffer
 *	is returned. Once the data has been pulled future pushes will overwrite
 *	the old data.
 */
unsigned char *skb_pull(struct sk_buff *skb, unsigned int len)
{
	/* Out-of-line entry point; the work is done by the inline helper. */
	unsigned char *next_data = skb_pull_inline(skb, len);

	return next_data;
}
EXPORT_SYMBOL(skb_pull);
|
|
|
|
|
2008-03-28 08:54:01 +08:00
|
|
|
/**
|
|
|
|
* skb_trim - remove end from a buffer
|
|
|
|
* @skb: buffer to alter
|
|
|
|
* @len: new length
|
|
|
|
*
|
|
|
|
* Cut the length of a buffer down by removing data from the tail. If
|
|
|
|
* the buffer is already under the length specified it is not modified.
|
|
|
|
* The skb must be linear.
|
|
|
|
*/
|
|
|
|
void skb_trim(struct sk_buff *skb, unsigned int len)
|
|
|
|
{
|
|
|
|
if (skb->len > len)
|
|
|
|
__skb_trim(skb, len);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(skb_trim);
|
|
|
|
|
2006-06-10 07:13:38 +08:00
|
|
|
/* Trims skb to length len. It can change skb pointers.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
|
2006-06-10 07:13:38 +08:00
|
|
|
int ___pskb_trim(struct sk_buff *skb, unsigned int len)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2006-07-14 10:26:39 +08:00
|
|
|
struct sk_buff **fragp;
|
|
|
|
struct sk_buff *frag;
|
2005-04-17 06:20:36 +08:00
|
|
|
int offset = skb_headlen(skb);
|
|
|
|
int nfrags = skb_shinfo(skb)->nr_frags;
|
|
|
|
int i;
|
2006-07-14 10:26:39 +08:00
|
|
|
int err;
|
|
|
|
|
|
|
|
if (skb_cloned(skb) &&
|
|
|
|
unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
|
|
|
|
return err;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-07-31 11:20:28 +08:00
|
|
|
i = 0;
|
|
|
|
if (offset >= len)
|
|
|
|
goto drop_pages;
|
|
|
|
|
|
|
|
for (; i < nfrags; i++) {
|
2011-10-19 05:00:24 +08:00
|
|
|
int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);
|
2006-07-14 10:26:39 +08:00
|
|
|
|
|
|
|
if (end < len) {
|
|
|
|
offset = end;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2011-10-19 05:00:24 +08:00
|
|
|
skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);
|
2006-07-14 10:26:39 +08:00
|
|
|
|
2006-07-31 11:20:28 +08:00
|
|
|
drop_pages:
|
2006-07-14 10:26:39 +08:00
|
|
|
skb_shinfo(skb)->nr_frags = i;
|
|
|
|
|
|
|
|
for (; i < nfrags; i++)
|
2011-08-23 07:44:58 +08:00
|
|
|
skb_frag_unref(skb, i);
|
2006-07-14 10:26:39 +08:00
|
|
|
|
2010-08-23 15:13:46 +08:00
|
|
|
if (skb_has_frag_list(skb))
|
2006-07-14 10:26:39 +08:00
|
|
|
skb_drop_fraglist(skb);
|
2006-07-31 11:20:28 +08:00
|
|
|
goto done;
|
2006-07-14 10:26:39 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
|
|
|
|
fragp = &frag->next) {
|
|
|
|
int end = offset + frag->len;
|
|
|
|
|
|
|
|
if (skb_shared(frag)) {
|
|
|
|
struct sk_buff *nfrag;
|
|
|
|
|
|
|
|
nfrag = skb_clone(frag, GFP_ATOMIC);
|
|
|
|
if (unlikely(!nfrag))
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
nfrag->next = frag->next;
|
2012-04-19 10:24:53 +08:00
|
|
|
consume_skb(frag);
|
2006-07-14 10:26:39 +08:00
|
|
|
frag = nfrag;
|
|
|
|
*fragp = frag;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2006-07-14 10:26:39 +08:00
|
|
|
|
|
|
|
if (end < len) {
|
|
|
|
offset = end;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (end > len &&
|
|
|
|
unlikely((err = pskb_trim(frag, len - offset))))
|
|
|
|
return err;
|
|
|
|
|
|
|
|
if (frag->next)
|
|
|
|
skb_drop_list(&frag->next);
|
|
|
|
break;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2006-07-31 11:20:28 +08:00
|
|
|
done:
|
2006-07-14 10:26:39 +08:00
|
|
|
if (len > skb_headlen(skb)) {
|
2005-04-17 06:20:36 +08:00
|
|
|
skb->data_len -= skb->len - len;
|
|
|
|
skb->len = len;
|
|
|
|
} else {
|
2006-07-14 10:26:39 +08:00
|
|
|
skb->len = len;
|
|
|
|
skb->data_len = 0;
|
2007-04-20 11:29:13 +08:00
|
|
|
skb_set_tail_pointer(skb, len);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(___pskb_trim);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* __pskb_pull_tail - advance tail of skb header
|
|
|
|
* @skb: buffer to reallocate
|
|
|
|
* @delta: number of bytes to advance tail
|
|
|
|
*
|
|
|
|
* The function makes a sense only on a fragmented &sk_buff,
|
|
|
|
* it expands header moving its tail forward and copying necessary
|
|
|
|
* data from fragmented part.
|
|
|
|
*
|
|
|
|
* &sk_buff MUST have reference count of 1.
|
|
|
|
*
|
|
|
|
* Returns %NULL (and &sk_buff does not change) if pull failed
|
|
|
|
* or value of new tail of skb in the case of success.
|
|
|
|
*
|
|
|
|
* All the pointers pointing into skb header may change and must be
|
|
|
|
* reloaded after call to this function.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Moves tail of skb head forward, copying data from fragmented part,
|
|
|
|
* when it is necessary.
|
|
|
|
* 1. It may fail due to malloc failure.
|
|
|
|
* 2. It may change skb pointers.
|
|
|
|
*
|
|
|
|
* It is pretty complicated. Luckily, it is called only in exceptional cases.
|
|
|
|
*/
|
|
|
|
unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
|
|
|
|
{
|
|
|
|
/* If skb has not enough free space at tail, get new one
|
|
|
|
* plus 128 bytes for future expansions. If we have enough
|
|
|
|
* room at tail, reallocate without expansion only if skb is cloned.
|
|
|
|
*/
|
2007-04-20 11:43:29 +08:00
|
|
|
int i, k, eat = (skb->tail + delta) - skb->end;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (eat > 0 || skb_cloned(skb)) {
|
|
|
|
if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
|
|
|
|
GFP_ATOMIC))
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2007-04-20 11:29:13 +08:00
|
|
|
if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta))
|
2005-04-17 06:20:36 +08:00
|
|
|
BUG();
|
|
|
|
|
|
|
|
/* Optimization: no fragments, no reasons to preestimate
|
|
|
|
* size of pulled pages. Superb.
|
|
|
|
*/
|
2010-08-23 15:13:46 +08:00
|
|
|
if (!skb_has_frag_list(skb))
|
2005-04-17 06:20:36 +08:00
|
|
|
goto pull_pages;
|
|
|
|
|
|
|
|
/* Estimate size of pulled pages. */
|
|
|
|
eat = delta;
|
|
|
|
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
|
2011-10-19 05:00:24 +08:00
|
|
|
int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
|
|
|
|
|
|
|
|
if (size >= eat)
|
2005-04-17 06:20:36 +08:00
|
|
|
goto pull_pages;
|
2011-10-19 05:00:24 +08:00
|
|
|
eat -= size;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* If we need update frag list, we are in troubles.
|
|
|
|
* Certainly, it possible to add an offset to skb data,
|
|
|
|
* but taking into account that pulling is expected to
|
|
|
|
* be very rare operation, it is worth to fight against
|
|
|
|
* further bloating skb head and crucify ourselves here instead.
|
|
|
|
* Pure masohism, indeed. 8)8)
|
|
|
|
*/
|
|
|
|
if (eat) {
|
|
|
|
struct sk_buff *list = skb_shinfo(skb)->frag_list;
|
|
|
|
struct sk_buff *clone = NULL;
|
|
|
|
struct sk_buff *insp = NULL;
|
|
|
|
|
|
|
|
do {
|
2006-01-09 14:24:28 +08:00
|
|
|
BUG_ON(!list);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (list->len <= eat) {
|
|
|
|
/* Eaten as whole. */
|
|
|
|
eat -= list->len;
|
|
|
|
list = list->next;
|
|
|
|
insp = list;
|
|
|
|
} else {
|
|
|
|
/* Eaten partially. */
|
|
|
|
|
|
|
|
if (skb_shared(list)) {
|
|
|
|
/* Sucks! We need to fork list. :-( */
|
|
|
|
clone = skb_clone(list, GFP_ATOMIC);
|
|
|
|
if (!clone)
|
|
|
|
return NULL;
|
|
|
|
insp = list->next;
|
|
|
|
list = clone;
|
|
|
|
} else {
|
|
|
|
/* This may be pulled without
|
|
|
|
* problems. */
|
|
|
|
insp = list;
|
|
|
|
}
|
|
|
|
if (!pskb_pull(list, eat)) {
|
2009-02-25 08:37:32 +08:00
|
|
|
kfree_skb(clone);
|
2005-04-17 06:20:36 +08:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} while (eat);
|
|
|
|
|
|
|
|
/* Free pulled out fragments. */
|
|
|
|
while ((list = skb_shinfo(skb)->frag_list) != insp) {
|
|
|
|
skb_shinfo(skb)->frag_list = list->next;
|
|
|
|
kfree_skb(list);
|
|
|
|
}
|
|
|
|
/* And insert new clone at head. */
|
|
|
|
if (clone) {
|
|
|
|
clone->next = list;
|
|
|
|
skb_shinfo(skb)->frag_list = clone;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* Success! Now we may commit changes to skb data. */
|
|
|
|
|
|
|
|
pull_pages:
|
|
|
|
eat = delta;
|
|
|
|
k = 0;
|
|
|
|
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
|
2011-10-19 05:00:24 +08:00
|
|
|
int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
|
|
|
|
|
|
|
|
if (size <= eat) {
|
2011-08-23 07:44:58 +08:00
|
|
|
skb_frag_unref(skb, i);
|
2011-10-19 05:00:24 +08:00
|
|
|
eat -= size;
|
2005-04-17 06:20:36 +08:00
|
|
|
} else {
|
|
|
|
skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
|
|
|
|
if (eat) {
|
|
|
|
skb_shinfo(skb)->frags[k].page_offset += eat;
|
2011-10-19 05:00:24 +08:00
|
|
|
skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
|
2005-04-17 06:20:36 +08:00
|
|
|
eat = 0;
|
|
|
|
}
|
|
|
|
k++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
skb_shinfo(skb)->nr_frags = k;
|
|
|
|
|
|
|
|
skb->tail += delta;
|
|
|
|
skb->data_len -= delta;
|
|
|
|
|
2007-04-20 11:29:13 +08:00
|
|
|
return skb_tail_pointer(skb);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(__pskb_pull_tail);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-07-30 02:37:31 +08:00
|
|
|
/**
|
|
|
|
* skb_copy_bits - copy bits from skb to kernel buffer
|
|
|
|
* @skb: source skb
|
|
|
|
* @offset: offset in source
|
|
|
|
* @to: destination buffer
|
|
|
|
* @len: number of bytes to copy
|
|
|
|
*
|
|
|
|
* Copy the specified number of bytes from the source skb to the
|
|
|
|
* destination buffer.
|
|
|
|
*
|
|
|
|
* CAUTION ! :
|
|
|
|
* If its prototype is ever changed,
|
|
|
|
* check arch/{*}/net/{*}.S files,
|
|
|
|
* since it is called from BPF assembly code.
|
|
|
|
*/
|
2005-04-17 06:20:36 +08:00
|
|
|
int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
|
|
|
|
{
|
2007-04-28 06:21:23 +08:00
|
|
|
int start = skb_headlen(skb);
|
2009-06-09 15:18:59 +08:00
|
|
|
struct sk_buff *frag_iter;
|
|
|
|
int i, copy;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (offset > (int)skb->len - len)
|
|
|
|
goto fault;
|
|
|
|
|
|
|
|
/* Copy header. */
|
2007-04-28 06:21:23 +08:00
|
|
|
if ((copy = start - offset) > 0) {
|
2005-04-17 06:20:36 +08:00
|
|
|
if (copy > len)
|
|
|
|
copy = len;
|
2007-03-28 05:55:52 +08:00
|
|
|
skb_copy_from_linear_data_offset(skb, offset, to, copy);
|
2005-04-17 06:20:36 +08:00
|
|
|
if ((len -= copy) == 0)
|
|
|
|
return 0;
|
|
|
|
offset += copy;
|
|
|
|
to += copy;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
|
2007-04-28 06:21:23 +08:00
|
|
|
int end;
|
2012-04-05 17:35:15 +08:00
|
|
|
skb_frag_t *f = &skb_shinfo(skb)->frags[i];
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-07-26 12:43:18 +08:00
|
|
|
WARN_ON(start > offset + len);
|
2007-04-28 06:21:23 +08:00
|
|
|
|
2012-04-05 17:35:15 +08:00
|
|
|
end = start + skb_frag_size(f);
|
2005-04-17 06:20:36 +08:00
|
|
|
if ((copy = end - offset) > 0) {
|
|
|
|
u8 *vaddr;
|
|
|
|
|
|
|
|
if (copy > len)
|
|
|
|
copy = len;
|
|
|
|
|
2012-04-05 17:35:15 +08:00
|
|
|
vaddr = kmap_atomic(skb_frag_page(f));
|
2005-04-17 06:20:36 +08:00
|
|
|
memcpy(to,
|
2012-04-05 17:35:15 +08:00
|
|
|
vaddr + f->page_offset + offset - start,
|
|
|
|
copy);
|
|
|
|
kunmap_atomic(vaddr);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if ((len -= copy) == 0)
|
|
|
|
return 0;
|
|
|
|
offset += copy;
|
|
|
|
to += copy;
|
|
|
|
}
|
2007-04-28 06:21:23 +08:00
|
|
|
start = end;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2009-06-09 15:18:59 +08:00
|
|
|
skb_walk_frags(skb, frag_iter) {
|
|
|
|
int end;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-06-09 15:18:59 +08:00
|
|
|
WARN_ON(start > offset + len);
|
|
|
|
|
|
|
|
end = start + frag_iter->len;
|
|
|
|
if ((copy = end - offset) > 0) {
|
|
|
|
if (copy > len)
|
|
|
|
copy = len;
|
|
|
|
if (skb_copy_bits(frag_iter, offset - start, to, copy))
|
|
|
|
goto fault;
|
|
|
|
if ((len -= copy) == 0)
|
|
|
|
return 0;
|
|
|
|
offset += copy;
|
|
|
|
to += copy;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2009-06-09 15:18:59 +08:00
|
|
|
start = end;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2011-07-06 20:22:12 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!len)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
fault:
|
|
|
|
return -EFAULT;
|
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(skb_copy_bits);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-11-07 15:30:13 +08:00
|
|
|
/*
|
|
|
|
* Callback from splice_to_pipe(), if we need to release some pages
|
|
|
|
* at the end of the spd in case we error'ed out in filling the pipe.
|
|
|
|
*/
|
|
|
|
static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
|
|
|
|
{
|
2009-01-20 09:03:56 +08:00
|
|
|
put_page(spd->pages[i]);
|
|
|
|
}
|
2007-11-07 15:30:13 +08:00
|
|
|
|
2012-04-24 11:06:11 +08:00
|
|
|
static struct page *linear_to_page(struct page *page, unsigned int *len,
|
|
|
|
unsigned int *offset,
|
2013-01-11 22:46:37 +08:00
|
|
|
struct sock *sk)
|
2009-01-20 09:03:56 +08:00
|
|
|
{
|
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 07:04:42 +08:00
|
|
|
struct page_frag *pfrag = sk_page_frag(sk);
|
2009-02-01 16:41:42 +08:00
|
|
|
|
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 07:04:42 +08:00
|
|
|
if (!sk_page_frag_refill(sk, pfrag))
|
|
|
|
return NULL;
|
2009-02-01 16:41:42 +08:00
|
|
|
|
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 07:04:42 +08:00
|
|
|
*len = min_t(unsigned int, *len, pfrag->size - pfrag->offset);
|
2009-02-01 16:41:42 +08:00
|
|
|
|
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 07:04:42 +08:00
|
|
|
memcpy(page_address(pfrag->page) + pfrag->offset,
|
|
|
|
page_address(page) + *offset, *len);
|
|
|
|
*offset = pfrag->offset;
|
|
|
|
pfrag->offset += *len;
|
2009-01-20 09:03:56 +08:00
|
|
|
|
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 07:04:42 +08:00
|
|
|
return pfrag->page;
|
2007-11-07 15:30:13 +08:00
|
|
|
}
|
|
|
|
|
2012-04-22 20:26:16 +08:00
|
|
|
static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
|
|
|
|
struct page *page,
|
|
|
|
unsigned int offset)
|
|
|
|
{
|
|
|
|
return spd->nr_pages &&
|
|
|
|
spd->pages[spd->nr_pages - 1] == page &&
|
|
|
|
(spd->partial[spd->nr_pages - 1].offset +
|
|
|
|
spd->partial[spd->nr_pages - 1].len == offset);
|
|
|
|
}
|
|
|
|
|
2007-11-07 15:30:13 +08:00
|
|
|
/*
|
|
|
|
* Fill page/offset/length into spd, if it can hold more pages.
|
|
|
|
*/
|
2012-04-24 11:06:11 +08:00
|
|
|
static bool spd_fill_page(struct splice_pipe_desc *spd,
|
|
|
|
struct pipe_inode_info *pipe, struct page *page,
|
|
|
|
unsigned int *len, unsigned int offset,
|
2013-01-11 22:46:37 +08:00
|
|
|
bool linear,
|
2012-04-24 11:06:11 +08:00
|
|
|
struct sock *sk)
|
2007-11-07 15:30:13 +08:00
|
|
|
{
|
2012-04-22 20:26:16 +08:00
|
|
|
if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
|
2012-04-24 11:06:11 +08:00
|
|
|
return true;
|
2007-11-07 15:30:13 +08:00
|
|
|
|
2009-01-20 09:03:56 +08:00
|
|
|
if (linear) {
|
2013-01-11 22:46:37 +08:00
|
|
|
page = linear_to_page(page, len, &offset, sk);
|
2009-01-20 09:03:56 +08:00
|
|
|
if (!page)
|
2012-04-24 11:06:11 +08:00
|
|
|
return true;
|
2012-04-22 20:26:16 +08:00
|
|
|
}
|
|
|
|
if (spd_can_coalesce(spd, page, offset)) {
|
|
|
|
spd->partial[spd->nr_pages - 1].len += *len;
|
2012-04-24 11:06:11 +08:00
|
|
|
return false;
|
2012-04-22 20:26:16 +08:00
|
|
|
}
|
|
|
|
get_page(page);
|
2007-11-07 15:30:13 +08:00
|
|
|
spd->pages[spd->nr_pages] = page;
|
2009-02-01 16:41:42 +08:00
|
|
|
spd->partial[spd->nr_pages].len = *len;
|
2007-11-07 15:30:13 +08:00
|
|
|
spd->partial[spd->nr_pages].offset = offset;
|
|
|
|
spd->nr_pages++;
|
2009-01-20 09:03:56 +08:00
|
|
|
|
2012-04-24 11:06:11 +08:00
|
|
|
return false;
|
2007-11-07 15:30:13 +08:00
|
|
|
}
|
|
|
|
|
2012-04-24 11:06:11 +08:00
|
|
|
/*
 * Feed one contiguous segment (@plen bytes at @poff inside @page) to spd.
 *
 * @off is the number of bytes that still have to be skipped before data is
 * wanted; it is reduced by the part of this segment that gets skipped.
 * @len is the number of bytes still wanted; it is reduced by what was
 * recorded.
 *
 * Returns true when the caller should stop (nothing left to splice, or
 * spd_fill_page() refused the data), false to carry on with the next
 * segment.
 */
static bool __splice_segment(struct page *page, unsigned int poff,
			     unsigned int plen, unsigned int *off,
			     unsigned int *len,
			     struct splice_pipe_desc *spd, bool linear,
			     struct sock *sk,
			     struct pipe_inode_info *pipe)
{
	if (!*len)
		return true;

	/* skip this segment if already processed */
	if (*off >= plen) {
		*off -= plen;
		return false;
	}

	/* ignore any bits we already processed */
	poff += *off;
	plen -= *off;
	*off = 0;

	for (;;) {
		unsigned int chunk = min(*len, plen);

		/* spd_fill_page() may shrink chunk to what it really took. */
		if (spd_fill_page(spd, pipe, page, &chunk, poff,
				  linear, sk))
			return true;

		poff += chunk;
		plen -= chunk;
		*len -= chunk;

		if (!*len || !plen)
			break;
	}

	return false;
}
|
|
|
|
|
|
|
|
/*
|
2012-04-24 11:06:11 +08:00
|
|
|
* Map linear and fragment data from the skb to spd. It reports true if the
|
2008-07-15 15:49:11 +08:00
|
|
|
* pipe is full or if we already spliced the requested length.
|
|
|
|
*/
|
2012-04-24 11:06:11 +08:00
|
|
|
static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
|
|
|
|
unsigned int *offset, unsigned int *len,
|
|
|
|
struct splice_pipe_desc *spd, struct sock *sk)
|
2008-07-15 15:49:11 +08:00
|
|
|
{
|
|
|
|
int seg;
|
|
|
|
|
2012-04-27 10:10:03 +08:00
|
|
|
/* map the linear part :
|
2012-05-03 02:18:42 +08:00
|
|
|
* If skb->head_frag is set, this 'linear' part is backed by a
|
|
|
|
* fragment, and if the head is not shared with any clones then
|
|
|
|
* we can avoid a copy since we own the head portion of this page.
|
2008-07-15 15:49:11 +08:00
|
|
|
*/
|
|
|
|
if (__splice_segment(virt_to_page(skb->data),
|
|
|
|
(unsigned long) skb->data & (PAGE_SIZE - 1),
|
|
|
|
skb_headlen(skb),
|
2013-01-11 22:46:37 +08:00
|
|
|
offset, len, spd,
|
2012-05-03 09:09:42 +08:00
|
|
|
skb_head_is_locked(skb),
|
2012-04-27 10:10:03 +08:00
|
|
|
sk, pipe))
|
2012-04-24 11:06:11 +08:00
|
|
|
return true;
|
2007-11-07 15:30:13 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* then map the fragments
|
|
|
|
*/
|
|
|
|
for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
|
|
|
|
const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
|
|
|
|
|
2011-08-23 07:44:58 +08:00
|
|
|
if (__splice_segment(skb_frag_page(f),
|
2011-10-19 05:00:24 +08:00
|
|
|
f->page_offset, skb_frag_size(f),
|
2013-01-11 22:46:37 +08:00
|
|
|
offset, len, spd, false, sk, pipe))
|
2012-04-24 11:06:11 +08:00
|
|
|
return true;
|
2007-11-07 15:30:13 +08:00
|
|
|
}
|
|
|
|
|
2012-04-24 11:06:11 +08:00
|
|
|
return false;
|
2007-11-07 15:30:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Map data from the skb to a pipe. Should handle both the linear part,
|
|
|
|
* the fragments, and the frag list. It does NOT handle frag lists within
|
|
|
|
* the frag list, if such a thing exists. We'd probably need to recurse to
|
|
|
|
* handle that cleanly.
|
|
|
|
*/
|
2009-01-20 09:03:56 +08:00
|
|
|
int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
|
2007-11-07 15:30:13 +08:00
|
|
|
struct pipe_inode_info *pipe, unsigned int tlen,
|
|
|
|
unsigned int flags)
|
|
|
|
{
|
2012-04-22 20:26:16 +08:00
|
|
|
struct partial_page partial[MAX_SKB_FRAGS];
|
|
|
|
struct page *pages[MAX_SKB_FRAGS];
|
2007-11-07 15:30:13 +08:00
|
|
|
struct splice_pipe_desc spd = {
|
|
|
|
.pages = pages,
|
|
|
|
.partial = partial,
|
2012-06-12 21:24:40 +08:00
|
|
|
.nr_pages_max = MAX_SKB_FRAGS,
|
2007-11-07 15:30:13 +08:00
|
|
|
.flags = flags,
|
|
|
|
.ops = &sock_pipe_buf_ops,
|
|
|
|
.spd_release = sock_spd_release,
|
|
|
|
};
|
2009-06-09 15:18:59 +08:00
|
|
|
struct sk_buff *frag_iter;
|
2009-04-30 20:41:19 +08:00
|
|
|
struct sock *sk = skb->sk;
|
2010-05-20 16:43:18 +08:00
|
|
|
int ret = 0;
|
|
|
|
|
2007-11-07 15:30:13 +08:00
|
|
|
/*
|
|
|
|
* __skb_splice_bits() only fails if the output has no room left,
|
|
|
|
* so no point in going over the frag_list for the error case.
|
|
|
|
*/
|
2010-05-20 16:43:18 +08:00
|
|
|
if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk))
|
2007-11-07 15:30:13 +08:00
|
|
|
goto done;
|
|
|
|
else if (!tlen)
|
|
|
|
goto done;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* now see if we have a frag_list to map
|
|
|
|
*/
|
2009-06-09 15:18:59 +08:00
|
|
|
skb_walk_frags(skb, frag_iter) {
|
|
|
|
if (!tlen)
|
|
|
|
break;
|
2010-05-20 16:43:18 +08:00
|
|
|
if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk))
|
2009-06-09 15:18:59 +08:00
|
|
|
break;
|
2007-11-07 15:30:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
done:
|
|
|
|
if (spd.nr_pages) {
|
|
|
|
/*
|
|
|
|
* Drop the socket lock, otherwise we have reverse
|
|
|
|
* locking dependencies between sk_lock and i_mutex
|
|
|
|
* here as compared to sendfile(). We enter here
|
|
|
|
* with the socket lock held, and splice_to_pipe() will
|
|
|
|
* grab the pipe inode lock. For sendfile() emulation,
|
|
|
|
* we call into ->sendpage() with the i_mutex lock held
|
|
|
|
* and networking will grab the socket lock.
|
|
|
|
*/
|
2008-06-05 06:45:58 +08:00
|
|
|
release_sock(sk);
|
2007-11-07 15:30:13 +08:00
|
|
|
ret = splice_to_pipe(pipe, &spd);
|
2008-06-05 06:45:58 +08:00
|
|
|
lock_sock(sk);
|
2007-11-07 15:30:13 +08:00
|
|
|
}
|
|
|
|
|
2010-05-20 16:43:18 +08:00
|
|
|
return ret;
|
2007-11-07 15:30:13 +08:00
|
|
|
}
|
|
|
|
|
2005-04-20 13:30:14 +08:00
|
|
|
/**
|
|
|
|
* skb_store_bits - store bits from kernel buffer to skb
|
|
|
|
* @skb: destination buffer
|
|
|
|
* @offset: offset in destination
|
|
|
|
* @from: source buffer
|
|
|
|
* @len: number of bytes to copy
|
|
|
|
*
|
|
|
|
* Copy the specified number of bytes from the source buffer to the
|
|
|
|
* destination skb. This function handles all the messy bits of
|
|
|
|
* traversing fragment lists and such.
|
|
|
|
*/
|
|
|
|
|
2007-04-21 07:40:01 +08:00
|
|
|
int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
|
2005-04-20 13:30:14 +08:00
|
|
|
{
|
2007-04-28 06:21:23 +08:00
|
|
|
int start = skb_headlen(skb);
|
2009-06-09 15:18:59 +08:00
|
|
|
struct sk_buff *frag_iter;
|
|
|
|
int i, copy;
|
2005-04-20 13:30:14 +08:00
|
|
|
|
|
|
|
if (offset > (int)skb->len - len)
|
|
|
|
goto fault;
|
|
|
|
|
2007-04-28 06:21:23 +08:00
|
|
|
if ((copy = start - offset) > 0) {
|
2005-04-20 13:30:14 +08:00
|
|
|
if (copy > len)
|
|
|
|
copy = len;
|
2007-03-31 22:55:19 +08:00
|
|
|
skb_copy_to_linear_data_offset(skb, offset, from, copy);
|
2005-04-20 13:30:14 +08:00
|
|
|
if ((len -= copy) == 0)
|
|
|
|
return 0;
|
|
|
|
offset += copy;
|
|
|
|
from += copy;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
|
|
|
|
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
|
2007-04-28 06:21:23 +08:00
|
|
|
int end;
|
|
|
|
|
2008-07-26 12:43:18 +08:00
|
|
|
WARN_ON(start > offset + len);
|
2005-04-20 13:30:14 +08:00
|
|
|
|
2011-10-19 05:00:24 +08:00
|
|
|
end = start + skb_frag_size(frag);
|
2005-04-20 13:30:14 +08:00
|
|
|
if ((copy = end - offset) > 0) {
|
|
|
|
u8 *vaddr;
|
|
|
|
|
|
|
|
if (copy > len)
|
|
|
|
copy = len;
|
|
|
|
|
2012-04-05 17:35:15 +08:00
|
|
|
vaddr = kmap_atomic(skb_frag_page(frag));
|
2007-04-28 06:21:23 +08:00
|
|
|
memcpy(vaddr + frag->page_offset + offset - start,
|
|
|
|
from, copy);
|
2012-04-05 17:35:15 +08:00
|
|
|
kunmap_atomic(vaddr);
|
2005-04-20 13:30:14 +08:00
|
|
|
|
|
|
|
if ((len -= copy) == 0)
|
|
|
|
return 0;
|
|
|
|
offset += copy;
|
|
|
|
from += copy;
|
|
|
|
}
|
2007-04-28 06:21:23 +08:00
|
|
|
start = end;
|
2005-04-20 13:30:14 +08:00
|
|
|
}
|
|
|
|
|
2009-06-09 15:18:59 +08:00
|
|
|
skb_walk_frags(skb, frag_iter) {
|
|
|
|
int end;
|
2005-04-20 13:30:14 +08:00
|
|
|
|
2009-06-09 15:18:59 +08:00
|
|
|
WARN_ON(start > offset + len);
|
|
|
|
|
|
|
|
end = start + frag_iter->len;
|
|
|
|
if ((copy = end - offset) > 0) {
|
|
|
|
if (copy > len)
|
|
|
|
copy = len;
|
|
|
|
if (skb_store_bits(frag_iter, offset - start,
|
|
|
|
from, copy))
|
|
|
|
goto fault;
|
|
|
|
if ((len -= copy) == 0)
|
|
|
|
return 0;
|
|
|
|
offset += copy;
|
|
|
|
from += copy;
|
2005-04-20 13:30:14 +08:00
|
|
|
}
|
2009-06-09 15:18:59 +08:00
|
|
|
start = end;
|
2005-04-20 13:30:14 +08:00
|
|
|
}
|
|
|
|
if (!len)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
fault:
|
|
|
|
return -EFAULT;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(skb_store_bits);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Checksum skb data. */
|
|
|
|
|
2006-11-15 13:37:14 +08:00
|
|
|
__wsum skb_checksum(const struct sk_buff *skb, int offset,
|
|
|
|
int len, __wsum csum)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2007-04-28 06:21:23 +08:00
|
|
|
int start = skb_headlen(skb);
|
|
|
|
int i, copy = start - offset;
|
2009-06-09 15:18:59 +08:00
|
|
|
struct sk_buff *frag_iter;
|
2005-04-17 06:20:36 +08:00
|
|
|
int pos = 0;
|
|
|
|
|
|
|
|
/* Checksum header. */
|
|
|
|
if (copy > 0) {
|
|
|
|
if (copy > len)
|
|
|
|
copy = len;
|
|
|
|
csum = csum_partial(skb->data + offset, copy, csum);
|
|
|
|
if ((len -= copy) == 0)
|
|
|
|
return csum;
|
|
|
|
offset += copy;
|
|
|
|
pos = copy;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
|
2007-04-28 06:21:23 +08:00
|
|
|
int end;
|
2012-04-05 17:35:15 +08:00
|
|
|
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
|
2007-04-28 06:21:23 +08:00
|
|
|
|
2008-07-26 12:43:18 +08:00
|
|
|
WARN_ON(start > offset + len);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2012-04-05 17:35:15 +08:00
|
|
|
end = start + skb_frag_size(frag);
|
2005-04-17 06:20:36 +08:00
|
|
|
if ((copy = end - offset) > 0) {
|
2006-11-15 13:36:14 +08:00
|
|
|
__wsum csum2;
|
2005-04-17 06:20:36 +08:00
|
|
|
u8 *vaddr;
|
|
|
|
|
|
|
|
if (copy > len)
|
|
|
|
copy = len;
|
2012-04-05 17:35:15 +08:00
|
|
|
vaddr = kmap_atomic(skb_frag_page(frag));
|
2007-04-28 06:21:23 +08:00
|
|
|
csum2 = csum_partial(vaddr + frag->page_offset +
|
|
|
|
offset - start, copy, 0);
|
2012-04-05 17:35:15 +08:00
|
|
|
kunmap_atomic(vaddr);
|
2005-04-17 06:20:36 +08:00
|
|
|
csum = csum_block_add(csum, csum2, pos);
|
|
|
|
if (!(len -= copy))
|
|
|
|
return csum;
|
|
|
|
offset += copy;
|
|
|
|
pos += copy;
|
|
|
|
}
|
2007-04-28 06:21:23 +08:00
|
|
|
start = end;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2009-06-09 15:18:59 +08:00
|
|
|
skb_walk_frags(skb, frag_iter) {
|
|
|
|
int end;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-06-09 15:18:59 +08:00
|
|
|
WARN_ON(start > offset + len);
|
|
|
|
|
|
|
|
end = start + frag_iter->len;
|
|
|
|
if ((copy = end - offset) > 0) {
|
|
|
|
__wsum csum2;
|
|
|
|
if (copy > len)
|
|
|
|
copy = len;
|
|
|
|
csum2 = skb_checksum(frag_iter, offset - start,
|
|
|
|
copy, 0);
|
|
|
|
csum = csum_block_add(csum, csum2, pos);
|
|
|
|
if ((len -= copy) == 0)
|
|
|
|
return csum;
|
|
|
|
offset += copy;
|
|
|
|
pos += copy;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2009-06-09 15:18:59 +08:00
|
|
|
start = end;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2006-01-09 14:24:28 +08:00
|
|
|
BUG_ON(len);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
return csum;
|
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(skb_checksum);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Both of above in one bottle. */
|
|
|
|
|
2006-11-15 13:37:33 +08:00
|
|
|
__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
|
|
|
|
u8 *to, int len, __wsum csum)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2007-04-28 06:21:23 +08:00
|
|
|
int start = skb_headlen(skb);
|
|
|
|
int i, copy = start - offset;
|
2009-06-09 15:18:59 +08:00
|
|
|
struct sk_buff *frag_iter;
|
2005-04-17 06:20:36 +08:00
|
|
|
int pos = 0;
|
|
|
|
|
|
|
|
/* Copy header. */
|
|
|
|
if (copy > 0) {
|
|
|
|
if (copy > len)
|
|
|
|
copy = len;
|
|
|
|
csum = csum_partial_copy_nocheck(skb->data + offset, to,
|
|
|
|
copy, csum);
|
|
|
|
if ((len -= copy) == 0)
|
|
|
|
return csum;
|
|
|
|
offset += copy;
|
|
|
|
to += copy;
|
|
|
|
pos = copy;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
|
2007-04-28 06:21:23 +08:00
|
|
|
int end;
|
|
|
|
|
2008-07-26 12:43:18 +08:00
|
|
|
WARN_ON(start > offset + len);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-10-19 05:00:24 +08:00
|
|
|
end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
|
2005-04-17 06:20:36 +08:00
|
|
|
if ((copy = end - offset) > 0) {
|
2006-11-15 13:36:34 +08:00
|
|
|
__wsum csum2;
|
2005-04-17 06:20:36 +08:00
|
|
|
u8 *vaddr;
|
|
|
|
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
|
|
|
|
|
|
|
|
if (copy > len)
|
|
|
|
copy = len;
|
2012-04-05 17:35:15 +08:00
|
|
|
vaddr = kmap_atomic(skb_frag_page(frag));
|
2005-04-17 06:20:36 +08:00
|
|
|
csum2 = csum_partial_copy_nocheck(vaddr +
|
2007-04-28 06:21:23 +08:00
|
|
|
frag->page_offset +
|
|
|
|
offset - start, to,
|
|
|
|
copy, 0);
|
2012-04-05 17:35:15 +08:00
|
|
|
kunmap_atomic(vaddr);
|
2005-04-17 06:20:36 +08:00
|
|
|
csum = csum_block_add(csum, csum2, pos);
|
|
|
|
if (!(len -= copy))
|
|
|
|
return csum;
|
|
|
|
offset += copy;
|
|
|
|
to += copy;
|
|
|
|
pos += copy;
|
|
|
|
}
|
2007-04-28 06:21:23 +08:00
|
|
|
start = end;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2009-06-09 15:18:59 +08:00
|
|
|
skb_walk_frags(skb, frag_iter) {
|
|
|
|
__wsum csum2;
|
|
|
|
int end;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-06-09 15:18:59 +08:00
|
|
|
WARN_ON(start > offset + len);
|
|
|
|
|
|
|
|
end = start + frag_iter->len;
|
|
|
|
if ((copy = end - offset) > 0) {
|
|
|
|
if (copy > len)
|
|
|
|
copy = len;
|
|
|
|
csum2 = skb_copy_and_csum_bits(frag_iter,
|
|
|
|
offset - start,
|
|
|
|
to, copy, 0);
|
|
|
|
csum = csum_block_add(csum, csum2, pos);
|
|
|
|
if ((len -= copy) == 0)
|
|
|
|
return csum;
|
|
|
|
offset += copy;
|
|
|
|
to += copy;
|
|
|
|
pos += copy;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2009-06-09 15:18:59 +08:00
|
|
|
start = end;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2006-01-09 14:24:28 +08:00
|
|
|
BUG_ON(len);
|
2005-04-17 06:20:36 +08:00
|
|
|
return csum;
|
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(skb_copy_and_csum_bits);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
 * Copy the whole of @skb into the flat buffer @to.
 *
 * Bytes before the checksum start are copied verbatim; the remainder is
 * copied with skb_copy_and_csum_bits().  For a CHECKSUM_PARTIAL skb the
 * checksum starts at skb_checksum_start_offset() and its folded value is
 * written into @to at that start plus skb->csum_offset; otherwise the
 * checksum start is the end of the linear area and nothing is written back.
 */
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
{
	const bool partial = skb->ip_summed == CHECKSUM_PARTIAL;
	__wsum csum = 0;
	long csstart;

	if (!partial)
		csstart = skb_headlen(skb);
	else
		csstart = skb_checksum_start_offset(skb);

	/* The verbatim part must lie entirely in the linear area. */
	BUG_ON(csstart > skb_headlen(skb));

	skb_copy_from_linear_data(skb, to, csstart);

	if (csstart != skb->len)
		csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
					      skb->len - csstart, 0);

	if (partial) {
		long csstuff = csstart + skb->csum_offset;

		*((__sum16 *)(to + csstuff)) = csum_fold(csum);
	}
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(skb_copy_and_csum_dev);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* skb_dequeue - remove from the head of the queue
|
|
|
|
* @list: list to dequeue from
|
|
|
|
*
|
|
|
|
* Remove the head of the list. The list lock is taken so the function
|
|
|
|
* may be used safely with other locking list functions. The head item is
|
|
|
|
* returned or %NULL if the list is empty.
|
|
|
|
*/
|
|
|
|
|
|
|
|
struct sk_buff *skb_dequeue(struct sk_buff_head *list)
|
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
struct sk_buff *result;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&list->lock, flags);
|
|
|
|
result = __skb_dequeue(list);
|
|
|
|
spin_unlock_irqrestore(&list->lock, flags);
|
|
|
|
return result;
|
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(skb_dequeue);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* skb_dequeue_tail - remove from the tail of the queue
|
|
|
|
* @list: list to dequeue from
|
|
|
|
*
|
|
|
|
* Remove the tail of the list. The list lock is taken so the function
|
|
|
|
* may be used safely with other locking list functions. The tail item is
|
|
|
|
* returned or %NULL if the list is empty.
|
|
|
|
*/
|
|
|
|
struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
|
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
struct sk_buff *result;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&list->lock, flags);
|
|
|
|
result = __skb_dequeue_tail(list);
|
|
|
|
spin_unlock_irqrestore(&list->lock, flags);
|
|
|
|
return result;
|
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(skb_dequeue_tail);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* skb_queue_purge - empty a list
|
|
|
|
* @list: list to empty
|
|
|
|
*
|
|
|
|
* Delete all buffers on an &sk_buff list. Each buffer is removed from
|
|
|
|
* the list and one reference dropped. This function takes the list
|
|
|
|
* lock and is atomic with respect to other list locking functions.
|
|
|
|
*/
|
|
|
|
void skb_queue_purge(struct sk_buff_head *list)
|
|
|
|
{
|
|
|
|
struct sk_buff *skb;
|
|
|
|
while ((skb = skb_dequeue(list)) != NULL)
|
|
|
|
kfree_skb(skb);
|
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(skb_queue_purge);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* skb_queue_head - queue a buffer at the list head
|
|
|
|
* @list: list to use
|
|
|
|
* @newsk: buffer to queue
|
|
|
|
*
|
|
|
|
* Queue a buffer at the start of the list. This function takes the
|
|
|
|
* list lock and can be used safely with other locking &sk_buff functions
|
|
|
|
* safely.
|
|
|
|
*
|
|
|
|
* A buffer cannot be placed on two lists at the same time.
|
|
|
|
*/
|
|
|
|
void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
|
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&list->lock, flags);
|
|
|
|
__skb_queue_head(list, newsk);
|
|
|
|
spin_unlock_irqrestore(&list->lock, flags);
|
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(skb_queue_head);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* skb_queue_tail - queue a buffer at the list tail
|
|
|
|
* @list: list to use
|
|
|
|
* @newsk: buffer to queue
|
|
|
|
*
|
|
|
|
* Queue a buffer at the tail of the list. This function takes the
|
|
|
|
* list lock and can be used safely with other locking &sk_buff functions
|
|
|
|
* safely.
|
|
|
|
*
|
|
|
|
* A buffer cannot be placed on two lists at the same time.
|
|
|
|
*/
|
|
|
|
void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
|
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&list->lock, flags);
|
|
|
|
__skb_queue_tail(list, newsk);
|
|
|
|
spin_unlock_irqrestore(&list->lock, flags);
|
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(skb_queue_tail);
|
2005-08-10 10:25:21 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/**
|
|
|
|
* skb_unlink - remove a buffer from a list
|
|
|
|
* @skb: buffer to remove
|
2005-08-10 10:25:21 +08:00
|
|
|
* @list: list to use
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
2005-08-10 10:25:21 +08:00
|
|
|
* Remove a packet from a list. The list locks are taken and this
|
|
|
|
* function is atomic with respect to other list locked calls
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
2005-08-10 10:25:21 +08:00
|
|
|
* You must know what list the SKB is on.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2005-08-10 10:25:21 +08:00
|
|
|
void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2005-08-10 10:25:21 +08:00
|
|
|
unsigned long flags;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-08-10 10:25:21 +08:00
|
|
|
spin_lock_irqsave(&list->lock, flags);
|
|
|
|
__skb_unlink(skb, list);
|
|
|
|
spin_unlock_irqrestore(&list->lock, flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(skb_unlink);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* skb_append - append a buffer
|
|
|
|
* @old: buffer to insert after
|
|
|
|
* @newsk: buffer to insert
|
2005-08-10 10:25:21 +08:00
|
|
|
* @list: list to use
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* Place a packet after a given packet in a list. The list locks are taken
|
|
|
|
* and this function is atomic with respect to other list locked calls.
|
|
|
|
* A buffer cannot be placed on two lists at the same time.
|
|
|
|
*/
|
2005-08-10 10:25:21 +08:00
|
|
|
void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
2005-08-10 10:25:21 +08:00
|
|
|
spin_lock_irqsave(&list->lock, flags);
|
2008-04-14 15:05:09 +08:00
|
|
|
__skb_queue_after(list, old, newsk);
|
2005-08-10 10:25:21 +08:00
|
|
|
spin_unlock_irqrestore(&list->lock, flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(skb_append);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* skb_insert - insert a buffer
|
|
|
|
* @old: buffer to insert before
|
|
|
|
* @newsk: buffer to insert
|
2005-08-10 10:25:21 +08:00
|
|
|
* @list: list to use
|
|
|
|
*
|
|
|
|
* Place a packet before a given packet in a list. The list locks are
|
|
|
|
* taken and this function is atomic with respect to other list locked
|
|
|
|
* calls.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* A buffer cannot be placed on two lists at the same time.
|
|
|
|
*/
|
2005-08-10 10:25:21 +08:00
|
|
|
void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
2005-08-10 10:25:21 +08:00
|
|
|
spin_lock_irqsave(&list->lock, flags);
|
|
|
|
__skb_insert(newsk, old->prev, old, list);
|
|
|
|
spin_unlock_irqrestore(&list->lock, flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(skb_insert);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
static inline void skb_split_inside_header(struct sk_buff *skb,
|
|
|
|
struct sk_buff* skb1,
|
|
|
|
const u32 len, const int pos)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2007-03-28 05:55:52 +08:00
|
|
|
skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len),
|
|
|
|
pos - len);
|
2005-04-17 06:20:36 +08:00
|
|
|
/* And move data appendix as is. */
|
|
|
|
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
|
|
|
|
skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
|
|
|
|
|
|
|
|
skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
|
|
|
|
skb_shinfo(skb)->nr_frags = 0;
|
|
|
|
skb1->data_len = skb->data_len;
|
|
|
|
skb1->len += skb1->data_len;
|
|
|
|
skb->data_len = 0;
|
|
|
|
skb->len = len;
|
2007-04-20 11:29:13 +08:00
|
|
|
skb_set_tail_pointer(skb, len);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void skb_split_no_header(struct sk_buff *skb,
|
|
|
|
struct sk_buff* skb1,
|
|
|
|
const u32 len, int pos)
|
|
|
|
{
|
|
|
|
int i, k = 0;
|
|
|
|
const int nfrags = skb_shinfo(skb)->nr_frags;
|
|
|
|
|
|
|
|
skb_shinfo(skb)->nr_frags = 0;
|
|
|
|
skb1->len = skb1->data_len = skb->len - len;
|
|
|
|
skb->len = len;
|
|
|
|
skb->data_len = len - pos;
|
|
|
|
|
|
|
|
for (i = 0; i < nfrags; i++) {
|
2011-10-19 05:00:24 +08:00
|
|
|
int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (pos + size > len) {
|
|
|
|
skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
|
|
|
|
|
|
|
|
if (pos < len) {
|
|
|
|
/* Split frag.
|
|
|
|
* We have two variants in this case:
|
|
|
|
* 1. Move all the frag to the second
|
|
|
|
* part, if it is possible. F.e.
|
|
|
|
* this approach is mandatory for TUX,
|
|
|
|
* where splitting is expensive.
|
|
|
|
* 2. Split is accurately. We make this.
|
|
|
|
*/
|
2011-08-23 07:44:58 +08:00
|
|
|
skb_frag_ref(skb, i);
|
2005-04-17 06:20:36 +08:00
|
|
|
skb_shinfo(skb1)->frags[0].page_offset += len - pos;
|
2011-10-19 05:00:24 +08:00
|
|
|
skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
|
|
|
|
skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
|
2005-04-17 06:20:36 +08:00
|
|
|
skb_shinfo(skb)->nr_frags++;
|
|
|
|
}
|
|
|
|
k++;
|
|
|
|
} else
|
|
|
|
skb_shinfo(skb)->nr_frags++;
|
|
|
|
pos += size;
|
|
|
|
}
|
|
|
|
skb_shinfo(skb1)->nr_frags = k;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* skb_split - Split fragmented skb to two parts at length len.
|
|
|
|
* @skb: the buffer to split
|
|
|
|
* @skb1: the buffer to receive the second part
|
|
|
|
* @len: new length for skb
|
|
|
|
*/
|
|
|
|
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
|
|
|
|
{
|
|
|
|
int pos = skb_headlen(skb);
|
|
|
|
|
2013-02-20 06:51:30 +08:00
|
|
|
skb_shinfo(skb1)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG;
|
2005-04-17 06:20:36 +08:00
|
|
|
if (len < pos) /* Split line is inside header. */
|
|
|
|
skb_split_inside_header(skb, skb1, len, pos);
|
|
|
|
else /* Second chunk has no header, nothing to copy. */
|
|
|
|
skb_split_no_header(skb, skb1, len, pos);
|
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(skb_split);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-11-26 05:57:01 +08:00
|
|
|
/* Shifting from/to a cloned skb is a no-go.
|
|
|
|
*
|
|
|
|
* Caller cannot keep skb_shinfo related pointers past calling here!
|
|
|
|
*/
|
tcp: Try to restore large SKBs while SACK processing
During SACK processing, most of the benefits of TSO are eaten by
the SACK blocks that one-by-one fragment SKBs to MSS sized chunks.
Then we're in problems when cleanup work for them has to be done
when a large cumulative ACK comes. Try to return back to pre-split
state already while more and more SACK info gets discovered by
combining newly discovered SACK areas with the previous skb if
that's SACKed as well.
This approach has a number of benefits:
1) The processing overhead is spread more equally over the RTT
2) Write queue has less skbs to process (affect everything
which has to walk in the queue past the sacked areas)
3) Write queue is consistent whole the time, so no other parts
of TCP has to be aware of this (this was not the case with
some other approach that was, well, quite intrusive all
around).
4) Clean_rtx_queue can release most of the pages using single
put_page instead of previous PAGE_SIZE/mss+1 calls
In case a hole is fully filled by the new SACK block, we attempt
to combine the next skb too which allows construction of skbs
that are even larger than what tso split them to and it handles
hole per on every nth patterns that often occur during slow start
overshoot pretty nicely. Though this to be really useful also
a retransmission would have to get lost since cumulative ACKs
advance one hole at a time in the most typical case.
TODO: handle upwards only merging. That should be rather easy
when segment is fully sacked but I'm leaving that as future
work item (it won't make very large difference anyway since
this current approach already covers quite a lot of normal
cases).
I was earlier thinking of some sophisticated way of tracking
timestamps of the first and the last segment but later on
realized that it won't be that necessary at all to store the
timestamp of the last segment. The cases that can occur are
basically either:
1) ambiguous => no sensible measurement can be taken anyway
2) non-ambiguous is due to reordering => having the timestamp
of the last segment there is just skewing things more off
than does some good since the ack got triggered by one of
the holes (besides some substle issues that would make
determining right hole/skb even harder problem). Anyway,
it has nothing to do with this change then.
I choose to route some abnormal looking cases with goto noop,
some could be handled differently (eg., by stopping the
walking at that skb but again). In general, they either
shouldn't happen at all or are rare enough to make no difference
in practice.
In theory this change (as whole) could cause some macroscale
regression (global) because of cache misses that are taken over
the round-trip time but it gets very likely better because of much
less (local) cache misses per other write queue walkers and the
big recovery clearing cumulative ack.
Worth to note that these benefits would be very easy to get also
without TSO/GSO being on as long as the data is in pages so that
we can merge them. Currently I won't let that happen because
DSACK splitting at fragment that would mess up pcounts due to
sk_can_gso in tcp_set_skb_tso_segs. Once DSACKs fragments gets
avoided, we have some conditions that can be made less strict.
TODO: I will probably have to convert the excessive pointer
passing to struct sacktag_state... :-)
My testing revealed that considerable amount of skbs couldn't
be shifted because they were cloned (most likely still awaiting
tx reclaim)...
[The rest is considering future work instead since I got
repeatably EFAULT to tcpdump's recvfrom when I added
pskb_expand_head to deal with clones, so I separated that
into another, later patch]
...To counter that, I gave up on the fifth advantage:
5) When growing previous SACK block, less allocs for new skbs
are done, basically a new alloc is needed only when new hole
is detected and when the previous skb runs out of frags space
...which now only happens of if reclaim is fast enough to dispose
the clone before the SACK block comes in (the window is RTT long),
otherwise we'll have to alloc some.
With clones being handled I got these numbers (will be somewhat
worse without that), taken with fine-grained mibs:
TCPSackShifted 398
TCPSackMerged 877
TCPSackShiftFallback 320
TCPSACKCOLLAPSEFALLBACKGSO 0
TCPSACKCOLLAPSEFALLBACKSKBBITS 0
TCPSACKCOLLAPSEFALLBACKSKBDATA 0
TCPSACKCOLLAPSEFALLBACKBELOW 0
TCPSACKCOLLAPSEFALLBACKFIRST 1
TCPSACKCOLLAPSEFALLBACKPREVBITS 318
TCPSACKCOLLAPSEFALLBACKMSS 1
TCPSACKCOLLAPSEFALLBACKNOHEAD 0
TCPSACKCOLLAPSEFALLBACKSHIFT 0
TCPSACKCOLLAPSENOOPSEQ 0
TCPSACKCOLLAPSENOOPSMALLPCOUNT 0
TCPSACKCOLLAPSENOOPSMALLLEN 0
TCPSACKCOLLAPSEHOLE 12
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-25 13:20:15 +08:00
|
|
|
static int skb_prepare_for_shift(struct sk_buff *skb)
|
|
|
|
{
|
2008-11-25 13:30:21 +08:00
|
|
|
return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
|
tcp: Try to restore large SKBs while SACK processing
During SACK processing, most of the benefits of TSO are eaten by
the SACK blocks that one-by-one fragment SKBs to MSS sized chunks.
Then we're in problems when cleanup work for them has to be done
when a large cumulative ACK comes. Try to return back to pre-split
state already while more and more SACK info gets discovered by
combining newly discovered SACK areas with the previous skb if
that's SACKed as well.
This approach has a number of benefits:
1) The processing overhead is spread more equally over the RTT
2) Write queue has less skbs to process (affect everything
which has to walk in the queue past the sacked areas)
3) Write queue is consistent whole the time, so no other parts
of TCP has to be aware of this (this was not the case with
some other approach that was, well, quite intrusive all
around).
4) Clean_rtx_queue can release most of the pages using single
put_page instead of previous PAGE_SIZE/mss+1 calls
In case a hole is fully filled by the new SACK block, we attempt
to combine the next skb too which allows construction of skbs
that are even larger than what tso split them to and it handles
hole per on every nth patterns that often occur during slow start
overshoot pretty nicely. Though this to be really useful also
a retransmission would have to get lost since cumulative ACKs
advance one hole at a time in the most typical case.
TODO: handle upwards only merging. That should be rather easy
when segment is fully sacked but I'm leaving that as future
work item (it won't make very large difference anyway since
this current approach already covers quite a lot of normal
cases).
I was earlier thinking of some sophisticated way of tracking
timestamps of the first and the last segment but later on
realized that it won't be that necessary at all to store the
timestamp of the last segment. The cases that can occur are
basically either:
1) ambiguous => no sensible measurement can be taken anyway
2) non-ambiguous is due to reordering => having the timestamp
of the last segment there is just skewing things more off
than does some good since the ack got triggered by one of
the holes (besides some substle issues that would make
determining right hole/skb even harder problem). Anyway,
it has nothing to do with this change then.
I choose to route some abnormal looking cases with goto noop,
some could be handled differently (eg., by stopping the
walking at that skb but again). In general, they either
shouldn't happen at all or are rare enough to make no difference
in practice.
In theory this change (as whole) could cause some macroscale
regression (global) because of cache misses that are taken over
the round-trip time but it gets very likely better because of much
less (local) cache misses per other write queue walkers and the
big recovery clearing cumulative ack.
Worth to note that these benefits would be very easy to get also
without TSO/GSO being on as long as the data is in pages so that
we can merge them. Currently I won't let that happen because
DSACK splitting at fragment that would mess up pcounts due to
sk_can_gso in tcp_set_skb_tso_segs. Once DSACKs fragments gets
avoided, we have some conditions that can be made less strict.
TODO: I will probably have to convert the excessive pointer
passing to struct sacktag_state... :-)
My testing revealed that considerable amount of skbs couldn't
be shifted because they were cloned (most likely still awaiting
tx reclaim)...
[The rest is considering future work instead since I got
repeatably EFAULT to tcpdump's recvfrom when I added
pskb_expand_head to deal with clones, so I separated that
into another, later patch]
...To counter that, I gave up on the fifth advantage:
5) When growing previous SACK block, less allocs for new skbs
are done, basically a new alloc is needed only when new hole
is detected and when the previous skb runs out of frags space
...which now only happens of if reclaim is fast enough to dispose
the clone before the SACK block comes in (the window is RTT long),
otherwise we'll have to alloc some.
With clones being handled I got these numbers (will be somewhat
worse without that), taken with fine-grained mibs:
TCPSackShifted 398
TCPSackMerged 877
TCPSackShiftFallback 320
TCPSACKCOLLAPSEFALLBACKGSO 0
TCPSACKCOLLAPSEFALLBACKSKBBITS 0
TCPSACKCOLLAPSEFALLBACKSKBDATA 0
TCPSACKCOLLAPSEFALLBACKBELOW 0
TCPSACKCOLLAPSEFALLBACKFIRST 1
TCPSACKCOLLAPSEFALLBACKPREVBITS 318
TCPSACKCOLLAPSEFALLBACKMSS 1
TCPSACKCOLLAPSEFALLBACKNOHEAD 0
TCPSACKCOLLAPSEFALLBACKSHIFT 0
TCPSACKCOLLAPSENOOPSEQ 0
TCPSACKCOLLAPSENOOPSMALLPCOUNT 0
TCPSACKCOLLAPSENOOPSMALLLEN 0
TCPSACKCOLLAPSEHOLE 12
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-25 13:20:15 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* skb_shift - Shifts paged data partially from skb to another
|
|
|
|
* @tgt: buffer into which tail data gets added
|
|
|
|
* @skb: buffer from which the paged data comes from
|
|
|
|
* @shiftlen: shift up to this many bytes
|
|
|
|
*
|
|
|
|
* Attempts to shift up to shiftlen worth of bytes, which may be less than
|
2011-11-21 09:47:11 +08:00
|
|
|
* the length of the skb, from skb to tgt. Returns the number of bytes shifted.
|
tcp: Try to restore large SKBs while SACK processing
During SACK processing, most of the benefits of TSO are eaten by
the SACK blocks that one-by-one fragment SKBs to MSS sized chunks.
Then we're in problems when cleanup work for them has to be done
when a large cumulative ACK comes. Try to return back to pre-split
state already while more and more SACK info gets discovered by
combining newly discovered SACK areas with the previous skb if
that's SACKed as well.
This approach has a number of benefits:
1) The processing overhead is spread more equally over the RTT
2) Write queue has less skbs to process (affect everything
which has to walk in the queue past the sacked areas)
3) Write queue is consistent whole the time, so no other parts
of TCP has to be aware of this (this was not the case with
some other approach that was, well, quite intrusive all
around).
4) Clean_rtx_queue can release most of the pages using single
put_page instead of previous PAGE_SIZE/mss+1 calls
In case a hole is fully filled by the new SACK block, we attempt
to combine the next skb too which allows construction of skbs
that are even larger than what tso split them to and it handles
hole per on every nth patterns that often occur during slow start
overshoot pretty nicely. Though this to be really useful also
a retransmission would have to get lost since cumulative ACKs
advance one hole at a time in the most typical case.
TODO: handle upwards only merging. That should be rather easy
when segment is fully sacked but I'm leaving that as future
work item (it won't make very large difference anyway since
this current approach already covers quite a lot of normal
cases).
I was earlier thinking of some sophisticated way of tracking
timestamps of the first and the last segment but later on
realized that it won't be that necessary at all to store the
timestamp of the last segment. The cases that can occur are
basically either:
1) ambiguous => no sensible measurement can be taken anyway
2) non-ambiguous is due to reordering => having the timestamp
of the last segment there is just skewing things more off
than does some good since the ack got triggered by one of
the holes (besides some substle issues that would make
determining right hole/skb even harder problem). Anyway,
it has nothing to do with this change then.
I choose to route some abnormal looking cases with goto noop,
some could be handled differently (eg., by stopping the
walking at that skb but again). In general, they either
shouldn't happen at all or are rare enough to make no difference
in practice.
In theory this change (as whole) could cause some macroscale
regression (global) because of cache misses that are taken over
the round-trip time but it gets very likely better because of much
less (local) cache misses per other write queue walkers and the
big recovery clearing cumulative ack.
Worth to note that these benefits would be very easy to get also
without TSO/GSO being on as long as the data is in pages so that
we can merge them. Currently I won't let that happen because
DSACK splitting at fragment that would mess up pcounts due to
sk_can_gso in tcp_set_skb_tso_segs. Once DSACKs fragments gets
avoided, we have some conditions that can be made less strict.
TODO: I will probably have to convert the excessive pointer
passing to struct sacktag_state... :-)
My testing revealed that considerable amount of skbs couldn't
be shifted because they were cloned (most likely still awaiting
tx reclaim)...
[The rest is considering future work instead since I got
repeatably EFAULT to tcpdump's recvfrom when I added
pskb_expand_head to deal with clones, so I separated that
into another, later patch]
...To counter that, I gave up on the fifth advantage:
5) When growing previous SACK block, less allocs for new skbs
are done, basically a new alloc is needed only when new hole
is detected and when the previous skb runs out of frags space
...which now only happens of if reclaim is fast enough to dispose
the clone before the SACK block comes in (the window is RTT long),
otherwise we'll have to alloc some.
With clones being handled I got these numbers (will be somewhat
worse without that), taken with fine-grained mibs:
TCPSackShifted 398
TCPSackMerged 877
TCPSackShiftFallback 320
TCPSACKCOLLAPSEFALLBACKGSO 0
TCPSACKCOLLAPSEFALLBACKSKBBITS 0
TCPSACKCOLLAPSEFALLBACKSKBDATA 0
TCPSACKCOLLAPSEFALLBACKBELOW 0
TCPSACKCOLLAPSEFALLBACKFIRST 1
TCPSACKCOLLAPSEFALLBACKPREVBITS 318
TCPSACKCOLLAPSEFALLBACKMSS 1
TCPSACKCOLLAPSEFALLBACKNOHEAD 0
TCPSACKCOLLAPSEFALLBACKSHIFT 0
TCPSACKCOLLAPSENOOPSEQ 0
TCPSACKCOLLAPSENOOPSMALLPCOUNT 0
TCPSACKCOLLAPSENOOPSMALLLEN 0
TCPSACKCOLLAPSEHOLE 12
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-25 13:20:15 +08:00
|
|
|
* It's up to caller to free skb if everything was shifted.
|
|
|
|
*
|
|
|
|
* If @tgt runs out of frags, the whole operation is aborted.
|
|
|
|
*
|
|
|
|
* Skb cannot include anything else but paged data while tgt is allowed
|
|
|
|
* to have non-paged data as well.
|
|
|
|
*
|
|
|
|
* TODO: full sized shift could be optimized but that would need
|
|
|
|
* specialized skb free'er to handle frags without up-to-date nr_frags.
|
|
|
|
*/
|
|
|
|
int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
|
|
|
|
{
|
|
|
|
int from, to, merge, todo;
|
|
|
|
struct skb_frag_struct *fragfrom, *fragto;
|
|
|
|
|
|
|
|
BUG_ON(shiftlen > skb->len);
|
|
|
|
BUG_ON(skb_headlen(skb)); /* Would corrupt stream */
|
|
|
|
|
|
|
|
todo = shiftlen;
|
|
|
|
from = 0;
|
|
|
|
to = skb_shinfo(tgt)->nr_frags;
|
|
|
|
fragfrom = &skb_shinfo(skb)->frags[from];
|
|
|
|
|
|
|
|
/* Actual merge is delayed until the point when we know we can
|
|
|
|
* commit all, so that we don't have to undo partial changes
|
|
|
|
*/
|
|
|
|
if (!to ||
|
2011-08-23 07:44:58 +08:00
|
|
|
!skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
|
|
|
|
fragfrom->page_offset)) {
|
tcp: Try to restore large SKBs while SACK processing
During SACK processing, most of the benefits of TSO are eaten by
the SACK blocks that one-by-one fragment SKBs to MSS sized chunks.
Then we're in problems when cleanup work for them has to be done
when a large cumulative ACK comes. Try to return back to pre-split
state already while more and more SACK info gets discovered by
combining newly discovered SACK areas with the previous skb if
that's SACKed as well.
This approach has a number of benefits:
1) The processing overhead is spread more equally over the RTT
2) Write queue has less skbs to process (affect everything
which has to walk in the queue past the sacked areas)
3) Write queue is consistent whole the time, so no other parts
of TCP has to be aware of this (this was not the case with
some other approach that was, well, quite intrusive all
around).
4) Clean_rtx_queue can release most of the pages using single
put_page instead of previous PAGE_SIZE/mss+1 calls
In case a hole is fully filled by the new SACK block, we attempt
to combine the next skb too which allows construction of skbs
that are even larger than what tso split them to and it handles
hole per on every nth patterns that often occur during slow start
overshoot pretty nicely. Though this to be really useful also
a retransmission would have to get lost since cumulative ACKs
advance one hole at a time in the most typical case.
TODO: handle upwards only merging. That should be rather easy
when segment is fully sacked but I'm leaving that as future
work item (it won't make very large difference anyway since
this current approach already covers quite a lot of normal
cases).
I was earlier thinking of some sophisticated way of tracking
timestamps of the first and the last segment but later on
realized that it won't be that necessary at all to store the
timestamp of the last segment. The cases that can occur are
basically either:
1) ambiguous => no sensible measurement can be taken anyway
2) non-ambiguous is due to reordering => having the timestamp
of the last segment there is just skewing things more off
than does some good since the ack got triggered by one of
the holes (besides some subtle issues that would make
determining right hole/skb even harder problem). Anyway,
it has nothing to do with this change then.
I choose to route some abnormal looking cases with goto noop,
some could be handled differently (eg., by stopping the
walking at that skb but again). In general, they either
shouldn't happen at all or are rare enough to make no difference
in practice.
In theory this change (as whole) could cause some macroscale
regression (global) because of cache misses that are taken over
the round-trip time but it gets very likely better because of much
less (local) cache misses per other write queue walkers and the
big recovery clearing cumulative ack.
Worth to note that these benefits would be very easy to get also
without TSO/GSO being on as long as the data is in pages so that
we can merge them. Currently I won't let that happen because
DSACK splitting at fragment that would mess up pcounts due to
sk_can_gso in tcp_set_skb_tso_segs. Once DSACKs fragments gets
avoided, we have some conditions that can be made less strict.
TODO: I will probably have to convert the excessive pointer
passing to struct sacktag_state... :-)
My testing revealed that considerable amount of skbs couldn't
be shifted because they were cloned (most likely still awaiting
tx reclaim)...
[The rest is considering future work instead since I got
repeatably EFAULT to tcpdump's recvfrom when I added
pskb_expand_head to deal with clones, so I separated that
into another, later patch]
...To counter that, I gave up on the fifth advantage:
5) When growing previous SACK block, less allocs for new skbs
are done, basically a new alloc is needed only when new hole
is detected and when the previous skb runs out of frags space
...which now only happens if reclaim is fast enough to dispose
the clone before the SACK block comes in (the window is RTT long),
otherwise we'll have to alloc some.
With clones being handled I got these numbers (will be somewhat
worse without that), taken with fine-grained mibs:
TCPSackShifted 398
TCPSackMerged 877
TCPSackShiftFallback 320
TCPSACKCOLLAPSEFALLBACKGSO 0
TCPSACKCOLLAPSEFALLBACKSKBBITS 0
TCPSACKCOLLAPSEFALLBACKSKBDATA 0
TCPSACKCOLLAPSEFALLBACKBELOW 0
TCPSACKCOLLAPSEFALLBACKFIRST 1
TCPSACKCOLLAPSEFALLBACKPREVBITS 318
TCPSACKCOLLAPSEFALLBACKMSS 1
TCPSACKCOLLAPSEFALLBACKNOHEAD 0
TCPSACKCOLLAPSEFALLBACKSHIFT 0
TCPSACKCOLLAPSENOOPSEQ 0
TCPSACKCOLLAPSENOOPSMALLPCOUNT 0
TCPSACKCOLLAPSENOOPSMALLLEN 0
TCPSACKCOLLAPSEHOLE 12
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-25 13:20:15 +08:00
|
|
|
merge = -1;
|
|
|
|
} else {
|
|
|
|
merge = to - 1;
|
|
|
|
|
2011-10-19 05:00:24 +08:00
|
|
|
todo -= skb_frag_size(fragfrom);
|
tcp: Try to restore large SKBs while SACK processing
During SACK processing, most of the benefits of TSO are eaten by
the SACK blocks that one-by-one fragment SKBs to MSS sized chunks.
Then we're in problems when cleanup work for them has to be done
when a large cumulative ACK comes. Try to return back to pre-split
state already while more and more SACK info gets discovered by
combining newly discovered SACK areas with the previous skb if
that's SACKed as well.
This approach has a number of benefits:
1) The processing overhead is spread more equally over the RTT
2) Write queue has less skbs to process (affect everything
which has to walk in the queue past the sacked areas)
3) Write queue is consistent whole the time, so no other parts
of TCP has to be aware of this (this was not the case with
some other approach that was, well, quite intrusive all
around).
4) Clean_rtx_queue can release most of the pages using single
put_page instead of previous PAGE_SIZE/mss+1 calls
In case a hole is fully filled by the new SACK block, we attempt
to combine the next skb too which allows construction of skbs
that are even larger than what tso split them to and it handles
hole per on every nth patterns that often occur during slow start
overshoot pretty nicely. Though this to be really useful also
a retransmission would have to get lost since cumulative ACKs
advance one hole at a time in the most typical case.
TODO: handle upwards only merging. That should be rather easy
when segment is fully sacked but I'm leaving that as future
work item (it won't make very large difference anyway since
this current approach already covers quite a lot of normal
cases).
I was earlier thinking of some sophisticated way of tracking
timestamps of the first and the last segment but later on
realized that it won't be that necessary at all to store the
timestamp of the last segment. The cases that can occur are
basically either:
1) ambiguous => no sensible measurement can be taken anyway
2) non-ambiguous is due to reordering => having the timestamp
of the last segment there is just skewing things more off
than does some good since the ack got triggered by one of
the holes (besides some subtle issues that would make
determining right hole/skb even harder problem). Anyway,
it has nothing to do with this change then.
I choose to route some abnormal looking cases with goto noop,
some could be handled differently (eg., by stopping the
walking at that skb but again). In general, they either
shouldn't happen at all or are rare enough to make no difference
in practice.
In theory this change (as whole) could cause some macroscale
regression (global) because of cache misses that are taken over
the round-trip time but it gets very likely better because of much
less (local) cache misses per other write queue walkers and the
big recovery clearing cumulative ack.
Worth to note that these benefits would be very easy to get also
without TSO/GSO being on as long as the data is in pages so that
we can merge them. Currently I won't let that happen because
DSACK splitting at fragment that would mess up pcounts due to
sk_can_gso in tcp_set_skb_tso_segs. Once DSACKs fragments gets
avoided, we have some conditions that can be made less strict.
TODO: I will probably have to convert the excessive pointer
passing to struct sacktag_state... :-)
My testing revealed that considerable amount of skbs couldn't
be shifted because they were cloned (most likely still awaiting
tx reclaim)...
[The rest is considering future work instead since I got
repeatably EFAULT to tcpdump's recvfrom when I added
pskb_expand_head to deal with clones, so I separated that
into another, later patch]
...To counter that, I gave up on the fifth advantage:
5) When growing previous SACK block, less allocs for new skbs
are done, basically a new alloc is needed only when new hole
is detected and when the previous skb runs out of frags space
...which now only happens if reclaim is fast enough to dispose
the clone before the SACK block comes in (the window is RTT long),
otherwise we'll have to alloc some.
With clones being handled I got these numbers (will be somewhat
worse without that), taken with fine-grained mibs:
TCPSackShifted 398
TCPSackMerged 877
TCPSackShiftFallback 320
TCPSACKCOLLAPSEFALLBACKGSO 0
TCPSACKCOLLAPSEFALLBACKSKBBITS 0
TCPSACKCOLLAPSEFALLBACKSKBDATA 0
TCPSACKCOLLAPSEFALLBACKBELOW 0
TCPSACKCOLLAPSEFALLBACKFIRST 1
TCPSACKCOLLAPSEFALLBACKPREVBITS 318
TCPSACKCOLLAPSEFALLBACKMSS 1
TCPSACKCOLLAPSEFALLBACKNOHEAD 0
TCPSACKCOLLAPSEFALLBACKSHIFT 0
TCPSACKCOLLAPSENOOPSEQ 0
TCPSACKCOLLAPSENOOPSMALLPCOUNT 0
TCPSACKCOLLAPSENOOPSMALLLEN 0
TCPSACKCOLLAPSEHOLE 12
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-25 13:20:15 +08:00
|
|
|
if (todo < 0) {
|
|
|
|
if (skb_prepare_for_shift(skb) ||
|
|
|
|
skb_prepare_for_shift(tgt))
|
|
|
|
return 0;
|
|
|
|
|
2008-11-26 05:57:01 +08:00
|
|
|
/* All previous frag pointers might be stale! */
|
|
|
|
fragfrom = &skb_shinfo(skb)->frags[from];
|
tcp: Try to restore large SKBs while SACK processing
During SACK processing, most of the benefits of TSO are eaten by
the SACK blocks that one-by-one fragment SKBs to MSS sized chunks.
Then we're in problems when cleanup work for them has to be done
when a large cumulative ACK comes. Try to return back to pre-split
state already while more and more SACK info gets discovered by
combining newly discovered SACK areas with the previous skb if
that's SACKed as well.
This approach has a number of benefits:
1) The processing overhead is spread more equally over the RTT
2) Write queue has less skbs to process (affect everything
which has to walk in the queue past the sacked areas)
3) Write queue is consistent whole the time, so no other parts
of TCP has to be aware of this (this was not the case with
some other approach that was, well, quite intrusive all
around).
4) Clean_rtx_queue can release most of the pages using single
put_page instead of previous PAGE_SIZE/mss+1 calls
In case a hole is fully filled by the new SACK block, we attempt
to combine the next skb too which allows construction of skbs
that are even larger than what tso split them to and it handles
hole per on every nth patterns that often occur during slow start
overshoot pretty nicely. Though this to be really useful also
a retransmission would have to get lost since cumulative ACKs
advance one hole at a time in the most typical case.
TODO: handle upwards only merging. That should be rather easy
when segment is fully sacked but I'm leaving that as future
work item (it won't make very large difference anyway since
this current approach already covers quite a lot of normal
cases).
I was earlier thinking of some sophisticated way of tracking
timestamps of the first and the last segment but later on
realized that it won't be that necessary at all to store the
timestamp of the last segment. The cases that can occur are
basically either:
1) ambiguous => no sensible measurement can be taken anyway
2) non-ambiguous is due to reordering => having the timestamp
of the last segment there is just skewing things more off
than does some good since the ack got triggered by one of
the holes (besides some subtle issues that would make
determining right hole/skb even harder problem). Anyway,
it has nothing to do with this change then.
I choose to route some abnormal looking cases with goto noop,
some could be handled differently (eg., by stopping the
walking at that skb but again). In general, they either
shouldn't happen at all or are rare enough to make no difference
in practice.
In theory this change (as whole) could cause some macroscale
regression (global) because of cache misses that are taken over
the round-trip time but it gets very likely better because of much
less (local) cache misses per other write queue walkers and the
big recovery clearing cumulative ack.
Worth to note that these benefits would be very easy to get also
without TSO/GSO being on as long as the data is in pages so that
we can merge them. Currently I won't let that happen because
DSACK splitting at fragment that would mess up pcounts due to
sk_can_gso in tcp_set_skb_tso_segs. Once DSACKs fragments gets
avoided, we have some conditions that can be made less strict.
TODO: I will probably have to convert the excessive pointer
passing to struct sacktag_state... :-)
My testing revealed that considerable amount of skbs couldn't
be shifted because they were cloned (most likely still awaiting
tx reclaim)...
[The rest is considering future work instead since I got
repeatably EFAULT to tcpdump's recvfrom when I added
pskb_expand_head to deal with clones, so I separated that
into another, later patch]
...To counter that, I gave up on the fifth advantage:
5) When growing previous SACK block, less allocs for new skbs
are done, basically a new alloc is needed only when new hole
is detected and when the previous skb runs out of frags space
...which now only happens if reclaim is fast enough to dispose
the clone before the SACK block comes in (the window is RTT long),
otherwise we'll have to alloc some.
With clones being handled I got these numbers (will be somewhat
worse without that), taken with fine-grained mibs:
TCPSackShifted 398
TCPSackMerged 877
TCPSackShiftFallback 320
TCPSACKCOLLAPSEFALLBACKGSO 0
TCPSACKCOLLAPSEFALLBACKSKBBITS 0
TCPSACKCOLLAPSEFALLBACKSKBDATA 0
TCPSACKCOLLAPSEFALLBACKBELOW 0
TCPSACKCOLLAPSEFALLBACKFIRST 1
TCPSACKCOLLAPSEFALLBACKPREVBITS 318
TCPSACKCOLLAPSEFALLBACKMSS 1
TCPSACKCOLLAPSEFALLBACKNOHEAD 0
TCPSACKCOLLAPSEFALLBACKSHIFT 0
TCPSACKCOLLAPSENOOPSEQ 0
TCPSACKCOLLAPSENOOPSMALLPCOUNT 0
TCPSACKCOLLAPSENOOPSMALLLEN 0
TCPSACKCOLLAPSEHOLE 12
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-25 13:20:15 +08:00
|
|
|
fragto = &skb_shinfo(tgt)->frags[merge];
|
|
|
|
|
2011-10-19 05:00:24 +08:00
|
|
|
skb_frag_size_add(fragto, shiftlen);
|
|
|
|
skb_frag_size_sub(fragfrom, shiftlen);
|
tcp: Try to restore large SKBs while SACK processing
During SACK processing, most of the benefits of TSO are eaten by
the SACK blocks that one-by-one fragment SKBs to MSS sized chunks.
Then we're in problems when cleanup work for them has to be done
when a large cumulative ACK comes. Try to return back to pre-split
state already while more and more SACK info gets discovered by
combining newly discovered SACK areas with the previous skb if
that's SACKed as well.
This approach has a number of benefits:
1) The processing overhead is spread more equally over the RTT
2) Write queue has less skbs to process (affect everything
which has to walk in the queue past the sacked areas)
3) Write queue is consistent whole the time, so no other parts
of TCP has to be aware of this (this was not the case with
some other approach that was, well, quite intrusive all
around).
4) Clean_rtx_queue can release most of the pages using single
put_page instead of previous PAGE_SIZE/mss+1 calls
In case a hole is fully filled by the new SACK block, we attempt
to combine the next skb too which allows construction of skbs
that are even larger than what tso split them to and it handles
hole per on every nth patterns that often occur during slow start
overshoot pretty nicely. Though this to be really useful also
a retransmission would have to get lost since cumulative ACKs
advance one hole at a time in the most typical case.
TODO: handle upwards only merging. That should be rather easy
when segment is fully sacked but I'm leaving that as future
work item (it won't make very large difference anyway since
this current approach already covers quite a lot of normal
cases).
I was earlier thinking of some sophisticated way of tracking
timestamps of the first and the last segment but later on
realized that it won't be that necessary at all to store the
timestamp of the last segment. The cases that can occur are
basically either:
1) ambiguous => no sensible measurement can be taken anyway
2) non-ambiguous is due to reordering => having the timestamp
of the last segment there is just skewing things more off
than does some good since the ack got triggered by one of
the holes (besides some subtle issues that would make
determining right hole/skb even harder problem). Anyway,
it has nothing to do with this change then.
I choose to route some abnormal looking cases with goto noop,
some could be handled differently (eg., by stopping the
walking at that skb but again). In general, they either
shouldn't happen at all or are rare enough to make no difference
in practice.
In theory this change (as whole) could cause some macroscale
regression (global) because of cache misses that are taken over
the round-trip time but it gets very likely better because of much
less (local) cache misses per other write queue walkers and the
big recovery clearing cumulative ack.
Worth to note that these benefits would be very easy to get also
without TSO/GSO being on as long as the data is in pages so that
we can merge them. Currently I won't let that happen because
DSACK splitting at fragment that would mess up pcounts due to
sk_can_gso in tcp_set_skb_tso_segs. Once DSACKs fragments gets
avoided, we have some conditions that can be made less strict.
TODO: I will probably have to convert the excessive pointer
passing to struct sacktag_state... :-)
My testing revealed that considerable amount of skbs couldn't
be shifted because they were cloned (most likely still awaiting
tx reclaim)...
[The rest is considering future work instead since I got
repeatably EFAULT to tcpdump's recvfrom when I added
pskb_expand_head to deal with clones, so I separated that
into another, later patch]
...To counter that, I gave up on the fifth advantage:
5) When growing previous SACK block, less allocs for new skbs
are done, basically a new alloc is needed only when new hole
is detected and when the previous skb runs out of frags space
...which now only happens if reclaim is fast enough to dispose
the clone before the SACK block comes in (the window is RTT long),
otherwise we'll have to alloc some.
With clones being handled I got these numbers (will be somewhat
worse without that), taken with fine-grained mibs:
TCPSackShifted 398
TCPSackMerged 877
TCPSackShiftFallback 320
TCPSACKCOLLAPSEFALLBACKGSO 0
TCPSACKCOLLAPSEFALLBACKSKBBITS 0
TCPSACKCOLLAPSEFALLBACKSKBDATA 0
TCPSACKCOLLAPSEFALLBACKBELOW 0
TCPSACKCOLLAPSEFALLBACKFIRST 1
TCPSACKCOLLAPSEFALLBACKPREVBITS 318
TCPSACKCOLLAPSEFALLBACKMSS 1
TCPSACKCOLLAPSEFALLBACKNOHEAD 0
TCPSACKCOLLAPSEFALLBACKSHIFT 0
TCPSACKCOLLAPSENOOPSEQ 0
TCPSACKCOLLAPSENOOPSMALLPCOUNT 0
TCPSACKCOLLAPSENOOPSMALLLEN 0
TCPSACKCOLLAPSEHOLE 12
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-25 13:20:15 +08:00
|
|
|
fragfrom->page_offset += shiftlen;
|
|
|
|
|
|
|
|
goto onlymerged;
|
|
|
|
}
|
|
|
|
|
|
|
|
from++;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Skip full, not-fitting skb to avoid expensive operations */
|
|
|
|
if ((shiftlen == skb->len) &&
|
|
|
|
(skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
|
|
|
|
if (to == MAX_SKB_FRAGS)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
fragfrom = &skb_shinfo(skb)->frags[from];
|
|
|
|
fragto = &skb_shinfo(tgt)->frags[to];
|
|
|
|
|
2011-10-19 05:00:24 +08:00
|
|
|
if (todo >= skb_frag_size(fragfrom)) {
|
tcp: Try to restore large SKBs while SACK processing
During SACK processing, most of the benefits of TSO are eaten by
the SACK blocks that one-by-one fragment SKBs to MSS sized chunks.
Then we're in problems when cleanup work for them has to be done
when a large cumulative ACK comes. Try to return back to pre-split
state already while more and more SACK info gets discovered by
combining newly discovered SACK areas with the previous skb if
that's SACKed as well.
This approach has a number of benefits:
1) The processing overhead is spread more equally over the RTT
2) Write queue has less skbs to process (affect everything
which has to walk in the queue past the sacked areas)
3) Write queue is consistent whole the time, so no other parts
of TCP has to be aware of this (this was not the case with
some other approach that was, well, quite intrusive all
around).
4) Clean_rtx_queue can release most of the pages using single
put_page instead of previous PAGE_SIZE/mss+1 calls
In case a hole is fully filled by the new SACK block, we attempt
to combine the next skb too which allows construction of skbs
that are even larger than what tso split them to and it handles
hole per on every nth patterns that often occur during slow start
overshoot pretty nicely. Though this to be really useful also
a retransmission would have to get lost since cumulative ACKs
advance one hole at a time in the most typical case.
TODO: handle upwards only merging. That should be rather easy
when segment is fully sacked but I'm leaving that as future
work item (it won't make very large difference anyway since
this current approach already covers quite a lot of normal
cases).
I was earlier thinking of some sophisticated way of tracking
timestamps of the first and the last segment but later on
realized that it won't be that necessary at all to store the
timestamp of the last segment. The cases that can occur are
basically either:
1) ambiguous => no sensible measurement can be taken anyway
2) non-ambiguous is due to reordering => having the timestamp
of the last segment there is just skewing things more off
than does some good since the ack got triggered by one of
the holes (besides some subtle issues that would make
determining right hole/skb even harder problem). Anyway,
it has nothing to do with this change then.
I choose to route some abnormal looking cases with goto noop,
some could be handled differently (eg., by stopping the
walking at that skb but again). In general, they either
shouldn't happen at all or are rare enough to make no difference
in practice.
In theory this change (as whole) could cause some macroscale
regression (global) because of cache misses that are taken over
the round-trip time but it gets very likely better because of much
less (local) cache misses per other write queue walkers and the
big recovery clearing cumulative ack.
Worth to note that these benefits would be very easy to get also
without TSO/GSO being on as long as the data is in pages so that
we can merge them. Currently I won't let that happen because
DSACK splitting at fragment that would mess up pcounts due to
sk_can_gso in tcp_set_skb_tso_segs. Once DSACKs fragments gets
avoided, we have some conditions that can be made less strict.
TODO: I will probably have to convert the excessive pointer
passing to struct sacktag_state... :-)
My testing revealed that considerable amount of skbs couldn't
be shifted because they were cloned (most likely still awaiting
tx reclaim)...
[The rest is considering future work instead since I got
repeatably EFAULT to tcpdump's recvfrom when I added
pskb_expand_head to deal with clones, so I separated that
into another, later patch]
...To counter that, I gave up on the fifth advantage:
5) When growing previous SACK block, less allocs for new skbs
are done, basically a new alloc is needed only when new hole
is detected and when the previous skb runs out of frags space
...which now only happens if reclaim is fast enough to dispose
the clone before the SACK block comes in (the window is RTT long),
otherwise we'll have to alloc some.
With clones being handled I got these numbers (will be somewhat
worse without that), taken with fine-grained mibs:
TCPSackShifted 398
TCPSackMerged 877
TCPSackShiftFallback 320
TCPSACKCOLLAPSEFALLBACKGSO 0
TCPSACKCOLLAPSEFALLBACKSKBBITS 0
TCPSACKCOLLAPSEFALLBACKSKBDATA 0
TCPSACKCOLLAPSEFALLBACKBELOW 0
TCPSACKCOLLAPSEFALLBACKFIRST 1
TCPSACKCOLLAPSEFALLBACKPREVBITS 318
TCPSACKCOLLAPSEFALLBACKMSS 1
TCPSACKCOLLAPSEFALLBACKNOHEAD 0
TCPSACKCOLLAPSEFALLBACKSHIFT 0
TCPSACKCOLLAPSENOOPSEQ 0
TCPSACKCOLLAPSENOOPSMALLPCOUNT 0
TCPSACKCOLLAPSENOOPSMALLLEN 0
TCPSACKCOLLAPSEHOLE 12
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-25 13:20:15 +08:00
|
|
|
*fragto = *fragfrom;
|
2011-10-19 05:00:24 +08:00
|
|
|
todo -= skb_frag_size(fragfrom);
|
tcp: Try to restore large SKBs while SACK processing
During SACK processing, most of the benefits of TSO are eaten by
the SACK blocks that one-by-one fragment SKBs to MSS sized chunks.
Then we're in problems when cleanup work for them has to be done
when a large cumulative ACK comes. Try to return back to pre-split
state already while more and more SACK info gets discovered by
combining newly discovered SACK areas with the previous skb if
that's SACKed as well.
This approach has a number of benefits:
1) The processing overhead is spread more equally over the RTT
2) Write queue has less skbs to process (affect everything
which has to walk in the queue past the sacked areas)
3) Write queue is consistent whole the time, so no other parts
of TCP has to be aware of this (this was not the case with
some other approach that was, well, quite intrusive all
around).
4) Clean_rtx_queue can release most of the pages using single
put_page instead of previous PAGE_SIZE/mss+1 calls
In case a hole is fully filled by the new SACK block, we attempt
to combine the next skb too which allows construction of skbs
that are even larger than what tso split them to and it handles
hole per on every nth patterns that often occur during slow start
overshoot pretty nicely. Though this to be really useful also
a retransmission would have to get lost since cumulative ACKs
advance one hole at a time in the most typical case.
TODO: handle upwards only merging. That should be rather easy
when segment is fully sacked but I'm leaving that as future
work item (it won't make very large difference anyway since
this current approach already covers quite a lot of normal
cases).
I was earlier thinking of some sophisticated way of tracking
timestamps of the first and the last segment but later on
realized that it won't be that necessary at all to store the
timestamp of the last segment. The cases that can occur are
basically either:
1) ambiguous => no sensible measurement can be taken anyway
2) non-ambiguous is due to reordering => having the timestamp
of the last segment there is just skewing things more off
than does some good since the ack got triggered by one of
the holes (besides some subtle issues that would make
determining right hole/skb even harder problem). Anyway,
it has nothing to do with this change then.
I choose to route some abnormal looking cases with goto noop,
some could be handled differently (eg., by stopping the
walking at that skb but again). In general, they either
shouldn't happen at all or are rare enough to make no difference
in practice.
In theory this change (as whole) could cause some macroscale
regression (global) because of cache misses that are taken over
the round-trip time but it gets very likely better because of much
less (local) cache misses per other write queue walkers and the
big recovery clearing cumulative ack.
Worth to note that these benefits would be very easy to get also
without TSO/GSO being on as long as the data is in pages so that
we can merge them. Currently I won't let that happen because
DSACK splitting at fragment that would mess up pcounts due to
sk_can_gso in tcp_set_skb_tso_segs. Once DSACKs fragments gets
avoided, we have some conditions that can be made less strict.
TODO: I will probably have to convert the excessive pointer
passing to struct sacktag_state... :-)
My testing revealed that considerable amount of skbs couldn't
be shifted because they were cloned (most likely still awaiting
tx reclaim)...
[The rest is considering future work instead since I got
repeatably EFAULT to tcpdump's recvfrom when I added
pskb_expand_head to deal with clones, so I separated that
into another, later patch]
...To counter that, I gave up on the fifth advantage:
5) When growing previous SACK block, less allocs for new skbs
are done, basically a new alloc is needed only when new hole
is detected and when the previous skb runs out of frags space
...which now only happens if reclaim is fast enough to dispose
the clone before the SACK block comes in (the window is RTT long),
otherwise we'll have to alloc some.
With clones being handled I got these numbers (will be somewhat
worse without that), taken with fine-grained mibs:
TCPSackShifted 398
TCPSackMerged 877
TCPSackShiftFallback 320
TCPSACKCOLLAPSEFALLBACKGSO 0
TCPSACKCOLLAPSEFALLBACKSKBBITS 0
TCPSACKCOLLAPSEFALLBACKSKBDATA 0
TCPSACKCOLLAPSEFALLBACKBELOW 0
TCPSACKCOLLAPSEFALLBACKFIRST 1
TCPSACKCOLLAPSEFALLBACKPREVBITS 318
TCPSACKCOLLAPSEFALLBACKMSS 1
TCPSACKCOLLAPSEFALLBACKNOHEAD 0
TCPSACKCOLLAPSEFALLBACKSHIFT 0
TCPSACKCOLLAPSENOOPSEQ 0
TCPSACKCOLLAPSENOOPSMALLPCOUNT 0
TCPSACKCOLLAPSENOOPSMALLLEN 0
TCPSACKCOLLAPSEHOLE 12
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-25 13:20:15 +08:00
|
|
|
from++;
|
|
|
|
to++;
|
|
|
|
|
|
|
|
} else {
|
2011-08-23 07:44:58 +08:00
|
|
|
__skb_frag_ref(fragfrom);
|
tcp: Try to restore large SKBs while SACK processing
During SACK processing, most of the benefits of TSO are eaten by
the SACK blocks that one-by-one fragment SKBs to MSS sized chunks.
Then we're in problems when cleanup work for them has to be done
when a large cumulative ACK comes. Try to return back to pre-split
state already while more and more SACK info gets discovered by
combining newly discovered SACK areas with the previous skb if
that's SACKed as well.
This approach has a number of benefits:
1) The processing overhead is spread more equally over the RTT
2) Write queue has less skbs to process (affect everything
which has to walk in the queue past the sacked areas)
3) Write queue is consistent whole the time, so no other parts
of TCP has to be aware of this (this was not the case with
some other approach that was, well, quite intrusive all
around).
4) Clean_rtx_queue can release most of the pages using single
put_page instead of previous PAGE_SIZE/mss+1 calls
In case a hole is fully filled by the new SACK block, we attempt
to combine the next skb too which allows construction of skbs
that are even larger than what tso split them to and it handles
hole per on every nth patterns that often occur during slow start
overshoot pretty nicely. Though this to be really useful also
a retransmission would have to get lost since cumulative ACKs
advance one hole at a time in the most typical case.
TODO: handle upwards only merging. That should be rather easy
when segment is fully sacked but I'm leaving that as future
work item (it won't make very large difference anyway since
this current approach already covers quite a lot of normal
cases).
I was earlier thinking of some sophisticated way of tracking
timestamps of the first and the last segment but later on
realized that it won't be that necessary at all to store the
timestamp of the last segment. The cases that can occur are
basically either:
1) ambiguous => no sensible measurement can be taken anyway
2) non-ambiguous is due to reordering => having the timestamp
of the last segment there is just skewing things more off
than does some good since the ack got triggered by one of
the holes (besides some subtle issues that would make
determining right hole/skb even harder problem). Anyway,
it has nothing to do with this change then.
I choose to route some abnormal looking cases with goto noop,
some could be handled differently (eg., by stopping the
walking at that skb but again). In general, they either
shouldn't happen at all or are rare enough to make no difference
in practice.
In theory this change (as whole) could cause some macroscale
regression (global) because of cache misses that are taken over
the round-trip time but it gets very likely better because of much
less (local) cache misses per other write queue walkers and the
big recovery clearing cumulative ack.
Worth to note that these benefits would be very easy to get also
without TSO/GSO being on as long as the data is in pages so that
we can merge them. Currently I won't let that happen because
DSACK splitting at fragment that would mess up pcounts due to
sk_can_gso in tcp_set_skb_tso_segs. Once DSACKs fragments gets
avoided, we have some conditions that can be made less strict.
TODO: I will probably have to convert the excessive pointer
passing to struct sacktag_state... :-)
My testing revealed that considerable amount of skbs couldn't
be shifted because they were cloned (most likely still awaiting
tx reclaim)...
[The rest is considering future work instead since I got
repeatably EFAULT to tcpdump's recvfrom when I added
pskb_expand_head to deal with clones, so I separated that
into another, later patch]
...To counter that, I gave up on the fifth advantage:
5) When growing previous SACK block, less allocs for new skbs
are done, basically a new alloc is needed only when new hole
is detected and when the previous skb runs out of frags space
...which now only happens if reclaim is fast enough to dispose
the clone before the SACK block comes in (the window is RTT long),
otherwise we'll have to alloc some.
With clones being handled I got these numbers (will be somewhat
worse without that), taken with fine-grained mibs:
TCPSackShifted 398
TCPSackMerged 877
TCPSackShiftFallback 320
TCPSACKCOLLAPSEFALLBACKGSO 0
TCPSACKCOLLAPSEFALLBACKSKBBITS 0
TCPSACKCOLLAPSEFALLBACKSKBDATA 0
TCPSACKCOLLAPSEFALLBACKBELOW 0
TCPSACKCOLLAPSEFALLBACKFIRST 1
TCPSACKCOLLAPSEFALLBACKPREVBITS 318
TCPSACKCOLLAPSEFALLBACKMSS 1
TCPSACKCOLLAPSEFALLBACKNOHEAD 0
TCPSACKCOLLAPSEFALLBACKSHIFT 0
TCPSACKCOLLAPSENOOPSEQ 0
TCPSACKCOLLAPSENOOPSMALLPCOUNT 0
TCPSACKCOLLAPSENOOPSMALLLEN 0
TCPSACKCOLLAPSEHOLE 12
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-25 13:20:15 +08:00
|
|
|
fragto->page = fragfrom->page;
|
|
|
|
fragto->page_offset = fragfrom->page_offset;
|
2011-10-19 05:00:24 +08:00
|
|
|
skb_frag_size_set(fragto, todo);
|
tcp: Try to restore large SKBs while SACK processing
During SACK processing, most of the benefits of TSO are eaten by
the SACK blocks that one-by-one fragment SKBs to MSS sized chunks.
Then we're in problems when cleanup work for them has to be done
when a large cumulative ACK comes. Try to return back to pre-split
state already while more and more SACK info gets discovered by
combining newly discovered SACK areas with the previous skb if
that's SACKed as well.
This approach has a number of benefits:
1) The processing overhead is spread more equally over the RTT
2) Write queue has less skbs to process (affect everything
which has to walk in the queue past the sacked areas)
3) Write queue is consistent whole the time, so no other parts
of TCP has to be aware of this (this was not the case with
some other approach that was, well, quite intrusive all
around).
4) Clean_rtx_queue can release most of the pages using single
put_page instead of previous PAGE_SIZE/mss+1 calls
In case a hole is fully filled by the new SACK block, we attempt
to combine the next skb too which allows construction of skbs
that are even larger than what tso split them to and it handles
hole per on every nth patterns that often occur during slow start
overshoot pretty nicely. Though this to be really useful also
a retransmission would have to get lost since cumulative ACKs
advance one hole at a time in the most typical case.
TODO: handle upwards only merging. That should be rather easy
when segment is fully sacked but I'm leaving that as future
work item (it won't make very large difference anyway since
this current approach already covers quite a lot of normal
cases).
I was earlier thinking of some sophisticated way of tracking
timestamps of the first and the last segment but later on
realized that it won't be that necessary at all to store the
timestamp of the last segment. The cases that can occur are
basically either:
1) ambiguous => no sensible measurement can be taken anyway
2) non-ambiguous is due to reordering => having the timestamp
of the last segment there is just skewing things more off
than does some good since the ack got triggered by one of
the holes (besides some subtle issues that would make
determining right hole/skb even harder problem). Anyway,
it has nothing to do with this change then.
I choose to route some abnormal looking cases with goto noop,
some could be handled differently (eg., by stopping the
walking at that skb but again). In general, they either
shouldn't happen at all or are rare enough to make no difference
in practice.
In theory this change (as whole) could cause some macroscale
regression (global) because of cache misses that are taken over
the round-trip time but it gets very likely better because of much
less (local) cache misses per other write queue walkers and the
big recovery clearing cumulative ack.
Worth to note that these benefits would be very easy to get also
without TSO/GSO being on as long as the data is in pages so that
we can merge them. Currently I won't let that happen because
DSACK splitting at fragment that would mess up pcounts due to
sk_can_gso in tcp_set_skb_tso_segs. Once DSACKs fragments gets
avoided, we have some conditions that can be made less strict.
TODO: I will probably have to convert the excessive pointer
passing to struct sacktag_state... :-)
My testing revealed that considerable amount of skbs couldn't
be shifted because they were cloned (most likely still awaiting
tx reclaim)...
[The rest is considering future work instead since I got
repeatably EFAULT to tcpdump's recvfrom when I added
pskb_expand_head to deal with clones, so I separated that
into another, later patch]
...To counter that, I gave up on the fifth advantage:
5) When growing previous SACK block, less allocs for new skbs
are done, basically a new alloc is needed only when new hole
is detected and when the previous skb runs out of frags space
...which now only happens if reclaim is fast enough to dispose
the clone before the SACK block comes in (the window is RTT long),
otherwise we'll have to alloc some.
With clones being handled I got these numbers (will be somewhat
worse without that), taken with fine-grained mibs:
TCPSackShifted 398
TCPSackMerged 877
TCPSackShiftFallback 320
TCPSACKCOLLAPSEFALLBACKGSO 0
TCPSACKCOLLAPSEFALLBACKSKBBITS 0
TCPSACKCOLLAPSEFALLBACKSKBDATA 0
TCPSACKCOLLAPSEFALLBACKBELOW 0
TCPSACKCOLLAPSEFALLBACKFIRST 1
TCPSACKCOLLAPSEFALLBACKPREVBITS 318
TCPSACKCOLLAPSEFALLBACKMSS 1
TCPSACKCOLLAPSEFALLBACKNOHEAD 0
TCPSACKCOLLAPSEFALLBACKSHIFT 0
TCPSACKCOLLAPSENOOPSEQ 0
TCPSACKCOLLAPSENOOPSMALLPCOUNT 0
TCPSACKCOLLAPSENOOPSMALLLEN 0
TCPSACKCOLLAPSEHOLE 12
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-25 13:20:15 +08:00
|
|
|
|
|
|
|
fragfrom->page_offset += todo;
|
2011-10-19 05:00:24 +08:00
|
|
|
skb_frag_size_sub(fragfrom, todo);
|
tcp: Try to restore large SKBs while SACK processing
During SACK processing, most of the benefits of TSO are eaten by
the SACK blocks that one-by-one fragment SKBs to MSS sized chunks.
Then we're in problems when cleanup work for them has to be done
when a large cumulative ACK comes. Try to return back to pre-split
state already while more and more SACK info gets discovered by
combining newly discovered SACK areas with the previous skb if
that's SACKed as well.
This approach has a number of benefits:
1) The processing overhead is spread more equally over the RTT
2) Write queue has less skbs to process (affect everything
which has to walk in the queue past the sacked areas)
3) Write queue is consistent whole the time, so no other parts
of TCP has to be aware of this (this was not the case with
some other approach that was, well, quite intrusive all
around).
4) Clean_rtx_queue can release most of the pages using single
put_page instead of previous PAGE_SIZE/mss+1 calls
In case a hole is fully filled by the new SACK block, we attempt
to combine the next skb too which allows construction of skbs
that are even larger than what tso split them to and it handles
hole per on every nth patterns that often occur during slow start
overshoot pretty nicely. Though this to be really useful also
a retransmission would have to get lost since cumulative ACKs
advance one hole at a time in the most typical case.
TODO: handle upwards only merging. That should be rather easy
when segment is fully sacked but I'm leaving that as future
work item (it won't make very large difference anyway since
this current approach already covers quite a lot of normal
cases).
I was earlier thinking of some sophisticated way of tracking
timestamps of the first and the last segment but later on
realized that it won't be that necessary at all to store the
timestamp of the last segment. The cases that can occur are
basically either:
1) ambiguous => no sensible measurement can be taken anyway
2) non-ambiguous is due to reordering => having the timestamp
of the last segment there is just skewing things more off
than does some good since the ack got triggered by one of
the holes (besides some subtle issues that would make
determining right hole/skb even harder problem). Anyway,
it has nothing to do with this change then.
I choose to route some abnormal looking cases with goto noop,
some could be handled differently (eg., by stopping the
walking at that skb but again). In general, they either
shouldn't happen at all or are rare enough to make no difference
in practice.
In theory this change (as whole) could cause some macroscale
regression (global) because of cache misses that are taken over
the round-trip time but it gets very likely better because of much
less (local) cache misses per other write queue walkers and the
big recovery clearing cumulative ack.
Worth to note that these benefits would be very easy to get also
without TSO/GSO being on as long as the data is in pages so that
we can merge them. Currently I won't let that happen because
DSACK splitting at fragment that would mess up pcounts due to
sk_can_gso in tcp_set_skb_tso_segs. Once DSACKs fragments gets
avoided, we have some conditions that can be made less strict.
TODO: I will probably have to convert the excessive pointer
passing to struct sacktag_state... :-)
My testing revealed that considerable amount of skbs couldn't
be shifted because they were cloned (most likely still awaiting
tx reclaim)...
[The rest is considering future work instead since I got
repeatably EFAULT to tcpdump's recvfrom when I added
pskb_expand_head to deal with clones, so I separated that
into another, later patch]
...To counter that, I gave up on the fifth advantage:
5) When growing previous SACK block, less allocs for new skbs
are done, basically a new alloc is needed only when new hole
is detected and when the previous skb runs out of frags space
...which now only happens if reclaim is fast enough to dispose
the clone before the SACK block comes in (the window is RTT long),
otherwise we'll have to alloc some.
With clones being handled I got these numbers (will be somewhat
worse without that), taken with fine-grained mibs:
TCPSackShifted 398
TCPSackMerged 877
TCPSackShiftFallback 320
TCPSACKCOLLAPSEFALLBACKGSO 0
TCPSACKCOLLAPSEFALLBACKSKBBITS 0
TCPSACKCOLLAPSEFALLBACKSKBDATA 0
TCPSACKCOLLAPSEFALLBACKBELOW 0
TCPSACKCOLLAPSEFALLBACKFIRST 1
TCPSACKCOLLAPSEFALLBACKPREVBITS 318
TCPSACKCOLLAPSEFALLBACKMSS 1
TCPSACKCOLLAPSEFALLBACKNOHEAD 0
TCPSACKCOLLAPSEFALLBACKSHIFT 0
TCPSACKCOLLAPSENOOPSEQ 0
TCPSACKCOLLAPSENOOPSMALLPCOUNT 0
TCPSACKCOLLAPSENOOPSMALLLEN 0
TCPSACKCOLLAPSEHOLE 12
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-25 13:20:15 +08:00
|
|
|
todo = 0;
|
|
|
|
|
|
|
|
to++;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Ready to "commit" this state change to tgt */
|
|
|
|
skb_shinfo(tgt)->nr_frags = to;
|
|
|
|
|
|
|
|
if (merge >= 0) {
|
|
|
|
fragfrom = &skb_shinfo(skb)->frags[0];
|
|
|
|
fragto = &skb_shinfo(tgt)->frags[merge];
|
|
|
|
|
2011-10-19 05:00:24 +08:00
|
|
|
skb_frag_size_add(fragto, skb_frag_size(fragfrom));
|
2011-08-23 07:44:58 +08:00
|
|
|
__skb_frag_unref(fragfrom);
|
tcp: Try to restore large SKBs while SACK processing
During SACK processing, most of the benefits of TSO are eaten by
the SACK blocks that one-by-one fragment SKBs to MSS sized chunks.
Then we're in problems when cleanup work for them has to be done
when a large cumulative ACK comes. Try to return back to pre-split
state already while more and more SACK info gets discovered by
combining newly discovered SACK areas with the previous skb if
that's SACKed as well.
This approach has a number of benefits:
1) The processing overhead is spread more equally over the RTT
2) Write queue has less skbs to process (affect everything
which has to walk in the queue past the sacked areas)
3) Write queue is consistent whole the time, so no other parts
of TCP has to be aware of this (this was not the case with
some other approach that was, well, quite intrusive all
around).
4) Clean_rtx_queue can release most of the pages using single
put_page instead of previous PAGE_SIZE/mss+1 calls
In case a hole is fully filled by the new SACK block, we attempt
to combine the next skb too which allows construction of skbs
that are even larger than what tso split them to and it handles
hole per on every nth patterns that often occur during slow start
overshoot pretty nicely. Though this to be really useful also
a retransmission would have to get lost since cumulative ACKs
advance one hole at a time in the most typical case.
TODO: handle upwards only merging. That should be rather easy
when segment is fully sacked but I'm leaving that as future
work item (it won't make very large difference anyway since
this current approach already covers quite a lot of normal
cases).
I was earlier thinking of some sophisticated way of tracking
timestamps of the first and the last segment but later on
realized that it won't be that necessary at all to store the
timestamp of the last segment. The cases that can occur are
basically either:
1) ambiguous => no sensible measurement can be taken anyway
2) non-ambiguous is due to reordering => having the timestamp
of the last segment there is just skewing things more off
than does some good since the ack got triggered by one of
the holes (besides some subtle issues that would make
determining right hole/skb even harder problem). Anyway,
it has nothing to do with this change then.
I choose to route some abnormal looking cases with goto noop,
some could be handled differently (eg., by stopping the
walking at that skb but again). In general, they either
shouldn't happen at all or are rare enough to make no difference
in practice.
In theory this change (as whole) could cause some macroscale
regression (global) because of cache misses that are taken over
the round-trip time but it gets very likely better because of much
less (local) cache misses per other write queue walkers and the
big recovery clearing cumulative ack.
Worth to note that these benefits would be very easy to get also
without TSO/GSO being on as long as the data is in pages so that
we can merge them. Currently I won't let that happen because
DSACK splitting at fragment that would mess up pcounts due to
sk_can_gso in tcp_set_skb_tso_segs. Once DSACKs fragments gets
avoided, we have some conditions that can be made less strict.
TODO: I will probably have to convert the excessive pointer
passing to struct sacktag_state... :-)
My testing revealed that considerable amount of skbs couldn't
be shifted because they were cloned (most likely still awaiting
tx reclaim)...
[The rest is considering future work instead since I got
repeatably EFAULT to tcpdump's recvfrom when I added
pskb_expand_head to deal with clones, so I separated that
into another, later patch]
...To counter that, I gave up on the fifth advantage:
5) When growing previous SACK block, less allocs for new skbs
are done, basically a new alloc is needed only when new hole
is detected and when the previous skb runs out of frags space
...which now only happens if reclaim is fast enough to dispose
the clone before the SACK block comes in (the window is RTT long),
otherwise we'll have to alloc some.
With clones being handled I got these numbers (will be somewhat
worse without that), taken with fine-grained mibs:
TCPSackShifted 398
TCPSackMerged 877
TCPSackShiftFallback 320
TCPSACKCOLLAPSEFALLBACKGSO 0
TCPSACKCOLLAPSEFALLBACKSKBBITS 0
TCPSACKCOLLAPSEFALLBACKSKBDATA 0
TCPSACKCOLLAPSEFALLBACKBELOW 0
TCPSACKCOLLAPSEFALLBACKFIRST 1
TCPSACKCOLLAPSEFALLBACKPREVBITS 318
TCPSACKCOLLAPSEFALLBACKMSS 1
TCPSACKCOLLAPSEFALLBACKNOHEAD 0
TCPSACKCOLLAPSEFALLBACKSHIFT 0
TCPSACKCOLLAPSENOOPSEQ 0
TCPSACKCOLLAPSENOOPSMALLPCOUNT 0
TCPSACKCOLLAPSENOOPSMALLLEN 0
TCPSACKCOLLAPSEHOLE 12
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-25 13:20:15 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Reposition in the original skb */
|
|
|
|
to = 0;
|
|
|
|
while (from < skb_shinfo(skb)->nr_frags)
|
|
|
|
skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
|
|
|
|
skb_shinfo(skb)->nr_frags = to;
|
|
|
|
|
|
|
|
BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);
|
|
|
|
|
|
|
|
onlymerged:
|
|
|
|
/* Most likely the tgt won't ever need its checksum anymore, skb on
|
|
|
|
* the other hand might need it if it needs to be resent
|
|
|
|
*/
|
|
|
|
tgt->ip_summed = CHECKSUM_PARTIAL;
|
|
|
|
skb->ip_summed = CHECKSUM_PARTIAL;
|
|
|
|
|
|
|
|
/* Yak, is it really working this way? Some helper please? */
|
|
|
|
skb->len -= shiftlen;
|
|
|
|
skb->data_len -= shiftlen;
|
|
|
|
skb->truesize -= shiftlen;
|
|
|
|
tgt->len += shiftlen;
|
|
|
|
tgt->data_len += shiftlen;
|
|
|
|
tgt->truesize += shiftlen;
|
|
|
|
|
|
|
|
return shiftlen;
|
|
|
|
}
|
|
|
|
|
2005-06-24 11:59:51 +08:00
|
|
|
/**
|
|
|
|
* skb_prepare_seq_read - Prepare a sequential read of skb data
|
|
|
|
* @skb: the buffer to read
|
|
|
|
* @from: lower offset of data to be read
|
|
|
|
* @to: upper offset of data to be read
|
|
|
|
* @st: state variable
|
|
|
|
*
|
|
|
|
* Initializes the specified state variable. Must be called before
|
|
|
|
* invoking skb_seq_read() for the first time.
|
|
|
|
*/
|
|
|
|
void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
|
|
|
|
unsigned int to, struct skb_seq_state *st)
|
|
|
|
{
|
|
|
|
st->lower_offset = from;
|
|
|
|
st->upper_offset = to;
|
|
|
|
st->root_skb = st->cur_skb = skb;
|
|
|
|
st->frag_idx = st->stepped_offset = 0;
|
|
|
|
st->frag_data = NULL;
|
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(skb_prepare_seq_read);
|
2005-06-24 11:59:51 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* skb_seq_read - Sequentially read skb data
|
|
|
|
* @consumed: number of bytes consumed by the caller so far
|
|
|
|
* @data: destination pointer for data to be returned
|
|
|
|
* @st: state variable
|
|
|
|
*
|
|
|
|
* Reads a block of skb data at &consumed relative to the
|
|
|
|
* lower offset specified to skb_prepare_seq_read(). Assigns
|
|
|
|
* the head of the data block to &data and returns the length
|
|
|
|
* of the block or 0 if the end of the skb data or the upper
|
|
|
|
* offset has been reached.
|
|
|
|
*
|
|
|
|
* The caller is not required to consume all of the data
|
|
|
|
* returned, i.e. &consumed is typically set to the number
|
|
|
|
* of bytes already consumed and the next call to
|
|
|
|
* skb_seq_read() will return the remaining part of the block.
|
|
|
|
*
|
2011-03-31 09:57:33 +08:00
|
|
|
* Note 1: The size of each block of data returned can be arbitrary,
|
2005-06-24 11:59:51 +08:00
|
|
|
* this limitation is the cost for zerocopy seqeuental
|
|
|
|
* reads of potentially non linear data.
|
|
|
|
*
|
2008-02-14 07:03:25 +08:00
|
|
|
* Note 2: Fragment lists within fragments are not implemented
|
2005-06-24 11:59:51 +08:00
|
|
|
* at the moment, state->root_skb could be replaced with
|
|
|
|
* a stack for this purpose.
|
|
|
|
*/
|
|
|
|
unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
|
|
|
|
struct skb_seq_state *st)
|
|
|
|
{
|
|
|
|
unsigned int block_limit, abs_offset = consumed + st->lower_offset;
|
|
|
|
skb_frag_t *frag;
|
|
|
|
|
|
|
|
if (unlikely(abs_offset >= st->upper_offset))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
next_skb:
|
2009-01-30 08:07:52 +08:00
|
|
|
block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;
|
2005-06-24 11:59:51 +08:00
|
|
|
|
2009-05-19 12:43:27 +08:00
|
|
|
if (abs_offset < block_limit && !st->frag_data) {
|
2009-01-30 08:07:52 +08:00
|
|
|
*data = st->cur_skb->data + (abs_offset - st->stepped_offset);
|
2005-06-24 11:59:51 +08:00
|
|
|
return block_limit - abs_offset;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (st->frag_idx == 0 && !st->frag_data)
|
|
|
|
st->stepped_offset += skb_headlen(st->cur_skb);
|
|
|
|
|
|
|
|
while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
|
|
|
|
frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
|
2011-10-19 05:00:24 +08:00
|
|
|
block_limit = skb_frag_size(frag) + st->stepped_offset;
|
2005-06-24 11:59:51 +08:00
|
|
|
|
|
|
|
if (abs_offset < block_limit) {
|
|
|
|
if (!st->frag_data)
|
2012-04-05 17:35:15 +08:00
|
|
|
st->frag_data = kmap_atomic(skb_frag_page(frag));
|
2005-06-24 11:59:51 +08:00
|
|
|
|
|
|
|
*data = (u8 *) st->frag_data + frag->page_offset +
|
|
|
|
(abs_offset - st->stepped_offset);
|
|
|
|
|
|
|
|
return block_limit - abs_offset;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (st->frag_data) {
|
2012-04-05 17:35:15 +08:00
|
|
|
kunmap_atomic(st->frag_data);
|
2005-06-24 11:59:51 +08:00
|
|
|
st->frag_data = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
st->frag_idx++;
|
2011-10-19 05:00:24 +08:00
|
|
|
st->stepped_offset += skb_frag_size(frag);
|
2005-06-24 11:59:51 +08:00
|
|
|
}
|
|
|
|
|
2007-06-24 14:11:52 +08:00
|
|
|
if (st->frag_data) {
|
2012-04-05 17:35:15 +08:00
|
|
|
kunmap_atomic(st->frag_data);
|
2007-06-24 14:11:52 +08:00
|
|
|
st->frag_data = NULL;
|
|
|
|
}
|
|
|
|
|
2010-08-23 15:13:46 +08:00
|
|
|
if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
|
2009-01-30 08:12:42 +08:00
|
|
|
st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
|
2005-06-24 11:59:51 +08:00
|
|
|
st->frag_idx = 0;
|
|
|
|
goto next_skb;
|
2009-01-30 08:12:42 +08:00
|
|
|
} else if (st->cur_skb->next) {
|
|
|
|
st->cur_skb = st->cur_skb->next;
|
2009-01-30 08:07:52 +08:00
|
|
|
st->frag_idx = 0;
|
2005-06-24 11:59:51 +08:00
|
|
|
goto next_skb;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(skb_seq_read);
|
2005-06-24 11:59:51 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* skb_abort_seq_read - Abort a sequential read of skb data
|
|
|
|
* @st: state variable
|
|
|
|
*
|
|
|
|
* Must be called if skb_seq_read() was not called until it
|
|
|
|
* returned 0.
|
|
|
|
*/
|
|
|
|
void skb_abort_seq_read(struct skb_seq_state *st)
|
|
|
|
{
|
|
|
|
if (st->frag_data)
|
2012-04-05 17:35:15 +08:00
|
|
|
kunmap_atomic(st->frag_data);
|
2005-06-24 11:59:51 +08:00
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(skb_abort_seq_read);
|
2005-06-24 11:59:51 +08:00
|
|
|
|
2005-06-24 12:00:17 +08:00
|
|
|
#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
|
|
|
|
|
|
|
|
static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
|
|
|
|
struct ts_config *conf,
|
|
|
|
struct ts_state *state)
|
|
|
|
{
|
|
|
|
return skb_seq_read(offset, text, TS_SKB_CB(state));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Finish hook installed as config->finish by skb_find_text(): aborts the
 * sequential read kept in the textsearch state. @conf is not used.
 */
static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
{
	struct skb_seq_state *seq = TS_SKB_CB(state);

	skb_abort_seq_read(seq);
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* skb_find_text - Find a text pattern in skb data
|
|
|
|
* @skb: the buffer to look in
|
|
|
|
* @from: search offset
|
|
|
|
* @to: search limit
|
|
|
|
* @config: textsearch configuration
|
|
|
|
* @state: uninitialized textsearch state variable
|
|
|
|
*
|
|
|
|
* Finds a pattern in the skb data according to the specified
|
|
|
|
* textsearch configuration. Use textsearch_next() to retrieve
|
|
|
|
* subsequent occurrences of the pattern. Returns the offset
|
|
|
|
* to the first occurrence or UINT_MAX if no match was found.
|
|
|
|
*/
|
|
|
|
unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
|
|
|
|
unsigned int to, struct ts_config *config,
|
|
|
|
struct ts_state *state)
|
|
|
|
{
|
2006-06-26 15:00:57 +08:00
|
|
|
unsigned int ret;
|
|
|
|
|
2005-06-24 12:00:17 +08:00
|
|
|
config->get_next_block = skb_ts_get_next_block;
|
|
|
|
config->finish = skb_ts_finish;
|
|
|
|
|
|
|
|
skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));
|
|
|
|
|
2006-06-26 15:00:57 +08:00
|
|
|
ret = textsearch_find(config, state);
|
|
|
|
return (ret <= to - from ? ret : UINT_MAX);
|
2005-06-24 12:00:17 +08:00
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(skb_find_text);
|
2005-06-24 12:00:17 +08:00
|
|
|
|
2005-10-19 06:46:41 +08:00
|
|
|
/**
|
2012-07-10 18:55:09 +08:00
|
|
|
* skb_append_datato_frags - append the user data to a skb
|
2005-10-19 06:46:41 +08:00
|
|
|
* @sk: sock structure
|
|
|
|
* @skb: skb structure to be appened with user data.
|
|
|
|
* @getfrag: call back function to be used for getting the user data
|
|
|
|
* @from: pointer to user message iov
|
|
|
|
* @length: length of the iov message
|
|
|
|
*
|
|
|
|
* Description: This procedure append the user data in the fragment part
|
|
|
|
* of the skb if any page alloc fails user this procedure returns -ENOMEM
|
|
|
|
*/
|
|
|
|
int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
|
2005-12-06 05:40:12 +08:00
|
|
|
int (*getfrag)(void *from, char *to, int offset,
|
2005-10-19 06:46:41 +08:00
|
|
|
int len, int odd, struct sk_buff *skb),
|
|
|
|
void *from, int length)
|
|
|
|
{
|
2012-12-28 14:06:37 +08:00
|
|
|
int frg_cnt = skb_shinfo(skb)->nr_frags;
|
|
|
|
int copy;
|
2005-10-19 06:46:41 +08:00
|
|
|
int offset = 0;
|
|
|
|
int ret;
|
2012-12-28 14:06:37 +08:00
|
|
|
struct page_frag *pfrag = ¤t->task_frag;
|
2005-10-19 06:46:41 +08:00
|
|
|
|
|
|
|
do {
|
|
|
|
/* Return error if we don't have space for new frag */
|
|
|
|
if (frg_cnt >= MAX_SKB_FRAGS)
|
2012-12-28 14:06:37 +08:00
|
|
|
return -EMSGSIZE;
|
2005-10-19 06:46:41 +08:00
|
|
|
|
2012-12-28 14:06:37 +08:00
|
|
|
if (!sk_page_frag_refill(sk, pfrag))
|
2005-10-19 06:46:41 +08:00
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
/* copy the user data to page */
|
2012-12-28 14:06:37 +08:00
|
|
|
copy = min_t(int, length, pfrag->size - pfrag->offset);
|
2005-10-19 06:46:41 +08:00
|
|
|
|
2012-12-28 14:06:37 +08:00
|
|
|
ret = getfrag(from, page_address(pfrag->page) + pfrag->offset,
|
|
|
|
offset, copy, 0, skb);
|
2005-10-19 06:46:41 +08:00
|
|
|
if (ret < 0)
|
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
/* copy was successful so update the size parameters */
|
2012-12-28 14:06:37 +08:00
|
|
|
skb_fill_page_desc(skb, frg_cnt, pfrag->page, pfrag->offset,
|
|
|
|
copy);
|
|
|
|
frg_cnt++;
|
|
|
|
pfrag->offset += copy;
|
|
|
|
get_page(pfrag->page);
|
|
|
|
|
|
|
|
skb->truesize += copy;
|
|
|
|
atomic_add(copy, &sk->sk_wmem_alloc);
|
2005-10-19 06:46:41 +08:00
|
|
|
skb->len += copy;
|
|
|
|
skb->data_len += copy;
|
|
|
|
offset += copy;
|
|
|
|
length -= copy;
|
|
|
|
|
|
|
|
} while (length > 0);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL(skb_append_datato_frags);
|
2005-10-19 06:46:41 +08:00
|
|
|
|
2006-03-21 14:43:56 +08:00
|
|
|
/**
|
|
|
|
* skb_pull_rcsum - pull skb and update receive checksum
|
|
|
|
* @skb: buffer to update
|
|
|
|
* @len: length of data pulled
|
|
|
|
*
|
|
|
|
* This function performs an skb_pull on the packet and updates
|
2008-02-13 14:03:25 +08:00
|
|
|
* the CHECKSUM_COMPLETE checksum. It should be used on
|
2006-08-30 07:44:56 +08:00
|
|
|
* receive path processing instead of skb_pull unless you know
|
|
|
|
* that the checksum difference is zero (e.g., a valid IP header)
|
|
|
|
* or you are setting ip_summed to CHECKSUM_NONE.
|
2006-03-21 14:43:56 +08:00
|
|
|
*/
|
|
|
|
unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
|
|
|
|
{
|
|
|
|
BUG_ON(len > skb->len);
|
|
|
|
skb->len -= len;
|
|
|
|
BUG_ON(skb->len < skb->data_len);
|
|
|
|
skb_postpull_rcsum(skb, skb->data, len);
|
|
|
|
return skb->data += len;
|
|
|
|
}
|
2006-03-21 14:47:55 +08:00
|
|
|
EXPORT_SYMBOL_GPL(skb_pull_rcsum);
|
|
|
|
|
2006-06-22 18:02:40 +08:00
|
|
|
/**
|
|
|
|
* skb_segment - Perform protocol segmentation on skb.
|
|
|
|
* @skb: buffer to segment
|
2006-06-28 04:22:38 +08:00
|
|
|
* @features: features for the output path (see dev->features)
|
2006-06-22 18:02:40 +08:00
|
|
|
*
|
|
|
|
* This function performs segmentation on the given skb. It returns
|
2008-04-14 12:52:48 +08:00
|
|
|
* a pointer to the first in a list of new skbs for the segments.
|
|
|
|
* In case of error it returns ERR_PTR(err).
|
2006-06-22 18:02:40 +08:00
|
|
|
*/
|
2011-11-15 23:29:55 +08:00
|
|
|
struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
|
2006-06-22 18:02:40 +08:00
|
|
|
{
|
|
|
|
struct sk_buff *segs = NULL;
|
|
|
|
struct sk_buff *tail = NULL;
|
2008-12-16 15:26:06 +08:00
|
|
|
struct sk_buff *fskb = skb_shinfo(skb)->frag_list;
|
2006-06-22 18:02:40 +08:00
|
|
|
unsigned int mss = skb_shinfo(skb)->gso_size;
|
2007-03-20 06:33:04 +08:00
|
|
|
unsigned int doffset = skb->data - skb_mac_header(skb);
|
2006-06-22 18:02:40 +08:00
|
|
|
unsigned int offset = doffset;
|
2013-02-14 22:02:41 +08:00
|
|
|
unsigned int tnl_hlen = skb_tnl_header_len(skb);
|
2006-06-22 18:02:40 +08:00
|
|
|
unsigned int headroom;
|
|
|
|
unsigned int len;
|
2011-01-25 07:32:47 +08:00
|
|
|
int sg = !!(features & NETIF_F_SG);
|
2006-06-22 18:02:40 +08:00
|
|
|
int nfrags = skb_shinfo(skb)->nr_frags;
|
|
|
|
int err = -ENOMEM;
|
|
|
|
int i = 0;
|
|
|
|
int pos;
|
|
|
|
|
|
|
|
__skb_push(skb, doffset);
|
|
|
|
headroom = skb_headroom(skb);
|
|
|
|
pos = skb_headlen(skb);
|
|
|
|
|
|
|
|
do {
|
|
|
|
struct sk_buff *nskb;
|
|
|
|
skb_frag_t *frag;
|
2006-10-30 07:59:41 +08:00
|
|
|
int hsize;
|
2006-06-22 18:02:40 +08:00
|
|
|
int size;
|
|
|
|
|
|
|
|
len = skb->len - offset;
|
|
|
|
if (len > mss)
|
|
|
|
len = mss;
|
|
|
|
|
|
|
|
hsize = skb_headlen(skb) - offset;
|
|
|
|
if (hsize < 0)
|
|
|
|
hsize = 0;
|
2006-10-30 07:59:41 +08:00
|
|
|
if (hsize > len || !sg)
|
|
|
|
hsize = len;
|
2006-06-22 18:02:40 +08:00
|
|
|
|
2008-12-16 15:26:06 +08:00
|
|
|
if (!hsize && i >= nfrags) {
|
|
|
|
BUG_ON(fskb->len != len);
|
|
|
|
|
|
|
|
pos += len;
|
|
|
|
nskb = skb_clone(fskb, GFP_ATOMIC);
|
|
|
|
fskb = fskb->next;
|
|
|
|
|
|
|
|
if (unlikely(!nskb))
|
|
|
|
goto err;
|
|
|
|
|
2012-05-04 22:26:56 +08:00
|
|
|
hsize = skb_end_offset(nskb);
|
2008-12-16 15:26:06 +08:00
|
|
|
if (skb_cow_head(nskb, doffset + headroom)) {
|
|
|
|
kfree_skb(nskb);
|
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
|
2012-05-04 22:26:56 +08:00
|
|
|
nskb->truesize += skb_end_offset(nskb) - hsize;
|
2008-12-16 15:26:06 +08:00
|
|
|
skb_release_head_state(nskb);
|
|
|
|
__skb_push(nskb, doffset);
|
|
|
|
} else {
|
2012-08-01 07:44:19 +08:00
|
|
|
nskb = __alloc_skb(hsize + doffset + headroom,
|
|
|
|
GFP_ATOMIC, skb_alloc_rx_flag(skb),
|
|
|
|
NUMA_NO_NODE);
|
2008-12-16 15:26:06 +08:00
|
|
|
|
|
|
|
if (unlikely(!nskb))
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
skb_reserve(nskb, headroom);
|
|
|
|
__skb_put(nskb, doffset);
|
|
|
|
}
|
2006-06-22 18:02:40 +08:00
|
|
|
|
|
|
|
if (segs)
|
|
|
|
tail->next = nskb;
|
|
|
|
else
|
|
|
|
segs = nskb;
|
|
|
|
tail = nskb;
|
|
|
|
|
2008-08-16 05:55:02 +08:00
|
|
|
__copy_skb_header(nskb, skb);
|
2006-06-22 18:02:40 +08:00
|
|
|
nskb->mac_len = skb->mac_len;
|
|
|
|
|
2010-09-01 08:50:51 +08:00
|
|
|
/* nskb and skb might have different headroom */
|
|
|
|
if (nskb->ip_summed == CHECKSUM_PARTIAL)
|
|
|
|
nskb->csum_start += skb_headroom(nskb) - headroom;
|
|
|
|
|
2007-03-20 06:30:44 +08:00
|
|
|
skb_reset_mac_header(nskb);
|
2007-03-16 08:42:27 +08:00
|
|
|
skb_set_network_header(nskb, skb->mac_len);
|
2007-04-11 12:21:55 +08:00
|
|
|
nskb->transport_header = (nskb->network_header +
|
|
|
|
skb_network_header_len(skb));
|
2013-02-14 22:02:41 +08:00
|
|
|
|
|
|
|
skb_copy_from_linear_data_offset(skb, -tnl_hlen,
|
|
|
|
nskb->data - tnl_hlen,
|
|
|
|
doffset + tnl_hlen);
|
2008-12-16 15:26:06 +08:00
|
|
|
|
2009-03-29 14:39:18 +08:00
|
|
|
if (fskb != skb_shinfo(skb)->frag_list)
|
2008-12-16 15:26:06 +08:00
|
|
|
continue;
|
|
|
|
|
2006-06-22 18:02:40 +08:00
|
|
|
if (!sg) {
|
2008-08-16 05:55:02 +08:00
|
|
|
nskb->ip_summed = CHECKSUM_NONE;
|
2006-06-22 18:02:40 +08:00
|
|
|
nskb->csum = skb_copy_and_csum_bits(skb, offset,
|
|
|
|
skb_put(nskb, len),
|
|
|
|
len, 0);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
frag = skb_shinfo(nskb)->frags;
|
|
|
|
|
2007-03-28 05:55:52 +08:00
|
|
|
skb_copy_from_linear_data_offset(skb, offset,
|
|
|
|
skb_put(nskb, hsize), hsize);
|
2006-06-22 18:02:40 +08:00
|
|
|
|
2013-02-11 17:27:41 +08:00
|
|
|
skb_shinfo(nskb)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG;
|
2013-01-26 04:34:37 +08:00
|
|
|
|
2008-12-16 15:26:06 +08:00
|
|
|
while (pos < offset + len && i < nfrags) {
|
2006-06-22 18:02:40 +08:00
|
|
|
*frag = skb_shinfo(skb)->frags[i];
|
2011-08-23 07:44:58 +08:00
|
|
|
__skb_frag_ref(frag);
|
2011-10-19 05:00:24 +08:00
|
|
|
size = skb_frag_size(frag);
|
2006-06-22 18:02:40 +08:00
|
|
|
|
|
|
|
if (pos < offset) {
|
|
|
|
frag->page_offset += offset - pos;
|
2011-10-19 05:00:24 +08:00
|
|
|
skb_frag_size_sub(frag, offset - pos);
|
2006-06-22 18:02:40 +08:00
|
|
|
}
|
|
|
|
|
2008-12-16 15:26:06 +08:00
|
|
|
skb_shinfo(nskb)->nr_frags++;
|
2006-06-22 18:02:40 +08:00
|
|
|
|
|
|
|
if (pos + size <= offset + len) {
|
|
|
|
i++;
|
|
|
|
pos += size;
|
|
|
|
} else {
|
2011-10-19 05:00:24 +08:00
|
|
|
skb_frag_size_sub(frag, pos + size - (offset + len));
|
2008-12-16 15:26:06 +08:00
|
|
|
goto skip_fraglist;
|
2006-06-22 18:02:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
frag++;
|
|
|
|
}
|
|
|
|
|
2008-12-16 15:26:06 +08:00
|
|
|
if (pos < offset + len) {
|
|
|
|
struct sk_buff *fskb2 = fskb;
|
|
|
|
|
|
|
|
BUG_ON(pos + fskb->len != offset + len);
|
|
|
|
|
|
|
|
pos += fskb->len;
|
|
|
|
fskb = fskb->next;
|
|
|
|
|
|
|
|
if (fskb2->next) {
|
|
|
|
fskb2 = skb_clone(fskb2, GFP_ATOMIC);
|
|
|
|
if (!fskb2)
|
|
|
|
goto err;
|
|
|
|
} else
|
|
|
|
skb_get(fskb2);
|
|
|
|
|
2009-06-09 15:18:59 +08:00
|
|
|
SKB_FRAG_ASSERT(nskb);
|
2008-12-16 15:26:06 +08:00
|
|
|
skb_shinfo(nskb)->frag_list = fskb2;
|
|
|
|
}
|
|
|
|
|
|
|
|
skip_fraglist:
|
2006-06-22 18:02:40 +08:00
|
|
|
nskb->data_len = len - hsize;
|
|
|
|
nskb->len += nskb->data_len;
|
|
|
|
nskb->truesize += nskb->data_len;
|
|
|
|
} while ((offset += len) < skb->len);
|
|
|
|
|
|
|
|
return segs;
|
|
|
|
|
|
|
|
err:
|
|
|
|
while ((skb = segs)) {
|
|
|
|
segs = skb->next;
|
2007-02-28 01:57:37 +08:00
|
|
|
kfree_skb(skb);
|
2006-06-22 18:02:40 +08:00
|
|
|
}
|
|
|
|
return ERR_PTR(err);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(skb_segment);
|
|
|
|
|
2008-12-16 15:42:33 +08:00
|
|
|
int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
struct sk_buff *p = *head;
|
|
|
|
struct sk_buff *nskb;
|
2009-05-27 02:50:33 +08:00
|
|
|
struct skb_shared_info *skbinfo = skb_shinfo(skb);
|
|
|
|
struct skb_shared_info *pinfo = skb_shinfo(p);
|
2008-12-16 15:42:33 +08:00
|
|
|
unsigned int headroom;
|
2009-01-29 22:19:50 +08:00
|
|
|
unsigned int len = skb_gro_len(skb);
|
2009-05-27 02:50:22 +08:00
|
|
|
unsigned int offset = skb_gro_offset(skb);
|
|
|
|
unsigned int headlen = skb_headlen(skb);
|
2012-05-03 07:33:21 +08:00
|
|
|
unsigned int delta_truesize;
|
2008-12-16 15:42:33 +08:00
|
|
|
|
2009-01-29 22:19:50 +08:00
|
|
|
if (p->len + len >= 65536)
|
2008-12-16 15:42:33 +08:00
|
|
|
return -E2BIG;
|
|
|
|
|
2009-05-27 02:50:33 +08:00
|
|
|
if (pinfo->frag_list)
|
2008-12-16 15:42:33 +08:00
|
|
|
goto merge;
|
2009-05-27 02:50:22 +08:00
|
|
|
else if (headlen <= offset) {
|
2009-05-27 02:50:19 +08:00
|
|
|
skb_frag_t *frag;
|
2009-05-27 02:50:32 +08:00
|
|
|
skb_frag_t *frag2;
|
2009-05-27 02:50:33 +08:00
|
|
|
int i = skbinfo->nr_frags;
|
|
|
|
int nr_frags = pinfo->nr_frags + i;
|
2009-05-27 02:50:32 +08:00
|
|
|
|
|
|
|
offset -= headlen;
|
2009-05-27 02:50:19 +08:00
|
|
|
|
2009-05-27 02:50:32 +08:00
|
|
|
if (nr_frags > MAX_SKB_FRAGS)
|
2009-01-29 22:19:51 +08:00
|
|
|
return -E2BIG;
|
|
|
|
|
2009-05-27 02:50:33 +08:00
|
|
|
pinfo->nr_frags = nr_frags;
|
|
|
|
skbinfo->nr_frags = 0;
|
2009-01-29 22:19:50 +08:00
|
|
|
|
2009-05-27 02:50:33 +08:00
|
|
|
frag = pinfo->frags + nr_frags;
|
|
|
|
frag2 = skbinfo->frags + i;
|
2009-05-27 02:50:32 +08:00
|
|
|
do {
|
|
|
|
*--frag = *--frag2;
|
|
|
|
} while (--i);
|
2009-01-05 08:13:40 +08:00
|
|
|
|
2009-05-27 02:50:32 +08:00
|
|
|
frag->page_offset += offset;
|
2011-10-19 05:00:24 +08:00
|
|
|
skb_frag_size_sub(frag, offset);
|
2009-01-15 12:40:03 +08:00
|
|
|
|
2012-05-03 07:33:21 +08:00
|
|
|
/* all fragments truesize : remove (head size + sk_buff) */
|
2012-05-04 22:26:56 +08:00
|
|
|
delta_truesize = skb->truesize -
|
|
|
|
SKB_TRUESIZE(skb_end_offset(skb));
|
2012-05-03 07:33:21 +08:00
|
|
|
|
2009-01-15 12:40:03 +08:00
|
|
|
skb->truesize -= skb->data_len;
|
|
|
|
skb->len -= skb->data_len;
|
|
|
|
skb->data_len = 0;
|
|
|
|
|
2012-05-03 07:33:21 +08:00
|
|
|
NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
|
2009-01-05 08:13:40 +08:00
|
|
|
goto done;
|
2012-04-30 16:10:34 +08:00
|
|
|
} else if (skb->head_frag) {
|
|
|
|
int nr_frags = pinfo->nr_frags;
|
|
|
|
skb_frag_t *frag = pinfo->frags + nr_frags;
|
|
|
|
struct page *page = virt_to_head_page(skb->head);
|
|
|
|
unsigned int first_size = headlen - offset;
|
|
|
|
unsigned int first_offset;
|
|
|
|
|
|
|
|
if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
|
|
|
|
return -E2BIG;
|
|
|
|
|
|
|
|
first_offset = skb->data -
|
|
|
|
(unsigned char *)page_address(page) +
|
|
|
|
offset;
|
|
|
|
|
|
|
|
pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
|
|
|
|
|
|
|
|
frag->page.p = page;
|
|
|
|
frag->page_offset = first_offset;
|
|
|
|
skb_frag_size_set(frag, first_size);
|
|
|
|
|
|
|
|
memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
|
|
|
|
/* We dont need to clear skbinfo->nr_frags here */
|
|
|
|
|
2012-05-03 07:33:21 +08:00
|
|
|
delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
|
2012-04-30 16:10:34 +08:00
|
|
|
NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
|
|
|
|
goto done;
|
2009-11-17 21:18:18 +08:00
|
|
|
} else if (skb_gro_len(p) != pinfo->gso_size)
|
|
|
|
return -E2BIG;
|
2008-12-16 15:42:33 +08:00
|
|
|
|
|
|
|
headroom = skb_headroom(p);
|
2010-09-01 08:50:51 +08:00
|
|
|
nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC);
|
2008-12-16 15:42:33 +08:00
|
|
|
if (unlikely(!nskb))
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
__copy_skb_header(nskb, p);
|
|
|
|
nskb->mac_len = p->mac_len;
|
|
|
|
|
|
|
|
skb_reserve(nskb, headroom);
|
2009-01-29 22:19:50 +08:00
|
|
|
__skb_put(nskb, skb_gro_offset(p));
|
2008-12-16 15:42:33 +08:00
|
|
|
|
2009-01-29 22:19:50 +08:00
|
|
|
skb_set_mac_header(nskb, skb_mac_header(p) - p->data);
|
2008-12-16 15:42:33 +08:00
|
|
|
skb_set_network_header(nskb, skb_network_offset(p));
|
|
|
|
skb_set_transport_header(nskb, skb_transport_offset(p));
|
|
|
|
|
2009-01-29 22:19:50 +08:00
|
|
|
__skb_pull(p, skb_gro_offset(p));
|
|
|
|
memcpy(skb_mac_header(nskb), skb_mac_header(p),
|
|
|
|
p->data - skb_mac_header(p));
|
2008-12-16 15:42:33 +08:00
|
|
|
|
|
|
|
skb_shinfo(nskb)->frag_list = p;
|
2009-05-27 02:50:33 +08:00
|
|
|
skb_shinfo(nskb)->gso_size = pinfo->gso_size;
|
2010-05-21 14:07:56 +08:00
|
|
|
pinfo->gso_size = 0;
|
2008-12-16 15:42:33 +08:00
|
|
|
skb_header_release(p);
|
2012-12-06 21:54:59 +08:00
|
|
|
NAPI_GRO_CB(nskb)->last = p;
|
2008-12-16 15:42:33 +08:00
|
|
|
|
|
|
|
nskb->data_len += p->len;
|
2012-02-13 12:09:20 +08:00
|
|
|
nskb->truesize += p->truesize;
|
2008-12-16 15:42:33 +08:00
|
|
|
nskb->len += p->len;
|
|
|
|
|
|
|
|
*head = nskb;
|
|
|
|
nskb->next = p->next;
|
|
|
|
p->next = NULL;
|
|
|
|
|
|
|
|
p = nskb;
|
|
|
|
|
|
|
|
merge:
|
2012-05-03 07:33:21 +08:00
|
|
|
delta_truesize = skb->truesize;
|
2009-05-27 02:50:22 +08:00
|
|
|
if (offset > headlen) {
|
2011-01-24 20:08:48 +08:00
|
|
|
unsigned int eat = offset - headlen;
|
|
|
|
|
|
|
|
skbinfo->frags[0].page_offset += eat;
|
2011-10-19 05:00:24 +08:00
|
|
|
skb_frag_size_sub(&skbinfo->frags[0], eat);
|
2011-01-24 20:08:48 +08:00
|
|
|
skb->data_len -= eat;
|
|
|
|
skb->len -= eat;
|
2009-05-27 02:50:22 +08:00
|
|
|
offset = headlen;
|
2009-02-06 13:26:52 +08:00
|
|
|
}
|
|
|
|
|
2009-05-27 02:50:22 +08:00
|
|
|
__skb_pull(skb, offset);
|
2009-02-06 13:26:52 +08:00
|
|
|
|
2012-12-06 21:54:59 +08:00
|
|
|
NAPI_GRO_CB(p)->last->next = skb;
|
|
|
|
NAPI_GRO_CB(p)->last = skb;
|
2008-12-16 15:42:33 +08:00
|
|
|
skb_header_release(skb);
|
|
|
|
|
2009-01-05 08:13:40 +08:00
|
|
|
done:
|
|
|
|
NAPI_GRO_CB(p)->count++;
|
2009-01-18 03:48:13 +08:00
|
|
|
p->data_len += len;
|
2012-05-03 07:33:21 +08:00
|
|
|
p->truesize += delta_truesize;
|
2009-01-18 03:48:13 +08:00
|
|
|
p->len += len;
|
2008-12-16 15:42:33 +08:00
|
|
|
|
|
|
|
NAPI_GRO_CB(skb)->same_flow = 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(skb_gro_receive);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * Create the two slab caches used for sk_buff heads: one sized for a single
 * struct sk_buff, and one sized for a pair of them plus an atomic_t.
 * The return values are not checked here; both caches are requested with
 * SLAB_PANIC.
 */
void __init skb_init(void)
{
	const unsigned long cache_flags = SLAB_HWCACHE_ALIGN | SLAB_PANIC;
	const size_t head_size = sizeof(struct sk_buff);
	const size_t fclone_size = (2 * sizeof(struct sk_buff)) +
				   sizeof(atomic_t);

	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
					      head_size, 0,
					      cache_flags, NULL);

	skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
						fclone_size, 0,
						cache_flags, NULL);
}
|
|
|
|
|
2007-04-03 11:19:53 +08:00
|
|
|
/**
|
|
|
|
* skb_to_sgvec - Fill a scatter-gather list from a socket buffer
|
|
|
|
* @skb: Socket buffer containing the buffers to be mapped
|
|
|
|
* @sg: The scatter-gather list to map into
|
|
|
|
* @offset: The offset into the buffer's contents to start mapping
|
|
|
|
* @len: Length of buffer space to be mapped
|
|
|
|
*
|
|
|
|
* Fill the specified scatter-gather list with mappings/pointers into a
|
|
|
|
* region of the buffer space attached to a socket buffer.
|
|
|
|
*/
|
2007-10-31 12:29:29 +08:00
|
|
|
static int
|
|
|
|
__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
|
2007-04-03 11:19:53 +08:00
|
|
|
{
|
2007-04-28 06:21:23 +08:00
|
|
|
int start = skb_headlen(skb);
|
|
|
|
int i, copy = start - offset;
|
2009-06-09 15:18:59 +08:00
|
|
|
struct sk_buff *frag_iter;
|
2007-04-03 11:19:53 +08:00
|
|
|
int elt = 0;
|
|
|
|
|
|
|
|
if (copy > 0) {
|
|
|
|
if (copy > len)
|
|
|
|
copy = len;
|
2007-10-24 17:20:47 +08:00
|
|
|
sg_set_buf(sg, skb->data + offset, copy);
|
2007-04-03 11:19:53 +08:00
|
|
|
elt++;
|
|
|
|
if ((len -= copy) == 0)
|
|
|
|
return elt;
|
|
|
|
offset += copy;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
|
2007-04-28 06:21:23 +08:00
|
|
|
int end;
|
2007-04-03 11:19:53 +08:00
|
|
|
|
2008-07-26 12:43:18 +08:00
|
|
|
WARN_ON(start > offset + len);
|
2007-04-28 06:21:23 +08:00
|
|
|
|
2011-10-19 05:00:24 +08:00
|
|
|
end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
|
2007-04-03 11:19:53 +08:00
|
|
|
if ((copy = end - offset) > 0) {
|
|
|
|
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
|
|
|
|
|
|
|
|
if (copy > len)
|
|
|
|
copy = len;
|
2011-08-23 07:44:58 +08:00
|
|
|
sg_set_page(&sg[elt], skb_frag_page(frag), copy,
|
2007-10-24 17:20:47 +08:00
|
|
|
frag->page_offset+offset-start);
|
2007-04-03 11:19:53 +08:00
|
|
|
elt++;
|
|
|
|
if (!(len -= copy))
|
|
|
|
return elt;
|
|
|
|
offset += copy;
|
|
|
|
}
|
2007-04-28 06:21:23 +08:00
|
|
|
start = end;
|
2007-04-03 11:19:53 +08:00
|
|
|
}
|
|
|
|
|
2009-06-09 15:18:59 +08:00
|
|
|
skb_walk_frags(skb, frag_iter) {
|
|
|
|
int end;
|
2007-04-28 06:21:23 +08:00
|
|
|
|
2009-06-09 15:18:59 +08:00
|
|
|
WARN_ON(start > offset + len);
|
2007-04-03 11:19:53 +08:00
|
|
|
|
2009-06-09 15:18:59 +08:00
|
|
|
end = start + frag_iter->len;
|
|
|
|
if ((copy = end - offset) > 0) {
|
|
|
|
if (copy > len)
|
|
|
|
copy = len;
|
|
|
|
elt += __skb_to_sgvec(frag_iter, sg+elt, offset - start,
|
|
|
|
copy);
|
|
|
|
if ((len -= copy) == 0)
|
|
|
|
return elt;
|
|
|
|
offset += copy;
|
2007-04-03 11:19:53 +08:00
|
|
|
}
|
2009-06-09 15:18:59 +08:00
|
|
|
start = end;
|
2007-04-03 11:19:53 +08:00
|
|
|
}
|
|
|
|
BUG_ON(len);
|
|
|
|
return elt;
|
|
|
|
}
|
|
|
|
|
2007-10-31 12:29:29 +08:00
|
|
|
int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
|
|
|
|
{
|
|
|
|
int nsg = __skb_to_sgvec(skb, sg, offset, len);
|
|
|
|
|
2007-10-31 19:06:37 +08:00
|
|
|
sg_mark_end(&sg[nsg - 1]);
|
2007-10-31 12:29:29 +08:00
|
|
|
|
|
|
|
return nsg;
|
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL_GPL(skb_to_sgvec);
|
2007-10-31 12:29:29 +08:00
|
|
|
|
2007-04-03 11:19:53 +08:00
|
|
|
/**
|
|
|
|
* skb_cow_data - Check that a socket buffer's data buffers are writable
|
|
|
|
* @skb: The socket buffer to check.
|
|
|
|
* @tailbits: Amount of trailing space to be added
|
|
|
|
* @trailer: Returned pointer to the skb where the @tailbits space begins
|
|
|
|
*
|
|
|
|
* Make sure that the data buffers attached to a socket buffer are
|
|
|
|
* writable. If they are not, private copies are made of the data buffers
|
|
|
|
* and the socket buffer is set to use these instead.
|
|
|
|
*
|
|
|
|
* If @tailbits is given, make sure that there is space to write @tailbits
|
|
|
|
* bytes of data beyond current end of socket buffer. @trailer will be
|
|
|
|
* set to point to the skb in which this space begins.
|
|
|
|
*
|
|
|
|
* The number of scatterlist elements required to completely map the
|
|
|
|
* COW'd and extended socket buffer will be returned.
|
|
|
|
*/
|
|
|
|
int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
|
|
|
|
{
|
|
|
|
int copyflag;
|
|
|
|
int elt;
|
|
|
|
struct sk_buff *skb1, **skb_p;
|
|
|
|
|
|
|
|
/* If skb is cloned or its head is paged, reallocate
|
|
|
|
* head pulling out all the pages (pages are considered not writable
|
|
|
|
* at the moment even if they are anonymous).
|
|
|
|
*/
|
|
|
|
if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
|
|
|
|
__pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
/* Easy case. Most of packets will go this way. */
|
2010-08-23 15:13:46 +08:00
|
|
|
if (!skb_has_frag_list(skb)) {
|
2007-04-03 11:19:53 +08:00
|
|
|
/* A little of trouble, not enough of space for trailer.
|
|
|
|
* This should not happen, when stack is tuned to generate
|
|
|
|
* good frames. OK, on miss we reallocate and reserve even more
|
|
|
|
* space, 128 bytes is fair. */
|
|
|
|
|
|
|
|
if (skb_tailroom(skb) < tailbits &&
|
|
|
|
pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
/* Voila! */
|
|
|
|
*trailer = skb;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Misery. We are in troubles, going to mincer fragments... */
|
|
|
|
|
|
|
|
elt = 1;
|
|
|
|
skb_p = &skb_shinfo(skb)->frag_list;
|
|
|
|
copyflag = 0;
|
|
|
|
|
|
|
|
while ((skb1 = *skb_p) != NULL) {
|
|
|
|
int ntail = 0;
|
|
|
|
|
|
|
|
/* The fragment is partially pulled by someone,
|
|
|
|
* this can happen on input. Copy it and everything
|
|
|
|
* after it. */
|
|
|
|
|
|
|
|
if (skb_shared(skb1))
|
|
|
|
copyflag = 1;
|
|
|
|
|
|
|
|
/* If the skb is the last, worry about trailer. */
|
|
|
|
|
|
|
|
if (skb1->next == NULL && tailbits) {
|
|
|
|
if (skb_shinfo(skb1)->nr_frags ||
|
2010-08-23 15:13:46 +08:00
|
|
|
skb_has_frag_list(skb1) ||
|
2007-04-03 11:19:53 +08:00
|
|
|
skb_tailroom(skb1) < tailbits)
|
|
|
|
ntail = tailbits + 128;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (copyflag ||
|
|
|
|
skb_cloned(skb1) ||
|
|
|
|
ntail ||
|
|
|
|
skb_shinfo(skb1)->nr_frags ||
|
2010-08-23 15:13:46 +08:00
|
|
|
skb_has_frag_list(skb1)) {
|
2007-04-03 11:19:53 +08:00
|
|
|
struct sk_buff *skb2;
|
|
|
|
|
|
|
|
/* Fuck, we are miserable poor guys... */
|
|
|
|
if (ntail == 0)
|
|
|
|
skb2 = skb_copy(skb1, GFP_ATOMIC);
|
|
|
|
else
|
|
|
|
skb2 = skb_copy_expand(skb1,
|
|
|
|
skb_headroom(skb1),
|
|
|
|
ntail,
|
|
|
|
GFP_ATOMIC);
|
|
|
|
if (unlikely(skb2 == NULL))
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
if (skb1->sk)
|
|
|
|
skb_set_owner_w(skb2, skb1->sk);
|
|
|
|
|
|
|
|
/* Looking around. Are we still alive?
|
|
|
|
* OK, link new skb, drop old one */
|
|
|
|
|
|
|
|
skb2->next = skb1->next;
|
|
|
|
*skb_p = skb2;
|
|
|
|
kfree_skb(skb1);
|
|
|
|
skb1 = skb2;
|
|
|
|
}
|
|
|
|
elt++;
|
|
|
|
*trailer = skb1;
|
|
|
|
skb_p = &skb1->next;
|
|
|
|
}
|
|
|
|
|
|
|
|
return elt;
|
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL_GPL(skb_cow_data);
|
2007-04-03 11:19:53 +08:00
|
|
|
|
2010-06-01 14:44:05 +08:00
|
|
|
static void sock_rmem_free(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
struct sock *sk = skb->sk;
|
|
|
|
|
|
|
|
atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Note: We dont mem charge error packets (no sk_forward_alloc changes)
|
|
|
|
*/
|
|
|
|
int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
|
|
|
|
{
|
2012-04-06 16:49:10 +08:00
|
|
|
int len = skb->len;
|
|
|
|
|
2010-06-01 14:44:05 +08:00
|
|
|
if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
|
2012-04-15 13:58:06 +08:00
|
|
|
(unsigned int)sk->sk_rcvbuf)
|
2010-06-01 14:44:05 +08:00
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
skb_orphan(skb);
|
|
|
|
skb->sk = sk;
|
|
|
|
skb->destructor = sock_rmem_free;
|
|
|
|
atomic_add(skb->truesize, &sk->sk_rmem_alloc);
|
|
|
|
|
2011-05-18 14:21:31 +08:00
|
|
|
/* before exiting rcu section, make sure dst is refcounted */
|
|
|
|
skb_dst_force(skb);
|
|
|
|
|
2010-06-01 14:44:05 +08:00
|
|
|
skb_queue_tail(&sk->sk_error_queue, skb);
|
|
|
|
if (!sock_flag(sk, SOCK_DEAD))
|
2012-04-06 16:49:10 +08:00
|
|
|
sk->sk_data_ready(sk, len);
|
2010-06-01 14:44:05 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(sock_queue_err_skb);
|
|
|
|
|
2009-02-12 13:03:37 +08:00
|
|
|
void skb_tstamp_tx(struct sk_buff *orig_skb,
|
|
|
|
struct skb_shared_hwtstamps *hwtstamps)
|
|
|
|
{
|
|
|
|
struct sock *sk = orig_skb->sk;
|
|
|
|
struct sock_exterr_skb *serr;
|
|
|
|
struct sk_buff *skb;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
if (!sk)
|
|
|
|
return;
|
|
|
|
|
|
|
|
skb = skb_clone(orig_skb, GFP_ATOMIC);
|
|
|
|
if (!skb)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (hwtstamps) {
|
|
|
|
*skb_hwtstamps(skb) =
|
|
|
|
*hwtstamps;
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* no hardware time stamps available,
|
2010-08-17 16:59:14 +08:00
|
|
|
* so keep the shared tx_flags and only
|
2009-02-12 13:03:37 +08:00
|
|
|
* store software time stamp
|
|
|
|
*/
|
|
|
|
skb->tstamp = ktime_get_real();
|
|
|
|
}
|
|
|
|
|
|
|
|
serr = SKB_EXT_ERR(skb);
|
|
|
|
memset(serr, 0, sizeof(*serr));
|
|
|
|
serr->ee.ee_errno = ENOMSG;
|
|
|
|
serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
|
2010-05-29 15:20:48 +08:00
|
|
|
|
2009-02-12 13:03:37 +08:00
|
|
|
err = sock_queue_err_skb(sk, skb);
|
2010-05-29 15:20:48 +08:00
|
|
|
|
2009-02-12 13:03:37 +08:00
|
|
|
if (err)
|
|
|
|
kfree_skb(skb);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(skb_tstamp_tx);
|
|
|
|
|
2011-11-09 17:15:42 +08:00
|
|
|
/*
 * Record the wifi ack status in @skb and deliver @skb itself to the error
 * queue of its socket with origin SO_EE_ORIGIN_TXSTATUS.  The skb is freed
 * here if the socket refuses it.
 */
void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
{
	struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
	struct sock *sk = skb->sk;

	skb->wifi_acked_valid = 1;
	skb->wifi_acked = acked;

	memset(serr, 0, sizeof(*serr));
	serr->ee.ee_errno = ENOMSG;
	serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;

	if (sock_queue_err_skb(sk, skb))
		kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
|
|
|
|
|
2009-02-12 13:03:37 +08:00
|
|
|
|
2008-02-05 12:49:54 +08:00
|
|
|
/**
|
|
|
|
* skb_partial_csum_set - set up and verify partial csum values for packet
|
|
|
|
* @skb: the skb to set
|
|
|
|
* @start: the number of bytes after skb->data to start checksumming.
|
|
|
|
* @off: the offset from start to place the checksum.
|
|
|
|
*
|
|
|
|
* For untrusted partially-checksummed packets, we need to make sure the values
|
|
|
|
* for skb->csum_start and skb->csum_offset are valid so we don't oops.
|
|
|
|
*
|
|
|
|
* This function checks and sets those values and skb->ip_summed: if this
|
|
|
|
* returns false you should drop the packet.
|
|
|
|
*/
|
|
|
|
bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
|
|
|
|
{
|
2009-06-04 09:22:01 +08:00
|
|
|
if (unlikely(start > skb_headlen(skb)) ||
|
|
|
|
unlikely((int)start + off > skb_headlen(skb) - 2)) {
|
2012-05-14 05:56:26 +08:00
|
|
|
net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n",
|
|
|
|
start, off, skb_headlen(skb));
|
2008-02-05 12:49:54 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
skb->ip_summed = CHECKSUM_PARTIAL;
|
|
|
|
skb->csum_start = skb_headroom(skb) + start;
|
|
|
|
skb->csum_offset = off;
|
|
|
|
return true;
|
|
|
|
}
|
2009-02-10 18:09:24 +08:00
|
|
|
EXPORT_SYMBOL_GPL(skb_partial_csum_set);
|
2008-02-05 12:49:54 +08:00
|
|
|
|
2008-06-20 07:22:28 +08:00
|
|
|
void __skb_warn_lro_forwarding(const struct sk_buff *skb)
|
|
|
|
{
|
2012-05-14 05:56:26 +08:00
|
|
|
net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
|
|
|
|
skb->dev->name);
|
2008-06-20 07:22:28 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(__skb_warn_lro_forwarding);
|
2012-05-19 11:02:02 +08:00
|
|
|
|
|
|
|
/*
 * Free @skb.  When @head_stolen is true only the head state is released
 * and the sk_buff structure itself is returned to skbuff_head_cache;
 * otherwise the skb is freed through __kfree_skb().
 */
void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
{
	if (!head_stolen) {
		__kfree_skb(skb);
		return;
	}

	skb_release_head_state(skb);
	kmem_cache_free(skbuff_head_cache, skb);
}
EXPORT_SYMBOL(kfree_skb_partial);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* skb_try_coalesce - try to merge skb to prior one
|
|
|
|
* @to: prior buffer
|
|
|
|
* @from: buffer to add
|
|
|
|
* @fragstolen: pointer to boolean
|
2012-06-08 22:01:44 +08:00
|
|
|
* @delta_truesize: how much more was allocated than was requested
|
2012-05-19 11:02:02 +08:00
|
|
|
*/
|
|
|
|
bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
|
|
|
|
bool *fragstolen, int *delta_truesize)
|
|
|
|
{
|
|
|
|
int i, delta, len = from->len;
|
|
|
|
|
|
|
|
*fragstolen = false;
|
|
|
|
|
|
|
|
if (skb_cloned(to))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (len <= skb_tailroom(to)) {
|
|
|
|
BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
|
|
|
|
*delta_truesize = 0;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (skb_has_frag_list(to) || skb_has_frag_list(from))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (skb_headlen(from) != 0) {
|
|
|
|
struct page *page;
|
|
|
|
unsigned int offset;
|
|
|
|
|
|
|
|
if (skb_shinfo(to)->nr_frags +
|
|
|
|
skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (skb_head_is_locked(from))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
|
|
|
|
|
|
|
|
page = virt_to_head_page(from->head);
|
|
|
|
offset = from->data - (unsigned char *)page_address(page);
|
|
|
|
|
|
|
|
skb_fill_page_desc(to, skb_shinfo(to)->nr_frags,
|
|
|
|
page, offset, skb_headlen(from));
|
|
|
|
*fragstolen = true;
|
|
|
|
} else {
|
|
|
|
if (skb_shinfo(to)->nr_frags +
|
|
|
|
skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS)
|
|
|
|
return false;
|
|
|
|
|
2012-09-29 04:15:30 +08:00
|
|
|
delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from));
|
2012-05-19 11:02:02 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
WARN_ON_ONCE(delta < len);
|
|
|
|
|
|
|
|
memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags,
|
|
|
|
skb_shinfo(from)->frags,
|
|
|
|
skb_shinfo(from)->nr_frags * sizeof(skb_frag_t));
|
|
|
|
skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags;
|
|
|
|
|
|
|
|
if (!skb_cloned(from))
|
|
|
|
skb_shinfo(from)->nr_frags = 0;
|
|
|
|
|
2012-09-19 00:53:21 +08:00
|
|
|
/* if the skb is not cloned this does nothing
|
|
|
|
* since we set nr_frags to 0.
|
|
|
|
*/
|
2012-05-19 11:02:02 +08:00
|
|
|
for (i = 0; i < skb_shinfo(from)->nr_frags; i++)
|
|
|
|
skb_frag_ref(from, i);
|
|
|
|
|
|
|
|
to->truesize += delta;
|
|
|
|
to->len += len;
|
|
|
|
to->data_len += len;
|
|
|
|
|
|
|
|
*delta_truesize = delta;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(skb_try_coalesce);
|