2007-10-15 17:24:19 +08:00
|
|
|
#ifndef __NET_FRAG_H__
|
|
|
|
#define __NET_FRAG_H__
|
|
|
|
|
net: use lib/percpu_counter API for fragmentation mem accounting
Replace the per network namespace shared atomic "mem" accounting
variable, in the fragmentation code, with a lib/percpu_counter.
Getting percpu_counter to scale to the fragmentation code usage
requires some tweaks.
At first view, percpu_counter looks superfast, but it does not
scale on multi-CPU/NUMA machines, because the default batch size
is too small, for frag code usage. Thus, I have adjusted the
batch size by using __percpu_counter_add() directly, instead of
percpu_counter_sub() and percpu_counter_add().
The batch size is increased to 130.000, based on the largest 64K
fragment memory usage. This does introduce some imprecise
memory accounting, but it does not need to be strict for this
use-case.
It is also essential, that the percpu_counter, does not
share cacheline with other writers, to make this scale.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-29 07:45:33 +08:00
|
|
|
#include <linux/percpu_counter.h>
|
|
|
|
|
2008-01-22 22:02:14 +08:00
|
|
|
struct netns_frags {
|
2008-01-22 22:06:23 +08:00
|
|
|
int nqueues;
|
2008-01-22 22:11:48 +08:00
|
|
|
struct list_head lru_list;
|
2013-01-29 07:45:51 +08:00
|
|
|
spinlock_t lru_lock;
|
2008-01-22 22:09:37 +08:00
|
|
|
|
net: use lib/percpu_counter API for fragmentation mem accounting
Replace the per network namespace shared atomic "mem" accounting
variable, in the fragmentation code, with a lib/percpu_counter.
Getting percpu_counter to scale to the fragmentation code usage
requires some tweaks.
At first view, percpu_counter looks superfast, but it does not
scale on multi-CPU/NUMA machines, because the default batch size
is too small, for frag code usage. Thus, I have adjusted the
batch size by using __percpu_counter_add() directly, instead of
percpu_counter_sub() and percpu_counter_add().
The batch size is increased to 130.000, based on the largest 64K
fragment memory usage. This does introduce some imprecise
memory accounting, but its does not need to be strict for this
use-case.
It is also essential, that the percpu_counter, does not
share cacheline with other writers, to make this scale.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-29 07:45:33 +08:00
|
|
|
/* The percpu_counter "mem" need to be cacheline aligned.
|
|
|
|
* mem.count must not share cacheline with other writers
|
2013-01-29 07:44:14 +08:00
|
|
|
*/
|
net: use lib/percpu_counter API for fragmentation mem accounting
Replace the per network namespace shared atomic "mem" accounting
variable, in the fragmentation code, with a lib/percpu_counter.
Getting percpu_counter to scale to the fragmentation code usage
requires some tweaks.
At first view, percpu_counter looks superfast, but it does not
scale on multi-CPU/NUMA machines, because the default batch size
is too small, for frag code usage. Thus, I have adjusted the
batch size by using __percpu_counter_add() directly, instead of
percpu_counter_sub() and percpu_counter_add().
The batch size is increased to 130.000, based on the largest 64K
fragment memory usage. This does introduce some imprecise
memory accounting, but its does not need to be strict for this
use-case.
It is also essential, that the percpu_counter, does not
share cacheline with other writers, to make this scale.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-29 07:45:33 +08:00
|
|
|
struct percpu_counter mem ____cacheline_aligned_in_smp;
|
|
|
|
|
2008-01-22 22:09:37 +08:00
|
|
|
/* sysctls */
|
|
|
|
int timeout;
|
2008-01-22 22:10:13 +08:00
|
|
|
int high_thresh;
|
|
|
|
int low_thresh;
|
2008-01-22 22:02:14 +08:00
|
|
|
};
|
|
|
|
|
2007-10-15 17:24:19 +08:00
|
|
|
struct inet_frag_queue {
|
|
|
|
spinlock_t lock;
|
|
|
|
struct timer_list timer; /* when will this queue expire? */
|
2013-01-29 07:44:49 +08:00
|
|
|
struct list_head lru_list; /* lru list member */
|
|
|
|
struct hlist_node list;
|
|
|
|
atomic_t refcnt;
|
2007-10-15 17:24:19 +08:00
|
|
|
struct sk_buff *fragments; /* list of received fragments */
|
2010-06-29 12:39:37 +08:00
|
|
|
struct sk_buff *fragments_tail;
|
2007-10-15 17:24:19 +08:00
|
|
|
ktime_t stamp;
|
|
|
|
int len; /* total length of orig datagram */
|
|
|
|
int meat;
|
|
|
|
__u8 last_in; /* first/last segment arrived? */
|
|
|
|
|
2008-03-29 07:35:27 +08:00
|
|
|
#define INET_FRAG_COMPLETE 4
|
|
|
|
#define INET_FRAG_FIRST_IN 2
|
|
|
|
#define INET_FRAG_LAST_IN 1
|
2012-08-27 01:13:55 +08:00
|
|
|
|
|
|
|
u16 max_size;
|
2013-01-29 07:44:49 +08:00
|
|
|
|
|
|
|
struct netns_frags *net;
|
2007-10-15 17:24:19 +08:00
|
|
|
};
|
|
|
|
|
2007-10-15 17:31:52 +08:00
|
|
|
#define INETFRAGS_HASHSZ 64
|
|
|
|
|
|
|
|
struct inet_frags {
|
|
|
|
struct hlist_head hash[INETFRAGS_HASHSZ];
|
2013-01-29 07:44:37 +08:00
|
|
|
/* This rwlock is a global lock (seperate per IPv4, IPv6 and
|
|
|
|
* netfilter). Important to keep this on a seperate cacheline.
|
|
|
|
*/
|
|
|
|
rwlock_t lock ____cacheline_aligned_in_smp;
|
2008-01-22 22:11:04 +08:00
|
|
|
int secret_interval;
|
2007-10-15 17:31:52 +08:00
|
|
|
struct timer_list secret_timer;
|
2013-01-29 07:44:37 +08:00
|
|
|
u32 rnd;
|
|
|
|
int qsize;
|
2007-10-15 17:38:08 +08:00
|
|
|
|
|
|
|
unsigned int (*hashfn)(struct inet_frag_queue *);
|
2013-01-29 07:44:37 +08:00
|
|
|
bool (*match)(struct inet_frag_queue *q, void *arg);
|
2007-10-18 10:46:47 +08:00
|
|
|
void (*constructor)(struct inet_frag_queue *q,
|
|
|
|
void *arg);
|
2007-10-15 17:39:14 +08:00
|
|
|
void (*destructor)(struct inet_frag_queue *);
|
|
|
|
void (*skb_free)(struct sk_buff *);
|
2007-10-18 10:45:23 +08:00
|
|
|
void (*frag_expire)(unsigned long data);
|
2007-10-15 17:31:52 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
void inet_frags_init(struct inet_frags *);
|
|
|
|
void inet_frags_fini(struct inet_frags *);
|
|
|
|
|
2008-01-22 22:06:23 +08:00
|
|
|
void inet_frags_init_net(struct netns_frags *nf);
|
2008-01-22 22:12:39 +08:00
|
|
|
void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
|
2008-01-22 22:06:23 +08:00
|
|
|
|
2007-10-15 17:37:18 +08:00
|
|
|
void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
|
2007-10-15 17:39:14 +08:00
|
|
|
void inet_frag_destroy(struct inet_frag_queue *q,
|
|
|
|
struct inet_frags *f, int *work);
|
2012-09-19 00:50:11 +08:00
|
|
|
int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force);
|
2008-01-22 22:02:14 +08:00
|
|
|
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
|
2009-02-25 18:32:52 +08:00
|
|
|
struct inet_frags *f, void *key, unsigned int hash)
|
|
|
|
__releases(&f->lock);
|
2007-10-15 17:37:18 +08:00
|
|
|
|
2007-10-15 17:41:56 +08:00
|
|
|
static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f)
|
|
|
|
{
|
|
|
|
if (atomic_dec_and_test(&q->refcnt))
|
|
|
|
inet_frag_destroy(q, f, NULL);
|
|
|
|
}
|
|
|
|
|
2013-01-29 07:45:12 +08:00
|
|
|
/* Memory Tracking Functions. */
|
|
|
|
|
net: use lib/percpu_counter API for fragmentation mem accounting
Replace the per network namespace shared atomic "mem" accounting
variable, in the fragmentation code, with a lib/percpu_counter.
Getting percpu_counter to scale to the fragmentation code usage
requires some tweaks.
At first view, percpu_counter looks superfast, but it does not
scale on multi-CPU/NUMA machines, because the default batch size
is too small, for frag code usage. Thus, I have adjusted the
batch size by using __percpu_counter_add() directly, instead of
percpu_counter_sub() and percpu_counter_add().
The batch size is increased to 130.000, based on the largest 64K
fragment memory usage. This does introduce some imprecise
memory accounting, but it does not need to be strict for this
use-case.
It is also essential, that the percpu_counter, does not
share cacheline with other writers, to make this scale.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-29 07:45:33 +08:00
|
|
|
/* Batch size handed to __percpu_counter_add() by the accounting helpers.
 *
 * The default percpu_counter batch size is not big enough to scale to
 * fragmentation mem acct sizes. The mem size of a 64K fragment is approx:
 *  (44 fragments * 2944 truesize) + frag_queue struct(200) = 129736 bytes
 */
static unsigned int frag_percpu_counter_batch = 130000;
|
|
|
|
|
2013-01-29 07:45:12 +08:00
|
|
|
static inline int frag_mem_limit(struct netns_frags *nf)
|
|
|
|
{
|
net: use lib/percpu_counter API for fragmentation mem accounting
Replace the per network namespace shared atomic "mem" accounting
variable, in the fragmentation code, with a lib/percpu_counter.
Getting percpu_counter to scale to the fragmentation code usage
requires some tweaks.
At first view, percpu_counter looks superfast, but it does not
scale on multi-CPU/NUMA machines, because the default batch size
is too small, for frag code usage. Thus, I have adjusted the
batch size by using __percpu_counter_add() directly, instead of
percpu_counter_sub() and percpu_counter_add().
The batch size is increased to 130.000, based on the largest 64K
fragment memory usage. This does introduce some imprecise
memory accounting, but its does not need to be strict for this
use-case.
It is also essential, that the percpu_counter, does not
share cacheline with other writers, to make this scale.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-29 07:45:33 +08:00
|
|
|
return percpu_counter_read(&nf->mem);
|
2013-01-29 07:45:12 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void sub_frag_mem_limit(struct inet_frag_queue *q, int i)
|
|
|
|
{
|
net: use lib/percpu_counter API for fragmentation mem accounting
Replace the per network namespace shared atomic "mem" accounting
variable, in the fragmentation code, with a lib/percpu_counter.
Getting percpu_counter to scale to the fragmentation code usage
requires some tweaks.
At first view, percpu_counter looks superfast, but it does not
scale on multi-CPU/NUMA machines, because the default batch size
is too small, for frag code usage. Thus, I have adjusted the
batch size by using __percpu_counter_add() directly, instead of
percpu_counter_sub() and percpu_counter_add().
The batch size is increased to 130.000, based on the largest 64K
fragment memory usage. This does introduce some imprecise
memory accounting, but its does not need to be strict for this
use-case.
It is also essential, that the percpu_counter, does not
share cacheline with other writers, to make this scale.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-29 07:45:33 +08:00
|
|
|
__percpu_counter_add(&q->net->mem, -i, frag_percpu_counter_batch);
|
2013-01-29 07:45:12 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void add_frag_mem_limit(struct inet_frag_queue *q, int i)
|
|
|
|
{
|
net: use lib/percpu_counter API for fragmentation mem accounting
Replace the per network namespace shared atomic "mem" accounting
variable, in the fragmentation code, with a lib/percpu_counter.
Getting percpu_counter to scale to the fragmentation code usage
requires some tweaks.
At first view, percpu_counter looks superfast, but it does not
scale on multi-CPU/NUMA machines, because the default batch size
is too small, for frag code usage. Thus, I have adjusted the
batch size by using __percpu_counter_add() directly, instead of
percpu_counter_sub() and percpu_counter_add().
The batch size is increased to 130.000, based on the largest 64K
fragment memory usage. This does introduce some imprecise
memory accounting, but its does not need to be strict for this
use-case.
It is also essential, that the percpu_counter, does not
share cacheline with other writers, to make this scale.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-29 07:45:33 +08:00
|
|
|
__percpu_counter_add(&q->net->mem, i, frag_percpu_counter_batch);
|
2013-01-29 07:45:12 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void init_frag_mem_limit(struct netns_frags *nf)
|
|
|
|
{
|
net: use lib/percpu_counter API for fragmentation mem accounting
Replace the per network namespace shared atomic "mem" accounting
variable, in the fragmentation code, with a lib/percpu_counter.
Getting percpu_counter to scale to the fragmentation code usage
requires some tweaks.
At first view, percpu_counter looks superfast, but it does not
scale on multi-CPU/NUMA machines, because the default batch size
is too small, for frag code usage. Thus, I have adjusted the
batch size by using __percpu_counter_add() directly, instead of
percpu_counter_sub() and percpu_counter_add().
The batch size is increased to 130.000, based on the largest 64K
fragment memory usage. This does introduce some imprecise
memory accounting, but its does not need to be strict for this
use-case.
It is also essential, that the percpu_counter, does not
share cacheline with other writers, to make this scale.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-29 07:45:33 +08:00
|
|
|
percpu_counter_init(&nf->mem, 0);
|
2013-01-29 07:45:12 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int sum_frag_mem_limit(struct netns_frags *nf)
|
|
|
|
{
|
net: use lib/percpu_counter API for fragmentation mem accounting
Replace the per network namespace shared atomic "mem" accounting
variable, in the fragmentation code, with a lib/percpu_counter.
Getting percpu_counter to scale to the fragmentation code usage
requires some tweaks.
At first view, percpu_counter looks superfast, but it does not
scale on multi-CPU/NUMA machines, because the default batch size
is too small, for frag code usage. Thus, I have adjusted the
batch size by using __percpu_counter_add() directly, instead of
percpu_counter_sub() and percpu_counter_add().
The batch size is increased to 130.000, based on the largest 64K
fragment memory usage. This does introduce some imprecise
memory accounting, but its does not need to be strict for this
use-case.
It is also essential, that the percpu_counter, does not
share cacheline with other writers, to make this scale.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-01-29 07:45:33 +08:00
|
|
|
return percpu_counter_sum_positive(&nf->mem);
|
2013-01-29 07:45:12 +08:00
|
|
|
}
|
|
|
|
|
2013-01-29 07:45:51 +08:00
|
|
|
static inline void inet_frag_lru_move(struct inet_frag_queue *q)
|
|
|
|
{
|
|
|
|
spin_lock(&q->net->lru_lock);
|
|
|
|
list_move_tail(&q->lru_list, &q->net->lru_list);
|
|
|
|
spin_unlock(&q->net->lru_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void inet_frag_lru_del(struct inet_frag_queue *q)
|
|
|
|
{
|
|
|
|
spin_lock(&q->net->lru_lock);
|
|
|
|
list_del(&q->lru_list);
|
|
|
|
spin_unlock(&q->net->lru_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void inet_frag_lru_add(struct netns_frags *nf,
|
|
|
|
struct inet_frag_queue *q)
|
|
|
|
{
|
|
|
|
spin_lock(&nf->lru_lock);
|
|
|
|
list_add_tail(&q->lru_list, &nf->lru_list);
|
|
|
|
spin_unlock(&nf->lru_lock);
|
|
|
|
}
|
2007-10-15 17:24:19 +08:00
|
|
|
#endif
|