2007-10-15 17:24:19 +08:00
|
|
|
#ifndef __NET_FRAG_H__
|
|
|
|
#define __NET_FRAG_H__
|
|
|
|
|
2008-01-22 22:02:14 +08:00
|
|
|
struct netns_frags {
|
Revert "net: use lib/percpu_counter API for fragmentation mem accounting"
This reverts commit 6d7b857d541ecd1d9bd997c97242d4ef94b19de2.
There is a bug in fragmentation codes use of the percpu_counter API,
that can cause issues on systems with many CPUs.
The frag_mem_limit() just reads the global counter (fbc->count),
without considering other CPUs can have upto batch size (130K) that
haven't been subtracted yet. Due to the 3MBytes lower thresh limit,
this become dangerous at >=24 CPUs (3*1024*1024/130000=24).
The correct API usage would be to use __percpu_counter_compare() which
does the right thing, and takes into account the number of (online)
CPUs and batch size, to account for this and call __percpu_counter_sum()
when needed.
We choose to revert the use of the lib/percpu_counter API for frag
memory accounting for several reasons:
1) On systems with CPUs > 24, the heavier fully locked
__percpu_counter_sum() is always invoked, which will be more
expensive than the atomic_t that is reverted to.
Given systems with more than 24 CPUs are becoming common this doesn't
seem like a good option. To mitigate this, the batch size could be
decreased and thresh be increased.
2) The add_frag_mem_limit+sub_frag_mem_limit pairs happen on the RX
CPU, before SKBs are pushed into sockets on remote CPUs. Given
NICs can only hash on L2 part of the IP-header, the NIC-RXq's will
likely be limited. Thus, a fair chance that atomic add+dec happen
on the same CPU.
Revert note that commit 1d6119baf061 ("net: fix percpu memory leaks")
removed init_frag_mem_limit() and instead use inet_frags_init_net().
After this revert, inet_frags_uninit_net() becomes empty.
Fixes: 6d7b857d541e ("net: use lib/percpu_counter API for fragmentation mem accounting")
Fixes: 1d6119baf061 ("net: fix percpu memory leaks")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-09-01 17:26:08 +08:00
|
|
|
/* Keep atomic mem on separate cachelines in structs that include it */
|
|
|
|
atomic_t mem ____cacheline_aligned_in_smp;
|
2008-01-22 22:09:37 +08:00
|
|
|
/* sysctls */
|
|
|
|
int timeout;
|
2008-01-22 22:10:13 +08:00
|
|
|
int high_thresh;
|
|
|
|
int low_thresh;
|
2016-02-15 18:11:31 +08:00
|
|
|
int max_dist;
|
2008-01-22 22:02:14 +08:00
|
|
|
};
|
|
|
|
|
2014-08-01 18:29:45 +08:00
|
|
|
/**
|
|
|
|
* fragment queue flags
|
|
|
|
*
|
|
|
|
* @INET_FRAG_FIRST_IN: first fragment has arrived
|
|
|
|
* @INET_FRAG_LAST_IN: final fragment has arrived
|
|
|
|
* @INET_FRAG_COMPLETE: frag queue has been processed and is due for destruction
|
|
|
|
*/
|
|
|
|
enum {
|
|
|
|
INET_FRAG_FIRST_IN = BIT(0),
|
|
|
|
INET_FRAG_LAST_IN = BIT(1),
|
|
|
|
INET_FRAG_COMPLETE = BIT(2),
|
|
|
|
};
|
|
|
|
|
|
|
|
/**
|
|
|
|
* struct inet_frag_queue - fragment queue
|
|
|
|
*
|
|
|
|
* @lock: spinlock protecting the queue
|
|
|
|
* @timer: queue expiration timer
|
|
|
|
* @list: hash bucket list
|
|
|
|
* @refcnt: reference count of the queue
|
|
|
|
* @fragments: received fragments head
|
|
|
|
* @fragments_tail: received fragments tail
|
|
|
|
* @stamp: timestamp of the last received fragment
|
|
|
|
* @len: total length of the original datagram
|
|
|
|
* @meat: length of received fragments so far
|
|
|
|
* @flags: fragment queue flags
|
2015-05-22 22:32:51 +08:00
|
|
|
* @max_size: maximum received fragment size
|
2014-08-01 18:29:45 +08:00
|
|
|
* @net: namespace that this frag belongs to
|
2015-07-23 18:05:37 +08:00
|
|
|
* @list_evictor: list of queues to forcefully evict (e.g. due to low memory)
|
2014-08-01 18:29:45 +08:00
|
|
|
*/
|
2007-10-15 17:24:19 +08:00
|
|
|
struct inet_frag_queue {
|
|
|
|
spinlock_t lock;
|
2014-08-01 18:29:45 +08:00
|
|
|
struct timer_list timer;
|
2013-01-29 07:44:49 +08:00
|
|
|
struct hlist_node list;
|
2017-06-30 18:08:07 +08:00
|
|
|
refcount_t refcnt;
|
2014-08-01 18:29:45 +08:00
|
|
|
struct sk_buff *fragments;
|
2010-06-29 12:39:37 +08:00
|
|
|
struct sk_buff *fragments_tail;
|
2007-10-15 17:24:19 +08:00
|
|
|
ktime_t stamp;
|
2014-08-01 18:29:45 +08:00
|
|
|
int len;
|
2007-10-15 17:24:19 +08:00
|
|
|
int meat;
|
2014-08-01 18:29:45 +08:00
|
|
|
__u8 flags;
|
2012-08-27 01:13:55 +08:00
|
|
|
u16 max_size;
|
2013-01-29 07:44:49 +08:00
|
|
|
struct netns_frags *net;
|
2015-07-23 18:05:37 +08:00
|
|
|
struct hlist_node list_evictor;
|
2007-10-15 17:24:19 +08:00
|
|
|
};
|
|
|
|
|
2013-04-25 17:52:25 +08:00
|
|
|
/* Number of buckets in the hash[] array of struct inet_frags. */
#define INETFRAGS_HASHSZ	1024

/* averaged:
 * max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ /
 *	       rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or
 *	       struct frag_queue))
 */
#define INETFRAGS_MAXDEPTH	128
|
2013-03-15 19:32:30 +08:00
|
|
|
|
net: frag queue per hash bucket locking
This patch implements per hash bucket locking for the frag queue
hash. This removes two write locks, and the only remaining write
lock is for protecting hash rebuild. This essentially reduces the
readers-writer lock to a rebuild lock.
This patch is part of "net: frag performance followup"
http://thread.gmane.org/gmane.linux.network/263644
of which two patches have already been accepted:
Same test setup as previous:
(http://thread.gmane.org/gmane.linux.network/257155)
Two 10G interfaces, on separate NUMA nodes, are under test and use
Ethernet flow-control. A third interface is used for generating the
DoS attack (with trafgen).
Notice, I have changed the frag DoS generator script to be more
efficient/deadly. Before, it would only hit one RX queue; now it is
sending packets causing multi-queue RX, due to "better" RX hashing.
Test types summary (netperf UDP_STREAM):
Test-20G64K == 2x10G with 65K fragments
Test-20G3F == 2x10G with 3x fragments (3*1472 bytes)
Test-20G64K+DoS == Same as 20G64K with frag DoS
Test-20G3F+DoS == Same as 20G3F with frag DoS
Test-20G64K+MQ == Same as 20G64K with Multi-Queue frag DoS
Test-20G3F+MQ == Same as 20G3F with Multi-Queue frag DoS
When I rebased this-patch(03) (on top of net-next commit a210576c) and
removed the _bh spinlock, I saw a performance regression. BUT this
was caused by some unrelated change in-between. See tests below.
Test (A) is what I reported before for patch-02, accepted in commit 1b5ab0de.
Test (B) is a verifying retest of commit 1b5ab0de, which corresponds to patch-02.
Test (C) is what I reported before for this-patch
Test (D) is net-next master HEAD (commit a210576c), which reveals some
(unknown) performance regression (compared against test (B)).
Test (D) function as a new base-test.
Performance table summary (in Mbit/s):
(#) Test-type: 20G64K 20G3F 20G64K+DoS 20G3F+DoS 20G64K+MQ 20G3F+MQ
---------- ------- ------- ---------- --------- -------- -------
(A) Patch-02 : 18848.7 13230.1 4103.04 5310.36 130.0 440.2
(B) 1b5ab0de : 18841.5 13156.8 4101.08 5314.57 129.0 424.2
(C) Patch-03v1: 18838.0 13490.5 4405.11 6814.72 196.6 461.6
(D) a210576c : 18321.5 11250.4 3635.34 5160.13 119.1 405.2
(E) with _bh : 17247.3 11492.6 3994.74 6405.29 166.7 413.6
(F) without bh: 17471.3 11298.7 3818.05 6102.11 165.7 406.3
Tests (E) and (F) are this-patch(03), with (V1) and without (V2) the _bh spinlocks.
I cannot explain the slowdown for 20G64K (but it's an artificial
"lab-test" so I'm not worried). But the other results do show
improvements. And test (E), the "with _bh" version, is slightly better.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Eric Dumazet <edumazet@google.com>
----
V2:
- By analysis from Hannes Frederic Sowa and Eric Dumazet, we don't
need the spinlock _bh versions, as Netfilter currently does a
local_bh_disable() before entering inet_fragment.
- Fold-in desc from cover-mail
V3:
- Drop the chain_len counter per hash bucket.
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-04-04 07:38:16 +08:00
|
|
|
struct inet_frag_bucket {
|
|
|
|
struct hlist_head chain;
|
|
|
|
spinlock_t chain_lock;
|
|
|
|
};
|
|
|
|
|
2007-10-15 17:31:52 +08:00
|
|
|
struct inet_frags {
|
net: frag queue per hash bucket locking
This patch implements per hash bucket locking for the frag queue
hash. This removes two write locks, and the only remaining write
lock is for protecting hash rebuild. This essentially reduce the
readers-writer lock to a rebuild lock.
This patch is part of "net: frag performance followup"
http://thread.gmane.org/gmane.linux.network/263644
of which two patches have already been accepted:
Same test setup as previous:
(http://thread.gmane.org/gmane.linux.network/257155)
Two 10G interfaces, on seperate NUMA nodes, are under-test, and uses
Ethernet flow-control. A third interface is used for generating the
DoS attack (with trafgen).
Notice, I have changed the frag DoS generator script to be more
efficient/deadly. Before it would only hit one RX queue, now its
sending packets causing multi-queue RX, due to "better" RX hashing.
Test types summary (netperf UDP_STREAM):
Test-20G64K == 2x10G with 65K fragments
Test-20G3F == 2x10G with 3x fragments (3*1472 bytes)
Test-20G64K+DoS == Same as 20G64K with frag DoS
Test-20G3F+DoS == Same as 20G3F with frag DoS
Test-20G64K+MQ == Same as 20G64K with Multi-Queue frag DoS
Test-20G3F+MQ == Same as 20G3F with Multi-Queue frag DoS
When I rebased this-patch(03) (on top of net-next commit a210576c) and
removed the _bh spinlock, I saw a performance regression. BUT this
was caused by some unrelated change in-between. See tests below.
Test (A) is what I reported before for patch-02, accepted in commit 1b5ab0de.
Test (B) verifying-retest of commit 1b5ab0de corrospond to patch-02.
Test (C) is what I reported before for this-patch
Test (D) is net-next master HEAD (commit a210576c), which reveals some
(unknown) performance regression (compared against test (B)).
Test (D) function as a new base-test.
Performance table summary (in Mbit/s):
(#) Test-type: 20G64K 20G3F 20G64K+DoS 20G3F+DoS 20G64K+MQ 20G3F+MQ
---------- ------- ------- ---------- --------- -------- -------
(A) Patch-02 : 18848.7 13230.1 4103.04 5310.36 130.0 440.2
(B) 1b5ab0de : 18841.5 13156.8 4101.08 5314.57 129.0 424.2
(C) Patch-03v1: 18838.0 13490.5 4405.11 6814.72 196.6 461.6
(D) a210576c : 18321.5 11250.4 3635.34 5160.13 119.1 405.2
(E) with _bh : 17247.3 11492.6 3994.74 6405.29 166.7 413.6
(F) without bh: 17471.3 11298.7 3818.05 6102.11 165.7 406.3
Test (E) and (F) is this-patch(03), with(V1) and without(V2) the _bh spinlocks.
I cannot explain the slow down for 20G64K (but its an artificial
"lab-test" so I'm not worried). But the other results does show
improvements. And test (E) "with _bh" version is slightly better.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Eric Dumazet <edumazet@google.com>
----
V2:
- By analysis from Hannes Frederic Sowa and Eric Dumazet, we don't
need the spinlock _bh versions, as Netfilter currently does a
local_bh_disable() before entering inet_fragment.
- Fold-in desc from cover-mail
V3:
- Drop the chain_len counter per hash bucket.
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-04-04 07:38:16 +08:00
|
|
|
struct inet_frag_bucket hash[INETFRAGS_HASHSZ];
|
2013-10-23 17:06:57 +08:00
|
|
|
|
2014-07-24 22:50:32 +08:00
|
|
|
struct work_struct frags_work;
|
|
|
|
unsigned int next_bucket;
|
2014-07-24 22:50:35 +08:00
|
|
|
unsigned long last_rebuild_jiffies;
|
|
|
|
bool rebuild;
|
2014-07-24 22:50:32 +08:00
|
|
|
|
2013-10-23 17:06:57 +08:00
|
|
|
/* The first call to hashfn is responsible to initialize
|
|
|
|
* rnd. This is best done with net_get_random_once.
|
2014-07-24 22:50:36 +08:00
|
|
|
*
|
|
|
|
* rnd_seqlock is used to let hash insertion detect
|
|
|
|
* when it needs to re-lookup the hash chain to use.
|
2013-10-23 17:06:57 +08:00
|
|
|
*/
|
2013-01-29 07:44:37 +08:00
|
|
|
u32 rnd;
|
2014-07-24 22:50:36 +08:00
|
|
|
seqlock_t rnd_seqlock;
|
2017-05-23 05:20:26 +08:00
|
|
|
unsigned int qsize;
|
2007-10-15 17:38:08 +08:00
|
|
|
|
2014-07-24 22:50:29 +08:00
|
|
|
unsigned int (*hashfn)(const struct inet_frag_queue *);
|
|
|
|
bool (*match)(const struct inet_frag_queue *q,
|
|
|
|
const void *arg);
|
2007-10-18 10:46:47 +08:00
|
|
|
void (*constructor)(struct inet_frag_queue *q,
|
2014-07-24 22:50:29 +08:00
|
|
|
const void *arg);
|
2007-10-15 17:39:14 +08:00
|
|
|
void (*destructor)(struct inet_frag_queue *);
|
2007-10-18 10:45:23 +08:00
|
|
|
void (*frag_expire)(unsigned long data);
|
2014-08-01 18:29:48 +08:00
|
|
|
struct kmem_cache *frags_cachep;
|
|
|
|
const char *frags_cache_name;
|
2007-10-15 17:31:52 +08:00
|
|
|
};
|
|
|
|
|
2014-08-01 18:29:48 +08:00
|
|
|
int inet_frags_init(struct inet_frags *);
|
2007-10-15 17:31:52 +08:00
|
|
|
void inet_frags_fini(struct inet_frags *);
|
|
|
|
|
2017-09-01 17:26:13 +08:00
|
|
|
static inline void inet_frags_init_net(struct netns_frags *nf)
|
2015-11-03 01:03:11 +08:00
|
|
|
{
|
Revert "net: use lib/percpu_counter API for fragmentation mem accounting"
This reverts commit 6d7b857d541ecd1d9bd997c97242d4ef94b19de2.
There is a bug in fragmentation codes use of the percpu_counter API,
that can cause issues on systems with many CPUs.
The frag_mem_limit() just reads the global counter (fbc->count),
without considering other CPUs can have upto batch size (130K) that
haven't been subtracted yet. Due to the 3MBytes lower thresh limit,
this become dangerous at >=24 CPUs (3*1024*1024/130000=24).
The correct API usage would be to use __percpu_counter_compare() which
does the right thing, and takes into account the number of (online)
CPUs and batch size, to account for this and call __percpu_counter_sum()
when needed.
We choose to revert the use of the lib/percpu_counter API for frag
memory accounting for several reasons:
1) On systems with CPUs > 24, the heavier fully locked
__percpu_counter_sum() is always invoked, which will be more
expensive than the atomic_t that is reverted to.
Given systems with more than 24 CPUs are becoming common this doesn't
seem like a good option. To mitigate this, the batch size could be
decreased and thresh be increased.
2) The add_frag_mem_limit+sub_frag_mem_limit pairs happen on the RX
CPU, before SKBs are pushed into sockets on remote CPUs. Given
NICs can only hash on L2 part of the IP-header, the NIC-RXq's will
likely be limited. Thus, a fair chance that atomic add+dec happen
on the same CPU.
Revert note that commit 1d6119baf061 ("net: fix percpu memory leaks")
removed init_frag_mem_limit() and instead use inet_frags_init_net().
After this revert, inet_frags_uninit_net() becomes empty.
Fixes: 6d7b857d541e ("net: use lib/percpu_counter API for fragmentation mem accounting")
Fixes: 1d6119baf061 ("net: fix percpu memory leaks")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-09-01 17:26:08 +08:00
|
|
|
atomic_set(&nf->mem, 0);
|
2015-11-03 01:03:11 +08:00
|
|
|
}
|
2008-01-22 22:12:39 +08:00
|
|
|
void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
|
2008-01-22 22:06:23 +08:00
|
|
|
|
2007-10-15 17:37:18 +08:00
|
|
|
void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
|
2014-07-24 22:50:34 +08:00
|
|
|
void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f);
|
2008-01-22 22:02:14 +08:00
|
|
|
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
|
2014-07-24 22:50:36 +08:00
|
|
|
struct inet_frags *f, void *key, unsigned int hash);
|
|
|
|
|
2013-03-15 19:32:30 +08:00
|
|
|
void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
|
|
|
|
const char *prefix);
|
2007-10-15 17:37:18 +08:00
|
|
|
|
2007-10-15 17:41:56 +08:00
|
|
|
static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f)
|
|
|
|
{
|
2017-06-30 18:08:07 +08:00
|
|
|
if (refcount_dec_and_test(&q->refcnt))
|
2014-07-24 22:50:34 +08:00
|
|
|
inet_frag_destroy(q, f);
|
2007-10-15 17:41:56 +08:00
|
|
|
}
|
|
|
|
|
2015-07-23 18:05:40 +08:00
|
|
|
static inline bool inet_frag_evicting(struct inet_frag_queue *q)
|
|
|
|
{
|
|
|
|
return !hlist_unhashed(&q->list_evictor);
|
|
|
|
}
|
|
|
|
|
2013-01-29 07:45:12 +08:00
|
|
|
/* Memory Tracking Functions. */
|
|
|
|
|
|
|
|
static inline int frag_mem_limit(struct netns_frags *nf)
|
|
|
|
{
|
Revert "net: use lib/percpu_counter API for fragmentation mem accounting"
This reverts commit 6d7b857d541ecd1d9bd997c97242d4ef94b19de2.
There is a bug in fragmentation codes use of the percpu_counter API,
that can cause issues on systems with many CPUs.
The frag_mem_limit() just reads the global counter (fbc->count),
without considering other CPUs can have upto batch size (130K) that
haven't been subtracted yet. Due to the 3MBytes lower thresh limit,
this become dangerous at >=24 CPUs (3*1024*1024/130000=24).
The correct API usage would be to use __percpu_counter_compare() which
does the right thing, and takes into account the number of (online)
CPUs and batch size, to account for this and call __percpu_counter_sum()
when needed.
We choose to revert the use of the lib/percpu_counter API for frag
memory accounting for several reasons:
1) On systems with CPUs > 24, the heavier fully locked
__percpu_counter_sum() is always invoked, which will be more
expensive than the atomic_t that is reverted to.
Given systems with more than 24 CPUs are becoming common this doesn't
seem like a good option. To mitigate this, the batch size could be
decreased and thresh be increased.
2) The add_frag_mem_limit+sub_frag_mem_limit pairs happen on the RX
CPU, before SKBs are pushed into sockets on remote CPUs. Given
NICs can only hash on L2 part of the IP-header, the NIC-RXq's will
likely be limited. Thus, a fair chance that atomic add+dec happen
on the same CPU.
Revert note that commit 1d6119baf061 ("net: fix percpu memory leaks")
removed init_frag_mem_limit() and instead use inet_frags_init_net().
After this revert, inet_frags_uninit_net() becomes empty.
Fixes: 6d7b857d541e ("net: use lib/percpu_counter API for fragmentation mem accounting")
Fixes: 1d6119baf061 ("net: fix percpu memory leaks")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-09-01 17:26:08 +08:00
|
|
|
return atomic_read(&nf->mem);
|
2013-01-29 07:45:12 +08:00
|
|
|
}
|
|
|
|
|
2015-07-23 18:05:38 +08:00
|
|
|
static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
|
2013-01-29 07:45:12 +08:00
|
|
|
{
|
Revert "net: use lib/percpu_counter API for fragmentation mem accounting"
This reverts commit 6d7b857d541ecd1d9bd997c97242d4ef94b19de2.
There is a bug in fragmentation codes use of the percpu_counter API,
that can cause issues on systems with many CPUs.
The frag_mem_limit() just reads the global counter (fbc->count),
without considering other CPUs can have upto batch size (130K) that
haven't been subtracted yet. Due to the 3MBytes lower thresh limit,
this become dangerous at >=24 CPUs (3*1024*1024/130000=24).
The correct API usage would be to use __percpu_counter_compare() which
does the right thing, and takes into account the number of (online)
CPUs and batch size, to account for this and call __percpu_counter_sum()
when needed.
We choose to revert the use of the lib/percpu_counter API for frag
memory accounting for several reasons:
1) On systems with CPUs > 24, the heavier fully locked
__percpu_counter_sum() is always invoked, which will be more
expensive than the atomic_t that is reverted to.
Given systems with more than 24 CPUs are becoming common this doesn't
seem like a good option. To mitigate this, the batch size could be
decreased and thresh be increased.
2) The add_frag_mem_limit+sub_frag_mem_limit pairs happen on the RX
CPU, before SKBs are pushed into sockets on remote CPUs. Given
NICs can only hash on L2 part of the IP-header, the NIC-RXq's will
likely be limited. Thus, a fair chance that atomic add+dec happen
on the same CPU.
Revert note that commit 1d6119baf061 ("net: fix percpu memory leaks")
removed init_frag_mem_limit() and instead use inet_frags_init_net().
After this revert, inet_frags_uninit_net() becomes empty.
Fixes: 6d7b857d541e ("net: use lib/percpu_counter API for fragmentation mem accounting")
Fixes: 1d6119baf061 ("net: fix percpu memory leaks")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-09-01 17:26:08 +08:00
|
|
|
atomic_sub(i, &nf->mem);
|
2013-01-29 07:45:12 +08:00
|
|
|
}
|
|
|
|
|
2015-07-23 18:05:38 +08:00
|
|
|
static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
|
2013-01-29 07:45:12 +08:00
|
|
|
{
|
Revert "net: use lib/percpu_counter API for fragmentation mem accounting"
This reverts commit 6d7b857d541ecd1d9bd997c97242d4ef94b19de2.
There is a bug in fragmentation codes use of the percpu_counter API,
that can cause issues on systems with many CPUs.
The frag_mem_limit() just reads the global counter (fbc->count),
without considering other CPUs can have upto batch size (130K) that
haven't been subtracted yet. Due to the 3MBytes lower thresh limit,
this become dangerous at >=24 CPUs (3*1024*1024/130000=24).
The correct API usage would be to use __percpu_counter_compare() which
does the right thing, and takes into account the number of (online)
CPUs and batch size, to account for this and call __percpu_counter_sum()
when needed.
We choose to revert the use of the lib/percpu_counter API for frag
memory accounting for several reasons:
1) On systems with CPUs > 24, the heavier fully locked
__percpu_counter_sum() is always invoked, which will be more
expensive than the atomic_t that is reverted to.
Given systems with more than 24 CPUs are becoming common this doesn't
seem like a good option. To mitigate this, the batch size could be
decreased and thresh be increased.
2) The add_frag_mem_limit+sub_frag_mem_limit pairs happen on the RX
CPU, before SKBs are pushed into sockets on remote CPUs. Given
NICs can only hash on L2 part of the IP-header, the NIC-RXq's will
likely be limited. Thus, a fair chance that atomic add+dec happen
on the same CPU.
Revert note that commit 1d6119baf061 ("net: fix percpu memory leaks")
removed init_frag_mem_limit() and instead use inet_frags_init_net().
After this revert, inet_frags_uninit_net() becomes empty.
Fixes: 6d7b857d541e ("net: use lib/percpu_counter API for fragmentation mem accounting")
Fixes: 1d6119baf061 ("net: fix percpu memory leaks")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-09-01 17:26:08 +08:00
|
|
|
atomic_add(i, &nf->mem);
|
2013-01-29 07:45:12 +08:00
|
|
|
}
|
|
|
|
|
Revert "net: use lib/percpu_counter API for fragmentation mem accounting"
This reverts commit 6d7b857d541ecd1d9bd997c97242d4ef94b19de2.
There is a bug in the fragmentation code's use of the percpu_counter API
that can cause issues on systems with many CPUs.
frag_mem_limit() just reads the global counter (fbc->count),
without considering that other CPUs can each hold up to a batch size (130K) that
hasn't been subtracted yet. Due to the 3 MByte lower thresh limit,
this becomes dangerous at >=24 CPUs (3*1024*1024/130000=24).
The correct API usage would be to use __percpu_counter_compare() which
does the right thing, and takes into account the number of (online)
CPUs and batch size, to account for this and call __percpu_counter_sum()
when needed.
We choose to revert the use of the lib/percpu_counter API for frag
memory accounting for several reasons:
1) On systems with CPUs > 24, the heavier fully locked
__percpu_counter_sum() is always invoked, which will be more
expensive than the atomic_t that is reverted to.
Given systems with more than 24 CPUs are becoming common this doesn't
seem like a good option. To mitigate this, the batch size could be
decreased and thresh be increased.
2) The add_frag_mem_limit+sub_frag_mem_limit pairs happen on the RX
CPU, before SKBs are pushed into sockets on remote CPUs. Given
NICs can only hash on L2 part of the IP-header, the NIC-RXq's will
likely be limited. Thus, a fair chance that atomic add+dec happen
on the same CPU.
Revert note: commit 1d6119baf061 ("net: fix percpu memory leaks")
removed init_frag_mem_limit() and instead uses inet_frags_init_net().
After this revert, inet_frags_uninit_net() becomes empty.
Fixes: 6d7b857d541e ("net: use lib/percpu_counter API for fragmentation mem accounting")
Fixes: 1d6119baf061 ("net: fix percpu memory leaks")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-09-01 17:26:08 +08:00
|
|
|
static inline int sum_frag_mem_limit(struct netns_frags *nf)
|
2013-01-29 07:45:12 +08:00
|
|
|
{
|
Revert "net: use lib/percpu_counter API for fragmentation mem accounting"
This reverts commit 6d7b857d541ecd1d9bd997c97242d4ef94b19de2.
There is a bug in fragmentation codes use of the percpu_counter API,
that can cause issues on systems with many CPUs.
The frag_mem_limit() just reads the global counter (fbc->count),
without considering other CPUs can have upto batch size (130K) that
haven't been subtracted yet. Due to the 3MBytes lower thresh limit,
this become dangerous at >=24 CPUs (3*1024*1024/130000=24).
The correct API usage would be to use __percpu_counter_compare() which
does the right thing, and takes into account the number of (online)
CPUs and batch size, to account for this and call __percpu_counter_sum()
when needed.
We choose to revert the use of the lib/percpu_counter API for frag
memory accounting for several reasons:
1) On systems with CPUs > 24, the heavier fully locked
__percpu_counter_sum() is always invoked, which will be more
expensive than the atomic_t that is reverted to.
Given systems with more than 24 CPUs are becoming common this doesn't
seem like a good option. To mitigate this, the batch size could be
decreased and thresh be increased.
2) The add_frag_mem_limit+sub_frag_mem_limit pairs happen on the RX
CPU, before SKBs are pushed into sockets on remote CPUs. Given
NICs can only hash on L2 part of the IP-header, the NIC-RXq's will
likely be limited. Thus, a fair chance that atomic add+dec happen
on the same CPU.
Revert note that commit 1d6119baf061 ("net: fix percpu memory leaks")
removed init_frag_mem_limit() and instead use inet_frags_init_net().
After this revert, inet_frags_uninit_net() becomes empty.
Fixes: 6d7b857d541e ("net: use lib/percpu_counter API for fragmentation mem accounting")
Fixes: 1d6119baf061 ("net: fix percpu memory leaks")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-09-01 17:26:08 +08:00
|
|
|
return atomic_read(&nf->mem);
|
2013-01-29 07:45:12 +08:00
|
|
|
}
|
|
|
|
|
2013-03-22 16:24:37 +08:00
|
|
|
/* RFC 3168 support :
 * We want to check ECN values of all fragments, to detect invalid combinations.
 * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
 */
#define	IPFRAG_ECN_NOT_ECT	0x01 /* one frag had ECN_NOT_ECT */
#define	IPFRAG_ECN_ECT_1	0x02 /* one frag had ECN_ECT_1 */
#define	IPFRAG_ECN_ECT_0	0x04 /* one frag had ECN_ECT_0 */
#define	IPFRAG_ECN_CE		0x08 /* one frag had ECN_CE */
|
|
|
|
|
|
|
|
extern const u8 ip_frag_ecn_table[16];
|
|
|
|
|
2007-10-15 17:24:19 +08:00
|
|
|
#endif
|