2005-04-17 06:20:36 +08:00
|
|
|
/* flow.c: Generic flow cache.
|
|
|
|
*
|
|
|
|
* Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru)
|
|
|
|
* Copyright (C) 2003 David S. Miller (davem@redhat.com)
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/list.h>
|
|
|
|
#include <linux/jhash.h>
|
|
|
|
#include <linux/interrupt.h>
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/random.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/smp.h>
|
|
|
|
#include <linux/completion.h>
|
|
|
|
#include <linux/percpu.h>
|
|
|
|
#include <linux/bitops.h>
|
|
|
|
#include <linux/notifier.h>
|
|
|
|
#include <linux/cpu.h>
|
|
|
|
#include <linux/cpumask.h>
|
2006-03-21 14:33:17 +08:00
|
|
|
#include <linux/mutex.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <net/flow.h>
|
2011-07-27 07:09:06 +08:00
|
|
|
#include <linux/atomic.h>
|
[LSM-IPSec]: Security association restriction.
This patch series implements per packet access control via the
extension of the Linux Security Modules (LSM) interface by hooks in
the XFRM and pfkey subsystems that leverage IPSec security
associations to label packets. Extensions to the SELinux LSM are
included that leverage the patch for this purpose.
This patch implements the changes necessary to the XFRM subsystem,
pfkey interface, ipv4/ipv6, and xfrm_user interface to restrict a
socket to use only authorized security associations (or no security
association) to send/receive network packets.
Patch purpose:
The patch is designed to enable access control per packets based on
the strongly authenticated IPSec security association. Such access
controls augment the existing ones based on network interface and IP
address. The former are very coarse-grained, and the latter can be
spoofed. By using IPSec, the system can control access to remote
hosts based on cryptographic keys generated using the IPSec mechanism.
This enables access control on a per-machine basis or per-application
if the remote machine is running the same mechanism and trusted to
enforce the access control policy.
Patch design approach:
The overall approach is that policy (xfrm_policy) entries set by
user-level programs (e.g., setkey for ipsec-tools) are extended with a
security context that is used at policy selection time in the XFRM
subsystem to restrict the sockets that can send/receive packets via
security associations (xfrm_states) that are built from those
policies.
A presentation available at
www.selinux-symposium.org/2005/presentations/session2/2-3-jaeger.pdf
from the SELinux symposium describes the overall approach.
Patch implementation details:
On output, the policy retrieved (via xfrm_policy_lookup or
xfrm_sk_policy_lookup) must be authorized for the security context of
the socket and the same security context is required for resultant
security association (retrieved or negotiated via racoon in
ipsec-tools). This is enforced in xfrm_state_find.
On input, the policy retrieved must also be authorized for the socket
(at __xfrm_policy_check), and the security context of the policy must
also match the security association being used.
The patch has virtually no impact on packets that do not use IPSec.
The existing Netfilter (outgoing) and LSM rcv_skb hooks are used as
before.
Also, if IPSec is used without security contexts, the impact is
minimal. The LSM must allow such policies to be selected for the
combination of socket and remote machine, but subsequent IPSec
processing proceeds as in the original case.
Testing:
The pfkey interface is tested using ipsec-tools. ipsec-tools has
been modified (a separate ipsec-tools patch is available for version
0.5) to support assignment of xfrm_policy entries and security
associations with security contexts via setkey, and negotiation
using the security contexts via racoon.
The xfrm_user interface is tested via ad hoc programs that set
security contexts. These programs are also available from me, and
contain programs for setting, getting, and deleting policy for testing
this interface. Testing of sa functions was done by tracing kernel
behavior.
Signed-off-by: Trent Jaeger <tjaeger@cse.psu.edu>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-12-14 15:12:27 +08:00
|
|
|
#include <linux/security.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
struct flow_cache_entry {
|
2010-04-07 08:30:07 +08:00
|
|
|
union {
|
|
|
|
struct hlist_node hlist;
|
|
|
|
struct list_head gc_list;
|
|
|
|
} u;
|
2011-08-31 14:05:27 +08:00
|
|
|
struct net *net;
|
2010-04-07 08:30:04 +08:00
|
|
|
u16 family;
|
|
|
|
u8 dir;
|
|
|
|
u32 genid;
|
|
|
|
struct flowi key;
|
|
|
|
struct flow_cache_object *object;
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
2010-03-31 08:17:06 +08:00
|
|
|
struct flow_cache_percpu {
|
2010-04-07 08:30:07 +08:00
|
|
|
struct hlist_head *hash_table;
|
2010-03-31 08:17:06 +08:00
|
|
|
int hash_count;
|
|
|
|
u32 hash_rnd;
|
|
|
|
int hash_rnd_recalc;
|
|
|
|
struct tasklet_struct flush_tasklet;
|
2008-02-08 10:03:18 +08:00
|
|
|
};
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
struct flow_flush_info {
|
2010-04-07 08:30:04 +08:00
|
|
|
struct flow_cache *cache;
|
2010-03-31 08:17:06 +08:00
|
|
|
atomic_t cpuleft;
|
|
|
|
struct completion completion;
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
2010-03-31 08:17:06 +08:00
|
|
|
struct flow_cache {
|
|
|
|
u32 hash_shift;
|
2010-09-10 15:00:25 +08:00
|
|
|
struct flow_cache_percpu __percpu *percpu;
|
2010-03-31 08:17:06 +08:00
|
|
|
struct notifier_block hotcpu_notifier;
|
|
|
|
int low_watermark;
|
|
|
|
int high_watermark;
|
|
|
|
struct timer_list rnd_timer;
|
|
|
|
};
|
|
|
|
|
|
|
|
atomic_t flow_cache_genid = ATOMIC_INIT(0);
|
2010-07-10 05:22:04 +08:00
|
|
|
EXPORT_SYMBOL(flow_cache_genid);
|
2010-03-31 08:17:06 +08:00
|
|
|
static struct flow_cache flow_cache_global;
|
2010-09-10 15:00:25 +08:00
|
|
|
static struct kmem_cache *flow_cachep __read_mostly;
|
2010-03-31 08:17:06 +08:00
|
|
|
|
2010-04-07 08:30:07 +08:00
|
|
|
static DEFINE_SPINLOCK(flow_cache_gc_lock);
|
|
|
|
static LIST_HEAD(flow_cache_gc_list);
|
|
|
|
|
2010-03-31 08:17:06 +08:00
|
|
|
#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift)
|
|
|
|
#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
static void flow_cache_new_hashrnd(unsigned long arg)
|
|
|
|
{
|
2010-03-31 08:17:06 +08:00
|
|
|
struct flow_cache *fc = (void *) arg;
|
2005-04-17 06:20:36 +08:00
|
|
|
int i;
|
|
|
|
|
2006-04-11 13:52:50 +08:00
|
|
|
for_each_possible_cpu(i)
|
2010-03-31 08:17:06 +08:00
|
|
|
per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-03-31 08:17:06 +08:00
|
|
|
fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
|
|
|
|
add_timer(&fc->rnd_timer);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2010-04-07 08:30:04 +08:00
|
|
|
static int flow_entry_valid(struct flow_cache_entry *fle)
|
|
|
|
{
|
|
|
|
if (atomic_read(&flow_cache_genid) != fle->genid)
|
|
|
|
return 0;
|
|
|
|
if (fle->object && !fle->object->ops->check(fle->object))
|
|
|
|
return 0;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2010-04-07 08:30:07 +08:00
|
|
|
static void flow_entry_kill(struct flow_cache_entry *fle)
|
IPsec: propagate security module errors up from flow_cache_lookup
When a security module is loaded (in this case, SELinux), the
security_xfrm_policy_lookup() hook can return an access denied permission
(or other error). We were not handling that correctly, and in fact
inverting the return logic and propagating a false "ok" back up to
xfrm_lookup(), which then allowed packets to pass as if they were not
associated with an xfrm policy.
The way I was seeing the problem was when connecting via IPsec to a
confined service on an SELinux box (vsftpd), which did not have the
appropriate SELinux policy permissions to send packets via IPsec.
The first SYNACK would be blocked, because of an uncached lookup via
flow_cache_lookup(), which would fail to resolve an xfrm policy because
the SELinux policy is checked at that point via the resolver.
However, retransmitted SYNACKs would then find a cached flow entry when
calling into flow_cache_lookup() with a null xfrm policy, which is
interpreted by xfrm_lookup() as the packet not having any associated
policy and similarly to the first case, allowing it to pass without
transformation.
The solution presented here is to first ensure that errno values are
correctly propagated all the way back up through the various call chains
from security_xfrm_policy_lookup(), and handled correctly.
Then, flow_cache_lookup() is modified, so that if the policy resolver
fails (typically a permission denied via the security module), the flow
cache entry is killed rather than having a null policy assigned (which
indicates that the packet can pass freely). This also forces any future
lookups for the same flow to consult the security module (e.g. SELinux)
for current security policy (rather than, say, caching the error on the
flow cache entry).
Signed-off-by: James Morris <jmorris@namei.org>
2006-10-06 04:42:27 +08:00
|
|
|
{
|
|
|
|
if (fle->object)
|
2010-04-07 08:30:04 +08:00
|
|
|
fle->object->ops->delete(fle->object);
|
IPsec: propagate security module errors up from flow_cache_lookup
When a security module is loaded (in this case, SELinux), the
security_xfrm_policy_lookup() hook can return an access denied permission
(or other error). We were not handling that correctly, and in fact
inverting the return logic and propagating a false "ok" back up to
xfrm_lookup(), which then allowed packets to pass as if they were not
associated with an xfrm policy.
The way I was seeing the problem was when connecting via IPsec to a
confined service on an SELinux box (vsftpd), which did not have the
appropriate SELinux policy permissions to send packets via IPsec.
The first SYNACK would be blocked, because of an uncached lookup via
flow_cache_lookup(), which would fail to resolve an xfrm policy because
the SELinux policy is checked at that point via the resolver.
However, retransmitted SYNACKs would then find a cached flow entry when
calling into flow_cache_lookup() with a null xfrm policy, which is
interpreted by xfrm_lookup() as the packet not having any associated
policy and similarly to the first case, allowing it to pass without
transformation.
The solution presented here is to first ensure that errno values are
correctly propagated all the way back up through the various call chains
from security_xfrm_policy_lookup(), and handled correctly.
Then, flow_cache_lookup() is modified, so that if the policy resolver
fails (typically a permission denied via the security module), the flow
cache entry is killed rather than having a null policy assigned (which
indicates that the packet can pass freely). This also forces any future
lookups for the same flow to consult the security module (e.g. SELinux)
for current security policy (rather than, say, caching the error on the
flow cache entry).
Signed-off-by: James Morris <jmorris@namei.org>
2006-10-06 04:42:27 +08:00
|
|
|
kmem_cache_free(flow_cachep, fle);
|
2010-04-07 08:30:07 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void flow_cache_gc_task(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct list_head gc_list;
|
|
|
|
struct flow_cache_entry *fce, *n;
|
|
|
|
|
|
|
|
INIT_LIST_HEAD(&gc_list);
|
|
|
|
spin_lock_bh(&flow_cache_gc_lock);
|
|
|
|
list_splice_tail_init(&flow_cache_gc_list, &gc_list);
|
|
|
|
spin_unlock_bh(&flow_cache_gc_lock);
|
|
|
|
|
|
|
|
list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
|
|
|
|
flow_entry_kill(fce);
|
|
|
|
}
|
|
|
|
static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task);
|
|
|
|
|
|
|
|
static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
|
|
|
|
int deleted, struct list_head *gc_list)
|
|
|
|
{
|
|
|
|
if (deleted) {
|
|
|
|
fcp->hash_count -= deleted;
|
|
|
|
spin_lock_bh(&flow_cache_gc_lock);
|
|
|
|
list_splice_tail(gc_list, &flow_cache_gc_list);
|
|
|
|
spin_unlock_bh(&flow_cache_gc_lock);
|
|
|
|
schedule_work(&flow_cache_gc_work);
|
|
|
|
}
|
IPsec: propagate security module errors up from flow_cache_lookup
When a security module is loaded (in this case, SELinux), the
security_xfrm_policy_lookup() hook can return an access denied permission
(or other error). We were not handling that correctly, and in fact
inverting the return logic and propagating a false "ok" back up to
xfrm_lookup(), which then allowed packets to pass as if they were not
associated with an xfrm policy.
The way I was seeing the problem was when connecting via IPsec to a
confined service on an SELinux box (vsftpd), which did not have the
appropriate SELinux policy permissions to send packets via IPsec.
The first SYNACK would be blocked, because of an uncached lookup via
flow_cache_lookup(), which would fail to resolve an xfrm policy because
the SELinux policy is checked at that point via the resolver.
However, retransmitted SYNACKs would then find a cached flow entry when
calling into flow_cache_lookup() with a null xfrm policy, which is
interpreted by xfrm_lookup() as the packet not having any associated
policy and similarly to the first case, allowing it to pass without
transformation.
The solution presented here is to first ensure that errno values are
correctly propagated all the way back up through the various call chains
from security_xfrm_policy_lookup(), and handled correctly.
Then, flow_cache_lookup() is modified, so that if the policy resolver
fails (typically a permission denied via the security module), the flow
cache entry is killed rather than having a null policy assigned (which
indicates that the packet can pass freely). This also forces any future
lookups for the same flow to consult the security module (e.g. SELinux)
for current security policy (rather than, say, caching the error on the
flow cache entry).
Signed-off-by: James Morris <jmorris@namei.org>
2006-10-06 04:42:27 +08:00
|
|
|
}
|
|
|
|
|
2010-03-31 08:17:06 +08:00
|
|
|
static void __flow_cache_shrink(struct flow_cache *fc,
|
|
|
|
struct flow_cache_percpu *fcp,
|
|
|
|
int shrink_to)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2010-04-07 08:30:07 +08:00
|
|
|
struct flow_cache_entry *fle;
|
|
|
|
struct hlist_node *entry, *tmp;
|
|
|
|
LIST_HEAD(gc_list);
|
|
|
|
int i, deleted = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-03-31 08:17:06 +08:00
|
|
|
for (i = 0; i < flow_cache_hash_size(fc); i++) {
|
2010-04-07 08:30:04 +08:00
|
|
|
int saved = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-04-07 08:30:07 +08:00
|
|
|
hlist_for_each_entry_safe(fle, entry, tmp,
|
|
|
|
&fcp->hash_table[i], u.hlist) {
|
2010-04-07 08:30:04 +08:00
|
|
|
if (saved < shrink_to &&
|
|
|
|
flow_entry_valid(fle)) {
|
|
|
|
saved++;
|
|
|
|
} else {
|
2010-04-07 08:30:07 +08:00
|
|
|
deleted++;
|
|
|
|
hlist_del(&fle->u.hlist);
|
|
|
|
list_add_tail(&fle->u.gc_list, &gc_list);
|
2010-04-07 08:30:04 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
}
|
2010-04-07 08:30:07 +08:00
|
|
|
|
|
|
|
flow_cache_queue_garbage(fcp, deleted, &gc_list);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2010-03-31 08:17:06 +08:00
|
|
|
static void flow_cache_shrink(struct flow_cache *fc,
|
|
|
|
struct flow_cache_percpu *fcp)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2010-03-31 08:17:06 +08:00
|
|
|
int shrink_to = fc->low_watermark / flow_cache_hash_size(fc);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-03-31 08:17:06 +08:00
|
|
|
__flow_cache_shrink(fc, fcp, shrink_to);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2010-03-31 08:17:06 +08:00
|
|
|
static void flow_new_hash_rnd(struct flow_cache *fc,
|
|
|
|
struct flow_cache_percpu *fcp)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2010-03-31 08:17:06 +08:00
|
|
|
get_random_bytes(&fcp->hash_rnd, sizeof(u32));
|
|
|
|
fcp->hash_rnd_recalc = 0;
|
|
|
|
__flow_cache_shrink(fc, fcp, 0);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2010-03-31 08:17:06 +08:00
|
|
|
static u32 flow_hash_code(struct flow_cache *fc,
|
|
|
|
struct flow_cache_percpu *fcp,
|
2011-09-06 00:47:24 +08:00
|
|
|
const struct flowi *key,
|
|
|
|
size_t keysize)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2011-02-23 10:44:31 +08:00
|
|
|
const u32 *k = (const u32 *) key;
|
2011-09-06 00:47:24 +08:00
|
|
|
const u32 length = keysize * sizeof(flow_compare_t) / sizeof(u32);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-09-06 00:47:24 +08:00
|
|
|
return jhash2(k, length, fcp->hash_rnd)
|
2010-09-23 04:43:57 +08:00
|
|
|
& (flow_cache_hash_size(fc) - 1);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* I hear what you're saying, use memcmp. But memcmp cannot make
|
2011-09-06 00:47:24 +08:00
|
|
|
* important assumptions that we can here, such as alignment.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2011-09-06 00:47:24 +08:00
|
|
|
static int flow_key_compare(const struct flowi *key1, const struct flowi *key2,
|
|
|
|
size_t keysize)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2011-02-23 10:44:31 +08:00
|
|
|
const flow_compare_t *k1, *k1_lim, *k2;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-02-23 10:44:31 +08:00
|
|
|
k1 = (const flow_compare_t *) key1;
|
2011-09-06 00:47:24 +08:00
|
|
|
k1_lim = k1 + keysize;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-02-23 10:44:31 +08:00
|
|
|
k2 = (const flow_compare_t *) key2;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
do {
|
|
|
|
if (*k1++ != *k2++)
|
|
|
|
return 1;
|
|
|
|
} while (k1 < k1_lim);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2010-04-07 08:30:04 +08:00
|
|
|
struct flow_cache_object *
|
2011-02-23 10:44:31 +08:00
|
|
|
flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
|
2010-04-07 08:30:04 +08:00
|
|
|
flow_resolve_t resolver, void *ctx)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2010-03-31 08:17:06 +08:00
|
|
|
struct flow_cache *fc = &flow_cache_global;
|
|
|
|
struct flow_cache_percpu *fcp;
|
2010-04-07 08:30:07 +08:00
|
|
|
struct flow_cache_entry *fle, *tfle;
|
|
|
|
struct hlist_node *entry;
|
2010-04-07 08:30:04 +08:00
|
|
|
struct flow_cache_object *flo;
|
2011-09-06 00:47:24 +08:00
|
|
|
size_t keysize;
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned int hash;
|
|
|
|
|
|
|
|
local_bh_disable();
|
2010-06-24 08:52:37 +08:00
|
|
|
fcp = this_cpu_ptr(fc->percpu);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
fle = NULL;
|
2010-04-07 08:30:04 +08:00
|
|
|
flo = NULL;
|
2011-09-06 00:47:24 +08:00
|
|
|
|
|
|
|
keysize = flow_key_size(family);
|
|
|
|
if (!keysize)
|
|
|
|
goto nocache;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Packet really early in init? Making flow_cache_init a
|
|
|
|
* pre-smp initcall would solve this. --RR */
|
2010-03-31 08:17:06 +08:00
|
|
|
if (!fcp->hash_table)
|
2005-04-17 06:20:36 +08:00
|
|
|
goto nocache;
|
|
|
|
|
2010-03-31 08:17:06 +08:00
|
|
|
if (fcp->hash_rnd_recalc)
|
|
|
|
flow_new_hash_rnd(fc, fcp);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-09-06 00:47:24 +08:00
|
|
|
hash = flow_hash_code(fc, fcp, key, keysize);
|
2010-04-07 08:30:07 +08:00
|
|
|
hlist_for_each_entry(tfle, entry, &fcp->hash_table[hash], u.hlist) {
|
2011-08-31 14:05:27 +08:00
|
|
|
if (tfle->net == net &&
|
|
|
|
tfle->family == family &&
|
2010-04-07 08:30:07 +08:00
|
|
|
tfle->dir == dir &&
|
2011-09-06 00:47:24 +08:00
|
|
|
flow_key_compare(key, &tfle->key, keysize) == 0) {
|
2010-04-07 08:30:07 +08:00
|
|
|
fle = tfle;
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
2010-04-07 08:30:07 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2010-04-07 08:30:04 +08:00
|
|
|
if (unlikely(!fle)) {
|
2010-03-31 08:17:06 +08:00
|
|
|
if (fcp->hash_count > fc->high_watermark)
|
|
|
|
flow_cache_shrink(fc, fcp);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-12-07 12:33:16 +08:00
|
|
|
fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (fle) {
|
2011-08-31 14:05:27 +08:00
|
|
|
fle->net = net;
|
2005-04-17 06:20:36 +08:00
|
|
|
fle->family = family;
|
|
|
|
fle->dir = dir;
|
2011-09-06 00:47:24 +08:00
|
|
|
memcpy(&fle->key, key, keysize * sizeof(flow_compare_t));
|
2005-04-17 06:20:36 +08:00
|
|
|
fle->object = NULL;
|
2010-04-07 08:30:07 +08:00
|
|
|
hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
|
2010-03-31 08:17:06 +08:00
|
|
|
fcp->hash_count++;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2010-04-07 08:30:04 +08:00
|
|
|
} else if (likely(fle->genid == atomic_read(&flow_cache_genid))) {
|
|
|
|
flo = fle->object;
|
|
|
|
if (!flo)
|
|
|
|
goto ret_object;
|
|
|
|
flo = flo->ops->get(flo);
|
|
|
|
if (flo)
|
|
|
|
goto ret_object;
|
|
|
|
} else if (fle->object) {
|
|
|
|
flo = fle->object;
|
|
|
|
flo->ops->delete(flo);
|
|
|
|
fle->object = NULL;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
nocache:
|
2010-04-07 08:30:04 +08:00
|
|
|
flo = NULL;
|
|
|
|
if (fle) {
|
|
|
|
flo = fle->object;
|
|
|
|
fle->object = NULL;
|
|
|
|
}
|
|
|
|
flo = resolver(net, key, family, dir, flo, ctx);
|
|
|
|
if (fle) {
|
|
|
|
fle->genid = atomic_read(&flow_cache_genid);
|
|
|
|
if (!IS_ERR(flo))
|
|
|
|
fle->object = flo;
|
|
|
|
else
|
|
|
|
fle->genid--;
|
|
|
|
} else {
|
|
|
|
if (flo && !IS_ERR(flo))
|
|
|
|
flo->ops->delete(flo);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2010-04-07 08:30:04 +08:00
|
|
|
ret_object:
|
|
|
|
local_bh_enable();
|
|
|
|
return flo;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2010-07-10 05:22:04 +08:00
|
|
|
EXPORT_SYMBOL(flow_cache_lookup);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
static void flow_cache_flush_tasklet(unsigned long data)
|
|
|
|
{
|
|
|
|
struct flow_flush_info *info = (void *)data;
|
2010-03-31 08:17:06 +08:00
|
|
|
struct flow_cache *fc = info->cache;
|
|
|
|
struct flow_cache_percpu *fcp;
|
2010-04-07 08:30:07 +08:00
|
|
|
struct flow_cache_entry *fle;
|
|
|
|
struct hlist_node *entry, *tmp;
|
|
|
|
LIST_HEAD(gc_list);
|
|
|
|
int i, deleted = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-06-24 08:52:37 +08:00
|
|
|
fcp = this_cpu_ptr(fc->percpu);
|
2010-03-31 08:17:06 +08:00
|
|
|
for (i = 0; i < flow_cache_hash_size(fc); i++) {
|
2010-04-07 08:30:07 +08:00
|
|
|
hlist_for_each_entry_safe(fle, entry, tmp,
|
|
|
|
&fcp->hash_table[i], u.hlist) {
|
2010-04-07 08:30:04 +08:00
|
|
|
if (flow_entry_valid(fle))
|
2005-04-17 06:20:36 +08:00
|
|
|
continue;
|
|
|
|
|
2010-04-07 08:30:07 +08:00
|
|
|
deleted++;
|
|
|
|
hlist_del(&fle->u.hlist);
|
|
|
|
list_add_tail(&fle->u.gc_list, &gc_list);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-04-07 08:30:07 +08:00
|
|
|
flow_cache_queue_garbage(fcp, deleted, &gc_list);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
if (atomic_dec_and_test(&info->cpuleft))
|
|
|
|
complete(&info->completion);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void flow_cache_flush_per_cpu(void *data)
|
|
|
|
{
|
|
|
|
struct flow_flush_info *info = data;
|
|
|
|
int cpu;
|
|
|
|
struct tasklet_struct *tasklet;
|
|
|
|
|
|
|
|
cpu = smp_processor_id();
|
2010-03-31 08:17:06 +08:00
|
|
|
tasklet = &per_cpu_ptr(info->cache->percpu, cpu)->flush_tasklet;
|
2005-04-17 06:20:36 +08:00
|
|
|
tasklet->data = (unsigned long)info;
|
|
|
|
tasklet_schedule(tasklet);
|
|
|
|
}
|
|
|
|
|
|
|
|
void flow_cache_flush(void)
|
|
|
|
{
|
|
|
|
struct flow_flush_info info;
|
2006-03-21 14:33:17 +08:00
|
|
|
static DEFINE_MUTEX(flow_flush_sem);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Don't want cpus going down or up during this. */
|
2008-01-26 04:08:02 +08:00
|
|
|
get_online_cpus();
|
2006-03-21 14:33:17 +08:00
|
|
|
mutex_lock(&flow_flush_sem);
|
2010-03-31 08:17:06 +08:00
|
|
|
info.cache = &flow_cache_global;
|
2005-04-17 06:20:36 +08:00
|
|
|
atomic_set(&info.cpuleft, num_online_cpus());
|
|
|
|
init_completion(&info.completion);
|
|
|
|
|
|
|
|
local_bh_disable();
|
2008-06-06 17:18:06 +08:00
|
|
|
smp_call_function(flow_cache_flush_per_cpu, &info, 0);
|
2005-04-17 06:20:36 +08:00
|
|
|
flow_cache_flush_tasklet((unsigned long)&info);
|
|
|
|
local_bh_enable();
|
|
|
|
|
|
|
|
wait_for_completion(&info.completion);
|
2006-03-21 14:33:17 +08:00
|
|
|
mutex_unlock(&flow_flush_sem);
|
2008-01-26 04:08:02 +08:00
|
|
|
put_online_cpus();
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2011-12-22 05:48:08 +08:00
|
|
|
/* Work handler behind flow_cache_flush_deferred(): run a full flush. */
static void flow_cache_flush_task(struct work_struct *work)
{
	flow_cache_flush();
}

static DECLARE_WORK(flow_cache_flush_work, flow_cache_flush_task);
|
|
|
|
|
|
|
|
void flow_cache_flush_deferred(void)
|
|
|
|
{
|
|
|
|
schedule_work(&flow_cache_flush_work);
|
|
|
|
}
|
|
|
|
|
2010-09-10 15:00:25 +08:00
|
|
|
/*
 * Make sure @cpu has a hash table and an initialized flush tasklet.
 *
 * Returns 0 on success (including when the table already exists) and
 * -ENOMEM when the table cannot be allocated.
 */
static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
{
	struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
	size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc);

	/* Table already allocated: nothing to do. */
	if (fcp->hash_table)
		return 0;

	/* Zeroed buckets, allocated on the memory node of @cpu. */
	fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
	if (!fcp->hash_table) {
		pr_err("NET: failed to allocate flow cache sz %zu\n", sz);
		return -ENOMEM;
	}

	/* Have the first lookup on this CPU pick a hash seed. */
	fcp->hash_rnd_recalc = 1;
	fcp->hash_count = 0;
	tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);

	return 0;
}
|
|
|
|
|
2010-09-10 15:00:25 +08:00
|
|
|
/*
 * CPU hotplug notifier.
 *
 * On CPU_UP_PREPARE(_FROZEN) the CPU's table is prepared; a failure is
 * reported through notifier_from_errno().  On CPU_DEAD(_FROZEN) the
 * CPU's table is emptied.  Every other action is ignored.
 */
static int __cpuinit flow_cache_cpu(struct notifier_block *nfb,
				    unsigned long action,
				    void *hcpu)
{
	struct flow_cache *fc = container_of(nfb, struct flow_cache,
					     hotcpu_notifier);
	int cpu = (unsigned long) hcpu;
	int res;

	if (action == CPU_UP_PREPARE || action == CPU_UP_PREPARE_FROZEN) {
		res = flow_cache_cpu_prepare(fc, cpu);
		if (res)
			return notifier_from_errno(res);
	} else if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
		/* Shrinking to zero queues every entry for destruction. */
		__flow_cache_shrink(fc, per_cpu_ptr(fc->percpu, cpu), 0);
	}

	return NOTIFY_OK;
}
|
|
|
|
|
2010-09-10 15:00:25 +08:00
|
|
|
/*
 * Initialize a flow cache: set the table size and watermarks, allocate
 * the per-cpu state, prepare every online CPU, register the hotplug
 * notifier and start the hash seed timer.
 *
 * Returns 0 on success or -ENOMEM; on failure everything allocated here
 * is released again.
 */
static int __init flow_cache_init(struct flow_cache *fc)
{
	int cpu;

	/* 2^10 buckets per table; watermarks are multiples of that. */
	fc->hash_shift = 10;
	fc->low_watermark = 2 * flow_cache_hash_size(fc);
	fc->high_watermark = 4 * flow_cache_hash_size(fc);

	fc->percpu = alloc_percpu(struct flow_cache_percpu);
	if (!fc->percpu)
		return -ENOMEM;

	for_each_online_cpu(cpu) {
		if (flow_cache_cpu_prepare(fc, cpu))
			goto err;
	}

	/* CPUs brought up later are prepared from the notifier. */
	fc->hotcpu_notifier = (struct notifier_block){
		.notifier_call = flow_cache_cpu,
	};
	register_hotcpu_notifier(&fc->hotcpu_notifier);

	setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
		    (unsigned long) fc);
	fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
	add_timer(&fc->rnd_timer);

	return 0;

err:
	/* Tables that were never allocated are NULL; kfree() accepts that. */
	for_each_possible_cpu(cpu) {
		struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);

		kfree(fcp->hash_table);
		fcp->hash_table = NULL;
	}

	free_percpu(fc->percpu);
	fc->percpu = NULL;

	return -ENOMEM;
}
|
|
|
|
|
2010-03-31 08:17:06 +08:00
|
|
|
/*
 * Module init: create the slab cache for flow cache entries, then
 * initialize the global flow cache.  The slab is created with
 * SLAB_PANIC, so its result is not checked here.
 */
static int __init flow_cache_init_global(void)
{
	flow_cachep = kmem_cache_create("flow_cache",
					sizeof(struct flow_cache_entry),
					0, SLAB_PANIC, NULL);

	return flow_cache_init(&flow_cache_global);
}

module_init(flow_cache_init_global);
|