diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 32786a044718..3f237db0a426 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -1,10 +1,17 @@ #ifndef __NET_FRAG_H__ #define __NET_FRAG_H__ +#include + struct netns_frags { int nqueues; - atomic_t mem; struct list_head lru_list; + spinlock_t lru_lock; + + /* The percpu_counter "mem" need to be cacheline aligned. + * mem.count must not share cacheline with other writers + */ + struct percpu_counter mem ____cacheline_aligned_in_smp; /* sysctls */ int timeout; @@ -13,12 +20,11 @@ struct netns_frags { }; struct inet_frag_queue { - struct hlist_node list; - struct netns_frags *net; - struct list_head lru_list; /* lru list member */ spinlock_t lock; - atomic_t refcnt; struct timer_list timer; /* when will this queue expire? */ + struct list_head lru_list; /* lru list member */ + struct hlist_node list; + atomic_t refcnt; struct sk_buff *fragments; /* list of received fragments */ struct sk_buff *fragments_tail; ktime_t stamp; @@ -31,24 +37,29 @@ struct inet_frag_queue { #define INET_FRAG_LAST_IN 1 u16 max_size; + + struct netns_frags *net; }; #define INETFRAGS_HASHSZ 64 struct inet_frags { struct hlist_head hash[INETFRAGS_HASHSZ]; - rwlock_t lock; - u32 rnd; - int qsize; + /* This rwlock is a global lock (seperate per IPv4, IPv6 and + * netfilter). Important to keep this on a seperate cacheline. + */ + rwlock_t lock ____cacheline_aligned_in_smp; int secret_interval; struct timer_list secret_timer; + u32 rnd; + int qsize; unsigned int (*hashfn)(struct inet_frag_queue *); + bool (*match)(struct inet_frag_queue *q, void *arg); void (*constructor)(struct inet_frag_queue *q, void *arg); void (*destructor)(struct inet_frag_queue *); void (*skb_free)(struct sk_buff *); - bool (*match)(struct inet_frag_queue *q, void *arg); void (*frag_expire)(unsigned long data); }; @@ -72,4 +83,59 @@ static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f inet_frag_destroy(q, f, NULL); } +/* Memory Tracking Functions. */ + +/* The default percpu_counter batch size is not big enough to scale to + * fragmentation mem acct sizes. + * The mem size of a 64K fragment is approx: + * (44 fragments * 2944 truesize) + frag_queue struct(200) = 129736 bytes + */ +static unsigned int frag_percpu_counter_batch = 130000; + +static inline int frag_mem_limit(struct netns_frags *nf) +{ + return percpu_counter_read(&nf->mem); +} + +static inline void sub_frag_mem_limit(struct inet_frag_queue *q, int i) +{ + __percpu_counter_add(&q->net->mem, -i, frag_percpu_counter_batch); +} + +static inline void add_frag_mem_limit(struct inet_frag_queue *q, int i) +{ + __percpu_counter_add(&q->net->mem, i, frag_percpu_counter_batch); +} + +static inline void init_frag_mem_limit(struct netns_frags *nf) +{ + percpu_counter_init(&nf->mem, 0); +} + +static inline int sum_frag_mem_limit(struct netns_frags *nf) +{ + return percpu_counter_sum_positive(&nf->mem); +} + +static inline void inet_frag_lru_move(struct inet_frag_queue *q) +{ + spin_lock(&q->net->lru_lock); + list_move_tail(&q->lru_list, &q->net->lru_list); + spin_unlock(&q->net->lru_lock); +} + +static inline void inet_frag_lru_del(struct inet_frag_queue *q) +{ + spin_lock(&q->net->lru_lock); + list_del(&q->lru_list); + spin_unlock(&q->net->lru_lock); +} + +static inline void inet_frag_lru_add(struct netns_frags *nf, + struct inet_frag_queue *q) +{ + spin_lock(&nf->lru_lock); + list_add_tail(&q->lru_list, &nf->lru_list); + spin_unlock(&nf->lru_lock); +} #endif diff --git a/include/net/ipv6.h b/include/net/ipv6.h index c1878f7049c8..dc30b60975ef 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -288,7 +288,7 @@ static inline int ip6_frag_nqueues(struct net *net) static inline int ip6_frag_mem(struct net *net) { - return atomic_read(&net->ipv6.frags.mem); + return sum_frag_mem_limit(&net->ipv6.frags); } #endif diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 4750d2b74d79..2e453bde6992 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -73,8 +73,9 @@ EXPORT_SYMBOL(inet_frags_init); void inet_frags_init_net(struct netns_frags *nf) { nf->nqueues = 0; - atomic_set(&nf->mem, 0); + init_frag_mem_limit(nf); INIT_LIST_HEAD(&nf->lru_list); + spin_lock_init(&nf->lru_lock); } EXPORT_SYMBOL(inet_frags_init_net); @@ -91,6 +92,8 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) local_bh_disable(); inet_frag_evictor(nf, f, true); local_bh_enable(); + + percpu_counter_destroy(&nf->mem); } EXPORT_SYMBOL(inet_frags_exit_net); @@ -98,9 +101,9 @@ static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) { write_lock(&f->lock); hlist_del(&fq->list); - list_del(&fq->lru_list); fq->net->nqueues--; write_unlock(&f->lock); + inet_frag_lru_del(fq); } void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) @@ -117,12 +120,8 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) EXPORT_SYMBOL(inet_frag_kill); static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f, - struct sk_buff *skb, int *work) + struct sk_buff *skb) { - if (work) - *work -= skb->truesize; - - atomic_sub(skb->truesize, &nf->mem); if (f->skb_free) f->skb_free(skb); kfree_skb(skb); @@ -133,6 +132,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, { struct sk_buff *fp; struct netns_frags *nf; + unsigned int sum, sum_truesize = 0; WARN_ON(!(q->last_in & INET_FRAG_COMPLETE)); WARN_ON(del_timer(&q->timer) != 0); @@ -143,13 +143,14 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, while (fp) { struct sk_buff *xp = fp->next; - frag_kfree_skb(nf, f, fp, work); + sum_truesize += fp->truesize; + frag_kfree_skb(nf, f, fp); fp = xp; } - + sum = sum_truesize + f->qsize; if (work) - *work -= f->qsize; - atomic_sub(f->qsize, &nf->mem); + *work -= sum; + sub_frag_mem_limit(q, sum); if (f->destructor) f->destructor(q); @@ -164,22 +165,23 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force) int work, evicted = 0; if (!force) { - if (atomic_read(&nf->mem) <= nf->high_thresh) + if (frag_mem_limit(nf) <= nf->high_thresh) return 0; } - work = atomic_read(&nf->mem) - nf->low_thresh; + work = frag_mem_limit(nf) - nf->low_thresh; while (work > 0) { - read_lock(&f->lock); + spin_lock(&nf->lru_lock); + if (list_empty(&nf->lru_list)) { - read_unlock(&f->lock); + spin_unlock(&nf->lru_lock); break; } q = list_first_entry(&nf->lru_list, struct inet_frag_queue, lru_list); atomic_inc(&q->refcnt); - read_unlock(&f->lock); + spin_unlock(&nf->lru_lock); spin_lock(&q->lock); if (!(q->last_in & INET_FRAG_COMPLETE)) @@ -233,9 +235,9 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, atomic_inc(&qp->refcnt); hlist_add_head(&qp->list, &f->hash[hash]); - list_add_tail(&qp->lru_list, &nf->lru_list); nf->nqueues++; write_unlock(&f->lock); + inet_frag_lru_add(nf, qp); return qp; } @@ -250,7 +252,8 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, q->net = nf; f->constructor(q, arg); - atomic_add(f->qsize, &nf->mem); + add_frag_mem_limit(q, f->qsize); + setup_timer(&q->timer, f->frag_expire, (unsigned long)q); spin_lock_init(&q->lock); atomic_set(&q->refcnt, 1); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index f55a4e61bfb8..1211613c6c34 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -122,7 +122,7 @@ int ip_frag_nqueues(struct net *net) int ip_frag_mem(struct net *net) { - return atomic_read(&net->ipv4.frags.mem); + return sum_frag_mem_limit(&net->ipv4.frags); } static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, @@ -161,13 +161,6 @@ static bool ip4_frag_match(struct inet_frag_queue *q, void *a) qp->user == arg->user; } -/* Memory Tracking Functions. */ -static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb) -{ - atomic_sub(skb->truesize, &nf->mem); - kfree_skb(skb); -} - static void ip4_frag_init(struct inet_frag_queue *q, void *a) { struct ipq *qp = container_of(q, struct ipq, q); @@ -340,6 +333,7 @@ static inline int ip_frag_too_far(struct ipq *qp) static int ip_frag_reinit(struct ipq *qp) { struct sk_buff *fp; + unsigned int sum_truesize = 0; if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { atomic_inc(&qp->q.refcnt); @@ -349,9 +343,12 @@ static int ip_frag_reinit(struct ipq *qp) fp = qp->q.fragments; do { struct sk_buff *xp = fp->next; - frag_kfree_skb(qp->q.net, fp); + + sum_truesize += fp->truesize; + kfree_skb(fp); fp = xp; } while (fp); + sub_frag_mem_limit(&qp->q, sum_truesize); qp->q.last_in = 0; qp->q.len = 0; @@ -496,7 +493,8 @@ found: qp->q.fragments = next; qp->q.meat -= free_it->len; - frag_kfree_skb(qp->q.net, free_it); + sub_frag_mem_limit(&qp->q, free_it->truesize); + kfree_skb(free_it); } } @@ -519,7 +517,7 @@ found: qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; qp->ecn |= ecn; - atomic_add(skb->truesize, &qp->q.net->mem); + add_frag_mem_limit(&qp->q, skb->truesize); if (offset == 0) qp->q.last_in |= INET_FRAG_FIRST_IN; @@ -531,9 +529,7 @@ found: qp->q.meat == qp->q.len) return ip_frag_reasm(qp, prev, dev); - write_lock(&ip4_frags.lock); - list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list); - write_unlock(&ip4_frags.lock); + inet_frag_lru_move(&qp->q); return -EINPROGRESS; err: @@ -617,7 +613,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, head->len -= clone->len; clone->csum = 0; clone->ip_summed = head->ip_summed; - atomic_add(clone->truesize, &qp->q.net->mem); + add_frag_mem_limit(&qp->q, clone->truesize); } skb_push(head, head->data - skb_network_header(head)); @@ -645,7 +641,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, } fp = next; } - atomic_sub(sum_truesize, &qp->q.net->mem); + sub_frag_mem_limit(&qp->q, sum_truesize); head->next = NULL; head->dev = dev; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 3dacecc99065..c674f158efa8 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -319,7 +319,7 @@ found: fq->q.meat += skb->len; if (payload_len > fq->q.max_size) fq->q.max_size = payload_len; - atomic_add(skb->truesize, &fq->q.net->mem); + add_frag_mem_limit(&fq->q, skb->truesize); /* The first fragment. * nhoffset is obtained from the first fragment, of course. @@ -328,9 +328,8 @@ found: fq->nhoffset = nhoff; fq->q.last_in |= INET_FRAG_FIRST_IN; } - write_lock(&nf_frags.lock); - list_move_tail(&fq->q.lru_list, &fq->q.net->lru_list); - write_unlock(&nf_frags.lock); + + inet_frag_lru_move(&fq->q); return 0; discard_fq: @@ -398,7 +397,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) clone->ip_summed = head->ip_summed; NFCT_FRAG6_CB(clone)->orig = NULL; - atomic_add(clone->truesize, &fq->q.net->mem); + add_frag_mem_limit(&fq->q, clone->truesize); } /* We have to remove fragment header from datagram and to relocate @@ -422,7 +421,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) head->csum = csum_add(head->csum, fp->csum); head->truesize += fp->truesize; } - atomic_sub(head->truesize, &fq->q.net->mem); + sub_frag_mem_limit(&fq->q, head->truesize); head->local_df = 1; head->next = NULL; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index e5253ec9e0fc..bab2c270f292 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -327,7 +327,7 @@ found: } fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; - atomic_add(skb->truesize, &fq->q.net->mem); + add_frag_mem_limit(&fq->q, skb->truesize); /* The first fragment. * nhoffset is obtained from the first fragment, of course. @@ -341,9 +341,7 @@ found: fq->q.meat == fq->q.len) return ip6_frag_reasm(fq, prev, dev); - write_lock(&ip6_frags.lock); - list_move_tail(&fq->q.lru_list, &fq->q.net->lru_list); - write_unlock(&ip6_frags.lock); + inet_frag_lru_move(&fq->q); return -1; discard_fq: @@ -429,7 +427,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, head->len -= clone->len; clone->csum = 0; clone->ip_summed = head->ip_summed; - atomic_add(clone->truesize, &fq->q.net->mem); + add_frag_mem_limit(&fq->q, clone->truesize); } /* We have to remove fragment header from datagram and to relocate @@ -467,7 +465,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, } fp = next; } - atomic_sub(sum_truesize, &fq->q.net->mem); + sub_frag_mem_limit(&fq->q, sum_truesize); head->next = NULL; head->dev = dev;