From 42c8c99cd891184bf4bcf6f09d62c54e42599453 Mon Sep 17 00:00:00 2001
From: Zhao Jin
Date: Sat, 27 Aug 2011 00:26:17 +0800
Subject: [PATCH 1/7] slab, cleanup: remove unneeded return

The function ends right after the if-statement, so the ``return'' is
unneeded; remove it. Also move the last statement, which is common to
both branches, outside the if/else.

Signed-off-by: Zhao Jin
Acked-by: David Rientjes
Acked-by: Christoph Lameter
Signed-off-by: Pekka Enberg
---
 mm/slab.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index f0bd7857ab3b..806a754fad8e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3693,13 +3693,12 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
 
 	if (likely(ac->avail < ac->limit)) {
 		STATS_INC_FREEHIT(cachep);
-		ac->entry[ac->avail++] = objp;
-		return;
 	} else {
 		STATS_INC_FREEMISS(cachep);
 		cache_flusharray(cachep, ac);
-		ac->entry[ac->avail++] = objp;
 	}
+
+	ac->entry[ac->avail++] = objp;
 }
 
 /**

From 0ad9500e16fe24aa55809a2b00e0d2d0e658fc71 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 16 Dec 2011 16:25:34 +0100
Subject: [PATCH 2/7] slub: prefetch next freelist pointer in slab_alloc()

Recycling a page is a problem: its freelist link chain is hot in the
cache(s) of the cpu(s) that freed the objects, and possibly very cold
on the cpu that currently owns the slab.

Adding a prefetch of the cache line containing the pointer to the next
object in slab_alloc() helps a lot in many workloads, in particular on
asymmetric ones (allocations done on one cpu, frees on other cpus).

The added cost is only three machine instructions.

Examples on my dual-socket quad-core HT machine (Intel CPU E5540
@ 2.53GHz, 16 logical cpus, 2 memory nodes), 64bit kernel.

Before patch:

# perf stat -r 32 hackbench 50 process 4000 >/dev/null

 Performance counter stats for 'hackbench 50 process 4000' (32 runs):

     327577,471718  task-clock               #   15,821 CPUs utilized            ( +-  0,64% )
        28 866 491  context-switches         #    0,088 M/sec                    ( +-  1,80% )
         1 506 929  CPU-migrations           #    0,005 M/sec                    ( +-  3,24% )
           127 151  page-faults              #    0,000 M/sec                    ( +-  0,16% )
   829 399 813 448  cycles                   #    2,532 GHz                      ( +-  0,64% )
   580 664 691 740  stalled-cycles-frontend  #   70,01% frontend cycles idle     ( +-  0,71% )
   197 431 700 448  stalled-cycles-backend   #   23,80% backend cycles idle      ( +-  1,03% )
   503 548 648 975  instructions             #    0,61  insns per cycle
                                             #    1,15  stalled cycles per insn  ( +-  0,46% )
    95 780 068 471  branches                 #  292,389 M/sec                    ( +-  0,48% )
     1 426 407 916  branch-misses            #    1,49% of all branches          ( +-  1,35% )

      20,705679994 seconds time elapsed                                          ( +-  0,64% )

After patch:

# perf stat -r 32 hackbench 50 process 4000 >/dev/null

 Performance counter stats for 'hackbench 50 process 4000' (32 runs):

     286236,542804  task-clock               #   15,786 CPUs utilized            ( +-  1,32% )
        19 703 372  context-switches         #    0,069 M/sec                    ( +-  4,99% )
         1 658 249  CPU-migrations           #    0,006 M/sec                    ( +-  6,62% )
           126 776  page-faults              #    0,000 M/sec                    ( +-  0,12% )
   724 636 593 213  cycles                   #    2,532 GHz                      ( +-  1,32% )
   499 320 714 837  stalled-cycles-frontend  #   68,91% frontend cycles idle     ( +-  1,47% )
   156 555 126 809  stalled-cycles-backend   #   21,60% backend cycles idle      ( +-  2,22% )
   463 897 792 661  instructions             #    0,64  insns per cycle
                                             #    1,08  stalled cycles per insn  ( +-  0,94% )
    87 717 352 563  branches                 #  306,451 M/sec                    ( +-  0,99% )
       941 738 280  branch-misses            #    1,07% of all branches          ( +-  3,35% )

      18,132070670 seconds time elapsed                                          ( +-  1,30% )

Signed-off-by: Eric Dumazet
Acked-by: Christoph Lameter
CC: Matt Mackall
CC: David Rientjes
CC: "Alex,Shi"
CC: Shaohua Li
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/mm/slub.c b/mm/slub.c
index 4907563ef7ff..5b915e86a9b0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -269,6 +269,11 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object)
 	return *(void **)(object + s->offset);
 }
 
+static void prefetch_freepointer(const struct kmem_cache *s, void *object)
+{
+	prefetch(object + s->offset);
+}
+
 static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
 {
 	void *p;
@@ -2309,6 +2314,8 @@ redo:
 		object = __slab_alloc(s, gfpflags, node, addr, c);
 
 	else {
+		void *next_object = get_freepointer_safe(s, object);
+
 		/*
 		 * The cmpxchg will only match if there was no additional
 		 * operation and if we are on the right processor.
@@ -2324,11 +2331,12 @@ redo:
 		if (unlikely(!this_cpu_cmpxchg_double(
 				s->cpu_slab->freelist, s->cpu_slab->tid,
 				object, tid,
-				get_freepointer_safe(s, object), next_tid(tid)))) {
+				next_object, next_tid(tid)))) {
 
 			note_cmpxchg_failure("slab_alloc", s, tid);
 			goto redo;
 		}
+		prefetch_freepointer(s, next_object);
 		stat(s, ALLOC_FASTPATH);
 	}
 
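To make the idea behind the prefetch concrete outside the kernel, here is a
minimal, self-contained userspace sketch of the same pattern: while handing
out the current free object, prefetch the cache line that holds the next
object's freelist link, so the miss overlaps with the caller's work on the
current object. The struct layout, names, and pool size are invented for
illustration; only the pattern mirrors the patch, and __builtin_prefetch()
(the GCC/Clang builtin the kernel's prefetch() typically maps to) stands in
for prefetch_freepointer().

/*
 * Userspace sketch of "prefetch the next freelist pointer".
 * Not kernel code; all names and sizes are illustrative.
 */
#include <stddef.h>
#include <stdlib.h>

struct object {
	struct object *free_next;	/* freelist link, like SLUB's pointer at s->offset */
	char payload[56];
};

static struct object *freelist;

static struct object *alloc_object(void)
{
	struct object *obj = freelist;

	if (!obj)
		return NULL;

	freelist = obj->free_next;
	/* Warm the line the *next* allocation will dereference. Prefetching
	 * a NULL pointer is harmless: the builtin never faults. */
	__builtin_prefetch(freelist);
	return obj;
}

static void free_object(struct object *obj)
{
	obj->free_next = freelist;
	freelist = obj;
}

int main(void)
{
	struct object *pool = calloc(1024, sizeof(*pool));
	struct object *o;

	if (!pool)
		return 1;

	for (int i = 0; i < 1024; i++)
		free_object(&pool[i]);

	while ((o = alloc_object()))
		o->payload[0] = 1;	/* touch the object, as a real caller would */

	free(pool);
	return 0;
}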
From 66c4c35c6bc5a1a452b024cf0364635b28fd94e4 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Tue, 17 Jan 2012 09:27:31 -0600
Subject: [PATCH 3/7] slub: Do not hold slub_lock when calling sysfs_slab_add()

sysfs_slab_add() calls various sysfs functions that actually may end up
in userspace doing all sorts of things.

Release the slub_lock after adding the kmem_cache structure to the list.
At that point the address of the kmem_cache is not known, so we are
guaranteed exclusive access to the following modifications to the
kmem_cache structure.

If sysfs_slab_add() fails, reacquire the slub_lock to remove the
kmem_cache structure from the list.

Cc: <stable@vger.kernel.org> # 3.3+
Reported-by: Sasha Levin
Acked-by: Eric Dumazet
Signed-off-by: Christoph Lameter
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/slub.c b/mm/slub.c
index 5b915e86a9b0..bc7a8af24f16 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3937,13 +3937,14 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
 		if (kmem_cache_open(s, n,
 				size, align, flags, ctor)) {
 			list_add(&s->list, &slab_caches);
+			up_write(&slub_lock);
 			if (sysfs_slab_add(s)) {
+				down_write(&slub_lock);
 				list_del(&s->list);
 				kfree(n);
 				kfree(s);
 				goto err;
 			}
-			up_write(&slub_lock);
 			return s;
 		}
 		kfree(n);
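The locking pattern in this patch - publish the structure while holding the
global lock, drop the lock before calling out to code that may sleep or call
back, and retake it only to roll back on failure - can be sketched in plain
userspace C. Everything below (the registry, the names, and the stand-in for
sysfs_slab_add()) is invented for illustration.

/*
 * Sketch of "drop the global lock before calling out, reacquire only
 * to undo on failure". Illustrative only; not kernel code.
 */
#include <pthread.h>
#include <stdio.h>

struct cache {
	const char *name;
	struct cache *next;
};

static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER;
static struct cache *registry;

/* Stand-in for sysfs_slab_add(): may block or re-enter the registry. */
static int publish_to_userspace(struct cache *c)
{
	return (c->name && c->name[0]) ? 0 : -1;
}

static int cache_register(struct cache *c)
{
	pthread_mutex_lock(&registry_lock);
	c->next = registry;
	registry = c;
	pthread_mutex_unlock(&registry_lock);	/* drop before calling out */

	if (publish_to_userspace(c)) {
		/* Failure path: take the lock again just to unlist. */
		pthread_mutex_lock(&registry_lock);
		if (registry == c) {
			registry = c->next;
		} else {
			struct cache *p = registry;

			while (p && p->next != c)
				p = p->next;
			if (p)
				p->next = c->next;
		}
		pthread_mutex_unlock(&registry_lock);
		return -1;
	}
	return 0;
}

int main(void)
{
	struct cache good = { .name = "good-cache" };
	struct cache bad  = { .name = "" };

	printf("good: %d\n", cache_register(&good));	/* 0 */
	printf("bad:  %d\n", cache_register(&bad));	/* -1, rolled back */
	return 0;
}

The same shape is visible in the hunk above: up_write() moves before
sysfs_slab_add(), and down_write() is taken again only on the error path.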
From 4de900b4d6b2216b7443d32e263f5de9078697a3 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Mon, 30 Jan 2012 15:53:51 -0600
Subject: [PATCH 4/7] slub: include <linux/prefetch.h> for prefetch

Otherwise m68k breaks:

On Mon, 30 Jan 2012, Geert Uytterhoeven wrote:
> m68k/allmodconfig at http://kisskb.ellerman.id.au/kisskb/buildresult/5527349/
>
> mm/slub.c:274: error: implicit declaration of function 'prefetch'
>
> Sorry, didn't notice it earlier due to other build breakage in -next.

Reported-by: Geert Uytterhoeven
Acked-by: Geert Uytterhoeven
Signed-off-by: Christoph Lameter
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/slub.c b/mm/slub.c
index bc7a8af24f16..b6666eb3d9c4 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -29,6 +29,7 @@
 #include <linux/math64.h>
 #include <linux/fault-inject.h>
 #include <linux/stacktrace.h>
+#include <linux/prefetch.h>
 
 #include <trace/events/kmem.h>
 

From 8028dcea8abbbd51b5156e40ea214c20b559cd01 Mon Sep 17 00:00:00 2001
From: Alex Shi
Date: Fri, 3 Feb 2012 23:34:56 +0800
Subject: [PATCH 5/7] slub: per cpu partial statistics change

This patch splits cpu_partial_free into two parts: cpu_partial_node,
the number of times the per cpu partial (PCP) list is refilled from the
node partial list, and cpu_partial_free (same name as before), the
number of times it is refilled in the slab_free slow path. A new
statistic, cpu_partial_drain, counts how often the PCP list is drained
to the node partial list. This information is useful when tuning the
per cpu partial lists.

The slabinfo.c code is unchanged, since cpu_partial_node is not on the
slow path.

Signed-off-by: Alex Shi
Acked-by: Christoph Lameter
Signed-off-by: Pekka Enberg
---
 include/linux/slub_def.h |  6 ++++--
 mm/slub.c                | 12 +++++++++---
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index a32bcfdc7834..6388a6681af1 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -21,7 +21,7 @@ enum stat_item {
 	FREE_FROZEN,		/* Freeing to frozen slab */
 	FREE_ADD_PARTIAL,	/* Freeing moves slab to partial list */
 	FREE_REMOVE_PARTIAL,	/* Freeing removes last object */
-	ALLOC_FROM_PARTIAL,	/* Cpu slab acquired from partial list */
+	ALLOC_FROM_PARTIAL,	/* Cpu slab acquired from node partial list */
 	ALLOC_SLAB,		/* Cpu slab acquired from page allocator */
 	ALLOC_REFILL,		/* Refill cpu slab from slab freelist */
 	ALLOC_NODE_MISMATCH,	/* Switching cpu slab */
@@ -37,7 +37,9 @@ enum stat_item {
 	CMPXCHG_DOUBLE_CPU_FAIL,/* Failure of this_cpu_cmpxchg_double */
 	CMPXCHG_DOUBLE_FAIL,	/* Number of times that cmpxchg double did not match */
 	CPU_PARTIAL_ALLOC,	/* Used cpu partial on alloc */
-	CPU_PARTIAL_FREE,	/* USed cpu partial on free */
+	CPU_PARTIAL_FREE,	/* Refill cpu partial on free */
+	CPU_PARTIAL_NODE,	/* Refill cpu partial from node partial */
+	CPU_PARTIAL_DRAIN,	/* Drain cpu partial to node partial */
 	NR_SLUB_STAT_ITEMS };
 
 struct kmem_cache_cpu {
diff --git a/mm/slub.c b/mm/slub.c
index b6666eb3d9c4..24132edcfe33 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1566,6 +1566,7 @@ static void *get_partial_node(struct kmem_cache *s,
 		} else {
 			page->freelist = t;
 			available = put_cpu_partial(s, page, 0);
+			stat(s, CPU_PARTIAL_NODE);
 		}
 		if (kmem_cache_debug(s) || available > s->cpu_partial / 2)
 			break;
@@ -1979,6 +1980,7 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 				local_irq_restore(flags);
 				pobjects = 0;
 				pages = 0;
+				stat(s, CPU_PARTIAL_DRAIN);
 			}
 		}
 
@@ -1990,7 +1992,6 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 		page->next = oldpage;
 
 	} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
-	stat(s, CPU_PARTIAL_FREE);
 	return pobjects;
 }
 
@@ -2474,9 +2475,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 		 * If we just froze the page then put it onto the
 		 * per cpu partial list.
 		 */
-		if (new.frozen && !was_frozen)
+		if (new.frozen && !was_frozen) {
 			put_cpu_partial(s, page, 1);
-
+			stat(s, CPU_PARTIAL_FREE);
+		}
 		/*
 		 * The list lock was not taken therefore no list
 		 * activity can be necessary.
@@ -5069,6 +5071,8 @@ STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
 STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
 STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
 STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
+STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
+STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
 #endif
 
 static struct attribute *slab_attrs[] = {
@@ -5134,6 +5138,8 @@ static struct attribute *slab_attrs[] = {
 	&cmpxchg_double_cpu_fail_attr.attr,
 	&cpu_partial_alloc_attr.attr,
 	&cpu_partial_free_attr.attr,
+	&cpu_partial_node_attr.attr,
+	&cpu_partial_drain_attr.attr,
 #endif
 #ifdef CONFIG_FAILSLAB
 	&failslab_attr.attr,
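For reference, on a kernel built with SLUB statistics enabled
(CONFIG_SLUB_STATS), each STAT_ATTR() entry above shows up as a file under
/sys/kernel/slab/<cache>/. The sketch below simply reads and prints the
counters touched by this patch; the cache name "kmalloc-64" is only an
example, and the files exist only when the statistics are compiled in.

/*
 * Read SLUB per cpu partial statistics from sysfs. Illustrative only;
 * requires CONFIG_SLUB_STATS, and the cache name is an example.
 */
#include <stdio.h>
#include <string.h>

static void show_stat(const char *cache, const char *stat)
{
	char path[256], line[512];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/slab/%s/%s", cache, stat);
	f = fopen(path, "r");
	if (!f) {
		printf("%-18s <not available>\n", stat);
		return;
	}
	if (fgets(line, sizeof(line), f)) {
		line[strcspn(line, "\n")] = '\0';
		/* First field is the total, followed by per-cpu counts. */
		printf("%-18s %s\n", stat, line);
	}
	fclose(f);
}

int main(void)
{
	const char *stats[] = {
		"cpu_partial_alloc", "cpu_partial_free",
		"cpu_partial_node", "cpu_partial_drain",
	};

	for (unsigned int i = 0; i < sizeof(stats) / sizeof(stats[0]); i++)
		show_stat("kmalloc-64", stats[i]);
	return 0;
}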
From a8203725dfded5c1f79dca3368a4a273e24b59bb Mon Sep 17 00:00:00 2001
From: Xi Wang
Date: Mon, 5 Mar 2012 15:14:41 -0800
Subject: [PATCH 6/7] slab: introduce kmalloc_array()

Introduce a kmalloc_array() wrapper that performs integer overflow
checking without zeroing the memory.

Suggested-by: Andrew Morton
Suggested-by: Jens Axboe
Signed-off-by: Xi Wang
Cc: Dan Carpenter
Acked-by: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Pekka Enberg
---
 include/linux/slab.h | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 573c809c33d9..a595dce6b0c7 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -190,7 +190,7 @@ size_t ksize(const void *);
 #endif
 
 /**
- * kcalloc - allocate memory for an array. The memory is set to zero.
+ * kmalloc_array - allocate memory for an array.
  * @n: number of elements.
  * @size: element size.
  * @flags: the type of memory to allocate.
@@ -240,11 +240,22 @@ size_t ksize(const void *);
  * for general use, and so are not documented here. For a full list of
  * potential flags, always refer to linux/gfp.h.
  */
-static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
+static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
 {
 	if (size != 0 && n > ULONG_MAX / size)
 		return NULL;
-	return __kmalloc(n * size, flags | __GFP_ZERO);
+	return __kmalloc(n * size, flags);
+}
+
+/**
+ * kcalloc - allocate memory for an array. The memory is set to zero.
+ * @n: number of elements.
+ * @size: element size.
+ * @flags: the type of memory to allocate (see kmalloc).
+ */
+static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
+{
+	return kmalloc_array(n, size, flags | __GFP_ZERO);
 }
 
 #if !defined(CONFIG_NUMA) && !defined(CONFIG_SLOB)
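The check in kmalloc_array() works because n * size wraps around exactly when
n exceeds ULONG_MAX / size. Below is a userspace analogue of the same guard;
the malloc_array() wrapper name is invented for illustration, while the
kernel version uses ULONG_MAX and __kmalloc() as shown in the patch above.

/*
 * Userspace analogue of the kmalloc_array() overflow guard.
 * malloc_array() is an invented name for illustration.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static void *malloc_array(size_t n, size_t size)
{
	if (size != 0 && n > SIZE_MAX / size)
		return NULL;		/* n * size would overflow */
	return malloc(n * size);
}

int main(void)
{
	/* Fine: 1000 elements of 8 bytes. */
	long *ok = malloc_array(1000, sizeof(*ok));

	/* Caught: the product wraps, so we get NULL instead of a tiny buffer. */
	char *bad = malloc_array(SIZE_MAX / 2 + 1, 4);

	printf("ok=%p bad=%p\n", (void *)ok, (void *)bad);
	free(ok);
	return 0;
}

With this split, kcalloc() becomes kmalloc_array() plus __GFP_ZERO, so both
entry points share a single overflow check.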
From 8bdec192b40cf7f7eec170b317c76089eb5eeddb Mon Sep 17 00:00:00 2001
From: Rafael Aquini
Date: Fri, 9 Mar 2012 17:27:27 -0300
Subject: [PATCH 7/7] mm: SLAB Out-of-memory diagnostics

Following the example at mm/slub.c, add out-of-memory diagnostics to the
SLAB allocator to help debug certain OOM conditions.

An example printout looks like this:

  SLAB: Unable to allocate memory on node 0 (gfp=0x11200)
    cache: bio-0, object size: 192, order: 0
    node 0: slabs: 3/3, objs: 60/60, free: 0

Signed-off-by: Rafael Aquini
Acked-by: Rik van Riel
Acked-by: David Rientjes
Signed-off-by: Pekka Enberg
---
 mm/slab.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 50 insertions(+), 1 deletion(-)

diff --git a/mm/slab.c b/mm/slab.c
index 806a754fad8e..67e0e0589267 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1731,6 +1731,52 @@ static int __init cpucache_init(void)
 }
 __initcall(cpucache_init);
 
+static noinline void
+slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
+{
+	struct kmem_list3 *l3;
+	struct slab *slabp;
+	unsigned long flags;
+	int node;
+
+	printk(KERN_WARNING
+		"SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
+		nodeid, gfpflags);
+	printk(KERN_WARNING "  cache: %s, object size: %d, order: %d\n",
+		cachep->name, cachep->buffer_size, cachep->gfporder);
+
+	for_each_online_node(node) {
+		unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
+		unsigned long active_slabs = 0, num_slabs = 0;
+
+		l3 = cachep->nodelists[node];
+		if (!l3)
+			continue;
+
+		spin_lock_irqsave(&l3->list_lock, flags);
+		list_for_each_entry(slabp, &l3->slabs_full, list) {
+			active_objs += cachep->num;
+			active_slabs++;
+		}
+		list_for_each_entry(slabp, &l3->slabs_partial, list) {
+			active_objs += slabp->inuse;
+			active_slabs++;
+		}
+		list_for_each_entry(slabp, &l3->slabs_free, list)
+			num_slabs++;
+
+		free_objects += l3->free_objects;
+		spin_unlock_irqrestore(&l3->list_lock, flags);
+
+		num_slabs += active_slabs;
+		num_objs = num_slabs * cachep->num;
+		printk(KERN_WARNING
+			"  node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
+			node, active_slabs, num_slabs, active_objs, num_objs,
+			free_objects);
+	}
+}
+
 /*
  * Interface to system's page allocator. No need to hold the cache-lock.
  *
@@ -1757,8 +1803,11 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 		flags |= __GFP_RECLAIMABLE;
 
 	page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
-	if (!page)
+	if (!page) {
+		if (!(flags & __GFP_NOWARN) && printk_ratelimit())
+			slab_out_of_memory(cachep, flags, nodeid);
 		return NULL;
+	}
 
 	nr_pages = (1 << cachep->gfporder);
 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)