Merge branch 'tj-percpu' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc into core/percpu

Conflicts: arch/x86/include/asm/pgtable.h
2009-02-24 21:52:45 +01:00 · 2009-02-24 21:52:45 +01:00 · 0edcf8d692
parent 87b203079e 40150d37be
commit 0edcf8d692
25 changed files with 1720 additions and 198 deletions
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@ -189,9 +189,21 @@ callback_init(void * kernel_end)
 	if (alpha_using_srm) {
 		static struct vm_struct console_remap_vm;
-		unsigned long vaddr = VMALLOC_START;
+		unsigned long nr_pages = 0;
 		unsigned long vaddr;
 		unsigned long i, j;
 		/* calculate needed size */
 		for (i = 0; i < crb->map_entries; ++i)
 			nr_pages += crb->map[i].count;
 		/* register the vm area */
 		console_remap_vm.flags = VM_ALLOC;
 		console_remap_vm.size = nr_pages << PAGE_SHIFT;
 		vm_area_register_early(&console_remap_vm, PAGE_SIZE);
 		vaddr = (unsigned long)console_remap_vm.addr;
 		/* Set up the third level PTEs and update the virtual
 		   addresses of the CRB entries.  */
 		for (i = 0; i < crb->map_entries; ++i) {
@ -213,12 +225,6 @@ callback_init(void * kernel_end)
 				vaddr += PAGE_SIZE;
 			}
 		}
 		/* Let vmalloc know that we've allocated some space.  */
 		console_remap_vm.flags = VM_ALLOC;
 		console_remap_vm.addr = (void *) VMALLOC_START;
 		console_remap_vm.size = vaddr - VMALLOC_START;
 		vmlist = &console_remap_vm;
 	}
 	callback_init_done = 1;
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@ -181,7 +181,7 @@ source "kernel/Kconfig.preempt"
 config QUICKLIST
 	def_bool y
-config HAVE_ARCH_BOOTMEM_NODE
+config HAVE_ARCH_BOOTMEM
 	def_bool n
 config ARCH_HAVE_MEMORY_PRESENT
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@ -135,6 +135,9 @@ config ARCH_HAS_CACHE_LINE_SIZE
 config HAVE_SETUP_PER_CPU_AREA
 	def_bool y
 config HAVE_DYNAMIC_PER_CPU_AREA
 	def_bool y
 config HAVE_CPUMASK_OF_CPU_MAP
 	def_bool X86_64_SMP
@ -1122,7 +1125,7 @@ config NODES_SHIFT
 	  Specify the maximum number of NUMA Nodes available on the target
 	  system.  Increases memory reserved to accomodate various tables.
-config HAVE_ARCH_BOOTMEM_NODE
+config HAVE_ARCH_BOOTMEM
 	def_bool y
 	depends on X86_32 && NUMA
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@ -91,45 +91,12 @@ static inline int pfn_valid(int pfn)
 #endif /* CONFIG_DISCONTIGMEM */
 #ifdef CONFIG_NEED_MULTIPLE_NODES
-
+/* always use node 0 for bootmem on this numa platform */
-/*
+#define alloc_bootmem_core(__bdata, size, align, goal, limit)		\
 * Following are macros that are specific to this numa platform.
 */
 #define reserve_bootmem(addr, size, flags) \
 	reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags))
 #define alloc_bootmem(x) \
 	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_nopanic(x) \
 	__alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
 				__pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low(x) \
 	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0)
 #define alloc_bootmem_pages(x) \
 	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_pages_nopanic(x) \
 	__alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \
 				__pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low_pages(x) \
 	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
 #define alloc_bootmem_node(pgdat, x)					\
 ({									\
-	struct pglist_data  __maybe_unused			\
+	bootmem_data_t __maybe_unused *	__abm_bdata_dummy = (__bdata);	\
-				*__alloc_bootmem_node__pgdat = (pgdat);	\
+	__alloc_bootmem_core(NODE_DATA(0)->bdata,			\
-	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES,	\
+			     (size), (align), (goal), (limit));		\
 						__pa(MAX_DMA_ADDRESS));	\
 })
 #define alloc_bootmem_pages_node(pgdat, x)				\
 ({									\
 	struct pglist_data  __maybe_unused			\
 				*__alloc_bootmem_node__pgdat = (pgdat);	\
 	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE,		\
 						__pa(MAX_DMA_ADDRESS));	\
 })
 #define alloc_bootmem_low_pages_node(pgdat, x)				\
 ({									\
 	struct pglist_data  __maybe_unused			\
 				*__alloc_bootmem_node__pgdat = (pgdat);	\
 	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0);		\
 })
 #endif /* CONFIG_NEED_MULTIPLE_NODES */
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@ -43,6 +43,14 @@
 #else /* ...!ASSEMBLY */
 #include <linux/stringify.h>
 #include <asm/sections.h>
 #define __addr_to_pcpu_ptr(addr)					\
 	(void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr	\
 		 + (unsigned long)__per_cpu_start)
 #define __pcpu_ptr_to_addr(ptr)						\
 	(void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr	\
 		 - (unsigned long)__per_cpu_start)
 #ifdef CONFIG_SMP
 #define __percpu_arg(x)		"%%"__stringify(__percpu_seg)":%P" #x
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@ -288,6 +288,8 @@ static inline int is_new_memtype_allowed(unsigned long flags,
 	return 1;
 }
 pmd_t *populate_extra_pmd(unsigned long vaddr);
 pte_t *populate_extra_pte(unsigned long vaddr);
 #endif	/* __ASSEMBLY__ */
 #ifdef CONFIG_X86_32
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	if (!data)
 		return -ENOMEM;
-	data->acpi_data = percpu_ptr(acpi_perf_data, cpu);
+	data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
 	per_cpu(drv_data, cpu) = data;
 	if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@ -16,6 +16,7 @@
 #include <linux/cpu.h>
 #include <linux/delay.h>
 #include <linux/uaccess.h>
 #include <linux/percpu.h>
 #include <asm/apic.h>
@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { }
 union irq_ctx {
 	struct thread_info      tinfo;
 	u32                     stack[THREAD_SIZE/sizeof(u32)];
-};
+} __attribute__((aligned(PAGE_SIZE)));
-static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
+static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
-static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
+static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
-static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
+static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
-static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
+static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
 static void call_on_stack(void *func, void *stack)
 {
@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
 	u32 *isp, arg1, arg2;
 	curctx = (union irq_ctx *) current_thread_info();
-	irqctx = hardirq_ctx[smp_processor_id()];
+	irqctx = __get_cpu_var(hardirq_ctx);
 	/*
 	 * this is where we switch to the IRQ stack. However, if we are
@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu)
 {
 	union irq_ctx *irqctx;
-	if (hardirq_ctx[cpu])
+	if (per_cpu(hardirq_ctx, cpu))
 		return;
-	irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
+	irqctx = &per_cpu(hardirq_stack, cpu);
 	irqctx->tinfo.task		= NULL;
 	irqctx->tinfo.exec_domain	= NULL;
 	irqctx->tinfo.cpu		= cpu;
 	irqctx->tinfo.preempt_count	= HARDIRQ_OFFSET;
 	irqctx->tinfo.addr_limit	= MAKE_MM_SEG(0);
-	hardirq_ctx[cpu] = irqctx;
+	per_cpu(hardirq_ctx, cpu) = irqctx;
-	irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE];
+	irqctx = &per_cpu(softirq_stack, cpu);
 	irqctx->tinfo.task		= NULL;
 	irqctx->tinfo.exec_domain	= NULL;
 	irqctx->tinfo.cpu		= cpu;
 	irqctx->tinfo.preempt_count	= 0;
 	irqctx->tinfo.addr_limit	= MAKE_MM_SEG(0);
-	softirq_ctx[cpu] = irqctx;
+	per_cpu(softirq_ctx, cpu) = irqctx;
 	printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
-	       cpu, hardirq_ctx[cpu], softirq_ctx[cpu]);
+	       cpu, per_cpu(hardirq_ctx, cpu),  per_cpu(softirq_ctx, cpu));
 }
 void irq_ctx_exit(int cpu)
 {
-	hardirq_ctx[cpu] = NULL;
+	per_cpu(hardirq_ctx, cpu) = NULL;
 }
 asmlinkage void do_softirq(void)
@ -169,7 +170,7 @@ asmlinkage void do_softirq(void)
 	if (local_softirq_pending()) {
 		curctx = current_thread_info();
-		irqctx = softirq_ctx[smp_processor_id()];
+		irqctx = __get_cpu_var(softirq_ctx);
 		irqctx->tinfo.task = curctx->task;
 		irqctx->tinfo.previous_esp = current_stack_pointer;
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@ -7,6 +7,7 @@
 #include <linux/crash_dump.h>
 #include <linux/smp.h>
 #include <linux/topology.h>
 #include <linux/pfn.h>
 #include <asm/sections.h>
 #include <asm/processor.h>
 #include <asm/setup.h>
@ -41,6 +42,321 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
 };
 EXPORT_SYMBOL(__per_cpu_offset);
 /**
 * pcpu_need_numa - determine whether percpu allocation needs to consider NUMA
 *
 * If NUMA is not configured or there is only one NUMA node available,
 * there is no reason to consider NUMA.  This function determines
 * whether percpu allocation should consider NUMA or not.
 *
 * RETURNS:
 * true if NUMA should be considered; otherwise, false.
 */
 static bool __init pcpu_need_numa(void)
 {
 #ifdef CONFIG_NEED_MULTIPLE_NODES
 	pg_data_t *prev = NULL;
 	unsigned int cpu;
 	/*
 	 * Walk the possible cpus and compare each cpu's node data with
 	 * the one seen on the previous iteration; two different usable
 	 * nodes mean NUMA has to be taken into account.
 	 */
 	for_each_possible_cpu(cpu) {
 		int nid = early_cpu_to_node(cpu);
 		pg_data_t *cur = NODE_DATA(nid);
 		if (prev && node_online(nid) && cur && cur != prev)
 			return true;
 		prev = cur;
 	}
 #endif
 	/* single node in use, or NUMA support not built in */
 	return false;
 }
 /**
 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
 * @cpu: cpu to allocate for
 * @size: size of allocation in bytes
 * @align: alignment
 *
 * Allocate @size bytes aligned at @align for cpu @cpu.  This wrapper
 * does the right thing for NUMA regardless of the current
 * configuration.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
 static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
 					unsigned long align)
 {
 	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
 #ifdef CONFIG_NEED_MULTIPLE_NODES
 	int nid = early_cpu_to_node(cpu);
 	void *area;
 	/* preferred case: the cpu's node is online and has node data */
 	if (node_online(nid) && NODE_DATA(nid)) {
 		area = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
 						    size, align, goal);
 		pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
 			 "%016lx\n", cpu, size, nid, __pa(area));
 		return area;
 	}
 	/* no usable node for this cpu, take memory from anywhere */
 	area = __alloc_bootmem_nopanic(size, align, goal);
 	pr_info("cpu %d has no node %d or node-local memory\n",
 		cpu, nid);
 	pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
 		 cpu, size, __pa(area));
 	return area;
 #else
 	return __alloc_bootmem_nopanic(size, align, goal);
 #endif
 }
 /*
 * Remap allocator
 *
 * This allocator uses PMD page as unit.  A PMD page is allocated for
 * each cpu and each is remapped into vmalloc area using PMD mapping.
 * As PMD page is quite large, only part of it is used for the first
 * chunk.  Unused part is returned to the bootmem allocator.
 *
 * So, the PMD pages are mapped twice - once to the physical mapping
 * and once to the vmalloc area for the first percpu chunk.  The double
 * mapping does add one more PMD TLB entry pressure but still is much
 * better than only using 4k mappings while still being NUMA friendly.
 */
 #ifdef CONFIG_NEED_MULTIPLE_NODES
 static size_t pcpur_size __initdata;
 static void **pcpur_ptrs __initdata;
 static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
 {
 	size_t byte_off = (size_t)pageno << PAGE_SHIFT;
 	/* offsets at or beyond pcpur_size have no page to report */
 	return byte_off < pcpur_size ?
 		virt_to_page(pcpur_ptrs[cpu] + byte_off) : NULL;
 }
 /*
  * setup_pcpu_remap - set up the first percpu chunk with the remap allocator
  * @static_size: size of the static percpu area in bytes
  *
  * Allocates one PMD_SIZE bootmem area per possible cpu, copies the
  * static percpu image into each of them, gives the part beyond
  * pcpur_size back to bootmem, and maps the areas back-to-back into an
  * early-registered vm area with large-page PMD entries before calling
  * pcpu_setup_first_chunk().
  *
  * RETURNS:
  * The value returned by pcpu_setup_first_chunk() once the areas are
  * mapped, -EINVAL if this allocator is not applicable, -ENOMEM if a
  * per-cpu area could not be allocated.
  */
 static ssize_t __init setup_pcpu_remap(size_t static_size)
 {
 	static struct vm_struct vm;
 	size_t ptrs_size;
 	unsigned int cpu;
 	ssize_t ret;
 	/*
 	 * If large page isn't supported, there's no benefit in doing
 	 * this.  Also, on non-NUMA, embedding is better.
 	 *
 	 * pcpu_need_numa() already scans the possible cpus for more
 	 * than one node in use, so this allocator must bail out when
 	 * it reports that NUMA does NOT need to be considered.  (The
 	 * test used to be inverted, which made every path through
 	 * here return -EINVAL.)
 	 */
 	if (!cpu_has_pse || !pcpu_need_numa())
 		return -EINVAL;
 	/*
 	 * Currently supports only single page.  Supporting multiple
 	 * pages won't be too difficult if it ever becomes necessary.
 	 */
 	pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
 	if (pcpur_size > PMD_SIZE) {
 		pr_warning("PERCPU: static data is larger than large page, "
 			   "can't use large page\n");
 		return -EINVAL;
 	}
 	/*
 	 * allocate pointer array and alloc large pages
 	 *
 	 * NOTE(review): the enomem path below treats a NULL slot as
 	 * "not allocated yet", i.e. it assumes alloc_bootmem() hands
 	 * back zeroed memory - confirm.
 	 */
 	ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
 	pcpur_ptrs = alloc_bootmem(ptrs_size);
 	for_each_possible_cpu(cpu) {
 		pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
 		if (!pcpur_ptrs[cpu])
 			goto enomem;
 		/*
 		 * Only use pcpur_size bytes and give back the rest.
 		 *
 		 * Ingo: The 2MB up-rounding bootmem is needed to make
 		 * sure the partial 2MB page is still fully RAM - it's
 		 * not well-specified to have a PAT-incompatible area
 		 * (unmapped RAM, device memory, etc.) in that hole.
 		 */
 		free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
 			     PMD_SIZE - pcpur_size);
 		memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
 	}
 	/* allocate address and map */
 	vm.flags = VM_ALLOC;
 	vm.size = num_possible_cpus() * PMD_SIZE;
 	vm_area_register_early(&vm, PMD_SIZE);
 	for_each_possible_cpu(cpu) {
 		pmd_t *pmd;
 		/* one large-page PMD slot per cpu, PMD_SIZE apart */
 		pmd = populate_extra_pmd((unsigned long)vm.addr
 					 + cpu * PMD_SIZE);
 		set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
 				     PAGE_KERNEL_LARGE));
 	}
 	/* we're ready, commit */
 	pr_info("PERCPU: Remapped at %p with large pages, static data "
 		"%zu bytes\n", vm.addr, static_size);
 	ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE,
 				     pcpur_size - static_size, vm.addr, NULL);
 	goto out_free_ar;
 enomem:
 	/*
 	 * Every area recorded in pcpur_ptrs[] already had its tail
 	 * returned to bootmem in the allocation loop, so only the
 	 * leading pcpur_size bytes are still held; freeing PMD_SIZE
 	 * here would free the tail a second time.
 	 */
 	for_each_possible_cpu(cpu)
 		if (pcpur_ptrs[cpu])
 			free_bootmem(__pa(pcpur_ptrs[cpu]), pcpur_size);
 	ret = -ENOMEM;
 out_free_ar:
 	/* the pointer array is only needed during setup */
 	free_bootmem(__pa(pcpur_ptrs), ptrs_size);
 	return ret;
 }
 #else
 /*
  * Stub used when CONFIG_NEED_MULTIPLE_NODES is not set: the remap
  * allocator is not built, so always report it as not applicable.
  */
 static ssize_t __init setup_pcpu_remap(size_t static_size)
 {
 	return -EINVAL;
 }
 #endif
 /*
 * Embedding allocator
 *
 * The first chunk is sized to just contain the static area plus
 * PERCPU_DYNAMIC_RESERVE and allocated as a contiguous area using
 * bootmem allocator and used as-is without being mapped into vmalloc
 * area.  This enables the first chunk to piggy back on the linear
 * physical PMD mapping and doesn't add any additional pressure to
 * TLB.
 */
 static void *pcpue_ptr __initdata;
 static size_t pcpue_unit_size __initdata;
 static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
 {
 	/* start of this cpu's unit inside the contiguous embedded area */
 	void *unit_base = pcpue_ptr + cpu * pcpue_unit_size;
 	size_t page_off = (size_t)pageno << PAGE_SHIFT;
 	return virt_to_page(unit_base + page_off);
 }
 /*
  * setup_pcpu_embed - set up the first percpu chunk with the embedding allocator
  * @static_size: size of the static percpu area in bytes
  *
  * Allocates a single bootmem area holding one unit per possible cpu,
  * copies the static percpu image into every unit and passes the area
  * to pcpu_setup_first_chunk() as the base address.
  *
  * RETURNS:
  * The value returned by pcpu_setup_first_chunk() once the area is
  * populated, -EINVAL if this allocator is not applicable, -ENOMEM if
  * the area could not be allocated.
  */
 static ssize_t __init setup_pcpu_embed(size_t static_size)
 {
 	unsigned int cpu;
 	/*
 	 * If large page isn't supported, there's no benefit in doing
 	 * this.  Also, embedding allocation doesn't play well with
 	 * NUMA.
 	 */
 	if (!cpu_has_pse || pcpu_need_numa())
 		return -EINVAL;
 	/* allocate and copy */
 	pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
 	/*
 	 * PCPU_MIN_UNIT_SIZE is an unsigned long expression while
 	 * pcpue_unit_size is size_t; compare explicitly as size_t so
 	 * the type-checking max() does not trip where the two types
 	 * are distinct.
 	 */
 	pcpue_unit_size = max_t(size_t, pcpue_unit_size, PCPU_MIN_UNIT_SIZE);
 	pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size,
 				       PAGE_SIZE);
 	if (!pcpue_ptr)
 		return -ENOMEM;
 	/* every cpu gets its own copy of the static image */
 	for_each_possible_cpu(cpu)
 		memcpy(pcpue_ptr + cpu * pcpue_unit_size, __per_cpu_load,
 		       static_size);
 	/* we're ready, commit */
 	pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
 		pcpue_unit_size >> PAGE_SHIFT, pcpue_ptr, static_size);
 	return pcpu_setup_first_chunk(pcpue_get_page, static_size,
 				      pcpue_unit_size,
 				      pcpue_unit_size - static_size, pcpue_ptr,
 				      NULL);
 }
 /*
 * 4k page allocator
 *
 * This is the basic allocator.  Static percpu area is allocated
 * page-by-page and most of initialization is done by the generic
 * setup function.
 */
 static struct page **pcpu4k_pages __initdata;
 static int pcpu4k_nr_static_pages __initdata;
 static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
 {
 	/* only the static pages are recorded in pcpu4k_pages[] */
 	if (pageno >= pcpu4k_nr_static_pages)
 		return NULL;
 	/* the array is laid out cpu-major, one run of static pages per cpu */
 	return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
 }
 /*
  * Adapter with the void-returning signature of pcpu_populate_pte_fn_t;
  * it calls populate_extra_pte() for @addr and discards the returned
  * pte pointer.
  */
 static void __init pcpu4k_populate_pte(unsigned long addr)
 {
 	populate_extra_pte(addr);
 }
 /*
  * setup_pcpu_4k - set up the first percpu chunk page by page
  * @static_size: size of the static percpu area in bytes
  *
  * Allocates PFN_UP(@static_size) individual pages for every possible
  * cpu, fills them from the static percpu image and records them in
  * pcpu4k_pages[] for pcpu4k_get_page() before calling
  * pcpu_setup_first_chunk().
  *
  * RETURNS:
  * The value returned by pcpu_setup_first_chunk() once all pages are
  * allocated, -ENOMEM if a page allocation failed.
  */
 static ssize_t __init setup_pcpu_4k(size_t static_size)
 {
 	size_t pages_size;
 	unsigned int cpu;
 	int i, j;
 	ssize_t ret;
 	pcpu4k_nr_static_pages = PFN_UP(static_size);
 	/* unaligned allocations can't be freed, round up to page size */
 	pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
 			       * sizeof(pcpu4k_pages[0]));
 	pcpu4k_pages = alloc_bootmem(pages_size);
 	/* allocate and copy */
 	/* j counts the pages recorded so far; it also drives the unwind below */
 	j = 0;
 	for_each_possible_cpu(cpu)
 		for (i = 0; i < pcpu4k_nr_static_pages; i++) {
 			void *ptr;
 			ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
 			if (!ptr)
 				goto enomem;
 			/*
 			 * NOTE(review): a full page is copied even for the
 			 * last page, which may extend past static_size -
 			 * confirm the load image is readable that far.
 			 */
 			memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
 			pcpu4k_pages[j++] = virt_to_page(ptr);
 		}
 	/* we're ready, commit */
 	pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
 		pcpu4k_nr_static_pages, static_size);
 	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL,
 				     pcpu4k_populate_pte);
 	goto out_free_ar;
 enomem:
 	/* give back every page recorded before the failing allocation */
 	while (--j >= 0)
 		free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
 	ret = -ENOMEM;
 out_free_ar:
 	/* the page pointer array is released on both paths */
 	free_bootmem(__pa(pcpu4k_pages), pages_size);
 	return ret;
 }
 static inline void setup_percpu_segment(int cpu)
 {
 #ifdef CONFIG_X86_32
@ -61,38 +377,35 @@ static inline void setup_percpu_segment(int cpu)
 */
 void __init setup_per_cpu_areas(void)
 {
-	ssize_t size;
+	size_t static_size = __per_cpu_end - __per_cpu_start;
-	char *ptr;
+	unsigned int cpu;
-	int cpu;
+	unsigned long delta;
-
+	size_t pcpu_unit_size;
-	/* Copy section for each CPU (we discard the original) */
+	ssize_t ret;
 	size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
 	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
 		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
-	pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size);
+	/*
 	 * Allocate percpu area.  If PSE is supported, try to make use
 	 * of large page mappings.  Please read comments on top of
 	 * each allocator for details.
 	 */
 	ret = setup_pcpu_remap(static_size);
 	if (ret < 0)
 		ret = setup_pcpu_embed(static_size);
 	if (ret < 0)
 		ret = setup_pcpu_4k(static_size);
 	if (ret < 0)
 		panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
 		      static_size, ret);
 	pcpu_unit_size = ret;
 	/* alrighty, percpu areas up and running */
 	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
 	for_each_possible_cpu(cpu) {
-#ifndef CONFIG_NEED_MULTIPLE_NODES
+		per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
 		ptr = alloc_bootmem_pages(size);
 #else
 		int node = early_cpu_to_node(cpu);
 		if (!node_online(node) || !NODE_DATA(node)) {
 			ptr = alloc_bootmem_pages(size);
 			pr_info("cpu %d has no node %d or node-local memory\n",
 				cpu, node);
 			pr_debug("per cpu data for cpu%d at %016lx\n",
 				 cpu, __pa(ptr));
 		} else {
 			ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
 			pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
 				cpu, node, __pa(ptr));
 		}
 #endif
 		memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
 		per_cpu_offset(cpu) = ptr - __per_cpu_start;
 		per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
 		per_cpu(cpu_number, cpu) = cpu;
 		setup_percpu_segment(cpu);
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@ -137,6 +137,23 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 	return pte_offset_kernel(pmd, 0);
 }
 pmd_t * __init populate_extra_pmd(unsigned long vaddr)
 {
 	/* pmd table behind the swapper_pg_dir entry that covers vaddr */
 	pmd_t *pmd_table = one_md_table_init(&swapper_pg_dir[pgd_index(vaddr)]);
 	return &pmd_table[pmd_index(vaddr)];
 }
 pte_t * __init populate_extra_pte(unsigned long vaddr)
 {
 	/* resolve the pmd first, then index into the pte table behind it */
 	pte_t *pte_table = one_page_table_init(populate_extra_pmd(vaddr));
 	return &pte_table[pte_index(vaddr)];
 }
 static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
 					   unsigned long vaddr, pte_t *lastpte)
 {
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@ -168,34 +168,51 @@ static __ref void *spp_getpage(void)
 	return ptr;
 }
-void
+static pud_t * __init fill_pud(pgd_t *pgd, unsigned long vaddr)
-set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+{
 	if (pgd_none(*pgd)) {
 		pud_t *pud = (pud_t *)spp_getpage();
 		pgd_populate(&init_mm, pgd, pud);
 		if (pud != pud_offset(pgd, 0))
 			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
 			       pud, pud_offset(pgd, 0));
 	}
 	return pud_offset(pgd, vaddr);
 }
 static pmd_t * __init fill_pmd(pud_t *pud, unsigned long vaddr)
 {
 	pmd_t *new_table;
 	/* nothing to populate when the pud entry is already set */
 	if (!pud_none(*pud))
 		return pmd_offset(pud, vaddr);
 	new_table = (pmd_t *) spp_getpage();
 	pud_populate(&init_mm, pud, new_table);
 	/* report (but do not abort) if the entry does not point at the new page */
 	if (pmd_offset(pud, 0) != new_table)
 		printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
 		       new_table, pmd_offset(pud, 0));
 	return pmd_offset(pud, vaddr);
 }
 static pte_t * __init fill_pte(pmd_t *pmd, unsigned long vaddr)
 {
 	pte_t *new_table;
 	/* nothing to populate when the pmd entry is already set */
 	if (!pmd_none(*pmd))
 		return pte_offset_kernel(pmd, vaddr);
 	new_table = (pte_t *) spp_getpage();
 	pmd_populate_kernel(&init_mm, pmd, new_table);
 	/* report (but do not abort) if the entry does not point at the new page */
 	if (pte_offset_kernel(pmd, 0) != new_table)
 		printk(KERN_ERR "PAGETABLE BUG #02!\n");
 	return pte_offset_kernel(pmd, vaddr);
 }
 void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
 {
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
 	pud = pud_page + pud_index(vaddr);
-	if (pud_none(*pud)) {
+	pmd = fill_pmd(pud, vaddr);
-		pmd = (pmd_t *) spp_getpage();
+	pte = fill_pte(pmd, vaddr);
 		pud_populate(&init_mm, pud, pmd);
 		if (pmd != pmd_offset(pud, 0)) {
 			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
 				pmd, pmd_offset(pud, 0));
 			return;
 		}
 	}
 	pmd = pmd_offset(pud, vaddr);
 	if (pmd_none(*pmd)) {
 		pte = (pte_t *) spp_getpage();
 		pmd_populate_kernel(&init_mm, pmd, pte);
 		if (pte != pte_offset_kernel(pmd, 0)) {
 			printk(KERN_ERR "PAGETABLE BUG #02!\n");
 			return;
 		}
 	}
 	pte = pte_offset_kernel(pmd, vaddr);
 	set_pte(pte, new_pte);
 	/*
@ -205,8 +222,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
 	__flush_tlb_one(vaddr);
 }
-void
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 {
 	pgd_t *pgd;
 	pud_t *pud_page;
@ -223,6 +239,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 	set_pte_vaddr_pud(pud_page, vaddr, pteval);
 }
 pmd_t * __init populate_extra_pmd(unsigned long vaddr)
 {
 	/* walk down from the kernel pgd, filling the pud level on the way */
 	pud_t *pud = fill_pud(pgd_offset_k(vaddr), vaddr);
 	return fill_pmd(pud, vaddr);
 }
 pte_t * __init populate_extra_pte(unsigned long vaddr)
 {
 	/* resolve the pmd for vaddr, then fill the pte level below it */
 	return fill_pte(populate_extra_pmd(vaddr), vaddr);
 }
 /*
 * Create large page table mappings for a range of physical addresses.
 */
--- a/block/blktrace.c
+++ b/block/blktrace.c
@ -363,7 +363,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (!bt->sequence)
 		goto err;
-	bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG);
+	bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
 	if (!bt->msg_data)
 		goto err;
--- a/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@ -516,12 +516,12 @@ int acpi_processor_preregister_performance(
 			continue;
 		}
-		if (!performance || !percpu_ptr(performance, i)) {
+		if (!performance || !per_cpu_ptr(performance, i)) {
 			retval = -EINVAL;
 			continue;
 		}
-		pr->performance = percpu_ptr(performance, i);
+		pr->performance = per_cpu_ptr(performance, i);
 		cpumask_set_cpu(i, pr->performance->shared_cpu_map);
 		if (acpi_processor_get_psd(pr)) {
 			retval = -EINVAL;
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@ -65,21 +65,18 @@ extern void free_bootmem(unsigned long addr, unsigned long size);
 #define BOOTMEM_DEFAULT		0
 #define BOOTMEM_EXCLUSIVE	(1<<0)
 extern int reserve_bootmem(unsigned long addr,
 			   unsigned long size,
 			   int flags);
 extern int reserve_bootmem_node(pg_data_t *pgdat,
 				unsigned long physaddr,
 				unsigned long size,
 				int flags);
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 extern int reserve_bootmem(unsigned long addr, unsigned long size, int flags);
 #endif
 extern void *__alloc_bootmem_nopanic(unsigned long size,
 			     unsigned long align,
 			     unsigned long goal);
 extern void *__alloc_bootmem(unsigned long size,
 			     unsigned long align,
 			     unsigned long goal);
-extern void *__alloc_bootmem_low(unsigned long size,
+extern void *__alloc_bootmem_nopanic(unsigned long size,
 				     unsigned long align,
 				     unsigned long goal);
 extern void *__alloc_bootmem_node(pg_data_t *pgdat,
@ -90,30 +87,35 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
 				  unsigned long goal);
 extern void *__alloc_bootmem_low(unsigned long size,
 				 unsigned long align,
 				 unsigned long goal);
 extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
 				      unsigned long size,
 				      unsigned long align,
 				      unsigned long goal);
-#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
+
 #define alloc_bootmem(x) \
 	__alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_nopanic(x) \
 	__alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low(x) \
 	__alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
 #define alloc_bootmem_pages(x) \
 	__alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_pages_nopanic(x) \
 	__alloc_bootmem_nopanic(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low_pages(x) \
 	__alloc_bootmem_low(x, PAGE_SIZE, 0)
 #define alloc_bootmem_node(pgdat, x) \
 	__alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_pages_node(pgdat, x) \
 	__alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_pages_node_nopanic(pgdat, x) \
 	__alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low(x) \
 	__alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
 #define alloc_bootmem_low_pages(x) \
 	__alloc_bootmem_low(x, PAGE_SIZE, 0)
 #define alloc_bootmem_low_pages_node(pgdat, x) \
 	__alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 extern int reserve_bootmem_generic(unsigned long addr, unsigned long size,
 				   int flags);
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@ -76,52 +76,98 @@
 #ifdef CONFIG_SMP
 #ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
 /* minimum unit size, also is the maximum supported allocation size */
 #define PCPU_MIN_UNIT_SIZE		(16UL << PAGE_SHIFT)
 /*
 * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy
 * back on the first chunk if arch is manually allocating and mapping
 * it for faster access (as a part of large page mapping for example).
 * Note that dynamic percpu allocator covers both static and dynamic
 * areas, so these values are bigger than PERCPU_MODULE_RESERVE.
 *
 * On typical configuration with modules, the following values leave
 * about 8k of free space on the first chunk after boot on both x86_32
 * and 64 when module support is enabled.  When module support is
 * disabled, it's much tighter.
 */
 #ifndef PERCPU_DYNAMIC_RESERVE
 #  if BITS_PER_LONG > 32
 #    ifdef CONFIG_MODULES
 #      define PERCPU_DYNAMIC_RESERVE	(6 << PAGE_SHIFT)
 #    else
 #      define PERCPU_DYNAMIC_RESERVE	(4 << PAGE_SHIFT)
 #    endif
 #  else
 #    ifdef CONFIG_MODULES
 #      define PERCPU_DYNAMIC_RESERVE	(4 << PAGE_SHIFT)
 #    else
 #      define PERCPU_DYNAMIC_RESERVE	(2 << PAGE_SHIFT)
 #    endif
 #  endif
 #endif	/* PERCPU_DYNAMIC_RESERVE */
 extern void *pcpu_base_addr;
 typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
 typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);
 extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 					size_t static_size, size_t unit_size,
 					size_t free_size, void *base_addr,
 					pcpu_populate_pte_fn_t populate_pte_fn);
 /*
 * Use this to get to a cpu's version of the per-cpu object
 * dynamically allocated. Non-atomic access to the current CPU's
 * version should probably be combined with get_cpu()/put_cpu().
 */
 #define per_cpu_ptr(ptr, cpu)	SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)))
 #else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
 struct percpu_data {
 	void *ptrs[1];
 };
 #define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata)
-/* 
+
- * Use this to get to a cpu's version of the per-cpu object dynamically
+#define per_cpu_ptr(ptr, cpu)						\
 * allocated. Non-atomic access to the current CPU's version should
 * probably be combined with get_cpu()/put_cpu().
 */ 
 #define percpu_ptr(ptr, cpu)                              \
 ({									\
        struct percpu_data *__p = __percpu_disguise(ptr);		\
        (__typeof__(ptr))__p->ptrs[(cpu)];				\
 })
-extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask);
+#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
-extern void percpu_free(void *__pdata);
+
 extern void *__alloc_percpu(size_t size, size_t align);
 extern void free_percpu(void *__pdata);
 #else /* CONFIG_SMP */
-#define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
+#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
-static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
+/*
+ * !CONFIG_SMP version of __alloc_percpu(): a single zeroed kzalloc()
+ * allocation of @size bytes.  @align is only checked, not honoured.
+ * Returns the kzalloc() result.
+ */
+static inline void *__alloc_percpu(size_t size, size_t align)
 {
 	/*
 	 * Can't easily make larger alignment work with kmalloc.  WARN
 	 * on it.  Larger alignment should only be used for module
 	 * percpu sections on SMP for which this path isn't used.
 	 */
 	WARN_ON_ONCE(align > __alignof__(unsigned long long));
 	/*
 	 * This interface carries no gfp argument (the old
 	 * __percpu_alloc_mask() did); use GFP_KERNEL, the mask the
 	 * legacy __alloc_percpu() macro passed.
 	 */
 	return kzalloc(size, GFP_KERNEL);
 }
-static inline void percpu_free(void *__pdata)
+static inline void free_percpu(void *p)
 {
-	kfree(__pdata);
+	kfree(p);
 }
 #endif /* CONFIG_SMP */
-#define percpu_alloc_mask(size, gfp, mask) \
+#define alloc_percpu(type)	(type *)__alloc_percpu(sizeof(type), \
-	__percpu_alloc_mask((size), (gfp), &(mask))
+						       __alignof__(type))
 #define percpu_alloc(size, gfp) percpu_alloc_mask((size), (gfp), cpu_online_map)
 /* (legacy) interface for use without CPU hotplug handling */
 #define __alloc_percpu(size)	percpu_alloc_mask((size), GFP_KERNEL, \
 						  cpu_possible_map)
 #define alloc_percpu(type)	(type *)__alloc_percpu(sizeof(type))
 #define free_percpu(ptr)	percpu_free((ptr))
 #define per_cpu_ptr(ptr, cpu)	percpu_ptr((ptr), (cpu))
 #endif /* __LINUX_PERCPU_H */
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@ -95,6 +95,9 @@ extern struct vm_struct *remove_vm_area(const void *addr);
 extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
 			struct page ***pages);
 extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
 				    pgprot_t prot, struct page **pages);
 extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
 extern void unmap_kernel_range(unsigned long addr, unsigned long size);
 /* Allocate/destroy a 'vmalloc' VM area. */
@ -110,5 +113,6 @@ extern long vwrite(char *buf, char *addr, unsigned long count);
 */
 extern rwlock_t vmlist_lock;
 extern struct vm_struct *vmlist;
 extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
 #endif /* _LINUX_VMALLOC_H */
--- a/kernel/module.c
+++ b/kernel/module.c
@ -51,6 +51,7 @@
 #include <linux/tracepoint.h>
 #include <linux/ftrace.h>
 #include <linux/async.h>
 #include <linux/percpu.h>
 #if 0
 #define DEBUGP printk
@ -366,6 +367,34 @@ static struct module *find_module(const char *name)
 }
 #ifdef CONFIG_SMP
 #ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
 /*
 * percpu_modalloc - allocate per-cpu storage for a module
 * @size: number of bytes to allocate
 * @align: requested alignment; clamped to PAGE_SIZE (with a warning)
 * @name: name printed in front of the alignment warning
 *
 * Returns whatever __alloc_percpu() returns.  A warning is logged when
 * that is NULL.
 */
 static void *percpu_modalloc(unsigned long size, unsigned long align,
 			     const char *name)
 {
 	void *ptr;
 	if (align > PAGE_SIZE) {
 		/* @align is unsigned long, so it is printed unsigned */
 		printk(KERN_WARNING "%s: per-cpu alignment %lu > %lu\n",
 		       name, align, PAGE_SIZE);
 		align = PAGE_SIZE;
 	}
 	ptr = __alloc_percpu(size, align);
 	if (!ptr)
 		printk(KERN_WARNING
 		       "Could not allocate %lu bytes percpu data\n", size);
 	return ptr;
 }
 /* Thin wrapper: hands @freeme straight to free_percpu(). */
 static void percpu_modfree(void *freeme)
 {
 	free_percpu(freeme);
 }
 #else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
 /* Number of blocks used and allocated. */
 static unsigned int pcpu_num_used, pcpu_num_allocated;
 /* Size of each block.  -ve means used. */
@ -480,21 +509,6 @@ static void percpu_modfree(void *freeme)
 	}
 }
 static unsigned int find_pcpusec(Elf_Ehdr *hdr,
 				 Elf_Shdr *sechdrs,
 				 const char *secstrings)
 {
 	return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
 }
 static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
 {
 	int cpu;
 	for_each_possible_cpu(cpu)
 		memcpy(pcpudest + per_cpu_offset(cpu), from, size);
 }
 static int percpu_modinit(void)
 {
 	pcpu_num_used = 2;
@ -513,7 +527,26 @@ static int percpu_modinit(void)
 	return 0;
 }
 __initcall(percpu_modinit);
 #endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
 /*
 * Look up the ".data.percpu" section of the module image described by
 * @hdr/@sechdrs/@secstrings and return find_sec()'s result unchanged.
 */
 static unsigned int find_pcpusec(Elf_Ehdr *hdr,
 				 Elf_Shdr *sechdrs,
 				 const char *secstrings)
 {
 	return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
 }
 static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
 {
 	int cpu;
 	for_each_possible_cpu(cpu)
 		memcpy(pcpudest + per_cpu_offset(cpu), from, size);
 }
 #else /* ... !CONFIG_SMP */
 static inline void *percpu_modalloc(unsigned long size, unsigned long align,
 				    const char *name)
 {
@ -535,6 +568,7 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
 	/* pcpusec should be 0, and size of that section should be 0. */
 	BUG_ON(size != 0);
 }
 #endif /* CONFIG_SMP */
 #define MODINFO_ATTR(field)	\
--- a/kernel/sched.c
+++ b/kernel/sched.c
@ -9476,7 +9476,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
 {
-	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
 	u64 data;
 #ifndef CONFIG_64BIT
@ -9495,7 +9495,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
 {
-	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
 #ifndef CONFIG_64BIT
 	/*
@ -9591,7 +9591,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 	ca = task_ca(tsk);
 	for (; ca; ca = ca->parent) {
-		u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+		u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
 		*cpuusage += cputime;
 	}
 }
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@ -170,7 +170,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 	 * doesn't hit this CPU until we're ready. */
 	get_cpu();
 	for_each_online_cpu(i) {
-		sm_work = percpu_ptr(stop_machine_work, i);
+		sm_work = per_cpu_ptr(stop_machine_work, i);
 		INIT_WORK(sm_work, stop_cpu);
 		queue_work_on(i, stop_machine_wq, sm_work);
 	}
--- a/mm/Makefile
+++ b/mm/Makefile
@ -30,6 +30,10 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
 obj-$(CONFIG_SMP) += percpu.o
 else
 obj-$(CONFIG_SMP) += allocpercpu.o
 endif
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@ -99,45 +99,51 @@ static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
 	__percpu_populate_mask((__pdata), (size), (gfp), &(mask))
 /**
- * percpu_alloc_mask - initial setup of per-cpu data
+ * alloc_percpu - initial setup of per-cpu data
 * @size: size of per-cpu object
- * @gfp: may sleep or not etc.
+ * @align: alignment
 * @mask: populate per-data for cpu's selected through mask bits
 *
- * Populating per-cpu data for all online cpu's would be a typical use case,
+ * Allocate dynamic percpu area.  Percpu objects are populated with
- * which is simplified by the percpu_alloc() wrapper.
+ * zeroed buffers.
 * Per-cpu objects are populated with zeroed buffers.
 */
-void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
+void *__alloc_percpu(size_t size, size_t align)
 {
 	/*
 	 * We allocate whole cache lines to avoid false sharing
 	 */
 	size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
-	void *pdata = kzalloc(sz, gfp);
+	void *pdata = kzalloc(sz, GFP_KERNEL);
 	void *__pdata = __percpu_disguise(pdata);
 	/*
 	 * Can't easily make larger alignment work with kmalloc.  WARN
 	 * on it.  Larger alignment should only be used for module
 	 * percpu sections on SMP for which this path isn't used.
 	 */
 	WARN_ON_ONCE(align > __alignof__(unsigned long long));
 	if (unlikely(!pdata))
 		return NULL;
-	if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask)))
+	if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL,
 					   &cpu_possible_map)))
 		return __pdata;
 	kfree(pdata);
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
+EXPORT_SYMBOL_GPL(__alloc_percpu);
 /**
- * percpu_free - final cleanup of per-cpu data
+ * free_percpu - final cleanup of per-cpu data
 * @__pdata: object to clean up
 *
 * We simply clean up any per-cpu object left. No need for the client to
 * track and specify through a bis mask which per-cpu objects are to free.
 */
-void percpu_free(void *__pdata)
+void free_percpu(void *__pdata)
 {
 	if (unlikely(!__pdata))
 		return;
 	__percpu_depopulate_mask(__pdata, &cpu_possible_map);
 	kfree(__percpu_disguise(__pdata));
 }
-EXPORT_SYMBOL_GPL(percpu_free);
+EXPORT_SYMBOL_GPL(free_percpu);
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@ -37,6 +37,16 @@ static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
 static int bootmem_debug;
 /*
 * If an arch needs to apply workarounds to bootmem allocation, it can
 * set CONFIG_HAVE_ARCH_BOOTMEM and define a wrapper around
 * __alloc_bootmem_core().
 */
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM
 #define alloc_bootmem_core(bdata, size, align, goal, limit)		\
 	__alloc_bootmem_core((bdata), (size), (align), (goal), (limit))
 #endif
 static int __init bootmem_debug_setup(char *buf)
 {
 	bootmem_debug = 1;
@ -382,7 +392,6 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 	return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
 }
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 /**
 * reserve_bootmem - mark a page range as usable
 * @addr: starting address of the range
@ -403,7 +412,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
 	return mark_bootmem(start, end, 1, flags);
 }
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
 			unsigned long step)
@ -428,7 +436,7 @@ static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
 	return ALIGN(base + off, align) - base;
 }
-static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
+static void * __init __alloc_bootmem_core(struct bootmem_data *bdata,
 				unsigned long size, unsigned long align,
 				unsigned long goal, unsigned long limit)
 {
--- a/mm/percpu.c
+++ b/mm/percpu.c
@ -0,0 +1,979 @@
 /*
 * linux/mm/percpu.c - percpu memory allocator
 *
 * Copyright (C) 2009		SUSE Linux Products GmbH
 * Copyright (C) 2009		Tejun Heo <tj@kernel.org>
 *
 * This file is released under the GPLv2.
 *
 * This is percpu allocator which can handle both static and dynamic
 * areas.  Percpu areas are allocated in chunks in vmalloc area.  Each
 * chunk consists of num_possible_cpus() units and the first chunk
 * is used for static percpu variables in the kernel image (special
 * boot time alloc/init handling necessary as these areas need to be
 * brought up before allocation services are running).  Unit grows as
 * necessary and all units grow or shrink in unison.  When a chunk is
 * filled up, another chunk is allocated.  ie. in vmalloc area
 *
 *  c0                           c1                         c2
 *  -------------------          -------------------        ------------
 * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
 *  -------------------  ......  -------------------  ....  ------------
 *
 * Allocation is done in offset-size areas of single unit space.  Ie,
 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
 * c1:u1, c1:u2 and c1:u3.  Percpu access can be done by configuring
 * percpu base registers UNIT_SIZE apart.
 *
 * There are usually many small percpu allocations, many of them as
 * small as 4 bytes.  The allocator organizes chunks into lists
 * according to free size and tries to allocate from the fullest one.
 * Each chunk keeps the maximum contiguous area size hint which is
 * guaranteed to be equal to or larger than the maximum contiguous
 * area in the chunk.  This helps the allocator not to iterate the
 * chunk maps unnecessarily.
 *
 * Allocation state in each chunk is kept using an array of integers
 * on chunk->map.  A positive value in the map represents a free
 * region and negative allocated.  Allocation inside a chunk is done
 * by scanning this map sequentially and serving the first matching
 * entry.  This is mostly copied from the percpu_modalloc() allocator.
 * Chunks are also linked into a rb tree to ease address to chunk
 * mapping during free.
 *
 * To use this allocator, arch code should do the following.
 *
 * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
 *
 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
 *   regular address to percpu pointer and back
 *
 * - use pcpu_setup_first_chunk() during percpu area initialization to
 *   setup the first chunk containing the kernel static percpu area
 */
 #include <linux/bitmap.h>
 #include <linux/bootmem.h>
 #include <linux/list.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/percpu.h>
 #include <linux/pfn.h>
 #include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 #define PCPU_SLOT_BASE_SHIFT		5	/* 1-31 shares the same slot */
 #define PCPU_DFL_MAP_ALLOC		16	/* start a map with 16 ents */
 /*
 * Bookkeeping for one percpu chunk.
 *
 * @map is an array of signed block sizes covering a single unit: a
 * positive entry is a free block, a negative entry an allocated one.
 * @page is a flexible array holding one page pointer per (cpu, page
 * index) pair; it is indexed through pcpu_page_idx().
 */
 struct pcpu_chunk {
 	struct list_head	list;		/* linked to pcpu_slot lists */
 	struct rb_node		rb_node;	/* key is chunk->vm->addr */
 	int			free_size;	/* free bytes in the chunk */
 	int			contig_hint;	/* max contiguous size hint */
 	struct vm_struct	*vm;		/* mapped vmalloc region */
 	int			map_used;	/* # of map entries used */
 	int			map_alloc;	/* # of map entries allocated */
 	int			*map;		/* allocation map */
 	bool			immutable;	/* no [de]population allowed */
 	struct page		*page[];	/* #cpus * UNIT_PAGES */
 };
 static int pcpu_unit_pages __read_mostly;
 static int pcpu_unit_size __read_mostly;
 static int pcpu_chunk_size __read_mostly;
 static int pcpu_nr_slots __read_mostly;
 static size_t pcpu_chunk_struct_size __read_mostly;
 /* the address of the first chunk which starts with the kernel static area */
 void *pcpu_base_addr __read_mostly;
 EXPORT_SYMBOL_GPL(pcpu_base_addr);
 /* the size of kernel static area */
 static int pcpu_static_size __read_mostly;
 /*
 * One mutex to rule them all.
 *
 * The following mutex is grabbed in the outermost public alloc/free
 * interface functions and released only when the operation is
 * complete.  As such, every function in this file other than the
 * outermost functions are called under pcpu_mutex.
 *
 * It can easily be switched to use spinlock such that only the area
 * allocation and page population commit are protected with it doing
 * actual [de]allocation without holding any lock.  However, given
 * what this allocator does, I think it's better to let them run
 * sequentially.
 */
 static DEFINE_MUTEX(pcpu_mutex);
 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
 static struct rb_root pcpu_addr_root = RB_ROOT;	/* chunks by address */
 /*
 * Translate a size in bytes into a slot index derived from fls(@size).
 * The result is clamped so that it is never smaller than 1.
 */
 static int __pcpu_size_to_slot(int size)
 {
 	int slot = fls(size) - PCPU_SLOT_BASE_SHIFT + 2;
 	return max(slot, 1);
 }
 /*
 * Slot index for @size bytes.  A size equal to a whole unit maps to the
 * last slot; everything else goes through __pcpu_size_to_slot().
 */
 static int pcpu_size_to_slot(int size)
 {
 	return size == pcpu_unit_size ? pcpu_nr_slots - 1
 				      : __pcpu_size_to_slot(size);
 }
 /*
 * Slot that @chunk belongs on.  Chunks whose free size or contiguous
 * hint is below sizeof(int) go to slot 0; otherwise the slot is chosen
 * from the chunk's free size.
 */
 static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
 {
 	const size_t min_area = sizeof(int);
 	if (chunk->free_size >= min_area && chunk->contig_hint >= min_area)
 		return pcpu_size_to_slot(chunk->free_size);
 	return 0;
 }
 /*
 * Index into a chunk's page array for page @page_idx of @cpu's unit:
 * units are laid out back to back, pcpu_unit_pages entries each.
 */
 static int pcpu_page_idx(unsigned int cpu, int page_idx)
 {
 	unsigned int unit_first = cpu * pcpu_unit_pages;
 	return unit_first + page_idx;
 }
 /* Address of the page-array entry for page @page_idx of @cpu in @chunk. */
 static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
 				      unsigned int cpu, int page_idx)
 {
 	int idx = pcpu_page_idx(cpu, page_idx);
 	return chunk->page + idx;
 }
 /*
 * pcpu_chunk_addr - virtual address of a page within a chunk
 * @chunk: chunk of interest
 * @cpu: cpu whose unit is addressed
 * @page_idx: page index inside that cpu's unit
 *
 * Returns chunk->vm->addr plus the byte offset of the page selected by
 * pcpu_page_idx().  The index is widened to unsigned long before it is
 * shifted so the byte offset is not computed in int, which would
 * overflow once the offset reaches 2^31 bytes.
 */
 static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
 				     unsigned int cpu, int page_idx)
 {
 	unsigned long idx = pcpu_page_idx(cpu, page_idx);
 	return (unsigned long)chunk->vm->addr + (idx << PAGE_SHIFT);
 }
 /*
 * Report whether page @page_idx of @chunk has a page assigned.  Only
 * cpu 0's entry is inspected.
 */
 static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
 				     int page_idx)
 {
 	struct page *cpu0_page = *pcpu_chunk_pagep(chunk, 0, page_idx);
 	return cpu0_page != NULL;
 }
 /**
 * pcpu_realloc - allocate, resize or free a buffer of arbitrary size
 * @p: buffer currently in use (NULL when there is none)
 * @size: size of @p in bytes (0 when there is none)
 * @new_size: requested size in bytes (0 to just release @p)
 *
 * A new buffer is obtained from kmalloc() when @new_size fits in a page
 * and from vmalloc() otherwise.  The overlapping part of the old
 * contents is copied over, any growth is zero-filled, and the old
 * buffer is released with kfree() or vfree() according to @size.
 *
 * RETURNS:
 * The new pointer on success, NULL on failure (in which case @p is
 * left untouched).
 */
 static void *pcpu_realloc(void *p, size_t size, size_t new_size)
 {
 	const bool old_in_slab = size <= PAGE_SIZE;
 	const bool new_in_slab = new_size <= PAGE_SIZE;
 	void *new;
 	new = new_in_slab ? kmalloc(new_size, GFP_KERNEL) : vmalloc(new_size);
 	if (!new && new_size)
 		return NULL;
 	memcpy(new, p, min(size, new_size));
 	/* zero whatever lies beyond the copied part */
 	if (new_size > size)
 		memset(new + size, 0, new_size - size);
 	if (old_in_slab)
 		kfree(p);
 	else
 		vfree(p);
 	return new;
 }
 /**
 * pcpu_chunk_relocate - move a chunk to the slot matching its state
 * @chunk: chunk of interest
 * @oslot: slot the chunk was on before it changed
 *
 * Called after an allocation or free has changed @chunk.  The slot
 * for the new state is computed and, if it differs from @oslot, the
 * chunk is moved there: to the head of the list when it moved to a
 * higher slot, to the tail otherwise.
 */
 static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 {
 	int nslot = pcpu_chunk_slot(chunk);
 	if (oslot == nslot)
 		return;
 	if (oslot < nslot)
 		list_move(&chunk->list, &pcpu_slot[nslot]);
 	else
 		list_move_tail(&chunk->list, &pcpu_slot[nslot]);
 }
 /*
 * Descend the address tree looking for a chunk whose vm->addr equals
 * @addr.  Returns the link where the descent stopped: it points at the
 * matching node on a hit and at an empty child slot otherwise.  If
 * @parentp is non-NULL it receives the last node visited (NULL for an
 * empty tree).
 */
 static struct rb_node **pcpu_chunk_rb_search(void *addr,
 					     struct rb_node **parentp)
 {
 	struct rb_node **link = &pcpu_addr_root.rb_node;
 	struct rb_node *last = NULL;
 	while (*link) {
 		struct pcpu_chunk *chunk;
 		last = *link;
 		chunk = rb_entry(last, struct pcpu_chunk, rb_node);
 		if (addr == chunk->vm->addr)
 			break;
 		if (addr < chunk->vm->addr)
 			link = &last->rb_left;
 		else
 			link = &last->rb_right;
 	}
 	if (parentp)
 		*parentp = last;
 	return link;
 }
 /**
 * pcpu_chunk_addr_search - search for chunk containing specified address
 * @addr: address to search for
 *
 * Look for the chunk which might contain @addr.  More specifically, it
 * searches for the chunk with the highest start address which isn't
 * beyond @addr.  It is a bug to call this when no such chunk exists.
 *
 * RETURNS:
 * The address of the found chunk.
 */
 static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 {
 	struct rb_node *node, *closest;
 	node = *pcpu_chunk_rb_search(addr, &closest);
 	if (!node) {
 		/* no exact match, fall back to where the descent ended */
 		node = closest;
 		BUG_ON(!node);
 	}
 	if (addr < rb_entry(node, struct pcpu_chunk, rb_node)->vm->addr) {
 		/* that one starts past @addr, step back to its predecessor */
 		node = rb_prev(node);
 		BUG_ON(!node);
 	}
 	return rb_entry(node, struct pcpu_chunk, rb_node);
 }
 /**
 * pcpu_chunk_addr_insert - insert chunk into address rb tree
 * @new: chunk to insert
 *
 * Link @new into the address rb tree, keyed by new->vm->addr.  It is
 * a bug if a chunk with the same address is already present.
 */
 static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
 {
 	struct rb_node *parent;
 	struct rb_node **link = pcpu_chunk_rb_search(new->vm->addr, &parent);
 	BUG_ON(*link);
 	rb_link_node(&new->rb_node, parent, link);
 	rb_insert_color(&new->rb_node, &pcpu_addr_root);
 }
 /**
 * pcpu_split_block - split a map block
 * @chunk: chunk of interest
 * @i: index of map block to split
 * @head: head size in bytes (can be 0)
 * @tail: tail size in bytes (can be 0)
 *
 * Split the @i'th map block into two or three blocks.  If @head is
 * non-zero, @head bytes block is inserted before block @i moving it
 * to @i+1 and reducing its size by @head bytes.
 *
 * If @tail is non-zero, the target block, which can be @i or @i+1
 * depending on @head, is reduced by @tail bytes and @tail byte block
 * is inserted after the target block.
 *
 * The map array is grown (by doubling) first if it cannot hold the
 * extra entries.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
 static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
 {
 	/* one extra map entry for each of @head and @tail that is non-zero */
 	int nr_extra = !!head + !!tail;
 	int target = chunk->map_used + nr_extra;
 	/* reallocation required? */
 	if (chunk->map_alloc < target) {
 		int new_alloc = chunk->map_alloc;
 		int *new;
 		while (new_alloc < target)
 			new_alloc *= 2;
 		new = pcpu_realloc(chunk->map,
 				   chunk->map_alloc * sizeof(new[0]),
 				   new_alloc * sizeof(new[0]));
 		if (!new)
 			return -ENOMEM;
 		chunk->map_alloc = new_alloc;
 		chunk->map = new;
 	}
 	/*
 	 * insert a new subblock: entries from @i onwards are shifted up by
 	 * nr_extra; map[i] itself still holds the original block size
 	 */
 	memmove(&chunk->map[i + nr_extra], &chunk->map[i],
 		sizeof(chunk->map[0]) * (chunk->map_used - i));
 	chunk->map_used += nr_extra;
 	if (head) {
 		/* the remainder goes to @i+1, then @i becomes the head block */
 		chunk->map[i + 1] = chunk->map[i] - head;
 		chunk->map[i++] = head;
 	}
 	if (tail) {
 		/* shrink the target block and place the tail right after it */
 		chunk->map[i++] -= tail;
 		chunk->map[i] = tail;
 	}
 	return 0;
 }
 /**
 * pcpu_alloc_area - allocate area from a pcpu_chunk
 * @chunk: chunk of interest
 * @size: wanted size in bytes
 * @align: wanted align
 *
 * Try to allocate @size bytes area aligned at @align from @chunk.
 * Note that this function only allocates the offset.  It doesn't
 * populate or map the area.
 *
 * The chunk's contig_hint is refreshed from what the scan observed and
 * the chunk is moved to the slot matching its new state before
 * returning, except when splitting a block fails with -ENOMEM.
 *
 * RETURNS:
 * Allocated offset in @chunk on success, -errno on failure.
 */
 static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
 {
 	int oslot = pcpu_chunk_slot(chunk);
 	/* largest free block seen so far that could not serve the request */
 	int max_contig = 0;
 	int i, off;
 	/*
 	 * The static chunk initially doesn't have map attached
 	 * because kmalloc wasn't available during init.  Give it one.
 	 */
 	if (unlikely(!chunk->map)) {
 		chunk->map = pcpu_realloc(NULL, 0,
 				PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
 		if (!chunk->map)
 			return -ENOMEM;
 		chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
 		/* first entry: the static area, recorded as allocated */
 		chunk->map[chunk->map_used++] = -pcpu_static_size;
 		if (chunk->free_size)
 			chunk->map[chunk->map_used++] = chunk->free_size;
 	}
 	/* @off tracks the byte offset at which block @i starts */
 	for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
 		bool is_last = i + 1 == chunk->map_used;
 		int head, tail;
 		/* extra for alignment requirement */
 		head = ALIGN(off, align) - off;
 		BUG_ON(i == 0 && head != 0);
 		/* negative entries are allocated blocks */
 		if (chunk->map[i] < 0)
 			continue;
 		/* free but too small once the alignment padding is added */
 		if (chunk->map[i] < head + size) {
 			max_contig = max(chunk->map[i], max_contig);
 			continue;
 		}
 		/*
 		 * If head is small or the previous block is free,
 		 * merge'em.  Note that 'small' is defined as smaller
 		 * than sizeof(int), which is very small but isn't too
 		 * uncommon for percpu allocations.
 		 */
 		if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
 			if (chunk->map[i - 1] > 0)
 				chunk->map[i - 1] += head;
 			else {
 				/* previous block is allocated: grow it */
 				chunk->map[i - 1] -= head;
 				chunk->free_size -= head;
 			}
 			chunk->map[i] -= head;
 			off += head;
 			head = 0;
 		}
 		/* if tail is small, just keep it around */
 		tail = chunk->map[i] - head - size;
 		if (tail < sizeof(int))
 			tail = 0;
 		/* split if warranted */
 		if (head || tail) {
 			if (pcpu_split_block(chunk, i, head, tail))
 				return -ENOMEM;
 			if (head) {
 				/* step over the new head block to the target */
 				i++;
 				off += head;
 				max_contig = max(chunk->map[i - 1], max_contig);
 			}
 			if (tail)
 				max_contig = max(chunk->map[i + 1], max_contig);
 		}
 		/* update hint and mark allocated */
 		if (is_last)
 			chunk->contig_hint = max_contig; /* fully scanned */
 		else
 			chunk->contig_hint = max(chunk->contig_hint,
 						 max_contig);
 		chunk->free_size -= chunk->map[i];
 		chunk->map[i] = -chunk->map[i];
 		pcpu_chunk_relocate(chunk, oslot);
 		return off;
 	}
 	chunk->contig_hint = max_contig;	/* fully scanned */
 	pcpu_chunk_relocate(chunk, oslot);
 	/*
 	 * Tell the upper layer that this chunk has no area left.
 	 * Note that this is not an error condition but a notification
 	 * to upper layer that it needs to look at other chunks.
 	 * -ENOSPC is chosen as it isn't used in memory subsystem and
 	 * matches the meaning in a way.
 	 */
 	return -ENOSPC;
 }
 /**
 * pcpu_free_area - free area to a pcpu_chunk
 * @chunk: chunk of interest
 * @freeme: offset of area to free
 *
 * Free area starting from @freeme to @chunk.  Note that this function
 * only modifies the allocation map.  It doesn't depopulate or unmap
 * the area.
 *
 * The freed block is merged with free neighbours, the contiguous hint
 * is raised if the resulting block is larger, and the chunk is moved
 * to the slot matching its new state.  It is a bug if no block starts
 * at @freeme or if that block is not allocated.
 */
 static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
 {
 	int oslot = pcpu_chunk_slot(chunk);
 	int i, off;
 	/* walk the map summing block sizes until a block starts at @freeme */
 	for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
 		if (off == freeme)
 			break;
 	BUG_ON(off != freeme);
 	BUG_ON(chunk->map[i] > 0);
 	/* flip the sign: the block is free from here on */
 	chunk->map[i] = -chunk->map[i];
 	chunk->free_size += chunk->map[i];
 	/* merge with previous? */
 	if (i > 0 && chunk->map[i - 1] >= 0) {
 		chunk->map[i - 1] += chunk->map[i];
 		chunk->map_used--;
 		memmove(&chunk->map[i], &chunk->map[i + 1],
 			(chunk->map_used - i) * sizeof(chunk->map[0]));
 		/* the merged block now lives at the previous index */
 		i--;
 	}
 	/* merge with next? */
 	if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
 		chunk->map[i] += chunk->map[i + 1];
 		chunk->map_used--;
 		memmove(&chunk->map[i + 1], &chunk->map[i + 2],
 			(chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
 	}
 	chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
 	pcpu_chunk_relocate(chunk, oslot);
 }
 /**
 * pcpu_unmap - unmap pages out of a pcpu_chunk
 * @chunk: chunk of interest
 * @page_start: page index of the first page to unmap
 * @page_end: page index of the last page to unmap + 1
 * @flush: whether to flush cache and tlb or not
 *
 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
 * If @flush is true, vcache is flushed before unmapping and tlb
 * after.
 */
 static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
 		       bool flush)
 {
 	unsigned int last = num_possible_cpus() - 1;
 	unsigned int cpu;
 	/* unmap must not be done on immutable chunk */
 	WARN_ON(chunk->immutable);
 	/*
 	 * Each flushing trial can be very expensive, issue flush on
 	 * the whole region at once rather than doing it for each cpu.
 	 * This could be an overkill but is more scalable.
 	 */
 	if (flush)
 		flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
 				   pcpu_chunk_addr(chunk, last, page_end));
 	for_each_possible_cpu(cpu)
 		unmap_kernel_range_noflush(
 				pcpu_chunk_addr(chunk, cpu, page_start),
 				(page_end - page_start) << PAGE_SHIFT);
 	/* ditto as flush_cache_vunmap() */
 	if (flush)
 		flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
 				       pcpu_chunk_addr(chunk, last, page_end));
 }
 /**
 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
 * @chunk: chunk to depopulate
 * @off: offset to the area to depopulate
 * @size: size of the area to depopulate in bytes
 * @flush: whether to flush cache and tlb or not
 *
 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
 * from @chunk.  If @flush is true, vcache is flushed before unmapping
 * and tlb after.
 */
 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
 				  bool flush)
 {
 	int page_start = PFN_DOWN(off);
 	int page_end = PFN_UP(off + size);
 	int unmap_start = -1;
 	int uninitialized_var(unmap_end);
 	unsigned int cpu;
 	int i;
 	for (i = page_start; i < page_end; i++) {
 		for_each_possible_cpu(cpu) {
 			struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
 			if (!*pagep)
 				continue;
 			__free_page(*pagep);
 			/*
 			 * If it's partial depopulation, it might get
 			 * populated or depopulated again.  Mark the
 			 * page gone.
 			 */
 			*pagep = NULL;
 			unmap_start = unmap_start < 0 ? i : unmap_start;
 			unmap_end = i + 1;
 		}
 	}
 	if (unmap_start >= 0)
 		pcpu_unmap(chunk, unmap_start, unmap_end, flush);
 }
 /**
 * pcpu_map - map pages into a pcpu_chunk
 * @chunk: chunk of interest
 * @page_start: page index of the first page to map
 * @page_end: page index of the last page to map + 1
 *
 * Map pages [@page_start,@page_end) of every possible cpu's unit into
 * @chunk and flush the vcache afterwards.  Warns if @chunk is
 * immutable.
 *
 * RETURNS:
 * 0 on success, or the first negative value returned by
 * map_kernel_range_noflush() (in which case no flush is issued).
 */
 static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
 {
 	const unsigned int last_cpu = num_possible_cpus() - 1;
 	unsigned int cpu;
 	/* map must not be done on immutable chunk */
 	WARN_ON(chunk->immutable);
 	for_each_possible_cpu(cpu) {
 		int ret = map_kernel_range_noflush(
 				pcpu_chunk_addr(chunk, cpu, page_start),
 				(page_end - page_start) << PAGE_SHIFT,
 				PAGE_KERNEL,
 				pcpu_chunk_pagep(chunk, cpu, page_start));
 		if (ret < 0)
 			return ret;
 	}
 	/* one flush over the whole span, as pcpu_unmap() does */
 	flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
 			 pcpu_chunk_addr(chunk, last_cpu, page_end));
 	return 0;
 }
 /**
 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
 * @chunk: chunk of interest
 * @off: offset to the area to populate
 * @size: size of the area to populate in bytes
 *
 * For each cpu, allocate and map the not yet populated pages that
 * overlap the byte range [@off, @off + @size) of @chunk.  The area is
 * cleared on return.
 *
 * RETURNS:
 * 0 on success, -ENOMEM on failure, in which case the pages backing
 * the area have been given back.
 */
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
 {
 	const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
 	int page_start = PFN_DOWN(off);
 	int page_end = PFN_UP(off + size);
 	/* [map_start, map_end) is the pending run of newly allocated pages */
 	int map_start = -1;
 	int map_end;
 	unsigned int cpu;
 	int i;
 	for (i = page_start; i < page_end; i++) {
 		if (pcpu_chunk_page_occupied(chunk, i)) {
 			/* already populated: map the pending run, if any */
 			if (map_start >= 0) {
 				if (pcpu_map(chunk, map_start, map_end))
 					goto err;
 				map_start = -1;
 			}
 			continue;
 		}
 		map_start = map_start < 0 ? i : map_start;
 		map_end = i + 1;
 		/* allocate this page index for every cpu, on that cpu's node */
 		for_each_possible_cpu(cpu) {
 			struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
 			*pagep = alloc_pages_node(cpu_to_node(cpu),
 						  alloc_mask, 0);
 			if (!*pagep)
 				goto err;
 		}
 	}
 	/* map whatever run is still pending at the end of the range */
 	if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
 		goto err;
 	/* clear the area in every cpu's unit */
 	for_each_possible_cpu(cpu)
 		memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
 		       size);
 	return 0;
 err:
 	/* likely under heavy memory pressure, give memory back */
 	pcpu_depopulate_chunk(chunk, off, size, true);
 	return -ENOMEM;
 }
 /*
 * Release a chunk: its vm area (if it has one), its allocation map and
 * the chunk structure itself.  A NULL @chunk is ignored.
 */
 static void free_pcpu_chunk(struct pcpu_chunk *chunk)
 {
 	size_t map_bytes;
 	if (!chunk)
 		return;
 	if (chunk->vm)
 		free_vm_area(chunk->vm);
 	/* a new size of 0 makes pcpu_realloc() act as a free */
 	map_bytes = chunk->map_alloc * sizeof(chunk->map[0]);
 	pcpu_realloc(chunk->map, map_bytes, 0);
 	kfree(chunk);
 }
 /*
 * alloc_pcpu_chunk - allocate and initialise an empty chunk
 *
 * Allocates the chunk structure, a default-sized allocation map that
 * holds a single free block of pcpu_unit_size bytes, and a vm area of
 * pcpu_chunk_size bytes.  No pages are populated here.
 *
 * RETURNS:
 * The new chunk on success, NULL if any of the allocations fails.
 */
 static struct pcpu_chunk *alloc_pcpu_chunk(void)
 {
 	struct pcpu_chunk *chunk;
 	chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
 	if (!chunk)
 		return NULL;
 	chunk->map = pcpu_realloc(NULL, 0,
 				  PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
 	/* the map is written to right below, so it must not be NULL */
 	if (!chunk->map) {
 		kfree(chunk);
 		return NULL;
 	}
 	chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
 	chunk->map[chunk->map_used++] = pcpu_unit_size;
 	chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
 	if (!chunk->vm) {
 		free_pcpu_chunk(chunk);
 		return NULL;
 	}
 	INIT_LIST_HEAD(&chunk->list);
 	chunk->free_size = pcpu_unit_size;
 	chunk->contig_hint = pcpu_unit_size;
 	return chunk;
 }
 /**
 * __alloc_percpu - allocate percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Allocate percpu area of @size bytes aligned at @align.  Might
 * sleep.  Might trigger writeouts.
 *
 * A zero @size, a @size above PCPU_MIN_UNIT_SIZE or an @align above
 * PAGE_SIZE is rejected with a warning.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
 void *__alloc_percpu(size_t size, size_t align)
 {
 	void *ptr = NULL;
 	struct pcpu_chunk *chunk;
 	int slot, off;
 	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
 		WARN(true, "illegal size (%zu) or align (%zu) for "
 		     "percpu allocation\n", size, align);
 		return NULL;
 	}
 	mutex_lock(&pcpu_mutex);
 	/* allocate area */
 	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
 		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
 			/* the hint is an upper bound, so this chunk cannot fit it */
 			if (size > chunk->contig_hint)
 				continue;
 			off = pcpu_alloc_area(chunk, size, align);
 			if (off >= 0)
 				goto area_found;
 			/* -ENOSPC only means "try the next chunk" */
 			if (off != -ENOSPC)
 				goto out_unlock;
 		}
 	}
 	/* hmmm... no space left, create a new chunk */
 	chunk = alloc_pcpu_chunk();
 	if (!chunk)
 		goto out_unlock;
 	/* put the new chunk on its slot list and into the address tree */
 	pcpu_chunk_relocate(chunk, -1);
 	pcpu_chunk_addr_insert(chunk);
 	off = pcpu_alloc_area(chunk, size, align);
 	if (off < 0)
 		goto out_unlock;
 area_found:
 	/* populate, map and clear the area */
 	if (pcpu_populate_chunk(chunk, off, size)) {
 		/* undo the map reservation made by pcpu_alloc_area() */
 		pcpu_free_area(chunk, off);
 		goto out_unlock;
 	}
 	ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
 out_unlock:
 	mutex_unlock(&pcpu_mutex);
 	return ptr;
 }
 EXPORT_SYMBOL_GPL(__alloc_percpu);
 /*
 * Destroy @chunk: free and unmap all of its pages (without cache/tlb
 * flushing), unlink it from its slot list and the address tree, then
 * release it.  Warns if @chunk is immutable.
 */
 static void pcpu_kill_chunk(struct pcpu_chunk *chunk)
 {
 	WARN_ON(chunk->immutable);
 	pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
 	list_del(&chunk->list);
 	rb_erase(&chunk->rb_node, &pcpu_addr_root);
 	free_pcpu_chunk(chunk);
 }
 /**
 * free_percpu - free percpu area
 * @ptr: pointer to area to free
 *
 * Free percpu area @ptr.  Might sleep.
 */
 void free_percpu(void *ptr)
 {
 	void *addr = __pcpu_ptr_to_addr(ptr);
 	struct pcpu_chunk *chunk;
 	int off;
 	if (!ptr)
 		return;
 	mutex_lock(&pcpu_mutex);
 	chunk = pcpu_chunk_addr_search(addr);
 	off = addr - chunk->vm->addr;
 	pcpu_free_area(chunk, off);
 	/* the chunk became fully free, kill one if there are other free ones */
 	if (chunk->free_size == pcpu_unit_size) {
 		struct pcpu_chunk *pos;
 		list_for_each_entry(pos,
 				    &pcpu_slot[pcpu_chunk_slot(chunk)], list)
 			if (pos != chunk) {
 				pcpu_kill_chunk(pos);
 				break;
 			}
 	}
 	mutex_unlock(&pcpu_mutex);
 }
 EXPORT_SYMBOL_GPL(free_percpu);
 /**
 * pcpu_setup_first_chunk - initialize the first percpu chunk
 * @get_page_fn: callback to fetch page pointer
 * @static_size: the size of static percpu area in bytes
 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto
 * @free_size: free size in bytes, 0 for auto
 * @base_addr: mapped address, NULL for auto
 * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
 *
 * Initialize the first percpu chunk which contains the kernel static
 * percpu area.  This function is to be called from arch percpu area
 * setup path.  The first two parameters are mandatory.  The rest are
 * optional.
 *
 * @get_page_fn() should return pointer to percpu page given cpu
 * number and page number.  It should at least return enough pages to
 * cover the static area.  The returned pages for static area should
 * have been initialized with valid data.  If @unit_size is specified,
 * it can also return pages after the static area.  NULL return
 * indicates end of pages for the cpu.  Note that @get_page_fn() must
 * return the same number of pages for all cpus.
 *
 * @unit_size, if non-zero, determines unit size and must be aligned
 * to PAGE_SIZE and equal to or larger than @static_size + @free_size.
 *
 * @free_size determines the number of free bytes after the static
 * area in the first chunk.  If zero, whatever left is available.
 * Specifying non-zero value make percpu leave the area after
 * @static_size + @free_size alone.
 *
 * Non-null @base_addr means that the caller already allocated virtual
 * region for the first chunk and mapped it.  percpu must not mess
 * with the chunk.  Note that @base_addr with 0 @unit_size or non-NULL
 * @populate_pte_fn doesn't make any sense.
 *
 * @populate_pte_fn is used to populate the pagetable.  NULL means the
 * caller already populated the pagetable.
 *
 * RETURNS:
 * The determined pcpu_unit_size which can be used to initialize
 * percpu access.
 */
 size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 				     size_t static_size, size_t unit_size,
 				     size_t free_size, void *base_addr,
 				     pcpu_populate_pte_fn_t populate_pte_fn)
 {
 	static struct vm_struct static_vm;
 	struct pcpu_chunk *static_chunk;
 	unsigned int cpu;
 	int nr_pages;
 	int err, i;
 	/* sanity checks */
 	BUG_ON(!static_size);
 	BUG_ON(!unit_size && free_size);
 	BUG_ON(unit_size && unit_size < static_size + free_size);
 	BUG_ON(unit_size & ~PAGE_MASK);
 	BUG_ON(base_addr && !unit_size);
 	BUG_ON(base_addr && populate_pte_fn);
 	/* auto unit size: whichever of the minimum and the static area is larger */
 	if (unit_size)
 		pcpu_unit_pages = unit_size >> PAGE_SHIFT;
 	else
 		pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
 					PFN_UP(static_size));
 	pcpu_static_size = static_size;
 	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 	pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
 	/* the chunk struct is followed by one page pointer per cpu per unit page */
 	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
 		+ num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
 	/*
 	 * Allocate chunk slots.  The additional last slot is for
 	 * empty chunks.
 	 */
 	pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
 	pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
 	for (i = 0; i < pcpu_nr_slots; i++)
 		INIT_LIST_HEAD(&pcpu_slot[i]);
 	/* init static_chunk */
 	static_chunk = alloc_bootmem(pcpu_chunk_struct_size);
 	INIT_LIST_HEAD(&static_chunk->list);
 	static_chunk->vm = &static_vm;
 	if (free_size)
 		static_chunk->free_size = free_size;
 	else
 		static_chunk->free_size = pcpu_unit_size - pcpu_static_size;
 	static_chunk->contig_hint = static_chunk->free_size;
 	/* allocate vm address */
 	static_vm.flags = VM_ALLOC;
 	static_vm.size = pcpu_chunk_size;
 	if (!base_addr)
 		vm_area_register_early(&static_vm, PAGE_SIZE);
 	else {
 		/*
 		 * Pages already mapped.  No need to remap into
 		 * vmalloc area.  In this case the static chunk can't
 		 * be mapped or unmapped by percpu and is marked
 		 * immutable.
 		 */
 		static_vm.addr = base_addr;
 		static_chunk->immutable = true;
 	}
 	/* assign pages */
 	nr_pages = -1;
 	for_each_possible_cpu(cpu) {
 		for (i = 0; i < pcpu_unit_pages; i++) {
 			struct page *page = get_page_fn(cpu, i);
 			if (!page)
 				break;
 			*pcpu_chunk_pagep(static_chunk, cpu, i) = page;
 		}
 		/* the static area must be fully covered for every cpu */
 		BUG_ON(i < PFN_UP(pcpu_static_size));
 		/* and every cpu must have supplied the same number of pages */
 		if (nr_pages < 0)
 			nr_pages = i;
 		else
 			BUG_ON(nr_pages != i);
 	}
 	/* map them */
 	if (populate_pte_fn) {
 		for_each_possible_cpu(cpu)
 			for (i = 0; i < nr_pages; i++)
 				populate_pte_fn(pcpu_chunk_addr(static_chunk,
 								cpu, i));
 		err = pcpu_map(static_chunk, 0, nr_pages);
 		if (err)
 			panic("failed to setup static percpu area, err=%d\n",
 			      err);
 	}
 	/* link static_chunk in */
 	pcpu_chunk_relocate(static_chunk, -1);
 	pcpu_chunk_addr_insert(static_chunk);
 	/* we're done */
 	pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0);
 	return pcpu_unit_size;
 }
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@ -24,6 +24,7 @@
 #include <linux/radix-tree.h>
 #include <linux/rcupdate.h>
 #include <linux/bootmem.h>
 #include <linux/pfn.h>
 #include <asm/atomic.h>
 #include <asm/uaccess.h>
@ -152,7 +153,7 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
 *
 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
 */
-static int vmap_page_range(unsigned long start, unsigned long end,
+static int vmap_page_range_noflush(unsigned long start, unsigned long end,
 				   pgprot_t prot, struct page **pages)
 {
 	pgd_t *pgd;
@ -169,13 +170,22 @@ static int vmap_page_range(unsigned long start, unsigned long end,
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
 	flush_cache_vmap(start, end);
 	if (unlikely(err))
 		return err;
 	return nr;
 }
 /*
 * Flushing variant of vmap_page_range_noflush(): map @pages over the
 * range @start..@end with protection @prot, then call
 * flush_cache_vmap() on that same range.  The flush is issued whether
 * or not the mapping succeeded, and the value returned by
 * vmap_page_range_noflush() is handed back unchanged.
 */
 static int vmap_page_range(unsigned long start, unsigned long end,
 			   pgprot_t prot, struct page **pages)
 {
 	const int mapped = vmap_page_range_noflush(start, end, prot, pages);
 	flush_cache_vmap(start, end);
 	return mapped;
 }
 static inline int is_vmalloc_or_module_addr(const void *x)
 {
 	/*
@ -982,6 +992,32 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
 }
 EXPORT_SYMBOL(vm_map_ram);
 /**
 * vm_area_register_early - register vmap area early during boot
 * @vm: vm_struct to register
 * @align: requested alignment
 *
 * Registers a kernel vm area before vmalloc_init() has been called.
 * The caller fills in @vm->size and @vm->flags and leaves the other
 * fields zero.  The address is taken from VMALLOC_START plus a
 * running offset kept across calls, rounded up to @align; afterwards
 * the offset is moved to the PFN_ALIGN()ed end of the new area.  On
 * return, @vm->addr holds the allocated address and @vm has been
 * pushed onto the head of vmlist.
 *
 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
 */
 void __init vm_area_register_early(struct vm_struct *vm, size_t align)
 {
 	/* bytes already handed out above VMALLOC_START by earlier calls */
 	static size_t early_off __initdata;
 	unsigned long base = ALIGN(VMALLOC_START + early_off, align);
 	/* publish the area at the head of vmlist */
 	vm->next = vmlist;
 	vm->addr = (void *)base;
 	vmlist = vm;
 	/* the next early area starts after the end of this one */
 	early_off = PFN_ALIGN(base + vm->size) - VMALLOC_START;
 }
 void __init vmalloc_init(void)
 {
 	struct vmap_area *va;
@ -1009,6 +1045,58 @@ void __init vmalloc_init(void)
 	vmap_initialized = true;
 }
 /**
 * map_kernel_range_noflush - map kernel VM area with the specified pages
 * @addr: start of the VM area to map
 * @size: size of the VM area to map
 * @prot: page protection flags to use
 * @pages: pages to map
 *
 * Map PFN_UP(@size) pages at @addr.  The VM area that @addr and @size
 * specify should have been allocated using get_vm_area() and its
 * friends.
 *
 * NOTE:
 * This function does NOT do any cache flushing.  The caller is
 * responsible for calling flush_cache_vmap() on the newly mapped
 * area after this function returns, which is the same order
 * vmap_page_range() uses (map first, then flush).
 *
 * RETURNS:
 * The number of pages mapped on success, -errno on failure.
 */
 int map_kernel_range_noflush(unsigned long addr, unsigned long size,
 			     pgprot_t prot, struct page **pages)
 {
 	return vmap_page_range_noflush(addr, addr + size, prot, pages);
 }
 /**
 * unmap_kernel_range_noflush - unmap kernel VM area
 * @addr: start of the VM area to unmap
 * @size: size of the VM area to unmap
 *
 * Unmap PFN_UP(@size) pages at @addr.  The VM area that @addr and
 * @size specify should have been allocated using get_vm_area() and
 * its friends.
 *
 * NOTE:
 * This function does NOT do any cache flushing.  The caller is
 * responsible for calling flush_cache_vunmap() on to-be-unmapped
 * areas before calling this function and flush_tlb_kernel_range()
 * after.
 */
 void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
 {
 	vunmap_page_range(addr, addr + size);
 }
 /**
 * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
 * @addr: start of the VM area to unmap
 * @size: size of the VM area to unmap
 *
 * Similar to unmap_kernel_range_noflush() but flushes vcache before
 * the unmapping and tlb after.
 */
 void unmap_kernel_range(unsigned long addr, unsigned long size)
 {
 	unsigned long end = addr + size;
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@ -1375,10 +1375,10 @@ EXPORT_SYMBOL_GPL(snmp_fold_field);
 int snmp_mib_init(void *ptr[2], size_t mibsize)
 {
 	BUG_ON(ptr == NULL);
-	ptr[0] = __alloc_percpu(mibsize);
+	ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
 	if (!ptr[0])
 		goto err0;
-	ptr[1] = __alloc_percpu(mibsize);
+	ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
 	if (!ptr[1])
 		goto err1;
 	return 0;