2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* linux/mm/page_alloc.c
|
|
|
|
*
|
|
|
|
* Manages the free list, the system allocates free pages here.
|
|
|
|
* Note that kmalloc() lives in slab.c
|
|
|
|
*
|
|
|
|
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
|
|
|
|
* Swap reorganised 29.12.95, Stephen Tweedie
|
|
|
|
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
|
|
|
|
* Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
|
|
|
|
* Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
|
|
|
|
* Zone balancing, Kanoj Sarcar, SGI, Jan 2000
|
|
|
|
* Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
|
|
|
|
* (lots of bits borrowed from Ingo Molnar & Andrew Morton)
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/stddef.h>
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/swap.h>
|
|
|
|
#include <linux/interrupt.h>
|
|
|
|
#include <linux/pagemap.h>
|
2008-03-05 06:28:32 +08:00
|
|
|
#include <linux/jiffies.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/bootmem.h>
|
|
|
|
#include <linux/compiler.h>
|
2005-09-13 16:25:16 +08:00
|
|
|
#include <linux/kernel.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/suspend.h>
|
|
|
|
#include <linux/pagevec.h>
|
|
|
|
#include <linux/blkdev.h>
|
|
|
|
#include <linux/slab.h>
|
2007-10-17 14:25:53 +08:00
|
|
|
#include <linux/oom.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/notifier.h>
|
|
|
|
#include <linux/topology.h>
|
|
|
|
#include <linux/sysctl.h>
|
|
|
|
#include <linux/cpu.h>
|
|
|
|
#include <linux/cpuset.h>
|
2005-10-30 09:16:53 +08:00
|
|
|
#include <linux/memory_hotplug.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/nodemask.h>
|
|
|
|
#include <linux/vmalloc.h>
|
2006-01-06 16:11:17 +08:00
|
|
|
#include <linux/mempolicy.h>
|
2006-06-23 17:03:11 +08:00
|
|
|
#include <linux/stop_machine.h>
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
#include <linux/sort.h>
|
|
|
|
#include <linux/pfn.h>
|
2006-10-20 14:28:16 +08:00
|
|
|
#include <linux/backing-dev.h>
|
2006-12-08 18:39:45 +08:00
|
|
|
#include <linux/fault-inject.h>
|
2007-10-16 16:26:11 +08:00
|
|
|
#include <linux/page-isolation.h>
|
2008-02-07 16:13:53 +08:00
|
|
|
#include <linux/memcontrol.h>
|
2008-04-30 15:55:01 +08:00
|
|
|
#include <linux/debugobjects.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#include <asm/tlbflush.h>
|
2006-05-16 00:43:59 +08:00
|
|
|
#include <asm/div64.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include "internal.h"
|
|
|
|
|
|
|
|
/*
|
Memoryless nodes: Generic management of nodemasks for various purposes
Why do we need to support memoryless nodes?
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> For fujitsu, problem is called "empty" node.
>
> When ACPI's SRAT table includes "possible nodes", ia64 bootstrap(acpi_numa_init)
> creates nodes, which includes no memory, no cpu.
>
> I tried to remove empty-node in past, but that was denied.
> It was because we can hot-add cpu to the empty node.
> (node-hotplug triggered by cpu is not implemented now. and it will be ugly.)
>
>
> For HP, (Lee can comment on this later), they have memory-less-node.
> As far as I hear, HP's machine can have following configuration.
>
> (example)
> Node0: CPU0 memory AAA MB
> Node1: CPU1 memory AAA MB
> Node2: CPU2 memory AAA MB
> Node3: CPU3 memory AAA MB
> Node4: Memory XXX GB
>
> AAA is very small value (below 16MB) and will be omitted by ia64 bootstrap.
> After boot, only Node 4 has valid memory (but have no cpu.)
>
> Maybe this is memory-interleave by firmware config.
Christoph Lameter <clameter@sgi.com> wrote:
> Future SGI platforms (actually also current one can have but nothing like
> that is deployed to my knowledge) have nodes with only cpus. Current SGI
> platforms have nodes with just I/O that we so far cannot manage in the
> core. So the arch code maps them to the nearest memory node.
Lee Schermerhorn <Lee.Schermerhorn@hp.com> wrote:
> For the HP platforms, we can configure each cell with from 0% to 100%
> "cell local memory". When we configure with <100% CLM, the "missing
> percentages" are interleaved by hardware on a cache-line granularity to
> improve bandwidth at the expense of latency for numa-challenged
> applications [and OSes, but not our problem ;-)]. When we boot Linux on
> such a config, all of the real nodes have no memory--it all resides in a
> single interleaved pseudo-node.
>
> When we boot Linux on a 100% CLM configuration [== NUMA], we still have
> the interleaved pseudo-node. It contains a few hundred MB stolen from
> the real nodes to contain the DMA zone. [Interleaved memory resides at
> phys addr 0]. The memoryless-nodes patches, along with the zoneorder
> patches, support this config as well.
>
> Also, when we boot a NUMA config with the "mem=" command line,
> specifying less memory than actually exists, Linux takes the excluded
> memory "off the top" rather than distributing it across the nodes. This
> can result in memoryless nodes, as well.
>
This patch:
Preparation for memoryless node patches.
Provide a generic way to keep nodemasks describing various characteristics of
NUMA nodes.
Remove the node_online_map and the node_possible map and realize the same
functionality using two nodes stats: N_POSSIBLE and N_ONLINE.
[Lee.Schermerhorn@hp.com: Initialize N_*_MEMORY and N_CPU masks for non-NUMA config]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Tested-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Bob Picco <bob.picco@hp.com>
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@skynet.ie>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:25:27 +08:00
|
|
|
* Array of node states.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
Memoryless nodes: Generic management of nodemasks for various purposes
Why do we need to support memoryless nodes?
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> For fujitsu, problem is called "empty" node.
>
> When ACPI's SRAT table includes "possible nodes", ia64 bootstrap(acpi_numa_init)
> creates nodes, which includes no memory, no cpu.
>
> I tried to remove empty-node in past, but that was denied.
> It was because we can hot-add cpu to the empty node.
> (node-hotplug triggered by cpu is not implemented now. and it will be ugly.)
>
>
> For HP, (Lee can comment on this later), they have memory-less-node.
> As far as I hear, HP's machine can have following configuration.
>
> (example)
> Node0: CPU0 memory AAA MB
> Node1: CPU1 memory AAA MB
> Node2: CPU2 memory AAA MB
> Node3: CPU3 memory AAA MB
> Node4: Memory XXX GB
>
> AAA is very small value (below 16MB) and will be omitted by ia64 bootstrap.
> After boot, only Node 4 has valid memory (but have no cpu.)
>
> Maybe this is memory-interleave by firmware config.
Christoph Lameter <clameter@sgi.com> wrote:
> Future SGI platforms (actually also current one can have but nothing like
> that is deployed to my knowledge) have nodes with only cpus. Current SGI
> platforms have nodes with just I/O that we so far cannot manage in the
> core. So the arch code maps them to the nearest memory node.
Lee Schermerhorn <Lee.Schermerhorn@hp.com> wrote:
> For the HP platforms, we can configure each cell with from 0% to 100%
> "cell local memory". When we configure with <100% CLM, the "missing
> percentages" are interleaved by hardware on a cache-line granularity to
> improve bandwidth at the expense of latency for numa-challenged
> applications [and OSes, but not our problem ;-)]. When we boot Linux on
> such a config, all of the real nodes have no memory--it all resides in a
> single interleaved pseudo-node.
>
> When we boot Linux on a 100% CLM configuration [== NUMA], we still have
> the interleaved pseudo-node. It contains a few hundred MB stolen from
> the real nodes to contain the DMA zone. [Interleaved memory resides at
> phys addr 0]. The memoryless-nodes patches, along with the zoneorder
> patches, support this config as well.
>
> Also, when we boot a NUMA config with the "mem=" command line,
> specifying less memory than actually exists, Linux takes the excluded
> memory "off the top" rather than distributing it across the nodes. This
> can result in memoryless nodes, as well.
>
This patch:
Preparation for memoryless node patches.
Provide a generic way to keep nodemasks describing various characteristics of
NUMA nodes.
Remove the node_online_map and the node_possible map and realize the same
functionality using two nodes stats: N_POSSIBLE and N_ONLINE.
[Lee.Schermerhorn@hp.com: Initialize N_*_MEMORY and N_CPU masks for non-NUMA config]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Tested-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Bob Picco <bob.picco@hp.com>
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@skynet.ie>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:25:27 +08:00
|
|
|
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
	/* Every node id starts out marked as possible. */
	[N_POSSIBLE] = NODE_MASK_ALL,
	/* Only node 0 (bit 0 of the first mask word) starts out online. */
	[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
	/*
	 * Without NUMA, node 0 is also preset in the memory and CPU
	 * masks, since nothing else will populate them.
	 */
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
	[N_CPU] = { { [0] = 1UL } },
#endif	/* NUMA */
};
EXPORT_SYMBOL(node_states);
|
|
|
|
|
2005-09-07 06:17:45 +08:00
|
|
|
unsigned long totalram_pages __read_mostly;
|
2006-04-11 13:52:59 +08:00
|
|
|
unsigned long totalreserve_pages __read_mostly;
|
2005-04-17 06:20:36 +08:00
|
|
|
long nr_swap_pages;
|
2006-01-08 17:00:40 +08:00
|
|
|
int percpu_pagelist_fraction;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-10-16 16:26:01 +08:00
|
|
|
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
|
|
|
|
int pageblock_order __read_mostly;
|
|
|
|
#endif
|
|
|
|
|
2006-02-15 05:52:59 +08:00
|
|
|
static void __free_pages_ok(struct page *page, unsigned int order);
|
2006-01-06 16:11:08 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* results with 256, 32 in the lowmem_reserve sysctl:
|
|
|
|
* 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
|
|
|
|
* 1G machine -> (16M dma, 784M normal, 224M high)
|
|
|
|
* NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
|
|
|
|
* HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
|
|
|
|
* HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
|
2005-11-06 00:25:53 +08:00
|
|
|
*
|
|
|
|
* TBD: should special case ZONE_DMA32 machines here - in those we normally
|
|
|
|
* don't need any ZONE_NORMAL reservation
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2006-09-26 14:31:13 +08:00
|
|
|
/*
 * Default lowmem_reserve divisors: one entry for every configured zone
 * except the last (MAX_NR_ZONES-1 entries in total). Entries guarded by
 * a CONFIG_* option only exist when that option is enabled.
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
#ifdef CONFIG_ZONE_DMA
	 256,
#endif
#ifdef CONFIG_ZONE_DMA32
	 256,
#endif
#ifdef CONFIG_HIGHMEM
	 32,
#endif
	 32,	/* always present, whatever the configuration */
};
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
EXPORT_SYMBOL(totalram_pages);
|
|
|
|
|
2006-12-07 12:40:36 +08:00
|
|
|
/*
 * Printable name for each configured zone, listed in the order the
 * initializers appear. "Normal" and "Movable" are always present; the
 * other names are compiled in only with their CONFIG_* option.
 */
static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	 "DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
	 "DMA32",
#endif
	 "Normal",
#ifdef CONFIG_HIGHMEM
	 "HighMem",
#endif
	 "Movable",
};
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
int min_free_kbytes = 1024;
|
|
|
|
|
2006-06-23 17:03:09 +08:00
|
|
|
unsigned long __meminitdata nr_kernel_pages;
|
|
|
|
unsigned long __meminitdata nr_all_pages;
|
2007-05-08 15:23:07 +08:00
|
|
|
static unsigned long __meminitdata dma_reserve;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
|
|
|
|
/*
|
2007-10-20 07:27:18 +08:00
|
|
|
* MAX_ACTIVE_REGIONS determines the maximum number of distinct
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
* ranges of memory (RAM) that may be registered with add_active_range().
|
|
|
|
* Ranges passed to add_active_range() will be merged if possible
|
|
|
|
* so the number of times add_active_range() can be called is
|
|
|
|
* related to the number of nodes and the number of holes
|
|
|
|
*/
|
|
|
|
#if defined(CONFIG_MAX_ACTIVE_REGIONS)
/* An architecture may pick MAX_ACTIVE_REGIONS itself to save memory */
#define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
#elif MAX_NUMNODES >= 32
/* With many possible nodes, budget for up to 50 holes per node */
#define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
#else
/* Otherwise fall back to a fixed limit of 256 distinct regions */
#define MAX_ACTIVE_REGIONS 256
#endif
|
|
|
|
|
2007-07-16 14:38:17 +08:00
|
|
|
static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
|
|
|
|
static int __meminitdata nr_nodemap_entries;
|
|
|
|
static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
|
|
|
|
static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
|
2006-09-27 16:49:59 +08:00
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
|
2007-07-16 14:38:17 +08:00
|
|
|
static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
|
|
|
|
static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
|
2006-09-27 16:49:59 +08:00
|
|
|
#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
|
2007-07-17 19:03:12 +08:00
|
|
|
unsigned long __initdata required_kernelcore;
|
2007-10-16 16:26:03 +08:00
|
|
|
static unsigned long __initdata required_movablecore;
|
2007-07-20 15:31:44 +08:00
|
|
|
unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
|
2007-07-17 19:03:12 +08:00
|
|
|
|
|
|
|
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
|
|
|
|
int movable_zone;
|
|
|
|
EXPORT_SYMBOL(movable_zone);
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
|
|
|
|
|
2007-05-24 04:57:55 +08:00
|
|
|
#if MAX_NUMNODES > 1
|
|
|
|
int nr_node_ids __read_mostly = MAX_NUMNODES;
|
|
|
|
EXPORT_SYMBOL(nr_node_ids);
|
|
|
|
#endif
|
|
|
|
|
2007-10-16 16:25:54 +08:00
|
|
|
int page_group_by_mobility_disabled __read_mostly;
|
|
|
|
|
2007-10-16 16:25:48 +08:00
|
|
|
static void set_pageblock_migratetype(struct page *page, int migratetype)
|
|
|
|
{
|
|
|
|
set_pageblock_flags_group(page, (unsigned long)migratetype,
|
|
|
|
PB_migrate, PB_migrate_end);
|
|
|
|
}
|
|
|
|
|
2006-01-06 16:10:58 +08:00
|
|
|
#ifdef CONFIG_DEBUG_VM
|
2005-10-30 09:16:52 +08:00
|
|
|
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2005-10-30 09:16:53 +08:00
|
|
|
int ret = 0;
|
|
|
|
unsigned seq;
|
|
|
|
unsigned long pfn = page_to_pfn(page);
|
2005-10-30 09:16:52 +08:00
|
|
|
|
2005-10-30 09:16:53 +08:00
|
|
|
do {
|
|
|
|
seq = zone_span_seqbegin(zone);
|
|
|
|
if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
|
|
|
|
ret = 1;
|
|
|
|
else if (pfn < zone->zone_start_pfn)
|
|
|
|
ret = 1;
|
|
|
|
} while (zone_span_seqretry(zone, seq));
|
|
|
|
|
|
|
|
return ret;
|
2005-10-30 09:16:52 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * A page is consistent with @zone when its pfn passes pfn_valid_within()
 * and page_zone() reports the same zone. Returns 1 if both hold, else 0.
 * page_zone() is only consulted once the pfn check has passed.
 */
static int page_is_consistent(struct zone *zone, struct page *page)
{
	return pfn_valid_within(page_to_pfn(page)) && zone == page_zone(page);
}
|
|
|
|
/*
 * Temporary debugging check for pages not lying within a given zone.
 *
 * Returns 1 when @page is outside @zone's pfn span or fails the
 * consistency check against @zone, 0 when both checks pass. The
 * consistency check is skipped once the span check has already failed.
 */
static int bad_range(struct zone *zone, struct page *page)
{
	return page_outside_zone_boundaries(zone, page) ||
		!page_is_consistent(zone, page);
}
|
2006-01-06 16:10:58 +08:00
|
|
|
#else
|
|
|
|
/*
 * Stub used when CONFIG_DEBUG_VM is not set: no range checking is done
 * and every page is reported as being in range.
 */
static inline int bad_range(struct zone *zone, struct page *page)
{
	return 0;
}
|
|
|
|
#endif
|
|
|
|
|
2006-01-06 16:11:11 +08:00
|
|
|
static void bad_page(struct page *page)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2008-03-05 06:29:07 +08:00
|
|
|
void *pc = page_get_page_cgroup(page);
|
|
|
|
|
|
|
|
printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG
|
|
|
|
"page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
|
2006-01-06 16:11:11 +08:00
|
|
|
current->comm, page, (int)(2*sizeof(unsigned long)),
|
|
|
|
(unsigned long)page->flags, page->mapping,
|
|
|
|
page_mapcount(page), page_count(page));
|
2008-03-05 06:29:07 +08:00
|
|
|
if (pc) {
|
|
|
|
printk(KERN_EMERG "cgroup:%p\n", pc);
|
|
|
|
page_reset_bad_cgroup(page);
|
|
|
|
}
|
|
|
|
printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
|
|
|
|
KERN_EMERG "Backtrace:\n");
|
2005-04-17 06:20:36 +08:00
|
|
|
dump_stack();
|
2008-06-10 00:18:45 +08:00
|
|
|
page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD;
|
2005-04-17 06:20:36 +08:00
|
|
|
set_page_count(page, 0);
|
|
|
|
reset_page_mapcount(page);
|
|
|
|
page->mapping = NULL;
|
2005-09-13 16:25:16 +08:00
|
|
|
add_taint(TAINT_BAD_PAGE);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Higher-order pages are called "compound pages". They are structured thusly:
|
|
|
|
*
|
|
|
|
* The first PAGE_SIZE page is called the "head page".
|
|
|
|
*
|
|
|
|
* The remaining PAGE_SIZE pages are called "tail pages".
|
|
|
|
*
|
|
|
|
* All pages have PG_compound set. All pages have their ->private pointing at
|
|
|
|
* the head page (even the head page has this).
|
|
|
|
*
|
[PATCH] compound page: use page[1].lru
If a compound page has its own put_page_testzero destructor (the only current
example is free_huge_page), that is noted in page[1].mapping of the compound
page. But that's rather a poor place to keep it: functions which call
set_page_dirty_lock after get_user_pages (e.g. Infiniband's
__ib_umem_release) ought to be checking first, otherwise set_page_dirty is
liable to crash on what's not the address of a struct address_space.
And now I'm about to make that worse: it turns out that every compound page
needs a destructor, so we can no longer rely on hugetlb pages going their own
special way, to avoid further problems of page->mapping reuse. For example,
not many people know that: on 50% of i386 -Os builds, the first tail page of a
compound page purports to be PageAnon (when its destructor has an odd
address), which surprises page_add_file_rmap.
Keep the compound page destructor in page[1].lru.next instead. And to free up
the common pairing of mapping and index, also move compound page order from
index to lru.prev. Slab reuses page->lru too: but if we ever need slab to use
compound pages, it can easily stack its use above this.
(akpm: decoded version of the above: the tail pages of a compound page now
have ->mapping==NULL, so there's no need for the set_page_dirty[_lock]()
caller to check that they're not compound pages before doing the dirty).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-02-15 05:52:58 +08:00
|
|
|
* The first tail page's ->lru.next holds the address of the compound page's
|
|
|
|
* put_page() function. Its ->lru.prev holds the order of allocation.
|
|
|
|
* This usage means that zero-order pages may not be compound.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2006-02-15 05:52:59 +08:00
|
|
|
|
|
|
|
/*
 * Destructor installed by prep_compound_page(): frees the whole compound
 * page at the order recorded in it.
 */
static void free_compound_page(struct page *page)
{
	unsigned int order = compound_order(page);

	__free_pages_ok(page, order);
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
static void prep_compound_page(struct page *page, unsigned long order)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
int nr_pages = 1 << order;
|
|
|
|
|
2006-12-07 12:33:32 +08:00
|
|
|
set_compound_page_dtor(page, free_compound_page);
|
2007-05-07 05:49:39 +08:00
|
|
|
set_compound_order(page, order);
|
2007-05-07 05:49:40 +08:00
|
|
|
__SetPageHead(page);
|
2007-05-07 05:49:39 +08:00
|
|
|
for (i = 1; i < nr_pages; i++) {
|
2005-04-17 06:20:36 +08:00
|
|
|
struct page *p = page + i;
|
|
|
|
|
2007-05-07 05:49:39 +08:00
|
|
|
__SetPageTail(p);
|
|
|
|
p->first_page = page;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void destroy_compound_page(struct page *page, unsigned long order)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
int nr_pages = 1 << order;
|
|
|
|
|
2007-05-07 05:49:39 +08:00
|
|
|
if (unlikely(compound_order(page) != order))
|
2006-01-06 16:11:11 +08:00
|
|
|
bad_page(page);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-05-07 05:49:40 +08:00
|
|
|
if (unlikely(!PageHead(page)))
|
2007-05-07 05:49:39 +08:00
|
|
|
bad_page(page);
|
2007-05-07 05:49:40 +08:00
|
|
|
__ClearPageHead(page);
|
2007-05-07 05:49:39 +08:00
|
|
|
for (i = 1; i < nr_pages; i++) {
|
2005-04-17 06:20:36 +08:00
|
|
|
struct page *p = page + i;
|
|
|
|
|
2007-05-07 05:49:40 +08:00
|
|
|
if (unlikely(!PageTail(p) |
|
2007-05-07 05:49:39 +08:00
|
|
|
(p->first_page != page)))
|
2006-01-06 16:11:11 +08:00
|
|
|
bad_page(page);
|
2007-05-07 05:49:39 +08:00
|
|
|
__ClearPageTail(p);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-03-22 16:08:41 +08:00
|
|
|
/* Zero all 1 << @order pages starting at @page using clear_highpage(). */
static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
{
	int nr_pages = 1 << order;
	int i;

	/*
	 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
	 * and __GFP_HIGHMEM from hard or soft interrupt context.
	 */
	VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());

	for (i = 0; i < nr_pages; i++)
		clear_highpage(&page[i]);
}
|
|
|
|
|
2006-04-19 13:20:52 +08:00
|
|
|
/*
 * Mark @page as a free buddy block: stash @order in page_private() and
 * set PG_buddy.
 */
static inline void set_page_order(struct page *page, int order)
{
	set_page_private(page, order);	/* order lives in ->private */
	__SetPageBuddy(page);
}
|
|
|
|
|
|
|
|
/*
 * Reverse of set_page_order(): clear PG_buddy on @page and reset its
 * page_private() value to 0.
 */
static inline void rmv_page_order(struct page *page)
{
	__ClearPageBuddy(page);
	set_page_private(page, 0);	/* forget the recorded order */
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Locate the struct page for both the matching buddy in our
|
|
|
|
* pair (buddy1) and the combined O(n+1) page they form (page).
|
|
|
|
*
|
|
|
|
* 1) Any buddy B1 will have an order O twin B2 which satisfies
|
|
|
|
* the following equation:
|
|
|
|
* B2 = B1 ^ (1 << O)
|
|
|
|
* For example, if the starting buddy (buddy2) is #8 its order
|
|
|
|
* 1 buddy is #10:
|
|
|
|
* B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
|
|
|
|
*
|
|
|
|
* 2) Any buddy B will have an order O+1 parent P which
|
|
|
|
* satisfies the following equation:
|
|
|
|
* P = B & ~(1 << O)
|
|
|
|
*
|
2006-06-27 00:35:02 +08:00
|
|
|
* Assumption: *_mem_map is contiguous at least up to MAX_ORDER
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
static inline struct page *
|
|
|
|
__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
|
|
|
|
{
|
|
|
|
unsigned long buddy_idx = page_idx ^ (1 << order);
|
|
|
|
|
|
|
|
return page + (buddy_idx - page_idx);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Return @page_idx with bit @order cleared, i.e. the index of the
 * order-(@order + 1) block that contains the order-@order block at
 * @page_idx.
 */
static inline unsigned long
__find_combined_index(unsigned long page_idx, unsigned int order)
{
	unsigned long keep_mask = ~(1 << order);

	return page_idx & keep_mask;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This function checks whether a page is free && is the buddy
|
|
|
|
* of another page. We can coalesce a page and its buddy if
|
2006-01-06 16:10:58 +08:00
|
|
|
* (a) the buddy is not in a hole &&
|
2006-04-10 09:21:48 +08:00
|
|
|
* (b) the buddy is in the buddy system &&
|
2006-06-23 17:03:01 +08:00
|
|
|
* (c) a page and its buddy have the same order &&
|
|
|
|
* (d) a page and its buddy are in the same zone.
|
2006-04-10 09:21:48 +08:00
|
|
|
*
|
|
|
|
* For recording whether a page is in the buddy system, we use PG_buddy.
|
|
|
|
* Setting, clearing, and testing PG_buddy is serialized by zone->lock.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
2006-04-10 09:21:48 +08:00
|
|
|
* For recording page's order, we use page_private(page).
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2006-06-23 17:03:01 +08:00
|
|
|
/*
 * Return 1 if @buddy can be merged with @page at @order, 0 otherwise.
 * @buddy must have a valid pfn, share @page's zone id, carry PG_buddy and
 * have @order recorded as its page order.  A buddy passing all of that
 * with a non-zero page count trips BUG_ON().
 */
static inline int page_is_buddy(struct page *page, struct page *buddy,
								int order)
{
	if (!pfn_valid_within(page_to_pfn(buddy)))
		return 0;

	if (page_zone_id(page) != page_zone_id(buddy))
		return 0;

	if (!PageBuddy(buddy) || page_order(buddy) != order)
		return 0;

	BUG_ON(page_count(buddy) != 0);
	return 1;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Freeing function for a buddy system allocator.
|
|
|
|
*
|
|
|
|
* The concept of a buddy system is to maintain direct-mapped table
|
|
|
|
* (containing bit values) for memory blocks of various "orders".
|
|
|
|
* The bottom level table contains the map for the smallest allocatable
|
|
|
|
* units of memory (here, pages), and each level above it describes
|
|
|
|
* pairs of units from the levels below, hence, "buddies".
|
|
|
|
* At a high level, all that happens here is marking the table entry
|
|
|
|
* at the bottom level available, and propagating the changes upward
|
|
|
|
* as necessary, plus some accounting needed to play nicely with other
|
|
|
|
* parts of the VM system.
|
|
|
|
* At each level, we keep a list of pages, which are heads of continuous
|
2006-04-10 09:21:48 +08:00
|
|
|
* free pages of length of (1 << order) and marked with PG_buddy. Page's
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:40 +08:00
|
|
|
* order is recorded in page_private(page) field.
|
2005-04-17 06:20:36 +08:00
|
|
|
* So when we are allocating or freeing one, we can derive the state of the
|
|
|
|
* other. That is, if we allocate a small block, and both were
|
|
|
|
* free, the remainder of the region must be split into blocks.
|
|
|
|
* If a block is freed, and its buddy is also free, then this
|
|
|
|
* triggers coalescing into a block of larger size.
|
|
|
|
*
|
|
|
|
* -- wli
|
|
|
|
*/
|
|
|
|
|
2006-01-08 17:00:42 +08:00
|
|
|
static inline void __free_one_page(struct page *page,
|
2005-04-17 06:20:36 +08:00
|
|
|
struct zone *zone, unsigned int order)
|
|
|
|
{
|
|
|
|
unsigned long page_idx;
|
|
|
|
int order_size = 1 << order;
|
2007-10-16 16:25:48 +08:00
|
|
|
int migratetype = get_pageblock_migratetype(page);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-01-06 16:11:11 +08:00
|
|
|
if (unlikely(PageCompound(page)))
|
2005-04-17 06:20:36 +08:00
|
|
|
destroy_compound_page(page, order);
|
|
|
|
|
|
|
|
page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
|
|
|
|
|
2006-09-26 14:30:55 +08:00
|
|
|
VM_BUG_ON(page_idx & (order_size - 1));
|
|
|
|
VM_BUG_ON(bad_range(zone, page));
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-02-10 17:43:02 +08:00
|
|
|
__mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
|
2005-04-17 06:20:36 +08:00
|
|
|
while (order < MAX_ORDER-1) {
|
|
|
|
unsigned long combined_idx;
|
|
|
|
struct page *buddy;
|
|
|
|
|
|
|
|
buddy = __page_find_buddy(page, page_idx, order);
|
2006-06-23 17:03:01 +08:00
|
|
|
if (!page_is_buddy(page, buddy, order))
|
2005-04-17 06:20:36 +08:00
|
|
|
break; /* Move the buddy up one level. */
|
2006-01-06 16:10:58 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
list_del(&buddy->lru);
|
2007-10-16 16:25:48 +08:00
|
|
|
zone->free_area[order].nr_free--;
|
2005-04-17 06:20:36 +08:00
|
|
|
rmv_page_order(buddy);
|
2006-01-06 16:10:58 +08:00
|
|
|
combined_idx = __find_combined_index(page_idx, order);
|
2005-04-17 06:20:36 +08:00
|
|
|
page = page + (combined_idx - page_idx);
|
|
|
|
page_idx = combined_idx;
|
|
|
|
order++;
|
|
|
|
}
|
|
|
|
set_page_order(page, order);
|
2007-10-16 16:25:48 +08:00
|
|
|
list_add(&page->lru,
|
|
|
|
&zone->free_area[order].free_list[migratetype]);
|
2005-04-17 06:20:36 +08:00
|
|
|
zone->free_area[order].nr_free++;
|
|
|
|
}
|
|
|
|
|
2006-01-06 16:11:11 +08:00
|
|
|
static inline int free_pages_check(struct page *page)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2006-01-06 16:10:57 +08:00
|
|
|
if (unlikely(page_mapcount(page) |
|
|
|
|
(page->mapping != NULL) |
|
2008-03-05 06:29:07 +08:00
|
|
|
(page_get_page_cgroup(page) != NULL) |
|
2006-01-06 16:10:57 +08:00
|
|
|
(page_count(page) != 0) |
|
2008-06-10 00:18:45 +08:00
|
|
|
(page->flags & PAGE_FLAGS_CHECK_AT_FREE)))
|
2006-01-06 16:11:11 +08:00
|
|
|
bad_page(page);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (PageDirty(page))
|
2005-09-04 06:54:50 +08:00
|
|
|
__ClearPageDirty(page);
|
2005-11-22 13:32:20 +08:00
|
|
|
/*
|
|
|
|
* For now, we report if PG_reserved was found set, but do not
|
|
|
|
* clear it, and do not free the page. But we shall soon need
|
|
|
|
* to do more, for when the ZERO_PAGE count wraps negative.
|
|
|
|
*/
|
|
|
|
return PageReserved(page);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Frees a list of pages.
|
|
|
|
* Assumes all pages on list are in same zone, and of same order.
|
2005-09-10 15:26:59 +08:00
|
|
|
* count is the number of pages to free.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* If the zone was previously in an "all pages pinned" state then look to
|
|
|
|
* see if this freeing clears that state.
|
|
|
|
*
|
|
|
|
* And clear the zone's pages_scanned counter, to hold off the "all pages are
|
|
|
|
* pinned" detection logic.
|
|
|
|
*/
|
2006-01-08 17:00:42 +08:00
|
|
|
static void free_pages_bulk(struct zone *zone, int count,
|
|
|
|
struct list_head *list, int order)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2006-01-06 16:10:56 +08:00
|
|
|
spin_lock(&zone->lock);
|
2007-10-17 14:25:54 +08:00
|
|
|
zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
|
2005-04-17 06:20:36 +08:00
|
|
|
zone->pages_scanned = 0;
|
2006-01-08 17:00:42 +08:00
|
|
|
while (count--) {
|
|
|
|
struct page *page;
|
|
|
|
|
2006-09-26 14:30:55 +08:00
|
|
|
VM_BUG_ON(list_empty(list));
|
2005-04-17 06:20:36 +08:00
|
|
|
page = list_entry(list->prev, struct page, lru);
|
2006-01-08 17:00:42 +08:00
|
|
|
/* have to delete it as __free_one_page list manipulates */
|
2005-04-17 06:20:36 +08:00
|
|
|
list_del(&page->lru);
|
2006-01-08 17:00:42 +08:00
|
|
|
__free_one_page(page, zone, order);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2006-01-06 16:10:56 +08:00
|
|
|
spin_unlock(&zone->lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2006-01-08 17:00:42 +08:00
|
|
|
static void free_one_page(struct zone *zone, struct page *page, int order)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2006-09-26 14:31:48 +08:00
|
|
|
spin_lock(&zone->lock);
|
2007-10-17 14:25:54 +08:00
|
|
|
zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
|
2006-09-26 14:31:48 +08:00
|
|
|
zone->pages_scanned = 0;
|
2006-12-07 12:31:38 +08:00
|
|
|
__free_one_page(page, zone, order);
|
2006-09-26 14:31:48 +08:00
|
|
|
spin_unlock(&zone->lock);
|
2006-01-08 17:00:42 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void __free_pages_ok(struct page *page, unsigned int order)
|
|
|
|
{
|
|
|
|
unsigned long flags;
|
2005-04-17 06:20:36 +08:00
|
|
|
int i;
|
2005-11-22 13:32:20 +08:00
|
|
|
int reserved = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
for (i = 0 ; i < (1 << order) ; ++i)
|
2006-01-06 16:11:11 +08:00
|
|
|
reserved += free_pages_check(page + i);
|
2005-11-22 13:32:20 +08:00
|
|
|
if (reserved)
|
|
|
|
return;
|
|
|
|
|
2008-04-30 15:55:01 +08:00
|
|
|
if (!PageHighMem(page)) {
|
2006-10-11 16:21:30 +08:00
|
|
|
debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
|
2008-04-30 15:55:01 +08:00
|
|
|
debug_check_no_obj_freed(page_address(page),
|
|
|
|
PAGE_SIZE << order);
|
|
|
|
}
|
2006-10-11 16:21:30 +08:00
|
|
|
arch_free_page(page, order);
|
2006-01-08 17:00:42 +08:00
|
|
|
kernel_map_pages(page, 1 << order, 0);
|
2006-10-11 16:21:30 +08:00
|
|
|
|
2006-01-06 16:10:56 +08:00
|
|
|
local_irq_save(flags);
|
2006-06-30 16:55:45 +08:00
|
|
|
__count_vm_events(PGFREE, 1 << order);
|
2006-01-08 17:00:42 +08:00
|
|
|
free_one_page(page_zone(page), page, order);
|
2006-01-06 16:10:56 +08:00
|
|
|
local_irq_restore(flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2006-01-06 16:11:08 +08:00
|
|
|
/*
|
|
|
|
* permit the bootmem allocator to evade page validation on high-order frees
|
|
|
|
*/
|
2008-04-28 17:13:34 +08:00
|
|
|
void __free_pages_bootmem(struct page *page, unsigned int order)
|
2006-01-06 16:11:08 +08:00
|
|
|
{
|
|
|
|
if (order == 0) {
|
|
|
|
__ClearPageReserved(page);
|
|
|
|
set_page_count(page, 0);
|
2006-03-22 16:08:40 +08:00
|
|
|
set_page_refcounted(page);
|
2006-03-22 16:08:07 +08:00
|
|
|
__free_page(page);
|
2006-01-06 16:11:08 +08:00
|
|
|
} else {
|
|
|
|
int loop;
|
|
|
|
|
2006-03-22 16:08:07 +08:00
|
|
|
prefetchw(page);
|
2006-01-06 16:11:08 +08:00
|
|
|
for (loop = 0; loop < BITS_PER_LONG; loop++) {
|
|
|
|
struct page *p = &page[loop];
|
|
|
|
|
2006-03-22 16:08:07 +08:00
|
|
|
if (loop + 1 < BITS_PER_LONG)
|
|
|
|
prefetchw(p + 1);
|
2006-01-06 16:11:08 +08:00
|
|
|
__ClearPageReserved(p);
|
|
|
|
set_page_count(p, 0);
|
|
|
|
}
|
|
|
|
|
2006-03-22 16:08:40 +08:00
|
|
|
set_page_refcounted(page);
|
2006-03-22 16:08:07 +08:00
|
|
|
__free_pages(page, order);
|
2006-01-06 16:11:08 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The order of subdivision here is critical for the IO subsystem.
|
|
|
|
* Please do not alter this order without good reasons and regression
|
|
|
|
* testing. Specifically, as large blocks of memory are subdivided,
|
|
|
|
* the order in which smaller blocks are delivered depends on the order
|
|
|
|
* they're subdivided in this function. This is the primary factor
|
|
|
|
* influencing the order in which pages are delivered to the IO
|
|
|
|
* subsystem according to empirical testing, and this is also justified
|
|
|
|
* by considering the behavior of a buddy system containing a single
|
|
|
|
* large block of memory acted on by a series of small allocations.
|
|
|
|
* This behavior is a critical factor in sglist merging's success.
|
|
|
|
*
|
|
|
|
* -- wli
|
|
|
|
*/
|
2006-01-06 16:11:01 +08:00
|
|
|
static inline void expand(struct zone *zone, struct page *page,
|
2007-10-16 16:25:48 +08:00
|
|
|
int low, int high, struct free_area *area,
|
|
|
|
int migratetype)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
unsigned long size = 1 << high;
|
|
|
|
|
|
|
|
while (high > low) {
|
|
|
|
area--;
|
|
|
|
high--;
|
|
|
|
size >>= 1;
|
2006-09-26 14:30:55 +08:00
|
|
|
VM_BUG_ON(bad_range(zone, &page[size]));
|
2007-10-16 16:25:48 +08:00
|
|
|
list_add(&page[size].lru, &area->free_list[migratetype]);
|
2005-04-17 06:20:36 +08:00
|
|
|
area->nr_free++;
|
|
|
|
set_page_order(&page[size], high);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This page is about to be returned from the page allocator
|
|
|
|
*/
|
2006-03-22 16:08:41 +08:00
|
|
|
static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2006-01-06 16:10:57 +08:00
|
|
|
if (unlikely(page_mapcount(page) |
|
|
|
|
(page->mapping != NULL) |
|
2008-03-05 06:29:07 +08:00
|
|
|
(page_get_page_cgroup(page) != NULL) |
|
2006-01-06 16:10:57 +08:00
|
|
|
(page_count(page) != 0) |
|
2008-06-10 00:18:45 +08:00
|
|
|
(page->flags & PAGE_FLAGS_CHECK_AT_PREP)))
|
2006-01-06 16:11:11 +08:00
|
|
|
bad_page(page);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-11-22 13:32:20 +08:00
|
|
|
/*
|
|
|
|
* For now, we report if PG_reserved was found set, but do not
|
|
|
|
* clear it, and do not allocate the page: as a safety net.
|
|
|
|
*/
|
|
|
|
if (PageReserved(page))
|
|
|
|
return 1;
|
|
|
|
|
2008-04-28 17:12:52 +08:00
|
|
|
page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
|
2005-04-17 06:20:36 +08:00
|
|
|
1 << PG_referenced | 1 << PG_arch_1 |
|
2007-03-01 12:12:27 +08:00
|
|
|
1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:40 +08:00
|
|
|
set_page_private(page, 0);
|
2006-03-22 16:08:40 +08:00
|
|
|
set_page_refcounted(page);
|
2006-12-07 12:32:00 +08:00
|
|
|
|
|
|
|
arch_alloc_page(page, order);
|
2005-04-17 06:20:36 +08:00
|
|
|
kernel_map_pages(page, 1 << order, 1);
|
2006-03-22 16:08:41 +08:00
|
|
|
|
|
|
|
if (gfp_flags & __GFP_ZERO)
|
|
|
|
prep_zero_page(page, order, gfp_flags);
|
|
|
|
|
|
|
|
if (order && (gfp_flags & __GFP_COMP))
|
|
|
|
prep_compound_page(page, order);
|
|
|
|
|
2005-11-22 13:32:20 +08:00
|
|
|
return 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
Bias the location of pages freed for min_free_kbytes in the same MAX_ORDER_NR_PAGES blocks
The standard buddy allocator always favours the smallest block of pages.
The effect of this is that the pages free to satisfy min_free_kbytes tends
to be preserved since boot time at the same location of memory for a very
long time and as a contiguous block. When an administrator sets the
reserve at 16384 at boot time, it tends to be the same MAX_ORDER blocks
that remain free. This allows the occasional high atomic allocation to
succeed up until the point the blocks are split. In practice, it is
difficult to split these blocks but when they do split, the benefit of
having min_free_kbytes for contiguous blocks disappears. Additionally,
increasing min_free_kbytes once the system has been running for some time
has no guarantee of creating contiguous blocks.
On the other hand, CONFIG_PAGE_GROUP_BY_MOBILITY favours splitting large
blocks when there are no free pages of the appropriate type available. A
side-effect of this is that all blocks in memory tends to be used up and
the contiguous free blocks from boot time are not preserved like in the
vanilla allocator. This can cause a problem if a new caller is unwilling
to reclaim or does not reclaim for long enough.
A failure scenario was found for a wireless network device allocating
order-1 atomic allocations but the allocations were not intense or frequent
enough for a whole block of pages to be preserved for MIGRATE_HIGHALLOC.
This was reproduced on a desktop by booting with mem=256mb, forcing the
driver to allocate at order-1, running a bittorrent client (downloading a
debian ISO) and building a kernel with -j2.
This patch addresses the problem on the desktop machine booted with
mem=256mb. It works by setting aside a reserve of MAX_ORDER_NR_PAGES
blocks, the number of which depends on the value of min_free_kbytes. These
blocks are only fallen back to when there is no other free pages. Then the
smallest possible page is used just like the normal buddy allocator instead
of the largest possible page to preserve contiguous pages The pages in free
lists in the reserve blocks are never taken for another migrate type. The
result is that even if min_free_kbytes is set to a low value, contiguous
blocks will be preserved in the MIGRATE_RESERVE blocks.
This works better than the vanilla allocator because if min_free_kbytes is
increased, a new reserve block will be chosen based on the location of
reclaimable pages and the block will free up as contiguous pages. In the
vanilla allocator, no effort is made to target a block of pages to free as
contiguous pages and min_free_kbytes pages are scattered randomly.
This effect has been observed on the test machine. min_free_kbytes was set
initially low but it was kept as a contiguous free block within
MIGRATE_RESERVE. min_free_kbytes was then set to a higher value and over a
period of time, the free blocks were within the reserve and coalescing.
How long it takes to free up depends on how quickly LRU is rotating.
Amusingly, this means that more activity will free the blocks faster.
This mechanism potentially replaces MIGRATE_HIGHALLOC as it may be more
effective than grouping contiguous free pages together. It all depends on
whether the number of active atomic high allocations exceeds
min_free_kbytes or not. If the number of active allocations exceeds
min_free_kbytes, it's worth it but maybe in that situation, min_free_kbytes
should be set higher. Once there are no more reports of allocation
failures, a patch will be submitted that backs out MIGRATE_HIGHALLOC and
see if the reports stay missing.
Credit to Mariusz Kozlowski for discovering the problem, describing the
failure scenario and testing patches and scenarios.
[akpm@linux-foundation.org: cleanups]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:25:58 +08:00
|
|
|
/*
|
|
|
|
* Go through the free lists for the given migratetype and remove
|
|
|
|
* the smallest available page from the freelists
|
|
|
|
*/
|
|
|
|
static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
|
|
|
|
int migratetype)
|
|
|
|
{
|
|
|
|
unsigned int current_order;
|
|
|
|
struct free_area * area;
|
|
|
|
struct page *page;
|
|
|
|
|
|
|
|
/* Find a page of the appropriate size in the preferred list */
|
|
|
|
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
|
|
|
|
area = &(zone->free_area[current_order]);
|
|
|
|
if (list_empty(&area->free_list[migratetype]))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
page = list_entry(area->free_list[migratetype].next,
|
|
|
|
struct page, lru);
|
|
|
|
list_del(&page->lru);
|
|
|
|
rmv_page_order(page);
|
|
|
|
area->nr_free--;
|
|
|
|
__mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
|
|
|
|
expand(zone, page, order, current_order, area, migratetype);
|
|
|
|
return page;
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2007-10-16 16:25:48 +08:00
|
|
|
/*
|
|
|
|
* This array describes the order lists are fallen back to when
|
|
|
|
* the free lists for the desirable migrate type are depleted
|
|
|
|
*/
|
|
|
|
static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
|
2007-10-16 16:25:59 +08:00
|
|
|
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
|
|
|
|
[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
|
|
|
|
[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
|
|
|
|
[MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */
|
2007-10-16 16:25:48 +08:00
|
|
|
};
|
|
|
|
|
2007-10-16 16:25:51 +08:00
|
|
|
/*
|
|
|
|
* Move the free pages in a range to the free lists of the requested type.
|
2007-10-16 16:26:01 +08:00
|
|
|
* Note that start_page and end_pages are not aligned on a pageblock
|
2007-10-16 16:25:51 +08:00
|
|
|
* boundary. If alignment is required, use move_freepages_block()
|
|
|
|
*/
|
|
|
|
int move_freepages(struct zone *zone,
|
|
|
|
struct page *start_page, struct page *end_page,
|
|
|
|
int migratetype)
|
|
|
|
{
|
|
|
|
struct page *page;
|
|
|
|
unsigned long order;
|
2007-10-16 16:26:00 +08:00
|
|
|
int pages_moved = 0;
|
2007-10-16 16:25:51 +08:00
|
|
|
|
|
|
|
#ifndef CONFIG_HOLES_IN_ZONE
|
|
|
|
/*
|
|
|
|
* page_zone is not safe to call in this context when
|
|
|
|
* CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
|
|
|
|
* anyway as we check zone boundaries in move_freepages_block().
|
|
|
|
* Remove at a later date when no bug reports exist related to
|
2007-10-16 16:25:58 +08:00
|
|
|
* grouping pages by mobility
|
2007-10-16 16:25:51 +08:00
|
|
|
*/
|
|
|
|
BUG_ON(page_zone(start_page) != page_zone(end_page));
|
|
|
|
#endif
|
|
|
|
|
|
|
|
for (page = start_page; page <= end_page;) {
|
|
|
|
if (!pfn_valid_within(page_to_pfn(page))) {
|
|
|
|
page++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!PageBuddy(page)) {
|
|
|
|
page++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
order = page_order(page);
|
|
|
|
list_del(&page->lru);
|
|
|
|
list_add(&page->lru,
|
|
|
|
&zone->free_area[order].free_list[migratetype]);
|
|
|
|
page += 1 << order;
|
2007-10-16 16:26:00 +08:00
|
|
|
pages_moved += 1 << order;
|
2007-10-16 16:25:51 +08:00
|
|
|
}
|
|
|
|
|
2007-10-16 16:26:00 +08:00
|
|
|
return pages_moved;
|
2007-10-16 16:25:51 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
int move_freepages_block(struct zone *zone, struct page *page, int migratetype)
|
|
|
|
{
|
|
|
|
unsigned long start_pfn, end_pfn;
|
|
|
|
struct page *start_page, *end_page;
|
|
|
|
|
|
|
|
start_pfn = page_to_pfn(page);
|
2007-10-16 16:26:01 +08:00
|
|
|
start_pfn = start_pfn & ~(pageblock_nr_pages-1);
|
2007-10-16 16:25:51 +08:00
|
|
|
start_page = pfn_to_page(start_pfn);
|
2007-10-16 16:26:01 +08:00
|
|
|
end_page = start_page + pageblock_nr_pages - 1;
|
|
|
|
end_pfn = start_pfn + pageblock_nr_pages - 1;
|
2007-10-16 16:25:51 +08:00
|
|
|
|
|
|
|
/* Do not cross zone boundaries */
|
|
|
|
if (start_pfn < zone->zone_start_pfn)
|
|
|
|
start_page = page;
|
|
|
|
if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return move_freepages(zone, start_page, end_page, migratetype);
|
|
|
|
}
|
|
|
|
|
2007-10-16 16:25:48 +08:00
|
|
|
/* Remove an element from the buddy allocator from the fallback list */
|
|
|
|
static struct page *__rmqueue_fallback(struct zone *zone, int order,
|
|
|
|
int start_migratetype)
|
|
|
|
{
|
|
|
|
struct free_area * area;
|
|
|
|
int current_order;
|
|
|
|
struct page *page;
|
|
|
|
int migratetype, i;
|
|
|
|
|
|
|
|
/* Find the largest possible block of pages in the other list */
|
|
|
|
for (current_order = MAX_ORDER-1; current_order >= order;
|
|
|
|
--current_order) {
|
|
|
|
for (i = 0; i < MIGRATE_TYPES - 1; i++) {
|
|
|
|
migratetype = fallbacks[start_migratetype][i];
|
|
|
|
|
Bias the location of pages freed for min_free_kbytes in the same MAX_ORDER_NR_PAGES blocks
The standard buddy allocator always favours the smallest block of pages.
The effect of this is that the pages free to satisfy min_free_kbytes tends
to be preserved since boot time at the same location of memory ffor a very
long time and as a contiguous block. When an administrator sets the
reserve at 16384 at boot time, it tends to be the same MAX_ORDER blocks
that remain free. This allows the occasional high atomic allocation to
succeed up until the point the blocks are split. In practice, it is
difficult to split these blocks but when they do split, the benefit of
having min_free_kbytes for contiguous blocks disappears. Additionally,
increasing min_free_kbytes once the system has been running for some time
has no guarantee of creating contiguous blocks.
On the other hand, CONFIG_PAGE_GROUP_BY_MOBILITY favours splitting large
blocks when there are no free pages of the appropriate type available. A
side-effect of this is that all blocks in memory tends to be used up and
the contiguous free blocks from boot time are not preserved like in the
vanilla allocator. This can cause a problem if a new caller is unwilling
to reclaim or does not reclaim for long enough.
A failure scenario was found for a wireless network device allocating
order-1 atomic allocations but the allocations were not intense or frequent
enough for a whole block of pages to be preserved for MIGRATE_HIGHALLOC.
This was reproduced on a desktop by booting with mem=256mb, forcing the
driver to allocate at order-1, running a bittorrent client (downloading a
debian ISO) and building a kernel with -j2.
This patch addresses the problem on the desktop machine booted with
mem=256mb. It works by setting aside a reserve of MAX_ORDER_NR_PAGES
blocks, the number of which depends on the value of min_free_kbytes. These
blocks are only fallen back to when there is no other free pages. Then the
smallest possible page is used just like the normal buddy allocator instead
of the largest possible page to preserve contiguous pages The pages in free
lists in the reserve blocks are never taken for another migrate type. The
results is that even if min_free_kbytes is set to a low value, contiguous
blocks will be preserved in the MIGRATE_RESERVE blocks.
This works better than the vanilla allocator because if min_free_kbytes is
increased, a new reserve block will be chosen based on the location of
reclaimable pages and the block will free up as contiguous pages. In the
vanilla allocator, no effort is made to target a block of pages to free as
contiguous pages and min_free_kbytes pages are scattered randomly.
This effect has been observed on the test machine. min_free_kbytes was set
initially low but it was kept as a contiguous free block within
MIGRATE_RESERVE. min_free_kbytes was then set to a higher value and over a
period of time, the free blocks were within the reserve and coalescing.
How long it takes to free up depends on how quickly LRU is rotating.
Amusingly, this means that more activity will free the blocks faster.
This mechanism potentially replaces MIGRATE_HIGHALLOC as it may be more
effective than grouping contiguous free pages together. It all depends on
whether the number of active atomic high allocations exceeds
min_free_kbytes or not. If the number of active allocations exceeds
min_free_kbytes, it's worth it but maybe in that situation, min_free_kbytes
should be set higher. Once there are no more reports of allocation
failures, a patch will be submitted that backs out MIGRATE_HIGHALLOC and
see if the reports stay missing.
Credit to Mariusz Kozlowski for discovering the problem, describing the
failure scenario and testing patches and scenarios.
[akpm@linux-foundation.org: cleanups]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:25:58 +08:00
|
|
|
/* MIGRATE_RESERVE handled later if necessary */
|
|
|
|
if (migratetype == MIGRATE_RESERVE)
|
|
|
|
continue;
|
2007-10-16 16:25:53 +08:00
|
|
|
|
2007-10-16 16:25:48 +08:00
|
|
|
area = &(zone->free_area[current_order]);
|
|
|
|
if (list_empty(&area->free_list[migratetype]))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
page = list_entry(area->free_list[migratetype].next,
|
|
|
|
struct page, lru);
|
|
|
|
area->nr_free--;
|
|
|
|
|
|
|
|
/*
|
2007-10-16 16:25:51 +08:00
|
|
|
* If breaking a large block of pages, move all free
|
2007-10-16 16:25:55 +08:00
|
|
|
* pages to the preferred allocation list. If falling
|
|
|
|
* back for a reclaimable kernel allocation, be more
|
|
|
|
* agressive about taking ownership of free pages
|
2007-10-16 16:25:48 +08:00
|
|
|
*/
|
2007-10-16 16:26:01 +08:00
|
|
|
if (unlikely(current_order >= (pageblock_order >> 1)) ||
|
2007-10-16 16:25:55 +08:00
|
|
|
start_migratetype == MIGRATE_RECLAIMABLE) {
|
|
|
|
unsigned long pages;
|
|
|
|
pages = move_freepages_block(zone, page,
|
|
|
|
start_migratetype);
|
|
|
|
|
|
|
|
/* Claim the whole block if over half of it is free */
|
2007-10-16 16:26:01 +08:00
|
|
|
if (pages >= (1 << (pageblock_order-1)))
|
2007-10-16 16:25:55 +08:00
|
|
|
set_pageblock_migratetype(page,
|
|
|
|
start_migratetype);
|
|
|
|
|
2007-10-16 16:25:48 +08:00
|
|
|
migratetype = start_migratetype;
|
2007-10-16 16:25:51 +08:00
|
|
|
}
|
2007-10-16 16:25:48 +08:00
|
|
|
|
|
|
|
/* Remove the page from the freelists */
|
|
|
|
list_del(&page->lru);
|
|
|
|
rmv_page_order(page);
|
|
|
|
__mod_zone_page_state(zone, NR_FREE_PAGES,
|
|
|
|
-(1UL << order));
|
|
|
|
|
2007-10-16 16:26:01 +08:00
|
|
|
if (current_order == pageblock_order)
|
2007-10-16 16:25:48 +08:00
|
|
|
set_pageblock_migratetype(page,
|
|
|
|
start_migratetype);
|
|
|
|
|
|
|
|
expand(zone, page, order, current_order, area, migratetype);
|
|
|
|
return page;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Bias the location of pages freed for min_free_kbytes in the same MAX_ORDER_NR_PAGES blocks
The standard buddy allocator always favours the smallest block of pages.
The effect of this is that the pages free to satisfy min_free_kbytes tends
to be preserved since boot time at the same location of memory ffor a very
long time and as a contiguous block. When an administrator sets the
reserve at 16384 at boot time, it tends to be the same MAX_ORDER blocks
that remain free. This allows the occasional high atomic allocation to
succeed up until the point the blocks are split. In practice, it is
difficult to split these blocks but when they do split, the benefit of
having min_free_kbytes for contiguous blocks disappears. Additionally,
increasing min_free_kbytes once the system has been running for some time
has no guarantee of creating contiguous blocks.
On the other hand, CONFIG_PAGE_GROUP_BY_MOBILITY favours splitting large
blocks when there are no free pages of the appropriate type available. A
side-effect of this is that all blocks in memory tends to be used up and
the contiguous free blocks from boot time are not preserved like in the
vanilla allocator. This can cause a problem if a new caller is unwilling
to reclaim or does not reclaim for long enough.
A failure scenario was found for a wireless network device allocating
order-1 atomic allocations but the allocations were not intense or frequent
enough for a whole block of pages to be preserved for MIGRATE_HIGHALLOC.
This was reproduced on a desktop by booting with mem=256mb, forcing the
driver to allocate at order-1, running a bittorrent client (downloading a
debian ISO) and building a kernel with -j2.
This patch addresses the problem on the desktop machine booted with
mem=256mb. It works by setting aside a reserve of MAX_ORDER_NR_PAGES
blocks, the number of which depends on the value of min_free_kbytes. These
blocks are only fallen back to when there is no other free pages. Then the
smallest possible page is used just like the normal buddy allocator instead
of the largest possible page to preserve contiguous pages The pages in free
lists in the reserve blocks are never taken for another migrate type. The
results is that even if min_free_kbytes is set to a low value, contiguous
blocks will be preserved in the MIGRATE_RESERVE blocks.
This works better than the vanilla allocator because if min_free_kbytes is
increased, a new reserve block will be chosen based on the location of
reclaimable pages and the block will free up as contiguous pages. In the
vanilla allocator, no effort is made to target a block of pages to free as
contiguous pages and min_free_kbytes pages are scattered randomly.
This effect has been observed on the test machine. min_free_kbytes was set
initially low but it was kept as a contiguous free block within
MIGRATE_RESERVE. min_free_kbytes was then set to a higher value and over a
period of time, the free blocks were within the reserve and coalescing.
How long it takes to free up depends on how quickly LRU is rotating.
Amusingly, this means that more activity will free the blocks faster.
This mechanism potentially replaces MIGRATE_HIGHALLOC as it may be more
effective than grouping contiguous free pages together. It all depends on
whether the number of active atomic high allocations exceeds
min_free_kbytes or not. If the number of active allocations exceeds
min_free_kbytes, it's worth it but maybe in that situation, min_free_kbytes
should be set higher. Once there are no more reports of allocation
failures, a patch will be submitted that backs out MIGRATE_HIGHALLOC and
see if the reports stay missing.
Credit to Mariusz Kozlowski for discovering the problem, describing the
failure scenario and testing patches and scenarios.
[akpm@linux-foundation.org: cleanups]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:25:58 +08:00
|
|
|
/* Use MIGRATE_RESERVE rather than fail an allocation */
|
|
|
|
return __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
|
2007-10-16 16:25:48 +08:00
|
|
|
}
|
|
|
|
|
Bias the location of pages freed for min_free_kbytes in the same MAX_ORDER_NR_PAGES blocks
The standard buddy allocator always favours the smallest block of pages.
The effect of this is that the pages free to satisfy min_free_kbytes tends
to be preserved since boot time at the same location of memory for a very
long time and as a contiguous block. When an administrator sets the
reserve at 16384 at boot time, it tends to be the same MAX_ORDER blocks
that remain free. This allows the occasional high atomic allocation to
succeed up until the point the blocks are split. In practice, it is
difficult to split these blocks but when they do split, the benefit of
having min_free_kbytes for contiguous blocks disappears. Additionally,
increasing min_free_kbytes once the system has been running for some time
has no guarantee of creating contiguous blocks.
On the other hand, CONFIG_PAGE_GROUP_BY_MOBILITY favours splitting large
blocks when there are no free pages of the appropriate type available. A
side-effect of this is that all blocks in memory tend to be used up and
the contiguous free blocks from boot time are not preserved like in the
vanilla allocator. This can cause a problem if a new caller is unwilling
to reclaim or does not reclaim for long enough.
A failure scenario was found for a wireless network device allocating
order-1 atomic allocations but the allocations were not intense or frequent
enough for a whole block of pages to be preserved for MIGRATE_HIGHALLOC.
This was reproduced on a desktop by booting with mem=256mb, forcing the
driver to allocate at order-1, running a bittorrent client (downloading a
debian ISO) and building a kernel with -j2.
This patch addresses the problem on the desktop machine booted with
mem=256mb. It works by setting aside a reserve of MAX_ORDER_NR_PAGES
blocks, the number of which depends on the value of min_free_kbytes. These
blocks are only fallen back to when there are no other free pages. Then the
smallest possible page is used just like the normal buddy allocator instead
of the largest possible page to preserve contiguous pages. The pages in free
lists in the reserve blocks are never taken for another migrate type. The
result is that even if min_free_kbytes is set to a low value, contiguous
blocks will be preserved in the MIGRATE_RESERVE blocks.
This works better than the vanilla allocator because if min_free_kbytes is
increased, a new reserve block will be chosen based on the location of
reclaimable pages and the block will free up as contiguous pages. In the
vanilla allocator, no effort is made to target a block of pages to free as
contiguous pages and min_free_kbytes pages are scattered randomly.
This effect has been observed on the test machine. min_free_kbytes was set
initially low but it was kept as a contiguous free block within
MIGRATE_RESERVE. min_free_kbytes was then set to a higher value and over a
period of time, the free blocks were within the reserve and coalescing.
How long it takes to free up depends on how quickly LRU is rotating.
Amusingly, this means that more activity will free the blocks faster.
This mechanism potentially replaces MIGRATE_HIGHALLOC as it may be more
effective than grouping contiguous free pages together. It all depends on
whether the number of active atomic high allocations exceeds
min_free_kbytes or not. If the number of active allocations exceeds
min_free_kbytes, it's worth it but maybe in that situation, min_free_kbytes
should be set higher. Once there are no more reports of allocation
failures, a patch will be submitted that backs out MIGRATE_HIGHALLOC and
see if the reports stay missing.
Credit to Mariusz Kozlowski for discovering the problem, describing the
failure scenario and testing patches and scenarios.
[akpm@linux-foundation.org: cleanups]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:25:58 +08:00
|
|
|
/*
 * Do the hard work of removing an element from the buddy allocator.
 * The free lists of @migratetype are tried first through
 * __rmqueue_smallest(); only if that yields nothing is
 * __rmqueue_fallback() consulted.
 * Call me with the zone->lock already held.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order,
						int migratetype)
{
	struct page *page = __rmqueue_smallest(zone, order, migratetype);

	if (unlikely(!page))
		return __rmqueue_fallback(zone, order, migratetype);

	return page;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Obtain a specified number of elements from the buddy allocator, all under
|
|
|
|
* a single hold of the lock, for efficiency. Add them to the supplied list.
|
|
|
|
* Returns the number of new pages which were placed at *list.
|
|
|
|
*/
|
|
|
|
static int rmqueue_bulk(struct zone *zone, unsigned int order,
|
2007-10-16 16:25:48 +08:00
|
|
|
unsigned long count, struct list_head *list,
|
|
|
|
int migratetype)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2006-01-06 16:10:56 +08:00
|
|
|
spin_lock(&zone->lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
for (i = 0; i < count; ++i) {
|
2007-10-16 16:25:48 +08:00
|
|
|
struct page *page = __rmqueue(zone, order, migratetype);
|
2006-01-06 16:11:01 +08:00
|
|
|
if (unlikely(page == NULL))
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
2007-12-18 08:20:05 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Split buddy pages returned by expand() are received here
|
|
|
|
* in physical page order. The page is added to the callers and
|
|
|
|
* list and the list head then moves forward. From the callers
|
|
|
|
* perspective, the linked list is ordered by page number in
|
|
|
|
* some conditions. This is useful for IO devices that can
|
|
|
|
* merge IO requests if the physical pages are ordered
|
|
|
|
* properly.
|
|
|
|
*/
|
2007-10-16 16:25:49 +08:00
|
|
|
list_add(&page->lru, list);
|
|
|
|
set_page_private(page, migratetype);
|
2007-12-18 08:20:05 +08:00
|
|
|
list = &page->lru;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2006-01-06 16:10:56 +08:00
|
|
|
spin_unlock(&zone->lock);
|
2006-01-06 16:11:01 +08:00
|
|
|
return i;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2005-06-22 08:14:57 +08:00
|
|
|
#ifdef CONFIG_NUMA
|
2006-03-10 09:33:54 +08:00
|
|
|
/*
|
2007-05-09 17:35:14 +08:00
|
|
|
* Called from the vmstat counter updater to drain pagesets of this
|
|
|
|
* currently executing processor on remote nodes after they have
|
|
|
|
* expired.
|
|
|
|
*
|
2006-03-22 16:09:08 +08:00
|
|
|
* Note that this function must be called with the thread pinned to
|
|
|
|
* a single processor.
|
2006-03-10 09:33:54 +08:00
|
|
|
*/
|
2007-05-09 17:35:14 +08:00
|
|
|
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
|
2005-06-22 08:14:57 +08:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
2007-05-09 17:35:14 +08:00
|
|
|
int to_drain;
|
2005-06-22 08:14:57 +08:00
|
|
|
|
2007-05-09 17:35:14 +08:00
|
|
|
local_irq_save(flags);
|
|
|
|
if (pcp->count >= pcp->batch)
|
|
|
|
to_drain = pcp->batch;
|
|
|
|
else
|
|
|
|
to_drain = pcp->count;
|
|
|
|
free_pages_bulk(zone, to_drain, &pcp->list, 0);
|
|
|
|
pcp->count -= to_drain;
|
|
|
|
local_irq_restore(flags);
|
2005-06-22 08:14:57 +08:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2008-02-05 14:29:11 +08:00
|
|
|
/*
|
|
|
|
* Drain pages of the indicated processor.
|
|
|
|
*
|
|
|
|
* The processor must either be the current processor and the
|
|
|
|
* thread pinned to the current processor or a processor that
|
|
|
|
* is not online.
|
|
|
|
*/
|
|
|
|
static void drain_pages(unsigned int cpu)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2006-01-06 16:10:56 +08:00
|
|
|
unsigned long flags;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct zone *zone;
|
|
|
|
|
|
|
|
for_each_zone(zone) {
|
|
|
|
struct per_cpu_pageset *pset;
|
2008-02-05 14:29:19 +08:00
|
|
|
struct per_cpu_pages *pcp;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-01-06 08:37:02 +08:00
|
|
|
if (!populated_zone(zone))
|
|
|
|
continue;
|
|
|
|
|
2005-06-22 08:14:47 +08:00
|
|
|
pset = zone_pcp(zone, cpu);
|
2008-02-05 14:29:19 +08:00
|
|
|
|
|
|
|
pcp = &pset->pcp;
|
|
|
|
local_irq_save(flags);
|
|
|
|
free_pages_bulk(zone, pcp->count, &pcp->list, 0);
|
|
|
|
pcp->count = 0;
|
|
|
|
local_irq_restore(flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-02-05 14:29:11 +08:00
|
|
|
/*
 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
 * @arg is not used; the signature matches what on_each_cpu() is given in
 * drain_all_pages().
 */
void drain_local_pages(void *arg)
{
	unsigned int this_cpu = smp_processor_id();

	drain_pages(this_cpu);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Spill all the per-cpu pages from all CPUs back into the buddy allocator
|
|
|
|
*/
|
|
|
|
void drain_all_pages(void)
|
|
|
|
{
|
|
|
|
on_each_cpu(drain_local_pages, NULL, 0, 1);
|
|
|
|
}
|
|
|
|
|
2007-07-30 05:27:18 +08:00
|
|
|
#ifdef CONFIG_HIBERNATION

/*
 * Refresh the swsusp "free" marking of every page in @zone.
 *
 * Runs in two passes while holding zone->lock with interrupts disabled:
 *  1. every valid pfn spanned by the zone whose page is not reported by
 *     swsusp_page_is_forbidden() has its free mark cleared;
 *  2. every page belonging to a block on the zone's buddy free lists
 *     (all orders, all migrate types) is marked free.
 *
 * Zones that span no pages are left untouched.
 */
void mark_free_pages(struct zone *zone)
{
	unsigned long first_pfn, end_pfn, pfn;
	unsigned long flags;
	struct page *page;
	int order, t;

	if (!zone->spanned_pages)
		return;

	spin_lock_irqsave(&zone->lock, flags);

	/* Pass 1: drop any stale free marks. */
	first_pfn = zone->zone_start_pfn;
	end_pfn = first_pfn + zone->spanned_pages;
	for (pfn = first_pfn; pfn < end_pfn; pfn++) {
		if (!pfn_valid(pfn))
			continue;

		page = pfn_to_page(pfn);
		if (!swsusp_page_is_forbidden(page))
			swsusp_unset_page_free(page);
	}

	/* Pass 2: mark all 2^order pages of each free block. */
	for_each_migratetype_order(order, t) {
		list_for_each_entry(page,
				    &zone->free_area[order].free_list[t], lru) {
			unsigned long i;

			pfn = page_to_pfn(page);
			for (i = 0; i < (1UL << order); i++)
				swsusp_set_page_free(pfn_to_page(pfn + i));
		}
	}

	spin_unlock_irqrestore(&zone->lock, flags);
}

#endif /* CONFIG_HIBERNATION */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Free a 0-order page
|
|
|
|
*/
|
2008-02-05 14:29:26 +08:00
|
|
|
static void free_hot_cold_page(struct page *page, int cold)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct zone *zone = page_zone(page);
|
|
|
|
struct per_cpu_pages *pcp;
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
if (PageAnon(page))
|
|
|
|
page->mapping = NULL;
|
2006-01-06 16:11:11 +08:00
|
|
|
if (free_pages_check(page))
|
2005-11-22 13:32:20 +08:00
|
|
|
return;
|
|
|
|
|
2008-04-30 15:55:01 +08:00
|
|
|
if (!PageHighMem(page)) {
|
2006-10-11 16:21:30 +08:00
|
|
|
debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
|
2008-04-30 15:55:01 +08:00
|
|
|
debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
|
|
|
|
}
|
2006-10-11 16:21:30 +08:00
|
|
|
arch_free_page(page, 0);
|
2005-11-22 13:32:20 +08:00
|
|
|
kernel_map_pages(page, 1, 0);
|
|
|
|
|
2008-02-05 14:29:19 +08:00
|
|
|
pcp = &zone_pcp(zone, get_cpu())->pcp;
|
2005-04-17 06:20:36 +08:00
|
|
|
local_irq_save(flags);
|
2006-06-30 16:55:45 +08:00
|
|
|
__count_vm_event(PGFREE);
|
2008-02-05 14:29:19 +08:00
|
|
|
if (cold)
|
|
|
|
list_add_tail(&page->lru, &pcp->list);
|
|
|
|
else
|
|
|
|
list_add(&page->lru, &pcp->list);
|
2007-10-16 16:25:49 +08:00
|
|
|
set_page_private(page, get_pageblock_migratetype(page));
|
2005-04-17 06:20:36 +08:00
|
|
|
pcp->count++;
|
2006-01-08 17:00:42 +08:00
|
|
|
if (pcp->count >= pcp->high) {
|
|
|
|
free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
|
|
|
|
pcp->count -= pcp->batch;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
local_irq_restore(flags);
|
|
|
|
put_cpu();
|
|
|
|
}
|
|
|
|
|
2008-02-05 14:29:26 +08:00
|
|
|
/* Free a 0-order page, queueing it at the head of the per-cpu list. */
void free_hot_page(struct page *page)
{
	free_hot_cold_page(page, 0);
}
|
|
|
|
|
2008-02-05 14:29:26 +08:00
|
|
|
/* Free a 0-order page, queueing it at the tail of the per-cpu list. */
void free_cold_page(struct page *page)
{
	free_hot_cold_page(page, 1);
}
|
|
|
|
|
2006-03-22 16:08:05 +08:00
|
|
|
/*
|
|
|
|
* split_page takes a non-compound higher-order page, and splits it into
|
|
|
|
* n (1<<order) sub-pages: page[0..n]
|
|
|
|
* Each sub-page must be freed individually.
|
|
|
|
*
|
|
|
|
* Note: this is probably too low level an operation for use in drivers.
|
|
|
|
* Please consult with lkml before using this in your driver.
|
|
|
|
*/
|
|
|
|
void split_page(struct page *page, unsigned int order)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2006-09-26 14:30:55 +08:00
|
|
|
VM_BUG_ON(PageCompound(page));
|
|
|
|
VM_BUG_ON(!page_count(page));
|
2006-03-22 16:08:40 +08:00
|
|
|
for (i = 1; i < (1 << order); i++)
|
|
|
|
set_page_refcounted(page + i);
|
2006-03-22 16:08:05 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Really, prep_compound_page() should be called from __rmqueue_bulk(). But
|
|
|
|
* we cheat by calling it from here, in the order > 0 path. Saves a branch
|
|
|
|
* or two.
|
|
|
|
*/
|
2008-04-28 17:12:14 +08:00
|
|
|
static struct page *buffered_rmqueue(struct zone *preferred_zone,
|
2006-01-06 16:11:20 +08:00
|
|
|
struct zone *zone, int order, gfp_t gfp_flags)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
2005-11-22 13:32:20 +08:00
|
|
|
struct page *page;
|
2005-04-17 06:20:36 +08:00
|
|
|
int cold = !!(gfp_flags & __GFP_COLD);
|
2006-01-06 16:11:20 +08:00
|
|
|
int cpu;
|
2007-10-16 16:25:59 +08:00
|
|
|
int migratetype = allocflags_to_migratetype(gfp_flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-11-22 13:32:20 +08:00
|
|
|
again:
|
2006-01-06 16:11:20 +08:00
|
|
|
cpu = get_cpu();
|
2006-01-08 17:00:42 +08:00
|
|
|
if (likely(order == 0)) {
|
2005-04-17 06:20:36 +08:00
|
|
|
struct per_cpu_pages *pcp;
|
|
|
|
|
2008-02-05 14:29:19 +08:00
|
|
|
pcp = &zone_pcp(zone, cpu)->pcp;
|
2005-04-17 06:20:36 +08:00
|
|
|
local_irq_save(flags);
|
2006-01-06 16:11:20 +08:00
|
|
|
if (!pcp->count) {
|
2006-11-03 14:07:04 +08:00
|
|
|
pcp->count = rmqueue_bulk(zone, 0,
|
2007-10-16 16:25:48 +08:00
|
|
|
pcp->batch, &pcp->list, migratetype);
|
2006-01-06 16:11:20 +08:00
|
|
|
if (unlikely(!pcp->count))
|
|
|
|
goto failed;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2007-10-16 16:25:50 +08:00
|
|
|
|
2007-10-16 16:25:49 +08:00
|
|
|
/* Find a page of the appropriate migrate type */
|
2008-02-05 14:29:19 +08:00
|
|
|
if (cold) {
|
|
|
|
list_for_each_entry_reverse(page, &pcp->list, lru)
|
|
|
|
if (page_private(page) == migratetype)
|
|
|
|
break;
|
|
|
|
} else {
|
|
|
|
list_for_each_entry(page, &pcp->list, lru)
|
|
|
|
if (page_private(page) == migratetype)
|
|
|
|
break;
|
|
|
|
}
|
2007-10-16 16:25:49 +08:00
|
|
|
|
2007-10-16 16:25:50 +08:00
|
|
|
/* Allocate more to the pcp list if necessary */
|
|
|
|
if (unlikely(&page->lru == &pcp->list)) {
|
2007-10-16 16:25:49 +08:00
|
|
|
pcp->count += rmqueue_bulk(zone, 0,
|
|
|
|
pcp->batch, &pcp->list, migratetype);
|
|
|
|
page = list_entry(pcp->list.next, struct page, lru);
|
|
|
|
}
|
2007-10-16 16:25:50 +08:00
|
|
|
|
|
|
|
list_del(&page->lru);
|
|
|
|
pcp->count--;
|
2005-11-14 08:06:43 +08:00
|
|
|
} else {
|
2005-04-17 06:20:36 +08:00
|
|
|
spin_lock_irqsave(&zone->lock, flags);
|
2007-10-16 16:25:48 +08:00
|
|
|
page = __rmqueue(zone, order, migratetype);
|
2006-01-06 16:11:20 +08:00
|
|
|
spin_unlock(&zone->lock);
|
|
|
|
if (!page)
|
|
|
|
goto failed;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2006-06-30 16:55:45 +08:00
|
|
|
__count_zone_vm_events(PGALLOC, zone, 1 << order);
|
2008-04-28 17:12:14 +08:00
|
|
|
zone_statistics(preferred_zone, zone);
|
2006-01-06 16:11:20 +08:00
|
|
|
local_irq_restore(flags);
|
|
|
|
put_cpu();
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-09-26 14:30:55 +08:00
|
|
|
VM_BUG_ON(bad_range(zone, page));
|
2006-03-22 16:08:41 +08:00
|
|
|
if (prep_new_page(page, order, gfp_flags))
|
2006-01-06 16:11:20 +08:00
|
|
|
goto again;
|
2005-04-17 06:20:36 +08:00
|
|
|
return page;
|
2006-01-06 16:11:20 +08:00
|
|
|
|
|
|
|
failed:
|
|
|
|
local_irq_restore(flags);
|
|
|
|
put_cpu();
|
|
|
|
return NULL;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2005-11-14 08:06:43 +08:00
|
|
|
/*
 * Allocation flags.
 *
 * Background on the watermark logic:
 *
 *	--- pages_high
 *	 |
 *	 |   (a)
 *	 |
 *	--- pages_low
 *	 |
 *	 |   (b)
 *	 |
 *	--- pages_min
 *	 |
 *	 |   (c)
 *	 |
 *	--- 0
 *
 * When pages_low is reached, asynchronous reclaim is kicked.  That leaves
 * an interval of "b" before synchronous reclaim must start, and gives
 * kswapd an interval of "a" before it need go back to sleep.
 *
 * When pages_min is reached, normal allocators must enter synchronous
 * reclaim, but PF_MEMALLOC, ALLOC_HARDER and ALLOC_HIGH (i.e. atomic
 * allocations, recursive allocations, etc.) get access to varying
 * amounts of the reserve "c".
 */
#define ALLOC_NO_WATERMARKS	0x01 /* don't check watermarks at all */
#define ALLOC_WMARK_MIN		0x02 /* use pages_min watermark */
#define ALLOC_WMARK_LOW		0x04 /* use pages_low watermark */
#define ALLOC_WMARK_HIGH	0x08 /* use pages_high watermark */
#define ALLOC_HARDER		0x10 /* try to alloc harder */
#define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET		0x40 /* check for correct cpuset */
|
2005-11-14 08:06:43 +08:00
|
|
|
|
2006-12-08 18:39:45 +08:00
|
|
|
#ifdef CONFIG_FAIL_PAGE_ALLOC
|
|
|
|
|
|
|
|
static struct fail_page_alloc_attr {
|
|
|
|
struct fault_attr attr;
|
|
|
|
|
|
|
|
u32 ignore_gfp_highmem;
|
|
|
|
u32 ignore_gfp_wait;
|
2007-07-16 14:40:23 +08:00
|
|
|
u32 min_order;
|
2006-12-08 18:39:45 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
|
|
|
|
|
|
|
|
struct dentry *ignore_gfp_highmem_file;
|
|
|
|
struct dentry *ignore_gfp_wait_file;
|
2007-07-16 14:40:23 +08:00
|
|
|
struct dentry *min_order_file;
|
2006-12-08 18:39:45 +08:00
|
|
|
|
|
|
|
#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
|
|
|
|
|
|
|
|
} fail_page_alloc = {
|
|
|
|
.attr = FAULT_ATTR_INITIALIZER,
|
2006-12-08 18:39:53 +08:00
|
|
|
.ignore_gfp_wait = 1,
|
|
|
|
.ignore_gfp_highmem = 1,
|
2007-07-16 14:40:23 +08:00
|
|
|
.min_order = 1,
|
2006-12-08 18:39:45 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
static int __init setup_fail_page_alloc(char *str)
|
|
|
|
{
|
|
|
|
return setup_fault_attr(&fail_page_alloc.attr, str);
|
|
|
|
}
|
|
|
|
__setup("fail_page_alloc=", setup_fail_page_alloc);
|
|
|
|
|
|
|
|
/*
 * Decide whether fault injection should fail this page allocation.
 *
 * Returns 0 (never fail) when:
 *  - @order is below fail_page_alloc.min_order,
 *  - @gfp_mask contains __GFP_NOFAIL,
 *  - @gfp_mask contains __GFP_HIGHMEM and ignore_gfp_highmem is set, or
 *  - @gfp_mask contains __GFP_WAIT and ignore_gfp_wait is set.
 * Otherwise the decision is left to should_fail(), which is given the
 * number of pages requested (1 << order).
 */
static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
	if (order < fail_page_alloc.min_order || (gfp_mask & __GFP_NOFAIL))
		return 0;

	if ((gfp_mask & __GFP_HIGHMEM) && fail_page_alloc.ignore_gfp_highmem)
		return 0;

	if ((gfp_mask & __GFP_WAIT) && fail_page_alloc.ignore_gfp_wait)
		return 0;

	return should_fail(&fail_page_alloc.attr, 1 << order);
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

/*
 * Create the debugfs files for page allocation fault injection:
 * "ignore-gfp-wait", "ignore-gfp-highmem" and "min-order", placed in the
 * directory set up by init_fault_attr_dentries().
 *
 * Returns 0 on success, the error from init_fault_attr_dentries() if
 * that fails, or -ENOMEM if any file could not be created; in the last
 * case every file and the fault attribute dentries are removed again.
 */
static int __init fail_page_alloc_debugfs(void)
{
	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
	struct dentry *dir;
	int err;

	err = init_fault_attr_dentries(&fail_page_alloc.attr,
				       "fail_page_alloc");
	if (err)
		return err;
	dir = fail_page_alloc.attr.dentries.dir;

	fail_page_alloc.ignore_gfp_wait_file =
		debugfs_create_bool("ignore-gfp-wait", mode, dir,
				    &fail_page_alloc.ignore_gfp_wait);

	fail_page_alloc.ignore_gfp_highmem_file =
		debugfs_create_bool("ignore-gfp-highmem", mode, dir,
				    &fail_page_alloc.ignore_gfp_highmem);

	fail_page_alloc.min_order_file =
		debugfs_create_u32("min-order", mode, dir,
				   &fail_page_alloc.min_order);

	/* All three files exist: done */
	if (fail_page_alloc.ignore_gfp_wait_file &&
	    fail_page_alloc.ignore_gfp_highmem_file &&
	    fail_page_alloc.min_order_file)
		return 0;

	/* At least one creation failed: undo everything */
	debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
	debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
	debugfs_remove(fail_page_alloc.min_order_file);
	cleanup_fault_attr_dentries(&fail_page_alloc.attr);

	return -ENOMEM;
}

late_initcall(fail_page_alloc_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
|
|
|
|
|
|
|
|
#else /* CONFIG_FAIL_PAGE_ALLOC */
|
|
|
|
|
|
|
|
/* Without CONFIG_FAIL_PAGE_ALLOC no allocation is ever failed on purpose. */
static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
	return 0;
}
|
|
|
|
|
|
|
|
#endif /* CONFIG_FAIL_PAGE_ALLOC */
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Return 1 if free pages are above 'mark'. This takes into account the order
|
|
|
|
* of the allocation.
|
|
|
|
*/
|
|
|
|
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
|
2005-11-14 08:06:43 +08:00
|
|
|
int classzone_idx, int alloc_flags)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
/* free_pages my go negative - that's OK */
|
2007-02-10 17:43:02 +08:00
|
|
|
long min = mark;
|
|
|
|
long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
|
2005-04-17 06:20:36 +08:00
|
|
|
int o;
|
|
|
|
|
2005-11-14 08:06:43 +08:00
|
|
|
if (alloc_flags & ALLOC_HIGH)
|
2005-04-17 06:20:36 +08:00
|
|
|
min -= min / 2;
|
2005-11-14 08:06:43 +08:00
|
|
|
if (alloc_flags & ALLOC_HARDER)
|
2005-04-17 06:20:36 +08:00
|
|
|
min -= min / 4;
|
|
|
|
|
|
|
|
if (free_pages <= min + z->lowmem_reserve[classzone_idx])
|
|
|
|
return 0;
|
|
|
|
for (o = 0; o < order; o++) {
|
|
|
|
/* At the next order, this order's pages become unavailable */
|
|
|
|
free_pages -= z->free_area[o].nr_free << o;
|
|
|
|
|
|
|
|
/* Require fewer higher order pages to be free */
|
|
|
|
min >>= 1;
|
|
|
|
|
|
|
|
if (free_pages <= min)
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cacheline's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten per-cent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
/*
|
|
|
|
* zlc_setup - Setup for "zonelist cache". Uses cached zone data to
|
|
|
|
* skip over zones that are not allowed by the cpuset, or that have
|
|
|
|
* been recently (in last second) found to be nearly full. See further
|
|
|
|
* comments in mmzone.h. Reduces cache footprint of zonelist scans
|
2007-10-20 07:27:18 +08:00
|
|
|
* that have to skip over a lot of full or unallowed zones.
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cacheline's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten per-cent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
*
|
|
|
|
* If the zonelist cache is present in the passed in zonelist, then
|
|
|
|
* returns a pointer to the allowed node mask (either the current
|
2007-10-16 16:25:39 +08:00
|
|
|
* tasks mems_allowed, or node_states[N_HIGH_MEMORY].)
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cache line's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten per-cent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
*
|
|
|
|
* If the zonelist cache is not available for this zonelist, does
|
|
|
|
* nothing and returns NULL.
|
|
|
|
*
|
|
|
|
* If the fullzones BITMAP in the zonelist cache is stale (more than
|
|
|
|
* a second since last zap'd) then we zap it out (clear its bits.)
|
|
|
|
*
|
|
|
|
* We hold off even calling zlc_setup, until after we've checked the
|
|
|
|
* first zone in the zonelist, on the theory that most allocations will
|
|
|
|
* be satisfied from that first zone, so best to examine that zone as
|
|
|
|
* quickly as we can.
|
|
|
|
*/
|
|
|
|
static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
|
|
|
|
{
|
|
|
|
struct zonelist_cache *zlc; /* cached zonelist speedup info */
|
|
|
|
nodemask_t *allowednodes; /* zonelist_cache approximation */
|
|
|
|
|
|
|
|
zlc = zonelist->zlcache_ptr;
|
|
|
|
if (!zlc)
|
|
|
|
return NULL;
|
|
|
|
|
2008-04-28 17:12:38 +08:00
|
|
|
if (time_after(jiffies, zlc->last_full_zap + HZ)) {
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cache line's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten per-cent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
|
|
|
|
zlc->last_full_zap = jiffies;
|
|
|
|
}
|
|
|
|
|
|
|
|
allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
|
|
|
|
&cpuset_current_mems_allowed :
|
2007-10-16 16:25:39 +08:00
|
|
|
&node_states[N_HIGH_MEMORY];
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cache line's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten per-cent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
return allowednodes;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Given 'z' scanning a zonelist, run a couple of quick checks to see
|
|
|
|
* if it is worth looking at further for free memory:
|
|
|
|
* 1) Check that the zone isn't thought to be full (doesn't have its
|
|
|
|
* bit set in the zonelist_cache fullzones BITMAP).
|
|
|
|
* 2) Check that the zone's node (obtained from the zonelist_cache
|
|
|
|
* z_to_n[] mapping) is allowed in the passed in allowednodes mask.
|
|
|
|
* Return true (non-zero) if zone is worth looking at further, or
|
|
|
|
* else return false (zero) if it is not.
|
|
|
|
*
|
|
|
|
* This check -ignores- the distinction between various watermarks,
|
|
|
|
* such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
|
|
|
|
* found to be full for any variation of these watermarks, it will
|
|
|
|
* be considered full for up to one second by all requests, unless
|
|
|
|
* we are so low on memory on all allowed nodes that we are forced
|
|
|
|
* into the second scan of the zonelist.
|
|
|
|
*
|
|
|
|
* In the second scan we ignore this zonelist cache and exactly
|
|
|
|
* apply the watermarks to all zones, even if it is slower to do so.
|
|
|
|
* We are low on memory in the second scan, and should leave no stone
|
|
|
|
* unturned looking for a free page.
|
|
|
|
*/
|
2008-04-28 17:12:17 +08:00
|
|
|
static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cacheline's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten per-cent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
nodemask_t *allowednodes)
|
|
|
|
{
|
|
|
|
struct zonelist_cache *zlc; /* cached zonelist speedup info */
|
|
|
|
int i; /* index of *z in zonelist zones */
|
|
|
|
int n; /* node that zone *z is on */
|
|
|
|
|
|
|
|
zlc = zonelist->zlcache_ptr;
|
|
|
|
if (!zlc)
|
|
|
|
return 1;
|
|
|
|
|
2008-04-28 17:12:17 +08:00
|
|
|
i = z - zonelist->_zonerefs;
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cacheline's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten per-cent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
n = zlc->z_to_n[i];
|
|
|
|
|
|
|
|
/* This zone is worth trying if it is allowed but not full */
|
|
|
|
return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Given 'z' scanning a zonelist, set the corresponding bit in
|
|
|
|
* zlc->fullzones, so that subsequent attempts to allocate a page
|
|
|
|
* from that zone don't waste time re-examining it.
|
|
|
|
*/
|
2008-04-28 17:12:17 +08:00
|
|
|
static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cacheline's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten per-cent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
{
|
|
|
|
struct zonelist_cache *zlc; /* cached zonelist speedup info */
|
|
|
|
int i; /* index of *z in zonelist zones */
|
|
|
|
|
|
|
|
zlc = zonelist->zlcache_ptr;
|
|
|
|
if (!zlc)
|
|
|
|
return;
|
|
|
|
|
2008-04-28 17:12:17 +08:00
|
|
|
i = z - zonelist->_zonerefs;
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cacheline's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten percent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
|
|
|
|
set_bit(i, zlc->fullzones);
|
|
|
|
}
|
|
|
|
|
|
|
|
#else /* CONFIG_NUMA */
|
|
|
|
|
|
|
|
/*
 * !CONFIG_NUMA variant of zlc_setup(): this stub ignores both of its
 * arguments and always returns NULL.
 */
static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2008-04-28 17:12:17 +08:00
|
|
|
static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cacheline's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten percent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
nodemask_t *allowednodes)
|
|
|
|
{
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2008-04-28 17:12:17 +08:00
|
|
|
static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cacheline's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten percent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_NUMA */
|
|
|
|
|
2005-11-14 08:06:43 +08:00
|
|
|
/*
|
2006-12-07 12:31:38 +08:00
|
|
|
* get_page_from_freelist goes through the zonelist trying to allocate
|
2005-11-14 08:06:43 +08:00
|
|
|
* a page.
|
|
|
|
*/
|
|
|
|
static struct page *
|
2008-04-28 17:12:18 +08:00
|
|
|
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
|
2008-04-28 17:12:16 +08:00
|
|
|
struct zonelist *zonelist, int high_zoneidx, int alloc_flags)
|
2005-06-22 08:14:41 +08:00
|
|
|
{
|
2008-04-28 17:12:17 +08:00
|
|
|
struct zoneref *z;
|
2005-11-14 08:06:43 +08:00
|
|
|
struct page *page = NULL;
|
2008-04-28 17:12:16 +08:00
|
|
|
int classzone_idx;
|
2008-04-28 17:12:14 +08:00
|
|
|
struct zone *zone, *preferred_zone;
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cacheline's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten percent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
|
|
|
|
int zlc_active = 0; /* set if using zonelist_cache */
|
|
|
|
int did_zlc_setup = 0; /* just call zlc_setup() one time */
|
2008-04-28 17:12:16 +08:00
|
|
|
|
2008-04-28 17:12:18 +08:00
|
|
|
(void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
|
|
|
|
&preferred_zone);
|
2008-05-24 04:04:50 +08:00
|
|
|
if (!preferred_zone)
|
|
|
|
return NULL;
|
|
|
|
|
2008-04-28 17:12:18 +08:00
|
|
|
classzone_idx = zone_idx(preferred_zone);
|
2005-11-14 08:06:43 +08:00
|
|
|
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cacheline's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten percent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
zonelist_scan:
|
2005-11-14 08:06:43 +08:00
|
|
|
/*
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cacheline's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten percent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
* Scan zonelist, looking for a zone with enough free.
|
2005-11-14 08:06:43 +08:00
|
|
|
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
|
|
|
|
*/
|
2008-04-28 17:12:18 +08:00
|
|
|
for_each_zone_zonelist_nodemask(zone, z, zonelist,
|
|
|
|
high_zoneidx, nodemask) {
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cache line's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten per-cent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
if (NUMA_BUILD && zlc_active &&
|
|
|
|
!zlc_zone_worth_trying(zonelist, z, allowednodes))
|
|
|
|
continue;
|
2005-11-14 08:06:43 +08:00
|
|
|
if ((alloc_flags & ALLOC_CPUSET) &&
|
[PATCH] cpuset: rework cpuset_zone_allowed api
Elaborate the API for calling cpuset_zone_allowed(), so that users have to
explicitly choose between the two variants:
cpuset_zone_allowed_hardwall()
cpuset_zone_allowed_softwall()
Until now, whether or not you got the hardwall flavor depended solely on
whether or not you or'd in the __GFP_HARDWALL gfp flag to the gfp_mask
argument.
If you didn't specify __GFP_HARDWALL, you implicitly got the softwall
version.
Unfortunately, this meant that users would end up with the softwall version
without thinking about it. Since only the softwall version might sleep,
this led to bugs with possible sleeping in interrupt context on more than
one occasion.
The hardwall version requires that the current task's mems_allowed allows
the node of the specified zone (or that you're in interrupt or that
__GFP_THISNODE is set or that you're on a one cpuset system.)
The softwall version, depending on the gfp_mask, might allow a node if it
was allowed in the nearest enclosing cpuset marked mem_exclusive (which
requires taking the cpuset lock 'callback_mutex' to evaluate.)
This patch removes the cpuset_zone_allowed() call, and forces the caller to
explicitly choose between the hardwall and the softwall case.
If the caller wants the gfp_mask to determine this choice, they should (1)
be sure they can sleep or that __GFP_HARDWALL is set, and (2) invoke the
cpuset_zone_allowed_softwall() routine.
This adds another 100 or 200 bytes to the kernel text space, due to the few
lines of nearly duplicate code at the top of both cpuset_zone_allowed_*
routines. It should save a few instructions executed for the calls that
turned into calls of cpuset_zone_allowed_hardwall, thanks to not having to
set (before the call) then check (within the call) the __GFP_HARDWALL flag.
For the most critical call, from get_page_from_freelist(), the same
instructions are executed as before -- the old cpuset_zone_allowed()
routine it used to call is the same code as the
cpuset_zone_allowed_softwall() routine that it calls now.
Not a perfect win, but seems worth it, to reduce this chance of hitting a
sleeping with irq off complaint again.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-13 16:34:25 +08:00
|
|
|
!cpuset_zone_allowed_softwall(zone, gfp_mask))
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cache line's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten per-cent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
goto try_next_zone;
|
2005-11-14 08:06:43 +08:00
|
|
|
|
|
|
|
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
|
[PATCH] mm: __alloc_pages cleanup fix
I believe this patch is required to fix breakage in the asynch reclaim
watermark logic introduced by this patch:
http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=7fb1d9fca5c6e3b06773b69165a73f3fb786b8ee
Just some background of the watermark logic in case it isn't clear...
Basically what we have is this:
--- pages_high
|
| (a)
|
--- pages_low
|
| (b)
|
--- pages_min
|
| (c)
|
--- 0
Now when pages_low is reached, we want to kick asynch reclaim, which gives us
an interval of "b" before we must start synch reclaim, and gives kswapd an
interval of "a" before it need go back to sleep.
When pages_min is reached, normal allocators must enter synch reclaim, but
PF_MEMALLOC, ALLOC_HARDER, and ALLOC_HIGH (ie. atomic allocations, recursive
allocations, etc.) get access to varying amounts of the reserve "c".
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: "Seth, Rohit" <rohit.seth@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-11-29 05:44:03 +08:00
|
|
|
unsigned long mark;
|
|
|
|
if (alloc_flags & ALLOC_WMARK_MIN)
|
2006-09-26 14:31:45 +08:00
|
|
|
mark = zone->pages_min;
|
[PATCH] mm: __alloc_pages cleanup fix
I believe this patch is required to fix breakage in the asynch reclaim
watermark logic introduced by this patch:
http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=7fb1d9fca5c6e3b06773b69165a73f3fb786b8ee
Just some background of the watermark logic in case it isn't clear...
Basically what we have is this:
--- pages_high
|
| (a)
|
--- pages_low
|
| (b)
|
--- pages_min
|
| (c)
|
--- 0
Now when pages_low is reached, we want to kick asynch reclaim, which gives us
an interval of "b" before we must start synch reclaim, and gives kswapd an
interval of "a" before it need go back to sleep.
When pages_min is reached, normal allocators must enter synch reclaim, but
PF_MEMALLOC, ALLOC_HARDER, and ALLOC_HIGH (ie. atomic allocations, recursive
allocations, etc.) get access to varying amounts of the reserve "c".
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: "Seth, Rohit" <rohit.seth@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-11-29 05:44:03 +08:00
|
|
|
else if (alloc_flags & ALLOC_WMARK_LOW)
|
2006-09-26 14:31:45 +08:00
|
|
|
mark = zone->pages_low;
|
[PATCH] mm: __alloc_pages cleanup fix
I believe this patch is required to fix breakage in the asynch reclaim
watermark logic introduced by this patch:
http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=7fb1d9fca5c6e3b06773b69165a73f3fb786b8ee
Just some background of the watermark logic in case it isn't clear...
Basically what we have is this:
--- pages_high
|
| (a)
|
--- pages_low
|
| (b)
|
--- pages_min
|
| (c)
|
--- 0
Now when pages_low is reached, we want to kick asynch reclaim, which gives us
an interval of "b" before we must start synch reclaim, and gives kswapd an
interval of "a" before it need go back to sleep.
When pages_min is reached, normal allocators must enter synch reclaim, but
PF_MEMALLOC, ALLOC_HARDER, and ALLOC_HIGH (ie. atomic allocations, recursive
allocations, etc.) get access to varying amounts of the reserve "c".
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: "Seth, Rohit" <rohit.seth@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-11-29 05:44:03 +08:00
|
|
|
else
|
2006-09-26 14:31:45 +08:00
|
|
|
mark = zone->pages_high;
|
2006-12-07 12:31:38 +08:00
|
|
|
if (!zone_watermark_ok(zone, order, mark,
|
|
|
|
classzone_idx, alloc_flags)) {
|
2006-01-19 09:42:31 +08:00
|
|
|
if (!zone_reclaim_mode ||
|
2006-09-26 14:31:45 +08:00
|
|
|
!zone_reclaim(zone, gfp_mask, order))
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cache line's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten per-cent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
goto this_zone_full;
|
2006-12-07 12:31:38 +08:00
|
|
|
}
|
2005-11-14 08:06:43 +08:00
|
|
|
}
|
|
|
|
|
2008-04-28 17:12:14 +08:00
|
|
|
page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask);
|
2006-12-07 12:31:38 +08:00
|
|
|
if (page)
|
2005-11-14 08:06:43 +08:00
|
|
|
break;
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cacheline's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long ways away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten per-cent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
this_zone_full:
|
|
|
|
if (NUMA_BUILD)
|
|
|
|
zlc_mark_zone_full(zonelist, z);
|
|
|
|
try_next_zone:
|
|
|
|
if (NUMA_BUILD && !did_zlc_setup) {
|
|
|
|
/* we do zlc_setup after the first zone is tried */
|
|
|
|
allowednodes = zlc_setup(zonelist, alloc_flags);
|
|
|
|
zlc_active = 1;
|
|
|
|
did_zlc_setup = 1;
|
|
|
|
}
|
2008-04-28 17:12:16 +08:00
|
|
|
}
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cacheline's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long ways away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten per-cent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
|
|
|
|
if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
|
|
|
|
/* Disable zlc cache for second zonelist scan */
|
|
|
|
zlc_active = 0;
|
|
|
|
goto zonelist_scan;
|
|
|
|
}
|
2005-11-14 08:06:43 +08:00
|
|
|
return page;
|
2005-06-22 08:14:41 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* This is the 'heart' of the zoned buddy allocator.
|
|
|
|
*/
|
2008-04-28 17:12:18 +08:00
|
|
|
static struct page *
|
|
|
|
__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
|
|
|
|
struct zonelist *zonelist, nodemask_t *nodemask)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2005-10-21 15:22:44 +08:00
|
|
|
const gfp_t wait = gfp_mask & __GFP_WAIT;
|
2008-04-28 17:12:16 +08:00
|
|
|
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
|
2008-04-28 17:12:17 +08:00
|
|
|
struct zoneref *z;
|
|
|
|
struct zone *zone;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct page *page;
|
|
|
|
struct reclaim_state reclaim_state;
|
|
|
|
struct task_struct *p = current;
|
|
|
|
int do_retry;
|
2005-11-14 08:06:43 +08:00
|
|
|
int alloc_flags;
|
page allocator: smarter retry of costly-order allocations
Because of page order checks in __alloc_pages(), hugepage (and similarly
large order) allocations will not retry unless explicitly marked
__GFP_REPEAT. However, the current retry logic is nearly an infinite
loop (or until reclaim does no progress whatsoever). For these costly
allocations, that seems like overkill and could potentially never
terminate. Mel observed that allowing current __GFP_REPEAT semantics for
hugepage allocations essentially killed the system. I believe this is
because we may continue to reclaim small orders of pages all over, but
never have enough to satisfy the hugepage allocation request. This is
clearly only a problem for large order allocations, of which hugepages
are the most obvious (to me).
Modify try_to_free_pages() to indicate how many pages were reclaimed.
Use that information in __alloc_pages() to eventually fail a large
__GFP_REPEAT allocation when we've reclaimed an order of pages equal to
or greater than the allocation's order. This relies on lumpy reclaim
functioning as advertised. Due to fragmentation, lumpy reclaim may not
be able to free up the order needed in one invocation, so multiple
iterations may be required. In other words, the more fragmented memory
is, the more retry attempts __GFP_REPEAT will make (particularly for
higher order allocations).
This changes the semantics of __GFP_REPEAT subtly, but *only* for
allocations > PAGE_ALLOC_COSTLY_ORDER. With this patch, for those size
allocations, we will try up to some point (at least 1<<order reclaimed
pages), rather than forever (which is the case for allocations <=
PAGE_ALLOC_COSTLY_ORDER).
This change improves the /proc/sys/vm/nr_hugepages interface with a
follow-on patch that makes pool allocations use __GFP_REPEAT. Rather
than administrators repeatedly echo'ing a particular value into the
sysctl, and forcing reclaim into action manually, this change allows for
the sysctl to attempt a reasonable effort itself. Similarly, dynamic
pool growth should be more successful under load, as lumpy reclaim can
try to free up pages, rather than failing right away.
Choosing to reclaim only up to the order of the requested allocation
strikes a balance between not failing hugepage allocations and returning
to the caller when it's unlikely to ever succeed. Because of lumpy
reclaim, if we have freed the order requested, hopefully it has been in
big chunks and those chunks will allow our allocation to succeed. If
that isn't the case after freeing up the current order, I don't think it
is likely to succeed in the future, although it is possible given a
particular fragmentation pattern.
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Tested-by: Mel Gorman <mel@csn.ul.ie>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-29 15:58:25 +08:00
|
|
|
unsigned long did_some_progress;
|
|
|
|
unsigned long pages_reclaimed = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
might_sleep_if(wait);
|
|
|
|
|
2006-12-08 18:39:45 +08:00
|
|
|
if (should_fail_alloc_page(gfp_mask, order))
|
|
|
|
return NULL;
|
|
|
|
|
2005-11-18 04:35:02 +08:00
|
|
|
restart:
|
2008-04-28 17:12:17 +08:00
|
|
|
z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-04-28 17:12:17 +08:00
|
|
|
if (unlikely(!z->zone)) {
|
2007-10-16 16:25:37 +08:00
|
|
|
/*
|
|
|
|
* Happens if we have an empty zonelist as a result of
|
|
|
|
* GFP_THISNODE being used on a memoryless node
|
|
|
|
*/
|
2005-04-17 06:20:36 +08:00
|
|
|
return NULL;
|
|
|
|
}
|
2005-11-18 04:35:02 +08:00
|
|
|
|
2008-04-28 17:12:18 +08:00
|
|
|
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
|
2008-04-28 17:12:16 +08:00
|
|
|
zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
|
2005-11-14 08:06:43 +08:00
|
|
|
if (page)
|
|
|
|
goto got_pg;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-12-07 12:33:26 +08:00
|
|
|
/*
|
|
|
|
* GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
|
|
|
|
* __GFP_NOWARN set) should not cause reclaim since the subsystem
|
|
|
|
* (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
|
|
|
|
* using a larger set of nodes after it has established that the
|
|
|
|
* allowed per node queues are empty and that nodes are
|
|
|
|
* over allocated.
|
|
|
|
*/
|
|
|
|
if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
|
|
|
|
goto nopage;
|
|
|
|
|
2008-04-28 17:12:17 +08:00
|
|
|
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
|
|
|
|
wakeup_kswapd(zone, order);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
[PATCH] cpusets: formalize intermediate GFP_KERNEL containment
This patch makes use of the previously underutilized cpuset flag
'mem_exclusive' to provide what amounts to another layer of memory placement
resolution. With this patch, there are now the following four layers of
memory placement available:
1) The whole system (interrupt and GFP_ATOMIC allocations can use this),
2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use),
3) The current tasks cpuset (GFP_USER allocations constrained to here), and
4) Specific node placement, using mbind and set_mempolicy.
These nest - each layer is a subset (same or within) of the previous.
Layer (2) above is new, with this patch. The call used to check whether a
zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is
extended to take a gfp_mask argument, and its logic is extended, in the case
that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset
hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if
placement is allowed. The definition of GFP_USER, which used to be identical
to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous
cpuset_gfp_hardwall_flag patch.
GFP_ATOMIC and GFP_KERNEL allocations will stay within the current tasks
cpuset, so long as any node therein is not too tight on memory, but will
escape to the larger layer, if need be.
The intended use is to allow something like a batch manager to handle several
jobs, each job in its own cpuset, but using common kernel memory for caches
and such. Swapper and oom_kill activity is also constrained to Layer (2). A
task in or below one mem_exclusive cpuset should not cause swapping on nodes
in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a
task in another such cpuset. Heavy use of kernel memory for i/o caching and
such by one job should not impact the memory available to jobs in other
non-overlapping mem_exclusive cpusets.
This patch enables providing hardwall, inescapable cpusets for memory
allocations of each job, while sharing kernel memory allocations between
several jobs, in an enclosing mem_exclusive cpuset.
Like Dinakar's patch earlier to enable administering sched domains using the
cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag
that had previously done nothing much useful other than restrict what cpuset
configurations were allowed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-07 06:18:12 +08:00
|
|
|
/*
|
2005-11-14 08:06:43 +08:00
|
|
|
* OK, we're below the kswapd watermark and have kicked background
|
|
|
|
* reclaim. Now things get more complex, so set up alloc_flags according
|
|
|
|
* to how we want to proceed.
|
|
|
|
*
|
|
|
|
* The caller may dip into page reserves a bit more if the caller
|
|
|
|
* cannot run direct reclaim, or if the caller has realtime scheduling
|
2006-01-12 04:17:19 +08:00
|
|
|
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
|
|
|
|
* set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
|
[PATCH] cpusets: formalize intermediate GFP_KERNEL containment
This patch makes use of the previously underutilized cpuset flag
'mem_exclusive' to provide what amounts to another layer of memory placement
resolution. With this patch, there are now the following four layers of
memory placement available:
1) The whole system (interrupt and GFP_ATOMIC allocations can use this),
2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use),
3) The current tasks cpuset (GFP_USER allocations constrained to here), and
4) Specific node placement, using mbind and set_mempolicy.
These nest - each layer is a subset (same or within) of the previous.
Layer (2) above is new, with this patch. The call used to check whether a
zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is
extended to take a gfp_mask argument, and its logic is extended, in the case
that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset
hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if
placement is allowed. The definition of GFP_USER, which used to be identical
to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous
cpuset_gfp_hardwall_flag patch.
GFP_ATOMIC and GFP_KERNEL allocations will stay within the current tasks
cpuset, so long as any node therein is not too tight on memory, but will
escape to the larger layer, if need be.
The intended use is to allow something like a batch manager to handle several
jobs, each job in its own cpuset, but using common kernel memory for caches
and such. Swapper and oom_kill activity is also constrained to Layer (2). A
task in or below one mem_exclusive cpuset should not cause swapping on nodes
in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a
task in another such cpuset. Heavy use of kernel memory for i/o caching and
such by one job should not impact the memory available to jobs in other
non-overlapping mem_exclusive cpusets.
This patch enables providing hardwall, inescapable cpusets for memory
allocations of each job, while sharing kernel memory allocations between
several jobs, in an enclosing mem_exclusive cpuset.
Like Dinakar's patch earlier to enable administering sched domains using the
cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag
that had previously done nothing much useful other than restrict what cpuset
configurations were allowed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-07 06:18:12 +08:00
|
|
|
*/
|
[PATCH] mm: __alloc_pages cleanup fix
I believe this patch is required to fix breakage in the asynch reclaim
watermark logic introduced by this patch:
http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=7fb1d9fca5c6e3b06773b69165a73f3fb786b8ee
Just some background of the watermark logic in case it isn't clear...
Basically what we have is this:
--- pages_high
|
| (a)
|
--- pages_low
|
| (b)
|
--- pages_min
|
| (c)
|
--- 0
Now when pages_low is reached, we want to kick asynch reclaim, which gives us
an interval of "b" before we must start synch reclaim, and gives kswapd an
interval of "a" before it need go back to sleep.
When pages_min is reached, normal allocators must enter synch reclaim, but
PF_MEMALLOC, ALLOC_HARDER, and ALLOC_HIGH (ie. atomic allocations, recursive
allocations, etc.) get access to varying amounts of the reserve "c".
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: "Seth, Rohit" <rohit.seth@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-11-29 05:44:03 +08:00
|
|
|
alloc_flags = ALLOC_WMARK_MIN;
|
2005-11-14 08:06:43 +08:00
|
|
|
if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
|
|
|
|
alloc_flags |= ALLOC_HARDER;
|
|
|
|
if (gfp_mask & __GFP_HIGH)
|
|
|
|
alloc_flags |= ALLOC_HIGH;
|
[PATCH] Cpuset: might sleep checking zones allowed fix
Fix a couple of infrequently encountered 'sleeping function called from
invalid context' in the cpuset hooks in __alloc_pages. Could sleep while
interrupts disabled.
The routine cpuset_zone_allowed() is called by code in mm/page_alloc.c
__alloc_pages() to determine if a zone is allowed in the current tasks
cpuset. This routine can sleep, for certain GFP_KERNEL allocations, if the
zone is on a memory node not allowed in the current cpuset, but might be
allowed in a parent cpuset.
But we can't sleep in __alloc_pages() if in interrupt, nor if called for a
GFP_ATOMIC request (__GFP_WAIT not set in gfp_flags).
The rule was intended to be:
Don't call cpuset_zone_allowed() if you can't sleep, unless you
pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
the code that might scan up ancestor cpusets and sleep.
This rule was being violated in a couple of places, due to a bogus change
made (by myself, pj) to __alloc_pages() as part of the November 2005 effort
to cleanup its logic, and also due to a later fix to constrain which swap
daemons were awoken.
The bogus change can be seen at:
http://linux.derkeiler.com/Mailing-Lists/Kernel/2005-11/4691.html
[PATCH 01/05] mm fix __alloc_pages cpuset ALLOC_* flags
This was first noticed on a tight memory system, in code that was disabling
interrupts and doing allocation requests with __GFP_WAIT not set, which
resulted in __might_sleep() writing complaints to the log "Debug: sleeping
function called ...", when the code in cpuset_zone_allowed() tried to take
the callback_sem cpuset semaphore.
We haven't seen a system hang on this 'might_sleep' yet, but we are at
decent risk of seeing it fairly soon, especially since the additional
cpuset_zone_allowed() check was added, conditioning wakeup_kswapd(), in
March 2006.
Special thanks to Dave Chinner, for figuring this out, and a tip of the hat
to Nick Piggin who warned me of this back in Nov 2005, before I was ready
to listen.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-05-21 06:00:09 +08:00
|
|
|
if (wait)
|
|
|
|
alloc_flags |= ALLOC_CPUSET;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Go through the zonelist again. Let __GFP_HIGH and allocations
|
2005-11-14 08:06:43 +08:00
|
|
|
* coming from realtime tasks go deeper into reserves.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* This is the last chance, in general, before the goto nopage.
|
|
|
|
* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
|
[PATCH] cpusets: formalize intermediate GFP_KERNEL containment
This patch makes use of the previously underutilized cpuset flag
'mem_exclusive' to provide what amounts to another layer of memory placement
resolution. With this patch, there are now the following four layers of
memory placement available:
1) The whole system (interrupt and GFP_ATOMIC allocations can use this),
2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use),
3) The current tasks cpuset (GFP_USER allocations constrained to here), and
4) Specific node placement, using mbind and set_mempolicy.
These nest - each layer is a subset (same or within) of the previous.
Layer (2) above is new, with this patch. The call used to check whether a
zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is
extended to take a gfp_mask argument, and its logic is extended, in the case
that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset
hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if
placement is allowed. The definition of GFP_USER, which used to be identical
to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous
cpuset_gfp_hardwall_flag patch.
GFP_ATOMIC and GFP_KERNEL allocations will stay within the current tasks
cpuset, so long as any node therein is not too tight on memory, but will
escape to the larger layer, if need be.
The intended use is to allow something like a batch manager to handle several
jobs, each job in its own cpuset, but using common kernel memory for caches
and such. Swapper and oom_kill activity is also constrained to Layer (2). A
task in or below one mem_exclusive cpuset should not cause swapping on nodes
in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a
task in another such cpuset. Heavy use of kernel memory for i/o caching and
such by one job should not impact the memory available to jobs in other
non-overlapping mem_exclusive cpusets.
This patch enables providing hardwall, inescapable cpusets for memory
allocations of each job, while sharing kernel memory allocations between
several jobs, in an enclosing mem_exclusive cpuset.
Like Dinakar's patch earlier to enable administering sched domains using the
cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag
that had previously done nothing much useful other than restrict what cpuset
configurations were allowed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-07 06:18:12 +08:00
|
|
|
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2008-04-28 17:12:18 +08:00
|
|
|
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
|
2008-04-28 17:12:16 +08:00
|
|
|
high_zoneidx, alloc_flags);
|
2005-11-14 08:06:43 +08:00
|
|
|
if (page)
|
|
|
|
goto got_pg;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* This allocation should allow future memory freeing. */
|
2005-05-01 23:58:36 +08:00
|
|
|
|
2006-12-07 12:32:27 +08:00
|
|
|
rebalance:
|
2005-05-01 23:58:36 +08:00
|
|
|
if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
|
|
|
|
&& !in_interrupt()) {
|
|
|
|
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
|
2005-11-14 08:06:41 +08:00
|
|
|
nofail_alloc:
|
2005-05-01 23:58:36 +08:00
|
|
|
/* go through the zonelist yet again, ignoring mins */
|
2008-04-28 17:12:18 +08:00
|
|
|
page = get_page_from_freelist(gfp_mask, nodemask, order,
|
2008-04-28 17:12:16 +08:00
|
|
|
zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
|
2005-11-14 08:06:43 +08:00
|
|
|
if (page)
|
|
|
|
goto got_pg;
|
2005-11-14 08:06:41 +08:00
|
|
|
if (gfp_mask & __GFP_NOFAIL) {
|
2006-10-20 14:28:16 +08:00
|
|
|
congestion_wait(WRITE, HZ/50);
|
2005-11-14 08:06:41 +08:00
|
|
|
goto nofail_alloc;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
goto nopage;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Atomic allocations - we can't balance anything */
|
|
|
|
if (!wait)
|
|
|
|
goto nopage;
|
|
|
|
|
|
|
|
cond_resched();
|
|
|
|
|
|
|
|
/* We now go into synchronous reclaim */
|
[PATCH] cpuset: memory pressure meter
Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.
This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.
This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.
This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure. It's up to the batch
manager or other user code to decide what to do about it and take action.
==> Unless this feature is enabled by writing "1" to the special file
/dev/cpuset/memory_pressure_enabled, the hook in the rebalance
code of __alloc_pages() for this metric reduces to simply noticing
that the cpuset_memory_pressure_enabled flag is zero. So only
systems that enable this feature will compute the metric.
Why a per-cpuset, running average:
Because this meter is per-cpuset, rather than per-task or mm, the
system load imposed by a batch scheduler monitoring this metric is
sharply reduced on large systems, because a scan of the tasklist can be
avoided on each set of queries.
Because this meter is a running average, instead of an accumulating
counter, a batch scheduler can detect memory pressure with a single
read, instead of having to read and accumulate results for a period of
time.
Because this meter is per-cpuset rather than per-task or mm, the
batch scheduler can obtain the key information, memory pressure in a
cpuset, with a single read, rather than having to query and accumulate
results over all the (dynamically changing) set of tasks in the cpuset.
A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.
A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 17:01:49 +08:00
|
|
|
cpuset_memory_pressure_bump();
|
2005-04-17 06:20:36 +08:00
|
|
|
p->flags |= PF_MEMALLOC;
|
|
|
|
reclaim_state.reclaimed_slab = 0;
|
|
|
|
p->reclaim_state = &reclaim_state;
|
|
|
|
|
2008-04-28 17:12:12 +08:00
|
|
|
did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
p->reclaim_state = NULL;
|
|
|
|
p->flags &= ~PF_MEMALLOC;
|
|
|
|
|
|
|
|
cond_resched();
|
|
|
|
|
2007-10-16 16:25:50 +08:00
|
|
|
if (order != 0)
|
2008-02-05 14:29:11 +08:00
|
|
|
drain_all_pages();
|
2007-10-16 16:25:50 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
if (likely(did_some_progress)) {
|
2008-04-28 17:12:18 +08:00
|
|
|
page = get_page_from_freelist(gfp_mask, nodemask, order,
|
2008-04-28 17:12:16 +08:00
|
|
|
zonelist, high_zoneidx, alloc_flags);
|
2005-11-14 08:06:43 +08:00
|
|
|
if (page)
|
|
|
|
goto got_pg;
|
2005-04-17 06:20:36 +08:00
|
|
|
} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
|
2008-04-28 17:12:17 +08:00
|
|
|
if (!try_set_zone_oom(zonelist, gfp_mask)) {
|
2007-10-17 14:25:56 +08:00
|
|
|
schedule_timeout_uninterruptible(1);
|
|
|
|
goto restart;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Go through the zonelist yet one more time, keep
|
|
|
|
* very high watermark here, this is only to catch
|
|
|
|
* a parallel oom killing, we must fail if we're still
|
|
|
|
* under heavy pressure.
|
|
|
|
*/
|
2008-04-28 17:12:18 +08:00
|
|
|
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
|
|
|
|
order, zonelist, high_zoneidx,
|
|
|
|
ALLOC_WMARK_HIGH|ALLOC_CPUSET);
|
2007-10-17 14:25:56 +08:00
|
|
|
if (page) {
|
2008-04-28 17:12:17 +08:00
|
|
|
clear_zonelist_oom(zonelist, gfp_mask);
|
2005-11-14 08:06:43 +08:00
|
|
|
goto got_pg;
|
2007-10-17 14:25:56 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-07-31 15:37:30 +08:00
|
|
|
/* The OOM killer will not help higher order allocs so fail */
|
2007-10-17 14:25:56 +08:00
|
|
|
if (order > PAGE_ALLOC_COSTLY_ORDER) {
|
2008-04-28 17:12:17 +08:00
|
|
|
clear_zonelist_oom(zonelist, gfp_mask);
|
2007-07-31 15:37:30 +08:00
|
|
|
goto nopage;
|
2007-10-17 14:25:56 +08:00
|
|
|
}
|
2007-07-31 15:37:30 +08:00
|
|
|
|
2006-02-21 10:27:52 +08:00
|
|
|
out_of_memory(zonelist, gfp_mask, order);
|
2008-04-28 17:12:17 +08:00
|
|
|
clear_zonelist_oom(zonelist, gfp_mask);
|
2005-04-17 06:20:36 +08:00
|
|
|
goto restart;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Don't let big-order allocations loop unless the caller explicitly
|
|
|
|
* requests that. Wait for some write requests to complete then retry.
|
|
|
|
*
|
page allocator: smarter retry of costly-order allocations
Because of page order checks in __alloc_pages(), hugepage (and similarly
large order) allocations will not retry unless explicitly marked
__GFP_REPEAT. However, the current retry logic is nearly an infinite
loop (or until reclaim does no progress whatsoever). For these costly
allocations, that seems like overkill and could potentially never
terminate. Mel observed that allowing current __GFP_REPEAT semantics for
hugepage allocations essentially killed the system. I believe this is
because we may continue to reclaim small orders of pages all over, but
never have enough to satisfy the hugepage allocation request. This is
clearly only a problem for large order allocations, of which hugepages
are the most obvious (to me).
Modify try_to_free_pages() to indicate how many pages were reclaimed.
Use that information in __alloc_pages() to eventually fail a large
__GFP_REPEAT allocation when we've reclaimed an order of pages equal to
or greater than the allocation's order. This relies on lumpy reclaim
functioning as advertised. Due to fragmentation, lumpy reclaim may not
be able to free up the order needed in one invocation, so multiple
iterations may be required. In other words, the more fragmented memory
is, the more retry attempts __GFP_REPEAT will make (particularly for
higher order allocations).
This changes the semantics of __GFP_REPEAT subtly, but *only* for
allocations > PAGE_ALLOC_COSTLY_ORDER. With this patch, for those size
allocations, we will try up to some point (at least 1<<order reclaimed
pages), rather than forever (which is the case for allocations <=
PAGE_ALLOC_COSTLY_ORDER).
This change improves the /proc/sys/vm/nr_hugepages interface with a
follow-on patch that makes pool allocations use __GFP_REPEAT. Rather
than administrators repeatedly echo'ing a particular value into the
sysctl, and forcing reclaim into action manually, this change allows for
the sysctl to attempt a reasonable effort itself. Similarly, dynamic
pool growth should be more successful under load, as lumpy reclaim can
try to free up pages, rather than failing right away.
Choosing to reclaim only up to the order of the requested allocation
strikes a balance between not failing hugepage allocations and returning
to the caller when it's unlikely to ever succeed. Because of lumpy
reclaim, if we have freed the order requested, hopefully it has been in
big chunks and those chunks will allow our allocation to succeed. If
that isn't the case after freeing up the current order, I don't think it
is likely to succeed in the future, although it is possible given a
particular fragmentation pattern.
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Tested-by: Mel Gorman <mel@csn.ul.ie>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-29 15:58:25 +08:00
|
|
|
* In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
|
|
|
|
* means __GFP_NOFAIL, but that may not be true in other
|
2008-04-29 15:58:23 +08:00
|
|
|
* implementations.
|
page allocator: smarter retry of costly-order allocations
Because of page order checks in __alloc_pages(), hugepage (and similarly
large order) allocations will not retry unless explicitly marked
__GFP_REPEAT. However, the current retry logic is nearly an infinite
loop (or until reclaim does no progress whatsoever). For these costly
allocations, that seems like overkill and could potentially never
terminate. Mel observed that allowing current __GFP_REPEAT semantics for
hugepage allocations essentially killed the system. I believe this is
because we may continue to reclaim small orders of pages all over, but
never have enough to satisfy the hugepage allocation request. This is
clearly only a problem for large order allocations, of which hugepages
are the most obvious (to me).
Modify try_to_free_pages() to indicate how many pages were reclaimed.
Use that information in __alloc_pages() to eventually fail a large
__GFP_REPEAT allocation when we've reclaimed an order of pages equal to
or greater than the allocation's order. This relies on lumpy reclaim
functioning as advertised. Due to fragmentation, lumpy reclaim may not
be able to free up the order needed in one invocation, so multiple
iterations may be required. In other words, the more fragmented memory
is, the more retry attempts __GFP_REPEAT will make (particularly for
higher order allocations).
This changes the semantics of __GFP_REPEAT subtly, but *only* for
allocations > PAGE_ALLOC_COSTLY_ORDER. With this patch, for those size
allocations, we will try up to some point (at least 1<<order reclaimed
pages), rather than forever (which is the case for allocations <=
PAGE_ALLOC_COSTLY_ORDER).
This change improves the /proc/sys/vm/nr_hugepages interface with a
follow-on patch that makes pool allocations use __GFP_REPEAT. Rather
than administrators repeatedly echo'ing a particular value into the
sysctl, and forcing reclaim into action manually, this change allows for
the sysctl to attempt a reasonable effort itself. Similarly, dynamic
pool growth should be more successful under load, as lumpy reclaim can
try to free up pages, rather than failing right away.
Choosing to reclaim only up to the order of the requested allocation
strikes a balance between not failing hugepage allocations and returning
to the caller when it's unlikely to ever succeed. Because of lumpy
reclaim, if we have freed the order requested, hopefully it has been in
big chunks and those chunks will allow our allocation to succeed. If
that isn't the case after freeing up the current order, I don't think it
is likely to succeed in the future, although it is possible given a
particular fragmentation pattern.
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Tested-by: Mel Gorman <mel@csn.ul.ie>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-29 15:58:25 +08:00
|
|
|
*
|
|
|
|
* For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
|
|
|
|
* specified, then we retry until we no longer reclaim any pages
|
|
|
|
* (above), or we've reclaimed an order of pages at least as
|
|
|
|
* large as the allocation's order. In both cases, if the
|
|
|
|
* allocation still fails, we stop retrying.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
page allocator: smarter retry of costly-order allocations
Because of page order checks in __alloc_pages(), hugepage (and similarly
large order) allocations will not retry unless explicitly marked
__GFP_REPEAT. However, the current retry logic is nearly an infinite
loop (or until reclaim does no progress whatsoever). For these costly
allocations, that seems like overkill and could potentially never
terminate. Mel observed that allowing current __GFP_REPEAT semantics for
hugepage allocations essentially killed the system. I believe this is
because we may continue to reclaim small orders of pages all over, but
never have enough to satisfy the hugepage allocation request. This is
clearly only a problem for large order allocations, of which hugepages
are the most obvious (to me).
Modify try_to_free_pages() to indicate how many pages were reclaimed.
Use that information in __alloc_pages() to eventually fail a large
__GFP_REPEAT allocation when we've reclaimed an order of pages equal to
or greater than the allocation's order. This relies on lumpy reclaim
functioning as advertised. Due to fragmentation, lumpy reclaim may not
be able to free up the order needed in one invocation, so multiple
iterations may be required. In other words, the more fragmented memory
is, the more retry attempts __GFP_REPEAT will make (particularly for
higher order allocations).
This changes the semantics of __GFP_REPEAT subtly, but *only* for
allocations > PAGE_ALLOC_COSTLY_ORDER. With this patch, for those size
allocations, we will try up to some point (at least 1<<order reclaimed
pages), rather than forever (which is the case for allocations <=
PAGE_ALLOC_COSTLY_ORDER).
This change improves the /proc/sys/vm/nr_hugepages interface with a
follow-on patch that makes pool allocations use __GFP_REPEAT. Rather
than administrators repeatedly echo'ing a particular value into the
sysctl, and forcing reclaim into action manually, this change allows for
the sysctl to attempt a reasonable effort itself. Similarly, dynamic
pool growth should be more successful under load, as lumpy reclaim can
try to free up pages, rather than failing right away.
Choosing to reclaim only up to the order of the requested allocation
strikes a balance between not failing hugepage allocations and returning
to the caller when it's unlikely to ever succeed. Because of lumpy
reclaim, if we have freed the order requested, hopefully it has been in
big chunks and those chunks will allow our allocation to succeed. If
that isn't the case after freeing up the current order, I don't think it
is likely to succeed in the future, although it is possible given a
particular fragmentation pattern.
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Tested-by: Mel Gorman <mel@csn.ul.ie>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-29 15:58:25 +08:00
|
|
|
pages_reclaimed += did_some_progress;
|
2005-04-17 06:20:36 +08:00
|
|
|
do_retry = 0;
|
|
|
|
if (!(gfp_mask & __GFP_NORETRY)) {
|
page allocator: smarter retry of costly-order allocations
Because of page order checks in __alloc_pages(), hugepage (and similarly
large order) allocations will not retry unless explicitly marked
__GFP_REPEAT. However, the current retry logic is nearly an infinite
loop (or until reclaim does no progress whatsoever). For these costly
allocations, that seems like overkill and could potentially never
terminate. Mel observed that allowing current __GFP_REPEAT semantics for
hugepage allocations essentially killed the system. I believe this is
because we may continue to reclaim small orders of pages all over, but
never have enough to satisfy the hugepage allocation request. This is
clearly only a problem for large order allocations, of which hugepages
are the most obvious (to me).
Modify try_to_free_pages() to indicate how many pages were reclaimed.
Use that information in __alloc_pages() to eventually fail a large
__GFP_REPEAT allocation when we've reclaimed an order of pages equal to
or greater than the allocation's order. This relies on lumpy reclaim
functioning as advertised. Due to fragmentation, lumpy reclaim may not
be able to free up the order needed in one invocation, so multiple
iterations may be required. In other words, the more fragmented memory
is, the more retry attempts __GFP_REPEAT will make (particularly for
higher order allocations).
This changes the semantics of __GFP_REPEAT subtly, but *only* for
allocations > PAGE_ALLOC_COSTLY_ORDER. With this patch, for those size
allocations, we will try up to some point (at least 1<<order reclaimed
pages), rather than forever (which is the case for allocations <=
PAGE_ALLOC_COSTLY_ORDER).
This change improves the /proc/sys/vm/nr_hugepages interface with a
follow-on patch that makes pool allocations use __GFP_REPEAT. Rather
than administrators repeatedly echo'ing a particular value into the
sysctl, and forcing reclaim into action manually, this change allows for
the sysctl to attempt a reasonable effort itself. Similarly, dynamic
pool growth should be more successful under load, as lumpy reclaim can
try to free up pages, rather than failing right away.
Choosing to reclaim only up to the order of the requested allocation
strikes a balance between not failing hugepage allocations and returning
to the caller when it's unlikely to ever succeed. Because of lumpy
reclaim, if we have freed the order requested, hopefully it has been in
big chunks and those chunks will allow our allocation to succeed. If
that isn't the case after freeing up the current order, I don't think it
is likely to succeed in the future, although it is possible given a
particular fragmentation pattern.
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Tested-by: Mel Gorman <mel@csn.ul.ie>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-29 15:58:25 +08:00
|
|
|
if (order <= PAGE_ALLOC_COSTLY_ORDER) {
|
2005-04-17 06:20:36 +08:00
|
|
|
do_retry = 1;
|
page allocator: smarter retry of costly-order allocations
Because of page order checks in __alloc_pages(), hugepage (and similarly
large order) allocations will not retry unless explicitly marked
__GFP_REPEAT. However, the current retry logic is nearly an infinite
loop (or until reclaim does no progress whatsoever). For these costly
allocations, that seems like overkill and could potentially never
terminate. Mel observed that allowing current __GFP_REPEAT semantics for
hugepage allocations essentially killed the system. I believe this is
because we may continue to reclaim small orders of pages all over, but
never have enough to satisfy the hugepage allocation request. This is
clearly only a problem for large order allocations, of which hugepages
are the most obvious (to me).
Modify try_to_free_pages() to indicate how many pages were reclaimed.
Use that information in __alloc_pages() to eventually fail a large
__GFP_REPEAT allocation when we've reclaimed an order of pages equal to
or greater than the allocation's order. This relies on lumpy reclaim
functioning as advertised. Due to fragmentation, lumpy reclaim may not
be able to free up the order needed in one invocation, so multiple
iterations may be required. In other words, the more fragmented memory
is, the more retry attempts __GFP_REPEAT will make (particularly for
higher order allocations).
This changes the semantics of __GFP_REPEAT subtly, but *only* for
allocations > PAGE_ALLOC_COSTLY_ORDER. With this patch, for those size
allocations, we will try up to some point (at least 1<<order reclaimed
pages), rather than forever (which is the case for allocations <=
PAGE_ALLOC_COSTLY_ORDER).
This change improves the /proc/sys/vm/nr_hugepages interface with a
follow-on patch that makes pool allocations use __GFP_REPEAT. Rather
than administrators repeatedly echo'ing a particular value into the
sysctl, and forcing reclaim into action manually, this change allows for
the sysctl to attempt a reasonable effort itself. Similarly, dynamic
pool growth should be more successful under load, as lumpy reclaim can
try to free up pages, rather than failing right away.
Choosing to reclaim only up to the order of the requested allocation
strikes a balance between not failing hugepage allocations and returning
to the caller when it's unlikely to ever succeed. Because of lumpy
reclaim, if we have freed the order requested, hopefully it has been in
big chunks and those chunks will allow our allocation to succeed. If
that isn't the case after freeing up the current order, I don't think it
is likely to succeed in the future, although it is possible given a
particular fragmentation pattern.
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Tested-by: Mel Gorman <mel@csn.ul.ie>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-29 15:58:25 +08:00
|
|
|
} else {
|
|
|
|
if (gfp_mask & __GFP_REPEAT &&
|
|
|
|
pages_reclaimed < (1 << order))
|
|
|
|
do_retry = 1;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
if (gfp_mask & __GFP_NOFAIL)
|
|
|
|
do_retry = 1;
|
|
|
|
}
|
|
|
|
if (do_retry) {
|
2006-10-20 14:28:16 +08:00
|
|
|
congestion_wait(WRITE, HZ/50);
|
2005-04-17 06:20:36 +08:00
|
|
|
goto rebalance;
|
|
|
|
}
|
|
|
|
|
|
|
|
nopage:
|
|
|
|
if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
|
|
|
|
printk(KERN_WARNING "%s: page allocation failure."
|
|
|
|
" order:%d, mode:0x%x\n",
|
|
|
|
p->comm, order, gfp_mask);
|
|
|
|
dump_stack();
|
2005-06-22 08:14:56 +08:00
|
|
|
show_mem();
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
got_pg:
|
|
|
|
return page;
|
|
|
|
}
|
|
|
|
|
2008-04-28 17:12:18 +08:00
|
|
|
struct page *
|
|
|
|
__alloc_pages(gfp_t gfp_mask, unsigned int order,
|
|
|
|
struct zonelist *zonelist)
|
|
|
|
{
|
|
|
|
return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
struct page *
|
|
|
|
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
|
|
|
|
struct zonelist *zonelist, nodemask_t *nodemask)
|
|
|
|
{
|
|
|
|
return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
EXPORT_SYMBOL(__alloc_pages);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Common helper functions.
|
|
|
|
*/
|
2008-02-05 14:29:26 +08:00
|
|
|
/*
 * __get_free_pages - allocate pages and return their address.
 * @gfp_mask: allocation flags handed to alloc_pages(); must not
 *            contain __GFP_HIGHMEM
 * @order: allocation order handed to alloc_pages()
 *
 * Returns page_address() of the allocated page as an unsigned long,
 * or 0 when alloc_pages() fails.
 */
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *page;

	/*
	 * The result is handed back as an address, which cannot represent
	 * a highmem page; get_zeroed_page() enforces the same restriction.
	 */
	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);

	page = alloc_pages(gfp_mask, order);
	if (!page)
		return 0;
	return (unsigned long) page_address(page);
}
|
|
|
|
|
|
|
|
EXPORT_SYMBOL(__get_free_pages);
|
|
|
|
|
2008-02-05 14:29:26 +08:00
|
|
|
/*
 * get_zeroed_page - allocate a single page with __GFP_ZERO set.
 * @gfp_mask: allocation flags; __GFP_ZERO is OR-ed in before the
 *            request is passed to alloc_pages() with order 0
 *
 * Returns page_address() of the page as an unsigned long, or 0 when
 * alloc_pages() fails.
 */
unsigned long get_zeroed_page(gfp_t gfp_mask)
{
	struct page *page;

	/*
	 * The value handed back is an address, which cannot represent a
	 * highmem page, so such requests are rejected up front.
	 */
	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);

	page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
	if (!page)
		return 0;

	return (unsigned long) page_address(page);
}
|
|
|
|
|
|
|
|
EXPORT_SYMBOL(get_zeroed_page);
|
|
|
|
|
|
|
|
void __pagevec_free(struct pagevec *pvec)
|
|
|
|
{
|
|
|
|
int i = pagevec_count(pvec);
|
|
|
|
|
|
|
|
while (--i >= 0)
|
|
|
|
free_hot_cold_page(pvec->pages[i], pvec->cold);
|
|
|
|
}
|
|
|
|
|
2008-02-05 14:29:26 +08:00
|
|
|
/*
 * __free_pages - release @page if put_page_testzero() says so.
 * @page: page to release
 * @order: order of the allocation being released
 *
 * Nothing is freed unless put_page_testzero(page) returns true.  When
 * it does, an order-0 page goes to free_hot_page() and anything larger
 * goes to __free_pages_ok().
 */
void __free_pages(struct page *page, unsigned int order)
{
	if (!put_page_testzero(page))
		return;

	if (order != 0)
		__free_pages_ok(page, order);
	else
		free_hot_page(page);
}
|
|
|
|
|
|
|
|
EXPORT_SYMBOL(__free_pages);
|
|
|
|
|
2008-02-05 14:29:26 +08:00
|
|
|
/*
 * free_pages - address-based front end to __free_pages().
 * @addr: address to release; 0 is accepted and ignored
 * @order: order passed through to __free_pages()
 *
 * Translates @addr with virt_to_page() after checking it with
 * virt_addr_valid() (under VM_BUG_ON).
 */
void free_pages(unsigned long addr, unsigned int order)
{
	if (addr == 0)
		return;

	VM_BUG_ON(!virt_addr_valid((void *)addr));
	__free_pages(virt_to_page((void *)addr), order);
}
|
|
|
|
|
|
|
|
EXPORT_SYMBOL(free_pages);
|
|
|
|
|
|
|
|
static unsigned int nr_free_zone_pages(int offset)
|
|
|
|
{
|
2008-04-28 17:12:17 +08:00
|
|
|
struct zoneref *z;
|
2008-04-28 17:12:16 +08:00
|
|
|
struct zone *zone;
|
|
|
|
|
2005-07-30 13:59:18 +08:00
|
|
|
/* Just pick one node, since fallback list is circular */
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned int sum = 0;
|
|
|
|
|
2008-04-28 17:12:14 +08:00
|
|
|
struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-04-28 17:12:16 +08:00
|
|
|
for_each_zone_zonelist(zone, z, zonelist, offset) {
|
2005-07-30 13:59:18 +08:00
|
|
|
unsigned long size = zone->present_pages;
|
|
|
|
unsigned long high = zone->pages_high;
|
|
|
|
if (size > high)
|
|
|
|
sum += size - high;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return sum;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
|
|
|
|
*/
|
|
|
|
unsigned int nr_free_buffer_pages(void)
|
|
|
|
{
|
2005-10-21 14:55:38 +08:00
|
|
|
return nr_free_zone_pages(gfp_zone(GFP_USER));
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2007-07-17 19:04:39 +08:00
|
|
|
EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Amount of free RAM allocatable within all zones
|
|
|
|
*/
|
|
|
|
unsigned int nr_free_pagecache_pages(void)
|
|
|
|
{
|
2007-07-17 19:03:12 +08:00
|
|
|
return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2006-09-27 16:50:06 +08:00
|
|
|
|
|
|
|
static inline void show_node(struct zone *zone)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2006-09-27 16:50:06 +08:00
|
|
|
if (NUMA_BUILD)
|
2006-12-07 12:33:03 +08:00
|
|
|
printk("Node %d ", zone_to_nid(zone));
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void si_meminfo(struct sysinfo *val)
|
|
|
|
{
|
|
|
|
val->totalram = totalram_pages;
|
|
|
|
val->sharedram = 0;
|
2007-02-10 17:43:02 +08:00
|
|
|
val->freeram = global_page_state(NR_FREE_PAGES);
|
2005-04-17 06:20:36 +08:00
|
|
|
val->bufferram = nr_blockdev_pages();
|
|
|
|
val->totalhigh = totalhigh_pages;
|
|
|
|
val->freehigh = nr_free_highpages();
|
|
|
|
val->mem_unit = PAGE_SIZE;
|
|
|
|
}
|
|
|
|
|
|
|
|
EXPORT_SYMBOL(si_meminfo);
|
|
|
|
|
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
void si_meminfo_node(struct sysinfo *val, int nid)
|
|
|
|
{
|
|
|
|
pg_data_t *pgdat = NODE_DATA(nid);
|
|
|
|
|
|
|
|
val->totalram = pgdat->node_present_pages;
|
2007-02-10 17:43:02 +08:00
|
|
|
val->freeram = node_page_state(nid, NR_FREE_PAGES);
|
2006-09-26 14:31:12 +08:00
|
|
|
#ifdef CONFIG_HIGHMEM
|
2005-04-17 06:20:36 +08:00
|
|
|
val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
|
2007-02-10 17:43:02 +08:00
|
|
|
val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
|
|
|
|
NR_FREE_PAGES);
|
2006-09-26 14:31:12 +08:00
|
|
|
#else
|
|
|
|
val->totalhigh = 0;
|
|
|
|
val->freehigh = 0;
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
val->mem_unit = PAGE_SIZE;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#define K(x) ((x) << (PAGE_SHIFT-10))
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Show free area list (used inside shift_scroll-lock stuff)
|
|
|
|
* We also calculate the percentage fragmentation. We do this by counting the
|
|
|
|
* memory on each free list with the exception of the first item on the list.
|
|
|
|
*/
|
|
|
|
void show_free_areas(void)
|
|
|
|
{
|
[PATCH] Condense output of show_free_areas()
On larger systems, the amount of output dumped on the console when you do
SysRq-M is beyond insane. This patch is trying to reduce it somewhat as
even with the smaller NUMA systems that have hit the desktop this seems to
be a fair thing to do.
The philosophy I have taken is as follows:
1) If a zone is empty, don't tell, we don't need yet another line
telling us so. The information is available since one can look up
the fact how many zones were initialized in the first place.
2) Put as much information on a line is possible, if it can be done
in one line, rahter than two, then do it in one. I tried to format
the temperature stuff for easy reading.
Change show_free_areas() to not print lines for empty zones. If no zone
output is printed, the zone is empty. This reduces the number of lines
dumped to the console in sysrq on a large system by several thousand lines.
Change the zone temperature printouts to use one line per CPU instead of
two lines (one hot, one cold). On a 1024 CPU, 1024 node system, this
reduces the console output by over a million lines of output.
While this is a bigger problem on large NUMA systems, it is also applicable
to smaller desktop sized and mid range NUMA systems.
Old format:
Mem-info:
Node 0 DMA per-cpu:
cpu 0 hot: high 42, batch 7 used:24
cpu 0 cold: high 14, batch 3 used:1
cpu 1 hot: high 42, batch 7 used:34
cpu 1 cold: high 14, batch 3 used:0
cpu 2 hot: high 42, batch 7 used:0
cpu 2 cold: high 14, batch 3 used:0
cpu 3 hot: high 42, batch 7 used:0
cpu 3 cold: high 14, batch 3 used:0
cpu 4 hot: high 42, batch 7 used:0
cpu 4 cold: high 14, batch 3 used:0
cpu 5 hot: high 42, batch 7 used:0
cpu 5 cold: high 14, batch 3 used:0
cpu 6 hot: high 42, batch 7 used:0
cpu 6 cold: high 14, batch 3 used:0
cpu 7 hot: high 42, batch 7 used:0
cpu 7 cold: high 14, batch 3 used:0
Node 0 DMA32 per-cpu: empty
Node 0 Normal per-cpu: empty
Node 0 HighMem per-cpu: empty
Node 1 DMA per-cpu:
[snip]
Free pages: 5410688kB (0kB HighMem)
Active:9536 inactive:4261 dirty:6 writeback:0 unstable:0 free:338168 slab:1931 mapped:1900 pagetables:208
Node 0 DMA free:1676304kB min:3264kB low:4080kB high:4896kB active:128048kB inactive:61568kB present:1970880kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 0 DMA32 free:0kB min:0kB low:0kB high:0kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 0 Normal free:0kB min:0kB low:0kB high:0kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 0 HighMem free:0kB min:512kB low:512kB high:512kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 1 DMA free:1951728kB min:3280kB low:4096kB high:4912kB active:5632kB inactive:1504kB present:1982464kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
....
New format:
Mem-info:
Node 0 DMA per-cpu:
CPU 0: Hot: hi: 42, btch: 7 usd: 41 Cold: hi: 14, btch: 3 usd: 2
CPU 1: Hot: hi: 42, btch: 7 usd: 40 Cold: hi: 14, btch: 3 usd: 1
CPU 2: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 3: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 4: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 5: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 6: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 7: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
Node 1 DMA per-cpu:
[snip]
Free pages: 5411088kB (0kB HighMem)
Active:9558 inactive:4233 dirty:6 writeback:0 unstable:0 free:338193 slab:1942 mapped:1918 pagetables:208
Node 0 DMA free:1677648kB min:3264kB low:4080kB high:4896kB active:129296kB inactive:58864kB present:1970880kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 1 DMA free:1948448kB min:3280kB low:4096kB high:4912kB active:6864kB inactive:3536kB present:1982464kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Signed-off-by: Jes Sorensen <jes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:50:05 +08:00
|
|
|
int cpu;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct zone *zone;
|
|
|
|
|
|
|
|
for_each_zone(zone) {
|
[PATCH] Condense output of show_free_areas()
On larger systems, the amount of output dumped on the console when you do
SysRq-M is beyond insane. This patch is trying to reduce it somewhat as
even with the smaller NUMA systems that have hit the desktop this seems to
be a fair thing to do.
The philosophy I have taken is as follows:
1) If a zone is empty, don't tell, we don't need yet another line
telling us so. The information is available since one can look up
the fact how many zones were initialized in the first place.
2) Put as much information on a line is possible, if it can be done
in one line, rahter than two, then do it in one. I tried to format
the temperature stuff for easy reading.
Change show_free_areas() to not print lines for empty zones. If no zone
output is printed, the zone is empty. This reduces the number of lines
dumped to the console in sysrq on a large system by several thousand lines.
Change the zone temperature printouts to use one line per CPU instead of
two lines (one hot, one cold). On a 1024 CPU, 1024 node system, this
reduces the console output by over a million lines of output.
While this is a bigger problem on large NUMA systems, it is also applicable
to smaller desktop sized and mid range NUMA systems.
Old format:
Mem-info:
Node 0 DMA per-cpu:
cpu 0 hot: high 42, batch 7 used:24
cpu 0 cold: high 14, batch 3 used:1
cpu 1 hot: high 42, batch 7 used:34
cpu 1 cold: high 14, batch 3 used:0
cpu 2 hot: high 42, batch 7 used:0
cpu 2 cold: high 14, batch 3 used:0
cpu 3 hot: high 42, batch 7 used:0
cpu 3 cold: high 14, batch 3 used:0
cpu 4 hot: high 42, batch 7 used:0
cpu 4 cold: high 14, batch 3 used:0
cpu 5 hot: high 42, batch 7 used:0
cpu 5 cold: high 14, batch 3 used:0
cpu 6 hot: high 42, batch 7 used:0
cpu 6 cold: high 14, batch 3 used:0
cpu 7 hot: high 42, batch 7 used:0
cpu 7 cold: high 14, batch 3 used:0
Node 0 DMA32 per-cpu: empty
Node 0 Normal per-cpu: empty
Node 0 HighMem per-cpu: empty
Node 1 DMA per-cpu:
[snip]
Free pages: 5410688kB (0kB HighMem)
Active:9536 inactive:4261 dirty:6 writeback:0 unstable:0 free:338168 slab:1931 mapped:1900 pagetables:208
Node 0 DMA free:1676304kB min:3264kB low:4080kB high:4896kB active:128048kB inactive:61568kB present:1970880kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 0 DMA32 free:0kB min:0kB low:0kB high:0kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 0 Normal free:0kB min:0kB low:0kB high:0kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 0 HighMem free:0kB min:512kB low:512kB high:512kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 1 DMA free:1951728kB min:3280kB low:4096kB high:4912kB active:5632kB inactive:1504kB present:1982464kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
....
New format:
Mem-info:
Node 0 DMA per-cpu:
CPU 0: Hot: hi: 42, btch: 7 usd: 41 Cold: hi: 14, btch: 3 usd: 2
CPU 1: Hot: hi: 42, btch: 7 usd: 40 Cold: hi: 14, btch: 3 usd: 1
CPU 2: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 3: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 4: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 5: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 6: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 7: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
Node 1 DMA per-cpu:
[snip]
Free pages: 5411088kB (0kB HighMem)
Active:9558 inactive:4233 dirty:6 writeback:0 unstable:0 free:338193 slab:1942 mapped:1918 pagetables:208
Node 0 DMA free:1677648kB min:3264kB low:4080kB high:4896kB active:129296kB inactive:58864kB present:1970880kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 1 DMA free:1948448kB min:3280kB low:4096kB high:4912kB active:6864kB inactive:3536kB present:1982464kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Signed-off-by: Jes Sorensen <jes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:50:05 +08:00
|
|
|
if (!populated_zone(zone))
|
2005-04-17 06:20:36 +08:00
|
|
|
continue;
|
[PATCH] Condense output of show_free_areas()
On larger systems, the amount of output dumped on the console when you do
SysRq-M is beyond insane. This patch is trying to reduce it somewhat as
even with the smaller NUMA systems that have hit the desktop this seems to
be a fair thing to do.
The philosophy I have taken is as follows:
1) If a zone is empty, don't tell, we don't need yet another line
telling us so. The information is available since one can look up
the fact how many zones were initialized in the first place.
2) Put as much information on a line is possible, if it can be done
in one line, rahter than two, then do it in one. I tried to format
the temperature stuff for easy reading.
Change show_free_areas() to not print lines for empty zones. If no zone
output is printed, the zone is empty. This reduces the number of lines
dumped to the console in sysrq on a large system by several thousand lines.
Change the zone temperature printouts to use one line per CPU instead of
two lines (one hot, one cold). On a 1024 CPU, 1024 node system, this
reduces the console output by over a million lines of output.
While this is a bigger problem on large NUMA systems, it is also applicable
to smaller desktop sized and mid range NUMA systems.
Old format:
Mem-info:
Node 0 DMA per-cpu:
cpu 0 hot: high 42, batch 7 used:24
cpu 0 cold: high 14, batch 3 used:1
cpu 1 hot: high 42, batch 7 used:34
cpu 1 cold: high 14, batch 3 used:0
cpu 2 hot: high 42, batch 7 used:0
cpu 2 cold: high 14, batch 3 used:0
cpu 3 hot: high 42, batch 7 used:0
cpu 3 cold: high 14, batch 3 used:0
cpu 4 hot: high 42, batch 7 used:0
cpu 4 cold: high 14, batch 3 used:0
cpu 5 hot: high 42, batch 7 used:0
cpu 5 cold: high 14, batch 3 used:0
cpu 6 hot: high 42, batch 7 used:0
cpu 6 cold: high 14, batch 3 used:0
cpu 7 hot: high 42, batch 7 used:0
cpu 7 cold: high 14, batch 3 used:0
Node 0 DMA32 per-cpu: empty
Node 0 Normal per-cpu: empty
Node 0 HighMem per-cpu: empty
Node 1 DMA per-cpu:
[snip]
Free pages: 5410688kB (0kB HighMem)
Active:9536 inactive:4261 dirty:6 writeback:0 unstable:0 free:338168 slab:1931 mapped:1900 pagetables:208
Node 0 DMA free:1676304kB min:3264kB low:4080kB high:4896kB active:128048kB inactive:61568kB present:1970880kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 0 DMA32 free:0kB min:0kB low:0kB high:0kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 0 Normal free:0kB min:0kB low:0kB high:0kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 0 HighMem free:0kB min:512kB low:512kB high:512kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 1 DMA free:1951728kB min:3280kB low:4096kB high:4912kB active:5632kB inactive:1504kB present:1982464kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
....
New format:
Mem-info:
Node 0 DMA per-cpu:
CPU 0: Hot: hi: 42, btch: 7 usd: 41 Cold: hi: 14, btch: 3 usd: 2
CPU 1: Hot: hi: 42, btch: 7 usd: 40 Cold: hi: 14, btch: 3 usd: 1
CPU 2: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 3: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 4: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 5: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 6: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 7: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
Node 1 DMA per-cpu:
[snip]
Free pages: 5411088kB (0kB HighMem)
Active:9558 inactive:4233 dirty:6 writeback:0 unstable:0 free:338193 slab:1942 mapped:1918 pagetables:208
Node 0 DMA free:1677648kB min:3264kB low:4080kB high:4896kB active:129296kB inactive:58864kB present:1970880kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 1 DMA free:1948448kB min:3280kB low:4096kB high:4912kB active:6864kB inactive:3536kB present:1982464kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Signed-off-by: Jes Sorensen <jes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:50:05 +08:00
|
|
|
|
|
|
|
show_node(zone);
|
|
|
|
printk("%s per-cpu:\n", zone->name);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-11-11 04:45:56 +08:00
|
|
|
for_each_online_cpu(cpu) {
|
2005-04-17 06:20:36 +08:00
|
|
|
struct per_cpu_pageset *pageset;
|
|
|
|
|
2005-06-22 08:14:47 +08:00
|
|
|
pageset = zone_pcp(zone, cpu);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-02-05 14:29:19 +08:00
|
|
|
printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
|
|
|
|
cpu, pageset->pcp.high,
|
|
|
|
pageset->pcp.batch, pageset->pcp.count);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2007-02-09 06:20:40 +08:00
|
|
|
printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n"
|
2007-02-10 17:43:02 +08:00
|
|
|
" free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
|
2007-02-10 17:43:05 +08:00
|
|
|
global_page_state(NR_ACTIVE),
|
|
|
|
global_page_state(NR_INACTIVE),
|
2006-06-30 16:55:39 +08:00
|
|
|
global_page_state(NR_FILE_DIRTY),
|
2006-06-30 16:55:40 +08:00
|
|
|
global_page_state(NR_WRITEBACK),
|
2006-06-30 16:55:40 +08:00
|
|
|
global_page_state(NR_UNSTABLE_NFS),
|
2007-02-10 17:43:02 +08:00
|
|
|
global_page_state(NR_FREE_PAGES),
|
2006-09-26 14:31:51 +08:00
|
|
|
global_page_state(NR_SLAB_RECLAIMABLE) +
|
|
|
|
global_page_state(NR_SLAB_UNRECLAIMABLE),
|
2006-06-30 16:55:34 +08:00
|
|
|
global_page_state(NR_FILE_MAPPED),
|
2007-02-09 06:20:40 +08:00
|
|
|
global_page_state(NR_PAGETABLE),
|
|
|
|
global_page_state(NR_BOUNCE));
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
for_each_zone(zone) {
|
|
|
|
int i;
|
|
|
|
|
[PATCH] Condense output of show_free_areas()
On larger systems, the amount of output dumped on the console when you do
SysRq-M is beyond insane. This patch is trying to reduce it somewhat as
even with the smaller NUMA systems that have hit the desktop this seems to
be a fair thing to do.
The philosophy I have taken is as follows:
1) If a zone is empty, don't tell, we don't need yet another line
telling us so. The information is available since one can look up
the fact how many zones were initialized in the first place.
2) Put as much information on a line is possible, if it can be done
in one line, rahter than two, then do it in one. I tried to format
the temperature stuff for easy reading.
Change show_free_areas() to not print lines for empty zones. If no zone
output is printed, the zone is empty. This reduces the number of lines
dumped to the console in sysrq on a large system by several thousand lines.
Change the zone temperature printouts to use one line per CPU instead of
two lines (one hot, one cold). On a 1024 CPU, 1024 node system, this
reduces the console output by over a million lines of output.
While this is a bigger problem on large NUMA systems, it is also applicable
to smaller desktop sized and mid range NUMA systems.
Old format:
Mem-info:
Node 0 DMA per-cpu:
cpu 0 hot: high 42, batch 7 used:24
cpu 0 cold: high 14, batch 3 used:1
cpu 1 hot: high 42, batch 7 used:34
cpu 1 cold: high 14, batch 3 used:0
cpu 2 hot: high 42, batch 7 used:0
cpu 2 cold: high 14, batch 3 used:0
cpu 3 hot: high 42, batch 7 used:0
cpu 3 cold: high 14, batch 3 used:0
cpu 4 hot: high 42, batch 7 used:0
cpu 4 cold: high 14, batch 3 used:0
cpu 5 hot: high 42, batch 7 used:0
cpu 5 cold: high 14, batch 3 used:0
cpu 6 hot: high 42, batch 7 used:0
cpu 6 cold: high 14, batch 3 used:0
cpu 7 hot: high 42, batch 7 used:0
cpu 7 cold: high 14, batch 3 used:0
Node 0 DMA32 per-cpu: empty
Node 0 Normal per-cpu: empty
Node 0 HighMem per-cpu: empty
Node 1 DMA per-cpu:
[snip]
Free pages: 5410688kB (0kB HighMem)
Active:9536 inactive:4261 dirty:6 writeback:0 unstable:0 free:338168 slab:1931 mapped:1900 pagetables:208
Node 0 DMA free:1676304kB min:3264kB low:4080kB high:4896kB active:128048kB inactive:61568kB present:1970880kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 0 DMA32 free:0kB min:0kB low:0kB high:0kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 0 Normal free:0kB min:0kB low:0kB high:0kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 0 HighMem free:0kB min:512kB low:512kB high:512kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 1 DMA free:1951728kB min:3280kB low:4096kB high:4912kB active:5632kB inactive:1504kB present:1982464kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
....
New format:
Mem-info:
Node 0 DMA per-cpu:
CPU 0: Hot: hi: 42, btch: 7 usd: 41 Cold: hi: 14, btch: 3 usd: 2
CPU 1: Hot: hi: 42, btch: 7 usd: 40 Cold: hi: 14, btch: 3 usd: 1
CPU 2: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 3: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 4: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 5: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 6: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 7: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
Node 1 DMA per-cpu:
[snip]
Free pages: 5411088kB (0kB HighMem)
Active:9558 inactive:4233 dirty:6 writeback:0 unstable:0 free:338193 slab:1942 mapped:1918 pagetables:208
Node 0 DMA free:1677648kB min:3264kB low:4080kB high:4896kB active:129296kB inactive:58864kB present:1970880kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 1 DMA free:1948448kB min:3280kB low:4096kB high:4912kB active:6864kB inactive:3536kB present:1982464kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Signed-off-by: Jes Sorensen <jes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:50:05 +08:00
|
|
|
if (!populated_zone(zone))
|
|
|
|
continue;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
show_node(zone);
|
|
|
|
printk("%s"
|
|
|
|
" free:%lukB"
|
|
|
|
" min:%lukB"
|
|
|
|
" low:%lukB"
|
|
|
|
" high:%lukB"
|
|
|
|
" active:%lukB"
|
|
|
|
" inactive:%lukB"
|
|
|
|
" present:%lukB"
|
|
|
|
" pages_scanned:%lu"
|
|
|
|
" all_unreclaimable? %s"
|
|
|
|
"\n",
|
|
|
|
zone->name,
|
2007-02-10 17:43:02 +08:00
|
|
|
K(zone_page_state(zone, NR_FREE_PAGES)),
|
2005-04-17 06:20:36 +08:00
|
|
|
K(zone->pages_min),
|
|
|
|
K(zone->pages_low),
|
|
|
|
K(zone->pages_high),
|
2007-02-10 17:43:01 +08:00
|
|
|
K(zone_page_state(zone, NR_ACTIVE)),
|
|
|
|
K(zone_page_state(zone, NR_INACTIVE)),
|
2005-04-17 06:20:36 +08:00
|
|
|
K(zone->present_pages),
|
|
|
|
zone->pages_scanned,
|
2007-10-17 14:25:54 +08:00
|
|
|
(zone_is_all_unreclaimable(zone) ? "yes" : "no")
|
2005-04-17 06:20:36 +08:00
|
|
|
);
|
|
|
|
printk("lowmem_reserve[]:");
|
|
|
|
for (i = 0; i < MAX_NR_ZONES; i++)
|
|
|
|
printk(" %lu", zone->lowmem_reserve[i]);
|
|
|
|
printk("\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
for_each_zone(zone) {
|
2006-06-23 17:03:50 +08:00
|
|
|
unsigned long nr[MAX_ORDER], flags, order, total = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
[PATCH] Condense output of show_free_areas()
On larger systems, the amount of output dumped on the console when you do
SysRq-M is beyond insane. This patch is trying to reduce it somewhat as
even with the smaller NUMA systems that have hit the desktop this seems to
be a fair thing to do.
The philosophy I have taken is as follows:
1) If a zone is empty, don't tell, we don't need yet another line
telling us so. The information is available since one can look up
the fact how many zones were initialized in the first place.
2) Put as much information on a line is possible, if it can be done
in one line, rahter than two, then do it in one. I tried to format
the temperature stuff for easy reading.
Change show_free_areas() to not print lines for empty zones. If no zone
output is printed, the zone is empty. This reduces the number of lines
dumped to the console in sysrq on a large system by several thousand lines.
Change the zone temperature printouts to use one line per CPU instead of
two lines (one hot, one cold). On a 1024 CPU, 1024 node system, this
reduces the console output by over a million lines of output.
While this is a bigger problem on large NUMA systems, it is also applicable
to smaller desktop sized and mid range NUMA systems.
Old format:
Mem-info:
Node 0 DMA per-cpu:
cpu 0 hot: high 42, batch 7 used:24
cpu 0 cold: high 14, batch 3 used:1
cpu 1 hot: high 42, batch 7 used:34
cpu 1 cold: high 14, batch 3 used:0
cpu 2 hot: high 42, batch 7 used:0
cpu 2 cold: high 14, batch 3 used:0
cpu 3 hot: high 42, batch 7 used:0
cpu 3 cold: high 14, batch 3 used:0
cpu 4 hot: high 42, batch 7 used:0
cpu 4 cold: high 14, batch 3 used:0
cpu 5 hot: high 42, batch 7 used:0
cpu 5 cold: high 14, batch 3 used:0
cpu 6 hot: high 42, batch 7 used:0
cpu 6 cold: high 14, batch 3 used:0
cpu 7 hot: high 42, batch 7 used:0
cpu 7 cold: high 14, batch 3 used:0
Node 0 DMA32 per-cpu: empty
Node 0 Normal per-cpu: empty
Node 0 HighMem per-cpu: empty
Node 1 DMA per-cpu:
[snip]
Free pages: 5410688kB (0kB HighMem)
Active:9536 inactive:4261 dirty:6 writeback:0 unstable:0 free:338168 slab:1931 mapped:1900 pagetables:208
Node 0 DMA free:1676304kB min:3264kB low:4080kB high:4896kB active:128048kB inactive:61568kB present:1970880kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 0 DMA32 free:0kB min:0kB low:0kB high:0kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 0 Normal free:0kB min:0kB low:0kB high:0kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 0 HighMem free:0kB min:512kB low:512kB high:512kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 1 DMA free:1951728kB min:3280kB low:4096kB high:4912kB active:5632kB inactive:1504kB present:1982464kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
....
New format:
Mem-info:
Node 0 DMA per-cpu:
CPU 0: Hot: hi: 42, btch: 7 usd: 41 Cold: hi: 14, btch: 3 usd: 2
CPU 1: Hot: hi: 42, btch: 7 usd: 40 Cold: hi: 14, btch: 3 usd: 1
CPU 2: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 3: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 4: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 5: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 6: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
CPU 7: Hot: hi: 42, btch: 7 usd: 0 Cold: hi: 14, btch: 3 usd: 0
Node 1 DMA per-cpu:
[snip]
Free pages: 5411088kB (0kB HighMem)
Active:9558 inactive:4233 dirty:6 writeback:0 unstable:0 free:338193 slab:1942 mapped:1918 pagetables:208
Node 0 DMA free:1677648kB min:3264kB low:4080kB high:4896kB active:129296kB inactive:58864kB present:1970880kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Node 1 DMA free:1948448kB min:3280kB low:4096kB high:4912kB active:6864kB inactive:3536kB present:1982464kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
Signed-off-by: Jes Sorensen <jes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:50:05 +08:00
|
|
|
if (!populated_zone(zone))
|
|
|
|
continue;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
show_node(zone);
|
|
|
|
printk("%s: ", zone->name);
|
|
|
|
|
|
|
|
spin_lock_irqsave(&zone->lock, flags);
|
|
|
|
for (order = 0; order < MAX_ORDER; order++) {
|
2006-06-23 17:03:50 +08:00
|
|
|
nr[order] = zone->free_area[order].nr_free;
|
|
|
|
total += nr[order] << order;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
spin_unlock_irqrestore(&zone->lock, flags);
|
2006-06-23 17:03:50 +08:00
|
|
|
for (order = 0; order < MAX_ORDER; order++)
|
|
|
|
printk("%lu*%lukB ", nr[order], K(1UL) << order);
|
2005-04-17 06:20:36 +08:00
|
|
|
printk("= %lukB\n", K(total));
|
|
|
|
}
|
|
|
|
|
2008-02-05 14:29:30 +08:00
|
|
|
printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
show_swap_cache_info();
|
|
|
|
}
|
|
|
|
|
2008-04-28 17:12:18 +08:00
|
|
|
static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
|
|
|
|
{
|
|
|
|
zoneref->zone = zone;
|
|
|
|
zoneref->zone_idx = zone_idx(zone);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Builds allocation fallback zone lists.
|
2006-01-06 16:11:16 +08:00
|
|
|
*
|
|
|
|
* Add all populated zones of a node to the zonelist.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2007-07-16 14:38:01 +08:00
|
|
|
static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
|
|
|
|
int nr_zones, enum zone_type zone_type)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2006-01-06 16:11:16 +08:00
|
|
|
struct zone *zone;
|
|
|
|
|
2006-09-26 14:31:12 +08:00
|
|
|
BUG_ON(zone_type >= MAX_NR_ZONES);
|
2006-09-26 14:31:18 +08:00
|
|
|
zone_type++;
|
2006-01-06 16:11:18 +08:00
|
|
|
|
|
|
|
do {
|
2006-09-26 14:31:18 +08:00
|
|
|
zone_type--;
|
2006-01-06 16:11:19 +08:00
|
|
|
zone = pgdat->node_zones + zone_type;
|
2006-01-06 16:11:16 +08:00
|
|
|
if (populated_zone(zone)) {
|
2008-04-28 17:12:17 +08:00
|
|
|
zoneref_set_zone(zone,
|
|
|
|
&zonelist->_zonerefs[nr_zones++]);
|
2006-01-06 16:11:19 +08:00
|
|
|
check_highest_zone(zone_type);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2006-01-06 16:11:18 +08:00
|
|
|
|
2006-09-26 14:31:18 +08:00
|
|
|
} while (zone_type);
|
2006-01-06 16:11:19 +08:00
|
|
|
return nr_zones;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2007-07-16 14:38:01 +08:00
|
|
|
|
|
|
|
/*
 * Zonelist ordering modes:
 *  ZONELIST_ORDER_DEFAULT (0) - automatic detection of the better ordering
 *  ZONELIST_ORDER_NODE    (1) - order by ([node] distance, -zonetype)
 *  ZONELIST_ORDER_ZONE    (2) - order by (-zonetype, [node] distance)
 *
 * Without NUMA, the NODE and ZONE orderings produce the same zonelist,
 * so the mode is only configurable on NUMA.
 */
#define ZONELIST_ORDER_DEFAULT	0
#define ZONELIST_ORDER_NODE	1
#define ZONELIST_ORDER_ZONE	2

/*
 * Ordering currently used by the kernel.
 * set_zonelist_order() will set this to NODE or ZONE.
 */
static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;

/* Printable name of each ordering mode, indexed by ZONELIST_ORDER_*. */
static char zonelist_order_name[3][8] = {
	[ZONELIST_ORDER_DEFAULT]	= "Default",
	[ZONELIST_ORDER_NODE]		= "Node",
	[ZONELIST_ORDER_ZONE]		= "Zone",
};
|
|
|
|
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifdef CONFIG_NUMA
|
2007-07-16 14:38:01 +08:00
|
|
|
/* Ordering requested by the user; stays at the default until configured */
|
|
|
|
static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
|
|
|
|
/*
 * String for the sysctl.  The buffer is sized by NUMA_ZONELIST_ORDER_LEN so
 * that it stays in step with code that copies it using the same constant.
 */
#define NUMA_ZONELIST_ORDER_LEN	16
char numa_zonelist_order[NUMA_ZONELIST_ORDER_LEN] = "default";
|
|
|
|
|
|
|
|
/*
|
|
|
|
* interface for configure zonelist ordering.
|
|
|
|
* command line option "numa_zonelist_order"
|
|
|
|
* = "[dD]efault - default, automatic configuration.
|
|
|
|
* = "[nN]ode - order by node locality, then by zone within node
|
|
|
|
* = "[zZ]one - order by zone, then by locality within zone
|
|
|
|
*/
|
|
|
|
|
|
|
|
static int __parse_numa_zonelist_order(char *s)
|
|
|
|
{
|
|
|
|
if (*s == 'd' || *s == 'D') {
|
|
|
|
user_zonelist_order = ZONELIST_ORDER_DEFAULT;
|
|
|
|
} else if (*s == 'n' || *s == 'N') {
|
|
|
|
user_zonelist_order = ZONELIST_ORDER_NODE;
|
|
|
|
} else if (*s == 'z' || *s == 'Z') {
|
|
|
|
user_zonelist_order = ZONELIST_ORDER_ZONE;
|
|
|
|
} else {
|
|
|
|
printk(KERN_WARNING
|
|
|
|
"Ignoring invalid numa_zonelist_order value: "
|
|
|
|
"%s\n", s);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static __init int setup_numa_zonelist_order(char *s)
|
|
|
|
{
|
|
|
|
if (s)
|
|
|
|
return __parse_numa_zonelist_order(s);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
early_param("numa_zonelist_order", setup_numa_zonelist_order);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* sysctl handler for numa_zonelist_order
|
|
|
|
*/
|
|
|
|
int numa_zonelist_order_handler(ctl_table *table, int write,
|
|
|
|
struct file *file, void __user *buffer, size_t *length,
|
|
|
|
loff_t *ppos)
|
|
|
|
{
|
|
|
|
char saved_string[NUMA_ZONELIST_ORDER_LEN];
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (write)
|
|
|
|
strncpy(saved_string, (char*)table->data,
|
|
|
|
NUMA_ZONELIST_ORDER_LEN);
|
|
|
|
ret = proc_dostring(table, write, file, buffer, length, ppos);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
if (write) {
|
|
|
|
int oldval = user_zonelist_order;
|
|
|
|
if (__parse_numa_zonelist_order((char*)table->data)) {
|
|
|
|
/*
|
|
|
|
* bogus value. restore saved string
|
|
|
|
*/
|
|
|
|
strncpy((char*)table->data, saved_string,
|
|
|
|
NUMA_ZONELIST_ORDER_LEN);
|
|
|
|
user_zonelist_order = oldval;
|
|
|
|
} else if (oldval != user_zonelist_order)
|
|
|
|
build_all_zonelists();
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Scale factor (with MAX_NUMNODES) applied to a node's score before node_load is added. */
#define MAX_NODE_LOAD (num_online_nodes())
|
2007-07-16 14:38:01 +08:00
|
|
|
/* Per-node load value, added as the low-order term of a node's score in find_next_best_node(). */
static int node_load[MAX_NUMNODES];
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/**
|
2005-05-01 23:59:25 +08:00
|
|
|
* find_next_best_node - find the next node that should appear in a given node's fallback list
|
2005-04-17 06:20:36 +08:00
|
|
|
* @node: node whose fallback list we're appending
|
|
|
|
* @used_node_mask: nodemask_t of already used nodes
|
|
|
|
*
|
|
|
|
* We use a number of factors to determine which is the next node that should
|
|
|
|
* appear on a given node's fallback list. The node should not have appeared
|
|
|
|
* already in @node's fallback list, and it should be the next closest node
|
|
|
|
* according to the distance array (which contains arbitrary distance values
|
|
|
|
* from each node to each node in the system), and should also prefer nodes
|
|
|
|
* with no CPUs, since presumably they'll have very little allocation pressure
|
|
|
|
* on them otherwise.
|
|
|
|
* It returns -1 if no node is found.
|
|
|
|
*/
|
2007-07-16 14:38:01 +08:00
|
|
|
static int find_next_best_node(int node, nodemask_t *used_node_mask)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2006-02-18 03:38:21 +08:00
|
|
|
int n, val;
|
2005-04-17 06:20:36 +08:00
|
|
|
int min_val = INT_MAX;
|
|
|
|
int best_node = -1;
|
2008-04-05 09:11:10 +08:00
|
|
|
node_to_cpumask_ptr(tmp, 0);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-02-18 03:38:21 +08:00
|
|
|
/* Use the local node if we haven't already */
|
|
|
|
if (!node_isset(node, *used_node_mask)) {
|
|
|
|
node_set(node, *used_node_mask);
|
|
|
|
return node;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-10-16 16:25:39 +08:00
|
|
|
for_each_node_state(n, N_HIGH_MEMORY) {
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Don't want a node to appear more than once */
|
|
|
|
if (node_isset(n, *used_node_mask))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/* Use the distance array to find the distance */
|
|
|
|
val = node_distance(node, n);
|
|
|
|
|
2006-02-18 03:38:21 +08:00
|
|
|
/* Penalize nodes under us ("prefer the next node") */
|
|
|
|
val += (n < node);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Give preference to headless and unused nodes */
|
2008-04-05 09:11:10 +08:00
|
|
|
node_to_cpumask_ptr_next(tmp, n);
|
|
|
|
if (!cpus_empty(*tmp))
|
2005-04-17 06:20:36 +08:00
|
|
|
val += PENALTY_FOR_NODE_WITH_CPUS;
|
|
|
|
|
|
|
|
/* Slight preference for less loaded node */
|
|
|
|
val *= (MAX_NODE_LOAD*MAX_NUMNODES);
|
|
|
|
val += node_load[n];
|
|
|
|
|
|
|
|
if (val < min_val) {
|
|
|
|
min_val = val;
|
|
|
|
best_node = n;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (best_node >= 0)
|
|
|
|
node_set(best_node, *used_node_mask);
|
|
|
|
|
|
|
|
return best_node;
|
|
|
|
}
|
|
|
|
|
2007-07-16 14:38:01 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Build zonelists ordered by node and zones within node.
|
|
|
|
* This results in maximum locality--normal zone overflows into local
|
|
|
|
* DMA zone, if any--but risks exhausting DMA zone.
|
|
|
|
*/
|
|
|
|
static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2007-07-16 14:38:01 +08:00
|
|
|
int j;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct zonelist *zonelist;
|
2007-07-16 14:38:01 +08:00
|
|
|
|
2008-04-28 17:12:16 +08:00
|
|
|
zonelist = &pgdat->node_zonelists[0];
|
2008-04-28 17:12:17 +08:00
|
|
|
for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
|
2008-04-28 17:12:16 +08:00
|
|
|
;
|
|
|
|
j = build_zonelists_node(NODE_DATA(node), zonelist, j,
|
|
|
|
MAX_NR_ZONES - 1);
|
2008-04-28 17:12:17 +08:00
|
|
|
zonelist->_zonerefs[j].zone = NULL;
|
|
|
|
zonelist->_zonerefs[j].zone_idx = 0;
|
2007-07-16 14:38:01 +08:00
|
|
|
}
|
|
|
|
|
2007-10-16 16:25:37 +08:00
|
|
|
/*
|
|
|
|
* Build gfp_thisnode zonelists
|
|
|
|
*/
|
|
|
|
static void build_thisnode_zonelists(pg_data_t *pgdat)
|
|
|
|
{
|
|
|
|
int j;
|
|
|
|
struct zonelist *zonelist;
|
|
|
|
|
2008-04-28 17:12:16 +08:00
|
|
|
zonelist = &pgdat->node_zonelists[1];
|
|
|
|
j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
|
2008-04-28 17:12:17 +08:00
|
|
|
zonelist->_zonerefs[j].zone = NULL;
|
|
|
|
zonelist->_zonerefs[j].zone_idx = 0;
|
2007-10-16 16:25:37 +08:00
|
|
|
}
|
|
|
|
|
2007-07-16 14:38:01 +08:00
|
|
|
/*
|
|
|
|
* Build zonelists ordered by zone and nodes within zones.
|
|
|
|
* This results in conserving DMA zone[s] until all Normal memory is
|
|
|
|
* exhausted, but results in overflowing to remote node while memory
|
|
|
|
* may still exist in local DMA zone.
|
|
|
|
*/
|
|
|
|
static int node_order[MAX_NUMNODES];
|
|
|
|
|
|
|
|
static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
|
|
|
|
{
|
|
|
|
int pos, j, node;
|
|
|
|
int zone_type; /* needs to be signed */
|
|
|
|
struct zone *z;
|
|
|
|
struct zonelist *zonelist;
|
|
|
|
|
2008-04-28 17:12:16 +08:00
|
|
|
zonelist = &pgdat->node_zonelists[0];
|
|
|
|
pos = 0;
|
|
|
|
for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
|
|
|
|
for (j = 0; j < nr_nodes; j++) {
|
|
|
|
node = node_order[j];
|
|
|
|
z = &NODE_DATA(node)->node_zones[zone_type];
|
|
|
|
if (populated_zone(z)) {
|
2008-04-28 17:12:17 +08:00
|
|
|
zoneref_set_zone(z,
|
|
|
|
&zonelist->_zonerefs[pos++]);
|
2008-04-28 17:12:16 +08:00
|
|
|
check_highest_zone(zone_type);
|
2007-07-16 14:38:01 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2008-04-28 17:12:17 +08:00
|
|
|
zonelist->_zonerefs[pos].zone = NULL;
|
|
|
|
zonelist->_zonerefs[pos].zone_idx = 0;
|
2007-07-16 14:38:01 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int default_zonelist_order(void)
|
|
|
|
{
|
|
|
|
int nid, zone_type;
|
|
|
|
unsigned long low_kmem_size,total_size;
|
|
|
|
struct zone *z;
|
|
|
|
int average_size;
|
|
|
|
/*
|
|
|
|
* ZONE_DMA and ZONE_DMA32 can be very small area in the sytem.
|
|
|
|
* If they are really small and used heavily, the system can fall
|
|
|
|
* into OOM very easily.
|
|
|
|
* This function detect ZONE_DMA/DMA32 size and confgigures zone order.
|
|
|
|
*/
|
|
|
|
/* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
|
|
|
|
low_kmem_size = 0;
|
|
|
|
total_size = 0;
|
|
|
|
for_each_online_node(nid) {
|
|
|
|
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
|
|
|
|
z = &NODE_DATA(nid)->node_zones[zone_type];
|
|
|
|
if (populated_zone(z)) {
|
|
|
|
if (zone_type < ZONE_NORMAL)
|
|
|
|
low_kmem_size += z->present_pages;
|
|
|
|
total_size += z->present_pages;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!low_kmem_size || /* there are no DMA area. */
|
|
|
|
low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
|
|
|
|
return ZONELIST_ORDER_NODE;
|
|
|
|
/*
|
|
|
|
* look into each node's config.
|
|
|
|
* If there is a node whose DMA/DMA32 memory is very big area on
|
|
|
|
* local memory, NODE_ORDER may be suitable.
|
|
|
|
*/
|
2007-10-16 16:25:39 +08:00
|
|
|
average_size = total_size /
|
|
|
|
(nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
|
2007-07-16 14:38:01 +08:00
|
|
|
for_each_online_node(nid) {
|
|
|
|
low_kmem_size = 0;
|
|
|
|
total_size = 0;
|
|
|
|
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
|
|
|
|
z = &NODE_DATA(nid)->node_zones[zone_type];
|
|
|
|
if (populated_zone(z)) {
|
|
|
|
if (zone_type < ZONE_NORMAL)
|
|
|
|
low_kmem_size += z->present_pages;
|
|
|
|
total_size += z->present_pages;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (low_kmem_size &&
|
|
|
|
total_size > average_size && /* ignore small node */
|
|
|
|
low_kmem_size > total_size * 70/100)
|
|
|
|
return ZONELIST_ORDER_NODE;
|
|
|
|
}
|
|
|
|
return ZONELIST_ORDER_ZONE;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void set_zonelist_order(void)
|
|
|
|
{
|
|
|
|
if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
|
|
|
|
current_zonelist_order = default_zonelist_order();
|
|
|
|
else
|
|
|
|
current_zonelist_order = user_zonelist_order;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Build all zonelists of @pgdat.
 *
 * Every zonelist is emptied first.  Nodes are then taken one at a time in
 * the order chosen by find_next_best_node(), starting from pgdat's own
 * node.  Depending on current_zonelist_order, each node is either appended
 * immediately (node order) or remembered in node_order[] so the list can be
 * built by zone afterwards (zone order).  The gfp_thisnode zonelist is
 * built last.
 *
 * Side effects visible here: node_load[] and node_order[] are rewritten,
 * and zone_reclaim_mode is switched on as soon as any node lies further
 * away than RECLAIM_DISTANCE.
 *
 * NOTE(review): node_load[] is cleared on every call, so the penalties set
 * while building one pgdat's list are not seen when the next pgdat is
 * processed -- confirm with the caller that this is intended.
 */
static void build_zonelists(pg_data_t *pgdat)
{
	int nr_ordered, node, load;
	enum zone_type idx;
	nodemask_t used_mask;
	int local_node, prev_node;
	int order = current_zonelist_order;

	/* Start with every zonelist empty */
	for (idx = 0; idx < MAX_ZONELISTS; idx++) {
		struct zonelist *zonelist = &pgdat->node_zonelists[idx];

		zonelist->_zonerefs[0].zone = NULL;
		zonelist->_zonerefs[0].zone_idx = 0;
	}

	/* NUMA-aware ordering of nodes */
	local_node = pgdat->node_id;
	load = num_online_nodes();
	prev_node = local_node;
	nodes_clear(used_mask);

	memset(node_load, 0, sizeof(node_load));
	memset(node_order, 0, sizeof(node_order));
	nr_ordered = 0;

	for (;;) {
		int distance;

		node = find_next_best_node(local_node, &used_mask);
		if (node < 0)
			break;

		distance = node_distance(local_node, node);

		/*
		 * Once some node is sufficiently far away it is better
		 * to reclaim pages in a zone before going off node.
		 */
		if (distance > RECLAIM_DISTANCE)
			zone_reclaim_mode = 1;

		/*
		 * Avoid putting pressure on one particular node: the
		 * first node of each same-distance group is given a
		 * penalty, which makes the selection round-robin.
		 */
		if (node_distance(local_node, prev_node) != distance)
			node_load[node] = load;

		prev_node = node;
		load--;

		if (order != ZONELIST_ORDER_NODE) {
			node_order[nr_ordered++] = node; /* remember order */
			continue;
		}
		build_zonelists_in_node_order(pgdat, node);
	}

	/* calculate node order -- i.e., DMA last! */
	if (order == ZONELIST_ORDER_ZONE)
		build_zonelists_in_zone_order(pgdat, nr_ordered);

	build_thisnode_zonelists(pgdat);
}
|
|
|
|
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cache line's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten per-cent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
/* Construct the zonelist performance cache - see further mmzone.h */
|
2007-07-16 14:38:01 +08:00
|
|
|
static void build_zonelist_cache(pg_data_t *pgdat)
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cache line's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten per-cent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
{
|
2008-04-28 17:12:16 +08:00
|
|
|
struct zonelist *zonelist;
|
|
|
|
struct zonelist_cache *zlc;
|
2008-04-28 17:12:17 +08:00
|
|
|
struct zoneref *z;
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cache line's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory, forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten per-cent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
|
2008-04-28 17:12:16 +08:00
|
|
|
zonelist = &pgdat->node_zonelists[0];
|
|
|
|
zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
|
|
|
|
bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
|
2008-04-28 17:12:17 +08:00
|
|
|
for (z = zonelist->_zonerefs; z->zone; z++)
|
|
|
|
zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cache line's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory, forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten per-cent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
}
|
|
|
|
|
2007-07-16 14:38:01 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#else /* CONFIG_NUMA */
|
|
|
|
|
2007-07-16 14:38:01 +08:00
|
|
|
/*
 * Variant used when CONFIG_NUMA is not set: unconditionally records
 * ZONELIST_ORDER_ZONE in current_zonelist_order.
 */
static void set_zonelist_order(void)
|
|
|
|
{
|
|
|
|
current_zonelist_order = ZONELIST_ORDER_ZONE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Build pgdat->node_zonelists[0] for the variant used when CONFIG_NUMA
 * is not set.
 *
 * The list is filled in three passes: first the zones of pgdat itself,
 * then the zones of every online node with an id above pgdat->node_id,
 * and finally the zones of every online node with an id below it. The
 * entry following the last zone is then cleared (zone = NULL,
 * zone_idx = 0).
 *
 * NOTE(review): build_zonelists_node() is defined outside this view; it
 * appears to add the given node's zones starting at index j and to
 * return the next free index -- confirm against its definition.
 */
static void build_zonelists(pg_data_t *pgdat)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2006-09-26 14:31:19 +08:00
|
|
|
int node, local_node;
|
2008-04-28 17:12:16 +08:00
|
|
|
enum zone_type j;
|
|
|
|
struct zonelist *zonelist;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
|
local_node = pgdat->node_id;
|
|
|
|
|
2008-04-28 17:12:16 +08:00
|
|
|
zonelist = &pgdat->node_zonelists[0];
|
|
|
|
/* First pass: the zones of pgdat itself, starting at index 0. */
j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-04-28 17:12:16 +08:00
|
|
|
/*
|
|
|
|
* Now we build the zonelist so that it contains the zones
|
|
|
|
* of all the other nodes.
|
|
|
|
* We don't want to pressure a particular node, so when
|
|
|
|
* building the zones for node N, we make sure that the
|
|
|
|
* zones coming right after the local ones are those from
|
|
|
|
* node N+1 (modulo N)
|
|
|
|
*/
|
|
|
|
for (node = local_node + 1; node < MAX_NUMNODES; node++) {
|
|
|
|
if (!node_online(node))
|
|
|
|
continue;
|
|
|
|
j = build_zonelists_node(NODE_DATA(node), zonelist, j,
|
|
|
|
MAX_NR_ZONES - 1);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2008-04-28 17:12:16 +08:00
|
|
|
/* Wrap around: online nodes with ids below local_node come last. */
for (node = 0; node < local_node; node++) {
|
|
|
|
if (!node_online(node))
|
|
|
|
continue;
|
|
|
|
j = build_zonelists_node(NODE_DATA(node), zonelist, j,
|
|
|
|
MAX_NR_ZONES - 1);
|
|
|
|
}
|
|
|
|
|
2008-04-28 17:12:17 +08:00
|
|
|
/* Clear the entry at index j, just past the last zone added. */
zonelist->_zonerefs[j].zone = NULL;
|
|
|
|
zonelist->_zonerefs[j].zone_idx = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cache line's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory, forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten per-cent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
|
2007-07-16 14:38:01 +08:00
|
|
|
static void build_zonelist_cache(pg_data_t *pgdat)
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cache line's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten per-cent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
{
|
2008-04-28 17:12:16 +08:00
|
|
|
pgdat->node_zonelists[0].zlcache_ptr = NULL;
|
|
|
|
pgdat->node_zonelists[1].zlcache_ptr = NULL;
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cache line's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten per-cent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif /* CONFIG_NUMA */
|
|
|
|
|
2006-06-23 17:03:11 +08:00
|
|
|
/* The int return value exists only to satisfy stop_machine_run(). */
|
2007-07-16 14:38:01 +08:00
|
|
|
static int __build_all_zonelists(void *dummy)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2006-06-23 17:03:11 +08:00
|
|
|
int nid;
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cache line's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten per-cent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
|
|
|
|
for_each_online_node(nid) {
|
2007-10-16 16:25:29 +08:00
|
|
|
pg_data_t *pgdat = NODE_DATA(nid);
|
|
|
|
|
|
|
|
build_zonelists(pgdat);
|
|
|
|
build_zonelist_cache(pgdat);
|
[PATCH] memory page_alloc zonelist caching speedup
Optimize the critical zonelist scanning for free pages in the kernel memory
allocator by caching the zones that were found to be full recently, and
skipping them.
Remembers the zones in a zonelist that were short of free memory in the
last second. And it stashes a zone-to-node table in the zonelist struct,
to optimize that conversion (minimize its cache footprint.)
Recent changes:
This differs in a significant way from a similar patch that I
posted a week ago. Now, instead of having a nodemask_t of
recently full nodes, I have a bitmask of recently full zones.
This solves a problem that last week's patch had, which on
systems with multiple zones per node (such as DMA zone) would
take seeing any of these zones full as meaning that all zones
on that node were full.
Also I changed names - from "zonelist faster" to "zonelist cache",
as that seemed to better convey what we're doing here - caching
some of the key zonelist state (for faster access.)
See below for some performance benchmark results. After all that
discussion with David on why I didn't need them, I went and got
some ;). I wanted to verify that I had not hurt the normal case
of memory allocation noticeably. At least for my one little
microbenchmark, I found (1) the normal case wasn't affected, and
(2) workloads that forced scanning across multiple nodes for
memory improved up to 10% fewer System CPU cycles and lower
elapsed clock time ('sys' and 'real'). Good. See details, below.
I didn't have the logic in get_page_from_freelist() for various
full nodes and zone reclaim failures correct. That should be
fixed up now - notice the new goto labels zonelist_scan,
this_zone_full, and try_next_zone, in get_page_from_freelist().
There are two reasons I pursued this alternative, over some earlier
proposals that would have focused on optimizing the fake numa
emulation case by caching the last useful zone:
1) Contrary to what I said before, we (SGI, on large ia64 sn2 systems)
have seen real customer loads where the cost to scan the zonelist
was a problem, due to many nodes being full of memory before
we got to a node we could use. Or at least, I think we have.
This was related to me by another engineer, based on experiences
from some time past. So this is not guaranteed. Most likely, though.
The following approach should help such real numa systems just as
much as it helps fake numa systems, or any combination thereof.
2) The effort to distinguish fake from real numa, using node_distance,
so that we could cache a fake numa node and optimize choosing
it over equivalent distance fake nodes, while continuing to
properly scan all real nodes in distance order, was going to
require a nasty blob of zonelist and node distance munging.
The following approach has no new dependency on node distances or
zone sorting.
See comment in the patch below for a description of what it actually does.
Technical details of note (or controversy):
- See the use of "zlc_active" and "did_zlc_setup" below, to delay
adding any work for this new mechanism until we've looked at the
first zone in zonelist. I figured the odds of the first zone
having the memory we needed were high enough that we should just
look there, first, then get fancy only if we need to keep looking.
- Some odd hackery was needed to add items to struct zonelist, while
not tripping up the custom zonelists built by the mm/mempolicy.c
code for MPOL_BIND. My usual wordy comments below explain this.
Search for "MPOL_BIND".
- Some per-node data in the struct zonelist is now modified frequently,
with no locking. Multiple CPU cores on a node could hit and mangle
this data. The theory is that this is just performance hint data,
and the memory allocator will work just fine despite any such mangling.
The fields at risk are the struct 'zonelist_cache' fields 'fullzones'
(a bitmask) and 'last_full_zap' (unsigned long jiffies). It should
all be self correcting after at most a one second delay.
- This still does a linear scan of the same lengths as before. All
I've optimized is making the scan faster, not algorithmically
shorter. It is now able to scan a compact array of 'unsigned
short' in the case of many full nodes, so one cache line should
cover quite a few nodes, rather than each node hitting another
one or two new and distinct cache lines.
- If both Andi and Nick don't find this too complicated, I will be
(pleasantly) flabbergasted.
- I removed the comment claiming we only use one cache line's worth of
zonelist. We seem, at least in the fake numa case, to have put the
lie to that claim.
- I pay no attention to the various watermarks and such in this performance
hint. A node could be marked full for one watermark, and then skipped
over when searching for a page using a different watermark. I think
that's actually quite ok, as it will tend to slightly increase the
spreading of memory over other nodes, away from a memory stressed node.
===============
Performance - some benchmark results and analysis:
This benchmark runs a memory hog program that uses multiple
threads to touch a lot of memory as quickly as it can.
Multiple runs were made, touching 12, 38, 64 or 90 GBytes out of
the total 96 GBytes on the system, and using 1, 19, 37, or 55
threads (on a 56 CPU system.) System, user and real (elapsed)
timings were recorded for each run, shown in units of seconds,
in the table below.
Two kernels were tested - 2.6.18-mm3 and the same kernel with
this zonelist caching patch added. The table also shows the
percentage improvement the zonelist caching sys time is over
(lower than) the stock *-mm kernel.
number 2.6.18-mm3 zonelist-cache delta (< 0 good) percent
GBs N ------------ -------------- ---------------- systime
mem threads sys user real sys user real sys user real better
12 1 153 24 177 151 24 176 -2 0 -1 1%
12 19 99 22 8 99 22 8 0 0 0 0%
12 37 111 25 6 112 25 6 1 0 0 -0%
12 55 115 25 5 110 23 5 -5 -2 0 4%
38 1 502 74 576 497 73 570 -5 -1 -6 0%
38 19 426 78 48 373 76 39 -53 -2 -9 12%
38 37 544 83 36 547 82 36 3 -1 0 -0%
38 55 501 77 23 511 80 24 10 3 1 -1%
64 1 917 125 1042 890 124 1014 -27 -1 -28 2%
64 19 1118 138 119 965 141 103 -153 3 -16 13%
64 37 1202 151 94 1136 150 81 -66 -1 -13 5%
64 55 1118 141 61 1072 140 58 -46 -1 -3 4%
90 1 1342 177 1519 1275 174 1450 -67 -3 -69 4%
90 19 2392 199 192 2116 189 176 -276 -10 -16 11%
90 37 3313 238 175 2972 225 145 -341 -13 -30 10%
90 55 1948 210 104 1843 213 100 -105 3 -4 5%
Notes:
1) This test ran a memory hog program that started a specified number N of
threads, and had each thread allocate and touch 1/N'th of
the total memory to be used in the test run in a single loop,
writing a constant word to memory, one store every 4096 bytes.
Watching this test during some earlier trial runs, I would see
each of these threads sit down on one CPU and stay there, for
the remainder of the pass, a different CPU for each thread.
2) The 'real' column is not comparable to the 'sys' or 'user' columns.
The 'real' column is seconds wall clock time elapsed, from beginning
to end of that test pass. The 'sys' and 'user' columns are total
CPU seconds spent on that test pass. For a 19 thread test run,
for example, the sum of 'sys' and 'user' could be up to 19 times the
number of 'real' elapsed wall clock seconds.
3) Tests were run on a fresh, single-user boot, to minimize the amount
of memory already in use at the start of the test, and to minimize
the amount of background activity that might interfere.
4) Tests were done on a 56 CPU, 28 Node system with 96 GBytes of RAM.
5) Notice that the 'real' time gets large for the single thread runs, even
though the measured 'sys' and 'user' times are modest. I'm not sure what
that means - probably something to do with it being slow for one thread to
be accessing memory a long way away. Perhaps the fake numa system, running
ostensibly the same workload, would not show this substantial degradation
of 'real' time for one thread on many nodes -- let's hope not.
6) The high thread count passes (one thread per CPU - on 55 of 56 CPUs)
ran quite efficiently, as one might expect. Each pair of threads needed
to allocate and touch the memory on the node the two threads shared, a
pleasantly parallelizable workload.
7) The intermediate thread count passes, when asking for a lot of memory forcing
them to go to a few neighboring nodes, improved the most with this zonelist
caching patch.
Conclusions:
* This zonelist cache patch probably makes little difference one way or the
other for most workloads on real numa hardware, if those workloads avoid
heavy off node allocations.
* For memory intensive workloads requiring substantial off-node allocations
on real numa hardware, this patch improves both kernel and elapsed timings
up to ten per-cent.
* For fake numa systems, I'm optimistic, but will have to leave that up to
Rohit Seth to actually test (once I get him a 2.6.18 backport.)
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Rohit Seth <rohitseth@google.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: David Rientjes <rientjes@cs.washington.edu>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-07 12:31:48 +08:00
|
|
|
}
|
2006-06-23 17:03:11 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2007-07-16 14:38:01 +08:00
|
|
|
void build_all_zonelists(void)
|
2006-06-23 17:03:11 +08:00
|
|
|
{
|
2007-07-16 14:38:01 +08:00
|
|
|
set_zonelist_order();
|
|
|
|
|
2006-06-23 17:03:11 +08:00
|
|
|
if (system_state == SYSTEM_BOOTING) {
|
2006-09-27 16:50:12 +08:00
|
|
|
__build_all_zonelists(NULL);
|
2006-06-23 17:03:11 +08:00
|
|
|
cpuset_init_current_mems_allowed();
|
|
|
|
} else {
|
2007-10-20 07:27:18 +08:00
|
|
|
/* we have to stop all cpus to guarantee there is no user
|
2006-06-23 17:03:11 +08:00
|
|
|
of zonelist */
|
|
|
|
stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
|
|
|
|
/* cpuset refresh routine should be here */
|
|
|
|
}
|
2006-06-23 17:03:47 +08:00
|
|
|
vm_total_pages = nr_free_pagecache_pages();
|
2007-10-16 16:25:54 +08:00
|
|
|
/*
|
|
|
|
* Disable grouping by mobility if the number of pages in the
|
|
|
|
* system is too low to allow the mechanism to work. It would be
|
|
|
|
* more accurate, but expensive to check per-zone. This check is
|
|
|
|
* made on memory-hotadd so a system can start with mobility
|
|
|
|
* disabled and enable it later
|
|
|
|
*/
|
2007-10-16 16:26:01 +08:00
|
|
|
if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
|
2007-10-16 16:25:54 +08:00
|
|
|
page_group_by_mobility_disabled = 1;
|
|
|
|
else
|
|
|
|
page_group_by_mobility_disabled = 0;
|
|
|
|
|
|
|
|
printk("Built %i zonelists in %s order, mobility grouping %s. "
|
|
|
|
"Total pages: %ld\n",
|
2007-07-16 14:38:01 +08:00
|
|
|
num_online_nodes(),
|
|
|
|
zonelist_order_name[current_zonelist_order],
|
2007-10-16 16:25:54 +08:00
|
|
|
page_group_by_mobility_disabled ? "off" : "on",
|
2007-07-16 14:38:01 +08:00
|
|
|
vm_total_pages);
|
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
printk("Policy zone: %s\n", zone_names[policy_zone]);
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Helper functions to size the waitqueue hash table.
|
|
|
|
* Essentially these want to choose hash table sizes sufficiently
|
|
|
|
* large so that collisions trying to wait on pages are rare.
|
|
|
|
* But in fact, the number of active page waitqueues on typical
|
|
|
|
* systems is ridiculously low, less than 200. So this is even
|
|
|
|
* conservative, even though it seems large.
|
|
|
|
*
|
|
|
|
* The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
|
|
|
|
* waitqueues, i.e. the size of the waitq table given the number of pages.
|
|
|
|
*/
|
|
|
|
#define PAGES_PER_WAITQUEUE 256
|
|
|
|
|
2006-06-23 17:03:10 +08:00
|
|
|
#ifndef CONFIG_MEMORY_HOTPLUG
|
2006-06-23 17:03:08 +08:00
|
|
|
static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
unsigned long size = 1;
|
|
|
|
|
|
|
|
pages /= PAGES_PER_WAITQUEUE;
|
|
|
|
|
|
|
|
while (size < pages)
|
|
|
|
size <<= 1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Once we have dozens or even hundreds of threads sleeping
|
|
|
|
* on IO we've got bigger problems than wait queue collision.
|
|
|
|
* Limit the size of the wait table to a reasonable size.
|
|
|
|
*/
|
|
|
|
size = min(size, 4096UL);
|
|
|
|
|
|
|
|
return max(size, 4UL);
|
|
|
|
}
|
2006-06-23 17:03:10 +08:00
|
|
|
#else
|
|
|
|
/*
|
|
|
|
* A zone's size might be changed by hot-add, so it is not possible to determine
|
|
|
|
* a suitable size for its wait_table. So we use the maximum size now.
|
|
|
|
*
|
|
|
|
* The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
|
|
|
|
*
|
|
|
|
* i386 (preemption config) : 4096 x 16 = 64Kbyte.
|
|
|
|
* ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
|
|
|
|
* ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
|
|
|
|
*
|
|
|
|
* The maximum entries are prepared when a zone's memory is (512K + 256) pages
|
|
|
|
* or more by the traditional way. (See above). It equals:
|
|
|
|
*
|
|
|
|
* i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
|
|
|
|
* ia64(16K page size) : = ( 8G + 4M)byte.
|
|
|
|
* powerpc (64K page size) : = (32G +16M)byte.
|
|
|
|
*/
|
|
|
|
static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
|
|
|
|
{
|
|
|
|
return 4096UL;
|
|
|
|
}
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
 * Integer logarithm of the wait table size.  The result lets later code
 * use shifts to extract the more random high bits of the multiplicative
 * hash function before the remainder is taken.
 *
 * Implemented as the position of the first zero bit in ~size.
 */
static inline unsigned long wait_table_bits(unsigned long size)
{
	unsigned long inverted = ~size;

	return ffz(inverted);
}
|
|
|
|
|
|
|
|
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
|
|
|
|
|
Bias the location of pages freed for min_free_kbytes in the same MAX_ORDER_NR_PAGES blocks
The standard buddy allocator always favours the smallest block of pages.
The effect of this is that the pages free to satisfy min_free_kbytes tends
to be preserved since boot time at the same location of memory for a very
long time and as a contiguous block. When an administrator sets the
reserve at 16384 at boot time, it tends to be the same MAX_ORDER blocks
that remain free. This allows the occasional high atomic allocation to
succeed up until the point the blocks are split. In practice, it is
difficult to split these blocks but when they do split, the benefit of
having min_free_kbytes for contiguous blocks disappears. Additionally,
increasing min_free_kbytes once the system has been running for some time
has no guarantee of creating contiguous blocks.
On the other hand, CONFIG_PAGE_GROUP_BY_MOBILITY favours splitting large
blocks when there are no free pages of the appropriate type available. A
side-effect of this is that all blocks in memory tends to be used up and
the contiguous free blocks from boot time are not preserved like in the
vanilla allocator. This can cause a problem if a new caller is unwilling
to reclaim or does not reclaim for long enough.
A failure scenario was found for a wireless network device allocating
order-1 atomic allocations but the allocations were not intense or frequent
enough for a whole block of pages to be preserved for MIGRATE_HIGHALLOC.
This was reproduced on a desktop by booting with mem=256mb, forcing the
driver to allocate at order-1, running a bittorrent client (downloading a
debian ISO) and building a kernel with -j2.
This patch addresses the problem on the desktop machine booted with
mem=256mb. It works by setting aside a reserve of MAX_ORDER_NR_PAGES
blocks, the number of which depends on the value of min_free_kbytes. These
blocks are only fallen back to when there is no other free pages. Then the
smallest possible page is used just like the normal buddy allocator instead
of the largest possible page to preserve contiguous pages. The pages in free
lists in the reserve blocks are never taken for another migrate type. The
result is that even if min_free_kbytes is set to a low value, contiguous
blocks will be preserved in the MIGRATE_RESERVE blocks.
This works better than the vanilla allocator because if min_free_kbytes is
increased, a new reserve block will be chosen based on the location of
reclaimable pages and the block will free up as contiguous pages. In the
vanilla allocator, no effort is made to target a block of pages to free as
contiguous pages and min_free_kbytes pages are scattered randomly.
This effect has been observed on the test machine. min_free_kbytes was set
initially low but it was kept as a contiguous free block within
MIGRATE_RESERVE. min_free_kbytes was then set to a higher value and over a
period of time, the free blocks were within the reserve and coalescing.
How long it takes to free up depends on how quickly LRU is rotating.
Amusingly, this means that more activity will free the blocks faster.
This mechanism potentially replaces MIGRATE_HIGHALLOC as it may be more
effective than grouping contiguous free pages together. It all depends on
whether the number of active atomic high allocations exceeds
min_free_kbytes or not. If the number of active allocations exceeds
min_free_kbytes, it's worth it but maybe in that situation, min_free_kbytes
should be set higher. Once there are no more reports of allocation
failures, a patch will be submitted that backs out MIGRATE_HIGHALLOC and
see if the reports stay missing.
Credit to Mariusz Kozlowski for discovering the problem, describing the
failure scenario and testing patches and scenarios.
[akpm@linux-foundation.org: cleanups]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:25:58 +08:00
|
|
|
/*
|
2007-10-16 16:26:01 +08:00
|
|
|
* Mark a number of pageblocks as MIGRATE_RESERVE. The number
|
Bias the location of pages freed for min_free_kbytes in the same MAX_ORDER_NR_PAGES blocks
The standard buddy allocator always favours the smallest block of pages.
The effect of this is that the pages free to satisfy min_free_kbytes tends
to be preserved since boot time at the same location of memory for a very
long time and as a contiguous block. When an administrator sets the
reserve at 16384 at boot time, it tends to be the same MAX_ORDER blocks
that remain free. This allows the occasional high atomic allocation to
succeed up until the point the blocks are split. In practice, it is
difficult to split these blocks but when they do split, the benefit of
having min_free_kbytes for contiguous blocks disappears. Additionally,
increasing min_free_kbytes once the system has been running for some time
has no guarantee of creating contiguous blocks.
On the other hand, CONFIG_PAGE_GROUP_BY_MOBILITY favours splitting large
blocks when there are no free pages of the appropriate type available. A
side-effect of this is that all blocks in memory tends to be used up and
the contiguous free blocks from boot time are not preserved like in the
vanilla allocator. This can cause a problem if a new caller is unwilling
to reclaim or does not reclaim for long enough.
A failure scenario was found for a wireless network device allocating
order-1 atomic allocations but the allocations were not intense or frequent
enough for a whole block of pages to be preserved for MIGRATE_HIGHALLOC.
This was reproduced on a desktop by booting with mem=256mb, forcing the
driver to allocate at order-1, running a bittorrent client (downloading a
debian ISO) and building a kernel with -j2.
This patch addresses the problem on the desktop machine booted with
mem=256mb. It works by setting aside a reserve of MAX_ORDER_NR_PAGES
blocks, the number of which depends on the value of min_free_kbytes. These
blocks are only fallen back to when there is no other free pages. Then the
smallest possible page is used just like the normal buddy allocator instead
of the largest possible page to preserve contiguous pages. The pages in free
lists in the reserve blocks are never taken for another migrate type. The
result is that even if min_free_kbytes is set to a low value, contiguous
blocks will be preserved in the MIGRATE_RESERVE blocks.
This works better than the vanilla allocator because if min_free_kbytes is
increased, a new reserve block will be chosen based on the location of
reclaimable pages and the block will free up as contiguous pages. In the
vanilla allocator, no effort is made to target a block of pages to free as
contiguous pages and min_free_kbytes pages are scattered randomly.
This effect has been observed on the test machine. min_free_kbytes was set
initially low but it was kept as a contiguous free block within
MIGRATE_RESERVE. min_free_kbytes was then set to a higher value and over a
period of time, the free blocks were within the reserve and coalescing.
How long it takes to free up depends on how quickly LRU is rotating.
Amusingly, this means that more activity will free the blocks faster.
This mechanism potentially replaces MIGRATE_HIGHALLOC as it may be more
effective than grouping contiguous free pages together. It all depends on
whether the number of active atomic high allocations exceeds
min_free_kbytes or not. If the number of active allocations exceeds
min_free_kbytes, it's worth it but maybe in that situation, min_free_kbytes
should be set higher. Once there are no more reports of allocation
failures, a patch will be submitted that backs out MIGRATE_HIGHALLOC and
see if the reports stay missing.
Credit to Mariusz Kozlowski for discovering the problem, describing the
failure scenario and testing patches and scenarios.
[akpm@linux-foundation.org: cleanups]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:25:58 +08:00
|
|
|
* of blocks reserved is based on zone->pages_min. The memory within the
|
|
|
|
* reserve will tend to store contiguous free pages. Setting min_free_kbytes
|
|
|
|
* higher will lead to a bigger reserve which will get freed as contiguous
|
|
|
|
* blocks as reclaim kicks in
|
|
|
|
*/
|
|
|
|
static void setup_zone_migrate_reserve(struct zone *zone)
|
|
|
|
{
|
|
|
|
unsigned long start_pfn, pfn, end_pfn;
|
|
|
|
struct page *page;
|
|
|
|
unsigned long reserve, block_migratetype;
|
|
|
|
|
|
|
|
/* Get the start pfn, end pfn and the number of blocks to reserve */
|
|
|
|
start_pfn = zone->zone_start_pfn;
|
|
|
|
end_pfn = start_pfn + zone->spanned_pages;
|
2007-10-16 16:26:01 +08:00
|
|
|
reserve = roundup(zone->pages_min, pageblock_nr_pages) >>
|
|
|
|
pageblock_order;
|
Bias the location of pages freed for min_free_kbytes in the same MAX_ORDER_NR_PAGES blocks
The standard buddy allocator always favours the smallest block of pages.
The effect of this is that the pages free to satisfy min_free_kbytes tends
to be preserved since boot time at the same location of memory for a very
long time and as a contiguous block. When an administrator sets the
reserve at 16384 at boot time, it tends to be the same MAX_ORDER blocks
that remain free. This allows the occasional high atomic allocation to
succeed up until the point the blocks are split. In practice, it is
difficult to split these blocks but when they do split, the benefit of
having min_free_kbytes for contiguous blocks disappears. Additionally,
increasing min_free_kbytes once the system has been running for some time
has no guarantee of creating contiguous blocks.
On the other hand, CONFIG_PAGE_GROUP_BY_MOBILITY favours splitting large
blocks when there are no free pages of the appropriate type available. A
side-effect of this is that all blocks in memory tends to be used up and
the contiguous free blocks from boot time are not preserved like in the
vanilla allocator. This can cause a problem if a new caller is unwilling
to reclaim or does not reclaim for long enough.
A failure scenario was found for a wireless network device allocating
order-1 atomic allocations but the allocations were not intense or frequent
enough for a whole block of pages to be preserved for MIGRATE_HIGHALLOC.
This was reproduced on a desktop by booting with mem=256mb, forcing the
driver to allocate at order-1, running a bittorrent client (downloading a
debian ISO) and building a kernel with -j2.
This patch addresses the problem on the desktop machine booted with
mem=256mb. It works by setting aside a reserve of MAX_ORDER_NR_PAGES
blocks, the number of which depends on the value of min_free_kbytes. These
blocks are only fallen back to when there is no other free pages. Then the
smallest possible page is used just like the normal buddy allocator instead
of the largest possible page to preserve contiguous pages. The pages in free
lists in the reserve blocks are never taken for another migrate type. The
result is that even if min_free_kbytes is set to a low value, contiguous
blocks will be preserved in the MIGRATE_RESERVE blocks.
This works better than the vanilla allocator because if min_free_kbytes is
increased, a new reserve block will be chosen based on the location of
reclaimable pages and the block will free up as contiguous pages. In the
vanilla allocator, no effort is made to target a block of pages to free as
contiguous pages and min_free_kbytes pages are scattered randomly.
This effect has been observed on the test machine. min_free_kbytes was set
initially low but it was kept as a contiguous free block within
MIGRATE_RESERVE. min_free_kbytes was then set to a higher value and over a
period of time, the free blocks were within the reserve and coalescing.
How long it takes to free up depends on how quickly LRU is rotating.
Amusingly, this means that more activity will free the blocks faster.
This mechanism potentially replaces MIGRATE_HIGHALLOC as it may be more
effective than grouping contiguous free pages together. It all depends on
whether the number of active atomic high allocations exceeds
min_free_kbytes or not. If the number of active allocations exceeds
min_free_kbytes, it's worth it but maybe in that situation, min_free_kbytes
should be set higher. Once there are no more reports of allocation
failures, a patch will be submitted that backs out MIGRATE_HIGHALLOC and
see if the reports stay missing.
Credit to Mariusz Kozlowski for discovering the problem, describing the
failure scenario and testing patches and scenarios.
[akpm@linux-foundation.org: cleanups]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:25:58 +08:00
|
|
|
|
2007-10-16 16:26:01 +08:00
|
|
|
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
|
Bias the location of pages freed for min_free_kbytes in the same MAX_ORDER_NR_PAGES blocks
The standard buddy allocator always favours the smallest block of pages.
The effect of this is that the pages free to satisfy min_free_kbytes tends
to be preserved since boot time at the same location of memory for a very
long time and as a contiguous block. When an administrator sets the
reserve at 16384 at boot time, it tends to be the same MAX_ORDER blocks
that remain free. This allows the occasional high atomic allocation to
succeed up until the point the blocks are split. In practice, it is
difficult to split these blocks but when they do split, the benefit of
having min_free_kbytes for contiguous blocks disappears. Additionally,
increasing min_free_kbytes once the system has been running for some time
has no guarantee of creating contiguous blocks.
On the other hand, CONFIG_PAGE_GROUP_BY_MOBILITY favours splitting large
blocks when there are no free pages of the appropriate type available. A
side-effect of this is that all blocks in memory tends to be used up and
the contiguous free blocks from boot time are not preserved like in the
vanilla allocator. This can cause a problem if a new caller is unwilling
to reclaim or does not reclaim for long enough.
A failure scenario was found for a wireless network device allocating
order-1 atomic allocations but the allocations were not intense or frequent
enough for a whole block of pages to be preserved for MIGRATE_HIGHALLOC.
This was reproduced on a desktop by booting with mem=256mb, forcing the
driver to allocate at order-1, running a bittorrent client (downloading a
debian ISO) and building a kernel with -j2.
This patch addresses the problem on the desktop machine booted with
mem=256mb. It works by setting aside a reserve of MAX_ORDER_NR_PAGES
blocks, the number of which depends on the value of min_free_kbytes. These
blocks are only fallen back to when there is no other free pages. Then the
smallest possible page is used just like the normal buddy allocator instead
of the largest possible page to preserve contiguous pages. The pages in free
lists in the reserve blocks are never taken for another migrate type. The
result is that even if min_free_kbytes is set to a low value, contiguous
blocks will be preserved in the MIGRATE_RESERVE blocks.
This works better than the vanilla allocator because if min_free_kbytes is
increased, a new reserve block will be chosen based on the location of
reclaimable pages and the block will free up as contiguous pages. In the
vanilla allocator, no effort is made to target a block of pages to free as
contiguous pages and min_free_kbytes pages are scattered randomly.
This effect has been observed on the test machine. min_free_kbytes was set
initially low but it was kept as a contiguous free block within
MIGRATE_RESERVE. min_free_kbytes was then set to a higher value and over a
period of time, the free blocks were within the reserve and coalescing.
How long it takes to free up depends on how quickly LRU is rotating.
Amusingly, this means that more activity will free the blocks faster.
This mechanism potentially replaces MIGRATE_HIGHALLOC as it may be more
effective than grouping contiguous free pages together. It all depends on
whether the number of active atomic high allocations exceeds
min_free_kbytes or not. If the number of active allocations exceeds
min_free_kbytes, it's worth it but maybe in that situation, min_free_kbytes
should be set higher. Once there are no more reports of allocation
failures, a patch will be submitted that backs out MIGRATE_HIGHALLOC and
see if the reports stay missing.
Credit to Mariusz Kozlowski for discovering the problem, describing the
failure scenario and testing patches and scenarios.
[akpm@linux-foundation.org: cleanups]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:25:58 +08:00
|
|
|
if (!pfn_valid(pfn))
|
|
|
|
continue;
|
|
|
|
page = pfn_to_page(pfn);
|
|
|
|
|
|
|
|
/* Blocks with reserved pages will never free, skip them. */
|
|
|
|
if (PageReserved(page))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
block_migratetype = get_pageblock_migratetype(page);
|
|
|
|
|
|
|
|
/* If this block is reserved, account for it */
|
|
|
|
if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
|
|
|
|
reserve--;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Suitable for reserving if this block is movable */
|
|
|
|
if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
|
|
|
|
set_pageblock_migratetype(page, MIGRATE_RESERVE);
|
|
|
|
move_freepages_block(zone, page, MIGRATE_RESERVE);
|
|
|
|
reserve--;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the reserve is met and this is a previous reserved block,
|
|
|
|
* take it back
|
|
|
|
*/
|
|
|
|
if (block_migratetype == MIGRATE_RESERVE) {
|
|
|
|
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
|
|
|
|
move_freepages_block(zone, page, MIGRATE_MOVABLE);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2007-10-16 16:25:58 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Initially all pages are reserved - free ones are freed
|
|
|
|
* up by free_all_bootmem() once the early boot process is
|
|
|
|
* done. Non-atomic initialization, single-pass.
|
|
|
|
*/
|
2006-01-17 14:03:44 +08:00
|
|
|
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
|
2007-01-11 15:15:30 +08:00
|
|
|
unsigned long start_pfn, enum memmap_context context)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct page *page;
|
2005-06-23 15:08:00 +08:00
|
|
|
unsigned long end_pfn = start_pfn + size;
|
|
|
|
unsigned long pfn;
|
2008-04-29 15:58:21 +08:00
|
|
|
struct zone *z;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-04-29 15:58:21 +08:00
|
|
|
z = &NODE_DATA(nid)->node_zones[zone];
|
2006-01-12 17:05:24 +08:00
|
|
|
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
|
2007-01-11 15:15:30 +08:00
|
|
|
/*
|
|
|
|
* There can be holes in boot-time mem_map[]s
|
|
|
|
* handed to this function. They do not
|
|
|
|
* exist on hotplugged memory.
|
|
|
|
*/
|
|
|
|
if (context == MEMMAP_EARLY) {
|
|
|
|
if (!early_pfn_valid(pfn))
|
|
|
|
continue;
|
|
|
|
if (!early_pfn_in_nid(pfn, nid))
|
|
|
|
continue;
|
|
|
|
}
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 15:07:54 +08:00
|
|
|
page = pfn_to_page(pfn);
|
|
|
|
set_page_links(page, zone, nid, pfn);
|
2006-03-22 16:08:40 +08:00
|
|
|
init_page_count(page);
|
2005-04-17 06:20:36 +08:00
|
|
|
reset_page_mapcount(page);
|
|
|
|
SetPageReserved(page);
|
2007-10-16 16:25:48 +08:00
|
|
|
/*
|
|
|
|
* Mark the block movable so that blocks are reserved for
|
|
|
|
* movable at startup. This will force kernel allocations
|
|
|
|
* to reserve their blocks rather than leaking throughout
|
|
|
|
* the address space during boot when many long-lived
|
Bias the location of pages freed for min_free_kbytes in the same MAX_ORDER_NR_PAGES blocks
The standard buddy allocator always favours the smallest block of pages.
The effect of this is that the pages free to satisfy min_free_kbytes tends
to be preserved since boot time at the same location of memory ffor a very
long time and as a contiguous block. When an administrator sets the
reserve at 16384 at boot time, it tends to be the same MAX_ORDER blocks
that remain free. This allows the occasional high atomic allocation to
succeed up until the point the blocks are split. In practice, it is
difficult to split these blocks but when they do split, the benefit of
having min_free_kbytes for contiguous blocks disappears. Additionally,
increasing min_free_kbytes once the system has been running for some time
has no guarantee of creating contiguous blocks.
On the other hand, CONFIG_PAGE_GROUP_BY_MOBILITY favours splitting large
blocks when there are no free pages of the appropriate type available. A
side-effect of this is that all blocks in memory tends to be used up and
the contiguous free blocks from boot time are not preserved like in the
vanilla allocator. This can cause a problem if a new caller is unwilling
to reclaim or does not reclaim for long enough.
A failure scenario was found for a wireless network device allocating
order-1 atomic allocations but the allocations were not intense or frequent
enough for a whole block of pages to be preserved for MIGRATE_HIGHALLOC.
This was reproduced on a desktop by booting with mem=256mb, forcing the
driver to allocate at order-1, running a bittorrent client (downloading a
debian ISO) and building a kernel with -j2.
This patch addresses the problem on the desktop machine booted with
mem=256mb. It works by setting aside a reserve of MAX_ORDER_NR_PAGES
blocks, the number of which depends on the value of min_free_kbytes. These
blocks are only fallen back to when there is no other free pages. Then the
smallest possible page is used just like the normal buddy allocator instead
of the largest possible page to preserve contiguous pages The pages in free
lists in the reserve blocks are never taken for another migrate type. The
results is that even if min_free_kbytes is set to a low value, contiguous
blocks will be preserved in the MIGRATE_RESERVE blocks.
This works better than the vanilla allocator because if min_free_kbytes is
increased, a new reserve block will be chosen based on the location of
reclaimable pages and the block will free up as contiguous pages. In the
vanilla allocator, no effort is made to target a block of pages to free as
contiguous pages and min_free_kbytes pages are scattered randomly.
This effect has been observed on the test machine. min_free_kbytes was set
initially low but it was kept as a contiguous free block within
MIGRATE_RESERVE. min_free_kbytes was then set to a higher value and over a
period of time, the free blocks were within the reserve and coalescing.
How long it takes to free up depends on how quickly LRU is rotating.
Amusingly, this means that more activity will free the blocks faster.
This mechanism potentially replaces MIGRATE_HIGHALLOC as it may be more
effective than grouping contiguous free pages together. It all depends on
whether the number of active atomic high allocations exceeds
min_free_kbytes or not. If the number of active allocations exceeds
min_free_kbytes, it's worth it but maybe in that situation, min_free_kbytes
should be set higher. Once there are no more reports of allocation
failures, a patch will be submitted that backs out MIGRATE_HIGHALLOC and
see if the reports stay missing.
Credit to Mariusz Kozlowski for discovering the problem, describing the
failure scenario and testing patches and scenarios.
[akpm@linux-foundation.org: cleanups]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:25:58 +08:00
|
|
|
* kernel allocations are made. Later some blocks near
|
|
|
|
* the start are marked MIGRATE_RESERVE by
|
|
|
|
* setup_zone_migrate_reserve()
|
2008-04-29 15:58:21 +08:00
|
|
|
*
|
|
|
|
* bitmap is created for zone's valid pfn range. but memmap
|
|
|
|
* can be created for invalid pages (for alignment)
|
|
|
|
* check here not to call set_pageblock_migratetype() against
|
|
|
|
* pfn out of zone.
|
2007-10-16 16:25:48 +08:00
|
|
|
*/
|
2008-04-29 15:58:21 +08:00
|
|
|
if ((z->zone_start_pfn <= pfn)
|
|
|
|
&& (pfn < z->zone_start_pfn + z->spanned_pages)
|
|
|
|
&& !(pfn & (pageblock_nr_pages - 1)))
|
Bias the location of pages freed for min_free_kbytes in the same MAX_ORDER_NR_PAGES blocks
The standard buddy allocator always favours the smallest block of pages.
The effect of this is that the pages free to satisfy min_free_kbytes tends
to be preserved since boot time at the same location of memory ffor a very
long time and as a contiguous block. When an administrator sets the
reserve at 16384 at boot time, it tends to be the same MAX_ORDER blocks
that remain free. This allows the occasional high atomic allocation to
succeed up until the point the blocks are split. In practice, it is
difficult to split these blocks but when they do split, the benefit of
having min_free_kbytes for contiguous blocks disappears. Additionally,
increasing min_free_kbytes once the system has been running for some time
has no guarantee of creating contiguous blocks.
On the other hand, CONFIG_PAGE_GROUP_BY_MOBILITY favours splitting large
blocks when there are no free pages of the appropriate type available. A
side-effect of this is that all blocks in memory tends to be used up and
the contiguous free blocks from boot time are not preserved like in the
vanilla allocator. This can cause a problem if a new caller is unwilling
to reclaim or does not reclaim for long enough.
A failure scenario was found for a wireless network device allocating
order-1 atomic allocations but the allocations were not intense or frequent
enough for a whole block of pages to be preserved for MIGRATE_HIGHALLOC.
This was reproduced on a desktop by booting with mem=256mb, forcing the
driver to allocate at order-1, running a bittorrent client (downloading a
debian ISO) and building a kernel with -j2.
This patch addresses the problem on the desktop machine booted with
mem=256mb. It works by setting aside a reserve of MAX_ORDER_NR_PAGES
blocks, the number of which depends on the value of min_free_kbytes. These
blocks are only fallen back to when there is no other free pages. Then the
smallest possible page is used just like the normal buddy allocator instead
of the largest possible page to preserve contiguous pages The pages in free
lists in the reserve blocks are never taken for another migrate type. The
results is that even if min_free_kbytes is set to a low value, contiguous
blocks will be preserved in the MIGRATE_RESERVE blocks.
This works better than the vanilla allocator because if min_free_kbytes is
increased, a new reserve block will be chosen based on the location of
reclaimable pages and the block will free up as contiguous pages. In the
vanilla allocator, no effort is made to target a block of pages to free as
contiguous pages and min_free_kbytes pages are scattered randomly.
This effect has been observed on the test machine. min_free_kbytes was set
initially low but it was kept as a contiguous free block within
MIGRATE_RESERVE. min_free_kbytes was then set to a higher value and over a
period of time, the free blocks were within the reserve and coalescing.
How long it takes to free up depends on how quickly LRU is rotating.
Amusingly, this means that more activity will free the blocks faster.
This mechanism potentially replaces MIGRATE_HIGHALLOC as it may be more
effective than grouping contiguous free pages together. It all depends on
whether the number of active atomic high allocations exceeds
min_free_kbytes or not. If the number of active allocations exceeds
min_free_kbytes, it's worth it but maybe in that situation, min_free_kbytes
should be set higher. Once there are no more reports of allocation
failures, a patch will be submitted that backs out MIGRATE_HIGHALLOC and
see if the reports stay missing.
Credit to Mariusz Kozlowski for discovering the problem, describing the
failure scenario and testing patches and scenarios.
[akpm@linux-foundation.org: cleanups]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:25:58 +08:00
|
|
|
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
|
2007-10-16 16:25:48 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
INIT_LIST_HEAD(&page->lru);
|
|
|
|
#ifdef WANT_PAGE_VIRTUAL
|
|
|
|
/* The shift won't overflow because ZONE_NORMAL is below 4G. */
|
|
|
|
if (!is_highmem_idx(zone))
|
2005-06-28 05:36:28 +08:00
|
|
|
set_page_address(page, __va(pfn << PAGE_SHIFT));
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-02-05 14:29:26 +08:00
|
|
|
/*
 * Reset the buddy free lists of @zone: for every (order, migratetype)
 * pair the list head is initialised empty and the per-order free
 * counter is zeroed.
 */
static void __meminit zone_init_free_lists(struct zone *zone)
{
	int order, mtype;

	for_each_migratetype_order(order, mtype) {
		struct free_area *area = &zone->free_area[order];

		INIT_LIST_HEAD(&area->free_list[mtype]);
		area->nr_free = 0;
	}
}
|
|
|
|
|
|
|
|
/*
 * Default memmap_init(): unless the architecture supplies its own
 * (__HAVE_ARCH_MEMMAP_INIT), initialise the range through
 * memmap_init_zone() with the MEMMAP_EARLY context.
 */
#if !defined(__HAVE_ARCH_MEMMAP_INIT)
#define memmap_init(size, nid, zone, start_pfn) \
	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
#endif
|
|
|
|
|
2008-01-18 07:21:12 +08:00
|
|
|
static int zone_batchsize(struct zone *zone)
|
2005-06-22 08:14:47 +08:00
|
|
|
{
|
|
|
|
int batch;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The per-cpu-pages pools are set to around 1000th of the
|
2005-10-30 09:15:47 +08:00
|
|
|
* size of the zone. But no more than 1/2 of a meg.
|
2005-06-22 08:14:47 +08:00
|
|
|
*
|
|
|
|
* OK, so we don't know how big the cache is. So guess.
|
|
|
|
*/
|
|
|
|
batch = zone->present_pages / 1024;
|
2005-10-30 09:15:47 +08:00
|
|
|
if (batch * PAGE_SIZE > 512 * 1024)
|
|
|
|
batch = (512 * 1024) / PAGE_SIZE;
|
2005-06-22 08:14:47 +08:00
|
|
|
batch /= 4; /* We effectively *= 4 below */
|
|
|
|
if (batch < 1)
|
|
|
|
batch = 1;
|
|
|
|
|
|
|
|
/*
|
2005-12-04 10:55:25 +08:00
|
|
|
* Clamp the batch to a 2^n - 1 value. Having a power
|
|
|
|
* of 2 value was found to be more likely to have
|
|
|
|
* suboptimal cache aliasing properties in some cases.
|
2005-06-22 08:14:47 +08:00
|
|
|
*
|
2005-12-04 10:55:25 +08:00
|
|
|
* For example if 2 tasks are alternately allocating
|
|
|
|
* batches of pages, one task can end up with a lot
|
|
|
|
* of pages of one half of the possible page colors
|
|
|
|
* and the other with pages of the other colors.
|
2005-06-22 08:14:47 +08:00
|
|
|
*/
|
2005-12-04 10:55:25 +08:00
|
|
|
batch = (1 << (fls(batch + batch/2)-1)) - 1;
|
2005-10-30 09:15:47 +08:00
|
|
|
|
2005-06-22 08:14:47 +08:00
|
|
|
return batch;
|
|
|
|
}
|
|
|
|
|
2005-06-22 08:15:00 +08:00
|
|
|
inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
|
|
|
|
{
|
|
|
|
struct per_cpu_pages *pcp;
|
|
|
|
|
2005-10-26 16:58:59 +08:00
|
|
|
memset(p, 0, sizeof(*p));
|
|
|
|
|
2008-02-05 14:29:19 +08:00
|
|
|
pcp = &p->pcp;
|
2005-06-22 08:15:00 +08:00
|
|
|
pcp->count = 0;
|
|
|
|
pcp->high = 6 * batch;
|
|
|
|
pcp->batch = max(1UL, 1 * batch);
|
|
|
|
INIT_LIST_HEAD(&pcp->list);
|
|
|
|
}
|
|
|
|
|
2006-01-08 17:00:40 +08:00
|
|
|
/*
|
|
|
|
* setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
|
|
|
|
* to the value high for the pageset p.
|
|
|
|
*/
|
|
|
|
|
|
|
|
static void setup_pagelist_highmark(struct per_cpu_pageset *p,
|
|
|
|
unsigned long high)
|
|
|
|
{
|
|
|
|
struct per_cpu_pages *pcp;
|
|
|
|
|
2008-02-05 14:29:19 +08:00
|
|
|
pcp = &p->pcp;
|
2006-01-08 17:00:40 +08:00
|
|
|
pcp->high = high;
|
|
|
|
pcp->batch = max(1UL, high/4);
|
|
|
|
if ((high/4) > (PAGE_SHIFT * 8))
|
|
|
|
pcp->batch = PAGE_SHIFT * 8;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2005-06-22 08:14:47 +08:00
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
/*
|
2005-06-22 08:15:00 +08:00
|
|
|
* Boot pageset table. One per cpu which is going to be used for all
|
|
|
|
* zones and all nodes. The parameters will be set in such a way
|
|
|
|
* that an item put on a list will immediately be handed over to
|
|
|
|
* the buddy list. This is safe since pageset manipulation is done
|
|
|
|
* with interrupts disabled.
|
|
|
|
*
|
|
|
|
* Some NUMA counter updates may also be caught by the boot pagesets.
|
2005-06-23 11:26:07 +08:00
|
|
|
*
|
|
|
|
* The boot_pagesets must be kept even after bootup is complete for
|
|
|
|
* unused processors and/or zones. They do play a role for bootstrapping
|
|
|
|
* hotplugged processors.
|
|
|
|
*
|
|
|
|
* zoneinfo_show() and maybe other functions do
|
|
|
|
* not check if the processor is online before following the pageset pointer.
|
|
|
|
* Other parts of the kernel may not check if the zone is available.
|
2005-06-22 08:15:00 +08:00
|
|
|
*/
|
2006-02-05 15:27:36 +08:00
|
|
|
static struct per_cpu_pageset boot_pageset[NR_CPUS];
|
2005-06-22 08:15:00 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Dynamically allocate memory for the
|
2005-06-22 08:14:47 +08:00
|
|
|
* per cpu pageset array in struct zone.
|
|
|
|
*/
|
2006-02-01 19:04:44 +08:00
|
|
|
static int __cpuinit process_zones(int cpu)
|
2005-06-22 08:14:47 +08:00
|
|
|
{
|
|
|
|
struct zone *zone, *dzone;
|
2007-10-16 16:25:36 +08:00
|
|
|
int node = cpu_to_node(cpu);
|
|
|
|
|
|
|
|
node_set_state(node, N_CPU); /* this node has a cpu */
|
2005-06-22 08:14:47 +08:00
|
|
|
|
|
|
|
for_each_zone(zone) {
|
|
|
|
|
2006-09-27 16:50:09 +08:00
|
|
|
if (!populated_zone(zone))
|
|
|
|
continue;
|
|
|
|
|
2006-01-08 17:00:41 +08:00
|
|
|
zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
|
2007-10-16 16:25:36 +08:00
|
|
|
GFP_KERNEL, node);
|
2006-01-08 17:00:41 +08:00
|
|
|
if (!zone_pcp(zone, cpu))
|
2005-06-22 08:14:47 +08:00
|
|
|
goto bad;
|
|
|
|
|
2006-01-08 17:00:41 +08:00
|
|
|
setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
|
2006-01-08 17:00:40 +08:00
|
|
|
|
|
|
|
if (percpu_pagelist_fraction)
|
|
|
|
setup_pagelist_highmark(zone_pcp(zone, cpu),
|
|
|
|
(zone->present_pages / percpu_pagelist_fraction));
|
2005-06-22 08:14:47 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
bad:
|
|
|
|
for_each_zone(dzone) {
|
2007-08-31 14:56:17 +08:00
|
|
|
if (!populated_zone(dzone))
|
|
|
|
continue;
|
2005-06-22 08:14:47 +08:00
|
|
|
if (dzone == zone)
|
|
|
|
break;
|
2006-01-08 17:00:41 +08:00
|
|
|
kfree(zone_pcp(dzone, cpu));
|
|
|
|
zone_pcp(dzone, cpu) = NULL;
|
2005-06-22 08:14:47 +08:00
|
|
|
}
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void free_zone_pagesets(int cpu)
|
|
|
|
{
|
|
|
|
struct zone *zone;
|
|
|
|
|
|
|
|
for_each_zone(zone) {
|
|
|
|
struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
|
|
|
|
|
2006-09-26 07:24:57 +08:00
|
|
|
/* Free per_cpu_pageset if it is slab allocated */
|
|
|
|
if (pset != &boot_pageset[cpu])
|
|
|
|
kfree(pset);
|
2005-06-22 08:14:47 +08:00
|
|
|
zone_pcp(zone, cpu) = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-06-27 17:54:07 +08:00
|
|
|
/*
 * CPU notifier callback for the per-cpu pagesets.
 *
 * On the UP_PREPARE actions the pagesets of the cpu are allocated via
 * process_zones(); NOTIFY_BAD is returned if that fails. On the
 * UP_CANCELED and DEAD actions they are released again. Every other
 * action is ignored. @hcpu carries the cpu number.
 */
static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
		unsigned long action,
		void *hcpu)
{
	int cpu = (long)hcpu;

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		if (process_zones(cpu))
			return NOTIFY_BAD;
		break;
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		free_zone_pagesets(cpu);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}
|
|
|
|
|
2006-06-27 17:54:09 +08:00
|
|
|
/* Notifier block hooking pageset_cpuup_callback() into cpu hotplug events. */
static struct notifier_block __cpuinitdata pageset_notifier = {
	.notifier_call	= pageset_cpuup_callback,
	.next		= NULL,
	.priority	= 0,
};
|
|
|
|
|
2005-12-15 17:18:25 +08:00
|
|
|
/*
 * Allocate the per_cpu_pagesets of the cpu we are currently running
 * on and register the notifier that does the same for every other
 * cpu as it comes online. Failing to allocate here is fatal.
 */
void __init setup_per_cpu_pageset(void)
{
	int ret = process_zones(smp_processor_id());

	BUG_ON(ret);
	register_cpu_notifier(&pageset_notifier);
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
2007-05-18 05:29:25 +08:00
|
|
|
static noinline __init_refok
|
2006-06-23 17:03:10 +08:00
|
|
|
int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
|
2005-10-30 09:16:50 +08:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
struct pglist_data *pgdat = zone->zone_pgdat;
|
2006-06-23 17:03:10 +08:00
|
|
|
size_t alloc_size;
|
2005-10-30 09:16:50 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The per-page waitqueue mechanism uses hashed waitqueues
|
|
|
|
* per zone.
|
|
|
|
*/
|
2006-06-23 17:03:08 +08:00
|
|
|
zone->wait_table_hash_nr_entries =
|
|
|
|
wait_table_hash_nr_entries(zone_size_pages);
|
|
|
|
zone->wait_table_bits =
|
|
|
|
wait_table_bits(zone->wait_table_hash_nr_entries);
|
2006-06-23 17:03:10 +08:00
|
|
|
alloc_size = zone->wait_table_hash_nr_entries
|
|
|
|
* sizeof(wait_queue_head_t);
|
|
|
|
|
2008-05-24 04:04:52 +08:00
|
|
|
if (!slab_is_available()) {
|
2006-06-23 17:03:10 +08:00
|
|
|
zone->wait_table = (wait_queue_head_t *)
|
|
|
|
alloc_bootmem_node(pgdat, alloc_size);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* This case means that a zone whose size was 0 gets new memory
|
|
|
|
* via memory hot-add.
|
|
|
|
* But it may be the case that a new node was hot-added. In
|
|
|
|
* this case vmalloc() will not be able to use this new node's
|
|
|
|
* memory - this wait_table must be initialized to use this new
|
|
|
|
* node itself as well.
|
|
|
|
* To use this new node's memory, further consideration will be
|
|
|
|
* necessary.
|
|
|
|
*/
|
2007-10-16 16:24:49 +08:00
|
|
|
zone->wait_table = vmalloc(alloc_size);
|
2006-06-23 17:03:10 +08:00
|
|
|
}
|
|
|
|
if (!zone->wait_table)
|
|
|
|
return -ENOMEM;
|
2005-10-30 09:16:50 +08:00
|
|
|
|
2006-06-23 17:03:08 +08:00
|
|
|
for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
|
2005-10-30 09:16:50 +08:00
|
|
|
init_waitqueue_head(zone->wait_table + i);
|
2006-06-23 17:03:10 +08:00
|
|
|
|
|
|
|
return 0;
|
2005-10-30 09:16:50 +08:00
|
|
|
}
|
|
|
|
|
2006-01-17 14:03:44 +08:00
|
|
|
static __meminit void zone_pcp_init(struct zone *zone)
|
2005-10-30 09:16:50 +08:00
|
|
|
{
|
|
|
|
int cpu;
|
|
|
|
unsigned long batch = zone_batchsize(zone);
|
|
|
|
|
|
|
|
for (cpu = 0; cpu < NR_CPUS; cpu++) {
|
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
/* Early boot. Slab allocator not functional yet */
|
2006-01-08 17:00:41 +08:00
|
|
|
zone_pcp(zone, cpu) = &boot_pageset[cpu];
|
2005-10-30 09:16:50 +08:00
|
|
|
setup_pageset(&boot_pageset[cpu],0);
|
|
|
|
#else
|
|
|
|
setup_pageset(zone_pcp(zone,cpu), batch);
|
|
|
|
#endif
|
|
|
|
}
|
2006-03-25 19:06:49 +08:00
|
|
|
if (zone->present_pages)
|
|
|
|
printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
|
|
|
|
zone->name, zone->present_pages, batch);
|
2005-10-30 09:16:50 +08:00
|
|
|
}
|
|
|
|
|
2006-06-23 17:03:10 +08:00
|
|
|
__meminit int init_currently_empty_zone(struct zone *zone,
|
|
|
|
unsigned long zone_start_pfn,
|
2007-01-11 15:15:30 +08:00
|
|
|
unsigned long size,
|
|
|
|
enum memmap_context context)
|
2005-10-30 09:16:50 +08:00
|
|
|
{
|
|
|
|
struct pglist_data *pgdat = zone->zone_pgdat;
|
2006-06-23 17:03:10 +08:00
|
|
|
int ret;
|
|
|
|
ret = zone_wait_table_init(zone, size);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2005-10-30 09:16:50 +08:00
|
|
|
pgdat->nr_zones = zone_idx(zone) + 1;
|
|
|
|
|
|
|
|
zone->zone_start_pfn = zone_start_pfn;
|
|
|
|
|
2008-02-05 14:29:26 +08:00
|
|
|
zone_init_free_lists(zone);
|
2006-06-23 17:03:10 +08:00
|
|
|
|
|
|
|
return 0;
|
2005-10-30 09:16:50 +08:00
|
|
|
}
|
|
|
|
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
|
|
|
|
/*
|
|
|
|
* Basic iterator support. Return the first range of PFNs for a node
|
|
|
|
* Note: nid == MAX_NUMNODES returns first region regardless of node
|
|
|
|
*/
|
2007-05-08 15:23:07 +08:00
|
|
|
static int __meminit first_active_region_index_in_nid(int nid)
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < nr_nodemap_entries; i++)
|
|
|
|
if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
|
|
|
|
return i;
|
|
|
|
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Basic iterator support. Return the next active range of PFNs for a node
|
2007-10-20 07:27:18 +08:00
|
|
|
* Note: nid == MAX_NUMNODES returns next region regardless of node
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
*/
|
2007-05-08 15:23:07 +08:00
|
|
|
static int __meminit next_active_region_index_in_nid(int index, int nid)
{
	int next = index + 1;

	/*
	 * Scan forward from the entry following @index. An entry matches
	 * when it belongs to @nid, or unconditionally when the caller
	 * passed MAX_NUMNODES as the node id.
	 */
	while (next < nr_nodemap_entries) {
		if (nid == MAX_NUMNODES || early_node_map[next].nid == nid)
			return next;
		next++;
	}

	/* No further matching entry in early_node_map[] */
	return -1;
}
|
|
|
|
|
|
|
|
#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
|
|
|
|
/*
|
|
|
|
* Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
|
|
|
|
* Architectures may implement their own version but if add_active_range()
|
|
|
|
* was used and there are no special requirements, this is a convenient
|
|
|
|
* alternative
|
|
|
|
*/
|
2007-05-10 18:15:27 +08:00
|
|
|
int __meminit early_pfn_to_nid(unsigned long pfn)
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < nr_nodemap_entries; i++) {
|
|
|
|
unsigned long start_pfn = early_node_map[i].start_pfn;
|
|
|
|
unsigned long end_pfn = early_node_map[i].end_pfn;
|
|
|
|
|
|
|
|
if (start_pfn <= pfn && pfn < end_pfn)
|
|
|
|
return early_node_map[i].nid;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
|
|
|
|
|
|
|
|
/*
 * Basic iterator support to walk early_node_map[].
 *
 * @i receives each index in turn and must be an assignable int expression;
 * the walk ends once the lookup helper returns -1. The macro arguments are
 * parenthesised so that non-trivial expressions expand safely.
 */
#define for_each_active_range_index_in_nid(i, nid) \
	for ((i) = first_active_region_index_in_nid(nid); (i) != -1; \
			(i) = next_active_region_index_in_nid((i), (nid)))
|
|
|
|
|
|
|
|
/**
|
|
|
|
* free_bootmem_with_active_regions - Call free_bootmem_node for each active range
|
2006-10-04 17:15:25 +08:00
|
|
|
* @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
|
|
|
|
* @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
*
|
|
|
|
* If an architecture guarantees that all ranges registered with
|
|
|
|
* add_active_ranges() contain no holes and may be freed, this
|
|
|
|
* this function may be used instead of calling free_bootmem() manually.
|
|
|
|
*/
|
|
|
|
void __init free_bootmem_with_active_regions(int nid,
|
|
|
|
unsigned long max_low_pfn)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for_each_active_range_index_in_nid(i, nid) {
|
|
|
|
unsigned long size_pages = 0;
|
|
|
|
unsigned long end_pfn = early_node_map[i].end_pfn;
|
|
|
|
|
|
|
|
if (early_node_map[i].start_pfn >= max_low_pfn)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (end_pfn > max_low_pfn)
|
|
|
|
end_pfn = max_low_pfn;
|
|
|
|
|
|
|
|
size_pages = end_pfn - early_node_map[i].start_pfn;
|
|
|
|
free_bootmem_node(NODE_DATA(early_node_map[i].nid),
|
|
|
|
PFN_PHYS(early_node_map[i].start_pfn),
|
|
|
|
size_pages << PAGE_SHIFT);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sparse_memory_present_with_active_regions - Call memory_present for each active range
|
2006-10-04 17:15:25 +08:00
|
|
|
* @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
*
|
|
|
|
* If an architecture guarantees that all ranges registered with
|
|
|
|
* add_active_ranges() contain no holes and may be freed, this
|
2006-10-04 17:15:25 +08:00
|
|
|
* function may be used instead of calling memory_present() manually.
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
*/
|
|
|
|
void __init sparse_memory_present_with_active_regions(int nid)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for_each_active_range_index_in_nid(i, nid)
|
|
|
|
memory_present(early_node_map[i].nid,
|
|
|
|
early_node_map[i].start_pfn,
|
|
|
|
early_node_map[i].end_pfn);
|
|
|
|
}
|
|
|
|
|
2006-09-27 16:49:59 +08:00
|
|
|
/**
|
|
|
|
* push_node_boundaries - Push node boundaries to at least the requested boundary
|
|
|
|
* @nid: The nid of the node to push the boundary for
|
|
|
|
* @start_pfn: The start pfn of the node
|
|
|
|
* @end_pfn: The end pfn of the node
|
|
|
|
*
|
|
|
|
* In reserve-based hot-add, mem_map is allocated that is unused until hotadd
|
|
|
|
* time. Specifically, on x86_64, SRAT will report ranges that can potentially
|
|
|
|
* be hotplugged even though no physical memory exists. This function allows
|
|
|
|
* an arch to push out the node boundaries so mem_map is allocated that can
|
|
|
|
* be used later.
|
|
|
|
*/
|
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
|
|
|
|
void __init push_node_boundaries(unsigned int nid,
|
|
|
|
unsigned long start_pfn, unsigned long end_pfn)
|
|
|
|
{
|
|
|
|
printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n",
|
|
|
|
nid, start_pfn, end_pfn);
|
|
|
|
|
|
|
|
/* Initialise the boundary for this node if necessary */
|
|
|
|
if (node_boundary_end_pfn[nid] == 0)
|
|
|
|
node_boundary_start_pfn[nid] = -1UL;
|
|
|
|
|
|
|
|
/* Update the boundaries */
|
|
|
|
if (node_boundary_start_pfn[nid] > start_pfn)
|
|
|
|
node_boundary_start_pfn[nid] = start_pfn;
|
|
|
|
if (node_boundary_end_pfn[nid] < end_pfn)
|
|
|
|
node_boundary_end_pfn[nid] = end_pfn;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If necessary, push the node boundary out for reserve hotadd */
|
2007-07-16 14:38:17 +08:00
|
|
|
static void __meminit account_node_boundary(unsigned int nid,
|
2006-09-27 16:49:59 +08:00
|
|
|
unsigned long *start_pfn, unsigned long *end_pfn)
|
|
|
|
{
|
|
|
|
printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
|
|
|
|
nid, *start_pfn, *end_pfn);
|
|
|
|
|
|
|
|
/* Return if boundary information has not been provided */
|
|
|
|
if (node_boundary_end_pfn[nid] == 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* Check the boundaries and update if necessary */
|
|
|
|
if (node_boundary_start_pfn[nid] < *start_pfn)
|
|
|
|
*start_pfn = node_boundary_start_pfn[nid];
|
|
|
|
if (node_boundary_end_pfn[nid] > *end_pfn)
|
|
|
|
*end_pfn = node_boundary_end_pfn[nid];
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
void __init push_node_boundaries(unsigned int nid,
|
|
|
|
unsigned long start_pfn, unsigned long end_pfn) {}
|
|
|
|
|
2007-07-16 14:38:17 +08:00
|
|
|
static void __meminit account_node_boundary(unsigned int nid,
|
2006-09-27 16:49:59 +08:00
|
|
|
unsigned long *start_pfn, unsigned long *end_pfn) {}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
/**
|
|
|
|
* get_pfn_range_for_nid - Return the start and end page frames for a node
|
2006-10-04 17:15:25 +08:00
|
|
|
* @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
|
|
|
|
* @start_pfn: Passed by reference. On return, it will have the node start_pfn.
|
|
|
|
* @end_pfn: Passed by reference. On return, it will have the node end_pfn.
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
*
|
|
|
|
* It returns the start and end page frame of a node based on information
|
|
|
|
* provided by an arch calling add_active_range(). If called for a node
|
|
|
|
* with no available memory, a warning is printed and the start and end
|
2006-10-04 17:15:25 +08:00
|
|
|
* PFNs will be 0.
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
*/
|
2007-05-08 15:23:07 +08:00
|
|
|
void __meminit get_pfn_range_for_nid(unsigned int nid,
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
unsigned long *start_pfn, unsigned long *end_pfn)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
*start_pfn = -1UL;
|
|
|
|
*end_pfn = 0;
|
|
|
|
|
|
|
|
for_each_active_range_index_in_nid(i, nid) {
|
|
|
|
*start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
|
|
|
|
*end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
|
|
|
|
}
|
|
|
|
|
2007-10-16 16:25:37 +08:00
|
|
|
if (*start_pfn == -1UL)
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
*start_pfn = 0;
|
2006-09-27 16:49:59 +08:00
|
|
|
|
|
|
|
/* Push the node boundaries out if requested */
|
|
|
|
account_node_boundary(nid, start_pfn, end_pfn);
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
}
|
|
|
|
|
2007-07-17 19:03:12 +08:00
|
|
|
/*
|
|
|
|
* This finds a zone that can be used for ZONE_MOVABLE pages. The
|
|
|
|
* assumption is made that zones within a node are ordered in monotonically
|
|
|
|
* increasing memory addresses so that the "highest" populated zone is used
|
|
|
|
*/
|
|
|
|
void __init find_usable_zone_for_movable(void)
|
|
|
|
{
|
|
|
|
int zone_index;
|
|
|
|
for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
|
|
|
|
if (zone_index == ZONE_MOVABLE)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (arch_zone_highest_possible_pfn[zone_index] >
|
|
|
|
arch_zone_lowest_possible_pfn[zone_index])
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
VM_BUG_ON(zone_index == -1);
|
|
|
|
movable_zone = zone_index;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The zone ranges provided by the architecture do not include ZONE_MOVABLE
|
|
|
|
* because it is sized independent of architecture. Unlike the other zones,
|
|
|
|
* the starting point for ZONE_MOVABLE is not fixed. It may be different
|
|
|
|
* in each node depending on the size of each node and how evenly kernelcore
|
|
|
|
* is distributed. This helper function adjusts the zone ranges
|
|
|
|
* provided by the architecture for a given node by using the end of the
|
|
|
|
* highest usable zone for ZONE_MOVABLE. This preserves the assumption that
|
|
|
|
* zones within a node are in order of monotonically increasing memory addresses
|
|
|
|
*/
|
|
|
|
void __meminit adjust_zone_range_for_zone_movable(int nid,
|
|
|
|
unsigned long zone_type,
|
|
|
|
unsigned long node_start_pfn,
|
|
|
|
unsigned long node_end_pfn,
|
|
|
|
unsigned long *zone_start_pfn,
|
|
|
|
unsigned long *zone_end_pfn)
|
|
|
|
{
|
|
|
|
/* Only adjust if ZONE_MOVABLE is on this node */
|
|
|
|
if (zone_movable_pfn[nid]) {
|
|
|
|
/* Size ZONE_MOVABLE */
|
|
|
|
if (zone_type == ZONE_MOVABLE) {
|
|
|
|
*zone_start_pfn = zone_movable_pfn[nid];
|
|
|
|
*zone_end_pfn = min(node_end_pfn,
|
|
|
|
arch_zone_highest_possible_pfn[movable_zone]);
|
|
|
|
|
|
|
|
/* Adjust for ZONE_MOVABLE starting within this range */
|
|
|
|
} else if (*zone_start_pfn < zone_movable_pfn[nid] &&
|
|
|
|
*zone_end_pfn > zone_movable_pfn[nid]) {
|
|
|
|
*zone_end_pfn = zone_movable_pfn[nid];
|
|
|
|
|
|
|
|
/* Check if this whole range is within ZONE_MOVABLE */
|
|
|
|
} else if (*zone_start_pfn >= zone_movable_pfn[nid])
|
|
|
|
*zone_start_pfn = *zone_end_pfn;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
/*
|
|
|
|
* Return the number of pages a zone spans in a node, including holes
|
|
|
|
* present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
|
|
|
|
*/
|
2007-07-16 14:38:20 +08:00
|
|
|
static unsigned long __meminit zone_spanned_pages_in_node(int nid,
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
unsigned long zone_type,
|
|
|
|
unsigned long *ignored)
|
|
|
|
{
|
|
|
|
unsigned long node_start_pfn, node_end_pfn;
|
|
|
|
unsigned long zone_start_pfn, zone_end_pfn;
|
|
|
|
|
|
|
|
/* Get the start and end of the node and zone */
|
|
|
|
get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
|
|
|
|
zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
|
|
|
|
zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
|
2007-07-17 19:03:12 +08:00
|
|
|
adjust_zone_range_for_zone_movable(nid, zone_type,
|
|
|
|
node_start_pfn, node_end_pfn,
|
|
|
|
&zone_start_pfn, &zone_end_pfn);
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
|
|
|
|
/* Check that this node has pages within the zone's required range */
|
|
|
|
if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* Move the zone boundaries inside the node if necessary */
|
|
|
|
zone_end_pfn = min(zone_end_pfn, node_end_pfn);
|
|
|
|
zone_start_pfn = max(zone_start_pfn, node_start_pfn);
|
|
|
|
|
|
|
|
/* Return the spanned pages */
|
|
|
|
return zone_end_pfn - zone_start_pfn;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
|
2006-10-04 17:15:25 +08:00
|
|
|
* then all holes in the requested range will be accounted for.
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
*/
|
2007-05-08 15:23:07 +08:00
|
|
|
unsigned long __meminit __absent_pages_in_range(int nid,
				unsigned long range_start_pfn,
				unsigned long range_end_pfn)
{
	/*
	 * Walks the early_node_map[] entries selected for @nid by
	 * first_active_region_index_in_nid()/next_active_region_index_in_nid()
	 * and sums the page frames of [range_start_pfn, range_end_pfn) that
	 * fall into gaps before, between and after those entries.
	 */
	unsigned long absent = 0;
	unsigned long first_start;
	unsigned long region_start;
	unsigned long last_end;
	int idx;

	/* Nothing is counted when no active region is reported for nid */
	idx = first_active_region_index_in_nid(nid);
	if (idx == -1)
		return 0;

	/* Start of the first region, clipped to the end of the range */
	first_start = early_node_map[idx].start_pfn;
	last_end = min(first_start, range_end_pfn);

	/* Part of the range that precedes the first region is a hole */
	if (first_start > range_start_pfn)
		absent = last_end - range_start_pfn;

	/*
	 * Visit each region in turn, stopping early once the end of the
	 * previously visited region has reached the end of the range.
	 */
	while (idx != -1 && last_end < range_end_pfn) {
		/* Keep both edges of the candidate hole inside the range */
		region_start = min(early_node_map[idx].start_pfn,
							range_end_pfn);
		last_end = max(last_end, range_start_pfn);

		/* Add the gap between the previous region and this one */
		if (region_start > range_start_pfn) {
			BUG_ON(last_end > region_start);
			absent += region_start - last_end;
		}

		last_end = early_node_map[idx].end_pfn;
		idx = next_active_region_index_in_nid(idx, nid);
	}

	/* Part of the range that follows the last visited region is a hole */
	if (range_end_pfn > last_end)
		absent += range_end_pfn - max(range_start_pfn, last_end);

	return absent;
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* absent_pages_in_range - Return number of page frames in holes within a range
|
|
|
|
* @start_pfn: The start PFN to start searching for holes
|
|
|
|
* @end_pfn: The end PFN to stop searching for holes
|
|
|
|
*
|
2006-10-04 17:15:25 +08:00
|
|
|
* It returns the number of pages frames in memory holes within a range.
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
*/
|
|
|
|
unsigned long __init absent_pages_in_range(unsigned long start_pfn,
|
|
|
|
unsigned long end_pfn)
|
|
|
|
{
|
|
|
|
return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Return the number of page frames in holes in a zone on a node */
|
2007-07-16 14:38:20 +08:00
|
|
|
static unsigned long __meminit zone_absent_pages_in_node(int nid,
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
unsigned long zone_type,
|
|
|
|
unsigned long *ignored)
|
|
|
|
{
|
2006-09-27 16:49:58 +08:00
|
|
|
unsigned long node_start_pfn, node_end_pfn;
|
|
|
|
unsigned long zone_start_pfn, zone_end_pfn;
|
|
|
|
|
|
|
|
get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
|
|
|
|
zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
|
|
|
|
node_start_pfn);
|
|
|
|
zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
|
|
|
|
node_end_pfn);
|
|
|
|
|
2007-07-17 19:03:12 +08:00
|
|
|
adjust_zone_range_for_zone_movable(nid, zone_type,
|
|
|
|
node_start_pfn, node_end_pfn,
|
|
|
|
&zone_start_pfn, &zone_end_pfn);
|
2006-09-27 16:49:58 +08:00
|
|
|
return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
}
|
2006-09-27 16:49:56 +08:00
|
|
|
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
#else
|
2007-07-16 14:38:20 +08:00
|
|
|
/*
 * Variant on the #else side of the surrounding conditional: the spanned
 * size of a zone is read straight from the caller-supplied zones_size[]
 * table, indexed by zone_type. nid is not consulted.
 */
static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
					unsigned long zone_type,
					unsigned long *zones_size)
{
	unsigned long spanned = zones_size[zone_type];

	return spanned;
}
|
|
|
|
|
2007-07-16 14:38:20 +08:00
|
|
|
/*
 * Look up the number of hole (absent) pages the caller recorded for a zone.
 *
 * @nid:         node id; not consulted by this variant.
 * @zone_type:   index into @zholes_size.
 * @zholes_size: per-zone hole sizes supplied by the caller, or NULL.
 *
 * Returns zholes_size[zone_type], or 0 when no hole array was supplied.
 */
static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
						unsigned long zone_type,
						unsigned long *zholes_size)
{
	/* A NULL array means the caller reported no holes at all. */
	return zholes_size ? zholes_size[zone_type] : 0;
}
|
2006-09-27 16:49:56 +08:00
|
|
|
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
#endif
|
|
|
|
|
2007-05-08 15:23:07 +08:00
|
|
|
/*
 * Work out the page totals for a node and store them in @pgdat.
 *
 * @pgdat:       node descriptor to update.
 * @zones_size:  passed through to zone_spanned_pages_in_node().
 * @zholes_size: passed through to zone_absent_pages_in_node().
 *
 * node_spanned_pages becomes the sum of the spanned pages of every zone
 * index below MAX_NR_ZONES; node_present_pages is that sum minus the
 * absent pages reported for the same zones. The present-page count is
 * also logged at KERN_DEBUG level.
 */
static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long *zholes_size)
{
	unsigned long spanned = 0;
	unsigned long absent = 0;
	unsigned long present;
	enum zone_type zt;

	/* First pass: everything the zones of this node span. */
	for (zt = 0; zt < MAX_NR_ZONES; zt++) {
		spanned += zone_spanned_pages_in_node(pgdat->node_id, zt,
							zones_size);
	}
	pgdat->node_spanned_pages = spanned;

	/* Second pass: holes inside those spans. */
	for (zt = 0; zt < MAX_NR_ZONES; zt++) {
		absent += zone_absent_pages_in_node(pgdat->node_id, zt,
							zholes_size);
	}

	present = spanned - absent;
	pgdat->node_present_pages = present;

	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
							present);
}
|
|
|
|
|
Add a bitmap that is used to track flags affecting a block of pages
Here is the latest revision of the anti-fragmentation patches. Of particular
note in this version is special treatment of high-order atomic allocations.
Care is taken to group them together and avoid grouping pages of other types
near them. Artificial tests imply that it works. I'm trying to get the
hardware together that would allow setting up of a "real" test. If anyone
already has a setup and test that can trigger the atomic-allocation problem,
I'd appreciate a test of these patches and a report. The second major change
is that these patches will apply cleanly with patches that implement
anti-fragmentation through zones.
kernbench shows effectively no performance difference varying between -0.2%
and +2% on a variety of test machines. Success rates for huge page allocation
are dramatically increased. For example, on a ppc64 machine, the vanilla
kernel was only able to allocate 1% of memory as a hugepage and this was due
to a single hugepage reserved as min_free_kbytes. With these patches applied,
17% was allocatable as superpages. With reclaim-related fixes from Andy
Whitcroft, it was 40% and further reclaim-related improvements should increase
this further.
Changelog Since V28
o Group high-order atomic allocations together
o It is no longer required to set min_free_kbytes to 10% of memory. A value
of 16384 in most cases will be sufficient
o Now applied with zone-based anti-fragmentation
o Fix incorrect VM_BUG_ON within buffered_rmqueue()
o Reorder the stack so later patches do not back out work from earlier patches
o Fix bug where journal pages were being treated as movable
o Bias placement of non-movable pages to lower PFNs
o More aggressive clustering of reclaimable pages in reaction to workloads
like updatedb that flood the size of inode caches
Changelog Since V27
o Renamed anti-fragmentation to Page Clustering. Anti-fragmentation was giving
the mistaken impression that it was the 100% solution for high order
allocations. Instead, it greatly increases the chances high-order
allocations will succeed and lays the foundation for defragmentation and
memory hot-remove to work properly
o Redefine page groupings based on ability to migrate or reclaim instead of
basing on reclaimability alone
o Get rid of spurious inits
o Per-cpu lists are no longer split up per-type. Instead the per-cpu list is
searched for a page of the appropriate type
o Added more explanation commentary
o Fix up bug in pageblock code where bitmap was used before being initialised
Changelog Since V26
o Fix double init of lists in setup_pageset
Changelog Since V25
o Fix loop order of for_each_rclmtype_order so that order of loop matches args
o gfpflags_to_rclmtype uses gfp_t instead of unsigned long
o Rename get_pageblock_type() to get_page_rclmtype()
o Fix alignment problem in move_freepages()
o Add mechanism for assigning flags to blocks of pages instead of page->flags
o On fallback, do not examine the preferred list of free pages a second time
The purpose of these patches is to reduce external fragmentation by grouping
pages of related types together. When pages are migrated (or reclaimed under
memory pressure), large contiguous pages will be freed.
This patch works by categorising allocations by their ability to migrate;
Movable - The pages may be moved with the page migration mechanism. These are
generally userspace pages.
Reclaimable - These are allocations for some kernel caches that are
reclaimable or allocations that are known to be very short-lived.
Unmovable - These are pages that are allocated by the kernel that
are not trivially reclaimed. For example, the memory allocated for a
loaded module would be in this category. By default, allocations are
considered to be of this type
HighAtomic - These are high-order allocations belonging to callers that
cannot sleep or perform any IO. In practice, this is restricted to
jumbo frame allocation for network receive. It is assumed that the
allocations are short-lived
Instead of having one MAX_ORDER-sized array of free lists in struct free_area,
there is one for each type of reclaimability. Once a 2^MAX_ORDER block of
pages is split for a type of allocation, it is added to the free-lists for
that type, in effect reserving it. Hence, over time, pages of the different
types can be clustered together.
When the preferred freelists are expired, the largest possible block is taken
from an alternative list. Buddies that are split from that large block are
placed on the preferred allocation-type freelists to mitigate fragmentation.
This implementation gives best-effort for low fragmentation in all zones.
Ideally, min_free_kbytes needs to be set to a value equal to 4 * (1 <<
(MAX_ORDER-1)) pages in most cases. This would be 16384 on x86 and x86_64 for
example.
Our tests show that about 60-70% of physical memory can be allocated on a
desktop after a few days uptime. In benchmarks and stress tests, we are
finding that 80% of memory is available as contiguous blocks at the end of the
test. To compare, a standard kernel was getting < 1% of memory as large pages
on a desktop and about 8-12% of memory as large pages at the end of stress
tests.
Following this email are 12 patches that implement the page grouping feature.
The first patch introduces a mechanism for storing flags related to a whole
block of pages. Then allocations are split between movable and all other
allocations. Following that are patches to deal with per-cpu pages and make
the mechanism configurable. The next patch moves free pages between lists
when partially allocated blocks are used for pages of another migrate type.
The second last patch groups reclaimable kernel allocations such as inode
caches together. The final patch related to groupings keeps high-order atomic
allocations.
The last two patches are more concerned with control of fragmentation. The
second last patch biases placement of non-movable allocations towards the
start of memory. This is with a view of supporting memory hot-remove of DIMMs
with higher PFNs in the future. The biasing could be enforced a lot heavier
but it would cost. The last patch aggressively clusters reclaimable pages like
inode caches together.
The fragmentation reduction strategy needs to track if pages within a block
can be moved or reclaimed so that pages are freed to the appropriate list.
This patch adds a bitmap for flags affecting a whole MAX_ORDER block of
pages.
In non-SPARSEMEM configurations, the bitmap is stored in the struct zone and
allocated during initialisation. SPARSEMEM statically allocates the bitmap in
a struct mem_section so that bitmaps do not have to be resized during memory
hotadd. This wastes a small amount of memory per unused section (usually
sizeof(unsigned long)) but the complexity of dynamically allocating the memory
is quite high.
Additional credit to Andy Whitcroft who reviewed an earlier implementation
of the mechanism and suggested how to make it a *lot* cleaner.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:25:47 +08:00
|
|
|
#ifndef CONFIG_SPARSEMEM
|
|
|
|
/*
|
|
|
|
* Calculate the size of the zone->blockflags rounded to an unsigned long
|
2007-10-16 16:26:01 +08:00
|
|
|
* Start by making sure zonesize is a multiple of pageblock_order by rounding
|
|
|
|
* up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
|
Add a bitmap that is used to track flags affecting a block of pages
Here is the latest revision of the anti-fragmentation patches. Of particular
note in this version is special treatment of high-order atomic allocations.
Care is taken to group them together and avoid grouping pages of other types
near them. Artificial tests imply that it works. I'm trying to get the
hardware together that would allow setting up of a "real" test. If anyone
already has a setup and test that can trigger the atomic-allocation problem,
I'd appreciate a test of these patches and a report. The second major change
is that these patches will apply cleanly with patches that implement
anti-fragmentation through zones.
kernbench shows effectively no performance difference varying between -0.2%
and +2% on a variety of test machines. Success rates for huge page allocation
are dramatically increased. For example, on a ppc64 machine, the vanilla
kernel was only able to allocate 1% of memory as a hugepage and this was due
to a single hugepage reserved as min_free_kbytes. With these patches applied,
17% was allocatable as superpages. With reclaim-related fixes from Andy
Whitcroft, it was 40% and further reclaim-related improvements should increase
this further.
Changelog Since V28
o Group high-order atomic allocations together
o It is no longer required to set min_free_kbytes to 10% of memory. A value
of 16384 in most cases will be sufficient
o Now applied with zone-based anti-fragmentation
o Fix incorrect VM_BUG_ON within buffered_rmqueue()
o Reorder the stack so later patches do not back out work from earlier patches
o Fix bug where journal pages were being treated as movable
o Bias placement of non-movable pages to lower PFNs
o More aggressive clustering of reclaimable pages in reaction to workloads
like updatedb that flood the size of inode caches
Changelog Since V27
o Renamed anti-fragmentation to Page Clustering. Anti-fragmentation was giving
the mistaken impression that it was the 100% solution for high order
allocations. Instead, it greatly increases the chances high-order
allocations will succeed and lays the foundation for defragmentation and
memory hot-remove to work properly
o Redefine page groupings based on ability to migrate or reclaim instead of
basing on reclaimability alone
o Get rid of spurious inits
o Per-cpu lists are no longer split up per-type. Instead the per-cpu list is
searched for a page of the appropriate type
o Added more explanation commentary
o Fix up bug in pageblock code where bitmap was used before being initialised
Changelog Since V26
o Fix double init of lists in setup_pageset
Changelog Since V25
o Fix loop order of for_each_rclmtype_order so that order of loop matches args
o gfpflags_to_rclmtype uses gfp_t instead of unsigned long
o Rename get_pageblock_type() to get_page_rclmtype()
o Fix alignment problem in move_freepages()
o Add mechanism for assigning flags to blocks of pages instead of page->flags
o On fallback, do not examine the preferred list of free pages a second time
The purpose of these patches is to reduce external fragmentation by grouping
pages of related types together. When pages are migrated (or reclaimed under
memory pressure), large contiguous pages will be freed.
This patch works by categorising allocations by their ability to migrate;
Movable - The pages may be moved with the page migration mechanism. These are
generally userspace pages.
Reclaimable - These are allocations for some kernel caches that are
reclaimable or allocations that are known to be very short-lived.
Unmovable - These are pages that are allocated by the kernel that
are not trivially reclaimed. For example, the memory allocated for a
loaded module would be in this category. By default, allocations are
considered to be of this type
HighAtomic - These are high-order allocations belonging to callers that
cannot sleep or perform any IO. In practice, this is restricted to
jumbo frame allocation for network receive. It is assumed that the
allocations are short-lived
Instead of having one MAX_ORDER-sized array of free lists in struct free_area,
there is one for each type of reclaimability. Once a 2^MAX_ORDER block of
pages is split for a type of allocation, it is added to the free-lists for
that type, in effect reserving it. Hence, over time, pages of the different
types can be clustered together.
When the preferred freelists are expired, the largest possible block is taken
from an alternative list. Buddies that are split from that large block are
placed on the preferred allocation-type freelists to mitigate fragmentation.
This implementation gives best-effort for low fragmentation in all zones.
Ideally, min_free_kbytes needs to be set to a value equal to 4 * (1 <<
(MAX_ORDER-1)) pages in most cases. This would be 16384 on x86 and x86_64 for
example.
Our tests show that about 60-70% of physical memory can be allocated on a
desktop after a few days uptime. In benchmarks and stress tests, we are
finding that 80% of memory is available as contiguous blocks at the end of the
test. To compare, a standard kernel was getting < 1% of memory as large pages
on a desktop and about 8-12% of memory as large pages at the end of stress
tests.
Following this email are 12 patches that implement the page grouping feature.
The first patch introduces a mechanism for storing flags related to a whole
block of pages. Then allocations are split between movable and all other
allocations. Following that are patches to deal with per-cpu pages and make
the mechanism configurable. The next patch moves free pages between lists
when partially allocated blocks are used for pages of another migrate type.
The second last patch groups reclaimable kernel allocations such as inode
caches together. The final patch related to groupings keeps high-order atomic
allocations.
The last two patches are more concerned with control of fragmentation. The
second last patch biases placement of non-movable allocations towards the
start of memory. This is with a view of supporting memory hot-remove of DIMMs
with higher PFNs in the future. The biasing could be enforced a lot heavier
but it would cost. The last patch aggressively clusters reclaimable pages like
inode caches together.
The fragmentation reduction strategy needs to track if pages within a block
can be moved or reclaimed so that pages are freed to the appropriate list.
This patch adds a bitmap for flags affecting a whole MAX_ORDER block of
pages.
In non-SPARSEMEM configurations, the bitmap is stored in the struct zone and
allocated during initialisation. SPARSEMEM statically allocates the bitmap in
a struct mem_section so that bitmaps do not have to be resized during memory
hotadd. This wastes a small amount of memory per unused section (usually
sizeof(unsigned long)) but the complexity of dynamically allocating the memory
is quite high.
Additional credit to Andy Whitcroft who reviewed an earlier implementation
of the mechanism and suggested how to make it a *lot* cleaner.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:25:47 +08:00
|
|
|
* round what is now in bits to nearest long in bits, then return it in
|
|
|
|
* bytes.
|
|
|
|
*/
|
|
|
|
static unsigned long __init usemap_size(unsigned long zonesize)
|
|
|
|
{
|
|
|
|
unsigned long usemapsize;
|
|
|
|
|
2007-10-16 16:26:01 +08:00
|
|
|
usemapsize = roundup(zonesize, pageblock_nr_pages);
|
|
|
|
usemapsize = usemapsize >> pageblock_order;
|
Add a bitmap that is used to track flags affecting a block of pages
Here is the latest revision of the anti-fragmentation patches. Of particular
note in this version is special treatment of high-order atomic allocations.
Care is taken to group them together and avoid grouping pages of other types
near them. Artificial tests imply that it works. I'm trying to get the
hardware together that would allow setting up of a "real" test. If anyone
already has a setup and test that can trigger the atomic-allocation problem,
I'd appreciate a test of these patches and a report. The second major change
is that these patches will apply cleanly with patches that implement
anti-fragmentation through zones.
kernbench shows effectively no performance difference varying between -0.2%
and +2% on a variety of test machines. Success rates for huge page allocation
are dramatically increased. For example, on a ppc64 machine, the vanilla
kernel was only able to allocate 1% of memory as a hugepage and this was due
to a single hugepage reserved as min_free_kbytes. With these patches applied,
17% was allocatable as superpages. With reclaim-related fixes from Andy
Whitcroft, it was 40% and further reclaim-related improvements should increase
this further.
Changelog Since V28
o Group high-order atomic allocations together
o It is no longer required to set min_free_kbytes to 10% of memory. A value
of 16384 in most cases will be sufficient
o Now applied with zone-based anti-fragmentation
o Fix incorrect VM_BUG_ON within buffered_rmqueue()
o Reorder the stack so later patches do not back out work from earlier patches
o Fix bug where journal pages were being treated as movable
o Bias placement of non-movable pages to lower PFNs
o More aggressive clustering of reclaimable pages in reaction to workloads
like updatedb that flood the size of inode caches
Changelog Since V27
o Renamed anti-fragmentation to Page Clustering. Anti-fragmentation was giving
the mistaken impression that it was the 100% solution for high order
allocations. Instead, it greatly increases the chances high-order
allocations will succeed and lays the foundation for defragmentation and
memory hot-remove to work properly
o Redefine page groupings based on ability to migrate or reclaim instead of
basing on reclaimability alone
o Get rid of spurious inits
o Per-cpu lists are no longer split up per-type. Instead the per-cpu list is
searched for a page of the appropriate type
o Added more explanation commentary
o Fix up bug in pageblock code where bitmap was used before being initialised
Changelog Since V26
o Fix double init of lists in setup_pageset
Changelog Since V25
o Fix loop order of for_each_rclmtype_order so that order of loop matches args
o gfpflags_to_rclmtype uses gfp_t instead of unsigned long
o Rename get_pageblock_type() to get_page_rclmtype()
o Fix alignment problem in move_freepages()
o Add mechanism for assigning flags to blocks of pages instead of page->flags
o On fallback, do not examine the preferred list of free pages a second time
The purpose of these patches is to reduce external fragmentation by grouping
pages of related types together. When pages are migrated (or reclaimed under
memory pressure), large contiguous pages will be freed.
This patch works by categorising allocations by their ability to migrate;
Movable - The pages may be moved with the page migration mechanism. These are
generally userspace pages.
Reclaimable - These are allocations for some kernel caches that are
reclaimable or allocations that are known to be very short-lived.
Unmovable - These are pages that are allocated by the kernel that
are not trivially reclaimed. For example, the memory allocated for a
loaded module would be in this category. By default, allocations are
considered to be of this type
HighAtomic - These are high-order allocations belonging to callers that
cannot sleep or perform any IO. In practice, this is restricted to
jumbo frame allocation for network receive. It is assumed that the
allocations are short-lived
Instead of having one MAX_ORDER-sized array of free lists in struct free_area,
there is one for each type of reclaimability. Once a 2^MAX_ORDER block of
pages is split for a type of allocation, it is added to the free-lists for
that type, in effect reserving it. Hence, over time, pages of the different
types can be clustered together.
When the preferred freelists are expired, the largest possible block is taken
from an alternative list. Buddies that are split from that large block are
placed on the preferred allocation-type freelists to mitigate fragmentation.
This implementation gives best-effort for low fragmentation in all zones.
Ideally, min_free_kbytes needs to be set to a value equal to 4 * (1 <<
(MAX_ORDER-1)) pages in most cases. This would be 16384 on x86 and x86_64 for
example.
Our tests show that about 60-70% of physical memory can be allocated on a
desktop after a few days uptime. In benchmarks and stress tests, we are
finding that 80% of memory is available as contiguous blocks at the end of the
test. To compare, a standard kernel was getting < 1% of memory as large pages
on a desktop and about 8-12% of memory as large pages at the end of stress
tests.
Following this email are 12 patches that implement the page grouping feature.
The first patch introduces a mechanism for storing flags related to a whole
block of pages. Then allocations are split between movable and all other
allocations. Following that are patches to deal with per-cpu pages and make
the mechanism configurable. The next patch moves free pages between lists
when partially allocated blocks are used for pages of another migrate type.
The second last patch groups reclaimable kernel allocations such as inode
caches together. The final patch related to groupings keeps high-order atomic
allocations.
The last two patches are more concerned with control of fragmentation. The
second last patch biases placement of non-movable allocations towards the
start of memory. This is with a view of supporting memory hot-remove of DIMMs
with higher PFNs in the future. The biasing could be enforced a lot heavier
but it would cost. The last patch aggressively clusters reclaimable pages like
inode caches together.
The fragmentation reduction strategy needs to track if pages within a block
can be moved or reclaimed so that pages are freed to the appropriate list.
This patch adds a bitmap for flags affecting a whole MAX_ORDER block of
pages.
In non-SPARSEMEM configurations, the bitmap is stored in the struct zone and
allocated during initialisation. SPARSEMEM statically allocates the bitmap in
a struct mem_section so that bitmaps do not have to be resized during memory
hotadd. This wastes a small amount of memory per unused section (usually
sizeof(unsigned long)) but the complexity of dynamically allocating the memory
is quite high.
Additional credit to Andy Whitcroft who reviewed an earlier implementation
of the mechanism and suggested how to make it a *lot* cleaner.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:25:47 +08:00
|
|
|
usemapsize *= NR_PAGEBLOCK_BITS;
|
|
|
|
usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
|
|
|
|
|
|
|
|
return usemapsize / 8;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Set up the pageblock flags bitmap of @zone.
 *
 * The bitmap size in bytes comes from usemap_size(@zonesize). A size of zero
 * leaves zone->pageblock_flags NULL; otherwise the bitmap is obtained with
 * alloc_bootmem_node() for @pgdat and explicitly cleared.
 */
static void __init setup_usemap(struct pglist_data *pgdat,
				struct zone *zone, unsigned long zonesize)
{
	unsigned long nr_bytes = usemap_size(zonesize);

	zone->pageblock_flags = NULL;

	/* Nothing to track: keep the NULL bitmap pointer. */
	if (!nr_bytes)
		return;

	zone->pageblock_flags = alloc_bootmem_node(pgdat, nr_bytes);
	memset(zone->pageblock_flags, 0, nr_bytes);
}
|
|
|
|
#else
|
|
|
|
/*
 * Stub for the other side of the CONFIG_SPARSEMEM conditional: no bitmap is
 * set up here, so this is a deliberate no-op.
 */
static inline void setup_usemap(struct pglist_data *pgdat,
				struct zone *zone, unsigned long zonesize)
{
}
|
|
|
|
#endif /* CONFIG_SPARSEMEM */
|
|
|
|
|
2007-10-16 16:26:01 +08:00
|
|
|
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
|
2007-11-29 08:21:13 +08:00
|
|
|
|
|
|
|
/* Return a sensible default order for the pageblock size. */
|
|
|
|
static inline int pageblock_default_order(void)
|
|
|
|
{
|
|
|
|
if (HPAGE_SHIFT > PAGE_SHIFT)
|
|
|
|
return HUGETLB_PAGE_ORDER;
|
|
|
|
|
|
|
|
return MAX_ORDER-1;
|
|
|
|
}
|
|
|
|
|
2007-10-16 16:26:01 +08:00
|
|
|
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
|
|
|
|
static inline void __init set_pageblock_order(unsigned int order)
{
	/*
	 * Only the first call takes effect: a non-zero pageblock_order means
	 * the value has already been set up and must not be changed.
	 */
	if (!pageblock_order) {
		/*
		 * The largest contiguous order of interest is assumed to be a
		 * huge page. On IA64 this value may vary with boot parameters.
		 */
		pageblock_order = order;
	}
}
|
|
|
|
#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
|
|
|
|
|
2007-11-29 08:21:13 +08:00
|
|
|
/*
|
|
|
|
* When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
|
|
|
|
* and pageblock_default_order() are unused as pageblock_order is set
|
|
|
|
* at compile-time. See include/linux/pageblock-flags.h for the values of
|
|
|
|
* pageblock_order based on the kernel config
|
|
|
|
*/
|
|
|
|
static inline int pageblock_default_order(unsigned int order)
|
|
|
|
{
|
|
|
|
return MAX_ORDER-1;
|
|
|
|
}
|
2007-10-16 16:26:01 +08:00
|
|
|
/* Nothing to set in this configuration: expands to an empty statement and never evaluates its argument. */
#define set_pageblock_order(x) do {} while (0)
|
|
|
|
|
|
|
|
#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Set up the zone data structures:
|
|
|
|
* - mark all pages reserved
|
|
|
|
* - mark all memory queues empty
|
|
|
|
* - clear the memory bitmaps
|
|
|
|
*/
|
2008-02-24 07:24:06 +08:00
|
|
|
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long *zholes_size)
{
	enum zone_type zid;
	int nid = pgdat->node_id;
	unsigned long zone_start_pfn = pgdat->node_start_pfn;
	int ret;

	/* Per-node state that does not depend on any individual zone. */
	pgdat_resize_init(pgdat);
	pgdat->nr_zones = 0;
	init_waitqueue_head(&pgdat->kswapd_wait);
	pgdat->kswapd_max_order = 0;

	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		struct zone *zone = &pgdat->node_zones[zid];
		unsigned long size, realsize, memmap_pages;

		/* Spanned size, and the size left once holes are removed. */
		size = zone_spanned_pages_in_node(nid, zid, zones_size);
		realsize = size - zone_absent_pages_in_node(nid, zid,
							    zholes_size);

		/*
		 * Take the pages consumed by this zone's memmap out of
		 * realsize, as that affects the watermark and per-cpu
		 * initialisations. If the memmap would not fit, realsize is
		 * left untouched and a warning is logged instead.
		 */
		memmap_pages =
			PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
		if (realsize < memmap_pages) {
			printk(KERN_WARNING
				" %s zone: %lu pages exceeds realsize %lu\n",
				zone_names[zid], memmap_pages, realsize);
		} else {
			realsize -= memmap_pages;
			printk(KERN_DEBUG
				" %s zone: %lu pages used for memmap\n",
				zone_names[zid], memmap_pages);
		}

		/* dma_reserve is charged against the first zone only. */
		if (zid == 0 && realsize > dma_reserve) {
			realsize -= dma_reserve;
			printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
					zone_names[0], dma_reserve);
		}

		/* Global page accounting; highmem is not kernel memory. */
		if (!is_highmem_idx(zid))
			nr_kernel_pages += realsize;
		nr_all_pages += realsize;

		zone->spanned_pages = size;
		zone->present_pages = realsize;
#ifdef CONFIG_NUMA
		zone->node = nid;
		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
						/ 100;
		zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
#endif
		zone->name = zone_names[zid];
		spin_lock_init(&zone->lock);
		spin_lock_init(&zone->lru_lock);
		zone_seqlock_init(zone);
		zone->zone_pgdat = pgdat;

		zone->prev_priority = DEF_PRIORITY;

		/* Per-cpu pagesets, LRU lists, scan counters and statistics. */
		zone_pcp_init(zone);
		INIT_LIST_HEAD(&zone->active_list);
		INIT_LIST_HEAD(&zone->inactive_list);
		zone->nr_scan_active = 0;
		zone->nr_scan_inactive = 0;
		zap_zone_vm_stats(zone);
		zone->flags = 0;

		/* The remaining setup only applies to zones spanning pages. */
		if (!size)
			continue;

		set_pageblock_order(pageblock_default_order());
		setup_usemap(pgdat, zone, size);
		ret = init_currently_empty_zone(zone, zone_start_pfn,
						size, MEMMAP_EARLY);
		BUG_ON(ret);
		memmap_init(size, nid, zid, zone_start_pfn);

		/* The next zone begins where this one ends. */
		zone_start_pfn += size;
	}
}
|
|
|
|
|
2007-05-18 05:29:25 +08:00
|
|
|
static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
/* Skip empty nodes */
|
|
|
|
if (!pgdat->node_spanned_pages)
|
|
|
|
return;
|
|
|
|
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 15:07:54 +08:00
|
|
|
#ifdef CONFIG_FLAT_NODE_MEM_MAP
|
2005-04-17 06:20:36 +08:00
|
|
|
/* ia64 gets its own node_mem_map, before this, without bootmem */
|
|
|
|
if (!pgdat->node_mem_map) {
|
2006-05-21 06:00:31 +08:00
|
|
|
unsigned long size, start, end;
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 15:07:54 +08:00
|
|
|
struct page *map;
|
|
|
|
|
2006-05-21 06:00:31 +08:00
|
|
|
/*
|
|
|
|
* The zone's endpoints aren't required to be MAX_ORDER
|
|
|
|
* aligned but the node_mem_map endpoints must be in order
|
|
|
|
* for the buddy allocator to function correctly.
|
|
|
|
*/
|
|
|
|
start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
|
|
|
|
end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
|
|
|
|
end = ALIGN(end, MAX_ORDER_NR_PAGES);
|
|
|
|
size = (end - start) * sizeof(struct page);
|
2005-06-23 15:07:39 +08:00
|
|
|
map = alloc_remap(pgdat->node_id, size);
|
|
|
|
if (!map)
|
|
|
|
map = alloc_bootmem_node(pgdat, size);
|
2006-05-21 06:00:31 +08:00
|
|
|
pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2007-05-31 15:40:54 +08:00
|
|
|
#ifndef CONFIG_NEED_MULTIPLE_NODES
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* With no DISCONTIG, the global mem_map is just set as node 0's
|
|
|
|
*/
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
if (pgdat == NODE_DATA(0)) {
|
2005-04-17 06:20:36 +08:00
|
|
|
mem_map = NODE_DATA(0)->node_mem_map;
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
|
|
|
|
if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
|
2008-01-09 07:33:11 +08:00
|
|
|
mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 15:07:54 +08:00
|
|
|
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2008-02-24 07:24:06 +08:00
|
|
|
/**
 * free_area_init_node - Initialise a single node's pglist_data
 * @nid: The node ID recorded in @pgdat
 * @pgdat: The node descriptor being initialised
 * @zones_size: Per-zone sizes, forwarded to the helpers below
 * @node_start_pfn: The first PFN of the node, recorded in @pgdat
 * @zholes_size: Per-zone hole sizes, forwarded to the helpers below
 *
 * Records the node's identity and starting PFN in @pgdat, then runs the
 * three initialisation steps in order: page totals, the node mem_map and
 * finally the core free-area setup.
 */
void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long node_start_pfn,
		unsigned long *zholes_size)
{
	/* Both fields must be in place before any helper inspects pgdat. */
	pgdat->node_start_pfn = node_start_pfn;
	pgdat->node_id = nid;

	calculate_node_totalpages(pgdat, zones_size, zholes_size);
	alloc_node_mem_map(pgdat);
	free_area_init_core(pgdat, zones_size, zholes_size);
}
|
|
|
|
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
|
2007-05-24 04:57:55 +08:00
|
|
|
|
|
|
|
#if MAX_NUMNODES > 1
/*
 * Work out the number of possible node ids: one more than the last node id
 * visited while walking node_possible_map (1 if the walk visits nothing).
 */
static void __init setup_nr_node_ids(void)
{
	unsigned int nid;
	unsigned int last_nid = 0;

	for_each_node_mask(nid, node_possible_map) {
		last_nid = nid;
	}

	nr_node_ids = last_nid + 1;
}
#else
/* Nothing to compute when MAX_NUMNODES is not greater than 1. */
static inline void setup_nr_node_ids(void)
{
}
#endif
|
|
|
|
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
/**
|
|
|
|
* add_active_range - Register a range of PFNs backed by physical memory
|
|
|
|
* @nid: The node ID the range resides on
|
|
|
|
* @start_pfn: The start PFN of the available physical memory
|
|
|
|
* @end_pfn: The end PFN of the available physical memory
|
|
|
|
*
|
|
|
|
* These ranges are stored in an early_node_map[] and later used by
|
|
|
|
* free_area_init_nodes() to calculate zone sizes and holes. If the
|
|
|
|
* range spans a memory hole, it is up to the architecture to ensure
|
|
|
|
* the memory is not freed by the bootmem allocator. If possible
|
|
|
|
* the range being registered will be merged with existing ranges.
|
|
|
|
*/
|
|
|
|
void __init add_active_range(unsigned int nid, unsigned long start_pfn,
|
|
|
|
unsigned long end_pfn)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
|
|
|
|
"%d entries of %d used\n",
|
|
|
|
nid, start_pfn, end_pfn,
|
|
|
|
nr_nodemap_entries, MAX_ACTIVE_REGIONS);
|
|
|
|
|
|
|
|
/* Merge with existing active regions if possible */
|
|
|
|
for (i = 0; i < nr_nodemap_entries; i++) {
|
|
|
|
if (early_node_map[i].nid != nid)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/* Skip if an existing region covers this new one */
|
|
|
|
if (start_pfn >= early_node_map[i].start_pfn &&
|
|
|
|
end_pfn <= early_node_map[i].end_pfn)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* Merge forward if suitable */
|
|
|
|
if (start_pfn <= early_node_map[i].end_pfn &&
|
|
|
|
end_pfn > early_node_map[i].end_pfn) {
|
|
|
|
early_node_map[i].end_pfn = end_pfn;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Merge backward if suitable */
|
|
|
|
if (start_pfn < early_node_map[i].end_pfn &&
|
|
|
|
end_pfn >= early_node_map[i].start_pfn) {
|
|
|
|
early_node_map[i].start_pfn = start_pfn;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Check that early_node_map is large enough */
|
|
|
|
if (i >= MAX_ACTIVE_REGIONS) {
|
|
|
|
printk(KERN_CRIT "More than %d memory regions, truncating\n",
|
|
|
|
MAX_ACTIVE_REGIONS);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
early_node_map[i].nid = nid;
|
|
|
|
early_node_map[i].start_pfn = start_pfn;
|
|
|
|
early_node_map[i].end_pfn = end_pfn;
|
|
|
|
nr_nodemap_entries = i + 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* shrink_active_range - Shrink an existing registered range of PFNs
|
|
|
|
* @nid: The node id the range is on that should be shrunk
|
|
|
|
* @old_end_pfn: The old end PFN of the range
|
|
|
|
* @new_end_pfn: The new PFN of the range
|
|
|
|
*
|
|
|
|
* i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
|
|
|
|
* The map is kept at the end physical page range that has already been
|
|
|
|
* registered with add_active_range(). This function allows an arch to shrink
|
|
|
|
* an existing registered range.
|
|
|
|
*/
|
|
|
|
void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
|
|
|
|
unsigned long new_end_pfn)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
/* Find the old active region end and shrink */
|
|
|
|
for_each_active_range_index_in_nid(i, nid)
|
|
|
|
if (early_node_map[i].end_pfn == old_end_pfn) {
|
|
|
|
early_node_map[i].end_pfn = new_end_pfn;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* remove_all_active_ranges - Remove all currently registered regions
|
2006-10-04 17:15:25 +08:00
|
|
|
*
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
* During discovery, it may be found that a table like SRAT is invalid
|
|
|
|
* and an alternative discovery method must be used. This function removes
|
|
|
|
* all currently registered regions.
|
|
|
|
*/
|
2006-10-04 17:15:25 +08:00
|
|
|
void __init remove_all_active_ranges(void)
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
{
|
|
|
|
memset(early_node_map, 0, sizeof(early_node_map));
|
|
|
|
nr_nodemap_entries = 0;
|
2006-09-27 16:49:59 +08:00
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
|
|
|
|
memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
|
|
|
|
memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
|
|
|
|
#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Compare two active node_active_regions */
|
|
|
|
static int __init cmp_node_active_region(const void *a, const void *b)
|
|
|
|
{
|
|
|
|
struct node_active_region *arange = (struct node_active_region *)a;
|
|
|
|
struct node_active_region *brange = (struct node_active_region *)b;
|
|
|
|
|
|
|
|
/* Done this way to avoid overflows */
|
|
|
|
if (arange->start_pfn > brange->start_pfn)
|
|
|
|
return 1;
|
|
|
|
if (arange->start_pfn < brange->start_pfn)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* sort the node_map by start_pfn */
|
|
|
|
static void __init sort_node_map(void)
|
|
|
|
{
|
|
|
|
sort(early_node_map, (size_t)nr_nodemap_entries,
|
|
|
|
sizeof(struct node_active_region),
|
|
|
|
cmp_node_active_region, NULL);
|
|
|
|
}
|
|
|
|
|
2007-02-10 17:42:57 +08:00
|
|
|
/* Find the lowest pfn for a node */
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
unsigned long __init find_min_pfn_for_node(unsigned long nid)
|
|
|
|
{
|
|
|
|
int i;
|
2007-02-10 17:42:57 +08:00
|
|
|
unsigned long min_pfn = ULONG_MAX;
|
2006-11-23 20:01:41 +08:00
|
|
|
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
/* Assuming a sorted map, the first range found has the starting pfn */
|
|
|
|
for_each_active_range_index_in_nid(i, nid)
|
2007-02-10 17:42:57 +08:00
|
|
|
min_pfn = min(min_pfn, early_node_map[i].start_pfn);
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
|
2007-02-10 17:42:57 +08:00
|
|
|
if (min_pfn == ULONG_MAX) {
|
|
|
|
printk(KERN_WARNING
|
|
|
|
"Could not find start_pfn for node %lu\n", nid);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return min_pfn;
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* find_min_pfn_with_active_regions - Find the minimum PFN registered
|
|
|
|
*
|
|
|
|
* It returns the minimum PFN based on information provided via
|
2006-10-04 17:15:25 +08:00
|
|
|
* add_active_range().
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
*/
|
|
|
|
unsigned long __init find_min_pfn_with_active_regions(void)
|
|
|
|
{
|
|
|
|
return find_min_pfn_for_node(MAX_NUMNODES);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* find_max_pfn_with_active_regions - Find the maximum PFN registered
|
|
|
|
*
|
|
|
|
* It returns the maximum PFN based on information provided via
|
2006-10-04 17:15:25 +08:00
|
|
|
* add_active_range().
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
*/
|
|
|
|
unsigned long __init find_max_pfn_with_active_regions(void)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
unsigned long max_pfn = 0;
|
|
|
|
|
|
|
|
for (i = 0; i < nr_nodemap_entries; i++)
|
|
|
|
max_pfn = max(max_pfn, early_node_map[i].end_pfn);
|
|
|
|
|
|
|
|
return max_pfn;
|
|
|
|
}
|
|
|
|
|
2007-10-16 16:25:39 +08:00
|
|
|
/*
|
|
|
|
* early_calculate_totalpages()
|
|
|
|
* Sum pages in active regions for movable zone.
|
|
|
|
* Populate N_HIGH_MEMORY for calculating usable_nodes.
|
|
|
|
*/
|
2007-10-16 16:26:03 +08:00
|
|
|
static unsigned long __init early_calculate_totalpages(void)
|
2007-07-17 19:03:15 +08:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
unsigned long totalpages = 0;
|
|
|
|
|
2007-10-16 16:25:39 +08:00
|
|
|
for (i = 0; i < nr_nodemap_entries; i++) {
|
|
|
|
unsigned long pages = early_node_map[i].end_pfn -
|
2007-07-17 19:03:15 +08:00
|
|
|
early_node_map[i].start_pfn;
|
2007-10-16 16:25:39 +08:00
|
|
|
totalpages += pages;
|
|
|
|
if (pages)
|
|
|
|
node_set_state(early_node_map[i].nid, N_HIGH_MEMORY);
|
|
|
|
}
|
|
|
|
return totalpages;
|
2007-07-17 19:03:15 +08:00
|
|
|
}
|
|
|
|
|
2007-07-17 19:03:12 +08:00
|
|
|
/*
|
|
|
|
* Find the PFN the Movable zone begins in each node. Kernel memory
|
|
|
|
* is spread evenly between nodes as long as the nodes have enough
|
|
|
|
* memory. When they don't, some nodes will have more kernelcore than
|
|
|
|
* others
|
|
|
|
*/
|
|
|
|
void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
|
|
|
|
{
|
|
|
|
int i, nid;
|
|
|
|
unsigned long usable_startpfn;
|
|
|
|
unsigned long kernelcore_node, kernelcore_remaining;
|
2007-10-16 16:25:39 +08:00
|
|
|
unsigned long totalpages = early_calculate_totalpages();
|
|
|
|
int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
|
2007-07-17 19:03:12 +08:00
|
|
|
|
2007-07-17 19:03:15 +08:00
|
|
|
/*
|
|
|
|
* If movablecore was specified, calculate what size of
|
|
|
|
* kernelcore that corresponds so that memory usable for
|
|
|
|
* any allocation type is evenly spread. If both kernelcore
|
|
|
|
* and movablecore are specified, then the value of kernelcore
|
|
|
|
* will be used for required_kernelcore if it's greater than
|
|
|
|
* what movablecore would have allowed.
|
|
|
|
*/
|
|
|
|
if (required_movablecore) {
|
|
|
|
unsigned long corepages;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Round-up so that ZONE_MOVABLE is at least as large as what
|
|
|
|
* was requested by the user
|
|
|
|
*/
|
|
|
|
required_movablecore =
|
|
|
|
roundup(required_movablecore, MAX_ORDER_NR_PAGES);
|
|
|
|
corepages = totalpages - required_movablecore;
|
|
|
|
|
|
|
|
required_kernelcore = max(required_kernelcore, corepages);
|
|
|
|
}
|
|
|
|
|
2007-07-17 19:03:12 +08:00
|
|
|
/* If kernelcore was not specified, there is no ZONE_MOVABLE */
|
|
|
|
if (!required_kernelcore)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
|
|
|
|
find_usable_zone_for_movable();
|
|
|
|
usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
|
|
|
|
|
|
|
|
restart:
|
|
|
|
/* Spread kernelcore memory as evenly as possible throughout nodes */
|
|
|
|
kernelcore_node = required_kernelcore / usable_nodes;
|
2007-10-16 16:25:39 +08:00
|
|
|
for_each_node_state(nid, N_HIGH_MEMORY) {
|
2007-07-17 19:03:12 +08:00
|
|
|
/*
|
|
|
|
* Recalculate kernelcore_node if the division per node
|
|
|
|
* now exceeds what is necessary to satisfy the requested
|
|
|
|
* amount of memory for the kernel
|
|
|
|
*/
|
|
|
|
if (required_kernelcore < kernelcore_node)
|
|
|
|
kernelcore_node = required_kernelcore / usable_nodes;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* As the map is walked, we track how much memory is usable
|
|
|
|
* by the kernel using kernelcore_remaining. When it is
|
|
|
|
* 0, the rest of the node is usable by ZONE_MOVABLE
|
|
|
|
*/
|
|
|
|
kernelcore_remaining = kernelcore_node;
|
|
|
|
|
|
|
|
/* Go through each range of PFNs within this node */
|
|
|
|
for_each_active_range_index_in_nid(i, nid) {
|
|
|
|
unsigned long start_pfn, end_pfn;
|
|
|
|
unsigned long size_pages;
|
|
|
|
|
|
|
|
start_pfn = max(early_node_map[i].start_pfn,
|
|
|
|
zone_movable_pfn[nid]);
|
|
|
|
end_pfn = early_node_map[i].end_pfn;
|
|
|
|
if (start_pfn >= end_pfn)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/* Account for what is only usable for kernelcore */
|
|
|
|
if (start_pfn < usable_startpfn) {
|
|
|
|
unsigned long kernel_pages;
|
|
|
|
kernel_pages = min(end_pfn, usable_startpfn)
|
|
|
|
- start_pfn;
|
|
|
|
|
|
|
|
kernelcore_remaining -= min(kernel_pages,
|
|
|
|
kernelcore_remaining);
|
|
|
|
required_kernelcore -= min(kernel_pages,
|
|
|
|
required_kernelcore);
|
|
|
|
|
|
|
|
/* Continue if range is now fully accounted */
|
|
|
|
if (end_pfn <= usable_startpfn) {
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Push zone_movable_pfn to the end so
|
|
|
|
* that if we have to rebalance
|
|
|
|
* kernelcore across nodes, we will
|
|
|
|
* not double account here
|
|
|
|
*/
|
|
|
|
zone_movable_pfn[nid] = end_pfn;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
start_pfn = usable_startpfn;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The usable PFN range for ZONE_MOVABLE is from
|
|
|
|
* start_pfn->end_pfn. Calculate size_pages as the
|
|
|
|
* number of pages used as kernelcore
|
|
|
|
*/
|
|
|
|
size_pages = end_pfn - start_pfn;
|
|
|
|
if (size_pages > kernelcore_remaining)
|
|
|
|
size_pages = kernelcore_remaining;
|
|
|
|
zone_movable_pfn[nid] = start_pfn + size_pages;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Some kernelcore has been met, update counts and
|
|
|
|
* break if the kernelcore for this node has been
|
|
|
|
* satisfied
|
|
|
|
*/
|
|
|
|
required_kernelcore -= min(required_kernelcore,
|
|
|
|
size_pages);
|
|
|
|
kernelcore_remaining -= size_pages;
|
|
|
|
if (!kernelcore_remaining)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If there is still required_kernelcore, we do another pass with one
|
|
|
|
* less node in the count. This will push zone_movable_pfn[nid] further
|
|
|
|
* along on the nodes that still have memory until kernelcore is
|
|
|
|
* satisfied
|
|
|
|
*/
|
|
|
|
usable_nodes--;
|
|
|
|
if (usable_nodes && required_kernelcore > usable_nodes)
|
|
|
|
goto restart;
|
|
|
|
|
|
|
|
/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
|
|
|
|
for (nid = 0; nid < MAX_NUMNODES; nid++)
|
|
|
|
zone_movable_pfn[nid] =
|
|
|
|
roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
|
|
|
|
}
|
|
|
|
|
2007-10-16 16:25:39 +08:00
|
|
|
/* Any regular memory on that node ? */
|
|
|
|
static void check_for_regular_memory(pg_data_t *pgdat)
|
|
|
|
{
|
|
|
|
#ifdef CONFIG_HIGHMEM
|
|
|
|
enum zone_type zone_type;
|
|
|
|
|
|
|
|
for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
|
|
|
|
struct zone *zone = &pgdat->node_zones[zone_type];
|
|
|
|
if (zone->present_pages)
|
|
|
|
node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
/**
|
|
|
|
* free_area_init_nodes - Initialise all pg_data_t and zone data
|
2006-10-04 17:15:25 +08:00
|
|
|
* @max_zone_pfn: an array of max PFNs for each zone
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
*
|
|
|
|
* This will call free_area_init_node() for each active node in the system.
|
|
|
|
* Using the page ranges provided by add_active_range(), the size of each
|
|
|
|
* zone in each node and their holes is calculated. If the maximum PFN
|
|
|
|
* between two adjacent zones match, it is assumed that the zone is empty.
|
|
|
|
* For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
|
|
|
|
* that ZONE_DMA32 has no pages. It is also assumed that a zone
|
|
|
|
* starts where the previous one ended. For example, ZONE_DMA32 starts
|
|
|
|
* at arch_max_dma_pfn.
|
|
|
|
*/
|
|
|
|
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
|
|
|
|
{
|
|
|
|
unsigned long nid;
|
|
|
|
enum zone_type i;
|
|
|
|
|
2007-02-10 17:42:57 +08:00
|
|
|
/* Sort early_node_map as initialisation assumes it is sorted */
|
|
|
|
sort_node_map();
|
|
|
|
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
/* Record where the zone boundaries are */
|
|
|
|
memset(arch_zone_lowest_possible_pfn, 0,
|
|
|
|
sizeof(arch_zone_lowest_possible_pfn));
|
|
|
|
memset(arch_zone_highest_possible_pfn, 0,
|
|
|
|
sizeof(arch_zone_highest_possible_pfn));
|
|
|
|
arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
|
|
|
|
arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
|
|
|
|
for (i = 1; i < MAX_NR_ZONES; i++) {
|
2007-07-17 19:03:12 +08:00
|
|
|
if (i == ZONE_MOVABLE)
|
|
|
|
continue;
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
arch_zone_lowest_possible_pfn[i] =
|
|
|
|
arch_zone_highest_possible_pfn[i-1];
|
|
|
|
arch_zone_highest_possible_pfn[i] =
|
|
|
|
max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
|
|
|
|
}
|
2007-07-17 19:03:12 +08:00
|
|
|
arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
|
|
|
|
arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
|
|
|
|
|
|
|
|
/* Find the PFNs that ZONE_MOVABLE begins at in each node */
|
|
|
|
memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
|
|
|
|
find_zone_movable_pfns_for_nodes(zone_movable_pfn);
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
|
|
|
|
/* Print out the zone ranges */
|
|
|
|
printk("Zone PFN ranges:\n");
|
2007-07-17 19:03:12 +08:00
|
|
|
for (i = 0; i < MAX_NR_ZONES; i++) {
|
|
|
|
if (i == ZONE_MOVABLE)
|
|
|
|
continue;
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
printk(" %-8s %8lu -> %8lu\n",
|
|
|
|
zone_names[i],
|
|
|
|
arch_zone_lowest_possible_pfn[i],
|
|
|
|
arch_zone_highest_possible_pfn[i]);
|
2007-07-17 19:03:12 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Print out the PFNs ZONE_MOVABLE begins at in each node */
|
|
|
|
printk("Movable zone start PFN for each node\n");
|
|
|
|
for (i = 0; i < MAX_NUMNODES; i++) {
|
|
|
|
if (zone_movable_pfn[i])
|
|
|
|
printk(" Node %d: %lu\n", i, zone_movable_pfn[i]);
|
|
|
|
}
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
|
|
|
|
/* Print out the early_node_map[] */
|
|
|
|
printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
|
|
|
|
for (i = 0; i < nr_nodemap_entries; i++)
|
|
|
|
printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid,
|
|
|
|
early_node_map[i].start_pfn,
|
|
|
|
early_node_map[i].end_pfn);
|
|
|
|
|
|
|
|
/* Initialise every node */
|
2007-02-21 05:57:52 +08:00
|
|
|
setup_nr_node_ids();
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
for_each_online_node(nid) {
|
|
|
|
pg_data_t *pgdat = NODE_DATA(nid);
|
|
|
|
free_area_init_node(nid, pgdat, NULL,
|
|
|
|
find_min_pfn_for_node(nid), NULL);
|
2007-10-16 16:25:39 +08:00
|
|
|
|
|
|
|
/* Any memory on that node */
|
|
|
|
if (pgdat->node_present_pages)
|
|
|
|
node_set_state(nid, N_HIGH_MEMORY);
|
|
|
|
check_for_regular_memory(pgdat);
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
}
|
|
|
|
}
|
2007-07-17 19:03:12 +08:00
|
|
|
|
2007-07-17 19:03:15 +08:00
|
|
|
/*
 * Parse a memory size given on the kernel command line and store it in
 * *core as a page count. Returns -EINVAL when no argument string was
 * supplied, 0 otherwise.
 */
static int __init cmdline_parse_core(char *p, unsigned long *core)
{
	unsigned long long bytes;
	unsigned long long pages;

	if (p == NULL)
		return -EINVAL;

	bytes = memparse(p, &p);
	pages = bytes >> PAGE_SHIFT;
	*core = pages;

	/* Paranoid check that an unsigned long can hold the page count */
	WARN_ON(pages > ULONG_MAX);

	return 0;
}
|
2007-07-17 19:03:14 +08:00
|
|
|
|
2007-07-17 19:03:15 +08:00
|
|
|
/*
|
|
|
|
* kernelcore=size sets the amount of memory for use for allocations that
|
|
|
|
* cannot be reclaimed or migrated.
|
|
|
|
*/
|
|
|
|
static int __init cmdline_parse_kernelcore(char *p)
|
|
|
|
{
|
|
|
|
return cmdline_parse_core(p, &required_kernelcore);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* movablecore=size sets the amount of memory for use for allocations that
|
|
|
|
* can be reclaimed or migrated.
|
|
|
|
*/
|
|
|
|
static int __init cmdline_parse_movablecore(char *p)
|
|
|
|
{
|
|
|
|
return cmdline_parse_core(p, &required_movablecore);
|
|
|
|
}
|
|
|
|
|
2007-07-17 19:03:14 +08:00
|
|
|
early_param("kernelcore", cmdline_parse_kernelcore);
|
2007-07-17 19:03:15 +08:00
|
|
|
early_param("movablecore", cmdline_parse_movablecore);
|
2007-07-17 19:03:14 +08:00
|
|
|
|
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located. Once located, the code to calculate zone
sizes and holes in each architecture is very similar. Some of this zone and
hole sizing code is difficult to read for no good reason. This set of patches
eliminates the similar-looking architecture-specific code.
The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range(). When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones. The zone sizes and holes are then calculated in an architecture
independent manner.
Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
It adjusts the watermarks slightly
Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig. Bob Picco has also tested and debugged on
IA64. Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine. These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.
There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.
The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy. There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.
Additional credit;
Dave Hansen for the initial suggestion and comments on early patches
Andy Whitcroft for reviewing early versions and catching numerous
errors
Tony Luck for testing and debugging on IA64
Bob Picco for fixing bugs related to pfn registration, reviewing a
number of patch revisions, providing a number of suggestions
on future direction and testing heavily
Jack Steiner and Robin Holt for testing on IA64 and clarifying
issues related to memory holes
Yasunori for testing on IA64
Andi Kleen for reviewing and feeding back about x86_64
Christian Kujau for providing valuable information related to ACPI
problems on x86_64 and testing potential fixes
This patch:
Define the structure to represent an active range of page frames within a node
in an architecture independent manner. Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 16:49:43 +08:00
|
|
|
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
|
|
|
|
|
2006-09-27 16:49:56 +08:00
|
|
|
/**
|
2006-10-04 17:15:25 +08:00
|
|
|
* set_dma_reserve - set the specified number of pages reserved in the first zone
|
|
|
|
* @new_dma_reserve: The number of pages to mark reserved
|
2006-09-27 16:49:56 +08:00
|
|
|
*
|
|
|
|
* The per-cpu batchsize and zone watermarks are determined by present_pages.
|
|
|
|
* In the DMA zone, a significant percentage may be consumed by kernel image
|
|
|
|
* and other unfreeable allocations which can skew the watermarks badly. This
|
2006-10-04 17:15:25 +08:00
|
|
|
* function may optionally be used to account for unfreeable pages in the
|
|
|
|
* first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
|
|
|
|
* smaller per-cpu batchsize.
|
2006-09-27 16:49:56 +08:00
|
|
|
*/
|
|
|
|
void __init set_dma_reserve(unsigned long new_dma_reserve)
|
|
|
|
{
|
|
|
|
dma_reserve = new_dma_reserve;
|
|
|
|
}
|
|
|
|
|
2005-06-23 15:07:47 +08:00
|
|
|
#ifndef CONFIG_NEED_MULTIPLE_NODES
|
2005-04-17 06:20:36 +08:00
|
|
|
static bootmem_data_t contig_bootmem_data;
|
|
|
|
struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
|
|
|
|
|
|
|
|
EXPORT_SYMBOL(contig_page_data);
|
2005-06-23 15:07:47 +08:00
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
 * Initialise node 0 from the caller-supplied zone sizes. The node is taken
 * to start at the page frame backing PAGE_OFFSET and no hole sizes are
 * passed.
 */
void __init free_area_init(unsigned long *zones_size)
{
	unsigned long start_pfn = __pa(PAGE_OFFSET) >> PAGE_SHIFT;

	free_area_init_node(0, NODE_DATA(0), zones_size, start_pfn, NULL);
}
|
|
|
|
|
|
|
|
static int page_alloc_cpu_notify(struct notifier_block *self,
|
|
|
|
unsigned long action, void *hcpu)
|
|
|
|
{
|
|
|
|
int cpu = (unsigned long)hcpu;
|
|
|
|
|
2007-05-09 17:35:10 +08:00
|
|
|
if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
|
2008-02-05 14:29:11 +08:00
|
|
|
drain_pages(cpu);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Spill the event counters of the dead processor
|
|
|
|
* into the current processors event counters.
|
|
|
|
* This artificially elevates the count of the current
|
|
|
|
* processor.
|
|
|
|
*/
|
2006-06-30 16:55:45 +08:00
|
|
|
vm_events_fold_cpu(cpu);
|
2008-02-05 14:29:11 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Zero the differential counters of the dead processor
|
|
|
|
* so that the vm statistics are consistent.
|
|
|
|
*
|
|
|
|
* This is only okay since the processor is dead and cannot
|
|
|
|
* race with what we are doing.
|
|
|
|
*/
|
2006-06-30 16:55:33 +08:00
|
|
|
refresh_cpu_vm_stats(cpu);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
return NOTIFY_OK;
|
|
|
|
}
|
|
|
|
|
|
|
|
void __init page_alloc_init(void)
|
|
|
|
{
|
|
|
|
hotcpu_notifier(page_alloc_cpu_notify, 0);
|
|
|
|
}
|
|
|
|
|
2006-04-11 13:52:59 +08:00
|
|
|
/*
|
|
|
|
* calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
|
|
|
|
* or min_free_kbytes changes.
|
|
|
|
*/
|
|
|
|
static void calculate_totalreserve_pages(void)
|
|
|
|
{
|
|
|
|
struct pglist_data *pgdat;
|
|
|
|
unsigned long reserve_pages = 0;
|
2006-09-26 14:31:18 +08:00
|
|
|
enum zone_type i, j;
|
2006-04-11 13:52:59 +08:00
|
|
|
|
|
|
|
for_each_online_pgdat(pgdat) {
|
|
|
|
for (i = 0; i < MAX_NR_ZONES; i++) {
|
|
|
|
struct zone *zone = pgdat->node_zones + i;
|
|
|
|
unsigned long max = 0;
|
|
|
|
|
|
|
|
/* Find valid and maximum lowmem_reserve in the zone */
|
|
|
|
for (j = i; j < MAX_NR_ZONES; j++) {
|
|
|
|
if (zone->lowmem_reserve[j] > max)
|
|
|
|
max = zone->lowmem_reserve[j];
|
|
|
|
}
|
|
|
|
|
|
|
|
/* we treat pages_high as reserved pages. */
|
|
|
|
max += zone->pages_high;
|
|
|
|
|
|
|
|
if (max > zone->present_pages)
|
|
|
|
max = zone->present_pages;
|
|
|
|
reserve_pages += max;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
totalreserve_pages = reserve_pages;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* setup_per_zone_lowmem_reserve - called whenever
|
|
|
|
* sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
|
|
|
|
* has a correct pages reserved value, so an adequate number of
|
|
|
|
* pages are left in the zone after a successful __alloc_pages().
|
|
|
|
*/
|
|
|
|
static void setup_per_zone_lowmem_reserve(void)
|
|
|
|
{
|
|
|
|
struct pglist_data *pgdat;
|
2006-09-26 14:31:18 +08:00
|
|
|
enum zone_type j, idx;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-03-27 17:15:59 +08:00
|
|
|
for_each_online_pgdat(pgdat) {
|
2005-04-17 06:20:36 +08:00
|
|
|
for (j = 0; j < MAX_NR_ZONES; j++) {
|
|
|
|
struct zone *zone = pgdat->node_zones + j;
|
|
|
|
unsigned long present_pages = zone->present_pages;
|
|
|
|
|
|
|
|
zone->lowmem_reserve[j] = 0;
|
|
|
|
|
2006-09-26 14:31:18 +08:00
|
|
|
idx = j;
|
|
|
|
while (idx) {
|
2005-04-17 06:20:36 +08:00
|
|
|
struct zone *lower_zone;
|
|
|
|
|
2006-09-26 14:31:18 +08:00
|
|
|
idx--;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
if (sysctl_lowmem_reserve_ratio[idx] < 1)
|
|
|
|
sysctl_lowmem_reserve_ratio[idx] = 1;
|
|
|
|
|
|
|
|
lower_zone = pgdat->node_zones + idx;
|
|
|
|
lower_zone->lowmem_reserve[j] = present_pages /
|
|
|
|
sysctl_lowmem_reserve_ratio[idx];
|
|
|
|
present_pages += lower_zone->present_pages;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2006-04-11 13:52:59 +08:00
|
|
|
|
|
|
|
/* update totalreserve_pages */
|
|
|
|
calculate_totalreserve_pages();
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2006-10-04 17:15:25 +08:00
|
|
|
/**
|
|
|
|
* setup_per_zone_pages_min - called when min_free_kbytes changes.
|
|
|
|
*
|
|
|
|
* Ensures that the pages_{min,low,high} values for each zone are set correctly
|
|
|
|
* with respect to min_free_kbytes.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2005-10-30 09:16:54 +08:00
|
|
|
void setup_per_zone_pages_min(void)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
|
|
|
|
unsigned long lowmem_pages = 0;
|
|
|
|
struct zone *zone;
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
/* Calculate total number of !ZONE_HIGHMEM pages */
|
|
|
|
for_each_zone(zone) {
|
|
|
|
if (!is_highmem(zone))
|
|
|
|
lowmem_pages += zone->present_pages;
|
|
|
|
}
|
|
|
|
|
|
|
|
for_each_zone(zone) {
|
2006-05-16 00:43:59 +08:00
|
|
|
u64 tmp;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
spin_lock_irqsave(&zone->lru_lock, flags);
|
2006-05-16 00:43:59 +08:00
|
|
|
tmp = (u64)pages_min * zone->present_pages;
|
|
|
|
do_div(tmp, lowmem_pages);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (is_highmem(zone)) {
|
|
|
|
/*
|
2005-11-14 08:06:45 +08:00
|
|
|
* __GFP_HIGH and PF_MEMALLOC allocations usually don't
|
|
|
|
* need highmem pages, so cap pages_min to a small
|
|
|
|
* value here.
|
|
|
|
*
|
|
|
|
* The (pages_high-pages_low) and (pages_low-pages_min)
|
|
|
|
* deltas controls asynch page reclaim, and so should
|
|
|
|
* not be capped for highmem.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
int min_pages;
|
|
|
|
|
|
|
|
min_pages = zone->present_pages / 1024;
|
|
|
|
if (min_pages < SWAP_CLUSTER_MAX)
|
|
|
|
min_pages = SWAP_CLUSTER_MAX;
|
|
|
|
if (min_pages > 128)
|
|
|
|
min_pages = 128;
|
|
|
|
zone->pages_min = min_pages;
|
|
|
|
} else {
|
2005-11-14 08:06:45 +08:00
|
|
|
/*
|
|
|
|
* If it's a lowmem zone, reserve a number of pages
|
2005-04-17 06:20:36 +08:00
|
|
|
* proportionate to the zone's size.
|
|
|
|
*/
|
2005-11-14 08:06:45 +08:00
|
|
|
zone->pages_min = tmp;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2006-05-16 00:43:59 +08:00
|
|
|
zone->pages_low = zone->pages_min + (tmp >> 2);
|
|
|
|
zone->pages_high = zone->pages_min + (tmp >> 1);
|
Bias the location of pages freed for min_free_kbytes in the same MAX_ORDER_NR_PAGES blocks
The standard buddy allocator always favours the smallest block of pages.
The effect of this is that the pages free to satisfy min_free_kbytes tends
to be preserved since boot time at the same location of memory for a very
long time and as a contiguous block. When an administrator sets the
reserve at 16384 at boot time, it tends to be the same MAX_ORDER blocks
that remain free. This allows the occasional high atomic allocation to
succeed up until the point the blocks are split. In practice, it is
difficult to split these blocks but when they do split, the benefit of
having min_free_kbytes for contiguous blocks disappears. Additionally,
increasing min_free_kbytes once the system has been running for some time
has no guarantee of creating contiguous blocks.
On the other hand, CONFIG_PAGE_GROUP_BY_MOBILITY favours splitting large
blocks when there are no free pages of the appropriate type available. A
side-effect of this is that all blocks in memory tends to be used up and
the contiguous free blocks from boot time are not preserved like in the
vanilla allocator. This can cause a problem if a new caller is unwilling
to reclaim or does not reclaim for long enough.
A failure scenario was found for a wireless network device allocating
order-1 atomic allocations but the allocations were not intense or frequent
enough for a whole block of pages to be preserved for MIGRATE_HIGHALLOC.
This was reproduced on a desktop by booting with mem=256mb, forcing the
driver to allocate at order-1, running a bittorrent client (downloading a
debian ISO) and building a kernel with -j2.
This patch addresses the problem on the desktop machine booted with
mem=256mb. It works by setting aside a reserve of MAX_ORDER_NR_PAGES
blocks, the number of which depends on the value of min_free_kbytes. These
blocks are only fallen back to when there are no other free pages. Then the
smallest possible page is used just like the normal buddy allocator instead
of the largest possible page to preserve contiguous pages. The pages in free
lists in the reserve blocks are never taken for another migrate type. The
result is that even if min_free_kbytes is set to a low value, contiguous
blocks will be preserved in the MIGRATE_RESERVE blocks.
This works better than the vanilla allocator because if min_free_kbytes is
increased, a new reserve block will be chosen based on the location of
reclaimable pages and the block will free up as contiguous pages. In the
vanilla allocator, no effort is made to target a block of pages to free as
contiguous pages and min_free_kbytes pages are scattered randomly.
This effect has been observed on the test machine. min_free_kbytes was set
initially low but it was kept as a contiguous free block within
MIGRATE_RESERVE. min_free_kbytes was then set to a higher value and over a
period of time, the free blocks were within the reserve and coalescing.
How long it takes to free up depends on how quickly LRU is rotating.
Amusingly, this means that more activity will free the blocks faster.
This mechanism potentially replaces MIGRATE_HIGHALLOC as it may be more
effective than grouping contiguous free pages together. It all depends on
whether the number of active atomic high allocations exceeds
min_free_kbytes or not. If the number of active allocations exceeds
min_free_kbytes, it's worth it but maybe in that situation, min_free_kbytes
should be set higher. Once there are no more reports of allocation
failures, a patch will be submitted that backs out MIGRATE_HIGHALLOC and
see if the reports stay missing.
Credit to Mariusz Kozlowski for discovering the problem, describing the
failure scenario and testing patches and scenarios.
[akpm@linux-foundation.org: cleanups]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:25:58 +08:00
|
|
|
setup_zone_migrate_reserve(zone);
|
2005-04-17 06:20:36 +08:00
|
|
|
spin_unlock_irqrestore(&zone->lru_lock, flags);
|
|
|
|
}
|
2006-04-11 13:52:59 +08:00
|
|
|
|
|
|
|
/* update totalreserve_pages */
|
|
|
|
calculate_totalreserve_pages();
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialise min_free_kbytes.
|
|
|
|
*
|
|
|
|
* For small machines we want it small (128k min). For large machines
|
|
|
|
* we want it large (64MB max). But it is not linear, because network
|
|
|
|
* bandwidth does not increase linearly with machine size. We use
|
|
|
|
*
|
|
|
|
* min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
|
|
|
|
* min_free_kbytes = sqrt(lowmem_kbytes * 16)
|
|
|
|
*
|
|
|
|
* which yields
|
|
|
|
*
|
|
|
|
* 16MB: 512k
|
|
|
|
* 32MB: 724k
|
|
|
|
* 64MB: 1024k
|
|
|
|
* 128MB: 1448k
|
|
|
|
* 256MB: 2048k
|
|
|
|
* 512MB: 2896k
|
|
|
|
* 1024MB: 4096k
|
|
|
|
* 2048MB: 5792k
|
|
|
|
* 4096MB: 8192k
|
|
|
|
* 8192MB: 11584k
|
|
|
|
* 16384MB: 16384k
|
|
|
|
*/
|
|
|
|
static int __init init_per_zone_pages_min(void)
|
|
|
|
{
|
|
|
|
unsigned long lowmem_kbytes;
|
|
|
|
|
|
|
|
lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
|
|
|
|
|
|
|
|
min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
|
|
|
|
if (min_free_kbytes < 128)
|
|
|
|
min_free_kbytes = 128;
|
|
|
|
if (min_free_kbytes > 65536)
|
|
|
|
min_free_kbytes = 65536;
|
|
|
|
setup_per_zone_pages_min();
|
|
|
|
setup_per_zone_lowmem_reserve();
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
module_init(init_per_zone_pages_min)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
|
|
|
|
* that we can call two helper functions whenever min_free_kbytes
|
|
|
|
* changes.
|
|
|
|
*/
|
|
|
|
int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
|
|
|
|
struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
|
|
|
|
{
|
|
|
|
proc_dointvec(table, write, file, buffer, length, ppos);
|
2007-05-07 05:49:30 +08:00
|
|
|
if (write)
|
|
|
|
setup_per_zone_pages_min();
|
2005-04-17 06:20:36 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2006-07-03 15:24:13 +08:00
|
|
|
#ifdef CONFIG_NUMA
/*
 * sysctl handler for the min_unmapped ratio: after a successful
 * proc_dointvec_minmax() every zone's min_unmapped_pages is recomputed as
 * sysctl_min_unmapped_ratio percent of its present_pages.
 */
int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
	struct zone *zone;
	int rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);

	if (rc != 0)
		return rc;

	for_each_zone(zone) {
		zone->min_unmapped_pages =
			zone->present_pages * sysctl_min_unmapped_ratio / 100;
	}

	return 0;
}

/*
 * sysctl handler for the min_slab ratio: after a successful
 * proc_dointvec_minmax() every zone's min_slab_pages is recomputed as
 * sysctl_min_slab_ratio percent of its present_pages.
 */
int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
	struct zone *zone;
	int rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);

	if (rc != 0)
		return rc;

	for_each_zone(zone) {
		zone->min_slab_pages =
			zone->present_pages * sysctl_min_slab_ratio / 100;
	}

	return 0;
}
#endif
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* lowmem_reserve_ratio_sysctl_handler - just a wrapper around
|
|
|
|
* proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
|
|
|
|
* whenever sysctl_lowmem_reserve_ratio changes.
|
|
|
|
*
|
|
|
|
* The reserve ratio obviously has absolutely no relation with the
|
|
|
|
* pages_min watermarks. The lowmem reserve ratio can only make sense
|
|
|
|
* if in function of the boot time zone sizes.
|
|
|
|
*/
|
|
|
|
int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
|
|
|
|
struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
|
|
|
|
{
|
|
|
|
proc_dointvec_minmax(table, write, file, buffer, length, ppos);
|
|
|
|
setup_per_zone_lowmem_reserve();
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2006-01-08 17:00:40 +08:00
|
|
|
/*
|
|
|
|
* percpu_pagelist_fraction - changes the pcp->high for each zone on each
|
|
|
|
* cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
|
|
|
|
* can have before it gets flushed back to buddy allocator.
|
|
|
|
*/
|
|
|
|
|
|
|
|
int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
|
|
|
|
struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
|
|
|
|
{
|
|
|
|
struct zone *zone;
|
|
|
|
unsigned int cpu;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
|
|
|
|
if (!write || (ret == -EINVAL))
|
|
|
|
return ret;
|
|
|
|
for_each_zone(zone) {
|
|
|
|
for_each_online_cpu(cpu) {
|
|
|
|
unsigned long high;
|
|
|
|
high = zone->present_pages / percpu_pagelist_fraction;
|
|
|
|
setup_pagelist_highmark(zone_pcp(zone, cpu), high);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2006-08-24 18:08:07 +08:00
|
|
|
/* Consulted by alloc_large_system_hash() to pick __vmalloc() for late tables */
int hashdist = HASHDIST_DEFAULT;

#ifdef CONFIG_NUMA
/*
 * "hashdist=" boot parameter: overrides hashdist with the given number.
 * Returns 1 when a value was supplied, 0 when there was no argument.
 */
static int __init set_hashdist(char *str)
{
	if (str) {
		hashdist = simple_strtoul(str, &str, 0);
		return 1;
	}

	return 0;
}
__setup("hashdist=", set_hashdist);
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* allocate a large system hash table from bootmem
|
|
|
|
* - it is assumed that the hash table must contain an exact power-of-2
|
|
|
|
* quantity of entries
|
|
|
|
* - limit is the number of hash buckets, not the total allocation size
|
|
|
|
*/
|
|
|
|
void *__init alloc_large_system_hash(const char *tablename,
|
|
|
|
unsigned long bucketsize,
|
|
|
|
unsigned long numentries,
|
|
|
|
int scale,
|
|
|
|
int flags,
|
|
|
|
unsigned int *_hash_shift,
|
|
|
|
unsigned int *_hash_mask,
|
|
|
|
unsigned long limit)
|
|
|
|
{
|
|
|
|
unsigned long long max = limit;
|
|
|
|
unsigned long log2qty, size;
|
|
|
|
void *table = NULL;
|
|
|
|
|
|
|
|
/* allow the kernel cmdline to have a say */
|
|
|
|
if (!numentries) {
|
|
|
|
/* round applicable memory size up to nearest megabyte */
|
2006-12-07 12:37:33 +08:00
|
|
|
numentries = nr_kernel_pages;
|
2005-04-17 06:20:36 +08:00
|
|
|
numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
|
|
|
|
numentries >>= 20 - PAGE_SHIFT;
|
|
|
|
numentries <<= 20 - PAGE_SHIFT;
|
|
|
|
|
|
|
|
/* limit to 1 bucket per 2^scale bytes of low memory */
|
|
|
|
if (scale > PAGE_SHIFT)
|
|
|
|
numentries >>= (scale - PAGE_SHIFT);
|
|
|
|
else
|
|
|
|
numentries <<= (PAGE_SHIFT - scale);
|
2007-01-06 08:36:30 +08:00
|
|
|
|
|
|
|
/* Make sure we've got at least a 0-order allocation.. */
|
|
|
|
if (unlikely((numentries * bucketsize) < PAGE_SIZE))
|
|
|
|
numentries = PAGE_SIZE / bucketsize;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2006-03-25 19:08:02 +08:00
|
|
|
numentries = roundup_pow_of_two(numentries);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* limit allocation size to 1/16 total memory by default */
|
|
|
|
if (max == 0) {
|
|
|
|
max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
|
|
|
|
do_div(max, bucketsize);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (numentries > max)
|
|
|
|
numentries = max;
|
|
|
|
|
2006-12-08 18:37:49 +08:00
|
|
|
log2qty = ilog2(numentries);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
do {
|
|
|
|
size = bucketsize << log2qty;
|
|
|
|
if (flags & HASH_EARLY)
|
|
|
|
table = alloc_bootmem(size);
|
|
|
|
else if (hashdist)
|
|
|
|
table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
|
|
|
|
else {
|
2008-04-28 17:13:35 +08:00
|
|
|
unsigned long order = get_order(size);
|
2005-04-17 06:20:36 +08:00
|
|
|
table = (void*) __get_free_pages(GFP_ATOMIC, order);
|
2007-07-16 14:38:05 +08:00
|
|
|
/*
|
|
|
|
* If bucketsize is not a power-of-two, we may free
|
|
|
|
* some pages at the end of hash table.
|
|
|
|
*/
|
|
|
|
if (table) {
|
|
|
|
unsigned long alloc_end = (unsigned long)table +
|
|
|
|
(PAGE_SIZE << order);
|
|
|
|
unsigned long used = (unsigned long)table +
|
|
|
|
PAGE_ALIGN(size);
|
|
|
|
split_page(virt_to_page(table), order);
|
|
|
|
while (used < alloc_end) {
|
|
|
|
free_page(used);
|
|
|
|
used += PAGE_SIZE;
|
|
|
|
}
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
} while (!table && size > PAGE_SIZE && --log2qty);
|
|
|
|
|
|
|
|
if (!table)
|
|
|
|
panic("Failed to allocate %s hash table\n", tablename);
|
|
|
|
|
2007-07-16 14:38:23 +08:00
|
|
|
printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n",
|
2005-04-17 06:20:36 +08:00
|
|
|
tablename,
|
|
|
|
(1U << log2qty),
|
2006-12-08 18:37:49 +08:00
|
|
|
ilog2(size) - PAGE_SHIFT,
|
2005-04-17 06:20:36 +08:00
|
|
|
size);
|
|
|
|
|
|
|
|
if (_hash_shift)
|
|
|
|
*_hash_shift = log2qty;
|
|
|
|
if (_hash_mask)
|
|
|
|
*_hash_mask = (1 << log2qty) - 1;
|
|
|
|
|
|
|
|
return table;
|
|
|
|
}
|
2006-03-27 17:15:25 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
/*
 * Out-of-line versions of the pfn <-> struct page conversions, built only
 * when CONFIG_OUT_OF_LINE_PFN_TO_PAGE is defined.  Each one simply forwards
 * to the corresponding __pfn_to_page()/__page_to_pfn() helper and is made
 * available through EXPORT_SYMBOL().
 */

/* Convert the page frame number @pfn to its struct page pointer. */
struct page *pfn_to_page(unsigned long pfn)
{
	return __pfn_to_page(pfn);
}
EXPORT_SYMBOL(pfn_to_page);

/* Convert the struct page pointer @page to its page frame number. */
unsigned long page_to_pfn(struct page *page)
{
	return __page_to_pfn(page);
}
EXPORT_SYMBOL(page_to_pfn);
#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
|
2006-10-20 14:29:05 +08:00
|
|
|
|
Add a bitmap that is used to track flags affecting a block of pages
Here is the latest revision of the anti-fragmentation patches. Of particular
note in this version is special treatment of high-order atomic allocations.
Care is taken to group them together and avoid grouping pages of other types
near them. Artificial tests imply that it works. I'm trying to get the
hardware together that would allow setting up of a "real" test. If anyone
already has a setup and test that can trigger the atomic-allocation problem,
I'd appreciate a test of these patches and a report. The second major change
is that these patches will apply cleanly with patches that implement
anti-fragmentation through zones.
kernbench shows effectively no performance difference varying between -0.2%
and +2% on a variety of test machines. Success rates for huge page allocation
are dramatically increased. For example, on a ppc64 machine, the vanilla
kernel was only able to allocate 1% of memory as a hugepage and this was due
to a single hugepage reserved as min_free_kbytes. With these patches applied,
17% was allocatable as superpages. With reclaim-related fixes from Andy
Whitcroft, it was 40% and further reclaim-related improvements should increase
this further.
Changelog Since V28
o Group high-order atomic allocations together
o It is no longer required to set min_free_kbytes to 10% of memory. A value
of 16384 in most cases will be sufficient
o Now applied with zone-based anti-fragmentation
o Fix incorrect VM_BUG_ON within buffered_rmqueue()
o Reorder the stack so later patches do not back out work from earlier patches
o Fix bug where journal pages were being treated as movable
o Bias placement of non-movable pages to lower PFNs
o More aggressive clustering of reclaimable pages in reaction to workloads
like updatedb that flood the size of inode caches
Changelog Since V27
o Renamed anti-fragmentation to Page Clustering. Anti-fragmentation was giving
the mistaken impression that it was the 100% solution for high order
allocations. Instead, it greatly increases the chances high-order
allocations will succeed and lays the foundation for defragmentation and
memory hot-remove to work properly
o Redefine page groupings based on ability to migrate or reclaim instead of
basing on reclaimability alone
o Get rid of spurious inits
o Per-cpu lists are no longer split up per-type. Instead the per-cpu list is
searched for a page of the appropriate type
o Added more explanation commentary
o Fix up bug in pageblock code where bitmap was used before being initialised
Changelog Since V26
o Fix double init of lists in setup_pageset
Changelog Since V25
o Fix loop order of for_each_rclmtype_order so that order of loop matches args
o gfpflags_to_rclmtype uses gfp_t instead of unsigned long
o Rename get_pageblock_type() to get_page_rclmtype()
o Fix alignment problem in move_freepages()
o Add mechanism for assigning flags to blocks of pages instead of page->flags
o On fallback, do not examine the preferred list of free pages a second time
The purpose of these patches is to reduce external fragmentation by grouping
pages of related types together. When pages are migrated (or reclaimed under
memory pressure), large contiguous pages will be freed.
This patch works by categorising allocations by their ability to migrate;
Movable - The pages may be moved with the page migration mechanism. These are
generally userspace pages.
Reclaimable - These are allocations for some kernel caches that are
reclaimable or allocations that are known to be very short-lived.
Unmovable - These are pages that are allocated by the kernel that
are not trivially reclaimed. For example, the memory allocated for a
loaded module would be in this category. By default, allocations are
considered to be of this type
HighAtomic - These are high-order allocations belonging to callers that
cannot sleep or perform any IO. In practice, this is restricted to
jumbo frame allocation for network receive. It is assumed that the
allocations are short-lived
Instead of having one MAX_ORDER-sized array of free lists in struct free_area,
there is one for each type of reclaimability. Once a 2^MAX_ORDER block of
pages is split for a type of allocation, it is added to the free-lists for
that type, in effect reserving it. Hence, over time, pages of the different
types can be clustered together.
When the preferred freelists are expired, the largest possible block is taken
from an alternative list. Buddies that are split from that large block are
placed on the preferred allocation-type freelists to mitigate fragmentation.
This implementation gives best-effort for low fragmentation in all zones.
Ideally, min_free_kbytes needs to be set to a value equal to 4 * (1 <<
(MAX_ORDER-1)) pages in most cases. This would be 16384 on x86 and x86_64 for
example.
Our tests show that about 60-70% of physical memory can be allocated on a
desktop after a few days uptime. In benchmarks and stress tests, we are
finding that 80% of memory is available as contiguous blocks at the end of the
test. To compare, a standard kernel was getting < 1% of memory as large pages
on a desktop and about 8-12% of memory as large pages at the end of stress
tests.
Following this email are 12 patches that implement the page grouping feature.
The first patch introduces a mechanism for storing flags related to a whole
block of pages. Then allocations are split between movable and all other
allocations. Following that are patches to deal with per-cpu pages and make
the mechanism configurable. The next patch moves free pages between lists
when partially allocated blocks are used for pages of another migrate type.
The second last patch groups reclaimable kernel allocations such as inode
caches together. The final patch related to groupings keeps high-order atomic
allocations.
The last two patches are more concerned with control of fragmentation. The
second last patch biases placement of non-movable allocations towards the
start of memory. This is with a view of supporting memory hot-remove of DIMMs
with higher PFNs in the future. The biasing could be enforced a lot heavier
but it would cost. The last patch aggressively clusters reclaimable pages like
inode caches together.
The fragmentation reduction strategy needs to track if pages within a block
can be moved or reclaimed so that pages are freed to the appropriate list.
This patch adds a bitmap for flags affecting a whole MAX_ORDER block of
pages.
In non-SPARSEMEM configurations, the bitmap is stored in the struct zone and
allocated during initialisation. SPARSEMEM statically allocates the bitmap in
a struct mem_section so that bitmaps do not have to be resized during memory
hotadd. This wastes a small amount of memory per unused section (usually
sizeof(unsigned long)) but the complexity of dynamically allocating the memory
is quite high.
Additional credit to Andy Whitcroft who reviewed an earlier implementation
of the mechanism and suggested how to make it a *lot* cleaner.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:25:47 +08:00
|
|
|
/* Return a pointer to the bitmap storing bits affecting a block of pages */
|
|
|
|
static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
|
|
|
|
unsigned long pfn)
|
|
|
|
{
|
|
|
|
#ifdef CONFIG_SPARSEMEM
|
|
|
|
return __pfn_to_section(pfn)->pageblock_flags;
|
|
|
|
#else
|
|
|
|
return zone->pageblock_flags;
|
|
|
|
#endif /* CONFIG_SPARSEMEM */
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
|
|
|
|
{
|
|
|
|
#ifdef CONFIG_SPARSEMEM
|
|
|
|
pfn &= (PAGES_PER_SECTION-1);
|
2007-10-16 16:26:01 +08:00
|
|
|
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
|
Add a bitmap that is used to track flags affecting a block of pages
Here is the latest revision of the anti-fragmentation patches. Of particular
note in this version is special treatment of high-order atomic allocations.
Care is taken to group them together and avoid grouping pages of other types
near them. Artifical tests imply that it works. I'm trying to get the
hardware together that would allow setting up of a "real" test. If anyone
already has a setup and test that can trigger the atomic-allocation problem,
I'd appreciate a test of these patches and a report. The second major change
is that these patches will apply cleanly with patches that implement
anti-fragmentation through zones.
kernbench shows effectively no performance difference varying between -0.2%
and +2% on a variety of test machines. Success rates for huge page allocation
are dramatically increased. For example, on a ppc64 machine, the vanilla
kernel was only able to allocate 1% of memory as a hugepage and this was due
to a single hugepage reserved as min_free_kbytes. With these patches applied,
17% was allocatable as superpages. With reclaim-related fixes from Andy
Whitcroft, it was 40% and further reclaim-related improvements should increase
this further.
Changelog Since V28
o Group high-order atomic allocations together
o It is no longer required to set min_free_kbytes to 10% of memory. A value
of 16384 in most cases will be sufficient
o Now applied with zone-based anti-fragmentation
o Fix incorrect VM_BUG_ON within buffered_rmqueue()
o Reorder the stack so later patches do not back out work from earlier patches
o Fix bug were journal pages were being treated as movable
o Bias placement of non-movable pages to lower PFNs
o More agressive clustering of reclaimable pages in reactions to workloads
like updatedb that flood the size of inode caches
Changelog Since V27
o Renamed anti-fragmentation to Page Clustering. Anti-fragmentation was giving
the mistaken impression that it was the 100% solution for high order
allocations. Instead, it greatly increases the chances high-order
allocations will succeed and lays the foundation for defragmentation and
memory hot-remove to work properly
o Redefine page groupings based on ability to migrate or reclaim instead of
basing on reclaimability alone
o Get rid of spurious inits
o Per-cpu lists are no longer split up per-type. Instead the per-cpu list is
searched for a page of the appropriate type
o Added more explanation commentary
o Fix up bug in pageblock code where bitmap was used before being initalised
Changelog Since V26
o Fix double init of lists in setup_pageset
Changelog Since V25
o Fix loop order of for_each_rclmtype_order so that order of loop matches args
o gfpflags_to_rclmtype uses gfp_t instead of unsigned long
o Rename get_pageblock_type() to get_page_rclmtype()
o Fix alignment problem in move_freepages()
o Add mechanism for assigning flags to blocks of pages instead of page->flags
o On fallback, do not examine the preferred list of free pages a second time
The purpose of these patches is to reduce external fragmentation by grouping
pages of related types together. When pages are migrated (or reclaimed under
memory pressure), large contiguous pages will be freed.
This patch works by categorising allocations by their ability to migrate;
Movable - The pages may be moved with the page migration mechanism. These are
generally userspace pages.
Reclaimable - These are allocations for some kernel caches that are
reclaimable or allocations that are known to be very short-lived.
Unmovable - These are pages that are allocated by the kernel that
are not trivially reclaimed. For example, the memory allocated for a
loaded module would be in this category. By default, allocations are
considered to be of this type
HighAtomic - These are high-order allocations belonging to callers that
cannot sleep or perform any IO. In practice, this is restricted to
jumbo frame allocation for network receive. It is assumed that the
allocations are short-lived
Instead of having one MAX_ORDER-sized array of free lists in struct free_area,
there is one for each type of reclaimability. Once a 2^MAX_ORDER block of
pages is split for a type of allocation, it is added to the free-lists for
that type, in effect reserving it. Hence, over time, pages of the different
types can be clustered together.
When the preferred freelists are expired, the largest possible block is taken
from an alternative list. Buddies that are split from that large block are
placed on the preferred allocation-type freelists to mitigate fragmentation.
This implementation gives best-effort for low fragmentation in all zones.
Ideally, min_free_kbytes needs to be set to a value equal to 4 * (1 <<
(MAX_ORDER-1)) pages in most cases. This would be 16384 on x86 and x86_64 for
example.
Our tests show that about 60-70% of physical memory can be allocated on a
desktop after a few days uptime. In benchmarks and stress tests, we are
finding that 80% of memory is available as contiguous blocks at the end of the
test. To compare, a standard kernel was getting < 1% of memory as large pages
on a desktop and about 8-12% of memory as large pages at the end of stress
tests.
Following this email are 12 patches that implement thie page grouping feature.
The first patch introduces a mechanism for storing flags related to a whole
block of pages. Then allocations are split between movable and all other
allocations. Following that are patches to deal with per-cpu pages and make
the mechanism configurable. The next patch moves free pages between lists
when partially allocated blocks are used for pages of another migrate type.
The second last patch groups reclaimable kernel allocations such as inode
caches together. The final patch related to groupings keeps high-order atomic
allocations.
The last two patches are more concerned with control of fragmentation. The
second last patch biases placement of non-movable allocations towards the
start of memory. This is with a view of supporting memory hot-remove of DIMMs
with higher PFNs in the future. The biasing could be enforced a lot heavier
but it would cost. The last patch agressively clusters reclaimable pages like
inode caches together.
The fragmentation reduction strategy needs to track if pages within a block
can be moved or reclaimed so that pages are freed to the appropriate list.
This patch adds a bitmap for flags affecting a whole a MAX_ORDER block of
pages.
In non-SPARSEMEM configurations, the bitmap is stored in the struct zone and
allocated during initialisation. SPARSEMEM statically allocates the bitmap in
a struct mem_section so that bitmaps do not have to be resized during memory
hotadd. This wastes a small amount of memory per unused section (usually
sizeof(unsigned long)) but the complexity of dynamically allocating the memory
is quite high.
Additional credit to Andy Whitcroft who reviewed up an earlier implementation
of the mechanism an suggested how to make it a *lot* cleaner.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:25:47 +08:00
|
|
|
#else
|
|
|
|
pfn = pfn - zone->zone_start_pfn;
|
2007-10-16 16:26:01 +08:00
|
|
|
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
|
Add a bitmap that is used to track flags affecting a block of pages
Here is the latest revision of the anti-fragmentation patches. Of particular
note in this version is special treatment of high-order atomic allocations.
Care is taken to group them together and avoid grouping pages of other types
near them. Artifical tests imply that it works. I'm trying to get the
hardware together that would allow setting up of a "real" test. If anyone
already has a setup and test that can trigger the atomic-allocation problem,
I'd appreciate a test of these patches and a report. The second major change
is that these patches will apply cleanly with patches that implement
anti-fragmentation through zones.
kernbench shows effectively no performance difference varying between -0.2%
and +2% on a variety of test machines. Success rates for huge page allocation
are dramatically increased. For example, on a ppc64 machine, the vanilla
kernel was only able to allocate 1% of memory as a hugepage and this was due
to a single hugepage reserved as min_free_kbytes. With these patches applied,
17% was allocatable as superpages. With reclaim-related fixes from Andy
Whitcroft, it was 40% and further reclaim-related improvements should increase
this further.
Changelog Since V28
o Group high-order atomic allocations together
o It is no longer required to set min_free_kbytes to 10% of memory. A value
of 16384 in most cases will be sufficient
o Now applied with zone-based anti-fragmentation
o Fix incorrect VM_BUG_ON within buffered_rmqueue()
o Reorder the stack so later patches do not back out work from earlier patches
o Fix bug were journal pages were being treated as movable
o Bias placement of non-movable pages to lower PFNs
o More agressive clustering of reclaimable pages in reactions to workloads
like updatedb that flood the size of inode caches
Changelog Since V27
o Renamed anti-fragmentation to Page Clustering. Anti-fragmentation was giving
the mistaken impression that it was the 100% solution for high order
allocations. Instead, it greatly increases the chances high-order
allocations will succeed and lays the foundation for defragmentation and
memory hot-remove to work properly
o Redefine page groupings based on ability to migrate or reclaim instead of
basing on reclaimability alone
o Get rid of spurious inits
o Per-cpu lists are no longer split up per-type. Instead the per-cpu list is
searched for a page of the appropriate type
o Added more explanation commentary
o Fix up bug in pageblock code where bitmap was used before being initalised
Changelog Since V26
o Fix double init of lists in setup_pageset
Changelog Since V25
o Fix loop order of for_each_rclmtype_order so that order of loop matches args
o gfpflags_to_rclmtype uses gfp_t instead of unsigned long
o Rename get_pageblock_type() to get_page_rclmtype()
o Fix alignment problem in move_freepages()
o Add mechanism for assigning flags to blocks of pages instead of page->flags
o On fallback, do not examine the preferred list of free pages a second time
The purpose of these patches is to reduce external fragmentation by grouping
pages of related types together. When pages are migrated (or reclaimed under
memory pressure), large contiguous pages will be freed.
This patch works by categorising allocations by their ability to migrate;
Movable - The pages may be moved with the page migration mechanism. These are
generally userspace pages.
Reclaimable - These are allocations for some kernel caches that are
reclaimable or allocations that are known to be very short-lived.
Unmovable - These are pages that are allocated by the kernel that
are not trivially reclaimed. For example, the memory allocated for a
loaded module would be in this category. By default, allocations are
considered to be of this type
HighAtomic - These are high-order allocations belonging to callers that
cannot sleep or perform any IO. In practice, this is restricted to
jumbo frame allocation for network receive. It is assumed that the
allocations are short-lived
Instead of having one MAX_ORDER-sized array of free lists in struct free_area,
there is one for each type of reclaimability. Once a 2^MAX_ORDER block of
pages is split for a type of allocation, it is added to the free-lists for
that type, in effect reserving it. Hence, over time, pages of the different
types can be clustered together.
When the preferred freelists are expired, the largest possible block is taken
from an alternative list. Buddies that are split from that large block are
placed on the preferred allocation-type freelists to mitigate fragmentation.
This implementation gives best-effort for low fragmentation in all zones.
Ideally, min_free_kbytes needs to be set to a value equal to 4 * (1 <<
(MAX_ORDER-1)) pages in most cases. This would be 16384 on x86 and x86_64 for
example.
Our tests show that about 60-70% of physical memory can be allocated on a
desktop after a few days uptime. In benchmarks and stress tests, we are
finding that 80% of memory is available as contiguous blocks at the end of the
test. To compare, a standard kernel was getting < 1% of memory as large pages
on a desktop and about 8-12% of memory as large pages at the end of stress
tests.
Following this email are 12 patches that implement thie page grouping feature.
The first patch introduces a mechanism for storing flags related to a whole
block of pages. Then allocations are split between movable and all other
allocations. Following that are patches to deal with per-cpu pages and make
the mechanism configurable. The next patch moves free pages between lists
when partially allocated blocks are used for pages of another migrate type.
The second last patch groups reclaimable kernel allocations such as inode
caches together. The final patch related to groupings keeps high-order atomic
allocations.
The last two patches are more concerned with control of fragmentation. The
second last patch biases placement of non-movable allocations towards the
start of memory. This is with a view of supporting memory hot-remove of DIMMs
with higher PFNs in the future. The biasing could be enforced a lot heavier
but it would cost. The last patch agressively clusters reclaimable pages like
inode caches together.
The fragmentation reduction strategy needs to track if pages within a block
can be moved or reclaimed so that pages are freed to the appropriate list.
This patch adds a bitmap for flags affecting a whole a MAX_ORDER block of
pages.
In non-SPARSEMEM configurations, the bitmap is stored in the struct zone and
allocated during initialisation. SPARSEMEM statically allocates the bitmap in
a struct mem_section so that bitmaps do not have to be resized during memory
hotadd. This wastes a small amount of memory per unused section (usually
sizeof(unsigned long)) but the complexity of dynamically allocating the memory
is quite high.
Additional credit to Andy Whitcroft who reviewed up an earlier implementation
of the mechanism an suggested how to make it a *lot* cleaner.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:25:47 +08:00
|
|
|
#endif /* CONFIG_SPARSEMEM */
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2007-10-16 16:26:01 +08:00
|
|
|
* get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
|
Add a bitmap that is used to track flags affecting a block of pages
Here is the latest revision of the anti-fragmentation patches. Of particular
note in this version is special treatment of high-order atomic allocations.
Care is taken to group them together and avoid grouping pages of other types
near them. Artificial tests imply that it works. I'm trying to get the
hardware together that would allow setting up of a "real" test. If anyone
already has a setup and test that can trigger the atomic-allocation problem,
I'd appreciate a test of these patches and a report. The second major change
is that these patches will apply cleanly with patches that implement
anti-fragmentation through zones.
kernbench shows effectively no performance difference varying between -0.2%
and +2% on a variety of test machines. Success rates for huge page allocation
are dramatically increased. For example, on a ppc64 machine, the vanilla
kernel was only able to allocate 1% of memory as a hugepage and this was due
to a single hugepage reserved as min_free_kbytes. With these patches applied,
17% was allocatable as superpages. With reclaim-related fixes from Andy
Whitcroft, it was 40% and further reclaim-related improvements should increase
this further.
Changelog Since V28
o Group high-order atomic allocations together
o It is no longer required to set min_free_kbytes to 10% of memory. A value
of 16384 in most cases will be sufficient
o Now applied with zone-based anti-fragmentation
o Fix incorrect VM_BUG_ON within buffered_rmqueue()
o Reorder the stack so later patches do not back out work from earlier patches
o Fix bug where journal pages were being treated as movable
o Bias placement of non-movable pages to lower PFNs
o More aggressive clustering of reclaimable pages in reaction to workloads
like updatedb that flood the size of inode caches
Changelog Since V27
o Renamed anti-fragmentation to Page Clustering. Anti-fragmentation was giving
the mistaken impression that it was the 100% solution for high order
allocations. Instead, it greatly increases the chances high-order
allocations will succeed and lays the foundation for defragmentation and
memory hot-remove to work properly
o Redefine page groupings based on ability to migrate or reclaim instead of
basing on reclaimability alone
o Get rid of spurious inits
o Per-cpu lists are no longer split up per-type. Instead the per-cpu list is
searched for a page of the appropriate type
o Added more explanation commentary
o Fix up bug in pageblock code where bitmap was used before being initialised
Changelog Since V26
o Fix double init of lists in setup_pageset
Changelog Since V25
o Fix loop order of for_each_rclmtype_order so that order of loop matches args
o gfpflags_to_rclmtype uses gfp_t instead of unsigned long
o Rename get_pageblock_type() to get_page_rclmtype()
o Fix alignment problem in move_freepages()
o Add mechanism for assigning flags to blocks of pages instead of page->flags
o On fallback, do not examine the preferred list of free pages a second time
The purpose of these patches is to reduce external fragmentation by grouping
pages of related types together. When pages are migrated (or reclaimed under
memory pressure), large contiguous pages will be freed.
This patch works by categorising allocations by their ability to migrate;
Movable - The pages may be moved with the page migration mechanism. These are
generally userspace pages.
Reclaimable - These are allocations for some kernel caches that are
reclaimable or allocations that are known to be very short-lived.
Unmovable - These are pages that are allocated by the kernel that
are not trivially reclaimed. For example, the memory allocated for a
loaded module would be in this category. By default, allocations are
considered to be of this type
HighAtomic - These are high-order allocations belonging to callers that
cannot sleep or perform any IO. In practice, this is restricted to
jumbo frame allocation for network receive. It is assumed that the
allocations are short-lived
Instead of having one MAX_ORDER-sized array of free lists in struct free_area,
there is one for each type of reclaimability. Once a 2^MAX_ORDER block of
pages is split for a type of allocation, it is added to the free-lists for
that type, in effect reserving it. Hence, over time, pages of the different
types can be clustered together.
When the preferred freelists are expired, the largest possible block is taken
from an alternative list. Buddies that are split from that large block are
placed on the preferred allocation-type freelists to mitigate fragmentation.
This implementation gives best-effort for low fragmentation in all zones.
Ideally, min_free_kbytes needs to be set to a value equal to 4 * (1 <<
(MAX_ORDER-1)) pages in most cases. This would be 16384 on x86 and x86_64 for
example.
Our tests show that about 60-70% of physical memory can be allocated on a
desktop after a few days uptime. In benchmarks and stress tests, we are
finding that 80% of memory is available as contiguous blocks at the end of the
test. To compare, a standard kernel was getting < 1% of memory as large pages
on a desktop and about 8-12% of memory as large pages at the end of stress
tests.
Following this email are 12 patches that implement the page grouping feature.
The first patch introduces a mechanism for storing flags related to a whole
block of pages. Then allocations are split between movable and all other
allocations. Following that are patches to deal with per-cpu pages and make
the mechanism configurable. The next patch moves free pages between lists
when partially allocated blocks are used for pages of another migrate type.
The second last patch groups reclaimable kernel allocations such as inode
caches together. The final patch related to groupings keeps high-order atomic
allocations.
The last two patches are more concerned with control of fragmentation. The
second last patch biases placement of non-movable allocations towards the
start of memory. This is with a view of supporting memory hot-remove of DIMMs
with higher PFNs in the future. The biasing could be enforced a lot heavier
but it would cost. The last patch aggressively clusters reclaimable pages like
inode caches together.
The fragmentation reduction strategy needs to track if pages within a block
can be moved or reclaimed so that pages are freed to the appropriate list.
This patch adds a bitmap for flags affecting a whole MAX_ORDER block of
pages.
In non-SPARSEMEM configurations, the bitmap is stored in the struct zone and
allocated during initialisation. SPARSEMEM statically allocates the bitmap in
a struct mem_section so that bitmaps do not have to be resized during memory
hotadd. This wastes a small amount of memory per unused section (usually
sizeof(unsigned long)) but the complexity of dynamically allocating the memory
is quite high.
Additional credit to Andy Whitcroft who reviewed an earlier implementation
of the mechanism and suggested how to make it a *lot* cleaner.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:25:47 +08:00
|
|
|
* @page: The page within the block of interest
|
|
|
|
* @start_bitidx: The first bit of interest to retrieve
|
|
|
|
* @end_bitidx: The last bit of interest
|
|
|
|
* returns pageblock_bits flags
|
|
|
|
*/
|
|
|
|
unsigned long get_pageblock_flags_group(struct page *page,
					int start_bitidx, int end_bitidx)
{
	struct zone *zone = page_zone(page);
	unsigned long pfn = page_to_pfn(page);
	unsigned long *bitmap = get_pageblock_bitmap(zone, pfn);
	/* bitidx is the base offset into bitmap for this page's block */
	unsigned long bitidx = pfn_to_bitidx(zone, pfn);
	unsigned long flags = 0;
	unsigned long value;
	int bit;

	/*
	 * Pack bits [start_bitidx, end_bitidx] (offsets relative to bitidx)
	 * into the low bits of the result: the bit at start_bitidx becomes
	 * bit 0, the next one bit 1, and so on.
	 */
	for (bit = start_bitidx, value = 1; bit <= end_bitidx;
	     bit++, value <<= 1) {
		if (test_bit(bitidx + bit, bitmap))
			flags |= value;
	}

	return flags;
}
|
|
|
|
|
|
|
|
/**
|
2007-10-16 16:26:01 +08:00
|
|
|
* set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
|
Add a bitmap that is used to track flags affecting a block of pages
Here is the latest revision of the anti-fragmentation patches. Of particular
note in this version is special treatment of high-order atomic allocations.
Care is taken to group them together and avoid grouping pages of other types
near them. Artificial tests imply that it works. I'm trying to get the
hardware together that would allow setting up of a "real" test. If anyone
already has a setup and test that can trigger the atomic-allocation problem,
I'd appreciate a test of these patches and a report. The second major change
is that these patches will apply cleanly with patches that implement
anti-fragmentation through zones.
kernbench shows effectively no performance difference varying between -0.2%
and +2% on a variety of test machines. Success rates for huge page allocation
are dramatically increased. For example, on a ppc64 machine, the vanilla
kernel was only able to allocate 1% of memory as a hugepage and this was due
to a single hugepage reserved as min_free_kbytes. With these patches applied,
17% was allocatable as superpages. With reclaim-related fixes from Andy
Whitcroft, it was 40% and further reclaim-related improvements should increase
this further.
Changelog Since V28
o Group high-order atomic allocations together
o It is no longer required to set min_free_kbytes to 10% of memory. A value
of 16384 in most cases will be sufficient
o Now applied with zone-based anti-fragmentation
o Fix incorrect VM_BUG_ON within buffered_rmqueue()
o Reorder the stack so later patches do not back out work from earlier patches
o Fix bug where journal pages were being treated as movable
o Bias placement of non-movable pages to lower PFNs
o More aggressive clustering of reclaimable pages in reaction to workloads
like updatedb that flood the size of inode caches
Changelog Since V27
o Renamed anti-fragmentation to Page Clustering. Anti-fragmentation was giving
the mistaken impression that it was the 100% solution for high order
allocations. Instead, it greatly increases the chances high-order
allocations will succeed and lays the foundation for defragmentation and
memory hot-remove to work properly
o Redefine page groupings based on ability to migrate or reclaim instead of
basing on reclaimability alone
o Get rid of spurious inits
o Per-cpu lists are no longer split up per-type. Instead the per-cpu list is
searched for a page of the appropriate type
o Added more explanation commentary
o Fix up bug in pageblock code where bitmap was used before being initialised
Changelog Since V26
o Fix double init of lists in setup_pageset
Changelog Since V25
o Fix loop order of for_each_rclmtype_order so that order of loop matches args
o gfpflags_to_rclmtype uses gfp_t instead of unsigned long
o Rename get_pageblock_type() to get_page_rclmtype()
o Fix alignment problem in move_freepages()
o Add mechanism for assigning flags to blocks of pages instead of page->flags
o On fallback, do not examine the preferred list of free pages a second time
The purpose of these patches is to reduce external fragmentation by grouping
pages of related types together. When pages are migrated (or reclaimed under
memory pressure), large contiguous pages will be freed.
This patch works by categorising allocations by their ability to migrate;
Movable - The pages may be moved with the page migration mechanism. These are
generally userspace pages.
Reclaimable - These are allocations for some kernel caches that are
reclaimable or allocations that are known to be very short-lived.
Unmovable - These are pages that are allocated by the kernel that
are not trivially reclaimed. For example, the memory allocated for a
loaded module would be in this category. By default, allocations are
considered to be of this type
HighAtomic - These are high-order allocations belonging to callers that
cannot sleep or perform any IO. In practice, this is restricted to
jumbo frame allocation for network receive. It is assumed that the
allocations are short-lived
Instead of having one MAX_ORDER-sized array of free lists in struct free_area,
there is one for each type of reclaimability. Once a 2^MAX_ORDER block of
pages is split for a type of allocation, it is added to the free-lists for
that type, in effect reserving it. Hence, over time, pages of the different
types can be clustered together.
When the preferred freelists are expired, the largest possible block is taken
from an alternative list. Buddies that are split from that large block are
placed on the preferred allocation-type freelists to mitigate fragmentation.
This implementation gives best-effort for low fragmentation in all zones.
Ideally, min_free_kbytes needs to be set to a value equal to 4 * (1 <<
(MAX_ORDER-1)) pages in most cases. This would be 16384 on x86 and x86_64 for
example.
Our tests show that about 60-70% of physical memory can be allocated on a
desktop after a few days uptime. In benchmarks and stress tests, we are
finding that 80% of memory is available as contiguous blocks at the end of the
test. To compare, a standard kernel was getting < 1% of memory as large pages
on a desktop and about 8-12% of memory as large pages at the end of stress
tests.
Following this email are 12 patches that implement this page grouping feature.
The first patch introduces a mechanism for storing flags related to a whole
block of pages. Then allocations are split between movable and all other
allocations. Following that are patches to deal with per-cpu pages and make
the mechanism configurable. The next patch moves free pages between lists
when partially allocated blocks are used for pages of another migrate type.
The second last patch groups reclaimable kernel allocations such as inode
caches together. The final patch related to groupings keeps high-order atomic
allocations.
The last two patches are more concerned with control of fragmentation. The
second last patch biases placement of non-movable allocations towards the
start of memory. This is with a view of supporting memory hot-remove of DIMMs
with higher PFNs in the future. The biasing could be enforced a lot heavier
but it would cost. The last patch aggressively clusters reclaimable pages like
inode caches together.
The fragmentation reduction strategy needs to track if pages within a block
can be moved or reclaimed so that pages are freed to the appropriate list.
This patch adds a bitmap for flags affecting a whole MAX_ORDER block of
pages.
In non-SPARSEMEM configurations, the bitmap is stored in the struct zone and
allocated during initialisation. SPARSEMEM statically allocates the bitmap in
a struct mem_section so that bitmaps do not have to be resized during memory
hotadd. This wastes a small amount of memory per unused section (usually
sizeof(unsigned long)) but the complexity of dynamically allocating the memory
is quite high.
Additional credit to Andy Whitcroft who reviewed an earlier implementation
of the mechanism and suggested how to make it a *lot* cleaner.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:25:47 +08:00
|
|
|
* @page: The page within the block of interest
|
|
|
|
* @start_bitidx: The first bit of interest
|
|
|
|
* @end_bitidx: The last bit of interest
|
|
|
|
* @flags: The flags to set
|
|
|
|
*/
|
|
|
|
void set_pageblock_flags_group(struct page *page, unsigned long flags,
|
|
|
|
int start_bitidx, int end_bitidx)
|
|
|
|
{
|
|
|
|
struct zone *zone;
|
|
|
|
unsigned long *bitmap;
|
|
|
|
unsigned long pfn, bitidx;
|
|
|
|
unsigned long value = 1;
|
|
|
|
|
|
|
|
zone = page_zone(page);
|
|
|
|
pfn = page_to_pfn(page);
|
|
|
|
bitmap = get_pageblock_bitmap(zone, pfn);
|
|
|
|
bitidx = pfn_to_bitidx(zone, pfn);
|
2008-04-29 15:58:21 +08:00
|
|
|
VM_BUG_ON(pfn < zone->zone_start_pfn);
|
|
|
|
VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
|
Add a bitmap that is used to track flags affecting a block of pages
Here is the latest revision of the anti-fragmentation patches. Of particular
note in this version is special treatment of high-order atomic allocations.
Care is taken to group them together and avoid grouping pages of other types
near them. Artifical tests imply that it works. I'm trying to get the
hardware together that would allow setting up of a "real" test. If anyone
already has a setup and test that can trigger the atomic-allocation problem,
I'd appreciate a test of these patches and a report. The second major change
is that these patches will apply cleanly with patches that implement
anti-fragmentation through zones.
kernbench shows effectively no performance difference varying between -0.2%
and +2% on a variety of test machines. Success rates for huge page allocation
are dramatically increased. For example, on a ppc64 machine, the vanilla
kernel was only able to allocate 1% of memory as a hugepage and this was due
to a single hugepage reserved as min_free_kbytes. With these patches applied,
17% was allocatable as superpages. With reclaim-related fixes from Andy
Whitcroft, it was 40% and further reclaim-related improvements should increase
this further.
Changelog Since V28
o Group high-order atomic allocations together
o It is no longer required to set min_free_kbytes to 10% of memory. A value
of 16384 in most cases will be sufficient
o Now applied with zone-based anti-fragmentation
o Fix incorrect VM_BUG_ON within buffered_rmqueue()
o Reorder the stack so later patches do not back out work from earlier patches
o Fix bug were journal pages were being treated as movable
o Bias placement of non-movable pages to lower PFNs
o More agressive clustering of reclaimable pages in reactions to workloads
like updatedb that flood the size of inode caches
Changelog Since V27
o Renamed anti-fragmentation to Page Clustering. Anti-fragmentation was giving
the mistaken impression that it was the 100% solution for high order
allocations. Instead, it greatly increases the chances high-order
allocations will succeed and lays the foundation for defragmentation and
memory hot-remove to work properly
o Redefine page groupings based on ability to migrate or reclaim instead of
basing on reclaimability alone
o Get rid of spurious inits
o Per-cpu lists are no longer split up per-type. Instead the per-cpu list is
searched for a page of the appropriate type
o Added more explanation commentary
o Fix up bug in pageblock code where bitmap was used before being initalised
Changelog Since V26
o Fix double init of lists in setup_pageset
Changelog Since V25
o Fix loop order of for_each_rclmtype_order so that order of loop matches args
o gfpflags_to_rclmtype uses gfp_t instead of unsigned long
o Rename get_pageblock_type() to get_page_rclmtype()
o Fix alignment problem in move_freepages()
o Add mechanism for assigning flags to blocks of pages instead of page->flags
o On fallback, do not examine the preferred list of free pages a second time
The purpose of these patches is to reduce external fragmentation by grouping
pages of related types together. When pages are migrated (or reclaimed under
memory pressure), large contiguous pages will be freed.
This patch works by categorising allocations by their ability to migrate;
Movable - The pages may be moved with the page migration mechanism. These are
generally userspace pages.
Reclaimable - These are allocations for some kernel caches that are
reclaimable or allocations that are known to be very short-lived.
Unmovable - These are pages that are allocated by the kernel that
are not trivially reclaimed. For example, the memory allocated for a
loaded module would be in this category. By default, allocations are
considered to be of this type
HighAtomic - These are high-order allocations belonging to callers that
cannot sleep or perform any IO. In practice, this is restricted to
jumbo frame allocation for network receive. It is assumed that the
allocations are short-lived
Instead of having one MAX_ORDER-sized array of free lists in struct free_area,
there is one for each type of reclaimability. Once a 2^MAX_ORDER block of
pages is split for a type of allocation, it is added to the free-lists for
that type, in effect reserving it. Hence, over time, pages of the different
types can be clustered together.
When the preferred freelists are expired, the largest possible block is taken
from an alternative list. Buddies that are split from that large block are
placed on the preferred allocation-type freelists to mitigate fragmentation.
This implementation gives best-effort for low fragmentation in all zones.
Ideally, min_free_kbytes needs to be set to a value equal to 4 * (1 <<
(MAX_ORDER-1)) pages in most cases. This would be 16384 on x86 and x86_64 for
example.
Our tests show that about 60-70% of physical memory can be allocated on a
desktop after a few days uptime. In benchmarks and stress tests, we are
finding that 80% of memory is available as contiguous blocks at the end of the
test. To compare, a standard kernel was getting < 1% of memory as large pages
on a desktop and about 8-12% of memory as large pages at the end of stress
tests.
Following this email are 12 patches that implement thie page grouping feature.
The first patch introduces a mechanism for storing flags related to a whole
block of pages. Then allocations are split between movable and all other
allocations. Following that are patches to deal with per-cpu pages and make
the mechanism configurable. The next patch moves free pages between lists
when partially allocated blocks are used for pages of another migrate type.
The second last patch groups reclaimable kernel allocations such as inode
caches together. The final patch related to groupings keeps high-order atomic
allocations.
The last two patches are more concerned with control of fragmentation. The
second last patch biases placement of non-movable allocations towards the
start of memory. This is with a view of supporting memory hot-remove of DIMMs
with higher PFNs in the future. The biasing could be enforced a lot heavier
but it would cost. The last patch agressively clusters reclaimable pages like
inode caches together.
The fragmentation reduction strategy needs to track if pages within a block
can be moved or reclaimed so that pages are freed to the appropriate list.
This patch adds a bitmap for flags affecting a whole a MAX_ORDER block of
pages.
In non-SPARSEMEM configurations, the bitmap is stored in the struct zone and
allocated during initialisation. SPARSEMEM statically allocates the bitmap in
a struct mem_section so that bitmaps do not have to be resized during memory
hotadd. This wastes a small amount of memory per unused section (usually
sizeof(unsigned long)) but the complexity of dynamically allocating the memory
is quite high.
Additional credit to Andy Whitcroft who reviewed up an earlier implementation
of the mechanism an suggested how to make it a *lot* cleaner.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:25:47 +08:00
|
|
|
|
|
|
|
for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
|
|
|
|
if (flags & value)
|
|
|
|
__set_bit(bitidx + start_bitidx, bitmap);
|
|
|
|
else
|
|
|
|
__clear_bit(bitidx + start_bitidx, bitmap);
|
|
|
|
}
|
2007-10-16 16:26:11 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* This is designed as sub function...plz see page_isolation.c also.
|
|
|
|
* set/clear page block's type to be ISOLATE.
|
|
|
|
* page allocater never alloc memory from ISOLATE block.
|
|
|
|
*/
|
|
|
|
|
|
|
|
int set_migratetype_isolate(struct page *page)
|
|
|
|
{
|
|
|
|
struct zone *zone;
|
|
|
|
unsigned long flags;
|
|
|
|
int ret = -EBUSY;
|
|
|
|
|
|
|
|
zone = page_zone(page);
|
|
|
|
spin_lock_irqsave(&zone->lock, flags);
|
|
|
|
/*
|
|
|
|
* In future, more migrate types will be able to be isolation target.
|
|
|
|
*/
|
|
|
|
if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
|
|
|
|
goto out;
|
|
|
|
set_pageblock_migratetype(page, MIGRATE_ISOLATE);
|
|
|
|
move_freepages_block(zone, page, MIGRATE_ISOLATE);
|
|
|
|
ret = 0;
|
|
|
|
out:
|
|
|
|
spin_unlock_irqrestore(&zone->lock, flags);
|
|
|
|
if (!ret)
|
2008-02-05 14:29:11 +08:00
|
|
|
drain_all_pages();
|
2007-10-16 16:26:11 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
void unset_migratetype_isolate(struct page *page)
|
|
|
|
{
|
|
|
|
struct zone *zone;
|
|
|
|
unsigned long flags;
|
|
|
|
zone = page_zone(page);
|
|
|
|
spin_lock_irqsave(&zone->lock, flags);
|
|
|
|
if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
|
|
|
|
goto out;
|
|
|
|
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
|
|
|
|
move_freepages_block(zone, page, MIGRATE_MOVABLE);
|
|
|
|
out:
|
|
|
|
spin_unlock_irqrestore(&zone->lock, flags);
|
|
|
|
}
|
2007-10-16 16:26:12 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_MEMORY_HOTREMOVE
|
|
|
|
/*
|
|
|
|
* All pages in the range must be isolated before calling this.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
|
|
|
|
{
|
|
|
|
struct page *page;
|
|
|
|
struct zone *zone;
|
|
|
|
int order, i;
|
|
|
|
unsigned long pfn;
|
|
|
|
unsigned long flags;
|
|
|
|
/* find the first valid pfn */
|
|
|
|
for (pfn = start_pfn; pfn < end_pfn; pfn++)
|
|
|
|
if (pfn_valid(pfn))
|
|
|
|
break;
|
|
|
|
if (pfn == end_pfn)
|
|
|
|
return;
|
|
|
|
zone = page_zone(pfn_to_page(pfn));
|
|
|
|
spin_lock_irqsave(&zone->lock, flags);
|
|
|
|
pfn = start_pfn;
|
|
|
|
while (pfn < end_pfn) {
|
|
|
|
if (!pfn_valid(pfn)) {
|
|
|
|
pfn++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
page = pfn_to_page(pfn);
|
|
|
|
BUG_ON(page_count(page));
|
|
|
|
BUG_ON(!PageBuddy(page));
|
|
|
|
order = page_order(page);
|
|
|
|
#ifdef CONFIG_DEBUG_VM
|
|
|
|
printk(KERN_INFO "remove from free list %lx %d %lx\n",
|
|
|
|
pfn, 1 << order, end_pfn);
|
|
|
|
#endif
|
|
|
|
list_del(&page->lru);
|
|
|
|
rmv_page_order(page);
|
|
|
|
zone->free_area[order].nr_free--;
|
|
|
|
__mod_zone_page_state(zone, NR_FREE_PAGES,
|
|
|
|
- (1UL << order));
|
|
|
|
for (i = 0; i < (1 << order); i++)
|
|
|
|
SetPageReserved((page+i));
|
|
|
|
pfn += (1 << order);
|
|
|
|
}
|
|
|
|
spin_unlock_irqrestore(&zone->lock, flags);
|
|
|
|
}
|
|
|
|
#endif
|