2005-04-17 06:20:36 +08:00
|
|
|
#ifndef __LINUX_GFP_H
|
|
|
|
#define __LINUX_GFP_H
|
|
|
|
|
|
|
|
#include <linux/mmzone.h>
|
|
|
|
#include <linux/stddef.h>
|
|
|
|
#include <linux/linkage.h>
|
2009-03-13 21:13:37 +08:00
|
|
|
#include <linux/topology.h>
|
2009-06-17 06:31:54 +08:00
|
|
|
#include <linux/mmdebug.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
struct vm_area_struct;
|
|
|
|
|
2010-10-27 05:22:04 +08:00
|
|
|
/*
 * Plain integer GFP bitmasks. Do not use these directly; they exist so the
 * typed __GFP_* definitions can be built from them. Every flag occupies
 * exactly one bit, so the values can be OR-ed together freely.
 */
#define ___GFP_DMA		0x01u
#define ___GFP_HIGHMEM		0x02u
#define ___GFP_DMA32		0x04u
#define ___GFP_MOVABLE		0x08u
#define ___GFP_WAIT		0x10u
#define ___GFP_HIGH		0x20u
#define ___GFP_IO		0x40u
#define ___GFP_FS		0x80u
#define ___GFP_COLD		0x100u
#define ___GFP_NOWARN		0x200u
#define ___GFP_REPEAT		0x400u
#define ___GFP_NOFAIL		0x800u
#define ___GFP_NORETRY		0x1000u
/* 0x2000u is not assigned to any flag in this list */
#define ___GFP_COMP		0x4000u
#define ___GFP_ZERO		0x8000u
#define ___GFP_NOMEMALLOC	0x10000u
#define ___GFP_HARDWALL		0x20000u
#define ___GFP_THISNODE		0x40000u
#define ___GFP_RECLAIMABLE	0x80000u
/* 0x100000u is not assigned to any flag in this list */
#ifdef CONFIG_KMEMCHECK
#define ___GFP_NOTRACK		0x200000u
#else
/* Without kmemcheck the flag is 0, so OR-ing it in changes nothing */
#define ___GFP_NOTRACK		0
#endif
#define ___GFP_NO_KSWAPD	0x400000u
#define ___GFP_OTHER_NODE	0x800000u
|
mm: try to distribute dirty pages fairly across zones
The maximum number of dirty pages that exist in the system at any time is
determined by a number of pages considered dirtyable and a user-configured
percentage of those, or an absolute number in bytes.
This number of dirtyable pages is the sum of memory provided by all the
zones in the system minus their lowmem reserves and high watermarks, so
that the system can retain a healthy number of free pages without having
to reclaim dirty pages.
But there is a flaw in that we have a zoned page allocator which does not
care about the global state but rather the state of individual memory
zones. And right now there is nothing that prevents one zone from filling
up with dirty pages while other zones are spared, which frequently leads
to situations where kswapd, in order to restore the watermark of free
pages, does indeed have to write pages from that zone's LRU list. This
can interfere so badly with IO from the flusher threads that major
filesystems (btrfs, xfs, ext4) mostly ignore write requests from reclaim
already, taking away the VM's only possibility to keep such a zone
balanced, aside from hoping the flushers will soon clean pages from that
zone.
Enter per-zone dirty limits. They are to a zone's dirtyable memory what
the global limit is to the global amount of dirtyable memory, and try to
make sure that no single zone receives more than its fair share of the
globally allowed dirty pages in the first place. As the number of pages
considered dirtyable excludes the zones' lowmem reserves and high
watermarks, the maximum number of dirty pages in a zone is such that the
zone can always be balanced without requiring page cleaning.
As this is a placement decision in the page allocator and pages are
dirtied only after the allocation, this patch allows allocators to pass
__GFP_WRITE when they know in advance that the page will be written to and
become dirty soon. The page allocator will then attempt to allocate from
the first zone of the zonelist - which on NUMA is determined by the task's
NUMA memory policy - that has not exceeded its dirty limit.
At first glance, it would appear that the diversion to lower zones can
increase pressure on them, but this is not the case. With a full high
zone, allocations will be diverted to lower zones eventually, so it is
more of a shift in timing of the lower zone allocations. Workloads that
previously could fit their dirty pages completely in the higher zone may
be forced to allocate from lower zones, but the amount of pages that
"spill over" are limited themselves by the lower zones' dirty constraints,
and thus unlikely to become a problem.
For now, the problem of unfair dirty page distribution remains for NUMA
configurations where the zones allowed for allocation are in sum not big
enough to trigger the global dirty limits, wake up the flusher threads and
remedy the situation. Because of this, an allocation that could not
succeed on any of the considered zones is allowed to ignore the dirty
limits before going into direct reclaim or even failing the allocation,
until a future patch changes the global dirty throttling and flusher
thread activation so that they take individual zone states into account.
Test results
15M DMA + 3246M DMA32 + 504M Normal = 3765M memory
40% dirty ratio
16G USB thumb drive
10 runs of dd if=/dev/zero of=disk/zeroes bs=32k count=$((10 << 15))
seconds nr_vmscan_write
(stddev) min| median| max
xfs
vanilla: 549.747( 3.492) 0.000| 0.000| 0.000
patched: 550.996( 3.802) 0.000| 0.000| 0.000
fuse-ntfs
vanilla: 1183.094(53.178) 54349.000| 59341.000| 65163.000
patched: 558.049(17.914) 0.000| 0.000| 43.000
btrfs
vanilla: 573.679(14.015) 156657.000| 460178.000| 606926.000
patched: 563.365(11.368) 0.000| 0.000| 1362.000
ext4
vanilla: 561.197(15.782) 0.000|2725438.000|4143837.000
patched: 568.806(17.496) 0.000| 0.000| 0.000
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-01-11 07:07:49 +08:00
|
|
|
#define ___GFP_WRITE 0x1000000u /* bit 24; plain-integer value behind __GFP_WRITE */
|
2010-10-27 05:22:04 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * GFP bitmasks..
 *
 * Zone modifiers (see linux/mmzone.h - low four bits)
 *
 * Do not put any conditional on these. If necessary modify the definitions
 * without the underscores and use them consistently. The definitions here may
 * be used in bit comparisons.
 */
#define __GFP_DMA	((__force gfp_t)___GFP_DMA)
#define __GFP_HIGHMEM	((__force gfp_t)___GFP_HIGHMEM)
#define __GFP_DMA32	((__force gfp_t)___GFP_DMA32)
#define __GFP_MOVABLE	((__force gfp_t)___GFP_MOVABLE)  /* Page is movable */
/* Mask selecting all four zone modifier bits at once */
#define GFP_ZONEMASK	(__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * Action modifiers - these don't change the zoning
 *
 * __GFP_REPEAT: Try hard to allocate the memory, but the allocation attempt
 * _might_ fail.  This depends upon the particular VM implementation.
 *
 * __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller
 * cannot handle allocation failures.  This modifier is deprecated and no new
 * users should be added.
 *
 * __GFP_NORETRY: The VM implementation must not retry indefinitely.
 *
 * __GFP_MOVABLE: Flag that this page will be movable by the page migration
 * mechanism or reclaimed
 */
#define __GFP_WAIT	((__force gfp_t)___GFP_WAIT)	/* Can wait and reschedule? */
#define __GFP_HIGH	((__force gfp_t)___GFP_HIGH)	/* Should access emergency pools? */
#define __GFP_IO	((__force gfp_t)___GFP_IO)	/* Can start physical IO? */
#define __GFP_FS	((__force gfp_t)___GFP_FS)	/* Can call down to low-level FS? */
#define __GFP_COLD	((__force gfp_t)___GFP_COLD)	/* Cache-cold page required */
#define __GFP_NOWARN	((__force gfp_t)___GFP_NOWARN)	/* Suppress page allocation failure warning */
#define __GFP_REPEAT	((__force gfp_t)___GFP_REPEAT)	/* See above */
#define __GFP_NOFAIL	((__force gfp_t)___GFP_NOFAIL)	/* See above */
#define __GFP_NORETRY	((__force gfp_t)___GFP_NORETRY) /* See above */
#define __GFP_COMP	((__force gfp_t)___GFP_COMP)	/* Add compound page metadata */
#define __GFP_ZERO	((__force gfp_t)___GFP_ZERO)	/* Return zeroed page on success */
#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves */
#define __GFP_HARDWALL   ((__force gfp_t)___GFP_HARDWALL) /* Enforce hardwall cpuset memory allocs */
#define __GFP_THISNODE	((__force gfp_t)___GFP_THISNODE)/* No fallback, no policies */
#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
#define __GFP_NOTRACK	((__force gfp_t)___GFP_NOTRACK)  /* Don't track with kmemcheck */

#define __GFP_NO_KSWAPD	((__force gfp_t)___GFP_NO_KSWAPD)
#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
|
mm: try to distribute dirty pages fairly across zones
The maximum number of dirty pages that exist in the system at any time is
determined by a number of pages considered dirtyable and a user-configured
percentage of those, or an absolute number in bytes.
This number of dirtyable pages is the sum of memory provided by all the
zones in the system minus their lowmem reserves and high watermarks, so
that the system can retain a healthy number of free pages without having
to reclaim dirty pages.
But there is a flaw in that we have a zoned page allocator which does not
care about the global state but rather the state of individual memory
zones. And right now there is nothing that prevents one zone from filling
up with dirty pages while other zones are spared, which frequently leads
to situations where kswapd, in order to restore the watermark of free
pages, does indeed have to write pages from that zone's LRU list. This
can interfere so badly with IO from the flusher threads that major
filesystems (btrfs, xfs, ext4) mostly ignore write requests from reclaim
already, taking away the VM's only possibility to keep such a zone
balanced, aside from hoping the flushers will soon clean pages from that
zone.
Enter per-zone dirty limits. They are to a zone's dirtyable memory what
the global limit is to the global amount of dirtyable memory, and try to
make sure that no single zone receives more than its fair share of the
globally allowed dirty pages in the first place. As the number of pages
considered dirtyable excludes the zones' lowmem reserves and high
watermarks, the maximum number of dirty pages in a zone is such that the
zone can always be balanced without requiring page cleaning.
As this is a placement decision in the page allocator and pages are
dirtied only after the allocation, this patch allows allocators to pass
__GFP_WRITE when they know in advance that the page will be written to and
become dirty soon. The page allocator will then attempt to allocate from
the first zone of the zonelist - which on NUMA is determined by the task's
NUMA memory policy - that has not exceeded its dirty limit.
At first glance, it would appear that the diversion to lower zones can
increase pressure on them, but this is not the case. With a full high
zone, allocations will be diverted to lower zones eventually, so it is
more of a shift in timing of the lower zone allocations. Workloads that
previously could fit their dirty pages completely in the higher zone may
be forced to allocate from lower zones, but the amount of pages that
"spill over" are limited themselves by the lower zones' dirty constraints,
and thus unlikely to become a problem.
For now, the problem of unfair dirty page distribution remains for NUMA
configurations where the zones allowed for allocation are in sum not big
enough to trigger the global dirty limits, wake up the flusher threads and
remedy the situation. Because of this, an allocation that could not
succeed on any of the considered zones is allowed to ignore the dirty
limits before going into direct reclaim or even failing the allocation,
until a future patch changes the global dirty throttling and flusher
thread activation so that they take individual zone states into account.
Test results
15M DMA + 3246M DMA32 + 504M Normal = 3765M memory
40% dirty ratio
16G USB thumb drive
10 runs of dd if=/dev/zero of=disk/zeroes bs=32k count=$((10 << 15))
seconds nr_vmscan_write
(stddev) min| median| max
xfs
vanilla: 549.747( 3.492) 0.000| 0.000| 0.000
patched: 550.996( 3.802) 0.000| 0.000| 0.000
fuse-ntfs
vanilla: 1183.094(53.178) 54349.000| 59341.000| 65163.000
patched: 558.049(17.914) 0.000| 0.000| 43.000
btrfs
vanilla: 573.679(14.015) 156657.000| 460178.000| 606926.000
patched: 563.365(11.368) 0.000| 0.000| 1362.000
ext4
vanilla: 561.197(15.782) 0.000|2725438.000|4143837.000
patched: 568.806(17.496) 0.000| 0.000| 0.000
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-01-11 07:07:49 +08:00
|
|
|
#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Caller knows the page will be written to and dirtied soon */
|
2011-01-14 07:46:49 +08:00
|
|
|
|
kmemcheck: add mm functions
With kmemcheck enabled, the slab allocator needs to do this:
1. Tell kmemcheck to allocate the shadow memory which stores the status of
each byte in the allocation proper, e.g. whether it is initialized or
uninitialized.
2. Tell kmemcheck which parts of memory that should be marked uninitialized.
There are actually a few more states, such as "not yet allocated" and
"recently freed".
If a slab cache is set up using the SLAB_NOTRACK flag, it will never return
memory that can take page faults because of kmemcheck.
If a slab cache is NOT set up using the SLAB_NOTRACK flag, callers can still
request memory with the __GFP_NOTRACK flag. This does not prevent the page
faults from occurring, however, but marks the object in question as being
initialized so that no warnings will ever be produced for this object.
In addition to (and in contrast to) __GFP_NOTRACK, the
__GFP_NOTRACK_FALSE_POSITIVE flag indicates that the allocation should
not be tracked _because_ it would produce a false positive. Their values
are identical, but need not be so in the future (for example, we could now
enable/disable false positives with a config option).
Parts of this patch were contributed by Pekka Enberg but merged for
atomicity.
Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
[rebased for mainline inclusion]
Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
2008-05-31 21:56:17 +08:00
|
|
|
/*
 * This may seem redundant, but it's a way of annotating false positives vs.
 * allocations that simply cannot be supported (e.g. page tables).
 *
 * The value is currently identical to __GFP_NOTRACK; only the name differs,
 * so call sites can record why tracking is disabled.
 */
#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
|
|
|
|
|
mm: try to distribute dirty pages fairly across zones
The maximum number of dirty pages that exist in the system at any time is
determined by a number of pages considered dirtyable and a user-configured
percentage of those, or an absolute number in bytes.
This number of dirtyable pages is the sum of memory provided by all the
zones in the system minus their lowmem reserves and high watermarks, so
that the system can retain a healthy number of free pages without having
to reclaim dirty pages.
But there is a flaw in that we have a zoned page allocator which does not
care about the global state but rather the state of individual memory
zones. And right now there is nothing that prevents one zone from filling
up with dirty pages while other zones are spared, which frequently leads
to situations where kswapd, in order to restore the watermark of free
pages, does indeed have to write pages from that zone's LRU list. This
can interfere so badly with IO from the flusher threads that major
filesystems (btrfs, xfs, ext4) mostly ignore write requests from reclaim
already, taking away the VM's only possibility to keep such a zone
balanced, aside from hoping the flushers will soon clean pages from that
zone.
Enter per-zone dirty limits. They are to a zone's dirtyable memory what
the global limit is to the global amount of dirtyable memory, and try to
make sure that no single zone receives more than its fair share of the
globally allowed dirty pages in the first place. As the number of pages
considered dirtyable excludes the zones' lowmem reserves and high
watermarks, the maximum number of dirty pages in a zone is such that the
zone can always be balanced without requiring page cleaning.
As this is a placement decision in the page allocator and pages are
dirtied only after the allocation, this patch allows allocators to pass
__GFP_WRITE when they know in advance that the page will be written to and
become dirty soon. The page allocator will then attempt to allocate from
the first zone of the zonelist - which on NUMA is determined by the task's
NUMA memory policy - that has not exceeded its dirty limit.
At first glance, it would appear that the diversion to lower zones can
increase pressure on them, but this is not the case. With a full high
zone, allocations will be diverted to lower zones eventually, so it is
more of a shift in timing of the lower zone allocations. Workloads that
previously could fit their dirty pages completely in the higher zone may
be forced to allocate from lower zones, but the amount of pages that
"spill over" are limited themselves by the lower zones' dirty constraints,
and thus unlikely to become a problem.
For now, the problem of unfair dirty page distribution remains for NUMA
configurations where the zones allowed for allocation are in sum not big
enough to trigger the global dirty limits, wake up the flusher threads and
remedy the situation. Because of this, an allocation that could not
succeed on any of the considered zones is allowed to ignore the dirty
limits before going into direct reclaim or even failing the allocation,
until a future patch changes the global dirty throttling and flusher
thread activation so that they take individual zone states into account.
Test results
15M DMA + 3246M DMA32 + 504M Normal = 3765M memory
40% dirty ratio
16G USB thumb drive
10 runs of dd if=/dev/zero of=disk/zeroes bs=32k count=$((10 << 15))
seconds nr_vmscan_write
(stddev) min| median| max
xfs
vanilla: 549.747( 3.492) 0.000| 0.000| 0.000
patched: 550.996( 3.802) 0.000| 0.000| 0.000
fuse-ntfs
vanilla: 1183.094(53.178) 54349.000| 59341.000| 65163.000
patched: 558.049(17.914) 0.000| 0.000| 43.000
btrfs
vanilla: 573.679(14.015) 156657.000| 460178.000| 606926.000
patched: 563.365(11.368) 0.000| 0.000| 1362.000
ext4
vanilla: 561.197(15.782) 0.000|2725438.000|4143837.000
patched: 568.806(17.496) 0.000| 0.000| 0.000
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-01-11 07:07:49 +08:00
|
|
|
/*
 * Number of low-order bits reserved for __GFP_FOO flags. The highest flag
 * defined in this file is ___GFP_WRITE (0x1000000u, i.e. bit 24), so 25 bits
 * are required; bump this when a flag above bit 24 is added.
 */
#define __GFP_BITS_SHIFT 25	/* Room for N __GFP_FOO bits */
/* Mask with the low __GFP_BITS_SHIFT bits set: covers every valid GFP flag */
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-04-11 13:53:27 +08:00
|
|
|
/* This equals 0, but use constants in case they ever change */
#define GFP_NOWAIT	(GFP_ATOMIC & ~__GFP_HIGH)
/* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */
#define GFP_ATOMIC	(__GFP_HIGH)
/* May wait, but must not start IO or call into the filesystem */
#define GFP_NOIO	(__GFP_WAIT)
/* May wait and start IO, but must not call into the filesystem */
#define GFP_NOFS	(__GFP_WAIT | __GFP_IO)
/* May wait, start IO and call into the filesystem */
#define GFP_KERNEL	(__GFP_WAIT | __GFP_IO | __GFP_FS)
/* GFP_KERNEL plus __GFP_RECLAIMABLE */
#define GFP_TEMPORARY	(__GFP_WAIT | __GFP_IO | __GFP_FS | \
			 __GFP_RECLAIMABLE)
/* GFP_KERNEL plus __GFP_HARDWALL */
#define GFP_USER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
/* GFP_USER plus __GFP_HIGHMEM */
#define GFP_HIGHUSER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \
			 __GFP_HIGHMEM)
/* GFP_HIGHUSER plus __GFP_MOVABLE */
#define GFP_HIGHUSER_MOVABLE	(__GFP_WAIT | __GFP_IO | __GFP_FS | \
				 __GFP_HARDWALL | __GFP_HIGHMEM | \
				 __GFP_MOVABLE)
/* Just the IO and FS permission bits, e.g. for masking them in or out */
#define GFP_IOFS	(__GFP_IO | __GFP_FS)
|
thp: transparent hugepage core
Lately I've been working to make KVM use hugepages transparently without
the usual restrictions of hugetlbfs. Some of the restrictions I'd like to
see removed:
1) hugepages have to be swappable or the guest physical memory remains
locked in RAM and can't be paged out to swap
2) if a hugepage allocation fails, regular pages should be allocated
instead and mixed in the same vma without any failure and without
userland noticing
3) if some task quits and more hugepages become available in the
buddy, guest physical memory backed by regular pages should be
relocated on hugepages automatically in regions under
madvise(MADV_HUGEPAGE) (ideally event driven by waking up the
kernel daemon if the order=HPAGE_PMD_SHIFT-PAGE_SHIFT list becomes
not null)
4) avoidance of reservation and maximization of use of hugepages whenever
possible. Reservation (needed to avoid runtime fatal failures) may be ok for
1 machine with 1 database with 1 database cache with 1 database cache size
known at boot time. It's definitely not feasible with a virtualization
hypervisor usage like RHEV-H that runs an unknown number of virtual machines
with an unknown size of each virtual machine with an unknown amount of
pagecache that could be potentially useful in the host for guest not using
O_DIRECT (aka cache=off).
hugepages in the virtualization hypervisor (and also in the guest!) are
much more important than in a regular host not using virtualization,
because with NPT/EPT they decrease the tlb-miss cacheline accesses from 24
to 19 in case only the hypervisor uses transparent hugepages, and they
decrease the tlb-miss cacheline accesses from 19 to 15 in case both the
linux hypervisor and the linux guest use this patch (though the
guest will limit the additional speedup to anonymous regions only for
now...). Even more important is that the tlb miss handler is much slower
on a NPT/EPT guest than for a regular shadow paging or no-virtualization
scenario. So maximizing the amount of virtual memory cached by the TLB
pays off significantly more with NPT/EPT than without (even if there would
be no significant speedup in the tlb-miss runtime).
The first (and more tedious) part of this work requires allowing the VM to
handle anonymous hugepages mixed with regular pages transparently on
regular anonymous vmas. This is what this patch tries to achieve in the
least intrusive possible way. We want hugepages and hugetlb to be used in
a way so that all applications can benefit without changes (as usual we
leverage the KVM virtualization design: by improving the Linux VM at
large, KVM gets the performance boost too).
The most important design choice is: always fallback to 4k allocation if
the hugepage allocation fails! This is the _very_ opposite of some large
pagecache patches that failed with -EIO back then if a 64k (or similar)
allocation failed...
Second important decision (to reduce the impact of the feature on the
existing pagetable handling code) is that at any time we can split an
hugepage into 512 regular pages and it has to be done with an operation
that can't fail. This way the reliability of the swapping isn't decreased
(no need to allocate memory when we are short on memory to swap) and it's
trivial to plug a split_huge_page* one-liner where needed without
polluting the VM. Over time we can teach mprotect, mremap and friends to
handle pmd_trans_huge natively without calling split_huge_page*. The fact
it can't fail isn't just for swap: if split_huge_page would return -ENOMEM
(instead of the current void) we'd need to rollback the mprotect from the
middle of it (ideally including undoing the split_vma) which would be a
big change and in the very wrong direction (it'd likely be simpler not to
call split_huge_page at all and to teach mprotect and friends to handle
hugepages instead of rolling them back from the middle). In short the
very value of split_huge_page is that it can't fail.
The collapsing and madvise(MADV_HUGEPAGE) part will remain separated and
incremental and it'll just be an "harmless" addition later if this initial
part is agreed upon. It also should be noted that locking-wise replacing
regular pages with hugepages is going to be very easy if compared to what
I'm doing below in split_huge_page, as it will only happen when
page_count(page) matches page_mapcount(page) if we can take the PG_lock
and mmap_sem in write mode. collapse_huge_page will be a "best effort"
that (unlike split_huge_page) can fail at the minimal sign of trouble and
we can try again later. collapse_huge_page will be similar to how KSM
works and the madvise(MADV_HUGEPAGE) will work similar to
madvise(MADV_MERGEABLE).
The default I like is that transparent hugepages are used at page fault
time. This can be changed with
/sys/kernel/mm/transparent_hugepage/enabled. The control knob can be set
to three values "always", "madvise", "never" which mean respectively that
hugepages are always used, or only inside madvise(MADV_HUGEPAGE) regions,
or never used. /sys/kernel/mm/transparent_hugepage/defrag instead
controls if the hugepage allocation should defrag memory aggressively
"always", only inside "madvise" regions, or "never".
The pmd_trans_splitting/pmd_trans_huge locking is very solid. The
put_page (from get_user_page users that can't use mmu notifier like
O_DIRECT) that runs against a __split_huge_page_refcount instead was a
pain to serialize in a way that would result always in a coherent page
count for both tail and head. I think my locking solution with a
compound_lock taken only after the page_first is valid and is still a
PageHead should be safe but it surely needs review from SMP race point of
view. In short there is no current existing way to serialize the O_DIRECT
final put_page against split_huge_page_refcount so I had to invent a new
one (O_DIRECT loses knowledge on the mapping status by the time gup_fast
returns so...). And I didn't want to impact all gup/gup_fast users for
now, maybe if we change the gup interface substantially we can avoid this
locking, I admit I didn't think too much about it because changing the gup
unpinning interface would be invasive.
If we ignored O_DIRECT we could stick to the existing compound refcounting
code, by simply adding a get_user_pages_fast_flags(foll_flags) where KVM
(and any other mmu notifier user) would call it without FOLL_GET (and if
FOLL_GET isn't set we'd just BUG_ON if nobody registered itself in the
current task mmu notifier list yet). But O_DIRECT is fundamental for
decent performance of virtualized I/O on fast storage so we can't avoid it
to solve the race of put_page against split_huge_page_refcount to achieve
a complete hugepage feature for KVM.
Swap and oom works fine (well just like with regular pages ;). MMU
notifier is handled transparently too, with the exception of the young bit
on the pmd, that didn't have a range check but I think KVM will be fine
because the whole point of hugepages is that EPT/NPT will also use a huge
pmd when they notice gup returns pages with PageCompound set, so they
won't care of a range and there's just the pmd young bit to check in that
case.
NOTE: in some cases if the L2 cache is small, this may slowdown and waste
memory during COWs because 4M of memory are accessed in a single fault
instead of 8k (the payoff is that after COW the program can run faster).
So we might want to switch the copy_huge_page (and clear_huge_page too) to
not temporal stores. I also extensively researched ways to avoid this
cache thrashing with a full prefault logic that would cow in 8k/16k/32k/64k
up to 1M (I can send those patches that fully implemented prefault) but I
concluded they're not worth it and they add an huge additional complexity
and they remove all tlb benefits until the full hugepage has been faulted
in, to save a little bit of memory and some cache during app startup, but
they still don't improve substantially the cache-thrashing during startup
if the prefault happens in >4k chunks. One reason is that those 4k pte
entries copied are still mapped on a perfectly cache-colored hugepage, so
the thrashing is the worst one can generate in those copies (cow of 4k page
copies aren't so well colored so they thrash less, but again this results
in software running faster after the page fault). Those prefault patches
allowed things like a pte where post-cow pages were local 4k regular anon
pages and the not-yet-cowed pte entries were pointing in the middle of
some hugepage mapped read-only. If it doesn't payoff substantially with
todays hardware it will payoff even less in the future with larger l2
caches, and the prefault logic would bloat the VM a lot. If one is
embedded transparent_hugepage can be disabled during boot with sysfs or
with the boot commandline parameter transparent_hugepage=0 (or
transparent_hugepage=2 to restrict hugepages inside madvise regions) that
will ensure not a single hugepage is allocated at boot time. It is simple
enough to just disable transparent hugepage globally and let transparent
hugepages be allocated selectively by applications in the MADV_HUGEPAGE
region (both at page fault time, and if enabled with the
collapse_huge_page too through the kernel daemon).
This patch supports only hugepages mapped in the pmd, archs that have
smaller hugepages will not fit in this patch alone. Also some archs like
power have certain tlb limits that prevents mixing different page size in
the same regions so they will not fit in this framework that requires
"graceful fallback" to basic PAGE_SIZE in case of physical memory
fragmentation. hugetlbfs remains a perfect fit for those because its
software limits happen to match the hardware limits. hugetlbfs also
remains a perfect fit for hugepage sizes like 1GByte that cannot be hoped
to be found not fragmented after a certain system uptime and that would be
very expensive to defragment with relocation, so requiring reservation.
hugetlbfs is the "reservation way", the point of transparent hugepages is
not to have any reservation at all and maximizing the use of cache and
hugepages at all times automatically.
Some performance result:
vmx andrea # LD_PRELOAD=/usr/lib64/libhugetlbfs.so HUGETLB_MORECORE=yes HUGETLB_PATH=/mnt/huge/ ./largepages3
memset page fault 1566023
memset tlb miss 453854
memset second tlb miss 453321
random access tlb miss 41635
random access second tlb miss 41658
vmx andrea # LD_PRELOAD=/usr/lib64/libhugetlbfs.so HUGETLB_MORECORE=yes HUGETLB_PATH=/mnt/huge/ ./largepages3
memset page fault 1566471
memset tlb miss 453375
memset second tlb miss 453320
random access tlb miss 41636
random access second tlb miss 41637
vmx andrea # ./largepages3
memset page fault 1566642
memset tlb miss 453417
memset second tlb miss 453313
random access tlb miss 41630
random access second tlb miss 41647
vmx andrea # ./largepages3
memset page fault 1566872
memset tlb miss 453418
memset second tlb miss 453315
random access tlb miss 41618
random access second tlb miss 41659
vmx andrea # echo 0 > /proc/sys/vm/transparent_hugepage
vmx andrea # ./largepages3
memset page fault 2182476
memset tlb miss 460305
memset second tlb miss 460179
random access tlb miss 44483
random access second tlb miss 44186
vmx andrea # ./largepages3
memset page fault 2182791
memset tlb miss 460742
memset second tlb miss 459962
random access tlb miss 43981
random access second tlb miss 43988
============
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#define SIZE (3UL*1024*1024*1024)
int main()
{
char *p = malloc(SIZE), *p2;
struct timeval before, after;
gettimeofday(&before, NULL);
memset(p, 0, SIZE);
gettimeofday(&after, NULL);
printf("memset page fault %Lu\n",
(after.tv_sec-before.tv_sec)*1000000UL +
after.tv_usec-before.tv_usec);
gettimeofday(&before, NULL);
memset(p, 0, SIZE);
gettimeofday(&after, NULL);
printf("memset tlb miss %Lu\n",
(after.tv_sec-before.tv_sec)*1000000UL +
after.tv_usec-before.tv_usec);
gettimeofday(&before, NULL);
memset(p, 0, SIZE);
gettimeofday(&after, NULL);
printf("memset second tlb miss %Lu\n",
(after.tv_sec-before.tv_sec)*1000000UL +
after.tv_usec-before.tv_usec);
gettimeofday(&before, NULL);
for (p2 = p; p2 < p+SIZE; p2 += 4096)
*p2 = 0;
gettimeofday(&after, NULL);
printf("random access tlb miss %Lu\n",
(after.tv_sec-before.tv_sec)*1000000UL +
after.tv_usec-before.tv_usec);
gettimeofday(&before, NULL);
for (p2 = p; p2 < p+SIZE; p2 += 4096)
*p2 = 0;
gettimeofday(&after, NULL);
printf("random access second tlb miss %Lu\n",
(after.tv_sec-before.tv_sec)*1000000UL +
after.tv_usec-before.tv_usec);
return 0;
}
============
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-01-14 07:46:52 +08:00
|
|
|
#define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
|
|
|
|
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
|
|
|
|
__GFP_NO_KSWAPD)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-09-27 16:50:07 +08:00
|
|
|
#ifdef CONFIG_NUMA
|
2006-09-26 14:31:46 +08:00
|
|
|
#define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
|
2006-09-27 16:50:07 +08:00
|
|
|
#else
|
2007-02-10 00:38:55 +08:00
|
|
|
#define GFP_THISNODE ((__force gfp_t)0)
|
2006-09-27 16:50:07 +08:00
|
|
|
#endif
|
|
|
|
|
2007-10-16 16:25:41 +08:00
|
|
|
/* This mask makes up all the page movable related flags */
|
2007-10-16 16:25:52 +08:00
|
|
|
#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
|
2007-10-16 16:25:41 +08:00
|
|
|
|
|
|
|
/* Control page allocator reclaim behavior */
|
|
|
|
#define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\
|
|
|
|
__GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\
|
|
|
|
__GFP_NORETRY|__GFP_NOMEMALLOC)
|
|
|
|
|
2009-06-12 19:03:06 +08:00
|
|
|
/* Control slab gfp mask during early boot */
|
2010-05-25 05:32:45 +08:00
|
|
|
#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS))
|
2009-06-12 19:03:06 +08:00
|
|
|
|
2007-10-16 16:25:41 +08:00
|
|
|
/* Control allocation constraints */
|
|
|
|
#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
|
|
|
|
|
|
|
|
/* Do not use these with a slab allocator */
|
|
|
|
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
|
2006-09-26 14:31:46 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some
|
|
|
|
platforms, used as appropriate on others */
|
|
|
|
|
|
|
|
#define GFP_DMA __GFP_DMA
|
|
|
|
|
2005-11-06 00:25:53 +08:00
|
|
|
/* 4GB DMA on some platforms */
|
|
|
|
#define GFP_DMA32 __GFP_DMA32
|
|
|
|
|
Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo
This patch provides fragmentation avoidance statistics via /proc/pagetypeinfo.
The information is collected only on request so there is no runtime overhead.
The statistics are in three parts:
The first part prints information on the size of blocks that pages are
being grouped on and looks like
Page block order: 10
Pages per block: 1024
The second part is a more detailed version of /proc/buddyinfo and looks like
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
Node 0, zone DMA, type Unmovable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reclaimable 1 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Reserve 0 4 4 0 0 0 0 1 0 1 0
Node 0, zone Normal, type Unmovable 111 8 4 4 2 3 1 0 0 0 0
Node 0, zone Normal, type Reclaimable 293 89 8 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Movable 1 6 13 9 7 6 3 0 0 0 0
Node 0, zone Normal, type Reserve 0 0 0 0 0 0 0 0 0 0 4
The third part looks like
Number of blocks type Unmovable Reclaimable Movable Reserve
Node 0, zone DMA 0 1 2 1
Node 0, zone Normal 3 17 94 4
To walk the zones within a node with interrupts disabled, walk_zones_in_node()
is introduced and shared between /proc/buddyinfo, /proc/zoneinfo and
/proc/pagetypeinfo to reduce code duplication. It seems specific to what
vmstat.c requires but could be broken out as a general utility function in
mmzone.c if there were other potential users.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 16:26:02 +08:00
|
|
|
/* Convert GFP flags to their corresponding migrate type */
|
|
|
|
/*
 * Returns MIGRATE_UNMOVABLE when page_group_by_mobility_disabled is set;
 * otherwise returns a two-bit value built from the mobility flags in
 * gfp_flags (see the comment above the final return).
 */
static inline int allocflags_to_migratetype(gfp_t gfp_flags)
|
|
|
|
{
|
|
|
|
/* Warn if both __GFP_RECLAIMABLE and __GFP_MOVABLE are set at once */
WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
|
|
|
|
|
|
|
|
if (unlikely(page_group_by_mobility_disabled))
|
|
|
|
return MIGRATE_UNMOVABLE;
|
|
|
|
|
|
|
|
/* Group based on mobility */
/* Result bit 1 is set for __GFP_MOVABLE, bit 0 for __GFP_RECLAIMABLE */
|
|
|
|
return (((gfp_flags & __GFP_MOVABLE) != 0) << 1) |
|
|
|
|
((gfp_flags & __GFP_RECLAIMABLE) != 0);
|
|
|
|
}
|
2005-11-06 00:25:53 +08:00
|
|
|
|
2009-06-17 06:32:46 +08:00
|
|
|
#ifdef CONFIG_HIGHMEM
|
|
|
|
#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
|
|
|
|
#else
|
|
|
|
#define OPT_ZONE_HIGHMEM ZONE_NORMAL
|
|
|
|
#endif
|
|
|
|
|
2007-02-10 17:43:10 +08:00
|
|
|
#ifdef CONFIG_ZONE_DMA
|
2009-06-17 06:32:46 +08:00
|
|
|
#define OPT_ZONE_DMA ZONE_DMA
|
|
|
|
#else
|
|
|
|
#define OPT_ZONE_DMA ZONE_NORMAL
|
2007-02-10 17:43:10 +08:00
|
|
|
#endif
|
2009-06-17 06:32:46 +08:00
|
|
|
|
2006-09-26 14:31:17 +08:00
|
|
|
#ifdef CONFIG_ZONE_DMA32
|
2009-06-17 06:32:46 +08:00
|
|
|
#define OPT_ZONE_DMA32 ZONE_DMA32
|
|
|
|
#else
|
|
|
|
#define OPT_ZONE_DMA32 ZONE_NORMAL
|
2006-09-26 14:31:17 +08:00
|
|
|
#endif
|
2009-06-17 06:32:46 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* GFP_ZONE_TABLE is a word size bitstring that is used for looking up the
|
|
|
|
* zone to use given the lowest 4 bits of gfp_t. Entries are ZONES_SHIFT bits long
|
|
|
|
* and there are 16 of them to cover all possible combinations of
|
2010-05-25 05:32:44 +08:00
|
|
|
* __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM.
|
2009-06-17 06:32:46 +08:00
|
|
|
*
|
|
|
|
* The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA.
|
|
|
|
* But GFP_MOVABLE is not only a zone specifier but also an allocation
|
|
|
|
* policy. Therefore __GFP_MOVABLE plus another zone selector is valid.
|
2010-05-25 05:32:44 +08:00
|
|
|
* Only 1 bit of the lowest 3 bits (DMA,DMA32,HIGHMEM) can be set to "1".
|
2009-06-17 06:32:46 +08:00
|
|
|
*
|
|
|
|
* bit result
|
|
|
|
* =================
|
|
|
|
* 0x0 => NORMAL
|
|
|
|
* 0x1 => DMA or NORMAL
|
|
|
|
* 0x2 => HIGHMEM or NORMAL
|
|
|
|
* 0x3 => BAD (DMA+HIGHMEM)
|
|
|
|
* 0x4 => DMA32 or DMA or NORMAL
|
|
|
|
* 0x5 => BAD (DMA+DMA32)
|
|
|
|
* 0x6 => BAD (HIGHMEM+DMA32)
|
|
|
|
* 0x7 => BAD (HIGHMEM+DMA32+DMA)
|
|
|
|
* 0x8 => NORMAL (MOVABLE+0)
|
|
|
|
* 0x9 => DMA or NORMAL (MOVABLE+DMA)
|
|
|
|
* 0xa => MOVABLE (Movable is valid only if HIGHMEM is set too)
|
|
|
|
* 0xb => BAD (MOVABLE+HIGHMEM+DMA)
|
|
|
|
* 0xc => DMA32 or NORMAL (MOVABLE+DMA32)
|
|
|
|
* 0xd => BAD (MOVABLE+DMA32+DMA)
|
|
|
|
* 0xe => BAD (MOVABLE+DMA32+HIGHMEM)
|
|
|
|
* 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA)
|
|
|
|
*
|
|
|
|
* ZONES_SHIFT must be <= 2 on 32 bit platforms.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#if 16 * ZONES_SHIFT > BITS_PER_LONG
|
|
|
|
#error ZONES_SHIFT too large to create GFP_ZONE_TABLE integer
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#define GFP_ZONE_TABLE ( \
|
2010-10-27 05:22:04 +08:00
|
|
|
(ZONE_NORMAL << 0 * ZONES_SHIFT) \
|
|
|
|
| (OPT_ZONE_DMA << ___GFP_DMA * ZONES_SHIFT) \
|
|
|
|
| (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * ZONES_SHIFT) \
|
|
|
|
| (OPT_ZONE_DMA32 << ___GFP_DMA32 * ZONES_SHIFT) \
|
|
|
|
| (ZONE_NORMAL << ___GFP_MOVABLE * ZONES_SHIFT) \
|
|
|
|
| (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * ZONES_SHIFT) \
|
|
|
|
| (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * ZONES_SHIFT) \
|
|
|
|
| (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * ZONES_SHIFT) \
|
2009-06-17 06:32:46 +08:00
|
|
|
)
|
|
|
|
|
|
|
|
/*
|
2010-05-25 05:32:44 +08:00
|
|
|
* GFP_ZONE_BAD is a bitmap for all combinations of __GFP_DMA, __GFP_DMA32
|
2009-06-17 06:32:46 +08:00
|
|
|
* __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per
|
|
|
|
* entry starting with bit 0. Bit is set if the combination is not
|
|
|
|
* allowed.
|
|
|
|
*/
|
|
|
|
#define GFP_ZONE_BAD ( \
|
2010-10-27 05:22:04 +08:00
|
|
|
1 << (___GFP_DMA | ___GFP_HIGHMEM) \
|
|
|
|
| 1 << (___GFP_DMA | ___GFP_DMA32) \
|
|
|
|
| 1 << (___GFP_DMA32 | ___GFP_HIGHMEM) \
|
|
|
|
| 1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM) \
|
|
|
|
| 1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA) \
|
|
|
|
| 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA) \
|
|
|
|
| 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM) \
|
|
|
|
| 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM) \
|
2009-06-17 06:32:46 +08:00
|
|
|
)
|
|
|
|
|
|
|
|
/*
 * Look up the zone for an allocation from the zone-selector bits of @flags.
 *
 * The selector (flags & GFP_ZONEMASK) is used as an index into the
 * GFP_ZONE_TABLE bitstring, where each entry is ZONES_SHIFT bits wide.
 * VM_BUG_ON() checks that the selector is not one of the combinations
 * marked in GFP_ZONE_BAD.
 */
static inline enum zone_type gfp_zone(gfp_t flags)
|
|
|
|
{
|
|
|
|
enum zone_type z;
|
2010-10-27 05:22:04 +08:00
|
|
|
int bit = (__force int) (flags & GFP_ZONEMASK);
|
2009-06-17 06:32:46 +08:00
|
|
|
|
|
|
|
/* Shift the selected entry down to bit 0 and mask off ZONES_SHIFT bits */
z = (GFP_ZONE_TABLE >> (bit * ZONES_SHIFT)) &
|
|
|
|
((1 << ZONES_SHIFT) - 1);
|
2011-05-25 08:11:42 +08:00
|
|
|
VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
|
2009-06-17 06:32:46 +08:00
|
|
|
return z;
|
2006-09-26 14:31:17 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* There is only one page-allocator function, and two main namespaces to
|
|
|
|
* it. The alloc_page*() variants return 'struct page *' and as such
|
|
|
|
* can allocate highmem pages, the *get*page*() variants return
|
|
|
|
* virtual kernel addresses to the allocated page(s).
|
|
|
|
*/
|
|
|
|
|
2008-04-28 17:12:16 +08:00
|
|
|
/*
 * Return the zonelist index for @flags: 1 when NUMA_BUILD is true and
 * __GFP_THISNODE is set, 0 otherwise. node_zonelist() adds this value to
 * a node's node_zonelists pointer.
 */
static inline int gfp_zonelist(gfp_t flags)
|
|
|
|
{
|
|
|
|
if (NUMA_BUILD && unlikely(flags & __GFP_THISNODE))
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* We get the zone list from the current node and the gfp_mask.
|
|
|
|
* This zone list contains a maximum of MAXNODES*MAX_NR_ZONES zones.
|
2008-04-28 17:12:16 +08:00
|
|
|
* There are two zonelists per node, one for all zones with memory and
|
|
|
|
* one containing just zones from the node the zonelist belongs to.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* For the normal case of non-DISCONTIGMEM systems the NODE_DATA() gets
|
|
|
|
* optimized to &contig_page_data at compile-time.
|
|
|
|
*/
|
2008-04-28 17:12:14 +08:00
|
|
|
static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
|
|
|
|
{
|
2008-04-28 17:12:16 +08:00
|
|
|
/* gfp_zonelist() yields the offset (0 or 1) into nid's node_zonelists */
return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
|
2008-04-28 17:12:14 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Empty default, used only when HAVE_ARCH_FREE_PAGE is not defined */
#ifndef HAVE_ARCH_FREE_PAGE
|
|
|
|
static inline void arch_free_page(struct page *page, int order) { }
|
|
|
|
#endif
|
2006-12-07 12:32:00 +08:00
|
|
|
/* Empty default, used only when HAVE_ARCH_ALLOC_PAGE is not defined */
#ifndef HAVE_ARCH_ALLOC_PAGE
|
|
|
|
static inline void arch_alloc_page(struct page *page, int order) { }
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-07-24 12:27:01 +08:00
|
|
|
struct page *
|
2009-06-17 06:31:52 +08:00
|
|
|
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
|
2008-07-24 12:27:01 +08:00
|
|
|
struct zonelist *zonelist, nodemask_t *nodemask);
|
|
|
|
|
|
|
|
/*
 * Wrapper around __alloc_pages_nodemask() that passes a NULL nodemask;
 * gfp_mask, order and zonelist are forwarded unchanged.
 */
static inline struct page *
|
|
|
|
__alloc_pages(gfp_t gfp_mask, unsigned int order,
|
|
|
|
struct zonelist *zonelist)
|
|
|
|
{
|
2009-06-17 06:31:52 +08:00
|
|
|
return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
|
2008-07-24 12:27:01 +08:00
|
|
|
}
|
|
|
|
|
2005-10-07 14:46:04 +08:00
|
|
|
/*
 * Allocate 2^order... NOTE(review): the meaning of @order is not shown in
 * this header; it is passed through to __alloc_pages() unchanged.
 *
 * A negative @nid is replaced by numa_node_id() before the zonelist for
 * the node is looked up with node_zonelist(nid, gfp_mask).
 */
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned int order)
|
|
|
|
{
|
2006-01-12 05:43:45 +08:00
|
|
|
/* Unknown node is current node */
|
|
|
|
if (nid < 0)
|
|
|
|
nid = numa_node_id();
|
|
|
|
|
|
2008-04-28 17:12:14 +08:00
|
|
|
return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2009-06-17 06:31:54 +08:00
|
|
|
/*
 * Same call as alloc_pages_node() but without the "nid < 0 means current
 * node" fallback: @nid is used as given.
 */
static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask,
|
|
|
|
unsigned int order)
|
|
|
|
{
|
2012-01-11 07:07:38 +08:00
|
|
|
/* nid must lie in [0, MAX_NUMNODES) and refer to an online node */
VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid));
|
2009-06-17 06:31:54 +08:00
|
|
|
|
|
|
|
return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifdef CONFIG_NUMA
|
2005-10-07 14:46:04 +08:00
|
|
|
extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
 * CONFIG_NUMA variant of alloc_pages(): forwards both arguments to
 * alloc_pages_current().
 */
static inline struct page *
|
2005-10-07 14:46:04 +08:00
|
|
|
alloc_pages(gfp_t gfp_mask, unsigned int order)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
return alloc_pages_current(gfp_mask, order);
|
|
|
|
}
|
2011-01-14 07:47:05 +08:00
|
|
|
extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
|
2011-03-05 09:36:29 +08:00
|
|
|
struct vm_area_struct *vma, unsigned long addr,
|
|
|
|
int node);
|
2005-04-17 06:20:36 +08:00
|
|
|
#else
|
|
|
|
#define alloc_pages(gfp_mask, order) \
|
|
|
|
alloc_pages_node(numa_node_id(), gfp_mask, order)
|
2011-03-05 09:36:29 +08:00
|
|
|
#define alloc_pages_vma(gfp_mask, order, vma, addr, node) \
|
2011-01-14 07:47:05 +08:00
|
|
|
alloc_pages(gfp_mask, order)
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|
|
|
|
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
|
2011-03-05 09:36:29 +08:00
|
|
|
#define alloc_page_vma(gfp_mask, vma, addr) \
|
|
|
|
alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())
|
2011-03-05 09:36:30 +08:00
|
|
|
#define alloc_page_vma_node(gfp_mask, vma, addr, node) \
|
|
|
|
alloc_pages_vma(gfp_mask, 0, vma, addr, node)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-02-14 07:03:15 +08:00
|
|
|
extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
|
|
|
|
extern unsigned long get_zeroed_page(gfp_t gfp_mask);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-07-24 12:28:11 +08:00
|
|
|
void *alloc_pages_exact(size_t size, gfp_t gfp_mask);
|
|
|
|
void free_pages_exact(void *virt, size_t size);
|
2011-05-12 06:13:34 +08:00
|
|
|
/* This is different from alloc_pages_exact_node !!! */
|
|
|
|
void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
|
2008-07-24 12:28:11 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#define __get_free_page(gfp_mask) \
|
2010-05-25 05:32:45 +08:00
|
|
|
__get_free_pages((gfp_mask), 0)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#define __get_dma_pages(gfp_mask, order) \
|
2010-05-25 05:32:45 +08:00
|
|
|
__get_free_pages((gfp_mask) | GFP_DMA, (order))
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-02-14 07:03:15 +08:00
|
|
|
extern void __free_pages(struct page *page, unsigned int order);
|
|
|
|
extern void free_pages(unsigned long addr, unsigned int order);
|
2010-03-06 05:41:54 +08:00
|
|
|
extern void free_hot_cold_page(struct page *page, int cold);
|
2012-01-11 07:07:04 +08:00
|
|
|
extern void free_hot_cold_page_list(struct list_head *list, int cold);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#define __free_page(page) __free_pages((page), 0)
|
2010-05-25 05:32:45 +08:00
|
|
|
#define free_page(addr) free_pages((addr), 0)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
void page_alloc_init(void);
|
2007-05-09 17:35:14 +08:00
|
|
|
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
|
2008-02-05 14:29:11 +08:00
|
|
|
void drain_all_pages(void);
|
|
|
|
void drain_local_pages(void *dummy);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2012-01-11 07:07:15 +08:00
|
|
|
/*
|
|
|
|
* gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
|
|
|
|
* GFP flags are used before interrupts are enabled. Once interrupts are
|
|
|
|
* enabled, it is set to __GFP_BITS_MASK while the system is running. During
|
|
|
|
* hibernation, it is used by PM to avoid I/O during memory allocation while
|
|
|
|
* devices are suspended.
|
|
|
|
*/
|
2009-06-18 11:24:12 +08:00
|
|
|
extern gfp_t gfp_allowed_mask;
|
|
|
|
|
2010-12-04 05:57:45 +08:00
|
|
|
extern void pm_restrict_gfp_mask(void);
|
|
|
|
extern void pm_restore_gfp_mask(void);
|
2009-06-18 11:24:12 +08:00
|
|
|
|
2012-01-11 07:07:15 +08:00
|
|
|
#ifdef CONFIG_PM_SLEEP
|
|
|
|
extern bool pm_suspended_storage(void);
|
|
|
|
#else
|
|
|
|
/* Stub for builds without CONFIG_PM_SLEEP: always returns false */
static inline bool pm_suspended_storage(void)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_PM_SLEEP */
|
|
|
|
|
2011-12-29 20:09:50 +08:00
|
|
|
#ifdef CONFIG_CMA
|
|
|
|
|
|
|
|
/* The below functions must be run on a range from a single zone. */
|
2012-04-03 21:06:15 +08:00
|
|
|
extern int alloc_contig_range(unsigned long start, unsigned long end,
|
|
|
|
unsigned migratetype);
|
2011-12-29 20:09:50 +08:00
|
|
|
extern void free_contig_range(unsigned long pfn, unsigned nr_pages);
|
|
|
|
|
2011-12-29 20:09:50 +08:00
|
|
|
/* CMA stuff */
|
|
|
|
extern void init_cma_reserved_pageblock(struct page *page);
|
|
|
|
|
2011-12-29 20:09:50 +08:00
|
|
|
#endif
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif /* __LINUX_GFP_H */
|