2010-05-25 05:32:27 +08:00
|
|
|
#ifndef _LINUX_COMPACTION_H
|
|
|
|
#define _LINUX_COMPACTION_H
|
|
|
|
|
2016-07-29 06:49:28 +08:00
|
|
|
/*
 * Determines how hard direct compaction should try to succeed.
 * Lower value means higher priority, analogous to reclaim priority.
 *
 * MIN_COMPACT_PRIORITY, DEF_COMPACT_PRIORITY and INIT_COMPACT_PRIORITY are
 * aliases of the enumerators they are assigned from, not additional levels.
 */
enum compact_priority {
	COMPACT_PRIO_SYNC_LIGHT,
	MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
	DEF_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
	COMPACT_PRIO_ASYNC,
	INIT_COMPACT_PRIORITY = COMPACT_PRIO_ASYNC
};
|
|
|
|
|
2010-05-25 05:32:30 +08:00
|
|
|
/* Return values for compact_zone() and try_to_compact_pages() */
/* When adding new states, please adjust include/trace/events/compaction.h */
enum compact_result {
	/* For more detailed tracepoint output - internal to compaction */
	COMPACT_NOT_SUITABLE_ZONE,
	/*
	 * compaction didn't start as it was not possible or direct reclaim
	 * was more suitable
	 */
	COMPACT_SKIPPED,
	/* compaction didn't start as it was deferred due to past failures */
	COMPACT_DEFERRED,

	/* compaction not active last round (alias of COMPACT_DEFERRED) */
	COMPACT_INACTIVE = COMPACT_DEFERRED,

	/* For more detailed tracepoint output - internal to compaction */
	COMPACT_NO_SUITABLE_PAGE,
	/* compaction should continue to another pageblock */
	COMPACT_CONTINUE,

	/*
	 * The full zone was scanned but compaction did not succeed in
	 * producing suitable pages.
	 */
	COMPACT_COMPLETE,
	/*
	 * direct compaction has scanned part of the zone but did not succeed
	 * in producing suitable pages.
	 */
	COMPACT_PARTIAL_SKIPPED,

	/* compaction terminated prematurely due to lock contentions */
	COMPACT_CONTENDED,

	/*
	 * direct compaction partially compacted a zone and there might be
	 * suitable pages
	 */
	COMPACT_PARTIAL,
};
|
2010-05-25 05:32:27 +08:00
|
|
|
|
2015-02-12 07:25:44 +08:00
|
|
|
struct alloc_context; /* in mm/internal.h */
|
|
|
|
|
2010-05-25 05:32:28 +08:00
|
|
|
#ifdef CONFIG_COMPACTION
|
|
|
|
extern int sysctl_compact_memory;
|
|
|
|
extern int sysctl_compaction_handler(struct ctl_table *table, int write,
|
|
|
|
void __user *buffer, size_t *length, loff_t *ppos);
|
2010-05-25 05:32:31 +08:00
|
|
|
extern int sysctl_extfrag_threshold;
|
|
|
|
extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
|
|
|
|
void __user *buffer, size_t *length, loff_t *ppos);
|
2015-04-16 07:13:20 +08:00
|
|
|
extern int sysctl_compact_unevictable_allowed;
|
2010-05-25 05:32:30 +08:00
|
|
|
|
|
|
|
extern int fragmentation_index(struct zone *zone, unsigned int order);
|
2016-05-21 07:56:38 +08:00
|
|
|
/*
 * Compaction entry point taking the allocation's gfp mask, order, alloc
 * flags and context plus the requested compaction priority; returns one of
 * the enum compact_result values.
 */
extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
		unsigned int order, unsigned int alloc_flags,
		const struct alloc_context *ac, enum compact_priority prio);
|
2013-02-23 08:32:33 +08:00
|
|
|
extern void compact_pgdat(pg_data_t *pgdat, int order);
|
2012-10-09 07:32:47 +08:00
|
|
|
extern void reset_isolation_suitable(pg_data_t *pgdat);
|
2016-05-21 07:56:38 +08:00
|
|
|
extern enum compact_result compaction_suitable(struct zone *zone, int order,
|
2016-05-20 08:13:38 +08:00
|
|
|
unsigned int alloc_flags, int classzone_idx);
|
2010-05-25 05:32:32 +08:00
|
|
|
|
2015-02-12 07:27:09 +08:00
|
|
|
extern void defer_compaction(struct zone *zone, int order);
|
|
|
|
extern bool compaction_deferred(struct zone *zone, int order);
|
|
|
|
extern void compaction_defer_reset(struct zone *zone, int order,
|
|
|
|
bool alloc_success);
|
|
|
|
extern bool compaction_restarting(struct zone *zone, int order);
|
2012-10-09 07:32:47 +08:00
|
|
|
|
2016-05-21 07:56:56 +08:00
|
|
|
/* Compaction has made some progress and retrying makes sense */
|
|
|
|
static inline bool compaction_made_progress(enum compact_result result)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Even though this might sound confusing this in fact tells us
|
|
|
|
* that the compaction successfully isolated and migrated some
|
|
|
|
* pageblocks.
|
|
|
|
*/
|
|
|
|
if (result == COMPACT_PARTIAL)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Compaction has failed and it doesn't make much sense to keep retrying. */
|
|
|
|
static inline bool compaction_failed(enum compact_result result)
|
|
|
|
{
|
|
|
|
/* All zones were scanned completely and still not result. */
|
|
|
|
if (result == COMPACT_COMPLETE)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Compaction has backed off for some reason. It might be throttling or
|
|
|
|
* lock contention. Retrying is still worthwhile.
|
|
|
|
*/
|
|
|
|
static inline bool compaction_withdrawn(enum compact_result result)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Compaction backed off due to watermark checks for order-0
|
|
|
|
* so the regular reclaim has to try harder and reclaim something.
|
|
|
|
*/
|
|
|
|
if (result == COMPACT_SKIPPED)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If compaction is deferred for high-order allocations, it is
|
|
|
|
* because sync compaction recently failed. If this is the case
|
|
|
|
* and the caller requested a THP allocation, we do not want
|
|
|
|
* to heavily disrupt the system, so we fail the allocation
|
|
|
|
* instead of entering direct reclaim.
|
|
|
|
*/
|
|
|
|
if (result == COMPACT_DEFERRED)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If compaction in async mode encounters contention or blocks higher
|
|
|
|
* priority task we back off early rather than cause stalls.
|
|
|
|
*/
|
|
|
|
if (result == COMPACT_CONTENDED)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Page scanners have met but we haven't scanned full zones so this
|
|
|
|
* is a back off in fact.
|
|
|
|
*/
|
|
|
|
if (result == COMPACT_PARTIAL_SKIPPED)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2016-05-21 07:57:12 +08:00
|
|
|
|
|
|
|
extern bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
					int alloc_flags);
|
|
|
|
|
mm, compaction: introduce kcompactd
Memory compaction can be currently performed in several contexts:
- kswapd balancing a zone after a high-order allocation failure
- direct compaction to satisfy a high-order allocation, including THP
page fault attempts
- khugepaged trying to collapse a hugepage
- manually from /proc
The purpose of compaction is two-fold. The obvious purpose is to
satisfy a (pending or future) high-order allocation, and is easy to
evaluate. The other purpose is to keep overall memory fragmentation low
and help the anti-fragmentation mechanism. The success wrt the latter
purpose is more difficult to evaluate.
The current situation wrt the purposes has a few drawbacks:
- compaction is invoked only when a high-order page or hugepage is not
available (or manually). This might be too late for the purposes of
keeping memory fragmentation low.
- direct compaction increases latency of allocations. Again, it would
be better if compaction was performed asynchronously to keep
fragmentation low, before the allocation itself comes.
- (a special case of the previous) the cost of compaction during THP
page faults can easily offset the benefits of THP.
- kswapd compaction appears to be complex, fragile and not working in
some scenarios. It could also end up compacting for a high-order
allocation request when it should be reclaiming memory for a later
order-0 request.
To improve the situation, we should be able to benefit from an
equivalent of kswapd, but for compaction - i.e. a background thread
which responds to fragmentation and the need for high-order allocations
(including hugepages) somewhat proactively.
One possibility is to extend the responsibilities of kswapd, which could
however complicate its design too much. It should be better to let
kswapd handle reclaim, as order-0 allocations are often more critical
than high-order ones.
Another possibility is to extend khugepaged, but this kthread is a
single instance and tied to THP configs.
This patch goes with the option of a new set of per-node kthreads called
kcompactd, and lays the foundations, without introducing any new
tunables. The lifecycle mimics kswapd kthreads, including the memory
hotplug hooks.
For compaction, kcompactd uses the standard compaction_suitable() and
compact_finished() criteria and the deferred compaction functionality.
Unlike direct compaction, it uses only sync compaction, as there's no
allocation latency to minimize.
This patch doesn't yet add a call to wakeup_kcompactd. The kswapd
compact/reclaim loop for high-order pages will be replaced by waking up
kcompactd in the next patch with the description of what's wrong with
the old approach.
Waking up of the kcompactd threads is also tied to kswapd activity and
follows these rules:
- we don't want to affect any fastpaths, so wake up kcompactd only from
the slowpath, as it's done for kswapd
- if kswapd is doing reclaim, it's more important than compaction, so
don't invoke kcompactd until kswapd goes to sleep
- the target order used for kswapd is passed to kcompactd
Possible future uses for kcompactd include the ability to wake up
kcompactd on demand in special situations, such as when hugepages are
not available (currently not done due to __GFP_NO_KSWAPD) or when a
fragmentation event (i.e. __rmqueue_fallback()) occurs. It's also
possible to perform periodic compaction with kcompactd.
[arnd@arndb.de: fix build errors with kcompactd]
[paul.gortmaker@windriver.com: don't use modular references for non modular code]
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: David Rientjes <rientjes@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-18 05:18:08 +08:00
|
|
|
extern int kcompactd_run(int nid);
|
|
|
|
extern void kcompactd_stop(int nid);
|
|
|
|
extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx);
|
|
|
|
|
2010-05-25 05:32:30 +08:00
|
|
|
#else
|
2013-02-23 08:32:33 +08:00
|
|
|
/* !CONFIG_COMPACTION stub: does nothing. */
static inline void compact_pgdat(pg_data_t *pgdat, int order)
{
}
|
|
|
|
|
2012-10-09 07:32:47 +08:00
|
|
|
/* !CONFIG_COMPACTION stub: does nothing. */
static inline void reset_isolation_suitable(pg_data_t *pgdat)
{
}
|
|
|
|
|
2016-05-21 07:56:38 +08:00
|
|
|
static inline enum compact_result compaction_suitable(struct zone *zone, int order,
|
mm, compaction: pass classzone_idx and alloc_flags to watermark checking
Compaction relies on zone watermark checks for decisions such as if it's
worth to start compacting in compaction_suitable() or whether compaction
should stop in compact_finished(). The watermark checks take
classzone_idx and alloc_flags parameters, which are related to the memory
allocation request. But from the context of compaction they are currently
passed as 0, including the direct compaction which is invoked to satisfy
the allocation request, and could therefore know the proper values.
The lack of proper values can lead to mismatch between decisions taken
during compaction and decisions related to the allocation request. Lack
of proper classzone_idx value means that lowmem_reserve is not taken into
account. This has manifested (during recent changes to deferred
compaction) when DMA zone was used as fallback for preferred Normal zone.
compaction_suitable() without proper classzone_idx would think that the
watermarks are already satisfied, but watermark check in
get_page_from_freelist() would fail. Because of this problem, deferring
compaction has extra complexity that can be removed in the following
patch.
The issue (not confirmed in practice) with missing alloc_flags is opposite
in nature. For allocations that include ALLOC_HIGH, ALLOC_HIGHER or
ALLOC_CMA in alloc_flags (the last includes all MOVABLE allocations on
CMA-enabled systems) the watermark checking in compaction with 0 passed
will be stricter than in get_page_from_freelist(). In these cases
compaction might be running for a longer time than is really needed.
Another issue compaction_suitable() is that the check for "does the zone
need compaction at all?" comes only after the check "does the zone have
enough free free pages to succeed compaction". The latter considers extra
pages for migration and can therefore in some situations fail and return
COMPACT_SKIPPED, although the high-order allocation would succeed and we
should return COMPACT_PARTIAL.
This patch fixes these problems by adding alloc_flags and classzone_idx to
struct compact_control and related functions involved in direct compaction
and watermark checking. Where possible, all other callers of
compaction_suitable() pass proper values where those are known. This is
currently limited to classzone_idx, which is sometimes known in kswapd
context. However, the direct reclaim callers should_continue_reclaim()
and compaction_ready() do not currently know the proper values, so the
coordination between reclaim and compaction may still not be as accurate
as it could. This can be fixed later, if it's shown to be an issue.
Additionaly the checks in compact_suitable() are reordered to address the
second issue described above.
The effect of this patch should be slightly better high-order allocation
success rates and/or less compaction overhead, depending on the type of
allocations and presence of CMA. It allows simplifying deferred
compaction code in a followup patch.
When testing with stress-highalloc, there was some slight improvement
(which might be just due to variance) in success rates of non-THP-like
allocations.
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Christoph Lameter <cl@linux.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-11 07:43:22 +08:00
|
|
|
int alloc_flags, int classzone_idx)
|
2011-01-14 07:45:56 +08:00
|
|
|
{
|
|
|
|
return COMPACT_SKIPPED;
|
|
|
|
}
|
|
|
|
|
2012-03-22 07:33:52 +08:00
|
|
|
/* !CONFIG_COMPACTION stub: does nothing. */
static inline void defer_compaction(struct zone *zone, int order)
{
}
|
|
|
|
|
2012-03-22 07:33:52 +08:00
|
|
|
/* !CONFIG_COMPACTION stub: always reports compaction as deferred. */
static inline bool compaction_deferred(struct zone *zone, int order)
{
	return true;
}
|
|
|
|
|
2016-05-21 07:56:56 +08:00
|
|
|
/* !CONFIG_COMPACTION stub: never reports progress. */
static inline bool compaction_made_progress(enum compact_result result)
{
	return false;
}
|
|
|
|
|
|
|
|
/* !CONFIG_COMPACTION stub: never reports failure. */
static inline bool compaction_failed(enum compact_result result)
{
	return false;
}
|
|
|
|
|
|
|
|
/* !CONFIG_COMPACTION stub: always reports compaction as withdrawn. */
static inline bool compaction_withdrawn(enum compact_result result)
{
	return true;
}
|
|
|
|
|
mm, compaction: introduce kcompactd
Memory compaction can be currently performed in several contexts:
- kswapd balancing a zone after a high-order allocation failure
- direct compaction to satisfy a high-order allocation, including THP
page fault attempts
- khugepaged trying to collapse a hugepage
- manually from /proc
The purpose of compaction is two-fold. The obvious purpose is to
satisfy a (pending or future) high-order allocation, and is easy to
evaluate. The other purpose is to keep overall memory fragmentation low
and help the anti-fragmentation mechanism. The success wrt the latter
purpose is more difficult to evaluate.
The current situation wrt the purposes has a few drawbacks:
- compaction is invoked only when a high-order page or hugepage is not
available (or manually). This might be too late for the purposes of
keeping memory fragmentation low.
- direct compaction increases latency of allocations. Again, it would
be better if compaction was performed asynchronously to keep
fragmentation low, before the allocation itself comes.
- (a special case of the previous) the cost of compaction during THP
page faults can easily offset the benefits of THP.
- kswapd compaction appears to be complex, fragile and not working in
some scenarios. It could also end up compacting for a high-order
allocation request when it should be reclaiming memory for a later
order-0 request.
To improve the situation, we should be able to benefit from an
equivalent of kswapd, but for compaction - i.e. a background thread
which responds to fragmentation and the need for high-order allocations
(including hugepages) somewhat proactively.
One possibility is to extend the responsibilities of kswapd, which could
however complicate its design too much. It should be better to let
kswapd handle reclaim, as order-0 allocations are often more critical
than high-order ones.
Another possibility is to extend khugepaged, but this kthread is a
single instance and tied to THP configs.
This patch goes with the option of a new set of per-node kthreads called
kcompactd, and lays the foundations, without introducing any new
tunables. The lifecycle mimics kswapd kthreads, including the memory
hotplug hooks.
For compaction, kcompactd uses the standard compaction_suitable() and
compact_finished() criteria and the deferred compaction functionality.
Unlike direct compaction, it uses only sync compaction, as there's no
allocation latency to minimize.
This patch doesn't yet add a call to wakeup_kcompactd. The kswapd
compact/reclaim loop for high-order pages will be replaced by waking up
kcompactd in the next patch with the description of what's wrong with
the old approach.
Waking up of the kcompactd threads is also tied to kswapd activity and
follows these rules:
- we don't want to affect any fastpaths, so wake up kcompactd only from
the slowpath, as it's done for kswapd
- if kswapd is doing reclaim, it's more important than compaction, so
don't invoke kcompactd until kswapd goes to sleep
- the target order used for kswapd is passed to kcompactd
Possible future uses for kcompactd include the ability to wake up
kcompactd on demand in special situations, such as when hugepages are
not available (currently not done due to __GFP_NO_KSWAPD) or when a
fragmentation event (i.e. __rmqueue_fallback()) occurs. It's also
possible to perform periodic compaction with kcompactd.
[arnd@arndb.de: fix build errors with kcompactd]
[paul.gortmaker@windriver.com: don't use modular references for non modular code]
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: David Rientjes <rientjes@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-18 05:18:08 +08:00
|
|
|
/* !CONFIG_COMPACTION stub: starts nothing and returns 0. */
static inline int kcompactd_run(int nid)
{
	return 0;
}
|
|
|
|
/* !CONFIG_COMPACTION stub: does nothing. */
static inline void kcompactd_stop(int nid)
{
}
|
|
|
|
|
|
|
|
/* !CONFIG_COMPACTION stub: does nothing. */
static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
{
}
|
|
|
|
|
2010-05-25 05:32:28 +08:00
|
|
|
#endif /* CONFIG_COMPACTION */
|
|
|
|
|
2010-05-25 05:32:29 +08:00
|
|
|
#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
|
mm: migrate: support non-lru movable page migration
We have allowed migration for only LRU pages until now and it was enough
to make high-order pages. But recently, embedded system(e.g., webOS,
android) uses lots of non-movable pages(e.g., zram, GPU memory) so we
have seen several reports about troubles of small high-order allocation.
For fixing the problem, there were several efforts (e.g., enhance
compaction algorithm, SLUB fallback to 0-order page, reserved memory,
vmalloc and so on) but if there are lots of non-movable pages in system,
their solutions are void in the long run.
So, this patch is to support facility to change non-movable pages with
movable. For the feature, this patch introduces functions related to
migration to address_space_operations as well as some page flags.
If a driver wants to make its own pages movable, it should define three
functions which are function pointers of struct
address_space_operations.
1. bool (*isolate_page) (struct page *page, isolate_mode_t mode);
What VM expects on isolate_page function of driver is to return *true*
if driver isolates page successfully. On returning true, VM marks the
page as PG_isolated so concurrent isolation in several CPUs skip the
page for isolation. If a driver cannot isolate the page, it should
return *false*.
Once page is successfully isolated, VM uses page.lru fields so driver
shouldn't expect to preserve values in those fields.
2. int (*migratepage) (struct address_space *mapping,
struct page *newpage, struct page *oldpage, enum migrate_mode);
After isolation, VM calls migratepage of driver with isolated page. The
function of migratepage is to move content of the old page to new page
and set up fields of struct page newpage. Keep in mind that you should
indicate to the VM the oldpage is no longer movable via
__ClearPageMovable() under page_lock if you migrated the oldpage
successfully and returns 0. If driver cannot migrate the page at the
moment, driver can return -EAGAIN. On -EAGAIN, VM will retry page
migration in a short time because VM interprets -EAGAIN as "temporal
migration failure". On returning any error except -EAGAIN, VM will give
up the page migration without retrying in this time.
Driver shouldn't touch page.lru field VM using in the functions.
3. void (*putback_page)(struct page *);
If migration fails on isolated page, VM should return the isolated page
to the driver so VM calls driver's putback_page with migration failed
page. In this function, driver should put the isolated page back to the
own data structure.
4. non-lru movable page flags
There are two page flags for supporting non-lru movable page.
* PG_movable
Driver should use the below function to make page movable under
page_lock.
void __SetPageMovable(struct page *page, struct address_space *mapping)
It needs argument of address_space for registering migration family
functions which will be called by VM. Exactly speaking, PG_movable is
not a real flag of struct page. Rather than, VM reuses page->mapping's
lower bits to represent it.
#define PAGE_MAPPING_MOVABLE 0x2
page->mapping = page->mapping | PAGE_MAPPING_MOVABLE;
so driver shouldn't access page->mapping directly. Instead, driver
should use page_mapping which mask off the low two bits of page->mapping
so it can get right struct address_space.
For testing of non-lru movable page, VM supports __PageMovable function.
However, it doesn't guarantee to identify non-lru movable page because
page->mapping field is unified with other variables in struct page. As
well, if driver releases the page after isolation by VM, page->mapping
doesn't have stable value although it has PAGE_MAPPING_MOVABLE (Look at
__ClearPageMovable). But __PageMovable is cheap to catch whether page
is LRU or non-lru movable once the page has been isolated. Because LRU
pages never can have PAGE_MAPPING_MOVABLE in page->mapping. It is also
good for just peeking to test non-lru movable pages before more
expensive checking with lock_page in pfn scanning to select victim.
For guaranteeing non-lru movable page, VM provides PageMovable function.
Unlike __PageMovable, PageMovable functions validates page->mapping and
mapping->a_ops->isolate_page under lock_page. The lock_page prevents
sudden destroying of page->mapping.
Driver using __SetPageMovable should clear the flag via
__ClearPageMovable under page_lock before releasing the page.
* PG_isolated
To prevent concurrent isolation among several CPUs, VM marks isolated
page as PG_isolated under lock_page. So if a CPU encounters PG_isolated
non-lru movable page, it can skip it. Driver doesn't need to manipulate
the flag because VM will set/clear it automatically. Keep in mind that
if driver sees PG_isolated page, it means the page has been isolated by
VM so it shouldn't touch page.lru field. PG_isolated is alias with
PG_reclaim flag so driver shouldn't use the flag for own purpose.
[opensource.ganesh@gmail.com: mm/compaction: remove local variable is_lru]
Link: http://lkml.kernel.org/r/20160618014841.GA7422@leo-test
Link: http://lkml.kernel.org/r/1464736881-24886-3-git-send-email-minchan@kernel.org
Signed-off-by: Gioh Kim <gi-oh.kim@profitbricks.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Ganesh Mahendran <opensource.ganesh@gmail.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rafael Aquini <aquini@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: John Einar Reitan <john.reitan@foss.arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-07-27 06:23:05 +08:00
|
|
|
struct node;
extern int compaction_register_node(struct node *node);
extern void compaction_unregister_node(struct node *node);
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
/*
 * Forward declaration so that 'struct node' in the stub prototypes below has
 * file scope; the only other declaration in this header sits in the #if
 * branch and is not visible here.
 */
struct node;

/* Stub for when COMPACTION, SYSFS or NUMA is not configured: returns 0. */
static inline int compaction_register_node(struct node *node)
{
	return 0;
}
|
|
|
|
|
|
|
|
/* Stub for when COMPACTION, SYSFS or NUMA is not configured: does nothing. */
static inline void compaction_unregister_node(struct node *node)
{
}
|
|
|
|
#endif /* CONFIG_COMPACTION && CONFIG_SYSFS && CONFIG_NUMA */
|
|
|
|
|
2010-05-25 05:32:27 +08:00
|
|
|
#endif /* _LINUX_COMPACTION_H */
|