2005-04-17 06:20:36 +08:00
|
|
|
#ifndef _LINUX_HUGETLB_H
|
|
|
|
#define _LINUX_HUGETLB_H
|
|
|
|
|
2011-05-27 03:03:50 +08:00
|
|
|
#include <linux/mm_types.h>
|
2014-01-24 07:52:54 +08:00
|
|
|
#include <linux/mmdebug.h>
|
2007-07-30 06:36:13 +08:00
|
|
|
#include <linux/fs.h>
|
2010-05-28 08:29:15 +08:00
|
|
|
#include <linux/hugetlb_inline.h>
|
2012-08-01 07:42:24 +08:00
|
|
|
#include <linux/cgroup.h>
|
mm, hugetlb: unify region structure handling
Currently, to track reserved and allocated regions, we use two different
ways, depending on the mapping. For MAP_SHARED, we use
address_mapping's private_list and, while for MAP_PRIVATE, we use a
resv_map.
Now, we are preparing to change a coarse grained lock which protect a
region structure to fine grained lock, and this difference hinder it.
So, before changing it, unify region structure handling, consistently
using a resv_map regardless of the kind of mapping.
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-04 05:47:25 +08:00
|
|
|
#include <linux/list.h>
|
|
|
|
#include <linux/kref.h>
|
2016-01-16 08:56:32 +08:00
|
|
|
#include <asm/pgtable.h>
|
2007-07-30 06:36:13 +08:00
|
|
|
|
2009-09-25 05:47:45 +08:00
|
|
|
struct ctl_table;
|
|
|
|
struct user_struct;
|
2012-08-01 07:42:03 +08:00
|
|
|
struct mmu_gather;
|
2009-09-25 05:47:45 +08:00
|
|
|
|
2017-07-07 06:38:53 +08:00
|
|
|
#ifndef is_hugepd
|
|
|
|
/*
|
|
|
|
* Some architectures requires a hugepage directory format that is
|
|
|
|
* required to support multiple hugepage sizes. For example
|
|
|
|
* a4fe3ce76 "powerpc/mm: Allow more flexible layouts for hugepage pagetables"
|
|
|
|
* introduced the same on powerpc. This allows for a more flexible hugepage
|
|
|
|
* pagetable layout.
|
|
|
|
*/
|
|
|
|
typedef struct { unsigned long pd; } hugepd_t;
|
|
|
|
#define is_hugepd(hugepd) (0)
|
|
|
|
#define __hugepd(x) ((hugepd_t) { (x) })
|
|
|
|
static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
|
|
|
|
unsigned pdshift, unsigned long end,
|
|
|
|
int write, struct page **pages, int *nr)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
extern int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
|
|
|
|
unsigned pdshift, unsigned long end,
|
|
|
|
int write, struct page **pages, int *nr);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifdef CONFIG_HUGETLB_PAGE
|
|
|
|
|
|
|
|
#include <linux/mempolicy.h>
|
2007-03-02 07:46:08 +08:00
|
|
|
#include <linux/shm.h>
|
2005-06-22 08:14:44 +08:00
|
|
|
#include <asm/tlbflush.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
hugepages: fix use after free bug in "quota" handling
hugetlbfs_{get,put}_quota() are badly named. They don't interact with the
general quota handling code, and they don't much resemble its behaviour.
Rather than being about maintaining limits on on-disk block usage by
particular users, they are instead about maintaining limits on in-memory
page usage (including anonymous MAP_PRIVATE copied-on-write pages)
associated with a particular hugetlbfs filesystem instance.
Worse, they work by having callbacks to the hugetlbfs filesystem code from
the low-level page handling code, in particular from free_huge_page().
This is a layering violation of itself, but more importantly, if the
kernel does a get_user_pages() on hugepages (which can happen from KVM
amongst others), then the free_huge_page() can be delayed until after the
associated inode has already been freed. If an unmount occurs at the
wrong time, even the hugetlbfs superblock where the "quota" limits are
stored may have been freed.
Andrew Barry proposed a patch to fix this by having hugepages, instead of
storing a pointer to their address_space and reaching the superblock from
there, had the hugepages store pointers directly to the superblock,
bumping the reference count as appropriate to avoid it being freed.
Andrew Morton rejected that version, however, on the grounds that it made
the existing layering violation worse.
This is a reworked version of Andrew's patch, which removes the extra, and
some of the existing, layering violation. It works by introducing the
concept of a hugepage "subpool" at the lower hugepage mm layer - that is a
finite logical pool of hugepages to allocate from. hugetlbfs now creates
a subpool for each filesystem instance with a page limit set, and a
pointer to the subpool gets added to each allocated hugepage, instead of
the address_space pointer used now. The subpool has its own lifetime and
is only freed once all pages in it _and_ all other references to it (i.e.
superblocks) are gone.
subpools are optional - a NULL subpool pointer is taken by the code to
mean that no subpool limits are in effect.
Previous discussion of this bug found in: "Fix refcounting in hugetlbfs
quota handling.". See: https://lkml.org/lkml/2011/8/11/28 or
http://marc.info/?l=linux-mm&m=126928970510627&w=1
v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter to
alloc_huge_page() - since it already takes the vma, it is not necessary.
Signed-off-by: Andrew Barry <abarry@cray.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 07:34:12 +08:00
|
|
|
struct hugepage_subpool {
|
|
|
|
spinlock_t lock;
|
|
|
|
long count;
|
hugetlbfs: add minimum size tracking fields to subpool structure
hugetlbfs allocates huge pages from the global pool as needed. Even if
the global pool contains a sufficient number pages for the filesystem size
at mount time, those global pages could be grabbed for some other use. As
a result, filesystem huge page allocations may fail due to lack of pages.
Applications such as a database want to use huge pages for performance
reasons. hugetlbfs filesystem semantics with ownership and modes work
well to manage access to a pool of huge pages. However, the application
would like some reasonable assurance that allocations will not fail due to
a lack of huge pages. At application startup time, the application would
like to configure itself to use a specific number of huge pages. Before
starting, the application can check to make sure that enough huge pages
exist in the system global pools. However, there are no guarantees that
those pages will be available when needed by the application. What the
application wants is exclusive use of a subset of huge pages.
Add a new hugetlbfs mount option 'min_size=<value>' to indicate that the
specified number of pages will be available for use by the filesystem. At
mount time, this number of huge pages will be reserved for exclusive use
of the filesystem. If there is not a sufficient number of free pages, the
mount will fail. As pages are allocated to and freeed from the
filesystem, the number of reserved pages is adjusted so that the specified
minimum is maintained.
This patch (of 4):
Add a field to the subpool structure to indicate the minimimum number of
huge pages to always be used by this subpool. This minimum count includes
allocated pages as well as reserved pages. If the minimum number of pages
for the subpool have not been allocated, pages are reserved up to this
minimum. An additional field (rsv_hpages) is used to track the number of
pages reserved to meet this minimum size. The hstate pointer in the
subpool is convenient to have when reserving and unreserving the pages.
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-04-16 07:13:36 +08:00
|
|
|
long max_hpages; /* Maximum huge pages or -1 if no maximum. */
|
|
|
|
long used_hpages; /* Used count against maximum, includes */
|
|
|
|
/* both alloced and reserved pages. */
|
|
|
|
struct hstate *hstate;
|
|
|
|
long min_hpages; /* Minimum huge pages or -1 if no minimum. */
|
|
|
|
long rsv_hpages; /* Pages reserved against global pool to */
|
|
|
|
/* sasitfy minimum size. */
|
hugepages: fix use after free bug in "quota" handling
hugetlbfs_{get,put}_quota() are badly named. They don't interact with the
general quota handling code, and they don't much resemble its behaviour.
Rather than being about maintaining limits on on-disk block usage by
particular users, they are instead about maintaining limits on in-memory
page usage (including anonymous MAP_PRIVATE copied-on-write pages)
associated with a particular hugetlbfs filesystem instance.
Worse, they work by having callbacks to the hugetlbfs filesystem code from
the low-level page handling code, in particular from free_huge_page().
This is a layering violation of itself, but more importantly, if the
kernel does a get_user_pages() on hugepages (which can happen from KVM
amongst others), then the free_huge_page() can be delayed until after the
associated inode has already been freed. If an unmount occurs at the
wrong time, even the hugetlbfs superblock where the "quota" limits are
stored may have been freed.
Andrew Barry proposed a patch to fix this by having hugepages, instead of
storing a pointer to their address_space and reaching the superblock from
there, had the hugepages store pointers directly to the superblock,
bumping the reference count as appropriate to avoid it being freed.
Andrew Morton rejected that version, however, on the grounds that it made
the existing layering violation worse.
This is a reworked version of Andrew's patch, which removes the extra, and
some of the existing, layering violation. It works by introducing the
concept of a hugepage "subpool" at the lower hugepage mm layer - that is a
finite logical pool of hugepages to allocate from. hugetlbfs now creates
a subpool for each filesystem instance with a page limit set, and a
pointer to the subpool gets added to each allocated hugepage, instead of
the address_space pointer used now. The subpool has its own lifetime and
is only freed once all pages in it _and_ all other references to it (i.e.
superblocks) are gone.
subpools are optional - a NULL subpool pointer is taken by the code to
mean that no subpool limits are in effect.
Previous discussion of this bug found in: "Fix refcounting in hugetlbfs
quota handling.". See: https://lkml.org/lkml/2011/8/11/28 or
http://marc.info/?l=linux-mm&m=126928970510627&w=1
v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter to
alloc_huge_page() - since it already takes the vma, it is not necessary.
Signed-off-by: Andrew Barry <abarry@cray.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 07:34:12 +08:00
|
|
|
};
|
|
|
|
|
mm, hugetlb: unify region structure handling
Currently, to track reserved and allocated regions, we use two different
ways, depending on the mapping. For MAP_SHARED, we use
address_mapping's private_list and, while for MAP_PRIVATE, we use a
resv_map.
Now, we are preparing to change a coarse grained lock which protect a
region structure to fine grained lock, and this difference hinder it.
So, before changing it, unify region structure handling, consistently
using a resv_map regardless of the kind of mapping.
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-04 05:47:25 +08:00
|
|
|
struct resv_map {
|
|
|
|
struct kref refs;
|
2014-04-04 05:47:27 +08:00
|
|
|
spinlock_t lock;
|
mm, hugetlb: unify region structure handling
Currently, to track reserved and allocated regions, we use two different
ways, depending on the mapping. For MAP_SHARED, we use
address_mapping's private_list and, while for MAP_PRIVATE, we use a
resv_map.
Now, we are preparing to change a coarse grained lock which protect a
region structure to fine grained lock, and this difference hinder it.
So, before changing it, unify region structure handling, consistently
using a resv_map regardless of the kind of mapping.
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-04 05:47:25 +08:00
|
|
|
struct list_head regions;
|
2015-09-09 06:01:28 +08:00
|
|
|
long adds_in_progress;
|
|
|
|
struct list_head region_cache;
|
|
|
|
long region_cache_count;
|
mm, hugetlb: unify region structure handling
Currently, to track reserved and allocated regions, we use two different
ways, depending on the mapping. For MAP_SHARED, we use
address_mapping's private_list and, while for MAP_PRIVATE, we use a
resv_map.
Now, we are preparing to change a coarse grained lock which protect a
region structure to fine grained lock, and this difference hinder it.
So, before changing it, unify region structure handling, consistently
using a resv_map regardless of the kind of mapping.
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-04 05:47:25 +08:00
|
|
|
};
|
|
|
|
extern struct resv_map *resv_map_alloc(void);
|
|
|
|
void resv_map_release(struct kref *ref);
|
|
|
|
|
2012-08-01 07:42:10 +08:00
|
|
|
extern spinlock_t hugetlb_lock;
|
|
|
|
extern int hugetlb_max_hstate __read_mostly;
|
|
|
|
#define for_each_hstate(h) \
|
|
|
|
for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++)
|
|
|
|
|
2015-04-16 07:13:42 +08:00
|
|
|
struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
|
|
|
|
long min_hpages);
|
hugepages: fix use after free bug in "quota" handling
hugetlbfs_{get,put}_quota() are badly named. They don't interact with the
general quota handling code, and they don't much resemble its behaviour.
Rather than being about maintaining limits on on-disk block usage by
particular users, they are instead about maintaining limits on in-memory
page usage (including anonymous MAP_PRIVATE copied-on-write pages)
associated with a particular hugetlbfs filesystem instance.
Worse, they work by having callbacks to the hugetlbfs filesystem code from
the low-level page handling code, in particular from free_huge_page().
This is a layering violation of itself, but more importantly, if the
kernel does a get_user_pages() on hugepages (which can happen from KVM
amongst others), then the free_huge_page() can be delayed until after the
associated inode has already been freed. If an unmount occurs at the
wrong time, even the hugetlbfs superblock where the "quota" limits are
stored may have been freed.
Andrew Barry proposed a patch to fix this by having hugepages, instead of
storing a pointer to their address_space and reaching the superblock from
there, had the hugepages store pointers directly to the superblock,
bumping the reference count as appropriate to avoid it being freed.
Andrew Morton rejected that version, however, on the grounds that it made
the existing layering violation worse.
This is a reworked version of Andrew's patch, which removes the extra, and
some of the existing, layering violation. It works by introducing the
concept of a hugepage "subpool" at the lower hugepage mm layer - that is a
finite logical pool of hugepages to allocate from. hugetlbfs now creates
a subpool for each filesystem instance with a page limit set, and a
pointer to the subpool gets added to each allocated hugepage, instead of
the address_space pointer used now. The subpool has its own lifetime and
is only freed once all pages in it _and_ all other references to it (i.e.
superblocks) are gone.
subpools are optional - a NULL subpool pointer is taken by the code to
mean that no subpool limits are in effect.
Previous discussion of this bug found in: "Fix refcounting in hugetlbfs
quota handling.". See: https://lkml.org/lkml/2011/8/11/28 or
http://marc.info/?l=linux-mm&m=126928970510627&w=1
v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter to
alloc_huge_page() - since it already takes the vma, it is not necessary.
Signed-off-by: Andrew Barry <abarry@cray.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 07:34:12 +08:00
|
|
|
void hugepage_put_subpool(struct hugepage_subpool *spool);
|
|
|
|
|
2008-07-24 12:27:23 +08:00
|
|
|
void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
|
2009-09-24 06:57:19 +08:00
|
|
|
int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
|
|
|
|
int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
|
|
|
|
int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
|
hugetlb: derive huge pages nodes allowed from task mempolicy
This patch derives a "nodes_allowed" node mask from the numa mempolicy of
the task modifying the number of persistent huge pages to control the
allocation, freeing and adjusting of surplus huge pages when the pool page
count is modified via the new sysctl or sysfs attribute
"nr_hugepages_mempolicy". The nodes_allowed mask is derived as follows:
* For "default" [NULL] task mempolicy, a NULL nodemask_t pointer
is produced. This will cause the hugetlb subsystem to use
node_online_map as the "nodes_allowed". This preserves the
behavior before this patch.
* For "preferred" mempolicy, including explicit local allocation,
a nodemask with the single preferred node will be produced.
"local" policy will NOT track any internode migrations of the
task adjusting nr_hugepages.
* For "bind" and "interleave" policy, the mempolicy's nodemask
will be used.
* Other than to inform the construction of the nodes_allowed node
mask, the actual mempolicy mode is ignored. That is, all modes
behave like interleave over the resulting nodes_allowed mask
with no "fallback".
See the updated documentation [next patch] for more information
about the implications of this patch.
Examples:
Starting with:
Node 0 HugePages_Total: 0
Node 1 HugePages_Total: 0
Node 2 HugePages_Total: 0
Node 3 HugePages_Total: 0
Default behavior [with or without this patch] balances persistent
hugepage allocation across nodes [with sufficient contiguous memory]:
sysctl vm.nr_hugepages[_mempolicy]=32
yields:
Node 0 HugePages_Total: 8
Node 1 HugePages_Total: 8
Node 2 HugePages_Total: 8
Node 3 HugePages_Total: 8
Of course, we only have nr_hugepages_mempolicy with the patch,
but with default mempolicy, nr_hugepages_mempolicy behaves the
same as nr_hugepages.
Applying mempolicy--e.g., with numactl [using '-m' a.k.a.
'--membind' because it allows multiple nodes to be specified
and it's easy to type]--we can allocate huge pages on
individual nodes or sets of nodes. So, starting from the
condition above, with 8 huge pages per node, add 8 more to
node 2 using:
numactl -m 2 sysctl vm.nr_hugepages_mempolicy=40
This yields:
Node 0 HugePages_Total: 8
Node 1 HugePages_Total: 8
Node 2 HugePages_Total: 16
Node 3 HugePages_Total: 8
The incremental 8 huge pages were restricted to node 2 by the
specified mempolicy.
Similarly, we can use mempolicy to free persistent huge pages
from specified nodes:
numactl -m 0,1 sysctl vm.nr_hugepages_mempolicy=32
yields:
Node 0 HugePages_Total: 4
Node 1 HugePages_Total: 4
Node 2 HugePages_Total: 16
Node 3 HugePages_Total: 8
The 8 huge pages freed were balanced over nodes 0 and 1.
[rientjes@google.com: accomodate reworked NODEMASK_ALLOC]
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Andi Kleen <andi@firstfloor.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Adam Litke <agl@us.ibm.com>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: Eric Whitney <eric.whitney@hp.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-15 09:58:21 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int,
|
|
|
|
void __user *, size_t *, loff_t *);
|
|
|
|
#endif
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
|
2013-02-23 08:35:55 +08:00
|
|
|
long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
|
|
|
|
struct page **, struct vm_area_struct **,
|
2017-02-23 07:43:13 +08:00
|
|
|
unsigned long *, unsigned long *, long, unsigned int,
|
|
|
|
int *);
|
hugetlb: guarantee that COW faults for a process that called mmap(MAP_PRIVATE) on hugetlbfs will succeed
After patch 2 in this series, a process that successfully calls mmap() for
a MAP_PRIVATE mapping will be guaranteed to successfully fault until a
process calls fork(). At that point, the next write fault from the parent
could fail due to COW if the child still has a reference.
We only reserve pages for the parent but a copy must be made to avoid
leaking data from the parent to the child after fork(). Reserves could be
taken for both parent and child at fork time to guarantee faults but if
the mapping is large it is highly likely we will not have sufficient pages
for the reservation, and it is common to fork only to exec() immediatly
after. A failure here would be very undesirable.
Note that the current behaviour of mainline with MAP_PRIVATE pages is
pretty bad. The following situation is allowed to occur today.
1. Process calls mmap(MAP_PRIVATE)
2. Process calls mlock() to fault all pages and makes sure it succeeds
3. Process forks()
4. Process writes to MAP_PRIVATE mapping while child still exists
5. If the COW fails at this point, the process gets SIGKILLed even though it
had taken care to ensure the pages existed
This patch improves the situation by guaranteeing the reliability of the
process that successfully calls mmap(). When the parent performs COW, it
will try to satisfy the allocation without using reserves. If that fails
the parent will steal the page leaving any children without a page.
Faults from the child after that point will result in failure. If the
child COW happens first, an attempt will be made to allocate the page
without reserves and the child will get SIGKILLed on failure.
To summarise the new behaviour:
1. If the original mapper performs COW on a private mapping with multiple
references, it will attempt to allocate a hugepage from the pool or
the buddy allocator without using the existing reserves. On fail, VMAs
mapping the same area are traversed and the page being COW'd is unmapped
where found. It will then steal the original page as the last mapper in
the normal way.
2. The VMAs the pages were unmapped from are flagged to note that pages
with data no longer exist. Future no-page faults on those VMAs will
terminate the process as otherwise it would appear that data was corrupted.
A warning is printed to the console that this situation occured.
2. If the child performs COW first, it will attempt to satisfy the COW
from the pool if there are enough pages or via the buddy allocator if
overcommit is allowed and the buddy allocator can satisfy the request. If
it fails, the child will be killed.
If the pool is large enough, existing applications will not notice that
the reserves were a factor. Existing applications depending on the
no-reserves been set are unlikely to exist as for much of the history of
hugetlbfs, pages were prefaulted at mmap(), allocating the pages at that
point or failing the mmap().
[npiggin@suse.de: fix CONFIG_HUGETLB=n build]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Adam Litke <agl@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 12:27:25 +08:00
|
|
|
void unmap_hugepage_range(struct vm_area_struct *,
|
2012-08-01 07:42:03 +08:00
|
|
|
unsigned long, unsigned long, struct page *);
|
mm: hugetlbfs: close race during teardown of hugetlbfs shared page tables
If a process creates a large hugetlbfs mapping that is eligible for page
table sharing and forks heavily with children some of whom fault and
others which destroy the mapping then it is possible for page tables to
get corrupted. Some teardowns of the mapping encounter a "bad pmd" and
output a message to the kernel log. The final teardown will trigger a
BUG_ON in mm/filemap.c.
This was reproduced in 3.4 but is known to have existed for a long time
and goes back at least as far as 2.6.37. It was probably was introduced
in 2.6.20 by [39dde65c: shared page table for hugetlb page]. The messages
look like this;
[ ..........] Lots of bad pmd messages followed by this
[ 127.164256] mm/memory.c:391: bad pmd ffff880412e04fe8(80000003de4000e7).
[ 127.164257] mm/memory.c:391: bad pmd ffff880412e04ff0(80000003de6000e7).
[ 127.164258] mm/memory.c:391: bad pmd ffff880412e04ff8(80000003de0000e7).
[ 127.186778] ------------[ cut here ]------------
[ 127.186781] kernel BUG at mm/filemap.c:134!
[ 127.186782] invalid opcode: 0000 [#1] SMP
[ 127.186783] CPU 7
[ 127.186784] Modules linked in: af_packet cpufreq_conservative cpufreq_userspace cpufreq_powersave acpi_cpufreq mperf ext3 jbd dm_mod coretemp crc32c_intel usb_storage ghash_clmulni_intel aesni_intel i2c_i801 r8169 mii uas sr_mod cdrom sg iTCO_wdt iTCO_vendor_support shpchp serio_raw cryptd aes_x86_64 e1000e pci_hotplug dcdbas aes_generic container microcode ext4 mbcache jbd2 crc16 sd_mod crc_t10dif i915 drm_kms_helper drm i2c_algo_bit ehci_hcd ahci libahci usbcore rtc_cmos usb_common button i2c_core intel_agp video intel_gtt fan processor thermal thermal_sys hwmon ata_generic pata_atiixp libata scsi_mod
[ 127.186801]
[ 127.186802] Pid: 9017, comm: hugetlbfs-test Not tainted 3.4.0-autobuild #53 Dell Inc. OptiPlex 990/06D7TR
[ 127.186804] RIP: 0010:[<ffffffff810ed6ce>] [<ffffffff810ed6ce>] __delete_from_page_cache+0x15e/0x160
[ 127.186809] RSP: 0000:ffff8804144b5c08 EFLAGS: 00010002
[ 127.186810] RAX: 0000000000000001 RBX: ffffea000a5c9000 RCX: 00000000ffffffc0
[ 127.186811] RDX: 0000000000000000 RSI: 0000000000000009 RDI: ffff88042dfdad00
[ 127.186812] RBP: ffff8804144b5c18 R08: 0000000000000009 R09: 0000000000000003
[ 127.186813] R10: 0000000000000000 R11: 000000000000002d R12: ffff880412ff83d8
[ 127.186814] R13: ffff880412ff83d8 R14: 0000000000000000 R15: ffff880412ff83d8
[ 127.186815] FS: 00007fe18ed2c700(0000) GS:ffff88042dce0000(0000) knlGS:0000000000000000
[ 127.186816] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 127.186817] CR2: 00007fe340000503 CR3: 0000000417a14000 CR4: 00000000000407e0
[ 127.186818] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 127.186819] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 127.186820] Process hugetlbfs-test (pid: 9017, threadinfo ffff8804144b4000, task ffff880417f803c0)
[ 127.186821] Stack:
[ 127.186822] ffffea000a5c9000 0000000000000000 ffff8804144b5c48 ffffffff810ed83b
[ 127.186824] ffff8804144b5c48 000000000000138a 0000000000001387 ffff8804144b5c98
[ 127.186825] ffff8804144b5d48 ffffffff811bc925 ffff8804144b5cb8 0000000000000000
[ 127.186827] Call Trace:
[ 127.186829] [<ffffffff810ed83b>] delete_from_page_cache+0x3b/0x80
[ 127.186832] [<ffffffff811bc925>] truncate_hugepages+0x115/0x220
[ 127.186834] [<ffffffff811bca43>] hugetlbfs_evict_inode+0x13/0x30
[ 127.186837] [<ffffffff811655c7>] evict+0xa7/0x1b0
[ 127.186839] [<ffffffff811657a3>] iput_final+0xd3/0x1f0
[ 127.186840] [<ffffffff811658f9>] iput+0x39/0x50
[ 127.186842] [<ffffffff81162708>] d_kill+0xf8/0x130
[ 127.186843] [<ffffffff81162812>] dput+0xd2/0x1a0
[ 127.186845] [<ffffffff8114e2d0>] __fput+0x170/0x230
[ 127.186848] [<ffffffff81236e0e>] ? rb_erase+0xce/0x150
[ 127.186849] [<ffffffff8114e3ad>] fput+0x1d/0x30
[ 127.186851] [<ffffffff81117db7>] remove_vma+0x37/0x80
[ 127.186853] [<ffffffff81119182>] do_munmap+0x2d2/0x360
[ 127.186855] [<ffffffff811cc639>] sys_shmdt+0xc9/0x170
[ 127.186857] [<ffffffff81410a39>] system_call_fastpath+0x16/0x1b
[ 127.186858] Code: 0f 1f 44 00 00 48 8b 43 08 48 8b 00 48 8b 40 28 8b b0 40 03 00 00 85 f6 0f 88 df fe ff ff 48 89 df e8 e7 cb 05 00 e9 d2 fe ff ff <0f> 0b 55 83 e2 fd 48 89 e5 48 83 ec 30 48 89 5d d8 4c 89 65 e0
[ 127.186868] RIP [<ffffffff810ed6ce>] __delete_from_page_cache+0x15e/0x160
[ 127.186870] RSP <ffff8804144b5c08>
[ 127.186871] ---[ end trace 7cbac5d1db69f426 ]---
The bug is a race and not always easy to reproduce. To reproduce it I was
doing the following on a single socket I7-based machine with 16G of RAM.
$ hugeadm --pool-pages-max DEFAULT:13G
$ echo $((18*1048576*1024)) > /proc/sys/kernel/shmmax
$ echo $((18*1048576*1024)) > /proc/sys/kernel/shmall
$ for i in `seq 1 9000`; do ./hugetlbfs-test; done
On my particular machine, it usually triggers within 10 minutes but
enabling debug options can change the timing such that it never hits.
Once the bug is triggered, the machine is in trouble and needs to be
rebooted. The machine will respond but processes accessing proc like "ps
aux" will hang due to the BUG_ON. shutdown will also hang and needs a
hard reset or a sysrq-b.
The basic problem is a race between page table sharing and teardown. For
the most part page table sharing depends on i_mmap_mutex. In some cases,
it is also taking the mm->page_table_lock for the PTE updates but with
shared page tables, it is the i_mmap_mutex that is more important.
Unfortunately it appears to be also insufficient. Consider the following
situation
Process A Process B
--------- ---------
hugetlb_fault shmdt
LockWrite(mmap_sem)
do_munmap
unmap_region
unmap_vmas
unmap_single_vma
unmap_hugepage_range
Lock(i_mmap_mutex)
Lock(mm->page_table_lock)
huge_pmd_unshare/unmap tables <--- (1)
Unlock(mm->page_table_lock)
Unlock(i_mmap_mutex)
huge_pte_alloc ...
Lock(i_mmap_mutex) ...
vma_prio_walk, find svma, spte ...
Lock(mm->page_table_lock) ...
share spte ...
Unlock(mm->page_table_lock) ...
Unlock(i_mmap_mutex) ...
hugetlb_no_page <--- (2)
free_pgtables
unlink_file_vma
hugetlb_free_pgd_range
remove_vma_list
In this scenario, it is possible for Process A to share page tables with
Process B that is trying to tear them down. The i_mmap_mutex on its own
does not prevent Process A walking Process B's page tables. At (1) above,
the page tables are not shared yet so it unmaps the PMDs. Process A sets
up page table sharing and at (2) faults a new entry. Process B then trips
up on it in free_pgtables.
This patch fixes the problem by adding a new function
__unmap_hugepage_range_final that is only called when the VMA is about to
be destroyed. This function clears VM_MAYSHARE during
unmap_hugepage_range() under the i_mmap_mutex. This makes the VMA
ineligible for sharing and avoids the race. Superficially this looks like
it would then be vunerable to truncate and madvise issues but hugetlbfs
has its own truncate handlers so does not use unmap_mapping_range() and
does not support madvise(DONTNEED).
This should be treated as a -stable candidate if it is merged.
Test program is as follows. The test case was mostly written by Michal
Hocko with a few minor changes to reproduce this bug.
==== CUT HERE ====
static size_t huge_page_size = (2UL << 20);
static size_t nr_huge_page_A = 512;
static size_t nr_huge_page_B = 5632;
unsigned int get_random(unsigned int max)
{
struct timeval tv;
gettimeofday(&tv, NULL);
srandom(tv.tv_usec);
return random() % max;
}
static void play(void *addr, size_t size)
{
unsigned char *start = addr,
*end = start + size,
*a;
start += get_random(size/2);
/* we could itterate on huge pages but let's give it more time. */
for (a = start; a < end; a += 4096)
*a = 0;
}
int main(int argc, char **argv)
{
key_t key = IPC_PRIVATE;
size_t sizeA = nr_huge_page_A * huge_page_size;
size_t sizeB = nr_huge_page_B * huge_page_size;
int shmidA, shmidB;
void *addrA = NULL, *addrB = NULL;
int nr_children = 300, n = 0;
if ((shmidA = shmget(key, sizeA, IPC_CREAT|SHM_HUGETLB|0660)) == -1) {
perror("shmget:");
return 1;
}
if ((addrA = shmat(shmidA, addrA, SHM_R|SHM_W)) == (void *)-1UL) {
perror("shmat");
return 1;
}
if ((shmidB = shmget(key, sizeB, IPC_CREAT|SHM_HUGETLB|0660)) == -1) {
perror("shmget:");
return 1;
}
if ((addrB = shmat(shmidB, addrB, SHM_R|SHM_W)) == (void *)-1UL) {
perror("shmat");
return 1;
}
fork_child:
switch(fork()) {
case 0:
switch (n%3) {
case 0:
play(addrA, sizeA);
break;
case 1:
play(addrB, sizeB);
break;
case 2:
break;
}
break;
case -1:
perror("fork:");
break;
default:
if (++n < nr_children)
goto fork_child;
play(addrA, sizeA);
break;
}
shmdt(addrA);
shmdt(addrB);
do {
wait(NULL);
} while (--n > 0);
shmctl(shmidA, IPC_RMID, NULL);
shmctl(shmidB, IPC_RMID, NULL);
return 0;
}
[akpm@linux-foundation.org: name the declaration's args, fix CONFIG_HUGETLBFS=n build]
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-08-01 07:46:20 +08:00
|
|
|
void __unmap_hugepage_range_final(struct mmu_gather *tlb,
|
|
|
|
struct vm_area_struct *vma,
|
|
|
|
unsigned long start, unsigned long end,
|
|
|
|
struct page *ref_page);
|
2012-08-01 07:42:03 +08:00
|
|
|
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
|
|
|
|
unsigned long start, unsigned long end,
|
|
|
|
struct page *ref_page);
|
2008-10-16 03:50:22 +08:00
|
|
|
void hugetlb_report_meminfo(struct seq_file *);
|
2005-04-17 06:20:36 +08:00
|
|
|
int hugetlb_report_node_meminfo(int, char *);
|
2013-04-30 06:07:48 +08:00
|
|
|
void hugetlb_show_meminfo(void);
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned long hugetlb_total_pages(void);
|
2005-10-20 23:24:28 +08:00
|
|
|
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
2009-06-23 20:49:05 +08:00
|
|
|
unsigned long address, unsigned int flags);
|
2017-02-23 07:42:52 +08:00
|
|
|
int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
|
|
|
|
struct vm_area_struct *dst_vma,
|
|
|
|
unsigned long dst_addr,
|
|
|
|
unsigned long src_addr,
|
|
|
|
struct page **pagep);
|
2008-07-24 12:27:23 +08:00
|
|
|
int hugetlb_reserve_pages(struct inode *inode, long from, long to,
|
2009-02-10 22:02:27 +08:00
|
|
|
struct vm_area_struct *vma,
|
2011-05-26 18:16:19 +08:00
|
|
|
vm_flags_t vm_flags);
|
2015-09-09 06:01:41 +08:00
|
|
|
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
|
|
|
|
long freed);
|
mm: migrate: make core migration code aware of hugepage
Currently hugepage migration is available only for soft offlining, but
it's also useful for some other users of page migration (clearly because
users of hugepage can enjoy the benefit of mempolicy and memory hotplug.)
So this patchset tries to extend such users to support hugepage migration.
The target of this patchset is to enable hugepage migration for NUMA
related system calls (migrate_pages(2), move_pages(2), and mbind(2)), and
memory hotplug.
This patchset does not add hugepage migration for memory compaction,
because users of memory compaction mainly expect to construct thp by
arranging raw pages, and there's little or no need to compact hugepages.
CMA, another user of page migration, can have benefit from hugepage
migration, but is not enabled to support it for now (just because of lack
of testing and expertise in CMA.)
Hugepage migration of non pmd-based hugepage (for example 1GB hugepage in
x86_64, or hugepages in architectures like ia64) is not enabled for now
(again, because of lack of testing.)
As for how these are achived, I extended the API (migrate_pages()) to
handle hugepage (with patch 1 and 2) and adjusted code of each caller to
check and collect movable hugepages (with patch 3-7). Remaining 2 patches
are kind of miscellaneous ones to avoid unexpected behavior. Patch 8 is
about making sure that we only migrate pmd-based hugepages. And patch 9
is about choosing appropriate zone for hugepage allocation.
My test is mainly functional one, simply kicking hugepage migration via
each entry point and confirm that migration is done correctly. Test code
is available here:
git://github.com/Naoya-Horiguchi/test_hugepage_migration_extension.git
And I always run libhugetlbfs test when changing hugetlbfs's code. With
this patchset, no regression was found in the test.
This patch (of 9):
Before enabling each user of page migration to support hugepage,
this patch enables the list of pages for migration to link not only
LRU pages, but also hugepages. As a result, putback_movable_pages()
and migrate_pages() can handle both of LRU pages and hugepages.
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Acked-by: Hillf Danton <dhillf@gmail.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Rik van Riel <riel@redhat.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-12 05:21:59 +08:00
|
|
|
bool isolate_huge_page(struct page *page, struct list_head *list);
|
|
|
|
void putback_active_hugepage(struct page *page);
|
2014-07-31 07:08:39 +08:00
|
|
|
void free_huge_page(struct page *page);
|
2016-10-08 08:02:01 +08:00
|
|
|
void hugetlb_fix_reserve_counts(struct inode *inode);
|
2015-09-09 06:01:35 +08:00
|
|
|
extern struct mutex *hugetlb_fault_mutex_table;
|
|
|
|
u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
|
|
|
|
struct vm_area_struct *vma,
|
|
|
|
struct address_space *mapping,
|
|
|
|
pgoff_t idx, unsigned long address);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2013-04-23 19:35:02 +08:00
|
|
|
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
|
|
|
|
|
2015-02-11 06:11:36 +08:00
|
|
|
extern int hugepages_treat_as_movable;
|
2005-04-17 06:20:36 +08:00
|
|
|
extern int sysctl_hugetlb_shm_group;
|
2008-07-24 12:27:52 +08:00
|
|
|
extern struct list_head huge_boot_pages;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-06-22 08:14:44 +08:00
|
|
|
/* arch callbacks */
|
|
|
|
|
2008-07-24 12:27:41 +08:00
|
|
|
pte_t *huge_pte_alloc(struct mm_struct *mm,
|
|
|
|
unsigned long addr, unsigned long sz);
|
2017-07-07 06:39:42 +08:00
|
|
|
pte_t *huge_pte_offset(struct mm_struct *mm,
|
|
|
|
unsigned long addr, unsigned long sz);
|
2006-12-07 12:32:03 +08:00
|
|
|
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
|
2005-06-22 08:14:44 +08:00
|
|
|
struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
|
|
|
|
int write);
|
2017-07-07 06:38:56 +08:00
|
|
|
struct page *follow_huge_pd(struct vm_area_struct *vma,
|
|
|
|
unsigned long address, hugepd_t hpd,
|
|
|
|
int flags, int pdshift);
|
2005-06-22 08:14:44 +08:00
|
|
|
struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
|
mm/hugetlb: take page table lock in follow_huge_pmd()
We have a race condition between move_pages() and freeing hugepages, where
move_pages() calls follow_page(FOLL_GET) for hugepages internally and
tries to get its refcount without preventing concurrent freeing. This
race crashes the kernel, so this patch fixes it by moving FOLL_GET code
for hugepages into follow_huge_pmd() with taking the page table lock.
This patch intentionally removes page==NULL check after pte_page.
This is justified because pte_page() never returns NULL for any
architectures or configurations.
This patch changes the behavior of follow_huge_pmd() for tail pages and
then tail pages can be pinned/returned. So the caller must be changed to
properly handle the returned tail pages.
We could have a choice to add the similar locking to
follow_huge_(addr|pud) for consistency, but it's not necessary because
currently these functions don't support FOLL_GET flag, so let's leave it
for future development.
Here is the reproducer:
$ cat movepages.c
#include <stdio.h>
#include <stdlib.h>
#include <numaif.h>
#define ADDR_INPUT 0x700000000000UL
#define HPS 0x200000
#define PS 0x1000
int main(int argc, char *argv[]) {
int i;
int nr_hp = strtol(argv[1], NULL, 0);
int nr_p = nr_hp * HPS / PS;
int ret;
void **addrs;
int *status;
int *nodes;
pid_t pid;
pid = strtol(argv[2], NULL, 0);
addrs = malloc(sizeof(char *) * nr_p + 1);
status = malloc(sizeof(char *) * nr_p + 1);
nodes = malloc(sizeof(char *) * nr_p + 1);
while (1) {
for (i = 0; i < nr_p; i++) {
addrs[i] = (void *)ADDR_INPUT + i * PS;
nodes[i] = 1;
status[i] = 0;
}
ret = numa_move_pages(pid, nr_p, addrs, nodes, status,
MPOL_MF_MOVE_ALL);
if (ret == -1)
err("move_pages");
for (i = 0; i < nr_p; i++) {
addrs[i] = (void *)ADDR_INPUT + i * PS;
nodes[i] = 0;
status[i] = 0;
}
ret = numa_move_pages(pid, nr_p, addrs, nodes, status,
MPOL_MF_MOVE_ALL);
if (ret == -1)
err("move_pages");
}
return 0;
}
$ cat hugepage.c
#include <stdio.h>
#include <sys/mman.h>
#include <string.h>
#define ADDR_INPUT 0x700000000000UL
#define HPS 0x200000
int main(int argc, char *argv[]) {
int nr_hp = strtol(argv[1], NULL, 0);
char *p;
while (1) {
p = mmap((void *)ADDR_INPUT, nr_hp * HPS, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
if (p != (void *)ADDR_INPUT) {
perror("mmap");
break;
}
memset(p, 0, nr_hp * HPS);
munmap(p, nr_hp * HPS);
}
}
$ sysctl vm.nr_hugepages=40
$ ./hugepage 10 &
$ ./movepages 10 $(pgrep -f hugepage)
Fixes: e632a938d914 ("mm: migrate: add hugepage migration code to move_pages()")
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Reported-by: Hugh Dickins <hughd@google.com>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Steve Capper <steve.capper@linaro.org>
Cc: <stable@vger.kernel.org> [3.12+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 07:25:22 +08:00
|
|
|
pmd_t *pmd, int flags);
|
2008-07-24 12:27:50 +08:00
|
|
|
struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
|
mm/hugetlb: take page table lock in follow_huge_pmd()
We have a race condition between move_pages() and freeing hugepages, where
move_pages() calls follow_page(FOLL_GET) for hugepages internally and
tries to get its refcount without preventing concurrent freeing. This
race crashes the kernel, so this patch fixes it by moving FOLL_GET code
for hugepages into follow_huge_pmd() with taking the page table lock.
This patch intentionally removes page==NULL check after pte_page.
This is justified because pte_page() never returns NULL for any
architectures or configurations.
This patch changes the behavior of follow_huge_pmd() for tail pages and
then tail pages can be pinned/returned. So the caller must be changed to
properly handle the returned tail pages.
We could have a choice to add the similar locking to
follow_huge_(addr|pud) for consistency, but it's not necessary because
currently these functions don't support FOLL_GET flag, so let's leave it
for future development.
Here is the reproducer:
$ cat movepages.c
#include <stdio.h>
#include <stdlib.h>
#include <numaif.h>
#define ADDR_INPUT 0x700000000000UL
#define HPS 0x200000
#define PS 0x1000
int main(int argc, char *argv[]) {
int i;
int nr_hp = strtol(argv[1], NULL, 0);
int nr_p = nr_hp * HPS / PS;
int ret;
void **addrs;
int *status;
int *nodes;
pid_t pid;
pid = strtol(argv[2], NULL, 0);
addrs = malloc(sizeof(char *) * nr_p + 1);
status = malloc(sizeof(char *) * nr_p + 1);
nodes = malloc(sizeof(char *) * nr_p + 1);
while (1) {
for (i = 0; i < nr_p; i++) {
addrs[i] = (void *)ADDR_INPUT + i * PS;
nodes[i] = 1;
status[i] = 0;
}
ret = numa_move_pages(pid, nr_p, addrs, nodes, status,
MPOL_MF_MOVE_ALL);
if (ret == -1)
err("move_pages");
for (i = 0; i < nr_p; i++) {
addrs[i] = (void *)ADDR_INPUT + i * PS;
nodes[i] = 0;
status[i] = 0;
}
ret = numa_move_pages(pid, nr_p, addrs, nodes, status,
MPOL_MF_MOVE_ALL);
if (ret == -1)
err("move_pages");
}
return 0;
}
$ cat hugepage.c
#include <stdio.h>
#include <sys/mman.h>
#include <string.h>
#define ADDR_INPUT 0x700000000000UL
#define HPS 0x200000
int main(int argc, char *argv[]) {
int nr_hp = strtol(argv[1], NULL, 0);
char *p;
while (1) {
p = mmap((void *)ADDR_INPUT, nr_hp * HPS, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
if (p != (void *)ADDR_INPUT) {
perror("mmap");
break;
}
memset(p, 0, nr_hp * HPS);
munmap(p, nr_hp * HPS);
}
}
$ sysctl vm.nr_hugepages=40
$ ./hugepage 10 &
$ ./movepages 10 $(pgrep -f hugepage)
Fixes: e632a938d914 ("mm: migrate: add hugepage migration code to move_pages()")
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Reported-by: Hugh Dickins <hughd@google.com>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Steve Capper <steve.capper@linaro.org>
Cc: <stable@vger.kernel.org> [3.12+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 07:25:22 +08:00
|
|
|
pud_t *pud, int flags);
|
2017-07-07 06:38:50 +08:00
|
|
|
struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
|
|
|
|
pgd_t *pgd, int flags);
|
|
|
|
|
2005-06-22 08:14:44 +08:00
|
|
|
int pmd_huge(pmd_t pmd);
|
2017-03-09 22:24:07 +08:00
|
|
|
int pud_huge(pud_t pud);
|
2012-11-19 10:14:23 +08:00
|
|
|
unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
|
2006-03-22 16:08:50 +08:00
|
|
|
unsigned long address, unsigned long end, pgprot_t newprot);
|
2005-06-22 08:14:44 +08:00
|
|
|
|
2017-07-07 06:38:47 +08:00
|
|
|
bool is_hugetlb_entry_migration(pte_t pte);
|
2005-04-17 06:20:36 +08:00
|
|
|
#else /* !CONFIG_HUGETLB_PAGE */
|
|
|
|
|
2008-07-24 12:27:23 +08:00
|
|
|
static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
static inline unsigned long hugetlb_total_pages(void)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-02-23 07:43:13 +08:00
|
|
|
#define follow_hugetlb_page(m,v,p,vs,a,b,i,w,n) ({ BUG(); 0; })
|
2005-04-17 06:20:36 +08:00
|
|
|
#define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL)
|
|
|
|
#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
|
2008-10-16 03:50:22 +08:00
|
|
|
static inline void hugetlb_report_meminfo(struct seq_file *m)
|
|
|
|
{
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
#define hugetlb_report_node_meminfo(n, buf) 0
|
2013-04-30 06:07:48 +08:00
|
|
|
static inline void hugetlb_show_meminfo(void)
|
|
|
|
{
|
|
|
|
}
|
2017-07-07 06:38:56 +08:00
|
|
|
#define follow_huge_pd(vma, addr, hpd, flags, pdshift) NULL
|
mm/hugetlb: take page table lock in follow_huge_pmd()
We have a race condition between move_pages() and freeing hugepages, where
move_pages() calls follow_page(FOLL_GET) for hugepages internally and
tries to get its refcount without preventing concurrent freeing. This
race crashes the kernel, so this patch fixes it by moving FOLL_GET code
for hugepages into follow_huge_pmd() with taking the page table lock.
This patch intentionally removes page==NULL check after pte_page.
This is justified because pte_page() never returns NULL for any
architectures or configurations.
This patch changes the behavior of follow_huge_pmd() for tail pages and
then tail pages can be pinned/returned. So the caller must be changed to
properly handle the returned tail pages.
We could have a choice to add the similar locking to
follow_huge_(addr|pud) for consistency, but it's not necessary because
currently these functions don't support FOLL_GET flag, so let's leave it
for future development.
Here is the reproducer:
$ cat movepages.c
#include <stdio.h>
#include <stdlib.h>
#include <numaif.h>
#define ADDR_INPUT 0x700000000000UL
#define HPS 0x200000
#define PS 0x1000
int main(int argc, char *argv[]) {
int i;
int nr_hp = strtol(argv[1], NULL, 0);
int nr_p = nr_hp * HPS / PS;
int ret;
void **addrs;
int *status;
int *nodes;
pid_t pid;
pid = strtol(argv[2], NULL, 0);
addrs = malloc(sizeof(char *) * nr_p + 1);
status = malloc(sizeof(char *) * nr_p + 1);
nodes = malloc(sizeof(char *) * nr_p + 1);
while (1) {
for (i = 0; i < nr_p; i++) {
addrs[i] = (void *)ADDR_INPUT + i * PS;
nodes[i] = 1;
status[i] = 0;
}
ret = numa_move_pages(pid, nr_p, addrs, nodes, status,
MPOL_MF_MOVE_ALL);
if (ret == -1)
err("move_pages");
for (i = 0; i < nr_p; i++) {
addrs[i] = (void *)ADDR_INPUT + i * PS;
nodes[i] = 0;
status[i] = 0;
}
ret = numa_move_pages(pid, nr_p, addrs, nodes, status,
MPOL_MF_MOVE_ALL);
if (ret == -1)
err("move_pages");
}
return 0;
}
$ cat hugepage.c
#include <stdio.h>
#include <sys/mman.h>
#include <string.h>
#define ADDR_INPUT 0x700000000000UL
#define HPS 0x200000
int main(int argc, char *argv[]) {
int nr_hp = strtol(argv[1], NULL, 0);
char *p;
while (1) {
p = mmap((void *)ADDR_INPUT, nr_hp * HPS, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
if (p != (void *)ADDR_INPUT) {
perror("mmap");
break;
}
memset(p, 0, nr_hp * HPS);
munmap(p, nr_hp * HPS);
}
}
$ sysctl vm.nr_hugepages=40
$ ./hugepage 10 &
$ ./movepages 10 $(pgrep -f hugepage)
Fixes: e632a938d914 ("mm: migrate: add hugepage migration code to move_pages()")
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Reported-by: Hugh Dickins <hughd@google.com>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Steve Capper <steve.capper@linaro.org>
Cc: <stable@vger.kernel.org> [3.12+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 07:25:22 +08:00
|
|
|
#define follow_huge_pmd(mm, addr, pmd, flags) NULL
|
|
|
|
#define follow_huge_pud(mm, addr, pud, flags) NULL
|
2017-07-07 06:38:50 +08:00
|
|
|
#define follow_huge_pgd(mm, addr, pgd, flags) NULL
|
2008-07-24 12:27:41 +08:00
|
|
|
#define prepare_hugepage_range(file, addr, len) (-EINVAL)
|
2005-04-17 06:20:36 +08:00
|
|
|
#define pmd_huge(x) 0
|
2008-07-24 12:27:50 +08:00
|
|
|
#define pud_huge(x) 0
|
2005-04-17 06:20:36 +08:00
|
|
|
#define is_hugepage_only_range(mm, addr, len) 0
|
[PATCH] hugepage: Fix hugepage logic in free_pgtables()
free_pgtables() has special logic to call hugetlb_free_pgd_range() instead
of the normal free_pgd_range() on hugepage VMAs. However, the test it uses
to do so is incorrect: it calls is_hugepage_only_range on a hugepage sized
range at the start of the vma. is_hugepage_only_range() will return true
if the given range has any intersection with a hugepage address region, and
in this case the given region need not be hugepage aligned. So, for
example, this test can return true if called on, say, a 4k VMA immediately
preceding a (nicely aligned) hugepage VMA.
At present we get away with this because the powerpc version of
hugetlb_free_pgd_range() is just a call to free_pgd_range(). On ia64 (the
only other arch with a non-trivial is_hugepage_only_range()) we get away
with it for a different reason; the hugepage area is not contiguous with
the rest of the user address space, and VMAs are not permitted in between,
so the test can't return a false positive there.
Nonetheless this should be fixed. We do that in the patch below by
replacing the is_hugepage_only_range() test with an explicit test of the
VMA using is_vm_hugetlb_page().
This in turn changes behaviour for platforms where is_hugepage_only_range()
returns false always (everything except powerpc and ia64). We address this
by ensuring that hugetlb_free_pgd_range() is defined to be identical to
free_pgd_range() (instead of a no-op) on everything except ia64. Even so,
it will prevent some otherwise possible coalescing of calls down to
free_pgd_range(). Since this only happens for hugepage VMAs, removing this
small optimization seems unlikely to cause any trouble.
This patch causes no regressions on the libhugetlbfs testsuite - ppc64
POWER5 (8-way), ppc64 G5 (2-way) and i386 Pentium M (UP).
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-22 16:08:57 +08:00
|
|
|
#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
|
2009-06-23 20:49:05 +08:00
|
|
|
#define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; })
|
2017-02-23 07:42:52 +08:00
|
|
|
#define hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
|
|
|
|
src_addr, pagep) ({ BUG(); 0; })
|
2017-07-07 06:39:42 +08:00
|
|
|
#define huge_pte_offset(mm, address, sz) 0
|
2012-08-01 07:42:03 +08:00
|
|
|
|
2013-12-13 09:12:19 +08:00
|
|
|
static inline bool isolate_huge_page(struct page *page, struct list_head *list)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
mm: migrate: make core migration code aware of hugepage
Currently hugepage migration is available only for soft offlining, but
it's also useful for some other users of page migration (clearly because
users of hugepage can enjoy the benefit of mempolicy and memory hotplug.)
So this patchset tries to extend such users to support hugepage migration.
The target of this patchset is to enable hugepage migration for NUMA
related system calls (migrate_pages(2), move_pages(2), and mbind(2)), and
memory hotplug.
This patchset does not add hugepage migration for memory compaction,
because users of memory compaction mainly expect to construct thp by
arranging raw pages, and there's little or no need to compact hugepages.
CMA, another user of page migration, can have benefit from hugepage
migration, but is not enabled to support it for now (just because of lack
of testing and expertise in CMA.)
Hugepage migration of non pmd-based hugepage (for example 1GB hugepage in
x86_64, or hugepages in architectures like ia64) is not enabled for now
(again, because of lack of testing.)
As for how these are achived, I extended the API (migrate_pages()) to
handle hugepage (with patch 1 and 2) and adjusted code of each caller to
check and collect movable hugepages (with patch 3-7). Remaining 2 patches
are kind of miscellaneous ones to avoid unexpected behavior. Patch 8 is
about making sure that we only migrate pmd-based hugepages. And patch 9
is about choosing appropriate zone for hugepage allocation.
My test is mainly functional one, simply kicking hugepage migration via
each entry point and confirm that migration is done correctly. Test code
is available here:
git://github.com/Naoya-Horiguchi/test_hugepage_migration_extension.git
And I always run libhugetlbfs test when changing hugetlbfs's code. With
this patchset, no regression was found in the test.
This patch (of 9):
Before enabling each user of page migration to support hugepage,
this patch enables the list of pages for migration to link not only
LRU pages, but also hugepages. As a result, putback_movable_pages()
and migrate_pages() can handle both of LRU pages and hugepages.
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Acked-by: Hillf Danton <dhillf@gmail.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Rik van Riel <riel@redhat.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-09-12 05:21:59 +08:00
|
|
|
#define putback_active_hugepage(p) do {} while (0)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2012-11-19 10:14:23 +08:00
|
|
|
static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
|
|
|
|
unsigned long address, unsigned long end, pgprot_t newprot)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
2006-03-22 16:08:50 +08:00
|
|
|
|
mm: hugetlbfs: close race during teardown of hugetlbfs shared page tables
If a process creates a large hugetlbfs mapping that is eligible for page
table sharing and forks heavily with children some of whom fault and
others which destroy the mapping then it is possible for page tables to
get corrupted. Some teardowns of the mapping encounter a "bad pmd" and
output a message to the kernel log. The final teardown will trigger a
BUG_ON in mm/filemap.c.
This was reproduced in 3.4 but is known to have existed for a long time
and goes back at least as far as 2.6.37. It was probably was introduced
in 2.6.20 by [39dde65c: shared page table for hugetlb page]. The messages
look like this;
[ ..........] Lots of bad pmd messages followed by this
[ 127.164256] mm/memory.c:391: bad pmd ffff880412e04fe8(80000003de4000e7).
[ 127.164257] mm/memory.c:391: bad pmd ffff880412e04ff0(80000003de6000e7).
[ 127.164258] mm/memory.c:391: bad pmd ffff880412e04ff8(80000003de0000e7).
[ 127.186778] ------------[ cut here ]------------
[ 127.186781] kernel BUG at mm/filemap.c:134!
[ 127.186782] invalid opcode: 0000 [#1] SMP
[ 127.186783] CPU 7
[ 127.186784] Modules linked in: af_packet cpufreq_conservative cpufreq_userspace cpufreq_powersave acpi_cpufreq mperf ext3 jbd dm_mod coretemp crc32c_intel usb_storage ghash_clmulni_intel aesni_intel i2c_i801 r8169 mii uas sr_mod cdrom sg iTCO_wdt iTCO_vendor_support shpchp serio_raw cryptd aes_x86_64 e1000e pci_hotplug dcdbas aes_generic container microcode ext4 mbcache jbd2 crc16 sd_mod crc_t10dif i915 drm_kms_helper drm i2c_algo_bit ehci_hcd ahci libahci usbcore rtc_cmos usb_common button i2c_core intel_agp video intel_gtt fan processor thermal thermal_sys hwmon ata_generic pata_atiixp libata scsi_mod
[ 127.186801]
[ 127.186802] Pid: 9017, comm: hugetlbfs-test Not tainted 3.4.0-autobuild #53 Dell Inc. OptiPlex 990/06D7TR
[ 127.186804] RIP: 0010:[<ffffffff810ed6ce>] [<ffffffff810ed6ce>] __delete_from_page_cache+0x15e/0x160
[ 127.186809] RSP: 0000:ffff8804144b5c08 EFLAGS: 00010002
[ 127.186810] RAX: 0000000000000001 RBX: ffffea000a5c9000 RCX: 00000000ffffffc0
[ 127.186811] RDX: 0000000000000000 RSI: 0000000000000009 RDI: ffff88042dfdad00
[ 127.186812] RBP: ffff8804144b5c18 R08: 0000000000000009 R09: 0000000000000003
[ 127.186813] R10: 0000000000000000 R11: 000000000000002d R12: ffff880412ff83d8
[ 127.186814] R13: ffff880412ff83d8 R14: 0000000000000000 R15: ffff880412ff83d8
[ 127.186815] FS: 00007fe18ed2c700(0000) GS:ffff88042dce0000(0000) knlGS:0000000000000000
[ 127.186816] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 127.186817] CR2: 00007fe340000503 CR3: 0000000417a14000 CR4: 00000000000407e0
[ 127.186818] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 127.186819] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 127.186820] Process hugetlbfs-test (pid: 9017, threadinfo ffff8804144b4000, task ffff880417f803c0)
[ 127.186821] Stack:
[ 127.186822] ffffea000a5c9000 0000000000000000 ffff8804144b5c48 ffffffff810ed83b
[ 127.186824] ffff8804144b5c48 000000000000138a 0000000000001387 ffff8804144b5c98
[ 127.186825] ffff8804144b5d48 ffffffff811bc925 ffff8804144b5cb8 0000000000000000
[ 127.186827] Call Trace:
[ 127.186829] [<ffffffff810ed83b>] delete_from_page_cache+0x3b/0x80
[ 127.186832] [<ffffffff811bc925>] truncate_hugepages+0x115/0x220
[ 127.186834] [<ffffffff811bca43>] hugetlbfs_evict_inode+0x13/0x30
[ 127.186837] [<ffffffff811655c7>] evict+0xa7/0x1b0
[ 127.186839] [<ffffffff811657a3>] iput_final+0xd3/0x1f0
[ 127.186840] [<ffffffff811658f9>] iput+0x39/0x50
[ 127.186842] [<ffffffff81162708>] d_kill+0xf8/0x130
[ 127.186843] [<ffffffff81162812>] dput+0xd2/0x1a0
[ 127.186845] [<ffffffff8114e2d0>] __fput+0x170/0x230
[ 127.186848] [<ffffffff81236e0e>] ? rb_erase+0xce/0x150
[ 127.186849] [<ffffffff8114e3ad>] fput+0x1d/0x30
[ 127.186851] [<ffffffff81117db7>] remove_vma+0x37/0x80
[ 127.186853] [<ffffffff81119182>] do_munmap+0x2d2/0x360
[ 127.186855] [<ffffffff811cc639>] sys_shmdt+0xc9/0x170
[ 127.186857] [<ffffffff81410a39>] system_call_fastpath+0x16/0x1b
[ 127.186858] Code: 0f 1f 44 00 00 48 8b 43 08 48 8b 00 48 8b 40 28 8b b0 40 03 00 00 85 f6 0f 88 df fe ff ff 48 89 df e8 e7 cb 05 00 e9 d2 fe ff ff <0f> 0b 55 83 e2 fd 48 89 e5 48 83 ec 30 48 89 5d d8 4c 89 65 e0
[ 127.186868] RIP [<ffffffff810ed6ce>] __delete_from_page_cache+0x15e/0x160
[ 127.186870] RSP <ffff8804144b5c08>
[ 127.186871] ---[ end trace 7cbac5d1db69f426 ]---
The bug is a race and not always easy to reproduce. To reproduce it I was
doing the following on a single socket I7-based machine with 16G of RAM.
$ hugeadm --pool-pages-max DEFAULT:13G
$ echo $((18*1048576*1024)) > /proc/sys/kernel/shmmax
$ echo $((18*1048576*1024)) > /proc/sys/kernel/shmall
$ for i in `seq 1 9000`; do ./hugetlbfs-test; done
On my particular machine, it usually triggers within 10 minutes but
enabling debug options can change the timing such that it never hits.
Once the bug is triggered, the machine is in trouble and needs to be
rebooted. The machine will respond but processes accessing proc like "ps
aux" will hang due to the BUG_ON. shutdown will also hang and needs a
hard reset or a sysrq-b.
The basic problem is a race between page table sharing and teardown. For
the most part page table sharing depends on i_mmap_mutex. In some cases,
it is also taking the mm->page_table_lock for the PTE updates but with
shared page tables, it is the i_mmap_mutex that is more important.
Unfortunately it appears to be also insufficient. Consider the following
situation
Process A Process B
--------- ---------
hugetlb_fault shmdt
LockWrite(mmap_sem)
do_munmap
unmap_region
unmap_vmas
unmap_single_vma
unmap_hugepage_range
Lock(i_mmap_mutex)
Lock(mm->page_table_lock)
huge_pmd_unshare/unmap tables <--- (1)
Unlock(mm->page_table_lock)
Unlock(i_mmap_mutex)
huge_pte_alloc ...
Lock(i_mmap_mutex) ...
vma_prio_walk, find svma, spte ...
Lock(mm->page_table_lock) ...
share spte ...
Unlock(mm->page_table_lock) ...
Unlock(i_mmap_mutex) ...
hugetlb_no_page <--- (2)
free_pgtables
unlink_file_vma
hugetlb_free_pgd_range
remove_vma_list
In this scenario, it is possible for Process A to share page tables with
Process B that is trying to tear them down. The i_mmap_mutex on its own
does not prevent Process A walking Process B's page tables. At (1) above,
the page tables are not shared yet so it unmaps the PMDs. Process A sets
up page table sharing and at (2) faults a new entry. Process B then trips
up on it in free_pgtables.
This patch fixes the problem by adding a new function
__unmap_hugepage_range_final that is only called when the VMA is about to
be destroyed. This function clears VM_MAYSHARE during
unmap_hugepage_range() under the i_mmap_mutex. This makes the VMA
ineligible for sharing and avoids the race. Superficially this looks like
it would then be vunerable to truncate and madvise issues but hugetlbfs
has its own truncate handlers so does not use unmap_mapping_range() and
does not support madvise(DONTNEED).
This should be treated as a -stable candidate if it is merged.
Test program is as follows. The test case was mostly written by Michal
Hocko with a few minor changes to reproduce this bug.
==== CUT HERE ====
static size_t huge_page_size = (2UL << 20);
static size_t nr_huge_page_A = 512;
static size_t nr_huge_page_B = 5632;
unsigned int get_random(unsigned int max)
{
struct timeval tv;
gettimeofday(&tv, NULL);
srandom(tv.tv_usec);
return random() % max;
}
static void play(void *addr, size_t size)
{
unsigned char *start = addr,
*end = start + size,
*a;
start += get_random(size/2);
/* we could itterate on huge pages but let's give it more time. */
for (a = start; a < end; a += 4096)
*a = 0;
}
int main(int argc, char **argv)
{
key_t key = IPC_PRIVATE;
size_t sizeA = nr_huge_page_A * huge_page_size;
size_t sizeB = nr_huge_page_B * huge_page_size;
int shmidA, shmidB;
void *addrA = NULL, *addrB = NULL;
int nr_children = 300, n = 0;
if ((shmidA = shmget(key, sizeA, IPC_CREAT|SHM_HUGETLB|0660)) == -1) {
perror("shmget:");
return 1;
}
if ((addrA = shmat(shmidA, addrA, SHM_R|SHM_W)) == (void *)-1UL) {
perror("shmat");
return 1;
}
if ((shmidB = shmget(key, sizeB, IPC_CREAT|SHM_HUGETLB|0660)) == -1) {
perror("shmget:");
return 1;
}
if ((addrB = shmat(shmidB, addrB, SHM_R|SHM_W)) == (void *)-1UL) {
perror("shmat");
return 1;
}
fork_child:
switch(fork()) {
case 0:
switch (n%3) {
case 0:
play(addrA, sizeA);
break;
case 1:
play(addrB, sizeB);
break;
case 2:
break;
}
break;
case -1:
perror("fork:");
break;
default:
if (++n < nr_children)
goto fork_child;
play(addrA, sizeA);
break;
}
shmdt(addrA);
shmdt(addrB);
do {
wait(NULL);
} while (--n > 0);
shmctl(shmidA, IPC_RMID, NULL);
shmctl(shmidB, IPC_RMID, NULL);
return 0;
}
[akpm@linux-foundation.org: name the declaration's args, fix CONFIG_HUGETLBFS=n build]
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-08-01 07:46:20 +08:00
|
|
|
static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
|
|
|
|
struct vm_area_struct *vma, unsigned long start,
|
|
|
|
unsigned long end, struct page *ref_page)
|
|
|
|
{
|
|
|
|
BUG();
|
|
|
|
}
|
|
|
|
|
2012-08-01 07:42:03 +08:00
|
|
|
static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
|
|
|
|
struct vm_area_struct *vma, unsigned long start,
|
|
|
|
unsigned long end, struct page *ref_page)
|
|
|
|
{
|
|
|
|
BUG();
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif /* !CONFIG_HUGETLB_PAGE */
|
2014-11-06 00:27:40 +08:00
|
|
|
/*
|
|
|
|
* hugepages at page global directory. If arch support
|
|
|
|
* hugepages at pgd level, they need to define this.
|
|
|
|
*/
|
|
|
|
#ifndef pgd_huge
|
|
|
|
#define pgd_huge(x) 0
|
|
|
|
#endif
|
2017-03-09 22:24:07 +08:00
|
|
|
#ifndef p4d_huge
|
|
|
|
#define p4d_huge(x) 0
|
|
|
|
#endif
|
2014-11-06 00:27:40 +08:00
|
|
|
|
|
|
|
#ifndef pgd_write
|
|
|
|
static inline int pgd_write(pgd_t pgd)
|
|
|
|
{
|
|
|
|
BUG();
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef pud_write
|
|
|
|
static inline int pud_write(pud_t pud)
|
|
|
|
{
|
|
|
|
BUG();
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2009-09-22 08:03:47 +08:00
|
|
|
#define HUGETLB_ANON_FILE "anon_hugepage"
|
|
|
|
|
2009-09-22 08:03:43 +08:00
|
|
|
enum {
|
|
|
|
/*
|
|
|
|
* The file will be used as an shm file so shmfs accounting rules
|
|
|
|
* apply
|
|
|
|
*/
|
|
|
|
HUGETLB_SHMFS_INODE = 1,
|
2009-09-22 08:03:47 +08:00
|
|
|
/*
|
|
|
|
* The file is being created on the internal vfs mount and shmfs
|
|
|
|
* accounting rules do not apply
|
|
|
|
*/
|
|
|
|
HUGETLB_ANONHUGE_INODE = 2,
|
2009-09-22 08:03:43 +08:00
|
|
|
};
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifdef CONFIG_HUGETLBFS
|
|
|
|
struct hugetlbfs_sb_info {
|
|
|
|
long max_inodes; /* inodes allowed */
|
|
|
|
long free_inodes; /* inodes free */
|
|
|
|
spinlock_t stat_lock;
|
2008-07-24 12:27:43 +08:00
|
|
|
struct hstate *hstate;
|
hugepages: fix use after free bug in "quota" handling
hugetlbfs_{get,put}_quota() are badly named. They don't interact with the
general quota handling code, and they don't much resemble its behaviour.
Rather than being about maintaining limits on on-disk block usage by
particular users, they are instead about maintaining limits on in-memory
page usage (including anonymous MAP_PRIVATE copied-on-write pages)
associated with a particular hugetlbfs filesystem instance.
Worse, they work by having callbacks to the hugetlbfs filesystem code from
the low-level page handling code, in particular from free_huge_page().
This is a layering violation of itself, but more importantly, if the
kernel does a get_user_pages() on hugepages (which can happen from KVM
amongst others), then the free_huge_page() can be delayed until after the
associated inode has already been freed. If an unmount occurs at the
wrong time, even the hugetlbfs superblock where the "quota" limits are
stored may have been freed.
Andrew Barry proposed a patch to fix this by having hugepages, instead of
storing a pointer to their address_space and reaching the superblock from
there, had the hugepages store pointers directly to the superblock,
bumping the reference count as appropriate to avoid it being freed.
Andrew Morton rejected that version, however, on the grounds that it made
the existing layering violation worse.
This is a reworked version of Andrew's patch, which removes the extra, and
some of the existing, layering violation. It works by introducing the
concept of a hugepage "subpool" at the lower hugepage mm layer - that is a
finite logical pool of hugepages to allocate from. hugetlbfs now creates
a subpool for each filesystem instance with a page limit set, and a
pointer to the subpool gets added to each allocated hugepage, instead of
the address_space pointer used now. The subpool has its own lifetime and
is only freed once all pages in it _and_ all other references to it (i.e.
superblocks) are gone.
subpools are optional - a NULL subpool pointer is taken by the code to
mean that no subpool limits are in effect.
Previous discussion of this bug found in: "Fix refcounting in hugetlbfs
quota handling.". See: https://lkml.org/lkml/2011/8/11/28 or
http://marc.info/?l=linux-mm&m=126928970510627&w=1
v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter to
alloc_huge_page() - since it already takes the vma, it is not necessary.
Signed-off-by: Andrew Barry <abarry@cray.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 07:34:12 +08:00
|
|
|
struct hugepage_subpool *spool;
|
2017-07-05 23:24:18 +08:00
|
|
|
kuid_t uid;
|
|
|
|
kgid_t gid;
|
|
|
|
umode_t mode;
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
|
|
|
|
{
|
|
|
|
return sb->s_fs_info;
|
|
|
|
}
|
|
|
|
|
2006-03-28 17:56:42 +08:00
|
|
|
extern const struct file_operations hugetlbfs_file_operations;
|
2009-09-28 02:29:37 +08:00
|
|
|
extern const struct vm_operations_struct hugetlb_vm_ops;
|
2013-05-08 07:18:13 +08:00
|
|
|
struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
|
2012-12-12 08:01:34 +08:00
|
|
|
struct user_struct **user, int creat_flags,
|
|
|
|
int page_size_log);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2016-01-15 07:18:51 +08:00
|
|
|
static inline bool is_file_hugepages(struct file *file)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2007-03-02 07:46:08 +08:00
|
|
|
if (file->f_op == &hugetlbfs_file_operations)
|
2016-01-15 07:18:51 +08:00
|
|
|
return true;
|
2007-03-02 07:46:08 +08:00
|
|
|
|
2016-01-15 07:18:51 +08:00
|
|
|
return is_file_shm_hugepages(file);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2012-12-12 08:01:34 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#else /* !CONFIG_HUGETLBFS */
|
|
|
|
|
2016-01-15 07:18:51 +08:00
|
|
|
#define is_file_hugepages(file) false
|
2012-03-22 07:34:14 +08:00
|
|
|
static inline struct file *
|
2013-05-08 07:18:13 +08:00
|
|
|
hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag,
|
|
|
|
struct user_struct **user, int creat_flags,
|
2012-12-12 08:01:34 +08:00
|
|
|
int page_size_log)
|
2009-09-25 05:47:45 +08:00
|
|
|
{
|
|
|
|
return ERR_PTR(-ENOSYS);
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#endif /* !CONFIG_HUGETLBFS */
|
|
|
|
|
2007-05-07 05:49:00 +08:00
|
|
|
#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
|
|
|
|
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long pgoff,
|
|
|
|
unsigned long flags);
|
|
|
|
#endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */
|
|
|
|
|
2008-07-24 12:27:41 +08:00
|
|
|
#ifdef CONFIG_HUGETLB_PAGE
|
|
|
|
|
2008-07-24 12:27:44 +08:00
|
|
|
#define HSTATE_NAME_LEN 32
|
2008-07-24 12:27:41 +08:00
|
|
|
/* Defines one hugetlb page size */
|
|
|
|
struct hstate {
|
2009-09-22 08:01:22 +08:00
|
|
|
int next_nid_to_alloc;
|
|
|
|
int next_nid_to_free;
|
2008-07-24 12:27:41 +08:00
|
|
|
unsigned int order;
|
|
|
|
unsigned long mask;
|
|
|
|
unsigned long max_huge_pages;
|
|
|
|
unsigned long nr_huge_pages;
|
|
|
|
unsigned long free_huge_pages;
|
|
|
|
unsigned long resv_huge_pages;
|
|
|
|
unsigned long surplus_huge_pages;
|
|
|
|
unsigned long nr_overcommit_huge_pages;
|
2012-08-01 07:42:07 +08:00
|
|
|
struct list_head hugepage_activelist;
|
2008-07-24 12:27:41 +08:00
|
|
|
struct list_head hugepage_freelists[MAX_NUMNODES];
|
|
|
|
unsigned int nr_huge_pages_node[MAX_NUMNODES];
|
|
|
|
unsigned int free_huge_pages_node[MAX_NUMNODES];
|
|
|
|
unsigned int surplus_huge_pages_node[MAX_NUMNODES];
|
2012-08-01 07:42:24 +08:00
|
|
|
#ifdef CONFIG_CGROUP_HUGETLB
|
|
|
|
/* cgroup control files */
|
|
|
|
struct cftype cgroup_files[5];
|
|
|
|
#endif
|
2008-07-24 12:27:44 +08:00
|
|
|
char name[HSTATE_NAME_LEN];
|
2008-07-24 12:27:41 +08:00
|
|
|
};
|
|
|
|
|
2008-07-24 12:27:52 +08:00
|
|
|
struct huge_bootmem_page {
|
|
|
|
struct list_head list;
|
|
|
|
struct hstate *hstate;
|
2011-07-26 08:11:50 +08:00
|
|
|
#ifdef CONFIG_HIGHMEM
|
|
|
|
phys_addr_t phys;
|
|
|
|
#endif
|
2008-07-24 12:27:52 +08:00
|
|
|
};
|
|
|
|
|
2015-09-09 06:01:54 +08:00
|
|
|
struct page *alloc_huge_page(struct vm_area_struct *vma,
|
|
|
|
unsigned long addr, int avoid_reserve);
|
2010-09-08 09:19:33 +08:00
|
|
|
struct page *alloc_huge_page_node(struct hstate *h, int nid);
|
2013-09-12 05:22:06 +08:00
|
|
|
struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
|
|
|
|
unsigned long addr, int avoid_reserve);
|
2017-07-11 06:49:11 +08:00
|
|
|
struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
|
|
|
|
nodemask_t *nmask);
|
2015-09-09 06:01:50 +08:00
|
|
|
int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
|
|
|
|
pgoff_t idx);
|
2010-09-08 09:19:33 +08:00
|
|
|
|
2008-07-24 12:27:52 +08:00
|
|
|
/* arch callback */
|
2017-07-28 13:01:25 +08:00
|
|
|
int __init __alloc_bootmem_huge_page(struct hstate *h);
|
2008-07-24 12:27:52 +08:00
|
|
|
int __init alloc_bootmem_huge_page(struct hstate *h);
|
|
|
|
|
2016-05-20 08:11:04 +08:00
|
|
|
void __init hugetlb_bad_size(void);
|
2008-07-24 12:27:42 +08:00
|
|
|
void __init hugetlb_add_hstate(unsigned order);
|
|
|
|
struct hstate *size_to_hstate(unsigned long size);
|
|
|
|
|
|
|
|
#ifndef HUGE_MAX_HSTATE
|
|
|
|
#define HUGE_MAX_HSTATE 1
|
|
|
|
#endif
|
|
|
|
|
|
|
|
extern struct hstate hstates[HUGE_MAX_HSTATE];
|
|
|
|
extern unsigned int default_hstate_idx;
|
|
|
|
|
|
|
|
#define default_hstate (hstates[default_hstate_idx])
|
2008-07-24 12:27:41 +08:00
|
|
|
|
2008-07-24 12:27:43 +08:00
|
|
|
static inline struct hstate *hstate_inode(struct inode *i)
|
2008-07-24 12:27:41 +08:00
|
|
|
{
|
2016-05-21 07:57:59 +08:00
|
|
|
return HUGETLBFS_SB(i->i_sb)->hstate;
|
2008-07-24 12:27:41 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline struct hstate *hstate_file(struct file *f)
|
|
|
|
{
|
2013-01-24 06:07:38 +08:00
|
|
|
return hstate_inode(file_inode(f));
|
2008-07-24 12:27:41 +08:00
|
|
|
}
|
|
|
|
|
2013-05-08 07:18:13 +08:00
|
|
|
static inline struct hstate *hstate_sizelog(int page_size_log)
|
|
|
|
{
|
|
|
|
if (!page_size_log)
|
|
|
|
return &default_hstate;
|
2014-12-11 07:44:13 +08:00
|
|
|
|
|
|
|
return size_to_hstate(1UL << page_size_log);
|
2013-05-08 07:18:13 +08:00
|
|
|
}
|
|
|
|
|
2008-07-24 12:27:43 +08:00
|
|
|
static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
|
2008-07-24 12:27:41 +08:00
|
|
|
{
|
2008-07-24 12:27:43 +08:00
|
|
|
return hstate_file(vma->vm_file);
|
2008-07-24 12:27:41 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned long huge_page_size(struct hstate *h)
|
|
|
|
{
|
|
|
|
return (unsigned long)PAGE_SIZE << h->order;
|
|
|
|
}
|
|
|
|
|
2009-01-07 06:38:53 +08:00
|
|
|
extern unsigned long vma_kernel_pagesize(struct vm_area_struct *vma);
|
|
|
|
|
2009-01-07 06:38:54 +08:00
|
|
|
extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma);
|
|
|
|
|
2008-07-24 12:27:41 +08:00
|
|
|
static inline unsigned long huge_page_mask(struct hstate *h)
|
|
|
|
{
|
|
|
|
return h->mask;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned int huge_page_order(struct hstate *h)
|
|
|
|
{
|
|
|
|
return h->order;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned huge_page_shift(struct hstate *h)
|
|
|
|
{
|
|
|
|
return h->order + PAGE_SHIFT;
|
|
|
|
}
|
|
|
|
|
2014-06-05 07:07:08 +08:00
|
|
|
static inline bool hstate_is_gigantic(struct hstate *h)
|
|
|
|
{
|
|
|
|
return huge_page_order(h) >= MAX_ORDER;
|
|
|
|
}
|
|
|
|
|
2008-07-24 12:27:41 +08:00
|
|
|
static inline unsigned int pages_per_huge_page(struct hstate *h)
|
|
|
|
{
|
|
|
|
return 1 << h->order;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned int blocks_per_huge_page(struct hstate *h)
|
|
|
|
{
|
|
|
|
return huge_page_size(h) / 512;
|
|
|
|
}
|
|
|
|
|
|
|
|
#include <asm/hugetlb.h>
|
|
|
|
|
2012-04-02 02:01:34 +08:00
|
|
|
#ifndef arch_make_huge_pte
|
|
|
|
static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
|
|
|
|
struct page *page, int writable)
|
|
|
|
{
|
|
|
|
return entry;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2008-07-24 12:27:42 +08:00
|
|
|
static inline struct hstate *page_hstate(struct page *page)
|
|
|
|
{
|
2014-01-24 07:52:54 +08:00
|
|
|
VM_BUG_ON_PAGE(!PageHuge(page), page);
|
2008-07-24 12:27:42 +08:00
|
|
|
return size_to_hstate(PAGE_SIZE << compound_order(page));
|
|
|
|
}
|
|
|
|
|
2010-10-07 03:45:00 +08:00
|
|
|
static inline unsigned hstate_index_to_shift(unsigned index)
|
|
|
|
{
|
|
|
|
return hstates[index].order + PAGE_SHIFT;
|
|
|
|
}
|
|
|
|
|
2012-08-01 07:42:00 +08:00
|
|
|
static inline int hstate_index(struct hstate *h)
|
|
|
|
{
|
|
|
|
return h - hstates;
|
|
|
|
}
|
|
|
|
|
2013-06-25 21:19:31 +08:00
|
|
|
pgoff_t __basepage_index(struct page *page);
|
|
|
|
|
|
|
|
/* Return page->index in PAGE_SIZE units */
|
|
|
|
static inline pgoff_t basepage_index(struct page *page)
|
|
|
|
{
|
|
|
|
if (!PageCompound(page))
|
|
|
|
return page->index;
|
|
|
|
|
|
|
|
return __basepage_index(page);
|
|
|
|
}
|
|
|
|
|
2017-07-11 06:47:41 +08:00
|
|
|
extern int dissolve_free_huge_page(struct page *page);
|
2016-10-08 08:01:10 +08:00
|
|
|
extern int dissolve_free_huge_pages(unsigned long start_pfn,
|
|
|
|
unsigned long end_pfn);
|
2016-05-21 07:58:01 +08:00
|
|
|
static inline bool hugepage_migration_supported(struct hstate *h)
|
2013-09-12 05:22:11 +08:00
|
|
|
{
|
2014-06-05 07:05:35 +08:00
|
|
|
#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
|
2017-07-07 06:38:38 +08:00
|
|
|
if ((huge_page_shift(h) == PMD_SHIFT) ||
|
|
|
|
(huge_page_shift(h) == PGDIR_SHIFT))
|
|
|
|
return true;
|
|
|
|
else
|
|
|
|
return false;
|
2014-06-05 07:05:35 +08:00
|
|
|
#else
|
2016-05-21 07:58:01 +08:00
|
|
|
return false;
|
2014-06-05 07:05:35 +08:00
|
|
|
#endif
|
2013-09-12 05:22:11 +08:00
|
|
|
}
|
2013-09-12 05:22:09 +08:00
|
|
|
|
2013-11-15 06:31:02 +08:00
|
|
|
static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
|
|
|
|
struct mm_struct *mm, pte_t *pte)
|
|
|
|
{
|
|
|
|
if (huge_page_size(h) == PMD_SIZE)
|
|
|
|
return pmd_lockptr(mm, (pmd_t *) pte);
|
|
|
|
VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
|
|
|
|
return &mm->page_table_lock;
|
|
|
|
}
|
|
|
|
|
2015-07-18 07:23:37 +08:00
|
|
|
#ifndef hugepages_supported
|
|
|
|
/*
|
|
|
|
* Some platform decide whether they support huge pages at boot
|
|
|
|
* time. Some of them, such as powerpc, set HPAGE_SHIFT to 0
|
|
|
|
* when there is no such support
|
|
|
|
*/
|
|
|
|
#define hugepages_supported() (HPAGE_SHIFT != 0)
|
|
|
|
#endif
|
hugetlb: ensure hugepage access is denied if hugepages are not supported
Currently, I am seeing the following when I `mount -t hugetlbfs /none
/dev/hugetlbfs`, and then simply do a `ls /dev/hugetlbfs`. I think it's
related to the fact that hugetlbfs is properly not correctly setting
itself up in this state?:
Unable to handle kernel paging request for data at address 0x00000031
Faulting instruction address: 0xc000000000245710
Oops: Kernel access of bad area, sig: 11 [#1]
SMP NR_CPUS=2048 NUMA pSeries
....
In KVM guests on Power, in a guest not backed by hugepages, we see the
following:
AnonHugePages: 0 kB
HugePages_Total: 0
HugePages_Free: 0
HugePages_Rsvd: 0
HugePages_Surp: 0
Hugepagesize: 64 kB
HPAGE_SHIFT == 0 in this configuration, which indicates that hugepages
are not supported at boot-time, but this is only checked in
hugetlb_init(). Extract the check to a helper function, and use it in a
few relevant places.
This does make hugetlbfs not supported (not registered at all) in this
environment. I believe this is fine, as there are no valid hugepages
and that won't change at runtime.
[akpm@linux-foundation.org: use pr_info(), per Mel]
[akpm@linux-foundation.org: fix build when HPAGE_SHIFT is undefined]
Signed-off-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-05-07 03:50:00 +08:00
|
|
|
|
2015-11-06 10:47:14 +08:00
|
|
|
void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm);
|
|
|
|
|
|
|
|
static inline void hugetlb_count_add(long l, struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
atomic_long_add(l, &mm->hugetlb_usage);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
atomic_long_sub(l, &mm->hugetlb_usage);
|
|
|
|
}
|
2017-07-07 06:39:50 +08:00
|
|
|
|
|
|
|
#ifndef set_huge_swap_pte_at
|
|
|
|
static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
|
|
|
|
pte_t *ptep, pte_t pte, unsigned long sz)
|
|
|
|
{
|
|
|
|
set_huge_pte_at(mm, addr, ptep, pte);
|
|
|
|
}
|
|
|
|
#endif
|
2013-05-08 07:18:13 +08:00
|
|
|
#else /* CONFIG_HUGETLB_PAGE */
|
2008-07-24 12:27:41 +08:00
|
|
|
struct hstate {};
|
2015-09-09 06:01:54 +08:00
|
|
|
#define alloc_huge_page(v, a, r) NULL
|
2010-09-08 09:19:33 +08:00
|
|
|
#define alloc_huge_page_node(h, nid) NULL
|
2017-07-11 06:49:11 +08:00
|
|
|
#define alloc_huge_page_nodemask(h, preferred_nid, nmask) NULL
|
2013-09-12 05:22:06 +08:00
|
|
|
#define alloc_huge_page_noerr(v, a, r) NULL
|
2008-07-24 12:27:52 +08:00
|
|
|
#define alloc_bootmem_huge_page(h) NULL
|
2008-07-24 12:27:41 +08:00
|
|
|
#define hstate_file(f) NULL
|
2013-05-08 07:18:13 +08:00
|
|
|
#define hstate_sizelog(s) NULL
|
2008-07-24 12:27:41 +08:00
|
|
|
#define hstate_vma(v) NULL
|
|
|
|
#define hstate_inode(i) NULL
|
2013-11-15 06:31:02 +08:00
|
|
|
#define page_hstate(page) NULL
|
2008-07-24 12:27:41 +08:00
|
|
|
#define huge_page_size(h) PAGE_SIZE
|
|
|
|
#define huge_page_mask(h) PAGE_MASK
|
2009-01-07 06:38:53 +08:00
|
|
|
#define vma_kernel_pagesize(v) PAGE_SIZE
|
2009-01-07 06:38:54 +08:00
|
|
|
#define vma_mmu_pagesize(v) PAGE_SIZE
|
2008-07-24 12:27:41 +08:00
|
|
|
#define huge_page_order(h) 0
|
|
|
|
#define huge_page_shift(h) PAGE_SHIFT
|
2017-07-07 06:38:38 +08:00
|
|
|
static inline bool hstate_is_gigantic(struct hstate *h)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2008-07-27 06:22:27 +08:00
|
|
|
static inline unsigned int pages_per_huge_page(struct hstate *h)
|
|
|
|
{
|
|
|
|
return 1;
|
|
|
|
}
|
2017-07-11 06:47:41 +08:00
|
|
|
|
|
|
|
static inline unsigned hstate_index_to_shift(unsigned index)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int hstate_index(struct hstate *h)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
2013-06-25 21:19:31 +08:00
|
|
|
|
|
|
|
static inline pgoff_t basepage_index(struct page *page)
|
|
|
|
{
|
|
|
|
return page->index;
|
|
|
|
}
|
2017-07-11 06:47:41 +08:00
|
|
|
|
|
|
|
static inline int dissolve_free_huge_page(struct page *page)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int dissolve_free_huge_pages(unsigned long start_pfn,
|
|
|
|
unsigned long end_pfn)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool hugepage_migration_supported(struct hstate *h)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
2013-11-15 06:31:02 +08:00
|
|
|
|
|
|
|
static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
|
|
|
|
struct mm_struct *mm, pte_t *pte)
|
|
|
|
{
|
|
|
|
return &mm->page_table_lock;
|
|
|
|
}
|
2015-11-06 10:47:14 +08:00
|
|
|
|
|
|
|
static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
}
|
2017-07-07 06:39:50 +08:00
|
|
|
|
|
|
|
static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
|
|
|
|
pte_t *ptep, pte_t pte, unsigned long sz)
|
|
|
|
{
|
|
|
|
}
|
2013-05-08 07:18:13 +08:00
|
|
|
#endif /* CONFIG_HUGETLB_PAGE */
|
2008-07-24 12:27:41 +08:00
|
|
|
|
2013-11-15 06:31:02 +08:00
|
|
|
static inline spinlock_t *huge_pte_lock(struct hstate *h,
|
|
|
|
struct mm_struct *mm, pte_t *pte)
|
|
|
|
{
|
|
|
|
spinlock_t *ptl;
|
|
|
|
|
|
|
|
ptl = huge_pte_lockptr(h, mm, pte);
|
|
|
|
spin_lock(ptl);
|
|
|
|
return ptl;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif /* _LINUX_HUGETLB_H */
|