2005-04-17 06:20:36 +08:00
|
|
|
#ifndef _LINUX_HUGETLB_H
|
|
|
|
#define _LINUX_HUGETLB_H
|
|
|
|
|
2007-07-30 06:36:13 +08:00
|
|
|
#include <linux/fs.h>
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifdef CONFIG_HUGETLB_PAGE
|
|
|
|
|
|
|
|
#include <linux/mempolicy.h>
|
2007-03-02 07:46:08 +08:00
|
|
|
#include <linux/shm.h>
|
2005-06-22 08:14:44 +08:00
|
|
|
#include <asm/tlbflush.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
struct ctl_table;
|
|
|
|
|
|
|
|
static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
return vma->vm_flags & VM_HUGETLB;
|
|
|
|
}
|
|
|
|
|
|
|
|
int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
|
2008-02-08 20:18:18 +08:00
|
|
|
int hugetlb_overcommit_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
|
2007-07-17 19:03:13 +08:00
|
|
|
int hugetlb_treat_movable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
|
2005-04-17 06:20:36 +08:00
|
|
|
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
|
2007-11-15 08:59:33 +08:00
|
|
|
int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int, int);
|
2005-04-17 06:20:36 +08:00
|
|
|
void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
|
2006-10-11 16:20:46 +08:00
|
|
|
void __unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
|
2005-04-17 06:20:36 +08:00
|
|
|
int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
|
|
|
|
int hugetlb_report_meminfo(char *);
|
|
|
|
int hugetlb_report_node_meminfo(int, char *);
|
|
|
|
unsigned long hugetlb_total_pages(void);
|
2005-10-20 23:24:28 +08:00
|
|
|
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
|
|
|
unsigned long address, int write_access);
|
2006-06-23 17:03:15 +08:00
|
|
|
int hugetlb_reserve_pages(struct inode *inode, long from, long to);
|
|
|
|
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
extern unsigned long max_huge_pages;
|
2007-07-17 19:03:13 +08:00
|
|
|
extern unsigned long hugepages_treat_as_movable;
|
hugetlb: introduce nr_overcommit_hugepages sysctl
hugetlb: introduce nr_overcommit_hugepages sysctl
While examining the code to support /proc/sys/vm/hugetlb_dynamic_pool, I
became convinced that having a boolean sysctl was insufficient:
1) To support per-node control of hugepages, I have previously submitted
patches to add a sysfs attribute related to nr_hugepages. However, with
a boolean global value and per-mount quota enforcement constraining the
dynamic pool, adding corresponding control of the dynamic pool on a
per-node basis seems inconsistent to me.
2) Administration of the hugetlb dynamic pool with multiple hugetlbfs
mount points is, arguably, more arduous than it needs to be. Each quota
would need to be set separately, and the sum would need to be monitored.
To ease the administration, and to help make the way for per-node
control of the static & dynamic hugepage pool, I added a separate
sysctl, nr_overcommit_hugepages. This value serves as a high watermark
for the overall hugepage pool, while nr_hugepages serves as a low
watermark. The boolean sysctl can then be removed, as the condition
nr_overcommit_hugepages > 0
indicates the same administrative setting as
hugetlb_dynamic_pool == 1
Quotas still serve as local enforcement of the size of the pool on a
per-mount basis.
A few caveats:
1) There is a race whereby the global surplus huge page counter is
incremented before a hugepage has been allocated. Another process could then
try to grow the pool, and fail to convert a surplus huge page to a normal
huge page and instead allocate a fresh huge page. I believe this is
benign, as no memory is leaked (the actual pages are still tracked
correctly) and the counters won't go out of sync.
2) Shrinking the static pool while a surplus is in effect will allow the
number of surplus huge pages to exceed the overcommit value. As long as
this condition holds, however, no more surplus huge pages will be
allowed on the system until one of the two sysctls is increased
sufficiently, or the surplus huge pages go out of use and are freed.
Successfully tested on x86_64 with the current libhugetlbfs snapshot,
modified to use the new sysctl.
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Acked-by: Adam Litke <agl@us.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-12-18 08:20:12 +08:00
|
|
|
extern unsigned long nr_overcommit_huge_pages;
|
2005-04-17 06:20:36 +08:00
|
|
|
extern const unsigned long hugetlb_zero, hugetlb_infinity;
|
|
|
|
extern int sysctl_hugetlb_shm_group;
|
|
|
|
|
2005-06-22 08:14:44 +08:00
|
|
|
/* arch callbacks */
|
|
|
|
|
|
|
|
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr);
|
|
|
|
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr);
|
2006-12-07 12:32:03 +08:00
|
|
|
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
|
2005-06-22 08:14:44 +08:00
|
|
|
struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
|
|
|
|
int write);
|
|
|
|
struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
|
|
|
|
pmd_t *pmd, int write);
|
|
|
|
int pmd_huge(pmd_t pmd);
|
2006-03-22 16:08:50 +08:00
|
|
|
void hugetlb_change_protection(struct vm_area_struct *vma,
|
|
|
|
unsigned long address, unsigned long end, pgprot_t newprot);
|
2005-06-22 08:14:44 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE
|
|
|
|
#define is_hugepage_only_range(mm, addr, len) 0
|
[PATCH] hugepage: Fix hugepage logic in free_pgtables()
free_pgtables() has special logic to call hugetlb_free_pgd_range() instead
of the normal free_pgd_range() on hugepage VMAs. However, the test it uses
to do so is incorrect: it calls is_hugepage_only_range on a hugepage sized
range at the start of the vma. is_hugepage_only_range() will return true
if the given range has any intersection with a hugepage address region, and
in this case the given region need not be hugepage aligned. So, for
example, this test can return true if called on, say, a 4k VMA immediately
preceding a (nicely aligned) hugepage VMA.
At present we get away with this because the powerpc version of
hugetlb_free_pgd_range() is just a call to free_pgd_range(). On ia64 (the
only other arch with a non-trivial is_hugepage_only_range()) we get away
with it for a different reason; the hugepage area is not contiguous with
the rest of the user address space, and VMAs are not permitted in between,
so the test can't return a false positive there.
Nonetheless this should be fixed. We do that in the patch below by
replacing the is_hugepage_only_range() test with an explicit test of the
VMA using is_vm_hugetlb_page().
This in turn changes behaviour for platforms where is_hugepage_only_range()
returns false always (everything except powerpc and ia64). We address this
by ensuring that hugetlb_free_pgd_range() is defined to be identical to
free_pgd_range() (instead of a no-op) on everything except ia64. Even so,
it will prevent some otherwise possible coalescing of calls down to
free_pgd_range(). Since this only happens for hugepage VMAs, removing this
small optimization seems unlikely to cause any trouble.
This patch causes no regressions on the libhugetlbfs testsuite - ppc64
POWER5 (8-way), ppc64 G5 (2-way) and i386 Pentium M (UP).
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-22 16:08:57 +08:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef ARCH_HAS_HUGETLB_FREE_PGD_RANGE
|
|
|
|
#define hugetlb_free_pgd_range free_pgd_range
|
2006-03-22 16:08:59 +08:00
|
|
|
#else
|
|
|
|
void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
|
|
|
|
unsigned long end, unsigned long floor,
|
|
|
|
unsigned long ceiling);
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef ARCH_HAS_PREPARE_HUGEPAGE_RANGE
|
2006-03-22 16:09:01 +08:00
|
|
|
/*
|
|
|
|
* If the arch doesn't supply something else, assume that hugepage
|
|
|
|
* size aligned regions are ok without further preparation.
|
|
|
|
*/
|
2007-08-31 14:56:40 +08:00
|
|
|
static inline int prepare_hugepage_range(unsigned long addr, unsigned long len)
|
2006-03-22 16:09:01 +08:00
|
|
|
{
|
|
|
|
if (len & ~HPAGE_MASK)
|
|
|
|
return -EINVAL;
|
|
|
|
if (addr & ~HPAGE_MASK)
|
|
|
|
return -EINVAL;
|
|
|
|
return 0;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
#else
|
2007-08-31 14:56:40 +08:00
|
|
|
int prepare_hugepage_range(unsigned long addr, unsigned long len);
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|
|
|
|
|
2005-06-22 08:14:44 +08:00
|
|
|
#ifndef ARCH_HAS_SETCLEAR_HUGE_PTE
|
|
|
|
#define set_huge_pte_at(mm, addr, ptep, pte) set_pte_at(mm, addr, ptep, pte)
|
|
|
|
#define huge_ptep_get_and_clear(mm, addr, ptep) ptep_get_and_clear(mm, addr, ptep)
|
|
|
|
#else
|
|
|
|
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
|
|
|
|
pte_t *ptep, pte_t pte);
|
|
|
|
pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
|
|
|
|
pte_t *ptep);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef ARCH_HAS_HUGETLB_PREFAULT_HOOK
|
|
|
|
#define hugetlb_prefault_arch_hook(mm) do { } while (0)
|
|
|
|
#else
|
|
|
|
void hugetlb_prefault_arch_hook(struct mm_struct *mm);
|
|
|
|
#endif
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#else /* !CONFIG_HUGETLB_PAGE */
|
|
|
|
|
|
|
|
/*
 * Stub for builds without CONFIG_HUGETLB_PAGE: always returns 0,
 * @vma is not inspected.
 */
static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
{
	return 0;
}
|
|
|
|
/*
 * Stub for builds without CONFIG_HUGETLB_PAGE: always reports a total
 * of 0 pages.
 */
static inline unsigned long hugetlb_total_pages(void)
{
	return 0;
}
|
|
|
|
|
2007-11-15 08:59:33 +08:00
|
|
|
#define follow_hugetlb_page(m,v,p,vs,a,b,i,w) ({ BUG(); 0; })
|
2005-04-17 06:20:36 +08:00
|
|
|
#define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL)
|
|
|
|
#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
|
|
|
|
#define hugetlb_prefault(mapping, vma) ({ BUG(); 0; })
|
|
|
|
#define unmap_hugepage_range(vma, start, end) BUG()
|
|
|
|
#define hugetlb_report_meminfo(buf) 0
|
|
|
|
#define hugetlb_report_node_meminfo(n, buf) 0
|
|
|
|
#define follow_huge_pmd(mm, addr, pmd, write) NULL
|
2007-08-31 14:56:40 +08:00
|
|
|
#define prepare_hugepage_range(addr,len) (-EINVAL)
|
2005-04-17 06:20:36 +08:00
|
|
|
#define pmd_huge(x) 0
|
|
|
|
#define is_hugepage_only_range(mm, addr, len) 0
|
[PATCH] hugepage: Fix hugepage logic in free_pgtables()
free_pgtables() has special logic to call hugetlb_free_pgd_range() instead
of the normal free_pgd_range() on hugepage VMAs. However, the test it uses
to do so is incorrect: it calls is_hugepage_only_range on a hugepage sized
range at the start of the vma. is_hugepage_only_range() will return true
if the given range has any intersection with a hugepage address region, and
in this case the given region need not be hugepage aligned. So, for
example, this test can return true if called on, say, a 4k VMA immediately
preceding a (nicely aligned) hugepage VMA.
At present we get away with this because the powerpc version of
hugetlb_free_pgd_range() is just a call to free_pgd_range(). On ia64 (the
only other arch with a non-trivial is_hugepage_only_range()) we get away
with it for a different reason; the hugepage area is not contiguous with
the rest of the user address space, and VMAs are not permitted in between,
so the test can't return a false positive there.
Nonetheless this should be fixed. We do that in the patch below by
replacing the is_hugepage_only_range() test with an explicit test of the
VMA using is_vm_hugetlb_page().
This in turn changes behaviour for platforms where is_hugepage_only_range()
returns false always (everything except powerpc and ia64). We address this
by ensuring that hugetlb_free_pgd_range() is defined to be identical to
free_pgd_range() (instead of a no-op) on everything except ia64. Even so,
it will prevent some otherwise possible coalescing of calls down to
free_pgd_range(). Since this only happens for hugepage VMAs, removing this
small optimization seems unlikely to cause any trouble.
This patch causes no regressions on the libhugetlbfs testsuite - ppc64
POWER5 (8-way), ppc64 G5 (2-way) and i386 Pentium M (UP).
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-22 16:08:57 +08:00
|
|
|
#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
|
2005-10-20 23:24:28 +08:00
|
|
|
#define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; })
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-03-22 16:08:50 +08:00
|
|
|
/*
 * No-op without CONFIG_HUGETLB_PAGE.  Expands to a complete statement
 * (rather than to nothing) so it behaves like a call in unbraced
 * if/else bodies and does not leave a bare ';' behind.
 */
#define hugetlb_change_protection(vma, address, end, newprot) do { } while (0)
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifndef HPAGE_MASK
|
2005-11-14 08:06:42 +08:00
|
|
|
#define HPAGE_MASK PAGE_MASK /* Keep the compiler happy */
|
|
|
|
#define HPAGE_SIZE PAGE_SIZE
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif /* !CONFIG_HUGETLB_PAGE */
|
|
|
|
|
|
|
|
#ifdef CONFIG_HUGETLBFS
|
|
|
|
struct hugetlbfs_config {
|
|
|
|
uid_t uid;
|
|
|
|
gid_t gid;
|
|
|
|
umode_t mode;
|
|
|
|
long nr_blocks;
|
|
|
|
long nr_inodes;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct hugetlbfs_sb_info {
|
|
|
|
long max_blocks; /* blocks allowed */
|
|
|
|
long free_blocks; /* blocks free */
|
|
|
|
long max_inodes; /* inodes allowed */
|
|
|
|
long free_inodes; /* inodes free */
|
|
|
|
spinlock_t stat_lock;
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
struct hugetlbfs_inode_info {
|
|
|
|
struct shared_policy policy;
|
|
|
|
struct inode vfs_inode;
|
|
|
|
};
|
|
|
|
|
|
|
|
static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
|
|
|
|
{
|
|
|
|
return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
|
|
|
|
{
|
|
|
|
return sb->s_fs_info;
|
|
|
|
}
|
|
|
|
|
2006-03-28 17:56:42 +08:00
|
|
|
extern const struct file_operations hugetlbfs_file_operations;
|
2005-04-17 06:20:36 +08:00
|
|
|
extern struct vm_operations_struct hugetlb_vm_ops;
|
2007-06-17 01:16:16 +08:00
|
|
|
struct file *hugetlb_file_setup(const char *name, size_t);
|
2007-11-15 08:59:41 +08:00
|
|
|
int hugetlb_get_quota(struct address_space *mapping, long delta);
|
|
|
|
void hugetlb_put_quota(struct address_space *mapping, long delta);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-11-15 08:59:44 +08:00
|
|
|
#define BLOCKS_PER_HUGEPAGE (HPAGE_SIZE / 512)
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
static inline int is_file_hugepages(struct file *file)
|
|
|
|
{
|
2007-03-02 07:46:08 +08:00
|
|
|
if (file->f_op == &hugetlbfs_file_operations)
|
|
|
|
return 1;
|
|
|
|
if (is_file_shm_hugepages(file))
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
return 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void set_file_hugepages(struct file *file)
|
|
|
|
{
|
|
|
|
file->f_op = &hugetlbfs_file_operations;
|
|
|
|
}
|
|
|
|
#else /* !CONFIG_HUGETLBFS */
|
|
|
|
|
|
|
|
#define is_file_hugepages(file) 0
|
|
|
|
#define set_file_hugepages(file) BUG()
|
2007-06-17 01:16:16 +08:00
|
|
|
#define hugetlb_file_setup(name,size) ERR_PTR(-ENOSYS)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#endif /* !CONFIG_HUGETLBFS */
|
|
|
|
|
2007-05-07 05:49:00 +08:00
|
|
|
#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
|
|
|
|
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long pgoff,
|
|
|
|
unsigned long flags);
|
|
|
|
#endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif /* _LINUX_HUGETLB_H */
|