[PATCH] Add NUMA policy support for huge pages.

The huge_zonelist() function in the memory policy layer provides a list of
zones ordered by NUMA distance.  The hugetlb layer will walk that list
looking for a zone that has available huge pages and is also in the nodeset
of the current cpuset.

This patch does not contain the folding of find_or_alloc_huge_page() that
was controversial in the earlier discussion.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Acked-by: William Lee Irwin III <wli@holomorphy.com>
Cc: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
commit 5da7ca8607
parent 96df9333c9
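As context for the hunks below, here is a minimal sketch of the walk the
changelog describes, i.e. dequeue_huge_page() after it switches to the
policy-supplied zonelist.  Only the new signature and the first few lines
appear in the diff; the loop body and the dequeue tail are reconstructed
from the mm/hugetlb.c of this kernel generation, and the exact cpuset
helper (cpuset_zone_allowed(), whose signature changed across nearby
releases) should be treated as an assumption rather than as part of this
patch:

	static struct page *dequeue_huge_page(struct vm_area_struct *vma,
					unsigned long address)
	{
		int nid = numa_node_id();
		struct page *page = NULL;
		/* Policy-ordered zonelist: nearest allowed nodes first. */
		struct zonelist *zonelist = huge_zonelist(vma, address);
		struct zone **z;

		for (z = zonelist->zones; *z; z++) {
			nid = (*z)->zone_pgdat->node_id;
			/*
			 * Zone must be in the current cpuset and its node
			 * must have a queued free huge page.  (Two-argument
			 * cpuset_zone_allowed() assumed here.)
			 */
			if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
			    !list_empty(&hugepage_freelists[nid]))
				break;
		}

		if (*z) {
			/* Take the first huge page queued on that node. */
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
		}
		return page;
	}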
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -22,7 +22,7 @@ int hugetlb_report_meminfo(char *);
 int hugetlb_report_node_meminfo(int, char *);
 int is_hugepage_mem_enough(size_t);
 unsigned long hugetlb_total_pages(void);
-struct page *alloc_huge_page(void);
+struct page *alloc_huge_page(struct vm_area_struct *, unsigned long);
 void free_huge_page(struct page *);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write_access);
@@ -97,7 +97,7 @@ static inline unsigned long hugetlb_total_pages(void)
 #define is_hugepage_only_range(mm, addr, len)	0
 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
 						do { } while (0)
-#define alloc_huge_page()			({ NULL; })
+#define alloc_huge_page(vma, addr)		({ NULL; })
 #define free_huge_page(p)			({ (void)(p); BUG(); })
 #define hugetlb_fault(mm, vma, addr, write)	({ BUG(); 0; })
 
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -156,6 +156,8 @@ extern void numa_default_policy(void);
 extern void numa_policy_init(void);
 extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new);
 extern struct mempolicy default_policy;
+extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
+		unsigned long addr);
 
 #else
 
@@ -232,6 +234,12 @@ static inline void numa_policy_rebind(const nodemask_t *old,
 {
 }
 
+static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
+		unsigned long addr)
+{
+	return NODE_DATA(0)->node_zonelists + gfp_zone(GFP_HIGHUSER);
+}
+
 #endif /* CONFIG_NUMA */
 #endif /* __KERNEL__ */
 
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -11,6 +11,8 @@
 #include <linux/highmem.h>
 #include <linux/nodemask.h>
 #include <linux/pagemap.h>
+#include <linux/mempolicy.h>
+
 #include <asm/page.h>
 #include <asm/pgtable.h>
 
@@ -36,11 +38,12 @@ static void enqueue_huge_page(struct page *page)
 	free_huge_pages_node[nid]++;
 }
 
-static struct page *dequeue_huge_page(void)
+static struct page *dequeue_huge_page(struct vm_area_struct *vma,
+				unsigned long address)
 {
 	int nid = numa_node_id();
 	struct page *page = NULL;
-	struct zonelist *zonelist = NODE_DATA(nid)->node_zonelists;
+	struct zonelist *zonelist = huge_zonelist(vma, address);
 	struct zone **z;
 
 	for (z = zonelist->zones; *z; z++) {
@@ -87,13 +90,13 @@ void free_huge_page(struct page *page)
 	spin_unlock(&hugetlb_lock);
 }
 
-struct page *alloc_huge_page(void)
+struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 {
 	struct page *page;
 	int i;
 
 	spin_lock(&hugetlb_lock);
-	page = dequeue_huge_page();
+	page = dequeue_huge_page(vma, addr);
 	if (!page) {
 		spin_unlock(&hugetlb_lock);
 		return NULL;
@@ -196,7 +199,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
 	spin_lock(&hugetlb_lock);
 	try_to_free_low(count);
 	while (count < nr_huge_pages) {
-		struct page *page = dequeue_huge_page();
+		struct page *page = dequeue_huge_page(NULL, 0);
 		if (!page)
 			break;
 		update_and_free_page(page);
@@ -365,7 +368,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	flush_tlb_range(vma, start, end);
 }
 
-static struct page *find_or_alloc_huge_page(struct address_space *mapping,
+static struct page *find_or_alloc_huge_page(struct vm_area_struct *vma,
+			unsigned long addr, struct address_space *mapping,
 			unsigned long idx, int shared)
 {
 	struct page *page;
@@ -378,7 +382,7 @@ retry:
 
 		if (hugetlb_get_quota(mapping))
 			goto out;
-		page = alloc_huge_page();
+		page = alloc_huge_page(vma, addr);
 		if (!page) {
 			hugetlb_put_quota(mapping);
 			goto out;
@@ -418,7 +422,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	page_cache_get(old_page);
-	new_page = alloc_huge_page();
+	new_page = alloc_huge_page(vma, address);
 
 	if (!new_page) {
 		page_cache_release(old_page);
@@ -467,7 +471,7 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * Use page lock to guard against racing truncation
 	 * before we get page_table_lock.
 	 */
-	page = find_or_alloc_huge_page(mapping, idx,
+	page = find_or_alloc_huge_page(vma, address, mapping, idx,
 			vma->vm_flags & VM_SHARED);
 	if (!page)
 		goto out;
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -785,6 +785,34 @@ static unsigned offset_il_node(struct mempolicy *pol,
 	return nid;
 }
 
+/* Determine a node number for interleave */
+static inline unsigned interleave_nid(struct mempolicy *pol,
+		 struct vm_area_struct *vma, unsigned long addr, int shift)
+{
+	if (vma) {
+		unsigned long off;
+
+		off = vma->vm_pgoff;
+		off += (addr - vma->vm_start) >> shift;
+		return offset_il_node(pol, vma, off);
+	} else
+		return interleave_nodes(pol);
+}
+
+/* Return a zonelist suitable for a huge page allocation. */
+struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
+{
+	struct mempolicy *pol = get_vma_policy(current, vma, addr);
+
+	if (pol->policy == MPOL_INTERLEAVE) {
+		unsigned nid;
+
+		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
+		return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
+	}
+	return zonelist_policy(GFP_HIGHUSER, pol);
+}
+
 /* Allocate a page in interleaved policy.
    Own path because it needs to do special accounting. */
 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
@@ -833,15 +861,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 
 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
 		unsigned nid;
-		if (vma) {
-			unsigned long off;
-			off = vma->vm_pgoff;
-			off += (addr - vma->vm_start) >> PAGE_SHIFT;
-			nid = offset_il_node(pol, vma, off);
-		} else {
-			/* fall back to process interleaving */
-			nid = interleave_nodes(pol);
-		}
+
+		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
 		return alloc_page_interleave(gfp, 0, nid);
 	}
 	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));