2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* linux/arch/i386/mm/pgtable.c
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/sched.h>
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/mm.h>
|
2007-10-18 00:04:34 +08:00
|
|
|
#include <linux/nmi.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/swap.h>
|
|
|
|
#include <linux/smp.h>
|
|
|
|
#include <linux/highmem.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/pagemap.h>
|
|
|
|
#include <linux/spinlock.h>
|
2006-09-26 14:32:25 +08:00
|
|
|
#include <linux/module.h>
|
2007-05-13 02:15:24 +08:00
|
|
|
#include <linux/quicklist.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#include <asm/system.h>
|
|
|
|
#include <asm/pgtable.h>
|
|
|
|
#include <asm/pgalloc.h>
|
|
|
|
#include <asm/fixmap.h>
|
|
|
|
#include <asm/e820.h>
|
|
|
|
#include <asm/tlb.h>
|
|
|
|
#include <asm/tlbflush.h>
|
|
|
|
|
|
|
|
void show_mem(void)
|
|
|
|
{
|
|
|
|
int total = 0, reserved = 0;
|
|
|
|
int shared = 0, cached = 0;
|
|
|
|
int highmem = 0;
|
|
|
|
struct page *page;
|
|
|
|
pg_data_t *pgdat;
|
|
|
|
unsigned long i;
|
2005-10-30 09:16:52 +08:00
|
|
|
unsigned long flags;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-06-26 05:59:24 +08:00
|
|
|
printk(KERN_INFO "Mem-info:\n");
|
2005-04-17 06:20:36 +08:00
|
|
|
show_free_areas();
|
2005-06-26 05:59:24 +08:00
|
|
|
printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
|
2006-03-27 17:15:59 +08:00
|
|
|
for_each_online_pgdat(pgdat) {
|
2005-10-30 09:16:52 +08:00
|
|
|
pgdat_resize_lock(pgdat, &flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
for (i = 0; i < pgdat->node_spanned_pages; ++i) {
|
2007-10-18 00:04:34 +08:00
|
|
|
if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
|
|
|
|
touch_nmi_watchdog();
|
[PATCH] remove non-DISCONTIG use of pgdat->node_mem_map
This patch effectively eliminates direct use of pgdat->node_mem_map outside
of the DISCONTIG code. On a flat memory system, these fields aren't
currently used, neither are they on a sparsemem system.
There was also a node_mem_map(nid) macro on many architectures. Its use
along with the use of ->node_mem_map itself was not consistent. It has
been removed in favor of two new, more explicit, arch-independent macros:
pgdat_page_nr(pgdat, pagenr)
nid_page_nr(nid, pagenr)
I called them "pgdat" and "nid" because we overload the term "node" to mean
"NUMA node", "DISCONTIG node" or "pg_data_t" in very confusing ways. I
believe the newer names are much clearer.
These macros can be overridden in the sparsemem case with a theoretically
slower operation using node_start_pfn and pfn_to_page(), instead. We could
make this the only behavior if people want, but I don't want to change too
much at once. One thing at a time.
This patch removes more code than it adds.
Compile tested on alpha, alpha discontig, arm, arm-discontig, i386, i386
generic, NUMAQ, Summit, ppc64, ppc64 discontig, and x86_64. Full list
here: http://sr71.net/patches/2.6.12/2.6.12-rc1-mhp2/configs/
Boot tested on NUMAQ, x86 SMP and ppc64 power4/5 LPARs.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin J. Bligh <mbligh@aracnet.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 15:07:37 +08:00
|
|
|
page = pgdat_page_nr(pgdat, i);
|
2005-04-17 06:20:36 +08:00
|
|
|
total++;
|
|
|
|
if (PageHighMem(page))
|
|
|
|
highmem++;
|
|
|
|
if (PageReserved(page))
|
|
|
|
reserved++;
|
|
|
|
else if (PageSwapCache(page))
|
|
|
|
cached++;
|
|
|
|
else if (page_count(page))
|
|
|
|
shared += page_count(page) - 1;
|
|
|
|
}
|
2005-10-30 09:16:52 +08:00
|
|
|
pgdat_resize_unlock(pgdat, &flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2005-06-26 05:59:24 +08:00
|
|
|
printk(KERN_INFO "%d pages of RAM\n", total);
|
|
|
|
printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
|
|
|
|
printk(KERN_INFO "%d reserved pages\n", reserved);
|
|
|
|
printk(KERN_INFO "%d pages shared\n", shared);
|
|
|
|
printk(KERN_INFO "%d pages swap cached\n", cached);
|
2005-06-23 15:08:08 +08:00
|
|
|
|
2006-06-30 16:55:39 +08:00
|
|
|
printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
|
2006-06-30 16:55:40 +08:00
|
|
|
printk(KERN_INFO "%lu pages writeback\n",
|
|
|
|
global_page_state(NR_WRITEBACK));
|
2006-06-30 16:55:34 +08:00
|
|
|
printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
|
2006-09-26 14:31:51 +08:00
|
|
|
printk(KERN_INFO "%lu pages slab\n",
|
|
|
|
global_page_state(NR_SLAB_RECLAIMABLE) +
|
|
|
|
global_page_state(NR_SLAB_UNRECLAIMABLE));
|
2006-06-30 16:55:38 +08:00
|
|
|
printk(KERN_INFO "%lu pages pagetables\n",
|
|
|
|
global_page_state(NR_PAGETABLE));
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Associate a virtual page frame with a given physical page frame
|
|
|
|
* and protection flags for that frame.
|
|
|
|
*/
|
|
|
|
static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
|
|
|
|
{
|
|
|
|
pgd_t *pgd;
|
|
|
|
pud_t *pud;
|
|
|
|
pmd_t *pmd;
|
|
|
|
pte_t *pte;
|
|
|
|
|
|
|
|
pgd = swapper_pg_dir + pgd_index(vaddr);
|
|
|
|
if (pgd_none(*pgd)) {
|
|
|
|
BUG();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
pud = pud_offset(pgd, vaddr);
|
|
|
|
if (pud_none(*pud)) {
|
|
|
|
BUG();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
pmd = pmd_offset(pud, vaddr);
|
|
|
|
if (pmd_none(*pmd)) {
|
|
|
|
BUG();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
pte = pte_offset_kernel(pmd, vaddr);
|
2006-12-07 09:14:09 +08:00
|
|
|
if (pgprot_val(flags))
|
2007-10-18 00:04:33 +08:00
|
|
|
set_pte_present(&init_mm, vaddr, pte, pfn_pte(pfn, flags));
|
2006-12-07 09:14:09 +08:00
|
|
|
else
|
|
|
|
pte_clear(&init_mm, vaddr, pte);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* It's enough to flush this one mapping.
|
|
|
|
* (PGE mappings get flushed as well)
|
|
|
|
*/
|
|
|
|
__flush_tlb_one(vaddr);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Associate a large virtual page frame with a given physical page frame
|
|
|
|
* and protection flags for that frame. pfn is for the base of the page,
|
|
|
|
* vaddr is what the page gets mapped to - both must be properly aligned.
|
|
|
|
* The pmd must already be instantiated. Assumes PAE mode.
|
|
|
|
*/
|
|
|
|
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
|
|
|
|
{
|
|
|
|
pgd_t *pgd;
|
|
|
|
pud_t *pud;
|
|
|
|
pmd_t *pmd;
|
|
|
|
|
|
|
|
if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
|
2005-06-26 05:59:24 +08:00
|
|
|
printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
|
2005-04-17 06:20:36 +08:00
|
|
|
return; /* BUG(); */
|
|
|
|
}
|
|
|
|
if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
|
2005-06-26 05:59:24 +08:00
|
|
|
printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
|
2005-04-17 06:20:36 +08:00
|
|
|
return; /* BUG(); */
|
|
|
|
}
|
|
|
|
pgd = swapper_pg_dir + pgd_index(vaddr);
|
|
|
|
if (pgd_none(*pgd)) {
|
2005-06-26 05:59:24 +08:00
|
|
|
printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
|
2005-04-17 06:20:36 +08:00
|
|
|
return; /* BUG(); */
|
|
|
|
}
|
|
|
|
pud = pud_offset(pgd, vaddr);
|
|
|
|
pmd = pmd_offset(pud, vaddr);
|
|
|
|
set_pmd(pmd, pfn_pmd(pfn, flags));
|
|
|
|
/*
|
|
|
|
* It's enough to flush this one mapping.
|
|
|
|
* (PGE mappings get flushed as well)
|
|
|
|
*/
|
|
|
|
__flush_tlb_one(vaddr);
|
|
|
|
}
|
|
|
|
|
2006-09-26 14:32:25 +08:00
|
|
|
static int fixmaps;
|
|
|
|
unsigned long __FIXADDR_TOP = 0xfffff000;
|
|
|
|
EXPORT_SYMBOL(__FIXADDR_TOP);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
|
|
|
|
{
|
|
|
|
unsigned long address = __fix_to_virt(idx);
|
|
|
|
|
|
|
|
if (idx >= __end_of_fixed_addresses) {
|
|
|
|
BUG();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
|
2006-09-26 14:32:25 +08:00
|
|
|
fixmaps++;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* reserve_top_address - reserves a hole in the top of kernel address space
|
|
|
|
* @reserve - size of hole to reserve
|
|
|
|
*
|
|
|
|
* Can be used to relocate the fixmap area and poke a hole in the top
|
|
|
|
* of kernel address space to make room for a hypervisor.
|
|
|
|
*/
|
|
|
|
void reserve_top_address(unsigned long reserve)
|
|
|
|
{
|
|
|
|
BUG_ON(fixmaps > 0);
|
2007-02-13 20:26:21 +08:00
|
|
|
printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
|
|
|
|
(int)-reserve);
|
2006-09-26 14:32:25 +08:00
|
|
|
__FIXADDR_TOP = -reserve - PAGE_SIZE;
|
|
|
|
__VMALLOC_RESERVE += reserve;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
|
|
|
|
{
|
|
|
|
return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
|
|
|
|
}
|
|
|
|
|
|
|
|
struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
|
|
|
|
{
|
|
|
|
struct page *pte;
|
|
|
|
|
|
|
|
#ifdef CONFIG_HIGHPTE
|
|
|
|
pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
|
|
|
|
#else
|
|
|
|
pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
|
|
|
|
#endif
|
|
|
|
return pte;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* List of all pgd's needed for non-PAE so it can invalidate entries
|
|
|
|
* in both cached and uncached pgd's; not needed for PAE since the
|
|
|
|
* kernel pmd is shared. If PAE were not to share the pmd a similar
|
|
|
|
* tactic would be needed. This is essentially codepath-based locking
|
|
|
|
* against pageattr.c; it is the unique case in which a valid change
|
|
|
|
* of kernel pagetables can't be lazily synchronized by vmalloc faults.
|
|
|
|
* vmalloc faults work because attached pagetables are never freed.
|
|
|
|
* -- wli
|
|
|
|
*/
|
|
|
|
DEFINE_SPINLOCK(pgd_lock);
|
|
|
|
struct page *pgd_list;
|
|
|
|
|
|
|
|
static inline void pgd_list_add(pgd_t *pgd)
|
|
|
|
{
|
|
|
|
struct page *page = virt_to_page(pgd);
|
|
|
|
page->index = (unsigned long)pgd_list;
|
|
|
|
if (pgd_list)
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:40 +08:00
|
|
|
set_page_private(pgd_list, (unsigned long)&page->index);
|
2005-04-17 06:20:36 +08:00
|
|
|
pgd_list = page;
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:40 +08:00
|
|
|
set_page_private(page, (unsigned long)&pgd_list);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void pgd_list_del(pgd_t *pgd)
|
|
|
|
{
|
|
|
|
struct page *next, **pprev, *page = virt_to_page(pgd);
|
|
|
|
next = (struct page *)page->index;
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:40 +08:00
|
|
|
pprev = (struct page **)page_private(page);
|
2005-04-17 06:20:36 +08:00
|
|
|
*pprev = next;
|
|
|
|
if (next)
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 09:16:40 +08:00
|
|
|
set_page_private(next, (unsigned long)pprev);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2007-05-13 02:15:24 +08:00
|
|
|
|
|
|
|
|
[PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing
Normally when running in PAE mode, the 4th PMD maps the kernel address space,
which can be shared among all processes (since they all need the same kernel
mappings).
Xen, however, does not allow guests to have the kernel pmd shared between page
tables, so parameterize pgtable.c to allow both modes of operation.
There are several side-effects of this. One is that vmalloc will update the
kernel address space mappings, and those updates need to be propagated into
all processes if the kernel mappings are not intrinsically shared. In the
non-PAE case, this is done by maintaining a pgd_list of all processes; this
list is used when all process pagetables must be updated. pgd_list is
threaded via otherwise unused entries in the page structure for the pgd, which
means that the pgd must be page-sized for this to work.
Normally the PAE pgd is only 4x64 byte entries large, but Xen requires the PAE
pgd to page aligned anyway, so this patch forces the pgd to be page
aligned+sized when the kernel pmd is unshared, to accomodate both these
requirements.
Also, since there may be several distinct kernel pmds (if the user/kernel
split is below 3G), there's no point in allocating them from a slab cache;
they're just allocated with get_free_page and initialized appropriately. (Of
course the could be cached if there is just a single kernel pmd - which is the
default with a 3G user/kernel split - but it doesn't seem worthwhile to add
yet another case into this code).
[ Many thanks to wli for review comments. ]
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-03 01:27:13 +08:00
|
|
|
#if (PTRS_PER_PMD == 1)
|
|
|
|
/* Non-PAE pgd constructor */
|
2007-07-21 23:11:07 +08:00
|
|
|
static void pgd_ctor(void *pgd)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
[PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing
Normally when running in PAE mode, the 4th PMD maps the kernel address space,
which can be shared among all processes (since they all need the same kernel
mappings).
Xen, however, does not allow guests to have the kernel pmd shared between page
tables, so parameterize pgtable.c to allow both modes of operation.
There are several side-effects of this. One is that vmalloc will update the
kernel address space mappings, and those updates need to be propagated into
all processes if the kernel mappings are not intrinsically shared. In the
non-PAE case, this is done by maintaining a pgd_list of all processes; this
list is used when all process pagetables must be updated. pgd_list is
threaded via otherwise unused entries in the page structure for the pgd, which
means that the pgd must be page-sized for this to work.
Normally the PAE pgd is only 4x64 byte entries large, but Xen requires the PAE
pgd to page aligned anyway, so this patch forces the pgd to be page
aligned+sized when the kernel pmd is unshared, to accomodate both these
requirements.
Also, since there may be several distinct kernel pmds (if the user/kernel
split is below 3G), there's no point in allocating them from a slab cache;
they're just allocated with get_free_page and initialized appropriately. (Of
course the could be cached if there is just a single kernel pmd - which is the
default with a 3G user/kernel split - but it doesn't seem worthwhile to add
yet another case into this code).
[ Many thanks to wli for review comments. ]
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-03 01:27:13 +08:00
|
|
|
/* !PAE, no pagetable sharing */
|
|
|
|
memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
|
|
|
|
|
|
|
|
spin_lock_irqsave(&pgd_lock, flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
[PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing
Normally when running in PAE mode, the 4th PMD maps the kernel address space,
which can be shared among all processes (since they all need the same kernel
mappings).
Xen, however, does not allow guests to have the kernel pmd shared between page
tables, so parameterize pgtable.c to allow both modes of operation.
There are several side-effects of this. One is that vmalloc will update the
kernel address space mappings, and those updates need to be propagated into
all processes if the kernel mappings are not intrinsically shared. In the
non-PAE case, this is done by maintaining a pgd_list of all processes; this
list is used when all process pagetables must be updated. pgd_list is
threaded via otherwise unused entries in the page structure for the pgd, which
means that the pgd must be page-sized for this to work.
Normally the PAE pgd is only 4x64 byte entries large, but Xen requires the PAE
pgd to page aligned anyway, so this patch forces the pgd to be page
aligned+sized when the kernel pmd is unshared, to accomodate both these
requirements.
Also, since there may be several distinct kernel pmds (if the user/kernel
split is below 3G), there's no point in allocating them from a slab cache;
they're just allocated with get_free_page and initialized appropriately. (Of
course the could be cached if there is just a single kernel pmd - which is the
default with a 3G user/kernel split - but it doesn't seem worthwhile to add
yet another case into this code).
[ Many thanks to wli for review comments. ]
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-03 01:27:13 +08:00
|
|
|
/* must happen under lock */
|
2005-09-04 06:56:50 +08:00
|
|
|
clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
|
2005-04-17 06:20:36 +08:00
|
|
|
swapper_pg_dir + USER_PTRS_PER_PGD,
|
2005-09-04 06:56:50 +08:00
|
|
|
KERNEL_PGD_PTRS);
|
2007-02-13 20:26:21 +08:00
|
|
|
paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
|
[PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing
Normally when running in PAE mode, the 4th PMD maps the kernel address space,
which can be shared among all processes (since they all need the same kernel
mappings).
Xen, however, does not allow guests to have the kernel pmd shared between page
tables, so parameterize pgtable.c to allow both modes of operation.
There are several side-effects of this. One is that vmalloc will update the
kernel address space mappings, and those updates need to be propagated into
all processes if the kernel mappings are not intrinsically shared. In the
non-PAE case, this is done by maintaining a pgd_list of all processes; this
list is used when all process pagetables must be updated. pgd_list is
threaded via otherwise unused entries in the page structure for the pgd, which
means that the pgd must be page-sized for this to work.
Normally the PAE pgd is only 4x64 byte entries large, but Xen requires the PAE
pgd to page aligned anyway, so this patch forces the pgd to be page
aligned+sized when the kernel pmd is unshared, to accomodate both these
requirements.
Also, since there may be several distinct kernel pmds (if the user/kernel
split is below 3G), there's no point in allocating them from a slab cache;
they're just allocated with get_free_page and initialized appropriately. (Of
course the could be cached if there is just a single kernel pmd - which is the
default with a 3G user/kernel split - but it doesn't seem worthwhile to add
yet another case into this code).
[ Many thanks to wli for review comments. ]
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-03 01:27:13 +08:00
|
|
|
__pa(swapper_pg_dir) >> PAGE_SHIFT,
|
|
|
|
USER_PTRS_PER_PGD,
|
|
|
|
KERNEL_PGD_PTRS);
|
2005-04-17 06:20:36 +08:00
|
|
|
pgd_list_add(pgd);
|
|
|
|
spin_unlock_irqrestore(&pgd_lock, flags);
|
|
|
|
}
|
[PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing
Normally when running in PAE mode, the 4th PMD maps the kernel address space,
which can be shared among all processes (since they all need the same kernel
mappings).
Xen, however, does not allow guests to have the kernel pmd shared between page
tables, so parameterize pgtable.c to allow both modes of operation.
There are several side-effects of this. One is that vmalloc will update the
kernel address space mappings, and those updates need to be propagated into
all processes if the kernel mappings are not intrinsically shared. In the
non-PAE case, this is done by maintaining a pgd_list of all processes; this
list is used when all process pagetables must be updated. pgd_list is
threaded via otherwise unused entries in the page structure for the pgd, which
means that the pgd must be page-sized for this to work.
Normally the PAE pgd is only 4x64 byte entries large, but Xen requires the PAE
pgd to page aligned anyway, so this patch forces the pgd to be page
aligned+sized when the kernel pmd is unshared, to accomodate both these
requirements.
Also, since there may be several distinct kernel pmds (if the user/kernel
split is below 3G), there's no point in allocating them from a slab cache;
they're just allocated with get_free_page and initialized appropriately. (Of
course the could be cached if there is just a single kernel pmd - which is the
default with a 3G user/kernel split - but it doesn't seem worthwhile to add
yet another case into this code).
[ Many thanks to wli for review comments. ]
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-03 01:27:13 +08:00
|
|
|
#else /* PTRS_PER_PMD > 1 */
|
|
|
|
/* PAE pgd constructor */
|
2007-07-21 23:11:07 +08:00
|
|
|
static void pgd_ctor(void *pgd)
|
[PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing
Normally when running in PAE mode, the 4th PMD maps the kernel address space,
which can be shared among all processes (since they all need the same kernel
mappings).
Xen, however, does not allow guests to have the kernel pmd shared between page
tables, so parameterize pgtable.c to allow both modes of operation.
There are several side-effects of this. One is that vmalloc will update the
kernel address space mappings, and those updates need to be propagated into
all processes if the kernel mappings are not intrinsically shared. In the
non-PAE case, this is done by maintaining a pgd_list of all processes; this
list is used when all process pagetables must be updated. pgd_list is
threaded via otherwise unused entries in the page structure for the pgd, which
means that the pgd must be page-sized for this to work.
Normally the PAE pgd is only 4x64 byte entries large, but Xen requires the PAE
pgd to page aligned anyway, so this patch forces the pgd to be page
aligned+sized when the kernel pmd is unshared, to accomodate both these
requirements.
Also, since there may be several distinct kernel pmds (if the user/kernel
split is below 3G), there's no point in allocating them from a slab cache;
they're just allocated with get_free_page and initialized appropriately. (Of
course the could be cached if there is just a single kernel pmd - which is the
default with a 3G user/kernel split - but it doesn't seem worthwhile to add
yet another case into this code).
[ Many thanks to wli for review comments. ]
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-03 01:27:13 +08:00
|
|
|
{
|
|
|
|
/* PAE, kernel PMD may be shared */
|
|
|
|
|
|
|
|
if (SHARED_KERNEL_PMD) {
|
|
|
|
clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
|
|
|
|
swapper_pg_dir + USER_PTRS_PER_PGD,
|
|
|
|
KERNEL_PGD_PTRS);
|
|
|
|
} else {
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
|
|
|
|
spin_lock_irqsave(&pgd_lock, flags);
|
|
|
|
pgd_list_add(pgd);
|
|
|
|
spin_unlock_irqrestore(&pgd_lock, flags);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif /* PTRS_PER_PMD */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-07-21 23:11:07 +08:00
|
|
|
static void pgd_dtor(void *pgd)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
unsigned long flags; /* can be called from interrupt context */
|
|
|
|
|
2007-05-13 02:15:24 +08:00
|
|
|
if (SHARED_KERNEL_PMD)
|
|
|
|
return;
|
[PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing
Normally when running in PAE mode, the 4th PMD maps the kernel address space,
which can be shared among all processes (since they all need the same kernel
mappings).
Xen, however, does not allow guests to have the kernel pmd shared between page
tables, so parameterize pgtable.c to allow both modes of operation.
There are several side-effects of this. One is that vmalloc will update the
kernel address space mappings, and those updates need to be propagated into
all processes if the kernel mappings are not intrinsically shared. In the
non-PAE case, this is done by maintaining a pgd_list of all processes; this
list is used when all process pagetables must be updated. pgd_list is
threaded via otherwise unused entries in the page structure for the pgd, which
means that the pgd must be page-sized for this to work.
Normally the PAE pgd is only 4x64 byte entries large, but Xen requires the PAE
pgd to page aligned anyway, so this patch forces the pgd to be page
aligned+sized when the kernel pmd is unshared, to accomodate both these
requirements.
Also, since there may be several distinct kernel pmds (if the user/kernel
split is below 3G), there's no point in allocating them from a slab cache;
they're just allocated with get_free_page and initialized appropriately. (Of
course the could be cached if there is just a single kernel pmd - which is the
default with a 3G user/kernel split - but it doesn't seem worthwhile to add
yet another case into this code).
[ Many thanks to wli for review comments. ]
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-03 01:27:13 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
spin_lock_irqsave(&pgd_lock, flags);
|
|
|
|
pgd_list_del(pgd);
|
|
|
|
spin_unlock_irqrestore(&pgd_lock, flags);
|
|
|
|
}
|
|
|
|
|
[PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing
Normally when running in PAE mode, the 4th PMD maps the kernel address space,
which can be shared among all processes (since they all need the same kernel
mappings).
Xen, however, does not allow guests to have the kernel pmd shared between page
tables, so parameterize pgtable.c to allow both modes of operation.
There are several side-effects of this. One is that vmalloc will update the
kernel address space mappings, and those updates need to be propagated into
all processes if the kernel mappings are not intrinsically shared. In the
non-PAE case, this is done by maintaining a pgd_list of all processes; this
list is used when all process pagetables must be updated. pgd_list is
threaded via otherwise unused entries in the page structure for the pgd, which
means that the pgd must be page-sized for this to work.
Normally the PAE pgd is only 4x64 byte entries large, but Xen requires the PAE
pgd to page aligned anyway, so this patch forces the pgd to be page
aligned+sized when the kernel pmd is unshared, to accomodate both these
requirements.
Also, since there may be several distinct kernel pmds (if the user/kernel
split is below 3G), there's no point in allocating them from a slab cache;
they're just allocated with get_free_page and initialized appropriately. (Of
course the could be cached if there is just a single kernel pmd - which is the
default with a 3G user/kernel split - but it doesn't seem worthwhile to add
yet another case into this code).
[ Many thanks to wli for review comments. ]
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-03 01:27:13 +08:00
|
|
|
#define UNSHARED_PTRS_PER_PGD \
|
|
|
|
(SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
|
|
|
|
|
2008-01-30 20:33:40 +08:00
|
|
|
#ifdef CONFIG_X86_PAE
|
|
|
|
/*
|
|
|
|
* Mop up any pmd pages which may still be attached to the pgd.
|
|
|
|
* Normally they will be freed by munmap/exit_mmap, but any pmd we
|
|
|
|
* preallocate which never got a corresponding vma will need to be
|
|
|
|
* freed manually.
|
|
|
|
*/
|
|
|
|
static void pgd_mop_up_pmds(pgd_t *pgdp)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2008-01-30 20:33:40 +08:00
|
|
|
for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
|
2008-01-30 20:33:40 +08:00
|
|
|
pgd_t pgd = pgdp[i];
|
|
|
|
|
|
|
|
if (pgd_val(pgd) != 0) {
|
|
|
|
pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
|
|
|
|
|
|
|
|
pgdp[i] = native_make_pgd(0);
|
|
|
|
|
|
|
|
paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
|
|
|
|
pmd_free(pmd);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* In PAE mode, we need to do a cr3 reload (=tlb flush) when
|
|
|
|
* updating the top-level pagetable entries to guarantee the
|
|
|
|
* processor notices the update. Since this is expensive, and
|
|
|
|
* all 4 top-level entries are used almost immediately in a
|
|
|
|
* new process's life, we just pre-populate them here.
|
2008-01-30 20:33:40 +08:00
|
|
|
*
|
|
|
|
* Also, if we're in a paravirt environment where the kernel pmd is
|
|
|
|
* not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
|
|
|
|
* and initialize the kernel pmds here.
|
2008-01-30 20:33:40 +08:00
|
|
|
*/
|
|
|
|
static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
|
|
|
|
{
|
|
|
|
pud_t *pud;
|
|
|
|
unsigned long addr;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
pud = pud_offset(pgd, 0);
|
2008-01-30 20:33:40 +08:00
|
|
|
for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
|
|
|
|
i++, pud++, addr += PUD_SIZE) {
|
2008-01-30 20:33:40 +08:00
|
|
|
pmd_t *pmd = pmd_alloc_one(mm, addr);
|
|
|
|
|
|
|
|
if (!pmd) {
|
|
|
|
pgd_mop_up_pmds(pgd);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-01-30 20:33:40 +08:00
|
|
|
if (i >= USER_PTRS_PER_PGD)
|
|
|
|
memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
|
|
|
|
sizeof(pmd_t) * PTRS_PER_PMD);
|
|
|
|
|
2008-01-30 20:33:40 +08:00
|
|
|
pud_populate(mm, pud, pmd);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
#else /* !CONFIG_X86_PAE */
|
|
|
|
/* No need to prepopulate any pagetable entries in non-PAE modes. */
|
|
|
|
static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
|
|
|
|
{
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void pgd_mop_up_pmds(pgd_t *pgd)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_X86_PAE */
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
pgd_t *pgd_alloc(struct mm_struct *mm)
|
|
|
|
{
|
2007-05-13 02:15:24 +08:00
|
|
|
pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-01-30 20:33:39 +08:00
|
|
|
mm->pgd = pgd; /* so that alloc_pd can use it */
|
|
|
|
|
2008-01-30 20:33:40 +08:00
|
|
|
if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
|
|
|
|
quicklist_free(0, pgd_dtor, pgd);
|
|
|
|
pgd = NULL;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
return pgd;
|
|
|
|
}
|
|
|
|
|
|
|
|
void pgd_free(pgd_t *pgd)
|
|
|
|
{
|
2008-01-30 20:33:40 +08:00
|
|
|
pgd_mop_up_pmds(pgd);
|
2007-05-13 02:15:24 +08:00
|
|
|
quicklist_free(0, pgd_dtor, pgd);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2007-05-13 02:15:24 +08:00
|
|
|
|
|
|
|
void check_pgt_cache(void)
|
|
|
|
{
|
|
|
|
quicklist_trim(0, pgd_dtor, 25, 16);
|
|
|
|
}
|