x86: construct 32-bit boot time page tables in native format.

Specifically the boot time page tables in a CONFIG_X86_PAE=y enabled
kernel are in PAE format.

early_ioremap is updated to use the standard page table accessors.

Clear any mappings beyond max_low_pfn from the boot page tables in
native_pagetable_setup_start because the initial mappings can extend
beyond the range of physical memory and into the vmalloc area.

Derived from patches by Eric Biederman and H. Peter Anvin.

[ jeremy@goop.org: PAE swapper_pg_dir needs to be page-sized fix ]

Signed-off-by: Ian Campbell <ijc@hellion.org.uk>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Mika Penttilä <mika.penttila@kolumbus.fi>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
This commit is contained in:
Ian Campbell 2008-02-09 23:24:09 +01:00 committed by Thomas Gleixner
parent 185c045c24
commit 551889a6e2
6 changed files with 177 additions and 108 deletions

View File

@ -19,6 +19,10 @@
#include <asm/thread_info.h>
#include <asm/asm-offsets.h>
#include <asm/setup.h>
#include <asm/processor-flags.h>
/* Physical address */
#define pa(X) ((X) - __PAGE_OFFSET)
/*
* References to members of the new_cpu_data structure.
@ -80,10 +84,6 @@ INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_
*/
.section .text.head,"ax",@progbits
ENTRY(startup_32)
/* check to see if KEEP_SEGMENTS flag is meaningful */
cmpw $0x207, BP_version(%esi)
jb 1f
/* test KEEP_SEGMENTS flag to see if the bootloader is asking
us to not reload segments */
testb $(1<<6), BP_loadflags(%esi)
@ -92,7 +92,7 @@ ENTRY(startup_32)
/*
* Set segments to known values.
*/
1: lgdt boot_gdt_descr - __PAGE_OFFSET
lgdt pa(boot_gdt_descr)
movl $(__BOOT_DS),%eax
movl %eax,%ds
movl %eax,%es
@ -105,8 +105,8 @@ ENTRY(startup_32)
*/
cld
xorl %eax,%eax
movl $__bss_start - __PAGE_OFFSET,%edi
movl $__bss_stop - __PAGE_OFFSET,%ecx
movl $pa(__bss_start),%edi
movl $pa(__bss_stop),%ecx
subl %edi,%ecx
shrl $2,%ecx
rep ; stosl
@ -118,31 +118,32 @@ ENTRY(startup_32)
* (kexec on panic case). Hence copy out the parameters before initializing
* page tables.
*/
movl $(boot_params - __PAGE_OFFSET),%edi
movl $pa(boot_params),%edi
movl $(PARAM_SIZE/4),%ecx
cld
rep
movsl
movl boot_params - __PAGE_OFFSET + NEW_CL_POINTER,%esi
movl pa(boot_params) + NEW_CL_POINTER,%esi
andl %esi,%esi
jz 1f # No comand line
movl $(boot_command_line - __PAGE_OFFSET),%edi
movl $pa(boot_command_line),%edi
movl $(COMMAND_LINE_SIZE/4),%ecx
rep
movsl
1:
#ifdef CONFIG_PARAVIRT
cmpw $0x207, (boot_params + BP_version - __PAGE_OFFSET)
/* This is can only trip for a broken bootloader... */
cmpw $0x207, pa(boot_params + BP_version)
jb default_entry
/* Paravirt-compatible boot parameters. Look to see what architecture
we're booting under. */
movl (boot_params + BP_hardware_subarch - __PAGE_OFFSET), %eax
movl pa(boot_params + BP_hardware_subarch), %eax
cmpl $num_subarch_entries, %eax
jae bad_subarch
movl subarch_entries - __PAGE_OFFSET(,%eax,4), %eax
movl pa(subarch_entries)(,%eax,4), %eax
subl $__PAGE_OFFSET, %eax
jmp *%eax
@ -170,17 +171,68 @@ num_subarch_entries = (. - subarch_entries) / 4
* Mappings are created both at virtual address 0 (identity mapping)
* and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
*
* Warning: don't use %esi or the stack in this code. However, %esp
* can be used as a GPR if you really need it...
* Note that the stack is not yet set up!
*/
page_pde_offset = (__PAGE_OFFSET >> 20);
#define PTE_ATTR 0x007 /* PRESENT+RW+USER */
#define PDE_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
#define PGD_ATTR 0x001 /* PRESENT (no other attributes) */
default_entry:
movl $(pg0 - __PAGE_OFFSET), %edi
movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
movl $0x007, %eax /* 0x007 = PRESENT+RW+USER */
#ifdef CONFIG_X86_PAE
/*
* In PAE mode swapper_pg_dir is statically defined to contain enough
* entries to cover the VMSPLIT option (that is the top 1, 2 or 3
* entries). The identity mapping is handled by pointing two PGD
* entries to the first kernel PMD.
*
* Note the upper half of each PMD or PTE are always zero at
* this stage.
*/
#define KPMDS ((0x100000000-__PAGE_OFFSET) >> 30) /* Number of kernel PMDs */
xorl %ebx,%ebx /* %ebx is kept at zero */
movl $pa(pg0), %edi
movl $pa(swapper_pg_pmd), %edx
movl $PTE_ATTR, %eax
10:
leal 0x007(%edi),%ecx /* Create PDE entry */
leal PDE_ATTR(%edi),%ecx /* Create PMD entry */
movl %ecx,(%edx) /* Store PMD entry */
/* Upper half already zero */
addl $8,%edx
movl $512,%ecx
11:
stosl
xchgl %eax,%ebx
stosl
xchgl %eax,%ebx
addl $0x1000,%eax
loop 11b
/*
* End condition: we must map up to and including INIT_MAP_BEYOND_END
* bytes beyond the end of our own page tables.
*/
leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp
cmpl %ebp,%eax
jb 10b
1:
movl %edi,pa(init_pg_tables_end)
/* Do early initialization of the fixmap area */
movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
#else /* Not PAE */
page_pde_offset = (__PAGE_OFFSET >> 20);
movl $pa(pg0), %edi
movl $pa(swapper_pg_dir), %edx
movl $PTE_ATTR, %eax
10:
leal PDE_ATTR(%edi),%ecx /* Create PDE entry */
movl %ecx,(%edx) /* Store identity PDE entry */
movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
addl $4,%edx
@ -189,19 +241,20 @@ default_entry:
stosl
addl $0x1000,%eax
loop 11b
/* End condition: we must map up to and including INIT_MAP_BEYOND_END */
/* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */
leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp
/*
* End condition: we must map up to and including INIT_MAP_BEYOND_END
* bytes beyond the end of our own page tables; the +0x007 is
* the attribute bits
*/
leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp
cmpl %ebp,%eax
jb 10b
movl %edi,(init_pg_tables_end - __PAGE_OFFSET)
/* Do an early initialization of the fixmap area */
movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
movl $(swapper_pg_pmd - __PAGE_OFFSET), %eax
addl $0x67, %eax /* 0x67 == _PAGE_TABLE */
movl %eax, 4092(%edx)
movl %edi,pa(init_pg_tables_end)
/* Do early initialization of the fixmap area */
movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
movl %eax,pa(swapper_pg_dir+0xffc)
#endif
jmp 3f
/*
* Non-boot CPU entry point; entered from trampoline.S
@ -241,7 +294,7 @@ ENTRY(startup_32_smp)
* NOTE! We have to correct for the fact that we're
* not yet offset PAGE_OFFSET..
*/
#define cr4_bits mmu_cr4_features-__PAGE_OFFSET
#define cr4_bits pa(mmu_cr4_features)
movl cr4_bits,%edx
andl %edx,%edx
jz 6f
@ -276,10 +329,10 @@ ENTRY(startup_32_smp)
/*
* Enable paging
*/
movl $swapper_pg_dir-__PAGE_OFFSET,%eax
movl $pa(swapper_pg_dir),%eax
movl %eax,%cr3 /* set the page table pointer.. */
movl %cr0,%eax
orl $0x80000000,%eax
orl $X86_CR0_PG,%eax
movl %eax,%cr0 /* ..and set paging (PG) bit */
ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */
1:
@ -552,16 +605,44 @@ ENTRY(_stext)
*/
.section ".bss.page_aligned","wa"
.align PAGE_SIZE_asm
#ifdef CONFIG_X86_PAE
ENTRY(swapper_pg_pmd)
.fill 1024*KPMDS,4,0
#else
ENTRY(swapper_pg_dir)
.fill 1024,4,0
ENTRY(swapper_pg_pmd)
#endif
ENTRY(swapper_pg_fixmap)
.fill 1024,4,0
ENTRY(empty_zero_page)
.fill 4096,1,0
/*
* This starts the data section.
*/
#ifdef CONFIG_X86_PAE
.section ".data.page_aligned","wa"
/* Page-aligned for the benefit of paravirt? */
.align PAGE_SIZE_asm
ENTRY(swapper_pg_dir)
.long pa(swapper_pg_pmd+PGD_ATTR),0 /* low identity map */
# if KPMDS == 3
.long pa(swapper_pg_pmd+PGD_ATTR),0
.long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0
.long pa(swapper_pg_pmd+PGD_ATTR+0x2000),0
# elif KPMDS == 2
.long 0,0
.long pa(swapper_pg_pmd+PGD_ATTR),0
.long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0
# elif KPMDS == 1
.long 0,0
.long 0,0
.long pa(swapper_pg_pmd+PGD_ATTR),0
# else
# error "Kernel PMDs should be 1, 2 or 3"
# endif
.align PAGE_SIZE_asm /* needs to be page-sized too */
#endif
.data
ENTRY(stack_start)
.long init_thread_union+THREAD_SIZE

View File

@ -154,7 +154,11 @@ struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
EXPORT_SYMBOL(boot_cpu_data);
#ifndef CONFIG_X86_PAE
unsigned long mmu_cr4_features;
#else
unsigned long mmu_cr4_features = X86_CR4_PAE;
#endif
/* for MCA, but anyone else can use it if they want */
unsigned int machine_id;

View File

@ -46,6 +46,7 @@
#include <asm/pgalloc.h>
#include <asm/sections.h>
#include <asm/paravirt.h>
#include <asm/setup.h>
unsigned int __VMALLOC_RESERVE = 128 << 20;
@ -328,44 +329,38 @@ pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
void __init native_pagetable_setup_start(pgd_t *base)
{
#ifdef CONFIG_X86_PAE
int i;
unsigned long pfn, va;
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
/*
* Init entries of the first-level page table to the
* zero page, if they haven't already been set up.
*
* In a normal native boot, we'll be running on a
* pagetable rooted in swapper_pg_dir, but not in PAE
* mode, so this will end up clobbering the mappings
* for the lower 24Mbytes of the address space,
* without affecting the kernel address space.
* Remove any mappings which extend past the end of physical
* memory from the boot time page table:
*/
for (i = 0; i < USER_PTRS_PER_PGD; i++)
set_pgd(&base[i],
__pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
va = PAGE_OFFSET + (pfn<<PAGE_SHIFT);
pgd = base + pgd_index(va);
if (!pgd_present(*pgd))
break;
/* Make sure kernel address space is empty so that a pagetable
will be allocated for it. */
memset(&base[USER_PTRS_PER_PGD], 0,
KERNEL_PGD_PTRS * sizeof(pgd_t));
#else
pud = pud_offset(pgd, va);
pmd = pmd_offset(pud, va);
if (!pmd_present(*pmd))
break;
pte = pte_offset_kernel(pmd, va);
if (!pte_present(*pte))
break;
pte_clear(NULL, va, pte);
}
paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT);
#endif
}
void __init native_pagetable_setup_done(pgd_t *base)
{
#ifdef CONFIG_X86_PAE
/*
* Add low memory identity-mappings - SMP needs it when
* starting up on an AP from real-mode. In the non-PAE
* case we already have these mappings through head.S.
* All user-space mappings are explicitly cleared after
* SMP startup.
*/
set_pgd(&base[0], base[USER_PTRS_PER_PGD]);
#endif
}
/*
@ -374,9 +369,8 @@ void __init native_pagetable_setup_done(pgd_t *base)
* the boot process.
*
* If we're booting on native hardware, this will be a pagetable
* constructed in arch/i386/kernel/head.S, and not running in PAE mode
* (even if we'll end up running in PAE). The root of the pagetable
* will be swapper_pg_dir.
* constructed in arch/x86/kernel/head_32.S. The root of the
* pagetable will be swapper_pg_dir.
*
* If we're booting paravirtualized under a hypervisor, then there are
* more options: we may already be running PAE, and the pagetable may
@ -537,14 +531,6 @@ void __init paging_init(void)
load_cr3(swapper_pg_dir);
#ifdef CONFIG_X86_PAE
/*
* We will bail out later - printk doesn't work right now so
* the user would just see a hanging kernel.
*/
if (cpu_has_pae)
set_in_cr4(X86_CR4_PAE);
#endif
__flush_tlb_all();
kmap_init();
@ -675,10 +661,6 @@ void __init mem_init(void)
BUG_ON((unsigned long)high_memory > VMALLOC_START);
#endif /* double-sanity-check paranoia */
#ifdef CONFIG_X86_PAE
if (!cpu_has_pae)
panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
#endif
if (boot_cpu_data.wp_works_ok < 0)
test_wp_bit();

View File

@ -260,37 +260,42 @@ static int __init early_ioremap_debug_setup(char *str)
early_param("early_ioremap_debug", early_ioremap_debug_setup);
static __initdata int after_paging_init;
static __initdata unsigned long bm_pte[1024]
static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
__attribute__((aligned(PAGE_SIZE)));
static inline unsigned long * __init early_ioremap_pgd(unsigned long addr)
static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
{
return (unsigned long *)swapper_pg_dir + ((addr >> 22) & 1023);
pgd_t *pgd = &swapper_pg_dir[pgd_index(addr)];
pud_t *pud = pud_offset(pgd, addr);
pmd_t *pmd = pmd_offset(pud, addr);
return pmd;
}
static inline unsigned long * __init early_ioremap_pte(unsigned long addr)
static inline pte_t * __init early_ioremap_pte(unsigned long addr)
{
return bm_pte + ((addr >> PAGE_SHIFT) & 1023);
return &bm_pte[pte_index(addr)];
}
void __init early_ioremap_init(void)
{
unsigned long *pgd;
pmd_t *pmd;
if (early_ioremap_debug)
printk(KERN_INFO "early_ioremap_init()\n");
pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN));
*pgd = __pa(bm_pte) | _PAGE_TABLE;
pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
memset(bm_pte, 0, sizeof(bm_pte));
set_pmd(pmd, __pmd(__pa(bm_pte) | _PAGE_TABLE));
/*
* The boot-ioremap range spans multiple pgds, for which
* The boot-ioremap range spans multiple pmds, for which
* we are not prepared:
*/
if (pgd != early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END))) {
if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
WARN_ON(1);
printk(KERN_WARNING "pgd %p != %p\n",
pgd, early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END)));
printk(KERN_WARNING "pmd %p != %p\n",
pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END)));
printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
fix_to_virt(FIX_BTMAP_BEGIN));
printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n",
@ -304,28 +309,29 @@ void __init early_ioremap_init(void)
void __init early_ioremap_clear(void)
{
unsigned long *pgd;
pmd_t *pmd;
if (early_ioremap_debug)
printk(KERN_INFO "early_ioremap_clear()\n");
pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN));
*pgd = 0;
paravirt_release_pt(__pa(pgd) >> PAGE_SHIFT);
pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
pmd_clear(pmd);
paravirt_release_pt(__pa(pmd) >> PAGE_SHIFT);
__flush_tlb_all();
}
void __init early_ioremap_reset(void)
{
enum fixed_addresses idx;
unsigned long *pte, phys, addr;
unsigned long addr, phys;
pte_t *pte;
after_paging_init = 1;
for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
addr = fix_to_virt(idx);
pte = early_ioremap_pte(addr);
if (*pte & _PAGE_PRESENT) {
phys = *pte & PAGE_MASK;
if (pte_present(*pte)) {
phys = pte_val(*pte) & PAGE_MASK;
set_fixmap(idx, phys);
}
}
@ -334,7 +340,8 @@ void __init early_ioremap_reset(void)
static void __init __early_set_fixmap(enum fixed_addresses idx,
unsigned long phys, pgprot_t flags)
{
unsigned long *pte, addr = __fix_to_virt(idx);
unsigned long addr = __fix_to_virt(idx);
pte_t *pte;
if (idx >= __end_of_fixed_addresses) {
BUG();
@ -342,9 +349,9 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,
}
pte = early_ioremap_pte(addr);
if (pgprot_val(flags))
*pte = (phys & PAGE_MASK) | pgprot_val(flags);
set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
else
*pte = 0;
pte_clear(NULL, addr, pte);
__flush_tlb_one(addr);
}

View File

@ -48,7 +48,6 @@ typedef unsigned long pgprotval_t;
typedef unsigned long phys_addr_t;
typedef union { pteval_t pte, pte_low; } pte_t;
typedef pte_t boot_pte_t;
#endif /* __ASSEMBLY__ */
#endif /* CONFIG_X86_PAE */

View File

@ -52,10 +52,6 @@ void paging_init(void);
#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
#define TWOLEVEL_PGDIR_SHIFT 22
#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT)
#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS)
/* Just any arbitrary offset to the start of the vmalloc VM area: the
* current 8MB value just means that there will be a 8MB "hole" after the
* physical memory until the kernel virtual memory starts. That means that