2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
*
|
|
|
|
* Copyright (C) 1991, 1992 Linus Torvalds
|
|
|
|
*
|
|
|
|
* Enhanced CPU detection and feature setting code by Mike Jagdis
|
|
|
|
* and Martin Mares, November 1997.
|
|
|
|
*/
|
|
|
|
|
|
|
|
.text
|
|
|
|
#include <linux/threads.h>
|
2008-01-30 20:33:28 +08:00
|
|
|
#include <linux/init.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/linkage.h>
|
|
|
|
#include <asm/segment.h>
|
2009-02-14 03:14:01 +08:00
|
|
|
#include <asm/page_types.h>
|
|
|
|
#include <asm/pgtable_types.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <asm/cache.h>
|
|
|
|
#include <asm/thread_info.h>
|
2005-09-10 01:28:28 +08:00
|
|
|
#include <asm/asm-offsets.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <asm/setup.h>
|
2008-02-10 06:24:09 +08:00
|
|
|
#include <asm/processor-flags.h>
|
2009-11-14 07:28:13 +08:00
|
|
|
#include <asm/msr-index.h>
|
|
|
|
#include <asm/cpufeature.h>
|
2009-02-09 21:17:40 +08:00
|
|
|
#include <asm/percpu.h>
|
2008-02-10 06:24:09 +08:00
|
|
|
|
|
|
|
/* Physical address */
|
|
|
|
#define pa(X) ((X) - __PAGE_OFFSET)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* References to members of the new_cpu_data structure.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define X86 new_cpu_data+CPUINFO_x86
|
|
|
|
#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor
|
|
|
|
#define X86_MODEL new_cpu_data+CPUINFO_x86_model
|
|
|
|
#define X86_MASK new_cpu_data+CPUINFO_x86_mask
|
|
|
|
#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math
|
|
|
|
#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level
|
|
|
|
#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
|
|
|
|
#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
|
|
|
|
|
|
|
|
/*
|
2009-03-17 03:07:54 +08:00
|
|
|
* This is how much memory in addition to the memory covered up to
|
|
|
|
* and including _end we need mapped initially.
|
2007-05-03 01:27:16 +08:00
|
|
|
* We need:
|
2009-03-09 16:15:57 +08:00
|
|
|
* (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
|
|
|
|
* (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* Modulo rounding, each megabyte assigned here requires a kilobyte of
|
|
|
|
* memory, which is currently unreclaimed.
|
|
|
|
*
|
|
|
|
* This should be a multiple of a page.
|
2009-03-09 16:15:57 +08:00
|
|
|
*
|
|
|
|
* KERNEL_IMAGE_SIZE should be greater than pa(_end)
|
|
|
|
* and smaller than max_low_pfn, otherwise some page table entries are wasted
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
|
2007-05-03 01:27:16 +08:00
|
|
|
#if PTRS_PER_PMD > 1
|
2009-03-17 03:07:54 +08:00
|
|
|
#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
|
2007-05-03 01:27:16 +08:00
|
|
|
#else
|
2009-03-17 03:07:54 +08:00
|
|
|
#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
|
2007-05-03 01:27:16 +08:00
|
|
|
#endif
|
|
|
|
|
2009-03-17 03:07:54 +08:00
|
|
|
/* Enough space to fit pagetables for the low memory linear map */
|
2009-03-18 02:38:23 +08:00
|
|
|
MAPPING_BEYOND_END = \
|
|
|
|
PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
|
2009-03-17 03:07:54 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Worst-case size of the kernel mapping we need to make:
|
|
|
|
* the worst-case size of the kernel itself, plus the extra we need
|
|
|
|
* to map for the linear map.
|
|
|
|
*/
|
|
|
|
KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT
|
|
|
|
|
2009-03-17 03:10:07 +08:00
|
|
|
INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm
|
2009-03-09 16:15:57 +08:00
|
|
|
RESERVE_BRK(pagetables, INIT_MAP_SIZE)
|
2009-03-13 07:09:49 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* 32-bit kernel entrypoint; only used by the boot CPU. On entry,
|
|
|
|
* %esi points to the real-mode code as a 32-bit pointer.
|
|
|
|
* CS and DS must be 4 GB flat segments, but we don't depend on
|
|
|
|
* any particular GDT layout, because we load our own as soon as we
|
|
|
|
* can.
|
|
|
|
*/
|
2009-09-17 04:44:28 +08:00
|
|
|
__HEAD
|
2005-04-17 06:20:36 +08:00
|
|
|
ENTRY(startup_32)
|
2007-10-22 07:41:35 +08:00
|
|
|
/* test KEEP_SEGMENTS flag to see if the bootloader is asking
|
|
|
|
us to not reload segments */
|
|
|
|
testb $(1<<6), BP_loadflags(%esi)
|
|
|
|
jnz 2f
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Set segments to known values.
|
|
|
|
*/
|
2008-02-10 06:24:09 +08:00
|
|
|
lgdt pa(boot_gdt_descr)
|
2005-04-17 06:20:36 +08:00
|
|
|
movl $(__BOOT_DS),%eax
|
|
|
|
movl %eax,%ds
|
|
|
|
movl %eax,%es
|
|
|
|
movl %eax,%fs
|
|
|
|
movl %eax,%gs
|
2007-10-22 07:41:35 +08:00
|
|
|
2:
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Clear BSS first so that there are no surprises...
|
|
|
|
*/
|
2007-10-22 07:41:35 +08:00
|
|
|
cld
|
2005-04-17 06:20:36 +08:00
|
|
|
xorl %eax,%eax
|
2008-02-10 06:24:09 +08:00
|
|
|
movl $pa(__bss_start),%edi
|
|
|
|
movl $pa(__bss_stop),%ecx
|
2005-04-17 06:20:36 +08:00
|
|
|
subl %edi,%ecx
|
|
|
|
shrl $2,%ecx
|
|
|
|
rep ; stosl
|
2005-09-04 06:56:31 +08:00
|
|
|
/*
|
|
|
|
* Copy bootup parameters out of the way.
|
|
|
|
* Note: %esi still has the pointer to the real-mode data.
|
|
|
|
* With the kexec as boot loader, parameter segment might be loaded beyond
|
|
|
|
* kernel image and might not even be addressable by early boot page tables.
|
|
|
|
* (kexec on panic case). Hence copy out the parameters before initializing
|
|
|
|
* page tables.
*
* Two copies are made with rep movsl, both to physical (pa()) addresses
* because paging is not enabled yet: PARAM_SIZE/4 dwords into boot_params,
* then, if the command-line pointer read back from the copy is non-zero,
* COMMAND_LINE_SIZE/4 dwords into boot_command_line.
* Clobbers %esi, %edi, %ecx and the flags.
|
|
|
|
*/
|
2008-02-10 06:24:09 +08:00
|
|
|
movl $pa(boot_params),%edi /* destination: physical address of boot_params */
|
2005-09-04 06:56:31 +08:00
|
|
|
movl $(PARAM_SIZE/4),%ecx /* length in 32-bit words */
|
|
|
|
cld /* string ops must count upwards */
|
|
|
|
rep
|
|
|
|
movsl /* copy from (%esi) to (%edi); %esi is consumed here */
|
2008-02-10 06:24:09 +08:00
|
|
|
movl pa(boot_params) + NEW_CL_POINTER,%esi /* reload %esi from the NEW_CL_POINTER field of the copy */
|
2005-09-04 06:56:31 +08:00
|
|
|
andl %esi,%esi /* value unchanged; sets ZF when the pointer is zero */
|
2007-10-24 04:37:25 +08:00
|
|
|
jz 1f # No command line
|
2008-02-10 06:24:09 +08:00
|
|
|
movl $pa(boot_command_line),%edi /* destination: physical address of boot_command_line */
|
2005-09-04 06:56:31 +08:00
|
|
|
movl $(COMMAND_LINE_SIZE/4),%ecx /* always copies the full buffer size, in 32-bit words */
|
|
|
|
rep
|
|
|
|
movsl
|
|
|
|
1:
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-06-19 05:46:53 +08:00
|
|
|
#ifdef CONFIG_OLPC_OPENFIRMWARE
|
|
|
|
/* save OFW's pgdir table for later use when calling into OFW */
|
|
|
|
movl %cr3, %eax
|
|
|
|
movl %eax, pa(olpc_ofw_pgd)
|
|
|
|
#endif
|
|
|
|
|
2007-10-22 07:41:35 +08:00
|
|
|
#ifdef CONFIG_PARAVIRT
|
2008-02-10 06:24:09 +08:00
|
|
|
/* This can only trip for a broken bootloader... */
|
|
|
|
cmpw $0x207, pa(boot_params + BP_version)
|
2007-10-22 07:41:35 +08:00
|
|
|
jb default_entry
|
|
|
|
|
|
|
|
/* Paravirt-compatible boot parameters. Look to see what architecture
|
|
|
|
we're booting under. */
|
2008-02-10 06:24:09 +08:00
|
|
|
movl pa(boot_params + BP_hardware_subarch), %eax
|
2007-10-22 07:41:35 +08:00
|
|
|
cmpl $num_subarch_entries, %eax
|
|
|
|
jae bad_subarch
|
|
|
|
|
2008-02-10 06:24:09 +08:00
|
|
|
movl pa(subarch_entries)(,%eax,4), %eax
|
2007-10-22 07:41:35 +08:00
|
|
|
subl $__PAGE_OFFSET, %eax
|
|
|
|
jmp *%eax
|
|
|
|
|
|
|
|
bad_subarch:
|
|
|
|
WEAK(lguest_entry)
|
|
|
|
WEAK(xen_entry)
|
|
|
|
/* Unknown implementation; there's really
|
|
|
|
nothing we can do at this point. */
|
|
|
|
ud2a
|
2008-01-30 20:33:28 +08:00
|
|
|
|
|
|
|
__INITDATA
|
|
|
|
|
2007-10-22 07:41:35 +08:00
|
|
|
subarch_entries:
|
|
|
|
.long default_entry /* normal x86/PC */
|
|
|
|
.long lguest_entry /* lguest hypervisor */
|
|
|
|
.long xen_entry /* Xen hypervisor */
|
2009-08-29 05:52:47 +08:00
|
|
|
.long default_entry /* Moorestown MID */
|
2007-10-22 07:41:35 +08:00
|
|
|
num_subarch_entries = (. - subarch_entries) / 4
|
|
|
|
.previous
|
|
|
|
#endif /* CONFIG_PARAVIRT */
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Initialize page tables. This creates a PDE and a set of page
|
2009-03-09 16:15:57 +08:00
|
|
|
* tables, which are located immediately beyond __brk_base. The variable
|
2009-02-28 05:27:38 +08:00
|
|
|
* _brk_end is set up to point to the first "safe" location.
|
2005-04-17 06:20:36 +08:00
|
|
|
* Mappings are created both at virtual address 0 (identity mapping)
|
2009-03-09 16:15:57 +08:00
|
|
|
* and PAGE_OFFSET for up to _end.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
2008-02-10 06:24:09 +08:00
|
|
|
* Note that the stack is not yet set up!
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2007-10-22 07:41:35 +08:00
|
|
|
default_entry:
|
2008-02-10 06:24:09 +08:00
|
|
|
#ifdef CONFIG_X86_PAE
|
|
|
|
|
|
|
|
/*
|
|
|
|
* In PAE mode swapper_pg_dir is statically defined to contain enough
|
|
|
|
* entries to cover the VMSPLIT option (that is the top 1, 2 or 3
|
|
|
|
* entries). The identity mapping is handled by pointing two PGD
|
|
|
|
* entries to the first kernel PMD.
|
|
|
|
*
|
|
|
|
* Note the upper half of each PMD or PTE are always zero at
|
|
|
|
* this stage.
|
|
|
|
*/
|
|
|
|
|
2008-06-03 05:21:06 +08:00
|
|
|
#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
|
2008-02-10 06:24:09 +08:00
|
|
|
|
|
|
|
xorl %ebx,%ebx /* %ebx is kept at zero */
|
|
|
|
|
2009-02-28 05:27:38 +08:00
|
|
|
movl $pa(__brk_base), %edi
|
2008-02-10 06:24:09 +08:00
|
|
|
movl $pa(swapper_pg_pmd), %edx
|
2008-09-24 05:00:36 +08:00
|
|
|
movl $PTE_IDENT_ATTR, %eax
|
2008-02-10 06:24:09 +08:00
|
|
|
10:
|
2008-09-24 05:00:36 +08:00
|
|
|
leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
|
2008-02-10 06:24:09 +08:00
|
|
|
movl %ecx,(%edx) /* Store PMD entry */
|
|
|
|
/* Upper half already zero */
|
|
|
|
addl $8,%edx
|
|
|
|
movl $512,%ecx
|
|
|
|
11:
|
|
|
|
stosl
|
|
|
|
xchgl %eax,%ebx
|
|
|
|
stosl
|
|
|
|
xchgl %eax,%ebx
|
|
|
|
addl $0x1000,%eax
|
|
|
|
loop 11b
|
|
|
|
|
|
|
|
/*
|
2009-03-17 03:07:54 +08:00
|
|
|
* End condition: we must map up to the end + MAPPING_BEYOND_END.
|
2008-02-10 06:24:09 +08:00
|
|
|
*/
|
2009-03-17 03:07:54 +08:00
|
|
|
movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
|
2008-02-10 06:24:09 +08:00
|
|
|
cmpl %ebp,%eax
|
|
|
|
jb 10b
|
|
|
|
1:
|
2009-02-28 05:27:38 +08:00
|
|
|
addl $__PAGE_OFFSET, %edi
|
|
|
|
movl %edi, pa(_brk_end)
|
2008-06-02 14:53:50 +08:00
|
|
|
shrl $12, %eax
|
|
|
|
movl %eax, pa(max_pfn_mapped)
|
2008-02-10 06:24:09 +08:00
|
|
|
|
|
|
|
/* Do early initialization of the fixmap area */
|
2008-09-24 05:00:36 +08:00
|
|
|
movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
|
2008-02-10 06:24:09 +08:00
|
|
|
movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
|
|
|
|
#else /* Not PAE */
|
|
|
|
|
|
|
|
page_pde_offset = (__PAGE_OFFSET >> 20);
|
|
|
|
|
2009-02-28 05:27:38 +08:00
|
|
|
movl $pa(__brk_base), %edi
|
2008-02-10 06:24:09 +08:00
|
|
|
movl $pa(swapper_pg_dir), %edx
|
2008-09-24 05:00:36 +08:00
|
|
|
movl $PTE_IDENT_ATTR, %eax
|
2005-04-17 06:20:36 +08:00
|
|
|
10:
|
2008-09-24 05:00:36 +08:00
|
|
|
leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
|
2005-04-17 06:20:36 +08:00
|
|
|
movl %ecx,(%edx) /* Store identity PDE entry */
|
|
|
|
movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
|
|
|
|
addl $4,%edx
|
|
|
|
movl $1024, %ecx
|
|
|
|
11:
|
|
|
|
stosl
|
|
|
|
addl $0x1000,%eax
|
|
|
|
loop 11b
|
2008-02-10 06:24:09 +08:00
|
|
|
/*
|
2009-03-17 03:07:54 +08:00
|
|
|
* End condition: we must map up to the end + MAPPING_BEYOND_END.
|
2008-02-10 06:24:09 +08:00
|
|
|
*/
|
2009-03-17 03:07:54 +08:00
|
|
|
movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
|
2005-04-17 06:20:36 +08:00
|
|
|
cmpl %ebp,%eax
|
|
|
|
jb 10b
|
2009-02-28 05:27:38 +08:00
|
|
|
addl $__PAGE_OFFSET, %edi
|
|
|
|
movl %edi, pa(_brk_end)
|
2008-06-02 14:53:50 +08:00
|
|
|
shrl $12, %eax
|
|
|
|
movl %eax, pa(max_pfn_mapped)
|
2007-12-02 09:34:06 +08:00
|
|
|
|
2008-02-10 06:24:09 +08:00
|
|
|
/* Do early initialization of the fixmap area */
|
2008-09-24 05:00:36 +08:00
|
|
|
movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
|
2008-02-10 06:24:09 +08:00
|
|
|
movl %eax,pa(swapper_pg_dir+0xffc)
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
jmp 3f
|
|
|
|
/*
|
|
|
|
* Non-boot CPU entry point; entered from trampoline.S
|
|
|
|
* We can't lgdt here, because lgdt itself uses a data segment, but
|
2007-05-03 01:27:10 +08:00
|
|
|
* we know the trampoline has already loaded the boot_gdt for us.
|
2007-02-13 20:26:22 +08:00
|
|
|
*
|
|
|
|
* If cpu hotplug is not supported then this code can go in init section
|
|
|
|
* which will be freed later
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2007-02-13 20:26:22 +08:00
|
|
|
|
2009-08-18 23:41:33 +08:00
|
|
|
__CPUINIT
|
2007-02-13 20:26:22 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_SMP
|
2005-04-17 06:20:36 +08:00
|
|
|
ENTRY(startup_32_smp)
|
|
|
|
cld
|
|
|
|
movl $(__BOOT_DS),%eax
|
|
|
|
movl %eax,%ds
|
|
|
|
movl %eax,%es
|
|
|
|
movl %eax,%fs
|
|
|
|
movl %eax,%gs
|
2008-01-30 20:33:27 +08:00
|
|
|
#endif /* CONFIG_SMP */
|
|
|
|
3:
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* New page tables may be in 4Mbyte page mode and may
|
|
|
|
* be using the global pages.
|
|
|
|
*
|
|
|
|
* NOTE! If we are on a 486 we may have no cr4 at all!
|
|
|
|
* So we do not try to touch it unless we really have
|
|
|
|
* some bits in it to set. This won't work if the BSP
|
|
|
|
* implements cr4 but this AP does not -- very unlikely
|
|
|
|
* but be warned! The same applies to the pse feature
|
|
|
|
* if not equally supported. --macro
|
|
|
|
*
|
|
|
|
* NOTE! We have to correct for the fact that we're
|
|
|
|
* not yet offset PAGE_OFFSET..
|
|
|
|
*/
|
2008-02-10 06:24:09 +08:00
|
|
|
#define cr4_bits pa(mmu_cr4_features)
|
2005-04-17 06:20:36 +08:00
|
|
|
movl cr4_bits,%edx
|
|
|
|
andl %edx,%edx
|
|
|
|
jz 6f
|
|
|
|
movl %cr4,%eax # Turn on paging options (PSE,PAE,..)
|
|
|
|
orl %edx,%eax
|
|
|
|
movl %eax,%cr4
|
|
|
|
|
2009-11-14 07:28:13 +08:00
|
|
|
testb $X86_CR4_PAE, %al # check if PAE is enabled
|
|
|
|
jz 6f
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Check if extended functions are implemented */
|
|
|
|
movl $0x80000000, %eax
|
|
|
|
cpuid
|
2009-11-14 07:28:13 +08:00
|
|
|
/* Value must be in the range 0x80000001 to 0x8000ffff */
|
|
|
|
subl $0x80000001, %eax
|
|
|
|
cmpl $(0x8000ffff-0x80000001), %eax
|
|
|
|
ja 6f
|
2005-04-17 06:20:36 +08:00
|
|
|
mov $0x80000001, %eax
|
|
|
|
cpuid
|
|
|
|
/* Execute Disable bit supported? */
|
2009-11-14 07:28:13 +08:00
|
|
|
btl $(X86_FEATURE_NX & 31), %edx
|
2005-04-17 06:20:36 +08:00
|
|
|
jnc 6f
|
|
|
|
|
|
|
|
/* Setup EFER (Extended Feature Enable Register) */
|
2009-11-14 07:28:13 +08:00
|
|
|
movl $MSR_EFER, %ecx
|
2005-04-17 06:20:36 +08:00
|
|
|
rdmsr
|
|
|
|
|
2009-11-14 07:28:13 +08:00
|
|
|
btsl $_EFER_NX, %eax
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Make changes effective */
|
|
|
|
wrmsr
|
|
|
|
|
|
|
|
6:
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Enable paging
|
|
|
|
*/
|
x86-32: Separate 1:1 pagetables from swapper_pg_dir
This patch fixes machine crashes which occur when heavily exercising the
CPU hotplug codepaths on a 32-bit kernel. These crashes are caused by
AMD Erratum 383 and result in a fatal machine check exception. Here's
the scenario:
1. On 32-bit, the swapper_pg_dir page table is used as the initial page
table for booting a secondary CPU.
2. To make this work, swapper_pg_dir needs a direct mapping of physical
memory in it (the low mappings). By adding those low, large page (2M)
mappings (PAE kernel), we create the necessary conditions for Erratum
383 to occur.
3. Other CPUs which do not participate in the off- and onlining game may
use swapper_pg_dir while the low mappings are present (when leave_mm is
called). For all steps below, the CPU referred to is a CPU that is using
swapper_pg_dir, and not the CPU which is being onlined.
4. The presence of the low mappings in swapper_pg_dir can result
in TLB entries for addresses below __PAGE_OFFSET to be established
speculatively. These TLB entries are marked global and large.
5. When the CPU with such TLB entry switches to another page table, this
TLB entry remains because it is global.
6. The process then generates an access to an address covered by the
above TLB entry but there is a permission mismatch - the TLB entry
covers a large global page not accessible to userspace.
7. Due to this permission mismatch a new 4kb, user TLB entry gets
established. Further, Erratum 383 provides for a small window of time
where both TLB entries are present. This results in an uncorrectable
machine check exception signalling a TLB multimatch which panics the
machine.
There are two ways to fix this issue:
1. Always do a global TLB flush when a new cr3 is loaded and the
old page table was swapper_pg_dir. I consider this a hack hard
to understand and with performance implications
2. Do not use swapper_pg_dir to boot secondary CPUs like 64-bit
does.
This patch implements solution 2. It introduces a trampoline_pg_dir
which has the same layout as swapper_pg_dir with low_mappings. This page
table is used as the initial page table of the booting CPU. Later in the
bringup process, it switches to swapper_pg_dir and does a global TLB
flush. This fixes the crashes in our test cases.
-v2: switch to swapper_pg_dir right after entering start_secondary() so
that we are able to access percpu data which might not be mapped in the
trampoline page table.
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
LKML-Reference: <20100816123833.GB28147@aftab>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-08-16 20:38:33 +08:00
|
|
|
movl pa(initial_page_table), %eax
|
2005-04-17 06:20:36 +08:00
|
|
|
movl %eax,%cr3 /* set the page table pointer.. */
|
|
|
|
movl %cr0,%eax
|
2008-02-10 06:24:09 +08:00
|
|
|
orl $X86_CR0_PG,%eax
|
2005-04-17 06:20:36 +08:00
|
|
|
movl %eax,%cr0 /* ..and set paging (PG) bit */
|
|
|
|
ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */
|
|
|
|
1:
|
|
|
|
/* Set up the stack pointer */
|
|
|
|
lss stack_start,%esp
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialize eflags. Some BIOS's leave bits like NT set. This would
|
|
|
|
* confuse the debugger if this code is traced.
|
|
|
|
* XXX - best to initialize before switching to protected mode.
|
|
|
|
*/
|
|
|
|
pushl $0
|
|
|
|
popfl
|
|
|
|
|
|
|
|
#ifdef CONFIG_SMP
|
2008-01-30 20:33:27 +08:00
|
|
|
cmpb $0, ready
|
2005-04-17 06:20:36 +08:00
|
|
|
jz 1f /* Initial CPU cleans BSS */
|
|
|
|
jmp checkCPUtype
|
|
|
|
1:
|
|
|
|
#endif /* CONFIG_SMP */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* start system 32-bit setup. We need to re-do some of the things done
|
|
|
|
* in 16-bit mode for the "real" operations.
|
|
|
|
*/
|
|
|
|
call setup_idt
|
|
|
|
|
|
|
|
checkCPUtype:
|
|
|
|
|
|
|
|
movl $-1,X86_CPUID # -1 for no CPUID initially
|
|
|
|
|
|
|
|
/* check if it is 486 or 386. */
|
|
|
|
/*
|
|
|
|
* XXX - this does a lot of unnecessary setup. Alignment checks don't
|
|
|
|
* apply at our cpl of 0 and the stack ought to be aligned already, and
|
|
|
|
* we don't need to preserve eflags.
|
|
|
|
*/
|
|
|
|
|
|
|
|
movb $3,X86 # at least 386
|
|
|
|
pushfl # push EFLAGS
|
|
|
|
popl %eax # get EFLAGS
|
|
|
|
movl %eax,%ecx # save original EFLAGS
|
|
|
|
xorl $0x240000,%eax # flip AC and ID bits in EFLAGS
|
|
|
|
pushl %eax # copy to EFLAGS
|
|
|
|
popfl # set EFLAGS
|
|
|
|
pushfl # get new EFLAGS
|
|
|
|
popl %eax # put it in eax
|
|
|
|
xorl %ecx,%eax # change in flags
|
|
|
|
pushl %ecx # restore original EFLAGS
|
|
|
|
popfl
|
|
|
|
testl $0x40000,%eax # check if AC bit changed
|
|
|
|
je is386
|
|
|
|
|
|
|
|
movb $4,X86 # at least 486
|
|
|
|
testl $0x200000,%eax # check if ID bit changed
|
|
|
|
je is486
|
|
|
|
|
|
|
|
/* get vendor info */
|
|
|
|
xorl %eax,%eax # call CPUID with 0 -> return vendor ID
|
|
|
|
cpuid
|
|
|
|
movl %eax,X86_CPUID # save CPUID level
|
|
|
|
movl %ebx,X86_VENDOR_ID # lo 4 chars
|
|
|
|
movl %edx,X86_VENDOR_ID+4 # next 4 chars
|
|
|
|
movl %ecx,X86_VENDOR_ID+8 # last 4 chars
|
|
|
|
|
|
|
|
orl %eax,%eax # do we have processor info as well?
|
|
|
|
je is486
|
|
|
|
|
|
|
|
movl $1,%eax # Use the CPUID instruction to get CPU type
|
|
|
|
cpuid
|
|
|
|
movb %al,%cl # save reg for future use
|
|
|
|
andb $0x0f,%ah # mask processor family
|
|
|
|
movb %ah,X86
|
|
|
|
andb $0xf0,%al # mask model
|
|
|
|
shrb $4,%al
|
|
|
|
movb %al,X86_MODEL
|
|
|
|
andb $0x0f,%cl # mask mask revision
|
|
|
|
movb %cl,X86_MASK
|
|
|
|
movl %edx,X86_CAPABILITY
|
|
|
|
|
|
|
|
is486: movl $0x50022,%ecx # set AM, WP, NE and MP
|
|
|
|
jmp 2f
|
|
|
|
|
|
|
|
is386: movl $2,%ecx # set MP
|
|
|
|
2: movl %cr0,%eax
|
|
|
|
andl $0x80000011,%eax # Save PG,PE,ET
|
|
|
|
orl %ecx,%eax
|
|
|
|
movl %eax,%cr0
|
|
|
|
|
|
|
|
call check_x87
|
2007-02-13 20:26:26 +08:00
|
|
|
lgdt early_gdt_descr
|
2005-04-17 06:20:36 +08:00
|
|
|
lidt idt_descr
|
|
|
|
ljmp $(__KERNEL_CS),$1f
|
|
|
|
1: movl $(__KERNEL_DS),%eax # reload all the segment registers
|
|
|
|
movl %eax,%ss # after changing gdt.
|
|
|
|
|
|
|
|
movl $(__USER_DS),%eax # DS/ES contains default USER segment
|
|
|
|
movl %eax,%ds
|
|
|
|
movl %eax,%es
|
|
|
|
|
2009-01-21 16:26:05 +08:00
|
|
|
movl $(__KERNEL_PERCPU), %eax
|
|
|
|
movl %eax,%fs # set this cpu's percpu
|
|
|
|
|
2009-02-09 21:17:40 +08:00
|
|
|
#ifdef CONFIG_CC_STACKPROTECTOR
|
|
|
|
/*
|
|
|
|
* The linker can't handle this by relocation. Manually set
|
|
|
|
* base address in stack canary segment descriptor.
|
|
|
|
*/
|
|
|
|
cmpb $0,ready
|
|
|
|
jne 1f
|
2009-10-29 21:34:15 +08:00
|
|
|
movl $gdt_page,%eax
|
|
|
|
movl $stack_canary,%ecx
|
2009-02-09 21:17:40 +08:00
|
|
|
movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
|
|
|
|
shrl $16, %ecx
|
|
|
|
movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
|
|
|
|
movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
|
|
|
|
1:
|
|
|
|
#endif
|
|
|
|
movl $(__KERNEL_STACK_CANARY),%eax
|
2007-02-13 20:26:20 +08:00
|
|
|
movl %eax,%gs
|
2009-02-09 21:17:40 +08:00
|
|
|
|
|
|
|
xorl %eax,%eax # Clear LDT
|
2005-04-17 06:20:36 +08:00
|
|
|
lldt %ax
|
[PATCH] i386: Use %gs as the PDA base-segment in the kernel
This patch is the meat of the PDA change. This patch makes several related
changes:
1: Most significantly, %gs is now used in the kernel. This means that on
entry, the old value of %gs is saved away, and it is reloaded with
__KERNEL_PDA.
2: entry.S constructs the stack in the shape of struct pt_regs, and this
is passed around the kernel so that the process's saved register
state can be accessed.
Unfortunately struct pt_regs doesn't currently have space for %gs
(or %fs). This patch extends pt_regs to add space for gs (no space
is allocated for %fs, since it won't be used, and it would just
complicate the code in entry.S to work around the space).
3: Because %gs is now saved on the stack like %ds, %es and the integer
registers, there are a number of places where it no longer needs to
be handled specially; namely context switch, and saving/restoring the
register state in a signal context.
4: And since kernel threads run in kernel space and call normal kernel
code, they need to be created with their %gs == __KERNEL_PDA.
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Chuck Ebbert <76306.1226@compuserve.com>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
2006-12-07 09:14:02 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
cld # gcc2 wants the direction flag cleared at all times
|
2006-10-22 00:37:02 +08:00
|
|
|
pushl $0 # fake return address for unwinder
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifdef CONFIG_SMP
|
2005-06-26 05:54:49 +08:00
|
|
|
movb ready, %cl
|
|
|
|
movb $1, ready
|
2006-08-31 01:37:09 +08:00
|
|
|
cmpb $0,%cl # the first CPU calls start_kernel
|
2007-05-03 01:27:16 +08:00
|
|
|
je 1f
|
2008-05-29 00:01:54 +08:00
|
|
|
movl (stack_start), %esp
|
2007-05-03 01:27:16 +08:00
|
|
|
1:
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif /* CONFIG_SMP */
|
2008-05-28 23:57:02 +08:00
|
|
|
jmp *(initial_code)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We depend on ET to be correct. This checks for 287/387.
*
* check_x87: probe for an x87 coprocessor and record the result.
*
* Clears X86_HARD_MATH, then executes clts / fninit / fstsw %ax. If the
* low byte of %ax reads back as zero, a coprocessor answered: X86_HARD_MATH
* is set to 1 and an fsetpm opcode is emitted. Otherwise CR0.EM is set and
* X86_HARD_MATH stays 0.
*
* Clobbers: %eax, flags, CR0 (TS cleared by clts, EM possibly set).
*
* NOTE(review): the "no coprocessor" path presumably relies on fstsw
* leaving %ax untouched and on %al being non-zero on entry; the caller
* visible in this file enters with %eax holding the CR0 value it just
* wrote, which has MP set - confirm before adding other callers.
|
|
|
|
*/
|
|
|
|
check_x87:
|
|
|
|
movb $0,X86_HARD_MATH /* assume no FPU until proven otherwise */
|
|
|
|
clts
|
|
|
|
fninit
|
|
|
|
fstsw %ax
|
|
|
|
cmpb $0,%al /* low status byte zero => an FPU responded */
|
|
|
|
je 1f
|
|
|
|
movl %cr0,%eax /* no coprocessor: have to set bits */
|
|
|
|
orl $X86_CR0_EM,%eax /* set EM; or (not xor) so the bit can never be cleared here */
|
|
|
|
movl %eax,%cr0
|
|
|
|
ret
|
|
|
|
ALIGN
|
|
|
|
1: movb $1,X86_HARD_MATH
|
|
|
|
.byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */
|
|
|
|
ret
|
|
|
|
|
|
|
|
/*
|
|
|
|
* setup_idt
|
|
|
|
*
|
|
|
|
* sets up an idt with 256 entries pointing to
|
|
|
|
* ignore_int, interrupt gates. It doesn't actually load
|
|
|
|
* idt - that can be done only after paging has been enabled
|
|
|
|
* and the kernel moved to PAGE_OFFSET. Interrupts
|
|
|
|
* are enabled elsewhere, when we can be relatively
|
|
|
|
* sure everything is ok.
|
|
|
|
*
* After the fill loop, the entries for vectors 0, 6, 13 and 14 are
* overwritten with the early_* handlers via set_early_handler.
*
* Each 8-byte entry is written as two dwords:
*   low  dword (%eax) = (__KERNEL_CS << 16) | handler address bits 15..0
*   high dword (%edx) = handler address bits 31..16 << 16 | 0x8E00
*
* Clobbers: %eax, %ecx, %edx, %edi, flags.
*
|
|
|
|
* Warning: %esi is live across this function.
|
|
|
|
*/
|
|
|
|
setup_idt:
|
|
|
|
lea ignore_int,%edx
|
|
|
|
movl $(__KERNEL_CS << 16),%eax
|
|
|
|
movw %dx,%ax /* low half = handler bits 15..0; selector stays in the high half */
|
|
|
|
movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
|
|
|
|
|
|
|
|
lea idt_table,%edi
|
|
|
|
mov $256,%ecx /* number of entries to fill */
|
|
|
|
rp_sidt:
|
|
|
|
movl %eax,(%edi)
|
|
|
|
movl %edx,4(%edi)
|
|
|
|
addl $8,%edi /* advance to the next 8-byte entry */
|
|
|
|
dec %ecx
|
|
|
|
jne rp_sidt
|
2006-09-26 16:52:39 +08:00
|
|
|
|
|
|
|
.macro set_early_handler handler,trapno /* install \handler as the gate for vector \trapno; clobbers %eax, %edx, %edi */
|
|
|
|
lea \handler,%edx
|
|
|
|
movl $(__KERNEL_CS << 16),%eax
|
|
|
|
movw %dx,%ax /* same gate encoding as the fill loop above */
|
|
|
|
movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
|
|
|
|
lea idt_table,%edi
|
|
|
|
movl %eax,8*\trapno(%edi)
|
|
|
|
movl %edx,8*\trapno+4(%edi)
|
|
|
|
.endm
|
|
|
|
|
|
|
|
set_early_handler handler=early_divide_err,trapno=0
|
|
|
|
set_early_handler handler=early_illegal_opcode,trapno=6
|
|
|
|
set_early_handler handler=early_protection_fault,trapno=13
|
|
|
|
set_early_handler handler=early_page_fault,trapno=14
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
ret
|
|
|
|
|
2006-09-26 16:52:39 +08:00
|
|
|
/*
 * Early exception stubs installed by set_early_handler for vectors
 * 0, 6, 13 and 14. Each stub loads its vector number into %edx and jumps
 * to early_fault; the divide-error and illegal-opcode stubs first push a
 * zero as a fake error code. The protection-fault and page-fault stubs
 * push nothing, presumably because the CPU supplies an error code for
 * those vectors.
 *
 * early_fault never returns: it optionally reports the fault through
 * printk (CONFIG_PRINTK), calls dump_stack and then halts forever in
 * hlt_loop. early_recursion_flag limits reporting: once it has reached 2
 * the reporting path is skipped and the CPU goes straight to hlt_loop.
 */
early_divide_err:
|
|
|
|
xor %edx,%edx /* trapno = 0 */
|
|
|
|
pushl $0 /* fake errcode */
|
|
|
|
jmp early_fault
|
|
|
|
|
|
|
|
early_illegal_opcode:
|
|
|
|
movl $6,%edx /* trapno = 6 */
|
|
|
|
pushl $0 /* fake errcode */
|
|
|
|
jmp early_fault
|
|
|
|
|
|
|
|
early_protection_fault:
|
|
|
|
movl $13,%edx /* trapno = 13 */
|
|
|
|
jmp early_fault
|
|
|
|
|
|
|
|
early_page_fault:
|
|
|
|
movl $14,%edx /* trapno = 14 */
|
|
|
|
jmp early_fault
|
|
|
|
|
|
|
|
early_fault:
|
|
|
|
cld
|
|
|
|
#ifdef CONFIG_PRINTK
|
2007-10-18 00:04:41 +08:00
|
|
|
pusha
|
2006-09-26 16:52:39 +08:00
|
|
|
movl $(__KERNEL_DS),%eax
|
|
|
|
movl %eax,%ds
|
|
|
|
movl %eax,%es
|
|
|
|
cmpl $2,early_recursion_flag /* already reported twice? then just halt */
|
|
|
|
je hlt_loop
|
|
|
|
incl early_recursion_flag
|
|
|
|
movl %cr2,%eax /* pushed for every trap number, not only page faults */
|
|
|
|
pushl %eax
|
|
|
|
pushl %edx /* trapno */
|
|
|
|
pushl $fault_msg
|
|
|
|
call printk
|
|
|
|
#endif
|
2008-01-30 20:33:09 +08:00
|
|
|
call dump_stack
|
2006-09-26 16:52:39 +08:00
|
|
|
hlt_loop:
|
|
|
|
hlt
|
|
|
|
jmp hlt_loop /* hlt can resume; loop so we never fall through */
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * This is the default interrupt "handler" :-)
 *
 * ignore_int is the gate target setup_idt stores into all 256 IDT entries.
 * With CONFIG_PRINTK it saves %eax, %ecx, %edx, %es and %ds, switches
 * %ds/%es to __KERNEL_DS, prints int_msg with four dwords taken from the
 * stack, calls dump_stack, restores the saved registers and returns with
 * iret. Without CONFIG_PRINTK it is just cld + iret.
 *
 * early_recursion_flag is shared with early_fault: once it has reached 2
 * the handler jumps to hlt_loop and never returns.
 */
|
|
|
|
ALIGN
|
|
|
|
ignore_int:
|
|
|
|
cld
|
2005-05-01 23:59:02 +08:00
|
|
|
#ifdef CONFIG_PRINTK
|
2005-04-17 06:20:36 +08:00
|
|
|
pushl %eax
|
|
|
|
pushl %ecx
|
|
|
|
pushl %edx
|
|
|
|
pushl %es
|
|
|
|
pushl %ds
|
|
|
|
movl $(__KERNEL_DS),%eax
|
|
|
|
movl %eax,%ds
|
|
|
|
movl %eax,%es
|
2006-09-26 16:52:39 +08:00
|
|
|
cmpl $2,early_recursion_flag
|
|
|
|
je hlt_loop /* stack is left unbalanced, but hlt_loop never returns */
|
|
|
|
incl early_recursion_flag
|
2005-04-17 06:20:36 +08:00
|
|
|
pushl 16(%esp) /* the %eax saved above (ds, es, edx, ecx sit below it) */
|
|
|
|
pushl 24(%esp) /* each push moves %esp by 4, so these three fetch the */
|
|
|
|
pushl 32(%esp) /* dwords right above the saved %eax: the frame present on */
|
|
|
|
pushl 40(%esp) /* entry - presumably EIP, CS, EFLAGS; TODO confirm */
|
|
|
|
pushl $int_msg
|
|
|
|
call printk
|
2009-01-26 13:09:00 +08:00
|
|
|
|
|
|
|
|
|
|
|
call dump_stack
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
addl $(5*4),%esp /* drop the five printk arguments */
|
|
|
|
popl %ds
|
|
|
|
popl %es
|
|
|
|
popl %edx
|
|
|
|
popl %ecx
|
|
|
|
popl %eax
|
2005-05-01 23:59:02 +08:00
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
iret
|
|
|
|
|
2009-07-28 01:43:52 +08:00
|
|
|
__REFDATA
|
2008-07-28 03:43:11 +08:00
|
|
|
.align 4
|
|
|
|
ENTRY(initial_code)
|
|
|
|
.long i386_start_kernel
|
x86-32: Separate 1:1 pagetables from swapper_pg_dir
This patch fixes machine crashes which occur when heavily exercising the
CPU hotplug codepaths on a 32-bit kernel. These crashes are caused by
AMD Erratum 383 and result in a fatal machine check exception. Here's
the scenario:
1. On 32-bit, the swapper_pg_dir page table is used as the initial page
table for booting a secondary CPU.
2. To make this work, swapper_pg_dir needs a direct mapping of physical
memory in it (the low mappings). By adding those low, large page (2M)
mappings (PAE kernel), we create the necessary conditions for Erratum
383 to occur.
3. Other CPUs which do not participate in the off- and onlining game may
use swapper_pg_dir while the low mappings are present (when leave_mm is
called). For all steps below, the CPU referred to is a CPU that is using
swapper_pg_dir, and not the CPU which is being onlined.
4. The presence of the low mappings in swapper_pg_dir can result
in TLB entries for addresses below __PAGE_OFFSET to be established
speculatively. These TLB entries are marked global and large.
5. When the CPU with such TLB entry switches to another page table, this
TLB entry remains because it is global.
6. The process then generates an access to an address covered by the
above TLB entry but there is a permission mismatch - the TLB entry
covers a large global page not accessible to userspace.
7. Due to this permission mismatch a new 4kb, user TLB entry gets
established. Further, Erratum 383 provides for a small window of time
where both TLB entries are present. This results in an uncorrectable
machine check exception signalling a TLB multimatch which panics the
machine.
There are two ways to fix this issue:
1. Always do a global TLB flush when a new cr3 is loaded and the
old page table was swapper_pg_dir. I consider this a hack hard
to understand and with performance implications
2. Do not use swapper_pg_dir to boot secondary CPUs like 64-bit
does.
This patch implements solution 2. It introduces a trampoline_pg_dir
which has the same layout as swapper_pg_dir with low_mappings. This page
table is used as the initial page table of the booting CPU. Later in the
bringup process, it switches to swapper_pg_dir and does a global TLB
flush. This fixes the crashes in our test cases.
-v2: switch to swapper_pg_dir right after entering start_secondary() so
that we are able to access percpu data which might not be mapped in the
trampoline page table.
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
LKML-Reference: <20100816123833.GB28147@aftab>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-08-16 20:38:33 +08:00
|
|
|
ENTRY(initial_page_table)
|
|
|
|
.long pa(swapper_pg_dir)
|
2008-07-28 03:43:11 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* BSS section
|
|
|
|
*/
|
2009-09-21 06:14:14 +08:00
|
|
|
__PAGE_ALIGNED_BSS
|
xen: Core Xen implementation
This patch is a rollup of all the core pieces of the Xen
implementation, including:
- booting and setup
- pagetable setup
- privileged instructions
- segmentation
- interrupt flags
- upcalls
- multicall batching
BOOTING AND SETUP
The vmlinux image is decorated with ELF notes which tell the Xen
domain builder what the kernel's requirements are; the domain builder
then constructs the address space accordingly and starts the kernel.
Xen has its own entrypoint for the kernel (contained in an ELF note).
The ELF notes are set up by xen-head.S, which is included into head.S.
In principle it could be linked separately, but it seems to provoke
lots of binutils bugs.
Because the domain builder starts the kernel in a fairly sane state
(32-bit protected mode, paging enabled, flat segments set up), there's
not a lot of setup needed before starting the kernel proper. The main
steps are:
1. Install the Xen paravirt_ops, which is simply a matter of a
structure assignment.
2. Set init_mm to use the Xen-supplied pagetables (analogous to the
head.S generated pagetables in a native boot).
3. Reserve address space for Xen, since it takes a chunk at the top
of the address space for its own use.
4. Call start_kernel()
PAGETABLE SETUP
Once we hit the main kernel boot sequence, it will end up calling back
via paravirt_ops to set up various pieces of Xen specific state. One
of the critical things which requires a bit of extra care is the
construction of the initial init_mm pagetable. Because Xen places
tight constraints on pagetables (an active pagetable must always be
valid, and must always be mapped read-only to the guest domain), we
need to be careful when constructing the new pagetable to keep these
constraints in mind. It turns out that the easiest way to do this is
to use the initial Xen-provided pagetable as a template, and then just
insert new mappings for memory where a mapping doesn't already exist.
This means that during pagetable setup, it uses a special version of
xen_set_pte which ignores any attempt to remap a read-only page as
read-write (since Xen will map its own initial pagetable as RO), but
lets other changes to the ptes happen, so that things like NX are set
properly.
PRIVILEGED INSTRUCTIONS AND SEGMENTATION
When the kernel runs under Xen, it runs in ring 1 rather than ring 0.
This means that it is more privileged than user-mode in ring 3, but it
still can't run privileged instructions directly. Non-performance
critical instructions are dealt with by taking a privilege exception
and trapping into the hypervisor and emulating the instruction, but
more performance-critical instructions have their own specific
paravirt_ops. In many cases we can avoid having to do any hypercalls
for these instructions, or the Xen implementation is quite different
from the normal native version.
The privileged instructions fall into the broad classes of:
Segmentation: setting up the GDT and the GDT entries, LDT,
TLS and so on. Xen doesn't allow the GDT to be directly
modified; all GDT updates are done via hypercalls where the new
entries can be validated. This is important because Xen uses
segment limits to prevent the guest kernel from damaging the
hypervisor itself.
Traps and exceptions: Xen uses a special format for trap entrypoints,
so when the kernel wants to set an IDT entry, it needs to be
converted to the form Xen expects. Xen sets int 0x80 up specially
so that the trap goes straight from userspace into the guest kernel
without going via the hypervisor. sysenter isn't supported.
Kernel stack: The esp0 entry is extracted from the tss and provided to
Xen.
TLB operations: the various TLB calls are mapped into corresponding
Xen hypercalls.
Control registers: all the control registers are privileged. The most
important is cr3, which points to the base of the current pagetable,
and we handle it specially.
Another instruction we treat specially is CPUID, even though it's not
privileged. We want to control what CPU features are visible to the
rest of the kernel, and so CPUID ends up going into a paravirt_op.
Xen implements this mainly to disable the ACPI and APIC subsystems.
INTERRUPT FLAGS
Xen maintains its own separate flag for masking events, which is
contained within the per-cpu vcpu_info structure. Because the guest
kernel runs in ring 1 and not 0, the IF flag in EFLAGS is completely
ignored (and must be, because even if a guest domain disables
interrupts for itself, it can't disable them overall).
(A note on terminology: "events" and interrupts are effectively
synonymous. However, rather than using an "enable flag", Xen uses a
"mask flag", which blocks event delivery when it is non-zero.)
There are paravirt_ops for each of cli/sti/save_fl/restore_fl, which
are implemented to manage the Xen event mask state. The only thing
worth noting is that when events are unmasked, we need to explicitly
see if there's a pending event and call into the hypervisor to make
sure it gets delivered.
UPCALLS
Xen needs a couple of upcall (or callback) functions to be implemented
by each guest. One is the event upcall, which is how events
(interrupts, effectively) are delivered to the guests. The other is
the failsafe callback, which is used to report errors in either
reloading a segment register, or caused by iret. These are
implemented in i386/kernel/entry.S so they can jump into the normal
iret_exc path when necessary.
MULTICALL BATCHING
Xen provides a multicall mechanism, which allows multiple hypercalls
to be issued at once in order to mitigate the cost of trapping into
the hypervisor. This is particularly useful for context switches,
since the 4-5 hypercalls they would normally need (reload cr3, update
TLS, maybe update LDT) can be reduced to one. This patch implements a
generic batching mechanism for hypercalls, which gets used in many
places in the Xen code.
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Ian Pratt <ian.pratt@xensource.com>
Cc: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Cc: Adrian Bunk <bunk@stusta.de>
2007-07-18 09:37:04 +08:00
|
|
|
.align PAGE_SIZE_asm
|
2008-02-10 06:24:09 +08:00
|
|
|
/*
 * Zero-filled storage for the initial page tables.  With CONFIG_X86_PAE
 * this reserves KPMDS runs of 1024 4-byte words under swapper_pg_pmd
 * (the PAE swapper_pg_dir that points into it is emitted separately, in
 * the page-aligned data section of this file).  Without PAE it reserves
 * a single 1024-word swapper_pg_dir, declared with ENTRY().
 */
#ifdef CONFIG_X86_PAE
|
2008-02-23 03:58:37 +08:00
|
|
|
swapper_pg_pmd:
|
2008-02-10 06:24:09 +08:00
|
|
|
.fill 1024*KPMDS,4,0 /* KPMDS * 1024 words of 4 bytes, all zero */
|
|
|
|
#else
|
2005-04-17 06:20:36 +08:00
|
|
|
ENTRY(swapper_pg_dir)
|
|
|
|
.fill 1024,4,0 /* 1024 words of 4 bytes (4096 bytes), all zero */
|
2008-02-10 06:24:09 +08:00
|
|
|
#endif
|
2008-02-14 05:29:55 +08:00
|
|
|
/*
 * 1024 zeroed 4-byte words (4096 bytes) labelled swapper_pg_fixmap.
 * NOTE(review): presumably the page table backing the fixmap range; the
 * code that installs it is outside this chunk - confirm there.
 */
swapper_pg_fixmap:
|
2007-07-16 14:37:28 +08:00
|
|
|
.fill 1024,4,0
|
x86-32: Separate 1:1 pagetables from swapper_pg_dir
This patch fixes machine crashes which occur when heavily exercising the
CPU hotplug codepaths on a 32-bit kernel. These crashes are caused by
AMD Erratum 383 and result in a fatal machine check exception. Here's
the scenario:
1. On 32-bit, the swapper_pg_dir page table is used as the initial page
table for booting a secondary CPU.
2. To make this work, swapper_pg_dir needs a direct mapping of physical
memory in it (the low mappings). By adding those low, large page (2M)
mappings (PAE kernel), we create the necessary conditions for Erratum
383 to occur.
3. Other CPUs which do not participate in the off- and onlining game may
use swapper_pg_dir while the low mappings are present (when leave_mm is
called). For all steps below, the CPU referred to is a CPU that is using
swapper_pg_dir, and not the CPU which is being onlined.
4. The presence of the low mappings in swapper_pg_dir can result
in TLB entries for addresses below __PAGE_OFFSET being established
speculatively. These TLB entries are marked global and large.
5. When the CPU with such TLB entry switches to another page table, this
TLB entry remains because it is global.
6. The process then generates an access to an address covered by the
above TLB entry but there is a permission mismatch - the TLB entry
covers a large global page not accessible to userspace.
7. Due to this permission mismatch a new 4kb, user TLB entry gets
established. Further, Erratum 383 provides for a small window of time
where both TLB entries are present. This results in an uncorrectable
machine check exception signalling a TLB multimatch which panics the
machine.
There are two ways to fix this issue:
1. Always do a global TLB flush when a new cr3 is loaded and the
old page table was swapper_pg_dir. I consider this a hack hard
to understand and with performance implications.
2. Do not use swapper_pg_dir to boot secondary CPUs like 64-bit
does.
This patch implements solution 2. It introduces a trampoline_pg_dir
which has the same layout as swapper_pg_dir with low_mappings. This page
table is used as the initial page table of the booting CPU. Later in the
bringup process, it switches to swapper_pg_dir and does a global TLB
flush. This fixes the crashes in our test cases.
-v2: switch to swapper_pg_dir right after entering start_secondary() so
that we are able to access percpu data which might not be mapped in the
trampoline page table.
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
LKML-Reference: <20100816123833.GB28147@aftab>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-08-16 20:38:33 +08:00
|
|
|
/*
 * Only built with CONFIG_X86_TRAMPOLINE: 1024 zeroed 4-byte words
 * (4096 bytes) declared as trampoline_pg_dir with ENTRY().  The table is
 * left empty here; whatever populates it lives outside this chunk.
 */
#ifdef CONFIG_X86_TRAMPOLINE
|
|
|
|
ENTRY(trampoline_pg_dir)
|
|
|
|
.fill 1024,4,0
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
/* 4096 single bytes, all zero, declared as empty_zero_page with ENTRY(). */
ENTRY(empty_zero_page)
|
|
|
|
.fill 4096,1,0
|
2009-03-09 16:15:57 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* This starts the data section.
|
|
|
|
*/
|
2008-02-10 06:24:09 +08:00
|
|
|
/*
 * PAE-only top-level directory, emitted in the page-aligned data section.
 * Every entry is a pair of .long values (low word, then a zero high
 * word), so the table always holds four 8-byte entries whatever KPMDS is:
 *   entry 0             - first page of swapper_pg_pmd ("low identity map")
 *   next (3 - KPMDS)    - all-zero entries
 *   last KPMDS entries  - consecutive 0x1000-byte pages of swapper_pg_pmd
 * The low word of each non-zero entry is pa() of the pmd page with
 * PGD_IDENT_ATTR added in.  The closing .align pads the table out to the
 * next page boundary.
 */
#ifdef CONFIG_X86_PAE
|
2009-09-21 06:14:15 +08:00
|
|
|
__PAGE_ALIGNED_DATA
|
2008-02-10 06:24:09 +08:00
|
|
|
/* Page-aligned for the benefit of paravirt? */
|
|
|
|
.align PAGE_SIZE_asm
|
|
|
|
ENTRY(swapper_pg_dir)
|
2008-09-24 05:00:36 +08:00
|
|
|
.long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
|
2008-02-10 06:24:09 +08:00
|
|
|
# if KPMDS == 3
|
2008-09-24 05:00:36 +08:00
|
|
|
.long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* entry 1: same pmd page as entry 0 */
|
|
|
|
.long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 /* entry 2: second pmd page */
|
|
|
|
.long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0 /* entry 3: third pmd page */
|
2008-02-10 06:24:09 +08:00
|
|
|
# elif KPMDS == 2
|
|
|
|
.long 0,0 /* entry 1: left empty */
|
2008-09-24 05:00:36 +08:00
|
|
|
.long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* entry 2: same pmd page as entry 0 */
|
|
|
|
.long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 /* entry 3: second pmd page */
|
2008-02-10 06:24:09 +08:00
|
|
|
# elif KPMDS == 1
|
|
|
|
.long 0,0 /* entry 1: left empty */
|
|
|
|
.long 0,0 /* entry 2: left empty */
|
2008-09-24 05:00:36 +08:00
|
|
|
.long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* entry 3: same pmd page as entry 0 */
|
2008-02-10 06:24:09 +08:00
|
|
|
# else
|
|
|
|
# error "Kernel PMDs should be 1, 2 or 3"
|
|
|
|
# endif
|
|
|
|
.align PAGE_SIZE_asm /* needs to be page-sized too */
|
|
|
|
#endif
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
.data
|
|
|
|
/*
 * stack_start: two consecutive .long values - the address THREAD_SIZE
 * bytes past the start of init_thread_union, followed by the __BOOT_DS
 * selector.
 * NOTE(review): presumably consumed as an offset:selector pair when boot
 * code loads the initial stack; the consumer is outside this chunk.
 */
ENTRY(stack_start)
|
|
|
|
.long init_thread_union+THREAD_SIZE
|
|
|
|
.long __BOOT_DS
|
|
|
|
|
|
|
|
/*
 * One byte, initialised to 0.
 * NOTE(review): readers and writers are outside this chunk - presumably a
 * boot-progress flag; confirm against the code that tests it.
 */
ready: .byte 0
|
|
|
|
|
2006-09-26 16:52:39 +08:00
|
|
|
/*
 * One 32-bit word, initialised to 0.
 * NOTE(review): going by the name, presumably guards the early fault
 * reporting path against re-entry; the users are outside this chunk.
 */
early_recursion_flag:
|
|
|
|
.long 0
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * NUL-terminated (.asciz) printf-style format string with three %p
 * conversions; the caller that supplies the arguments is outside this
 * chunk.
 */
int_msg:
|
2009-01-26 13:09:00 +08:00
|
|
|
.asciz "Unknown interrupt or fault at: %p %p %p\n"
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-09-26 16:52:39 +08:00
|
|
|
/*
 * One printf-style format string built from consecutive pieces: the
 * .ascii directives add no terminator, and only the final .asciz appends
 * the NUL.  In order it expects one %d, then 37 %p values: 1 for CR2,
 * 8 for the pusha registers, 4 for the fault frame and 24 stack words.
 */
fault_msg:
|
2008-04-26 03:02:34 +08:00
|
|
|
/* fault info: */
|
|
|
|
.ascii "BUG: Int %d: CR2 %p\n"
|
|
|
|
/* pusha regs: */
|
|
|
|
.ascii " EDI %p ESI %p EBP %p ESP %p\n"
|
|
|
|
.ascii " EBX %p EDX %p ECX %p EAX %p\n"
|
|
|
|
/* fault frame: */
|
|
|
|
.ascii " err %p EIP %p CS %p flg %p\n"
|
|
|
|
.ascii "Stack: %p %p %p %p %p %p %p %p\n"
|
|
|
|
.ascii " %p %p %p %p %p %p %p %p\n"
|
|
|
|
.asciz " %p %p %p %p %p %p %p %p\n"
|
2006-09-26 16:52:39 +08:00
|
|
|
|
2007-10-11 17:16:51 +08:00
|
|
|
#include "../../x86/xen/xen-head.S"
|
xen: Core Xen implementation
This patch is a rollup of all the core pieces of the Xen
implementation, including:
- booting and setup
- pagetable setup
- privileged instructions
- segmentation
- interrupt flags
- upcalls
- multicall batching
BOOTING AND SETUP
The vmlinux image is decorated with ELF notes which tell the Xen
domain builder what the kernel's requirements are; the domain builder
then constructs the address space accordingly and starts the kernel.
Xen has its own entrypoint for the kernel (contained in an ELF note).
The ELF notes are set up by xen-head.S, which is included into head.S.
In principle it could be linked separately, but it seems to provoke
lots of binutils bugs.
Because the domain builder starts the kernel in a fairly sane state
(32-bit protected mode, paging enabled, flat segments set up), there's
not a lot of setup needed before starting the kernel proper. The main
steps are:
1. Install the Xen paravirt_ops, which is simply a matter of a
structure assignment.
2. Set init_mm to use the Xen-supplied pagetables (analogous to the
head.S generated pagetables in a native boot).
3. Reserve address space for Xen, since it takes a chunk at the top
of the address space for its own use.
4. Call start_kernel()
PAGETABLE SETUP
Once we hit the main kernel boot sequence, it will end up calling back
via paravirt_ops to set up various pieces of Xen specific state. One
of the critical things which requires a bit of extra care is the
construction of the initial init_mm pagetable. Because Xen places
tight constraints on pagetables (an active pagetable must always be
valid, and must always be mapped read-only to the guest domain), we
need to be careful when constructing the new pagetable to keep these
constraints in mind. It turns out that the easiest way to do this is
to use the initial Xen-provided pagetable as a template, and then just
insert new mappings for memory where a mapping doesn't already exist.
This means that during pagetable setup, it uses a special version of
xen_set_pte which ignores any attempt to remap a read-only page as
read-write (since Xen will map its own initial pagetable as RO), but
lets other changes to the ptes happen, so that things like NX are set
properly.
PRIVILEGED INSTRUCTIONS AND SEGMENTATION
When the kernel runs under Xen, it runs in ring 1 rather than ring 0.
This means that it is more privileged than user-mode in ring 3, but it
still can't run privileged instructions directly. Non-performance
critical instructions are dealt with by taking a privilege exception
and trapping into the hypervisor and emulating the instruction, but
more performance-critical instructions have their own specific
paravirt_ops. In many cases we can avoid having to do any hypercalls
for these instructions, or the Xen implementation is quite different
from the normal native version.
The privileged instructions fall into the broad classes of:
Segmentation: setting up the GDT and the GDT entries, LDT,
TLS and so on. Xen doesn't allow the GDT to be directly
modified; all GDT updates are done via hypercalls where the new
entries can be validated. This is important because Xen uses
segment limits to prevent the guest kernel from damaging the
hypervisor itself.
Traps and exceptions: Xen uses a special format for trap entrypoints,
so when the kernel wants to set an IDT entry, it needs to be
converted to the form Xen expects. Xen sets int 0x80 up specially
so that the trap goes straight from userspace into the guest kernel
without going via the hypervisor. sysenter isn't supported.
Kernel stack: The esp0 entry is extracted from the tss and provided to
Xen.
TLB operations: the various TLB calls are mapped into corresponding
Xen hypercalls.
Control registers: all the control registers are privileged. The most
important is cr3, which points to the base of the current pagetable,
and we handle it specially.
Another instruction we treat specially is CPUID, even though it's not
privileged. We want to control what CPU features are visible to the
rest of the kernel, and so CPUID ends up going into a paravirt_op.
Xen implements this mainly to disable the ACPI and APIC subsystems.
INTERRUPT FLAGS
Xen maintains its own separate flag for masking events, which is
contained within the per-cpu vcpu_info structure. Because the guest
kernel runs in ring 1 and not 0, the IF flag in EFLAGS is completely
ignored (and must be, because even if a guest domain disables
interrupts for itself, it can't disable them overall).
(A note on terminology: "events" and interrupts are effectively
synonymous. However, rather than using an "enable flag", Xen uses a
"mask flag", which blocks event delivery when it is non-zero.)
There are paravirt_ops for each of cli/sti/save_fl/restore_fl, which
are implemented to manage the Xen event mask state. The only thing
worth noting is that when events are unmasked, we need to explicitly
see if there's a pending event and call into the hypervisor to make
sure it gets delivered.
UPCALLS
Xen needs a couple of upcall (or callback) functions to be implemented
by each guest. One is the event upcall, which is how events
(interrupts, effectively) are delivered to the guests. The other is
the failsafe callback, which is used to report errors in either
reloading a segment register, or caused by iret. These are
implemented in i386/kernel/entry.S so they can jump into the normal
iret_exc path when necessary.
MULTICALL BATCHING
Xen provides a multicall mechanism, which allows multiple hypercalls
to be issued at once in order to mitigate the cost of trapping into
the hypervisor. This is particularly useful for context switches,
since the 4-5 hypercalls they would normally need (reload cr3, update
TLS, maybe update LDT) can be reduced to one. This patch implements a
generic batching mechanism for hypercalls, which gets used in many
places in the Xen code.
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Ian Pratt <ian.pratt@xensource.com>
Cc: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Cc: Adrian Bunk <bunk@stusta.de>
2007-07-18 09:37:04 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* The IDT and GDT 'descriptors' are a strange 48-bit object
|
|
|
|
* only used by the lidt and lgdt instructions. They are not
|
|
|
|
* like usual segment descriptors - they consist of a 16-bit
|
|
|
|
* table limit (size in bytes minus one), and 32-bit linear address value:
|
|
|
|
*/
|
|
|
|
|
|
|
|
.globl boot_gdt_descr
|
|
|
|
.globl idt_descr
|
|
|
|
|
|
|
|
ALIGN
|
|
|
|
# early boot GDT descriptor (must use 1:1 address mapping)
/*
 * Layout: a 2-byte pad, then the 6-byte pseudo-descriptor itself
 * (16-bit limit followed by 32-bit base).  The pad sits in front of the
 * label so that the .long base field is what ends up 4-byte aligned.
 */
|
|
|
|
.word 0 # 32 bit align gdt_desc.address
|
|
|
|
boot_gdt_descr:
|
|
|
|
.word __BOOT_DS+7 /* limit; presumably the last byte of the 8-byte __BOOT_DS entry - confirm */
|
2007-05-03 01:27:10 +08:00
|
|
|
.long boot_gdt - __PAGE_OFFSET /* base: same value as pa(boot_gdt) */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
 * IDT pseudo-descriptor: a 2-byte pad, then a 16-bit limit and a 32-bit
 * base.  Unlike boot_gdt_descr, the base is the link-time address of
 * idt_table with no __PAGE_OFFSET subtracted.
 */
.word 0 # 32-bit align idt_desc.address
|
|
|
|
idt_descr:
|
|
|
|
.word IDT_ENTRIES*8-1 # idt contains 256 entries
|
|
|
|
.long idt_table
|
|
|
|
|
|
|
|
# boot GDT descriptor (later on used by CPU#0):
/*
 * Same shape as the other pseudo-descriptors here: a 2-byte pad, then a
 * 16-bit limit covering GDT_ENTRIES 8-byte entries and a 32-bit base
 * holding the link-time address of gdt_page.
 */
|
|
|
|
.word 0 # 32 bit align gdt_desc.address
|
2007-02-13 20:26:26 +08:00
|
|
|
ENTRY(early_gdt_descr)
|
2005-04-17 06:20:36 +08:00
|
|
|
.word GDT_ENTRIES*8-1
|
2009-10-29 21:34:15 +08:00
|
|
|
.long gdt_page /* Overwritten for secondary CPUs */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
2007-05-03 01:27:10 +08:00
|
|
|
* The boot_gdt must mirror the equivalent in setup.S and is
|
2005-04-17 06:20:36 +08:00
|
|
|
* used only for booting.
|
|
|
|
*/
|
|
|
|
.align L1_CACHE_BYTES
|
2007-05-03 01:27:10 +08:00
|
|
|
ENTRY(boot_gdt)
|
2005-04-17 06:20:36 +08:00
|
|
|
.fill GDT_ENTRY_BOOT_CS,8,0
|
|
|
|
.quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
|
|
|
|
.quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
|