From cbc34973709eb41b369c304c075cf2069f847012 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 13 Feb 2008 13:14:35 -0800 Subject: [PATCH 01/24] lguest: include function prototypes Added a declaration to asm-x86/lguest.h and moved the extern arrays there as well. As an alternative to including asm/lguest.h directly, an include could be put in linux/lguest.h Signed-off-by: Harvey Harrison Cc: "rusty@rustcorp.com.au" Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/lguest/boot.c | 10 +--------- include/asm-x86/lguest.h | 11 +++++++++++ 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 5afdde4895dc..1e613fb03e32 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -75,15 +76,6 @@ * behaving in simplified but equivalent ways. In particular, the Guest is the * same kernel as the Host (or at least, built from the same source code). :*/ -/* Declarations for definitions in lguest_guest.S */ -extern char lguest_noirq_start[], lguest_noirq_end[]; -extern const char lgstart_cli[], lgend_cli[]; -extern const char lgstart_sti[], lgend_sti[]; -extern const char lgstart_popf[], lgend_popf[]; -extern const char lgstart_pushf[], lgend_pushf[]; -extern const char lgstart_iret[], lgend_iret[]; -extern void lguest_iret(void); - struct lguest_data lguest_data = { .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, .noirq_start = (u32)lguest_noirq_start, diff --git a/include/asm-x86/lguest.h b/include/asm-x86/lguest.h index 4d9367b72976..9b17571e9bc3 100644 --- a/include/asm-x86/lguest.h +++ b/include/asm-x86/lguest.h @@ -23,6 +23,17 @@ /* Found in switcher.S */ extern unsigned long default_idt_entries[]; +/* Declarations for definitions in lguest_guest.S */ +extern char lguest_noirq_start[], lguest_noirq_end[]; +extern const char lgstart_cli[], lgend_cli[]; +extern const char lgstart_sti[], lgend_sti[]; +extern const char lgstart_popf[], lgend_popf[]; +extern const char lgstart_pushf[], lgend_pushf[]; +extern const char lgstart_iret[], lgend_iret[]; + +extern void lguest_iret(void); +extern void lguest_init(void); + struct lguest_regs { /* Manually saved part. */ From db342d216ba9e060d8c5501eefc1d0a789c9e711 Mon Sep 17 00:00:00 2001 From: Tony Breeds Date: Tue, 19 Feb 2008 08:16:03 +0100 Subject: [PATCH 02/24] lguest: fix build breakage [ mingo@elte.hu: merged to Rusty's patch ] Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/asm-offsets_32.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index a33d53017997..8ea040124f7d 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -128,13 +128,11 @@ void foo(void) OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); #endif -#ifdef CONFIG_LGUEST_GUEST +#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); -#endif -#ifdef CONFIG_LGUEST BLANK(); OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); From 1ce70c4fac3c3954bd48c035f448793867592bc0 Mon Sep 17 00:00:00 2001 From: "Ahmed S. 
Darwish" Date: Sun, 24 Feb 2008 17:55:15 +0200 Subject: [PATCH 03/24] x86/lguest: fix pgdir pmd index calculation Hi all, Beginning from commits close to v2.6.25-rc2, running lguest always oopses the host kernel. Oops is at [1]. Bisection led to the following commit: commit 37cc8d7f963ba2deec29c9b68716944516a3244f x86/early_ioremap: don't assume we're using swapper_pg_dir At the early stages of boot, before the kernel pagetable has been fully initialized, a Xen kernel will still be running off the Xen-provided pagetables rather than swapper_pg_dir[]. Therefore, readback cr3 to determine the base of the pagetable rather than assuming swapper_pg_dir[]. static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) { - pgd_t *pgd = &swapper_pg_dir[pgd_index(addr)]; + /* Don't assume we're using swapper_pg_dir at this point */ + pgd_t *base = __va(read_cr3()); + pgd_t *pgd = &base[pgd_index(addr)]; pud_t *pud = pud_offset(pgd, addr); pmd_t *pmd = pmd_offset(pud, addr); Trying to analyze the problem, it seems on the guest side of lguest, %cr3 has a different value from &swapper_pg-dir (which is AFAIK fine on a pravirt guest): Putting some debugging messages in early_ioremap_pmd: /* Appears 3 times */ [ 0.000000] *************************** [ 0.000000] __va(%cr3) = c0000000, &swapper_pg_dir = c02cc000 [ 0.000000] *************************** After 8 hours of debugging and staring on lguest code, I noticed something strange in paravirt_ops->set_pmd hypercall invocation: static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { *pmdp = pmdval; lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK, (__pa(pmdp)&(PAGE_SIZE-1))/4, 0); } The first hcall parameter is global pgdir which looks fine. The second parameter is the pmd index in the pgdir which is suspectful. AFAIK, calculating the index of pmd does not need a divisoin over four. Removing the division made lguest work fine again . Patch is at [2]. I am not sure why the division over four existed in the first place. It seems bogus, maybe the Xen patch just made the problem appear ? [2]: The patch: [PATCH] lguest: fix pgdir pmd index cacluation Remove an error in index calculation which leads to removing a not existing shadow page table (leading to a Null dereference). Signed-off-by: Ahmed S. Darwish Signed-off-by: Ingo Molnar --- arch/x86/lguest/boot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 1e613fb03e32..cccb38a59653 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -481,7 +481,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { *pmdp = pmdval; lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK, - (__pa(pmdp)&(PAGE_SIZE-1))/4, 0); + (__pa(pmdp)&(PAGE_SIZE-1)), 0); } /* There are a couple of legacy places where the kernel sets a PTE, but we From 92cb54a37a42a41cfb2ef7f1478bfa4395198258 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 13 Feb 2008 14:37:52 +0100 Subject: [PATCH 04/24] x86: make DEBUG_PAGEALLOC and CPA more robust Use PF_MEMALLOC to prevent recursive calls in the DBEUG_PAGEALLOC case. This makes the code simpler and more robust against allocation failures. This fixes the following fallback to non-mmconfig: http://lkml.org/lkml/2008/2/20/551 http://bugzilla.kernel.org/show_bug.cgi?id=10083 Also, for DEBUG_PAGEALLOC=n reduce the pool size to one page. 
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 84 +++++++++++++++++++++++++----------------- 1 file changed, 51 insertions(+), 33 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 464d8fc21ce6..14e48b5a94ba 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -44,6 +44,12 @@ static inline unsigned long highmap_end_pfn(void) #endif +#ifdef CONFIG_DEBUG_PAGEALLOC +# define debug_pagealloc 1 +#else +# define debug_pagealloc 0 +#endif + static inline int within(unsigned long addr, unsigned long start, unsigned long end) { @@ -355,45 +361,48 @@ out_unlock: static LIST_HEAD(page_pool); static unsigned long pool_size, pool_pages, pool_low; -static unsigned long pool_used, pool_failed, pool_refill; +static unsigned long pool_used, pool_failed; -static void cpa_fill_pool(void) +static void cpa_fill_pool(struct page **ret) { - struct page *p; gfp_t gfp = GFP_KERNEL; + unsigned long flags; + struct page *p; - /* Do not allocate from interrupt context */ - if (in_irq() || irqs_disabled()) - return; /* - * Check unlocked. I does not matter when we have one more - * page in the pool. The bit lock avoids recursive pool - * allocations: + * Avoid recursion (on debug-pagealloc) and also signal + * our priority to get to these pagetables: */ - if (pool_pages >= pool_size || test_and_set_bit_lock(0, &pool_refill)) + if (current->flags & PF_MEMALLOC) return; + current->flags |= PF_MEMALLOC; -#ifdef CONFIG_DEBUG_PAGEALLOC /* - * We could do: - * gfp = in_atomic() ? GFP_ATOMIC : GFP_KERNEL; - * but this fails on !PREEMPT kernels + * Allocate atomically from atomic contexts: */ - gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; -#endif + if (in_atomic() || irqs_disabled() || debug_pagealloc) + gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; - while (pool_pages < pool_size) { + while (pool_pages < pool_size || (ret && !*ret)) { p = alloc_pages(gfp, 0); if (!p) { pool_failed++; break; } - spin_lock_irq(&pgd_lock); + /* + * If the call site needs a page right now, provide it: + */ + if (ret && !*ret) { + *ret = p; + continue; + } + spin_lock_irqsave(&pgd_lock, flags); list_add(&p->lru, &page_pool); pool_pages++; - spin_unlock_irq(&pgd_lock); + spin_unlock_irqrestore(&pgd_lock, flags); } - clear_bit_unlock(0, &pool_refill); + + current->flags &= ~PF_MEMALLOC; } #define SHIFT_MB (20 - PAGE_SHIFT) @@ -414,11 +423,15 @@ void __init cpa_init(void) * GiB. 
Shift MiB to Gib and multiply the result by * POOL_PAGES_PER_GB: */ - gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; - pool_size = POOL_PAGES_PER_GB * gb; + if (debug_pagealloc) { + gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; + pool_size = POOL_PAGES_PER_GB * gb; + } else { + pool_size = 1; + } pool_low = pool_size; - cpa_fill_pool(); + cpa_fill_pool(NULL); printk(KERN_DEBUG "CPA: page pool initialized %lu of %lu pages preallocated\n", pool_pages, pool_size); @@ -440,16 +453,20 @@ static int split_large_page(pte_t *kpte, unsigned long address) spin_lock_irqsave(&pgd_lock, flags); if (list_empty(&page_pool)) { spin_unlock_irqrestore(&pgd_lock, flags); - return -ENOMEM; + base = NULL; + cpa_fill_pool(&base); + if (!base) + return -ENOMEM; + spin_lock_irqsave(&pgd_lock, flags); + } else { + base = list_first_entry(&page_pool, struct page, lru); + list_del(&base->lru); + pool_pages--; + + if (pool_pages < pool_low) + pool_low = pool_pages; } - base = list_first_entry(&page_pool, struct page, lru); - list_del(&base->lru); - pool_pages--; - - if (pool_pages < pool_low) - pool_low = pool_pages; - /* * Check for races, another CPU might have split this page * up for us already: @@ -734,7 +751,8 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, cpa_flush_all(cache); out: - cpa_fill_pool(); + cpa_fill_pool(NULL); + return ret; } @@ -897,7 +915,7 @@ void kernel_map_pages(struct page *page, int numpages, int enable) * Try to refill the page pool here. We can do this only after * the tlb flush. */ - cpa_fill_pool(); + cpa_fill_pool(NULL); } #ifdef CONFIG_HIBERNATION From b02a7f22f39f02fdf5a1380ff700293639db4490 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Tue, 5 Feb 2008 00:48:13 +0100 Subject: [PATCH 05/24] x86: hpet fix docbook comment Signed-off-by: Pavel Machek Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 429d084e014d..235fd6c77504 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -368,8 +368,8 @@ static int hpet_clocksource_register(void) return 0; } -/* - * Try to setup the HPET timer +/** + * hpet_enable - Try to setup the HPET timer. Returns 1 on success. */ int __init hpet_enable(void) { From a7ef94e6889186848573a10c5bdb8271405f44de Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 14 Feb 2008 14:51:00 -0800 Subject: [PATCH 06/24] x86: do not promote TM3x00/TM5x00 to i686-class We have been promoting Transmeta TM3x00/TM5x00 chips to i686-class based on the notion that they contain all the user-space visible features of an i686-class chip. However, this is not actually true: they lack the EA-taking long NOPs (0F 1F /0). Since this is a userspace-visible incompatibility, downgrade these CPUs to the manufacturer-defined i586 level. Signed-off-by: H. 
Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/transmeta.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 200fb3f9ebfb..e8b422c1c512 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -76,13 +76,6 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) /* All Transmeta CPUs have a constant TSC */ set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); - /* If we can run i686 user-space code, call us an i686 */ -#define USER686 ((1 << X86_FEATURE_TSC)|\ - (1 << X86_FEATURE_CX8)|\ - (1 << X86_FEATURE_CMOV)) - if (c->x86 == 5 && (c->x86_capability[0] & USER686) == USER686) - c->x86 = 6; - #ifdef CONFIG_SYSCTL /* randomize_va_space slows us down enormously; it probably triggers retranslation of x86->native bytecode */ From 7343b3b3a627eb30e24e921f004f659c8ebb91c5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 14 Feb 2008 14:52:05 -0800 Subject: [PATCH 07/24] x86: require family >= 6 if we are using P6 NOPs The P6 family of NOPs is only available on family >= 6 CPUs, so enforce that in the boot code. Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.cpu | 5 +++++ include/asm-x86/nops.h | 4 +--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index e09a6b73a1aa..86fd2a0e4597 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -377,6 +377,10 @@ config X86_OOSTORE def_bool y depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR +config X86_P6_NOP + def_bool y + depends on (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || PENTIUM4) + config X86_TSC def_bool y depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 @@ -390,6 +394,7 @@ config X86_CMOV config X86_MINIMUM_CPU_FAMILY int default "64" if X86_64 + default "6" if X86_32 && X86_P6_NOP default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK) default "3" diff --git a/include/asm-x86/nops.h b/include/asm-x86/nops.h index fec025c7f58c..c52b334b8166 100644 --- a/include/asm-x86/nops.h +++ b/include/asm-x86/nops.h @@ -63,9 +63,7 @@ #define ASM_NOP6 K7_NOP6 #define ASM_NOP7 K7_NOP7 #define ASM_NOP8 K7_NOP8 -#elif defined(CONFIG_M686) || defined(CONFIG_MPENTIUMII) || \ defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUMM) || \ defined(CONFIG_MCORE2) || defined(CONFIG_PENTIUM4) +#elif defined(CONFIG_X86_P6_NOP) #define ASM_NOP1 P6_NOP1 #define ASM_NOP2 P6_NOP2 #define ASM_NOP3 P6_NOP3 From 959b3be64cab9160cd74532a49b89cdd918d38e9 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 14 Feb 2008 14:56:45 -0800 Subject: [PATCH 08/24] x86: don't use P6_NOPs if compiling with CONFIG_X86_GENERIC P6_NOPs are definitely not supported on some VIA CPUs, and possibly (unverified) on AMD K7s. It is also the only thing that prevents a 686 kernel from running on Transmeta TM3x00/5x00 (Crusoe) series. The performance benefit over generic NOPs is very small, so when building for generic consumption, avoid using them. Signed-off-by: H. 
Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.cpu | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 86fd2a0e4597..6d50064db182 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -377,9 +377,18 @@ config X86_OOSTORE def_bool y depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR +# +# P6_NOPs are a relatively minor optimization that require a family >= +# 6 processor, except that it is broken on certain VIA chips. +# Furthermore, AMD chips prefer a totally different sequence of NOPs +# (which work on all CPUs). As a result, disallow these if we're +# compiling X86_GENERIC but not X86_64 (these NOPs do work on all +# x86-64 capable chips); the list of processors in the right-hand clause +# are the cores that benefit from this optimization. +# config X86_P6_NOP def_bool y - depends on (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || PENTIUM4) + depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || PENTIUM4) config X86_TSC def_bool y From 4cd20952d74323df06e438c0c3273b5be89d6bfd Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 18 Feb 2008 23:24:33 -0800 Subject: [PATCH 09/24] x86: add comments for NOPs Add comments describing the various NOP sequences. Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-x86/nops.h | 62 ++++++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/include/asm-x86/nops.h b/include/asm-x86/nops.h index c52b334b8166..e3b2bce0aff8 100644 --- a/include/asm-x86/nops.h +++ b/include/asm-x86/nops.h @@ -3,17 +3,29 @@ /* Define nops for use with alternative() */ -/* generic versions from gas */ -#define GENERIC_NOP1 ".byte 0x90\n" -#define GENERIC_NOP2 ".byte 0x89,0xf6\n" -#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n" -#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n" -#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4 -#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n" -#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n" -#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7 +/* generic versions from gas + 1: nop + 2: movl %esi,%esi + 3: leal 0x00(%esi),%esi + 4: leal 0x00(,%esi,1),%esi + 6: leal 0x00000000(%esi),%esi + 7: leal 0x00000000(,%esi,1),%esi +*/ +#define GENERIC_NOP1 ".byte 0x90\n" +#define GENERIC_NOP2 ".byte 0x89,0xf6\n" +#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n" +#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n" +#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4 +#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n" +#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n" +#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7 -/* Opteron 64bit nops */ +/* Opteron 64bit nops + 1: nop + 2: osp nop + 3: osp osp nop + 4: osp osp osp nop +*/ #define K8_NOP1 GENERIC_NOP1 #define K8_NOP2 ".byte 0x66,0x90\n" #define K8_NOP3 ".byte 0x66,0x66,0x90\n" @@ -23,19 +35,35 @@ #define K8_NOP7 K8_NOP4 K8_NOP3 #define K8_NOP8 K8_NOP4 K8_NOP4 -/* K7 nops */ -/* uses eax dependencies (arbitary choice) */ -#define K7_NOP1 GENERIC_NOP1 +/* K7 nops + uses eax dependencies (arbitary choice) + 1: nop + 2: movl %eax,%eax + 3: leal (,%eax,1),%eax + 4: leal 0x00(,%eax,1),%eax + 6: leal 0x00000000(%eax),%eax + 7: leal 0x00000000(,%eax,1),%eax +*/ +#define K7_NOP1 GENERIC_NOP1 #define K7_NOP2 ".byte 0x8b,0xc0\n" #define K7_NOP3 ".byte 0x8d,0x04,0x20\n" #define 
K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n" #define K7_NOP5 K7_NOP4 ASM_NOP1 #define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n" -#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n" -#define K7_NOP8 K7_NOP7 ASM_NOP1 +#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n" +#define K7_NOP8 K7_NOP7 ASM_NOP1 -/* P6 nops */ -/* uses eax dependencies (Intel-recommended choice) */ +/* P6 nops + uses eax dependencies (Intel-recommended choice) + 1: nop + 2: osp nop + 3: nopl (%eax) + 4: nopl 0x00(%eax) + 5: nopl 0x00(%eax,%eax,1) + 6: osp nopl 0x00(%eax,%eax,1) + 7: nopl 0x00000000(%eax) + 8: nopl 0x00000000(%eax,%eax,1) +*/ #define P6_NOP1 GENERIC_NOP1 #define P6_NOP2 ".byte 0x66,0x90\n" #define P6_NOP3 ".byte 0x0f,0x1f,0x00\n" From 829157be590af1c2555fb74c3c4db3327e3201fc Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 13 Feb 2008 11:16:46 -0800 Subject: [PATCH 10/24] x86: handle BIOSes which terminate e820 with CF=1 and no SMAP The proper way to terminate the e820 chain is with %ebx == 0 on the last legitimate memory block. However, several BIOSes don't do that and instead return error (CF = 1) when trying to read off the end of the list. For this error return, %eax doesn't necessarily return the SMAP signature -- correctly so, since %ah should contain an error code in this case. To deal with some particularly broken BIOSes, we clear the entire e820 chain if the SMAP signature is missing in the middle, indicating a plain insane e820 implementation. However, we need to make the test for CF = 1 before the SMAP check. This fixes at least one HP laptop (nc6400) for which none of the memory-probing methods (e820, e801, 88) functioned fully according to spec. Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/boot/memory.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c index 378353956b5d..e77d89f9e8aa 100644 --- a/arch/x86/boot/memory.c +++ b/arch/x86/boot/memory.c @@ -37,6 +37,12 @@ static int detect_memory_e820(void) "=m" (*desc) : "D" (desc), "d" (SMAP), "a" (0xe820)); + /* BIOSes which terminate the chain with CF = 1 as opposed + to %ebx = 0 don't always report the SMAP signature on + the final, failing, probe. */ + if (err) + break; + /* Some BIOSes stop returning SMAP in the middle of the search loop. We don't know exactly how the BIOS screwed up the map at that point, we might have a @@ -47,9 +53,6 @@ static int detect_memory_e820(void) break; } - if (err) - break; - count++; desc++; } while (next && count < E820MAX); From f5106d91f2bf9153d6420f9ebb8114f73f9ce66a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 18 Feb 2008 13:10:44 -0800 Subject: [PATCH 11/24] x86/mtrr: fix kernel-doc missing notation Fix mtrr kernel-doc warning: Warning(linux-2.6.24-git12//arch/x86/kernel/cpu/mtrr/main.c:677): No description found for parameter 'end_pfn' Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index b6e136f23d3d..c8fda3eb3d7e 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -649,6 +649,7 @@ static __init int amd_special_default_mtrr(void) /** * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs + * @end_pfn: ending page frame number * * Some buggy BIOSes don't setup the MTRRs properly for systems with certain * memory configurations. 
This routine checks that the highest MTRR matches From 7265b6f10deba1cbe071cee646b063bda07ecd68 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Tue, 19 Feb 2008 11:02:30 +0100 Subject: [PATCH 12/24] x86: notsc is ignored on common configurations notsc is ignored in 32-bit kernels if CONFIG_X86_TSC is on, which is bad; fix it. Signed-off-by: Pavel Machek Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index 43517e324be8..f14cfd9d1f94 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -28,7 +28,8 @@ EXPORT_SYMBOL_GPL(tsc_khz); static int __init tsc_setup(char *str) { printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " - "cannot disable TSC.\n"); + "cannot disable TSC completely.\n"); + mark_tsc_unstable("user disabled TSC"); return 1; } #else From 3b57bc461fd5019aef4cfc77d4faf56ebe95449c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 20 Feb 2008 18:53:17 -0800 Subject: [PATCH 13/24] x86: remove double-checking empty zero pages debug So far no one has complained about that. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bb652f5a93fb..6dbb0035c57a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -515,14 +515,6 @@ void __init mem_init(void) /* clear_bss() already clear the empty_zero_page */ - /* temporary debugging - double check it's true: */ - { - int i; - - for (i = 0; i < 1024; i++) - WARN_ON_ONCE(empty_zero_page[i]); - } - reservedpages = 0; /* this will put all low memory onto the freelists */ From 88f3aec7afd9ae3e6f6d221801996b69aad1e3a4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 21 Feb 2008 11:04:11 +0100 Subject: [PATCH 14/24] x86: fix spontaneous reboot with allyesconfig bzImage recently the 64-bit allyesconfig bzImage kernel started spontaneously rebooting during early bootup. after a few fun hours spent with early init debugging, it turns out that we've got this rather annoying limit on the size of the kernel image: #define KERNEL_TEXT_SIZE (40*1024*1024) which limit my vmlinux just happened to exceed: text data bss dec hex filename 29703744 4222751 8646224 42572719 2899baf vmlinux 40 MB is 41943040 bytes, so my vmlinux was just 1.5% above this limit :-/ So it happily crashed right in head_64.S, which - as we all know - is the most debuggable code in the whole architecture ;-) So increase the limit to allow an up to 128MB kernel image to be mapped. (should anyone be that crazy or lazy) We have a full 4K of pagetable (level2_kernel_pgt) allocated for these mappings already, so there's no RAM overhead and the limit was rather pointless and arbitrary. Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 22 ++++++++++++++-------- arch/x86/mm/init_64.c | 5 +++-- include/asm-x86/page_64.h | 8 ++++++-- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index eb415043a929..b037b15be7f8 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -379,18 +379,24 @@ NEXT_PAGE(level2_ident_pgt) /* Since I easily can, map the first 1G. * Don't set NX because code runs from these pages. */ - PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) + PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) NEXT_PAGE(level2_kernel_pgt) - /* 40MB kernel mapping. 
The kernel code cannot be bigger than that. - When you change this change KERNEL_TEXT_SIZE in page.h too. */ - /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */ - PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, KERNEL_TEXT_SIZE/PMD_SIZE) - /* Module mapping starts here */ - .fill (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0 + /* + * 128 MB kernel mapping. We spend a full page on this pagetable + * anyway. + * + * The kernel code+data+bss must not be bigger than that. + * + * (NOTE: at +128MB starts the module area, see MODULES_VADDR. + * If you want to increase this then increase MODULES_VADDR + * too.) + */ + PMDS(0, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, + KERNEL_TEXT_SIZE/PMD_SIZE) NEXT_PAGE(level2_spare_pgt) - .fill 512,8,0 + .fill 512, 8, 0 #undef PMDS #undef NEXT_PAGE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 6dbb0035c57a..a02a14f0f324 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -172,8 +172,9 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) } /* - * The head.S code sets up the kernel high mapping from: - * __START_KERNEL_map to __START_KERNEL_map + KERNEL_TEXT_SIZE + * The head.S code sets up the kernel high mapping: + * + * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text) * * phys_addr holds the negative offset to the kernel, which is added * to the compile time generated pmds. This results in invalid pmds up diff --git a/include/asm-x86/page_64.h b/include/asm-x86/page_64.h index f7393bc516ef..3e2e3ca63048 100644 --- a/include/asm-x86/page_64.h +++ b/include/asm-x86/page_64.h @@ -47,8 +47,12 @@ #define __PHYSICAL_MASK_SHIFT 46 #define __VIRTUAL_MASK_SHIFT 48 -#define KERNEL_TEXT_SIZE (40*1024*1024) -#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL) +/* + * Kernel image size is limited to 128 MB (see level2_kernel_pgt in + * arch/x86/kernel/head_64.S), and it is mapped here: + */ +#define KERNEL_TEXT_SIZE (128*1024*1024) +#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL) #ifndef __ASSEMBLY__ void clear_page(void *page); From d4afe414189b098d56bcd24280c018aa2ac9a990 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 21 Feb 2008 13:39:30 +0100 Subject: [PATCH 15/24] x86: rename KERNEL_TEXT_SIZE => KERNEL_IMAGE_SIZE The KERNEL_TEXT_SIZE constant was mis-named, as we not only map the kernel text but data, bss and init sections as well. That name led me on the wrong path with the KERNEL_TEXT_SIZE regression, because i knew how big of _text_ my images have and i knew about the 40 MB "text" limit so i wrongly thought to be on the safe side of the 40 MB limit with my 29 MB of text, while the total image size was slightly above 40 MB. Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 2 +- include/asm-x86/page_64.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b037b15be7f8..a007454133a3 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -393,7 +393,7 @@ NEXT_PAGE(level2_kernel_pgt) * too.) 
*/ PMDS(0, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, - KERNEL_TEXT_SIZE/PMD_SIZE) + KERNEL_IMAGE_SIZE/PMD_SIZE) NEXT_PAGE(level2_spare_pgt) .fill 512, 8, 0 diff --git a/include/asm-x86/page_64.h b/include/asm-x86/page_64.h index 3e2e3ca63048..143546073b95 100644 --- a/include/asm-x86/page_64.h +++ b/include/asm-x86/page_64.h @@ -51,8 +51,8 @@ * Kernel image size is limited to 128 MB (see level2_kernel_pgt in * arch/x86/kernel/head_64.S), and it is mapped here: */ -#define KERNEL_TEXT_SIZE (128*1024*1024) -#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL) +#define KERNEL_IMAGE_SIZE (128*1024*1024) +#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL) #ifndef __ASSEMBLY__ void clear_page(void *page); From ce28b9864b853803320c3f1d8de1b81aa4120b14 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 Feb 2008 23:57:30 +0100 Subject: [PATCH 16/24] x86: fix vsyscall wreckage based on a report from Arne Georg Gleditsch about user-space apps misbehaving after toggling /proc/sys/kernel/vsyscall64, a review of the code revealed that the "NOP patching" done there is fundamentally unsafe for a number of reasons: 1) the patching code runs without synchronizing other CPUs 2) it inserts NOPs even if there is no clock source which provides vread 3) when the clock source changes to one without vread we run into exactly the same problem as in #2 4) if nobody toggles the proc entry from 1 to 0 and to 1 again, then the syscall is not patched out as a result it is possible to break user-space via this patching. The only safe thing for now is to remove the patching. This code had been broken since v2.6.21. Reported-by: Arne Georg Gleditsch Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/vsyscall_64.c | 52 ++--------------------------------- 1 file changed, 3 insertions(+), 49 deletions(-) diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 3f8242774580..b6be812fac05 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -44,11 +44,6 @@ #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) #define __syscall_clobber "r11","cx","memory" -#define __pa_vsymbol(x) \ - ({unsigned long v; \ - extern char __vsyscall_0; \ - asm("" : "=r" (v) : "0" (x)); \ - ((v - VSYSCALL_START) + __pa_symbol(&__vsyscall_0)); }) /* * vsyscall_gtod_data contains data that is : @@ -102,7 +97,7 @@ static __always_inline void do_get_tz(struct timezone * tz) static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) { int ret; - asm volatile("vsysc2: syscall" + asm volatile("syscall" : "=a" (ret) : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber ); @@ -112,7 +107,7 @@ static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) static __always_inline long time_syscall(long *t) { long secs; - asm volatile("vsysc1: syscall" + asm volatile("syscall" : "=a" (secs) : "0" (__NR_time),"D" (t) : __syscall_clobber); return secs; @@ -227,50 +222,10 @@ long __vsyscall(3) venosys_1(void) } #ifdef CONFIG_SYSCTL - -#define SYSCALL 0x050f -#define NOP2 0x9090 - -/* - * NOP out syscall in vsyscall page when not needed. - */ -static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - extern u16 vsysc1, vsysc2; - u16 __iomem *map1; - u16 __iomem *map2; - int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - if (!write) - return ret; - /* gcc has some trouble with __va(__pa()), so just do it this - way. 
*/ - map1 = ioremap(__pa_vsymbol(&vsysc1), 2); - if (!map1) - return -ENOMEM; - map2 = ioremap(__pa_vsymbol(&vsysc2), 2); - if (!map2) { - ret = -ENOMEM; - goto out; - } - if (!vsyscall_gtod_data.sysctl_enabled) { - writew(SYSCALL, map1); - writew(SYSCALL, map2); - } else { - writew(NOP2, map1); - writew(NOP2, map2); - } - iounmap(map2); -out: - iounmap(map1); - return ret; -} - static ctl_table kernel_table2[] = { { .procname = "vsyscall64", .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = vsyscall_sysctl_change }, + .mode = 0644 }, {} }; @@ -279,7 +234,6 @@ static ctl_table kernel_root_table2[] = { .child = kernel_table2 }, {} }; - #endif /* Assume __initcall executes before all user space. Hopefully kmod From 5d119b2c9a490e2d647eae134211b32a18a04c7d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 26 Feb 2008 12:55:57 +0100 Subject: [PATCH 17/24] x86: fix execve with -fstack-protect pointed out by pageexec@freemail.hu: > what happens here is that gcc treats the argument area as owned by the > callee, not the caller and is allowed to do certain tricks. for ssp it > will make a copy of the struct passed by value into the local variable > area and pass *its* address down, and it won't copy it back into the > original instance stored in the argument area. > > so once sys_execve returns, the pt_regs passed by value hasn't at all > changed and its default content will cause a nice double fault (FWIW, > this part took me the longest to debug, being down with cold didn't > help it either ;). To fix this we pass in pt_regs by pointer. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/entry_64.S | 6 ++++-- arch/x86/kernel/process_64.c | 6 +++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 2ad9a1bc6a73..c20c9e7e08dd 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -453,6 +453,7 @@ ENTRY(stub_execve) CFI_REGISTER rip, r11 SAVE_REST FIXUP_TOP_OF_STACK %r11 + movq %rsp, %rcx call sys_execve RESTORE_TOP_OF_STACK %r11 movq %rax,RAX(%rsp) @@ -1036,15 +1037,16 @@ ENDPROC(child_rip) * rdi: name, rsi: argv, rdx: envp * * We want to fallback into: - * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs) + * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs) * * do_sys_execve asm fallback arguments: - * rdi: name, rsi: argv, rdx: envp, fake frame on the stack + * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack */ ENTRY(kernel_execve) CFI_STARTPROC FAKE_STACK_FRAME $0 SAVE_ALL + movq %rsp,%rcx call sys_execve movq %rax, RAX(%rsp) RESTORE_REST diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b0cc8f0136d8..43f287744f9f 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -730,16 +730,16 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ asmlinkage long sys_execve(char __user *name, char __user * __user *argv, - char __user * __user *envp, struct pt_regs regs) + char __user * __user *envp, struct pt_regs *regs) { long error; char * filename; filename = getname(name); error = PTR_ERR(filename); - if (IS_ERR(filename)) + if (IS_ERR(filename)) return error; - error = do_execve(filename, argv, envp, ®s); + error = do_execve(filename, argv, envp, regs); putname(filename); return error; } From 4147c8747eace9058c606b35e700060297edaf91 Mon Sep 17 00:00:00 2001 From: Joerg Roedel 
Date: Thu, 21 Feb 2008 15:50:14 +0100 Subject: [PATCH 18/24] x86: don't print a warning when MTRR are blank and running in KVM Inside a KVM virtual machine the MTRRs are usually blank. This confuses Linux and causes a warning message at boot. This patch removes that warning message when running Linux as a KVM guest. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index c8fda3eb3d7e..be83336fddba 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -43,6 +43,7 @@ #include #include #include +#include #include "mtrr.h" u32 num_var_ranges = 0; @@ -689,8 +690,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) /* kvm/qemu doesn't have mtrr set right, don't trim them all */ if (!highest_pfn) { - printk(KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n"); - WARN_ON(1); + if (!kvm_para_available()) { + printk(KERN_WARNING + "WARNING: strange, CPU MTRRs all blank?\n"); + WARN_ON(1); + } return 0; } From ed2b7e2b1d1ae201afe8fbd111632074b7b53ed4 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 22 Feb 2008 21:58:37 +0200 Subject: [PATCH 19/24] x86: don't make swapper_pg_pmd global There doesn't seem to be any reason for swapper_pg_pmd being global. Signed-off-by: Adrian Bunk Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 25eb98540a41..fd8ca53943a8 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -606,7 +606,7 @@ ENTRY(_stext) .section ".bss.page_aligned","wa" .align PAGE_SIZE_asm #ifdef CONFIG_X86_PAE -ENTRY(swapper_pg_pmd) +swapper_pg_pmd: .fill 1024*KPMDS,4,0 #else ENTRY(swapper_pg_dir) From 1650743cdc0db73478f72c57544ce79ea8f3dda6 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 22 Feb 2008 19:23:58 +0100 Subject: [PATCH 20/24] x86: don't save unreliable stack trace entries Currently, there is no way for print_stack_trace() to determine whether a given stack trace entry was deemed reliable or not, simply because save_stack_trace() does not record this information. (Perhaps needless to say, this makes the saved stack traces A LOT harder to read, and probably with no other benefits, since debugging features that use save_stack_trace() most likely also require frame pointers, etc.) This patch reverts to the old behaviour of only recording the reliable trace entries for saved stack traces. 
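For reference, the callback contract involved: the x86 stack walker hands each address to the callback together with a reliable flag (a frame found via frame pointers versus an address merely scanned off the stack), and the patch makes the saving callbacks drop the guesses. A self-contained sketch of that consumer-side filtering, with simplified stand-in types rather than the kernel's:

#include <stdio.h>

struct stack_trace {
	unsigned int nr_entries, max_entries;
	unsigned long entries[8];
};

static void save_stack_address(struct stack_trace *trace,
			       unsigned long addr, int reliable)
{
	if (!reliable)	/* the fix: skip addresses the unwinder only guessed */
		return;
	if (trace->nr_entries < trace->max_entries)
		trace->entries[trace->nr_entries++] = addr;
}

int main(void)
{
	struct stack_trace t = { .max_entries = 8 };

	save_stack_address(&t, 0xc0101000UL, 1);
	save_stack_address(&t, 0xc01fee00UL, 0);	/* a guess: dropped */
	save_stack_address(&t, 0xc0102000UL, 1);
	printf("%u reliable entries saved\n", t.nr_entries);
	return 0;
}

This matches printed backtraces, where the unreliable entries are the ones prefixed with '?'.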
Signed-off-by: Vegard Nossum Acked-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/kernel/stacktrace.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 02f0f61f5b11..c28c342c162f 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -25,6 +25,8 @@ static int save_stack_stack(void *data, char *name) static void save_stack_address(void *data, unsigned long addr, int reliable) { struct stack_trace *trace = data; + if (!reliable) + return; if (trace->skip > 0) { trace->skip--; return; @@ -37,6 +39,8 @@ static void save_stack_address_nosched(void *data, unsigned long addr, int reliable) { struct stack_trace *trace = (struct stack_trace *)data; + if (!reliable) + return; if (in_sched_functions(addr)) return; if (trace->skip > 0) { From 2b775a27c0d9fdf8078d5b31e1e27411e5bf2a91 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 22 Feb 2008 12:09:29 -0300 Subject: [PATCH 21/24] x86: make c_idle.work have a static address. Currently, c_idle is declared on the stack and thus has no static address. Peter Zijlstra points out this simple solution, in which c_idle.work is initialized separately. Note that the INIT_WORK macro has a static declaration of a key inside. Signed-off-by: Glauber Costa Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c index d53bd6fcb428..0880f2c388a9 100644 --- a/arch/x86/kernel/smpboot_64.c +++ b/arch/x86/kernel/smpboot_64.c @@ -554,10 +554,10 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) int timeout; unsigned long start_rip; struct create_idle c_idle = { - .work = __WORK_INITIALIZER(c_idle.work, do_fork_idle), .cpu = cpu, .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), }; + INIT_WORK(&c_idle.work, do_fork_idle); /* allocate memory for gdts of secondary cpus. Hotplug is considered */ if (!cpu_gdt_descr[cpu].address && From 03994f01e8b72b3d01fd3d09d1cc7c9f421a727c Mon Sep 17 00:00:00 2001 From: Priit Laes Date: Sun, 24 Feb 2008 18:36:05 +0200 Subject: [PATCH 22/24] x86: fix build on non-C locales. For some locales the regex range [a-zA-Z] does not work as it is supposed to. So we have to use [:alnum:] and [:xdigit:] to make it work as intended. [1] http://en.wikipedia.org/wiki/Estonian_alphabet Signed-off-by: Ingo Molnar --- arch/x86/vdso/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index f385a4b4a484..b8bd0c4aa02e 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -48,7 +48,7 @@ obj-$(VDSO64-y) += vdso-syms.lds # Match symbols in the DSO that look like VDSO*; produce a file of constants. # sed-vdsosym := -e 's/^00*/0/' \ - -e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p' + -e 's/^\([[:xdigit:]]*\) . \(VDSO[[:alnum:]_]*\)$$/\2 = 0x\1;/p' quiet_cmd_vdsosym = VDSOSYM $@ cmd_vdsosym = $(NM) $< | sed -n $(sed-vdsosym) | LC_ALL=C sort > $@ From 12c247a6719987aad65f83158d2bb3e73c75c1f5 Mon Sep 17 00:00:00 2001 From: Mikael Pettersson Date: Sun, 24 Feb 2008 18:27:03 +0100 Subject: [PATCH 23/24] x86: fix boot failure on 486 due to TSC breakage > Diffing dmesg between git7 and git8 doesn't shed any light since > git8 also removed the printouts of the x86 caps as they were being initialised and updated. 
I'm currently adding those printouts back > in the hope of seeing where and when the caps get broken. That turned out to be very illuminating: --- dmesg-2.6.24-git7 2008-02-24 18:01:25.295851000 +0100 +++ dmesg-2.6.24-git8 2008-02-24 18:01:25.530358000 +0100 ... CPU: After generic identify, caps: 00000003 00000000 00000000 00000000 00000000 00000000 00000000 00000000 CPU: After all inits, caps: 00000003 00000000 00000000 00000000 00000000 00000000 00000000 00000000 +CPU: After applying cleared_cpu_caps, caps: 00000013 00000000 00000000 00000000 00000000 00000000 00000000 00000000 Notice how the TSC cap bit goes from Off to On. (The first two lines are printout loops from -git7 forward-ported to -git8, the third line is the same printout loop added just after the xor-with-cleared_cpu_caps[] loop.) Here's how the breakage occurs: 1. arch/x86/kernel/tsc_32.c:tsc_init() sees !cpu_has_tsc, so bails and calls setup_clear_cpu_cap(X86_FEATURE_TSC). 2. include/asm-x86/cpufeature.h:setup_clear_cpu_cap(bit) clears the bit in boot_cpu_data and sets it in cleared_cpu_caps 3. arch/x86/kernel/cpu/common.c:identify_cpu() XORs all caps in with cleared_cpu_caps HOWEVER, at this point c->x86_capability correctly has TSC Off, cleared_cpu_caps has TSC On, so the XOR incorrectly sets TSC to On in c->x86_capability, with disastrous results. The real bug is that clearing bits with XOR only works if the bits are known to be 1 prior to the XOR, and that's not true here. A simple fix is to convert the XOR to AND-NOT instead. The following patch does that, and allows my 486 to boot 2.6.25-rc kernels again. [ mingo@elte.hu: fixed a similar bug in setup_64.c as well. ] The breakage was introduced via commit 7d851c8d3db0. Signed-off-by: Mikael Pettersson Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/setup_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f86a3c4a2669..a38aafaefc23 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -504,7 +504,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) /* Clear all flags overriden by options */ for (i = 0; i < NCAPINTS; i++) - c->x86_capability[i] ^= cleared_cpu_caps[i]; + c->x86_capability[i] &= ~cleared_cpu_caps[i]; /* Init Machine Check Exception if available. */ mcheck_init(c); diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 6fd804f07821..7637dc91c79b 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -1021,7 +1021,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) /* Clear all flags overriden by options */ for (i = 0; i < NCAPINTS; i++) - c->x86_capability[i] ^= cleared_cpu_caps[i]; + c->x86_capability[i] &= ~cleared_cpu_caps[i]; #ifdef CONFIG_X86_MCE mcheck_init(c); From f18edc95a37a901ffcbe91f5e05105f916a04fae Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 16 Feb 2008 14:05:01 +0100 Subject: [PATCH 24/24] x86: no robust/pi futex for real i386 CPUs Real i386 CPUs do not have cmpxchg instructions. Catch it before crashing on an invalid opcode. 
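The essence of the check, as a standalone sketch (hypothetical helper name and kernel-style negative-errno return; the real code tests boot_cpu_data.x86 as in the hunk below):

#include <stdio.h>
#include <errno.h>

/* cmpxchg first appeared on the i486; family 3 means a real 80386 */
static int futex_cmpxchg_check(int cpu_family)
{
	if (cpu_family == 3)
		return -ENOSYS;	/* no cmpxchg: robust/PI futexes unavailable */
	return 0;		/* safe to take the cmpxchg-based path */
}

int main(void)
{
	printf("family 3: %d, family 6: %d\n",
	       futex_cmpxchg_check(3), futex_cmpxchg_check(6));
	return 0;
}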
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-x86/futex.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/asm-x86/futex.h b/include/asm-x86/futex.h index cd9f894dd2d7..c9952ea9f698 100644 --- a/include/asm-x86/futex.h +++ b/include/asm-x86/futex.h @@ -102,6 +102,13 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr) static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) { + +#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) + /* Real i386 machines have no cmpxchg instruction */ + if (boot_cpu_data.x86 == 3) + return -ENOSYS; +#endif + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) return -EFAULT;