Merge branch 'WIP.x86-pti.entry-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 syscall entry code changes for PTI from Ingo Molnar: "The main changes here are Andy Lutomirski's changes to switch the x86-64 entry code to use the 'per CPU entry trampoline stack'. This, besides helping fix KASLR leaks (the pending Page Table Isolation (PTI) work), also robustifies the x86 entry code" * 'WIP.x86-pti.entry-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (26 commits) x86/cpufeatures: Make CPU bugs sticky x86/paravirt: Provide a way to check for hypervisors x86/paravirt: Dont patch flush_tlb_single x86/entry/64: Make cpu_entry_area.tss read-only x86/entry: Clean up the SYSENTER_stack code x86/entry/64: Remove the SYSENTER stack canary x86/entry/64: Move the IST stacks into struct cpu_entry_area x86/entry/64: Create a per-CPU SYSCALL entry trampoline x86/entry/64: Return to userspace from the trampoline stack x86/entry/64: Use a per-CPU trampoline stack for IDT entries x86/espfix/64: Stop assuming that pt_regs is on the entry stack x86/entry/64: Separate cpu_current_top_of_stack from TSS.sp0 x86/entry: Remap the TSS into the CPU entry area x86/entry: Move SYSENTER_stack to the beginning of struct tss_struct x86/dumpstack: Handle stack overflow on all stacks x86/entry: Fix assumptions that the HW TSS is at the beginning of cpu_tss x86/kasan/64: Teach KASAN about the cpu_entry_area x86/mm/fixmap: Generalize the GDT fixmap mechanism, introduce struct cpu_entry_area x86/entry/gdt: Put per-CPU GDT remaps in ascending order x86/dumpstack: Add get_stack_info() support for the SYSENTER stack ...
This commit is contained in:
commit
64a48099b3
|
@ -941,7 +941,8 @@ ENTRY(debug)
|
|||
movl %esp, %eax # pt_regs pointer
|
||||
|
||||
/* Are we currently on the SYSENTER stack? */
|
||||
PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
|
||||
movl PER_CPU_VAR(cpu_entry_area), %ecx
|
||||
addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
|
||||
subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
|
||||
cmpl $SIZEOF_SYSENTER_stack, %ecx
|
||||
jb .Ldebug_from_sysenter_stack
|
||||
|
@ -984,7 +985,8 @@ ENTRY(nmi)
|
|||
movl %esp, %eax # pt_regs pointer
|
||||
|
||||
/* Are we currently on the SYSENTER stack? */
|
||||
PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
|
||||
movl PER_CPU_VAR(cpu_entry_area), %ecx
|
||||
addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
|
||||
subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
|
||||
cmpl $SIZEOF_SYSENTER_stack, %ecx
|
||||
jb .Lnmi_from_sysenter_stack
|
||||
|
|
|
@ -140,6 +140,64 @@ END(native_usergs_sysret64)
|
|||
* with them due to bugs in both AMD and Intel CPUs.
|
||||
*/
|
||||
|
||||
.pushsection .entry_trampoline, "ax"
|
||||
|
||||
/*
|
||||
* The code in here gets remapped into cpu_entry_area's trampoline. This means
|
||||
* that the assembler and linker have the wrong idea as to where this code
|
||||
* lives (and, in fact, it's mapped more than once, so it's not even at a
|
||||
* fixed address). So we can't reference any symbols outside the entry
|
||||
* trampoline and expect it to work.
|
||||
*
|
||||
* Instead, we carefully abuse %rip-relative addressing.
|
||||
* _entry_trampoline(%rip) refers to the start of the (remapped) entry
|
||||
* trampoline. We can thus find cpu_entry_area with this macro:
|
||||
*/
|
||||
|
||||
#define CPU_ENTRY_AREA \
|
||||
_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
|
||||
|
||||
/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
|
||||
#define RSP_SCRATCH CPU_ENTRY_AREA_SYSENTER_stack + \
|
||||
SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
|
||||
|
||||
ENTRY(entry_SYSCALL_64_trampoline)
|
||||
UNWIND_HINT_EMPTY
|
||||
swapgs
|
||||
|
||||
/* Stash the user RSP. */
|
||||
movq %rsp, RSP_SCRATCH
|
||||
|
||||
/* Load the top of the task stack into RSP */
|
||||
movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
|
||||
|
||||
/* Start building the simulated IRET frame. */
|
||||
pushq $__USER_DS /* pt_regs->ss */
|
||||
pushq RSP_SCRATCH /* pt_regs->sp */
|
||||
pushq %r11 /* pt_regs->flags */
|
||||
pushq $__USER_CS /* pt_regs->cs */
|
||||
pushq %rcx /* pt_regs->ip */
|
||||
|
||||
/*
|
||||
* x86 lacks a near absolute jump, and we can't jump to the real
|
||||
* entry text with a relative jump. We could push the target
|
||||
* address and then use retq, but this destroys the pipeline on
|
||||
* many CPUs (wasting over 20 cycles on Sandy Bridge). Instead,
|
||||
* spill RDI and restore it in a second-stage trampoline.
|
||||
*/
|
||||
pushq %rdi
|
||||
movq $entry_SYSCALL_64_stage2, %rdi
|
||||
jmp *%rdi
|
||||
END(entry_SYSCALL_64_trampoline)
|
||||
|
||||
.popsection
|
||||
|
||||
/*
 * Second stage of the SYSCALL trampoline. entry_SYSCALL_64_trampoline
 * pushes RDI and uses it to hold the target of its indirect jump to this
 * code; undo that spill and continue at entry_SYSCALL_64_after_hwframe.
 */
ENTRY(entry_SYSCALL_64_stage2)
|
||||
UNWIND_HINT_EMPTY
|
||||
popq %rdi /* restore the RDI pushed by the first-stage trampoline */
|
||||
jmp entry_SYSCALL_64_after_hwframe
|
||||
END(entry_SYSCALL_64_stage2)
|
||||
|
||||
ENTRY(entry_SYSCALL_64)
|
||||
UNWIND_HINT_EMPTY
|
||||
/*
|
||||
|
@ -330,8 +388,24 @@ syscall_return_via_sysret:
|
|||
popq %rsi /* skip rcx */
|
||||
popq %rdx
|
||||
popq %rsi
|
||||
|
||||
/*
|
||||
* Now all regs are restored except RSP and RDI.
|
||||
* Save old stack pointer and switch to trampoline stack.
|
||||
*/
|
||||
movq %rsp, %rdi
|
||||
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
|
||||
|
||||
pushq RSP-RDI(%rdi) /* RSP */
|
||||
pushq (%rdi) /* RDI */
|
||||
|
||||
/*
|
||||
* We are on the trampoline stack. All regs except RDI are live.
|
||||
* We can do future final exit work right here.
|
||||
*/
|
||||
|
||||
popq %rdi
|
||||
movq RSP-ORIG_RAX(%rsp), %rsp
|
||||
popq %rsp
|
||||
USERGS_SYSRET64
|
||||
END(entry_SYSCALL_64)
|
||||
|
||||
|
@ -466,12 +540,13 @@ END(irq_entries_start)
|
|||
|
||||
.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
|
||||
#ifdef CONFIG_DEBUG_ENTRY
|
||||
pushfq
|
||||
testl $X86_EFLAGS_IF, (%rsp)
|
||||
pushq %rax
|
||||
SAVE_FLAGS(CLBR_RAX)
|
||||
testl $X86_EFLAGS_IF, %eax
|
||||
jz .Lokay_\@
|
||||
ud2
|
||||
.Lokay_\@:
|
||||
addq $8, %rsp
|
||||
popq %rax
|
||||
#endif
|
||||
.endm
|
||||
|
||||
|
@ -563,6 +638,13 @@ END(irq_entries_start)
|
|||
/* 0(%rsp): ~(interrupt number) */
|
||||
.macro interrupt func
|
||||
cld
|
||||
|
||||
testb $3, CS-ORIG_RAX(%rsp)
|
||||
jz 1f
|
||||
SWAPGS
|
||||
call switch_to_thread_stack
|
||||
1:
|
||||
|
||||
ALLOC_PT_GPREGS_ON_STACK
|
||||
SAVE_C_REGS
|
||||
SAVE_EXTRA_REGS
|
||||
|
@ -572,12 +654,8 @@ END(irq_entries_start)
|
|||
jz 1f
|
||||
|
||||
/*
|
||||
* IRQ from user mode. Switch to kernel gsbase and inform context
|
||||
* tracking that we're in kernel mode.
|
||||
*/
|
||||
SWAPGS
|
||||
|
||||
/*
|
||||
* IRQ from user mode.
|
||||
*
|
||||
* We need to tell lockdep that IRQs are off. We can't do this until
|
||||
* we fix gsbase, and we should do it before enter_from_user_mode
|
||||
* (which can take locks). Since TRACE_IRQS_OFF is idempotent,
|
||||
|
@ -630,10 +708,41 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
|
|||
ud2
|
||||
1:
|
||||
#endif
|
||||
SWAPGS
|
||||
POP_EXTRA_REGS
|
||||
POP_C_REGS
|
||||
addq $8, %rsp /* skip regs->orig_ax */
|
||||
popq %r11
|
||||
popq %r10
|
||||
popq %r9
|
||||
popq %r8
|
||||
popq %rax
|
||||
popq %rcx
|
||||
popq %rdx
|
||||
popq %rsi
|
||||
|
||||
/*
|
||||
* The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
|
||||
* Save old stack pointer and switch to trampoline stack.
|
||||
*/
|
||||
movq %rsp, %rdi
|
||||
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
|
||||
|
||||
/* Copy the IRET frame to the trampoline stack. */
|
||||
pushq 6*8(%rdi) /* SS */
|
||||
pushq 5*8(%rdi) /* RSP */
|
||||
pushq 4*8(%rdi) /* EFLAGS */
|
||||
pushq 3*8(%rdi) /* CS */
|
||||
pushq 2*8(%rdi) /* RIP */
|
||||
|
||||
/* Push user RDI on the trampoline stack. */
|
||||
pushq (%rdi)
|
||||
|
||||
/*
|
||||
* We are on the trampoline stack. All regs except RDI are live.
|
||||
* We can do future final exit work right here.
|
||||
*/
|
||||
|
||||
/* Restore RDI. */
|
||||
popq %rdi
|
||||
SWAPGS
|
||||
INTERRUPT_RETURN
|
||||
|
||||
|
||||
|
@ -829,7 +938,33 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
|
|||
/*
|
||||
* Exception entry points.
|
||||
*/
|
||||
#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
|
||||
#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
|
||||
|
||||
/*
|
||||
* Switch to the thread stack. This is called with the IRET frame and
|
||||
* orig_ax on the stack. (That is, RDI..R12 are not on the stack and
|
||||
* space has not been allocated for them.)
|
||||
*/
|
||||
ENTRY(switch_to_thread_stack)
|
||||
UNWIND_HINT_FUNC
|
||||
|
||||
pushq %rdi
|
||||
movq %rsp, %rdi
|
||||
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
|
||||
UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
|
||||
|
||||
pushq 7*8(%rdi) /* regs->ss */
|
||||
pushq 6*8(%rdi) /* regs->rsp */
|
||||
pushq 5*8(%rdi) /* regs->eflags */
|
||||
pushq 4*8(%rdi) /* regs->cs */
|
||||
pushq 3*8(%rdi) /* regs->ip */
|
||||
pushq 2*8(%rdi) /* regs->orig_ax */
|
||||
pushq 8(%rdi) /* return address */
|
||||
UNWIND_HINT_FUNC
|
||||
|
||||
movq (%rdi), %rdi
|
||||
ret
|
||||
END(switch_to_thread_stack)
|
||||
|
||||
.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
|
||||
ENTRY(\sym)
|
||||
|
@ -848,11 +983,12 @@ ENTRY(\sym)
|
|||
|
||||
ALLOC_PT_GPREGS_ON_STACK
|
||||
|
||||
.if \paranoid
|
||||
.if \paranoid == 1
|
||||
.if \paranoid < 2
|
||||
testb $3, CS(%rsp) /* If coming from userspace, switch stacks */
|
||||
jnz 1f
|
||||
jnz .Lfrom_usermode_switch_stack_\@
|
||||
.endif
|
||||
|
||||
.if \paranoid
|
||||
call paranoid_entry
|
||||
.else
|
||||
call error_entry
|
||||
|
@ -894,20 +1030,15 @@ ENTRY(\sym)
|
|||
jmp error_exit
|
||||
.endif
|
||||
|
||||
.if \paranoid == 1
|
||||
.if \paranoid < 2
|
||||
/*
|
||||
* Paranoid entry from userspace. Switch stacks and treat it
|
||||
* Entry from userspace. Switch stacks and treat it
|
||||
* as a normal entry. This means that paranoid handlers
|
||||
* run in real process context if user_mode(regs).
|
||||
*/
|
||||
1:
|
||||
.Lfrom_usermode_switch_stack_\@:
|
||||
call error_entry
|
||||
|
||||
|
||||
movq %rsp, %rdi /* pt_regs pointer */
|
||||
call sync_regs
|
||||
movq %rax, %rsp /* switch stack */
|
||||
|
||||
movq %rsp, %rdi /* pt_regs pointer */
|
||||
|
||||
.if \has_error_code
|
||||
|
@ -1170,6 +1301,14 @@ ENTRY(error_entry)
|
|||
SWAPGS
|
||||
|
||||
.Lerror_entry_from_usermode_after_swapgs:
|
||||
/* Put us onto the real thread stack. */
|
||||
popq %r12 /* save return addr in %r12 */
|
||||
movq %rsp, %rdi /* arg0 = pt_regs pointer */
|
||||
call sync_regs
|
||||
movq %rax, %rsp /* switch stack */
|
||||
ENCODE_FRAME_POINTER
|
||||
pushq %r12
|
||||
|
||||
/*
|
||||
* We need to tell lockdep that IRQs are off. We can't do this until
|
||||
* we fix gsbase, and we should do it before enter_from_user_mode
|
||||
|
|
|
@ -48,7 +48,7 @@
|
|||
*/
|
||||
ENTRY(entry_SYSENTER_compat)
|
||||
/* Interrupts are off on entry. */
|
||||
SWAPGS_UNSAFE_STACK
|
||||
SWAPGS
|
||||
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
|
||||
|
||||
/*
|
||||
|
@ -306,8 +306,11 @@ ENTRY(entry_INT80_compat)
|
|||
*/
|
||||
movl %eax, %eax
|
||||
|
||||
/* Construct struct pt_regs on stack (iret frame is already on stack) */
|
||||
pushq %rax /* pt_regs->orig_ax */
|
||||
|
||||
/* switch to thread stack expects orig_ax to be pushed */
|
||||
call switch_to_thread_stack
|
||||
|
||||
pushq %rdi /* pt_regs->di */
|
||||
pushq %rsi /* pt_regs->si */
|
||||
pushq %rdx /* pt_regs->dx */
|
||||
|
|
|
@ -135,6 +135,8 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
|
|||
set_bit(bit, (unsigned long *)cpu_caps_set); \
|
||||
} while (0)
|
||||
|
||||
#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
|
||||
|
||||
#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
|
||||
/*
|
||||
* Static testing of CPU features. Used the same as boot_cpu_has().
|
||||
|
|
|
@ -60,17 +60,10 @@ static inline struct desc_struct *get_current_gdt_rw(void)
|
|||
return this_cpu_ptr(&gdt_page)->gdt;
|
||||
}
|
||||
|
||||
/* Get the fixmap index for a specific processor */
|
||||
static inline unsigned int get_cpu_gdt_ro_index(int cpu)
|
||||
{
|
||||
return FIX_GDT_REMAP_BEGIN + cpu;
|
||||
}
|
||||
|
||||
/* Provide the fixmap address of the remapped GDT */
|
||||
static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
|
||||
{
|
||||
unsigned int idx = get_cpu_gdt_ro_index(cpu);
|
||||
return (struct desc_struct *)__fix_to_virt(idx);
|
||||
return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt;
|
||||
}
|
||||
|
||||
/* Provide the current read-only GDT */
|
||||
|
@ -185,7 +178,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr,
|
|||
#endif
|
||||
}
|
||||
|
||||
static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
|
||||
static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr)
|
||||
{
|
||||
struct desc_struct *d = get_cpu_gdt_rw(cpu);
|
||||
tss_desc tss;
|
||||
|
|
|
@ -44,6 +44,45 @@ extern unsigned long __FIXADDR_TOP;
|
|||
PAGE_SIZE)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* cpu_entry_area is a percpu region in the fixmap that contains things
|
||||
* needed by the CPU and early entry/exit code. Real types aren't used
|
||||
* for all fields here to avoid circular header dependencies.
|
||||
*
|
||||
* Every field is a virtual alias of some other allocated backing store.
|
||||
* There is no direct allocation of a struct cpu_entry_area.
|
||||
*/
|
||||
struct cpu_entry_area {
|
||||
char gdt[PAGE_SIZE];
|
||||
|
||||
/*
|
||||
* The GDT is just below SYSENTER_stack and thus serves (on x86_64) as
|
||||
* a read-only guard page.
|
||||
*/
|
||||
struct SYSENTER_stack_page SYSENTER_stack_page;
|
||||
|
||||
/*
|
||||
* On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because
|
||||
* we need task switches to work, and task switches write to the TSS.
|
||||
*/
|
||||
struct tss_struct tss;
|
||||
|
||||
char entry_trampoline[PAGE_SIZE];
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
/*
|
||||
* Exception stacks used for IST entries.
|
||||
*
|
||||
* In the future, this should have a separate slot for each stack
|
||||
* with guard pages between them.
|
||||
*/
|
||||
char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
|
||||
#endif
|
||||
};
|
||||
|
||||
#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
|
||||
|
||||
extern void setup_cpu_entry_areas(void);
|
||||
|
||||
/*
|
||||
* Here we define all the compile-time 'special' virtual
|
||||
|
@ -101,8 +140,8 @@ enum fixed_addresses {
|
|||
FIX_LNW_VRTC,
|
||||
#endif
|
||||
/* Fixmap entries to remap the GDTs, one per processor. */
|
||||
FIX_GDT_REMAP_BEGIN,
|
||||
FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,
|
||||
FIX_CPU_ENTRY_AREA_TOP,
|
||||
FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1,
|
||||
|
||||
#ifdef CONFIG_ACPI_APEI_GHES
|
||||
/* Used for GHES mapping from assorted contexts */
|
||||
|
@ -191,5 +230,30 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
|
|||
void __early_set_fixmap(enum fixed_addresses idx,
|
||||
phys_addr_t phys, pgprot_t flags);
|
||||
|
||||
/*
 * Return the fixmap index of page number 'page' inside the cpu_entry_area
 * belonging to 'cpu'. Indices count downwards from
 * FIX_CPU_ENTRY_AREA_BOTTOM: each CPU occupies CPU_ENTRY_AREA_PAGES
 * consecutive slots, and higher page numbers within an area map to lower
 * fixmap indices.
 */
static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page)
|
||||
{
|
||||
/* The per-page index arithmetic below needs a whole number of pages. */
BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
|
||||
|
||||
return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page;
|
||||
}
|
||||
|
||||
#define __get_cpu_entry_area_offset_index(cpu, offset) ({ \
|
||||
BUILD_BUG_ON(offset % PAGE_SIZE != 0); \
|
||||
__get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE); \
|
||||
})
|
||||
|
||||
#define get_cpu_entry_area_index(cpu, field) \
|
||||
__get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field))
|
||||
|
||||
/*
 * Return the fixmap virtual address of the cpu_entry_area for 'cpu', i.e.
 * the address that the fixmap index of the area's page 0 translates to.
 */
static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
|
||||
{
|
||||
return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0));
|
||||
}
|
||||
|
||||
/*
 * Return a pointer to the SYSENTER stack of 'cpu' as seen through that
 * CPU's cpu_entry_area mapping (the 'stack' member of SYSENTER_stack_page).
 */
static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu)
|
||||
{
|
||||
return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack;
|
||||
}
|
||||
|
||||
#endif /* !__ASSEMBLY__ */
|
||||
#endif /* _ASM_X86_FIXMAP_H */
|
||||
|
|
|
@ -20,16 +20,7 @@
|
|||
#ifndef _ASM_X86_HYPERVISOR_H
|
||||
#define _ASM_X86_HYPERVISOR_H
|
||||
|
||||
#ifdef CONFIG_HYPERVISOR_GUEST
|
||||
|
||||
#include <asm/kvm_para.h>
|
||||
#include <asm/x86_init.h>
|
||||
#include <asm/xen/hypervisor.h>
|
||||
|
||||
/*
|
||||
* x86 hypervisor information
|
||||
*/
|
||||
|
||||
/* x86 hypervisor types */
|
||||
enum x86_hypervisor_type {
|
||||
X86_HYPER_NATIVE = 0,
|
||||
X86_HYPER_VMWARE,
|
||||
|
@ -39,6 +30,12 @@ enum x86_hypervisor_type {
|
|||
X86_HYPER_KVM,
|
||||
};
|
||||
|
||||
#ifdef CONFIG_HYPERVISOR_GUEST
|
||||
|
||||
#include <asm/kvm_para.h>
|
||||
#include <asm/x86_init.h>
|
||||
#include <asm/xen/hypervisor.h>
|
||||
|
||||
struct hypervisor_x86 {
|
||||
/* Hypervisor name */
|
||||
const char *name;
|
||||
|
@ -58,7 +55,15 @@ struct hypervisor_x86 {
|
|||
|
||||
extern enum x86_hypervisor_type x86_hyper_type;
|
||||
extern void init_hypervisor_platform(void);
|
||||
static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
|
||||
{
|
||||
return x86_hyper_type == type;
|
||||
}
|
||||
#else
|
||||
static inline void init_hypervisor_platform(void) { }
|
||||
static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
|
||||
{
|
||||
return type == X86_HYPER_NATIVE;
|
||||
}
|
||||
#endif /* CONFIG_HYPERVISOR_GUEST */
|
||||
#endif /* _ASM_X86_HYPERVISOR_H */
|
||||
|
|
|
@ -142,6 +142,9 @@ static inline notrace unsigned long arch_local_irq_save(void)
|
|||
swapgs; \
|
||||
sysretl
|
||||
|
||||
#ifdef CONFIG_DEBUG_ENTRY
|
||||
#define SAVE_FLAGS(x) pushfq; popq %rax
|
||||
#endif
|
||||
#else
|
||||
#define INTERRUPT_RETURN iret
|
||||
#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
|
||||
|
|
|
@ -26,6 +26,7 @@ extern void die(const char *, struct pt_regs *,long);
|
|||
extern int __must_check __die(const char *, struct pt_regs *, long);
|
||||
extern void show_stack_regs(struct pt_regs *regs);
|
||||
extern void __show_regs(struct pt_regs *regs, int all);
|
||||
extern void show_iret_regs(struct pt_regs *regs);
|
||||
extern unsigned long oops_begin(void);
|
||||
extern void oops_end(unsigned long, struct pt_regs *, int signr);
|
||||
|
||||
|
|
|
@ -927,6 +927,15 @@ extern void default_banner(void);
|
|||
PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
|
||||
CLBR_NONE, \
|
||||
jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
|
||||
|
||||
#ifdef CONFIG_DEBUG_ENTRY
|
||||
#define SAVE_FLAGS(clobbers) \
|
||||
PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \
|
||||
PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
|
||||
call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl); \
|
||||
PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
|
||||
#endif
|
||||
|
||||
#endif /* CONFIG_X86_32 */
|
||||
|
||||
#endif /* __ASSEMBLY__ */
|
||||
|
|
|
@ -163,9 +163,9 @@ enum cpuid_regs_idx {
|
|||
extern struct cpuinfo_x86 boot_cpu_data;
|
||||
extern struct cpuinfo_x86 new_cpu_data;
|
||||
|
||||
extern struct tss_struct doublefault_tss;
|
||||
extern __u32 cpu_caps_cleared[NCAPINTS];
|
||||
extern __u32 cpu_caps_set[NCAPINTS];
|
||||
extern struct x86_hw_tss doublefault_tss;
|
||||
extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
|
||||
extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS];
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
|
||||
|
@ -253,6 +253,11 @@ static inline void load_cr3(pgd_t *pgdir)
|
|||
write_cr3(__sme_pa(pgdir));
|
||||
}
|
||||
|
||||
/*
|
||||
* Note that while the legacy 'TSS' name comes from 'Task State Segment',
|
||||
* on modern x86 CPUs the TSS also holds information important to 64-bit mode,
|
||||
* unrelated to the task-switch mechanism:
|
||||
*/
|
||||
#ifdef CONFIG_X86_32
|
||||
/* This is the TSS defined by the hardware. */
|
||||
struct x86_hw_tss {
|
||||
|
@ -305,7 +310,13 @@ struct x86_hw_tss {
|
|||
struct x86_hw_tss {
|
||||
u32 reserved1;
|
||||
u64 sp0;
|
||||
|
||||
/*
|
||||
* We store cpu_current_top_of_stack in sp1 so it's always accessible.
|
||||
* Linux does not use ring 1, so sp1 is not otherwise needed.
|
||||
*/
|
||||
u64 sp1;
|
||||
|
||||
u64 sp2;
|
||||
u64 reserved2;
|
||||
u64 ist[7];
|
||||
|
@ -323,12 +334,22 @@ struct x86_hw_tss {
|
|||
#define IO_BITMAP_BITS 65536
|
||||
#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
|
||||
#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
|
||||
#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
|
||||
#define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))
|
||||
#define INVALID_IO_BITMAP_OFFSET 0x8000
|
||||
|
||||
struct SYSENTER_stack {
|
||||
unsigned long words[64];
|
||||
};
|
||||
|
||||
struct SYSENTER_stack_page {
|
||||
struct SYSENTER_stack stack;
|
||||
} __aligned(PAGE_SIZE);
|
||||
|
||||
struct tss_struct {
|
||||
/*
|
||||
* The hardware state:
|
||||
* The fixed hardware portion. This must not cross a page boundary
|
||||
* at risk of violating the SDM's advice and potentially triggering
|
||||
* errata.
|
||||
*/
|
||||
struct x86_hw_tss x86_tss;
|
||||
|
||||
|
@ -339,18 +360,9 @@ struct tss_struct {
|
|||
* be within the limit.
|
||||
*/
|
||||
unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
|
||||
} __aligned(PAGE_SIZE);
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
/*
|
||||
* Space for the temporary SYSENTER stack.
|
||||
*/
|
||||
unsigned long SYSENTER_stack_canary;
|
||||
unsigned long SYSENTER_stack[64];
|
||||
#endif
|
||||
|
||||
} ____cacheline_aligned;
|
||||
|
||||
DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
|
||||
DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
|
||||
|
||||
/*
|
||||
* sizeof(unsigned long) coming from an extra "long" at the end
|
||||
|
@ -364,6 +376,9 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
|
|||
|
||||
#ifdef CONFIG_X86_32
|
||||
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
|
||||
#else
|
||||
/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
|
||||
#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
|
||||
#endif
|
||||
|
||||
/*
|
||||
|
@ -523,7 +538,7 @@ static inline void native_set_iopl_mask(unsigned mask)
|
|||
static inline void
|
||||
native_load_sp0(unsigned long sp0)
|
||||
{
|
||||
this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
|
||||
this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
|
||||
}
|
||||
|
||||
static inline void native_swapgs(void)
|
||||
|
@ -535,12 +550,12 @@ static inline void native_swapgs(void)
|
|||
|
||||
static inline unsigned long current_top_of_stack(void)
|
||||
{
|
||||
#ifdef CONFIG_X86_64
|
||||
return this_cpu_read_stable(cpu_tss.x86_tss.sp0);
|
||||
#else
|
||||
/* sp0 on x86_32 is special in and around vm86 mode. */
|
||||
/*
|
||||
* We can't read directly from tss.sp0: sp0 on x86_32 is special in
|
||||
* and around vm86 mode and sp0 on x86_64 is special because of the
|
||||
* entry trampoline.
|
||||
*/
|
||||
return this_cpu_read_stable(cpu_current_top_of_stack);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline bool on_thread_stack(void)
|
||||
|
|
|
@ -16,6 +16,7 @@ enum stack_type {
|
|||
STACK_TYPE_TASK,
|
||||
STACK_TYPE_IRQ,
|
||||
STACK_TYPE_SOFTIRQ,
|
||||
STACK_TYPE_SYSENTER,
|
||||
STACK_TYPE_EXCEPTION,
|
||||
STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
|
||||
};
|
||||
|
@ -28,6 +29,8 @@ struct stack_info {
|
|||
bool in_task_stack(unsigned long *stack, struct task_struct *task,
|
||||
struct stack_info *info);
|
||||
|
||||
bool in_sysenter_stack(unsigned long *stack, struct stack_info *info);
|
||||
|
||||
int get_stack_info(unsigned long *stack, struct task_struct *task,
|
||||
struct stack_info *info, unsigned long *visit_mask);
|
||||
|
||||
|
|
|
@ -79,10 +79,10 @@ do { \
|
|||
static inline void refresh_sysenter_cs(struct thread_struct *thread)
|
||||
{
|
||||
/* Only happens when SEP is enabled, no need to test "SEP"arately: */
|
||||
if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs))
|
||||
if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs))
|
||||
return;
|
||||
|
||||
this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs);
|
||||
this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs);
|
||||
wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
|
||||
}
|
||||
#endif
|
||||
|
@ -90,10 +90,12 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread)
|
|||
/* This is used when switching tasks or entering/exiting vm86 mode. */
|
||||
static inline void update_sp0(struct task_struct *task)
|
||||
{
|
||||
/* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */
|
||||
#ifdef CONFIG_X86_32
|
||||
load_sp0(task->thread.sp0);
|
||||
#else
|
||||
load_sp0(task_top_of_stack(task));
|
||||
if (static_cpu_has(X86_FEATURE_XENPV))
|
||||
load_sp0(task_top_of_stack(task));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
@ -207,7 +207,7 @@ static inline int arch_within_stack_frames(const void * const stack,
|
|||
#else /* !__ASSEMBLY__ */
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
# define cpu_current_top_of_stack (cpu_tss + TSS_sp0)
|
||||
# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
|
@ -75,7 +75,6 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, long);
|
|||
dotraplinkage void do_stack_segment(struct pt_regs *, long);
|
||||
#ifdef CONFIG_X86_64
|
||||
dotraplinkage void do_double_fault(struct pt_regs *, long);
|
||||
asmlinkage struct pt_regs *sync_regs(struct pt_regs *);
|
||||
#endif
|
||||
dotraplinkage void do_general_protection(struct pt_regs *, long);
|
||||
dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
|
||||
|
|
|
@ -7,6 +7,9 @@
|
|||
#include <asm/ptrace.h>
|
||||
#include <asm/stacktrace.h>
|
||||
|
||||
#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip))
|
||||
#define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET)
|
||||
|
||||
struct unwind_state {
|
||||
struct stack_info stack_info;
|
||||
unsigned long stack_mask;
|
||||
|
@ -52,6 +55,10 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,
|
|||
}
|
||||
|
||||
#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
|
||||
/*
|
||||
* WARNING: The entire pt_regs may not be safe to dereference. In some cases,
|
||||
* only the iret frame registers are accessible. Use with caution!
|
||||
*/
|
||||
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
|
||||
{
|
||||
if (unwind_done(state))
|
||||
|
|
|
@ -93,4 +93,10 @@ void common(void) {
|
|||
|
||||
BLANK();
|
||||
DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
|
||||
|
||||
/* Layout info for cpu_entry_area */
|
||||
OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
|
||||
OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
|
||||
OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page);
|
||||
DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
|
||||
}
|
||||
|
|
|
@ -47,13 +47,8 @@ void foo(void)
|
|||
BLANK();
|
||||
|
||||
/* Offset from the sysenter stack to tss.sp0 */
|
||||
DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
|
||||
offsetofend(struct tss_struct, SYSENTER_stack));
|
||||
|
||||
/* Offset from cpu_tss to SYSENTER_stack */
|
||||
OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
|
||||
/* Size of SYSENTER_stack */
|
||||
DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
|
||||
DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
|
||||
offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack));
|
||||
|
||||
#ifdef CONFIG_CC_STACKPROTECTOR
|
||||
BLANK();
|
||||
|
|
|
@ -23,6 +23,9 @@ int main(void)
|
|||
#ifdef CONFIG_PARAVIRT
|
||||
OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
|
||||
OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
|
||||
#ifdef CONFIG_DEBUG_ENTRY
|
||||
OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl);
|
||||
#endif
|
||||
BLANK();
|
||||
#endif
|
||||
|
||||
|
@ -63,6 +66,7 @@ int main(void)
|
|||
|
||||
OFFSET(TSS_ist, tss_struct, x86_tss.ist);
|
||||
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
|
||||
OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
|
||||
BLANK();
|
||||
|
||||
#ifdef CONFIG_CC_STACKPROTECTOR
|
||||
|
|
|
@ -476,8 +476,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
|
|||
return NULL; /* Not found */
|
||||
}
|
||||
|
||||
__u32 cpu_caps_cleared[NCAPINTS];
|
||||
__u32 cpu_caps_set[NCAPINTS];
|
||||
__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
|
||||
__u32 cpu_caps_set[NCAPINTS + NBUGINTS];
|
||||
|
||||
void load_percpu_segment(int cpu)
|
||||
{
|
||||
|
@ -490,27 +490,116 @@ void load_percpu_segment(int cpu)
|
|||
load_stack_canary_segment();
|
||||
}
|
||||
|
||||
/* Setup the fixmap mapping only once per-processor */
|
||||
static inline void setup_fixmap_gdt(int cpu)
|
||||
#ifdef CONFIG_X86_32
|
||||
/* The 32-bit entry code needs to find cpu_entry_area. */
|
||||
DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
/*
|
||||
* Special IST stacks which the CPU switches to when it calls
|
||||
* an IST-marked descriptor entry. Up to 7 stacks (hardware
|
||||
* limit), all of them are 4K, except the debug stack which
|
||||
* is 8K.
|
||||
*/
|
||||
static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
|
||||
[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
|
||||
[DEBUG_STACK - 1] = DEBUG_STKSZ
|
||||
};
|
||||
|
||||
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
|
||||
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
|
||||
#endif
|
||||
|
||||
static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page,
|
||||
SYSENTER_stack_storage);
|
||||
|
||||
/*
 * Install 'pages' consecutive pages of backing store, starting at 'ptr',
 * into the fixmap with protection 'prot'. The first page is placed at
 * fixmap index 'idx' and each following page at the next lower index.
 * The physical address of each page is looked up with
 * per_cpu_ptr_to_phys().
 */
static void __init
|
||||
set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
|
||||
{
|
||||
/* idx counts down while ptr advances by one page per iteration. */
for ( ; pages; pages--, idx--, ptr += PAGE_SIZE)
|
||||
__set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot);
|
||||
}
|
||||
|
||||
/* Setup the fixmap mappings only once per-processor */
|
||||
static void __init setup_cpu_entry_area(int cpu)
|
||||
{
|
||||
#ifdef CONFIG_X86_64
|
||||
/* On 64-bit systems, we use a read-only fixmap GDT. */
|
||||
pgprot_t prot = PAGE_KERNEL_RO;
|
||||
extern char _entry_trampoline[];
|
||||
|
||||
/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
|
||||
pgprot_t gdt_prot = PAGE_KERNEL_RO;
|
||||
pgprot_t tss_prot = PAGE_KERNEL_RO;
|
||||
#else
|
||||
/*
|
||||
* On native 32-bit systems, the GDT cannot be read-only because
|
||||
* our double fault handler uses a task gate, and entering through
|
||||
* a task gate needs to change an available TSS to busy. If the GDT
|
||||
* is read-only, that will triple fault.
|
||||
* a task gate needs to change an available TSS to busy. If the
|
||||
* GDT is read-only, that will triple fault. The TSS cannot be
|
||||
* read-only because the CPU writes to it on task switches.
|
||||
*
|
||||
* On Xen PV, the GDT must be read-only because the hypervisor requires
|
||||
* it.
|
||||
* On Xen PV, the GDT must be read-only because the hypervisor
|
||||
* requires it.
|
||||
*/
|
||||
pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ?
|
||||
pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
|
||||
PAGE_KERNEL_RO : PAGE_KERNEL;
|
||||
pgprot_t tss_prot = PAGE_KERNEL;
|
||||
#endif
|
||||
|
||||
__set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot);
|
||||
__set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
|
||||
set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page),
|
||||
per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1,
|
||||
PAGE_KERNEL);
|
||||
|
||||
/*
|
||||
* The Intel SDM says (Volume 3, 7.2.1):
|
||||
*
|
||||
* Avoid placing a page boundary in the part of the TSS that the
|
||||
* processor reads during a task switch (the first 104 bytes). The
|
||||
* processor may not correctly perform address translations if a
|
||||
* boundary occurs in this area. During a task switch, the processor
|
||||
* reads and writes into the first 104 bytes of each TSS (using
|
||||
* contiguous physical addresses beginning with the physical address
|
||||
* of the first byte of the TSS). So, after TSS access begins, if
|
||||
* part of the 104 bytes is not physically contiguous, the processor
|
||||
* will access incorrect information without generating a page-fault
|
||||
* exception.
|
||||
*
|
||||
* There are also a lot of errata involving the TSS spanning a page
|
||||
* boundary. Assert that we're not doing that.
|
||||
*/
|
||||
BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
|
||||
offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
|
||||
BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
|
||||
set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
|
||||
&per_cpu(cpu_tss_rw, cpu),
|
||||
sizeof(struct tss_struct) / PAGE_SIZE,
|
||||
tss_prot);
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
|
||||
BUILD_BUG_ON(sizeof(exception_stacks) !=
|
||||
sizeof(((struct cpu_entry_area *)0)->exception_stacks));
|
||||
set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks),
|
||||
&per_cpu(exception_stacks, cpu),
|
||||
sizeof(exception_stacks) / PAGE_SIZE,
|
||||
PAGE_KERNEL);
|
||||
|
||||
__set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
|
||||
__pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Build the entry-area fixmap mappings for every possible CPU. */
void __init setup_cpu_entry_areas(void)
{
	unsigned int this_cpu;

	for_each_possible_cpu(this_cpu) {
		setup_cpu_entry_area(this_cpu);
	}
}
|
||||
|
||||
/* Load the original GDT from the per-cpu structure */
|
||||
|
@ -747,7 +836,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)
|
|||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < NCAPINTS; i++) {
|
||||
for (i = 0; i < NCAPINTS + NBUGINTS; i++) {
|
||||
c->x86_capability[i] &= ~cpu_caps_cleared[i];
|
||||
c->x86_capability[i] |= cpu_caps_set[i];
|
||||
}
|
||||
|
@ -1250,7 +1339,7 @@ void enable_sep_cpu(void)
|
|||
return;
|
||||
|
||||
cpu = get_cpu();
|
||||
tss = &per_cpu(cpu_tss, cpu);
|
||||
tss = &per_cpu(cpu_tss_rw, cpu);
|
||||
|
||||
/*
|
||||
* We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
|
||||
|
@ -1259,11 +1348,7 @@ void enable_sep_cpu(void)
|
|||
|
||||
tss->x86_tss.ss1 = __KERNEL_CS;
|
||||
wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
|
||||
|
||||
wrmsr(MSR_IA32_SYSENTER_ESP,
|
||||
(unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
|
||||
0);
|
||||
|
||||
wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0);
|
||||
wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
|
||||
|
||||
put_cpu();
|
||||
|
@ -1357,25 +1442,19 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
|
|||
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
|
||||
EXPORT_PER_CPU_SYMBOL(__preempt_count);
|
||||
|
||||
/*
|
||||
* Special IST stacks which the CPU switches to when it calls
|
||||
* an IST-marked descriptor entry. Up to 7 stacks (hardware
|
||||
* limit), all of them are 4K, except the debug stack which
|
||||
* is 8K.
|
||||
*/
|
||||
static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
|
||||
[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
|
||||
[DEBUG_STACK - 1] = DEBUG_STKSZ
|
||||
};
|
||||
|
||||
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
|
||||
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
|
||||
|
||||
/* May not be marked __init: used by software suspend */
|
||||
void syscall_init(void)
|
||||
{
|
||||
extern char _entry_trampoline[];
|
||||
extern char entry_SYSCALL_64_trampoline[];
|
||||
|
||||
int cpu = smp_processor_id();
|
||||
unsigned long SYSCALL64_entry_trampoline =
|
||||
(unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
|
||||
(entry_SYSCALL_64_trampoline - _entry_trampoline);
|
||||
|
||||
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
|
||||
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
|
||||
wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
|
||||
|
||||
#ifdef CONFIG_IA32_EMULATION
|
||||
wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
|
||||
|
@ -1386,7 +1465,7 @@ void syscall_init(void)
|
|||
* AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
|
||||
*/
|
||||
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
|
||||
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
|
||||
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
|
||||
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
|
||||
#else
|
||||
wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
|
||||
|
@ -1530,7 +1609,7 @@ void cpu_init(void)
|
|||
if (cpu)
|
||||
load_ucode_ap();
|
||||
|
||||
t = &per_cpu(cpu_tss, cpu);
|
||||
t = &per_cpu(cpu_tss_rw, cpu);
|
||||
oist = &per_cpu(orig_ist, cpu);
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
|
@ -1569,7 +1648,7 @@ void cpu_init(void)
|
|||
* set up and load the per-CPU TSS
|
||||
*/
|
||||
if (!oist->ist[0]) {
|
||||
char *estacks = per_cpu(exception_stacks, cpu);
|
||||
char *estacks = get_cpu_entry_area(cpu)->exception_stacks;
|
||||
|
||||
for (v = 0; v < N_EXCEPTION_STACKS; v++) {
|
||||
estacks += exception_stack_sizes[v];
|
||||
|
@ -1580,7 +1659,7 @@ void cpu_init(void)
|
|||
}
|
||||
}
|
||||
|
||||
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
|
||||
t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
|
||||
|
||||
/*
|
||||
* <= is required because the CPU will access up to
|
||||
|
@ -1596,11 +1675,12 @@ void cpu_init(void)
|
|||
enter_lazy_tlb(&init_mm, me);
|
||||
|
||||
/*
|
||||
* Initialize the TSS. Don't bother initializing sp0, as the initial
|
||||
* task never enters user mode.
|
||||
* Initialize the TSS. sp0 points to the entry trampoline stack
|
||||
* regardless of what task is running.
|
||||
*/
|
||||
set_tss_desc(cpu, t);
|
||||
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
|
||||
load_TR_desc();
|
||||
load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
|
||||
|
||||
load_mm_ldt(&init_mm);
|
||||
|
||||
|
@ -1612,7 +1692,6 @@ void cpu_init(void)
|
|||
if (is_uv_system())
|
||||
uv_cpu_init();
|
||||
|
||||
setup_fixmap_gdt(cpu);
|
||||
load_fixmap_gdt(cpu);
|
||||
}
|
||||
|
||||
|
@ -1622,7 +1701,7 @@ void cpu_init(void)
|
|||
{
|
||||
int cpu = smp_processor_id();
|
||||
struct task_struct *curr = current;
|
||||
struct tss_struct *t = &per_cpu(cpu_tss, cpu);
|
||||
struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);
|
||||
|
||||
wait_for_master_cpu(cpu);
|
||||
|
||||
|
@ -1657,12 +1736,12 @@ void cpu_init(void)
|
|||
* Initialize the TSS. Don't bother initializing sp0, as the initial
|
||||
* task never enters user mode.
|
||||
*/
|
||||
set_tss_desc(cpu, t);
|
||||
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
|
||||
load_TR_desc();
|
||||
|
||||
load_mm_ldt(&init_mm);
|
||||
|
||||
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
|
||||
t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
|
||||
|
||||
#ifdef CONFIG_DOUBLEFAULT
|
||||
/* Set up doublefault TSS pointer in the GDT */
|
||||
|
@ -1674,7 +1753,6 @@ void cpu_init(void)
|
|||
|
||||
fpu__init_cpu();
|
||||
|
||||
setup_fixmap_gdt(cpu);
|
||||
load_fixmap_gdt(cpu);
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -50,25 +50,23 @@ static void doublefault_fn(void)
|
|||
cpu_relax();
|
||||
}
|
||||
|
||||
/*
 * Hardware TSS used for the double fault task.  It starts execution at
 * doublefault_fn() on the STACK_START stack, with kernel code/stack
 * segments and the swapper_pg_dir page tables loaded.
 */
struct x86_hw_tss doublefault_tss __cacheline_aligned = {
	.sp0		= STACK_START,
	.ss0		= __KERNEL_DS,
	.ldt		= 0,
	.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,

	.ip		= (unsigned long) doublefault_fn,
	/* 0x2 bit is always set */
	.flags		= X86_EFLAGS_SF | 0x2,
	.sp		= STACK_START,
	.es		= __USER_DS,
	.cs		= __KERNEL_CS,
	.ss		= __KERNEL_DS,
	.ds		= __USER_DS,
	.fs		= __KERNEL_PERCPU,

	.__cr3		= __pa_nodebug(swapper_pg_dir),
};
|
||||
|
||||
/* dummy for do_double_fault() call */
|
||||
|
|
|
@ -43,6 +43,24 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
|
|||
return true;
|
||||
}
|
||||
|
||||
bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
|
||||
{
|
||||
struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id());
|
||||
|
||||
void *begin = ss;
|
||||
void *end = ss + 1;
|
||||
|
||||
if ((void *)stack < begin || (void *)stack >= end)
|
||||
return false;
|
||||
|
||||
info->type = STACK_TYPE_SYSENTER;
|
||||
info->begin = begin;
|
||||
info->end = end;
|
||||
info->next_sp = NULL;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void printk_stack_address(unsigned long address, int reliable,
|
||||
char *log_lvl)
|
||||
{
|
||||
|
@ -50,6 +68,28 @@ static void printk_stack_address(unsigned long address, int reliable,
|
|||
printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
|
||||
}
|
||||
|
||||
void show_iret_regs(struct pt_regs *regs)
|
||||
{
|
||||
printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip);
|
||||
printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss,
|
||||
regs->sp, regs->flags);
|
||||
}
|
||||
|
||||
static void show_regs_safe(struct stack_info *info, struct pt_regs *regs)
|
||||
{
|
||||
if (on_stack(info, regs, sizeof(*regs)))
|
||||
__show_regs(regs, 0);
|
||||
else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
|
||||
IRET_FRAME_SIZE)) {
|
||||
/*
|
||||
* When an interrupt or exception occurs in entry code, the
|
||||
* full pt_regs might not have been saved yet. In that case
|
||||
* just print the iret frame.
|
||||
*/
|
||||
show_iret_regs(regs);
|
||||
}
|
||||
}
|
||||
|
||||
void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
|
||||
unsigned long *stack, char *log_lvl)
|
||||
{
|
||||
|
@ -71,31 +111,35 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
|
|||
* - task stack
|
||||
* - interrupt stack
|
||||
* - HW exception stacks (double fault, nmi, debug, mce)
|
||||
* - SYSENTER stack
|
||||
*
|
||||
* x86-32 can have up to three stacks:
|
||||
* x86-32 can have up to four stacks:
|
||||
* - task stack
|
||||
* - softirq stack
|
||||
* - hardirq stack
|
||||
* - SYSENTER stack
|
||||
*/
|
||||
for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
|
||||
const char *stack_name;
|
||||
|
||||
/*
|
||||
* If we overflowed the task stack into a guard page, jump back
|
||||
* to the bottom of the usable stack.
|
||||
*/
|
||||
if (task_stack_page(task) - (void *)stack < PAGE_SIZE)
|
||||
stack = task_stack_page(task);
|
||||
|
||||
if (get_stack_info(stack, task, &stack_info, &visit_mask))
|
||||
break;
|
||||
if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
|
||||
/*
|
||||
* We weren't on a valid stack. It's possible that
|
||||
* we overflowed a valid stack into a guard page.
|
||||
* See if the next page up is valid so that we can
|
||||
* generate some kind of backtrace if this happens.
|
||||
*/
|
||||
stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack);
|
||||
if (get_stack_info(stack, task, &stack_info, &visit_mask))
|
||||
break;
|
||||
}
|
||||
|
||||
stack_name = stack_type_name(stack_info.type);
|
||||
if (stack_name)
|
||||
printk("%s <%s>\n", log_lvl, stack_name);
|
||||
|
||||
if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
|
||||
__show_regs(regs, 0);
|
||||
if (regs)
|
||||
show_regs_safe(&stack_info, regs);
|
||||
|
||||
/*
|
||||
* Scan the stack, printing any text addresses we find. At the
|
||||
|
@ -119,7 +163,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
|
|||
|
||||
/*
|
||||
* Don't print regs->ip again if it was already printed
|
||||
* by __show_regs() below.
|
||||
* by show_regs_safe() below.
|
||||
*/
|
||||
if (regs && stack == &regs->ip)
|
||||
goto next;
|
||||
|
@ -155,8 +199,8 @@ next:
|
|||
|
||||
/* if the frame has entry regs, print them */
|
||||
regs = unwind_get_entry_regs(&state);
|
||||
if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
|
||||
__show_regs(regs, 0);
|
||||
if (regs)
|
||||
show_regs_safe(&stack_info, regs);
|
||||
}
|
||||
|
||||
if (stack_name)
|
||||
|
|
|
@ -26,6 +26,9 @@ const char *stack_type_name(enum stack_type type)
|
|||
if (type == STACK_TYPE_SOFTIRQ)
|
||||
return "SOFTIRQ";
|
||||
|
||||
if (type == STACK_TYPE_SYSENTER)
|
||||
return "SYSENTER";
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -93,6 +96,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
|
|||
if (task != current)
|
||||
goto unknown;
|
||||
|
||||
if (in_sysenter_stack(stack, info))
|
||||
goto recursion_check;
|
||||
|
||||
if (in_hardirq_stack(stack, info))
|
||||
goto recursion_check;
|
||||
|
||||
|
|
|
@ -37,6 +37,9 @@ const char *stack_type_name(enum stack_type type)
|
|||
if (type == STACK_TYPE_IRQ)
|
||||
return "IRQ";
|
||||
|
||||
if (type == STACK_TYPE_SYSENTER)
|
||||
return "SYSENTER";
|
||||
|
||||
if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
|
||||
return exception_stack_names[type - STACK_TYPE_EXCEPTION];
|
||||
|
||||
|
@ -115,6 +118,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
|
|||
if (in_irq_stack(stack, info))
|
||||
goto recursion_check;
|
||||
|
||||
if (in_sysenter_stack(stack, info))
|
||||
goto recursion_check;
|
||||
|
||||
goto unknown;
|
||||
|
||||
recursion_check:
|
||||
|
|
|
@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
|
|||
* because the ->io_bitmap_max value must match the bitmap
|
||||
* contents:
|
||||
*/
|
||||
tss = &per_cpu(cpu_tss, get_cpu());
|
||||
tss = &per_cpu(cpu_tss_rw, get_cpu());
|
||||
|
||||
if (turn_on)
|
||||
bitmap_clear(t->io_bitmap_ptr, from, num);
|
||||
|
|
|
@ -219,18 +219,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
|
|||
/* high bit used in ret_from_ code */
|
||||
unsigned vector = ~regs->orig_ax;
|
||||
|
||||
/*
|
||||
* NB: Unlike exception entries, IRQ entries do not reliably
|
||||
* handle context tracking in the low-level entry code. This is
|
||||
* because syscall entries execute briefly with IRQs on before
|
||||
* updating context tracking state, so we can take an IRQ from
|
||||
* kernel mode with CONTEXT_USER. The low-level entry code only
|
||||
* updates the context if we came from user mode, so we won't
|
||||
* switch to CONTEXT_KERNEL. We'll fix that once the syscall
|
||||
* code is cleaned up enough that we can cleanly defer enabling
|
||||
* IRQs.
|
||||
*/
|
||||
|
||||
entering_irq();
|
||||
|
||||
/* entering_irq() tells RCU that we're not quiescent. Check it. */
|
||||
|
|
|
@ -57,10 +57,10 @@ static inline void stack_overflow_check(struct pt_regs *regs)
|
|||
if (regs->sp >= estack_top && regs->sp <= estack_bottom)
|
||||
return;
|
||||
|
||||
WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n",
|
||||
WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n",
|
||||
current->comm, curbase, regs->sp,
|
||||
irq_stack_top, irq_stack_bottom,
|
||||
estack_top, estack_bottom);
|
||||
estack_top, estack_bottom, (void *)regs->ip);
|
||||
|
||||
if (sysctl_panic_on_stackoverflow)
|
||||
panic("low stack detected by irq handler - check messages\n");
|
||||
|
|
|
@ -10,7 +10,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
|
|||
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
|
||||
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
|
||||
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
|
||||
DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
|
||||
DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
|
||||
|
||||
DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
|
||||
|
@ -60,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
|
|||
PATCH_SITE(pv_mmu_ops, read_cr2);
|
||||
PATCH_SITE(pv_mmu_ops, read_cr3);
|
||||
PATCH_SITE(pv_mmu_ops, write_cr3);
|
||||
PATCH_SITE(pv_mmu_ops, flush_tlb_single);
|
||||
PATCH_SITE(pv_cpu_ops, wbinvd);
|
||||
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
|
||||
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
|
||||
|
|
|
@ -47,7 +47,7 @@
|
|||
* section. Since TSS's are completely CPU-local, we want them
|
||||
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
|
||||
*/
|
||||
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
|
||||
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = {
|
||||
.x86_tss = {
|
||||
/*
|
||||
* .sp0 is only used when entering ring 0 from a lower
|
||||
|
@ -56,6 +56,16 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
|
|||
* Poison it.
|
||||
*/
|
||||
.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
/*
|
||||
* .sp1 is cpu_current_top_of_stack. The init task never
|
||||
* runs user code, but cpu_current_top_of_stack should still
|
||||
* be well defined before the first context switch.
|
||||
*/
|
||||
.sp1 = TOP_OF_INIT_STACK,
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
.ss0 = __KERNEL_DS,
|
||||
.ss1 = __KERNEL_CS,
|
||||
|
@ -71,11 +81,8 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
|
|||
*/
|
||||
.io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
|
||||
#endif
|
||||
#ifdef CONFIG_X86_32
|
||||
.SYSENTER_stack_canary = STACK_END_MAGIC,
|
||||
#endif
|
||||
};
|
||||
EXPORT_PER_CPU_SYMBOL(cpu_tss);
|
||||
EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
|
||||
|
||||
DEFINE_PER_CPU(bool, __tss_limit_invalid);
|
||||
EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
|
||||
|
@ -104,7 +111,7 @@ void exit_thread(struct task_struct *tsk)
|
|||
struct fpu *fpu = &t->fpu;
|
||||
|
||||
if (bp) {
|
||||
struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
|
||||
struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());
|
||||
|
||||
t->io_bitmap_ptr = NULL;
|
||||
clear_thread_flag(TIF_IO_BITMAP);
|
||||
|
|
|
@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
|
|||
struct fpu *prev_fpu = &prev->fpu;
|
||||
struct fpu *next_fpu = &next->fpu;
|
||||
int cpu = smp_processor_id();
|
||||
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
|
||||
struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
|
||||
|
||||
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
|
||||
|
||||
|
|
|
@ -69,9 +69,8 @@ void __show_regs(struct pt_regs *regs, int all)
|
|||
unsigned int fsindex, gsindex;
|
||||
unsigned int ds, cs, es;
|
||||
|
||||
printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip);
|
||||
printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
|
||||
regs->sp, regs->flags);
|
||||
show_iret_regs(regs);
|
||||
|
||||
if (regs->orig_ax != -1)
|
||||
pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
|
||||
else
|
||||
|
@ -88,6 +87,9 @@ void __show_regs(struct pt_regs *regs, int all)
|
|||
printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
|
||||
regs->r13, regs->r14, regs->r15);
|
||||
|
||||
if (!all)
|
||||
return;
|
||||
|
||||
asm("movl %%ds,%0" : "=r" (ds));
|
||||
asm("movl %%cs,%0" : "=r" (cs));
|
||||
asm("movl %%es,%0" : "=r" (es));
|
||||
|
@ -98,9 +100,6 @@ void __show_regs(struct pt_regs *regs, int all)
|
|||
rdmsrl(MSR_GS_BASE, gs);
|
||||
rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
|
||||
|
||||
if (!all)
|
||||
return;
|
||||
|
||||
cr0 = read_cr0();
|
||||
cr2 = read_cr2();
|
||||
cr3 = __read_cr3();
|
||||
|
@ -400,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
|
|||
struct fpu *prev_fpu = &prev->fpu;
|
||||
struct fpu *next_fpu = &next->fpu;
|
||||
int cpu = smp_processor_id();
|
||||
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
|
||||
struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
|
||||
|
||||
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
|
||||
this_cpu_read(irq_count) != -1);
|
||||
|
@ -462,6 +461,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
|
|||
* Switch the PDA and FPU contexts.
|
||||
*/
|
||||
this_cpu_write(current_task, next_p);
|
||||
this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
|
||||
|
||||
/* Reload sp0. */
|
||||
update_sp0(next_p);
|
||||
|
|
|
@ -348,9 +348,15 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
|
|||
|
||||
/*
|
||||
* If IRET takes a non-IST fault on the espfix64 stack, then we
|
||||
* end up promoting it to a doublefault. In that case, modify
|
||||
* the stack to make it look like we just entered the #GP
|
||||
* handler from user space, similar to bad_iret.
|
||||
* end up promoting it to a doublefault. In that case, take
|
||||
* advantage of the fact that we're not using the normal (TSS.sp0)
|
||||
* stack right now. We can write a fake #GP(0) frame at TSS.sp0
|
||||
* and then modify our own IRET frame so that, when we return,
|
||||
* we land directly at the #GP(0) vector with the stack already
|
||||
* set up according to its expectations.
|
||||
*
|
||||
* The net result is that our #GP handler will think that we
|
||||
* entered from usermode with the bad user context.
|
||||
*
|
||||
* No need for ist_enter here because we don't use RCU.
|
||||
*/
|
||||
|
@ -358,13 +364,26 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
|
|||
regs->cs == __KERNEL_CS &&
|
||||
regs->ip == (unsigned long)native_irq_return_iret)
|
||||
{
|
||||
struct pt_regs *normal_regs = task_pt_regs(current);
|
||||
struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
|
||||
|
||||
/* Fake a #GP(0) from userspace. */
|
||||
memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
|
||||
normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */
|
||||
/*
|
||||
* regs->sp points to the failing IRET frame on the
|
||||
* ESPFIX64 stack. Copy it to the entry stack. This fills
|
||||
* in gpregs->ss through gpregs->ip.
|
||||
*
|
||||
*/
|
||||
memmove(&gpregs->ip, (void *)regs->sp, 5*8);
|
||||
gpregs->orig_ax = 0; /* Missing (lost) #GP error code */
|
||||
|
||||
/*
|
||||
* Adjust our frame so that we return straight to the #GP
|
||||
* vector with the expected RSP value. This is safe because
|
||||
* we won't enable interrupts or schedule before we invoke
|
||||
* general_protection, so nothing will clobber the stack
|
||||
* frame we just set up.
|
||||
*/
|
||||
regs->ip = (unsigned long)general_protection;
|
||||
regs->sp = (unsigned long)&normal_regs->orig_ax;
|
||||
regs->sp = (unsigned long)&gpregs->orig_ax;
|
||||
|
||||
return;
|
||||
}
|
||||
|
@ -389,7 +408,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
|
|||
*
|
||||
* Processors update CR2 whenever a page fault is detected. If a
|
||||
* second page fault occurs while an earlier page fault is being
|
||||
* deliv- ered, the faulting linear address of the second fault will
|
||||
* delivered, the faulting linear address of the second fault will
|
||||
* overwrite the contents of CR2 (replacing the previous
|
||||
* address). These updates to CR2 occur even if the page fault
|
||||
* results in a double fault or occurs during the delivery of a
|
||||
|
@ -605,14 +624,15 @@ NOKPROBE_SYMBOL(do_int3);
|
|||
|
||||
#ifdef CONFIG_X86_64
|
||||
/*
|
||||
* Help handler running on IST stack to switch off the IST stack if the
|
||||
* interrupted code was in user mode. The actual stack switch is done in
|
||||
* entry_64.S
|
||||
* Help handler running on a per-cpu (IST or entry trampoline) stack
|
||||
* to switch to the normal thread stack if the interrupted code was in
|
||||
* user mode. The actual stack switch is done in entry_64.S
|
||||
*/
|
||||
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
|
||||
{
|
||||
struct pt_regs *regs = task_pt_regs(current);
|
||||
*regs = *eregs;
|
||||
struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
|
||||
if (regs != eregs)
|
||||
*regs = *eregs;
|
||||
return regs;
|
||||
}
|
||||
NOKPROBE_SYMBOL(sync_regs);
|
||||
|
@ -628,13 +648,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
|
|||
/*
|
||||
* This is called from entry_64.S early in handling a fault
|
||||
* caused by a bad iret to user mode. To handle the fault
|
||||
* correctly, we want move our stack frame to task_pt_regs
|
||||
* and we want to pretend that the exception came from the
|
||||
* iret target.
|
||||
* correctly, we want to move our stack frame to where it would
|
||||
* be had we entered directly on the entry stack (rather than
|
||||
* just below the IRET frame) and we want to pretend that the
|
||||
* exception came from the IRET target.
|
||||
*/
|
||||
struct bad_iret_stack *new_stack =
|
||||
container_of(task_pt_regs(current),
|
||||
struct bad_iret_stack, regs);
|
||||
(struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
|
||||
|
||||
/* Copy the IRET target to the new stack. */
|
||||
memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
|
||||
|
@ -795,14 +815,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
|
|||
debug_stack_usage_dec();
|
||||
|
||||
exit:
|
||||
#if defined(CONFIG_X86_32)
|
||||
/*
|
||||
* This is the most likely code path that involves non-trivial use
|
||||
* of the SYSENTER stack. Check that we haven't overrun it.
|
||||
*/
|
||||
WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC,
|
||||
"Overran or corrupted SYSENTER stack\n");
|
||||
#endif
|
||||
ist_exit(regs);
|
||||
}
|
||||
NOKPROBE_SYMBOL(do_debug);
|
||||
|
@ -929,6 +941,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
|
|||
|
||||
void __init trap_init(void)
|
||||
{
|
||||
/* Init cpu_entry_area before IST entries are set up */
|
||||
setup_cpu_entry_areas();
|
||||
|
||||
idt_setup_traps();
|
||||
|
||||
/*
|
||||
|
|
|
@ -253,22 +253,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
static bool stack_access_ok(struct unwind_state *state, unsigned long addr,
|
||||
static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,
|
||||
size_t len)
|
||||
{
|
||||
struct stack_info *info = &state->stack_info;
|
||||
void *addr = (void *)_addr;
|
||||
|
||||
/*
|
||||
* If the address isn't on the current stack, switch to the next one.
|
||||
*
|
||||
* We may have to traverse multiple stacks to deal with the possibility
|
||||
* that info->next_sp could point to an empty stack and the address
|
||||
* could be on a subsequent stack.
|
||||
*/
|
||||
while (!on_stack(info, (void *)addr, len))
|
||||
if (get_stack_info(info->next_sp, state->task, info,
|
||||
&state->stack_mask))
|
||||
return false;
|
||||
if (!on_stack(info, addr, len) &&
|
||||
(get_stack_info(addr, state->task, info, &state->stack_mask)))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -283,42 +276,32 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
|
|||
return true;
|
||||
}
|
||||
|
||||
#define REGS_SIZE (sizeof(struct pt_regs))
|
||||
#define SP_OFFSET (offsetof(struct pt_regs, sp))
|
||||
#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip))
|
||||
#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip))
|
||||
|
||||
static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
|
||||
unsigned long *ip, unsigned long *sp, bool full)
|
||||
unsigned long *ip, unsigned long *sp)
|
||||
{
|
||||
size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE;
|
||||
size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET;
|
||||
struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE);
|
||||
struct pt_regs *regs = (struct pt_regs *)addr;
|
||||
|
||||
if (IS_ENABLED(CONFIG_X86_64)) {
|
||||
if (!stack_access_ok(state, addr, regs_size))
|
||||
return false;
|
||||
/* x86-32 support will be more complicated due to the ®s->sp hack */
|
||||
BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32));
|
||||
|
||||
*ip = regs->ip;
|
||||
*sp = regs->sp;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!stack_access_ok(state, addr, sp_offset))
|
||||
if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
|
||||
return false;
|
||||
|
||||
*ip = regs->ip;
|
||||
*sp = regs->sp;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (user_mode(regs)) {
|
||||
if (!stack_access_ok(state, addr + sp_offset,
|
||||
REGS_SIZE - SP_OFFSET))
|
||||
return false;
|
||||
static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr,
|
||||
unsigned long *ip, unsigned long *sp)
|
||||
{
|
||||
struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET;
|
||||
|
||||
*sp = regs->sp;
|
||||
} else
|
||||
*sp = (unsigned long)®s->sp;
|
||||
if (!stack_access_ok(state, addr, IRET_FRAME_SIZE))
|
||||
return false;
|
||||
|
||||
*ip = regs->ip;
|
||||
*sp = regs->sp;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -327,7 +310,6 @@ bool unwind_next_frame(struct unwind_state *state)
|
|||
unsigned long ip_p, sp, orig_ip, prev_sp = state->sp;
|
||||
enum stack_type prev_type = state->stack_info.type;
|
||||
struct orc_entry *orc;
|
||||
struct pt_regs *ptregs;
|
||||
bool indirect = false;
|
||||
|
||||
if (unwind_done(state))
|
||||
|
@ -435,7 +417,7 @@ bool unwind_next_frame(struct unwind_state *state)
|
|||
break;
|
||||
|
||||
case ORC_TYPE_REGS:
|
||||
if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) {
|
||||
if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
|
||||
orc_warn("can't dereference registers at %p for ip %pB\n",
|
||||
(void *)sp, (void *)orig_ip);
|
||||
goto done;
|
||||
|
@ -447,20 +429,14 @@ bool unwind_next_frame(struct unwind_state *state)
|
|||
break;
|
||||
|
||||
case ORC_TYPE_REGS_IRET:
|
||||
if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) {
|
||||
if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
|
||||
orc_warn("can't dereference iret registers at %p for ip %pB\n",
|
||||
(void *)sp, (void *)orig_ip);
|
||||
goto done;
|
||||
}
|
||||
|
||||
ptregs = container_of((void *)sp, struct pt_regs, ip);
|
||||
if ((unsigned long)ptregs >= prev_sp &&
|
||||
on_stack(&state->stack_info, ptregs, REGS_SIZE)) {
|
||||
state->regs = ptregs;
|
||||
state->full_regs = false;
|
||||
} else
|
||||
state->regs = NULL;
|
||||
|
||||
state->regs = (void *)sp - IRET_FRAME_OFFSET;
|
||||
state->full_regs = false;
|
||||
state->signal = true;
|
||||
break;
|
||||
|
||||
|
@ -553,8 +529,18 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
|
|||
}
|
||||
|
||||
if (get_stack_info((unsigned long *)state->sp, state->task,
|
||||
&state->stack_info, &state->stack_mask))
|
||||
return;
|
||||
&state->stack_info, &state->stack_mask)) {
|
||||
/*
|
||||
* We weren't on a valid stack. It's possible that
|
||||
* we overflowed a valid stack into a guard page.
|
||||
* See if the next page up is valid so that we can
|
||||
* generate some kind of backtrace if this happens.
|
||||
*/
|
||||
void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp);
|
||||
if (get_stack_info(next_page, state->task, &state->stack_info,
|
||||
&state->stack_mask))
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* The caller can provide the address of the first frame directly
|
||||
|
|
|
@ -107,6 +107,15 @@ SECTIONS
|
|||
SOFTIRQENTRY_TEXT
|
||||
*(.fixup)
|
||||
*(.gnu.warning)
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
. = ALIGN(PAGE_SIZE);
|
||||
_entry_trampoline = .;
|
||||
*(.entry_trampoline)
|
||||
. = ALIGN(PAGE_SIZE);
|
||||
ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
|
||||
#endif
|
||||
|
||||
/* End of text section */
|
||||
_etext = .;
|
||||
} :text = 0x9090
|
||||
|
|
|
@ -2302,7 +2302,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
|
|||
* processors. See 22.2.4.
|
||||
*/
|
||||
vmcs_writel(HOST_TR_BASE,
|
||||
(unsigned long)this_cpu_ptr(&cpu_tss));
|
||||
(unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
|
||||
vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
|
||||
|
||||
/*
|
||||
|
|
|
@ -107,10 +107,10 @@ static void delay_mwaitx(unsigned long __loops)
|
|||
delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
|
||||
|
||||
/*
|
||||
* Use cpu_tss as a cacheline-aligned, seldomly
|
||||
* Use cpu_tss_rw as a cacheline-aligned, seldomly
|
||||
* accessed per-cpu variable as the monitor target.
|
||||
*/
|
||||
__monitorx(raw_cpu_ptr(&cpu_tss), 0, 0);
|
||||
__monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
|
||||
|
||||
/*
|
||||
* AMD, like Intel, supports the EAX hint and EAX=0xf
|
||||
|
|
|
@ -277,6 +277,7 @@ void __init kasan_early_init(void)
|
|||
void __init kasan_init(void)
|
||||
{
|
||||
int i;
|
||||
void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;
|
||||
|
||||
#ifdef CONFIG_KASAN_INLINE
|
||||
register_die_notifier(&kasan_die_notifier);
|
||||
|
@ -329,8 +330,23 @@ void __init kasan_init(void)
|
|||
(unsigned long)kasan_mem_to_shadow(_end),
|
||||
early_pfn_to_nid(__pa(_stext)));
|
||||
|
||||
shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM);
|
||||
shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
|
||||
shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
|
||||
PAGE_SIZE);
|
||||
|
||||
shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE);
|
||||
shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
|
||||
shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
|
||||
PAGE_SIZE);
|
||||
|
||||
kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
|
||||
(void *)KASAN_SHADOW_END);
|
||||
shadow_cpu_entry_begin);
|
||||
|
||||
kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
|
||||
(unsigned long)shadow_cpu_entry_end, 0);
|
||||
|
||||
kasan_populate_zero_shadow(shadow_cpu_entry_end, (void *)KASAN_SHADOW_END);
|
||||
|
||||
load_cr3(init_top_pgt);
|
||||
__flush_tlb_all();
|
||||
|
|
|
@ -152,17 +152,19 @@ static void do_fpu_end(void)
|
|||
static void fix_processor_context(void)
|
||||
{
|
||||
int cpu = smp_processor_id();
|
||||
struct tss_struct *t = &per_cpu(cpu_tss, cpu);
|
||||
#ifdef CONFIG_X86_64
|
||||
struct desc_struct *desc = get_cpu_gdt_rw(cpu);
|
||||
tss_desc tss;
|
||||
#endif
|
||||
set_tss_desc(cpu, t); /*
|
||||
* This just modifies memory; should not be
|
||||
* necessary. But... This is necessary, because
|
||||
* 386 hardware has concept of busy TSS or some
|
||||
* similar stupidity.
|
||||
*/
|
||||
|
||||
/*
|
||||
* We need to reload TR, which requires that we change the
|
||||
* GDT entry to indicate "available" first.
|
||||
*
|
||||
* XXX: This could probably all be replaced by a call to
|
||||
* force_reload_TR().
|
||||
*/
|
||||
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc));
|
||||
|
|
|
@ -826,7 +826,7 @@ static void xen_load_sp0(unsigned long sp0)
|
|||
mcs = xen_mc_entry(0);
|
||||
MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
|
||||
xen_mc_issue(PARAVIRT_LAZY_CPU);
|
||||
this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
|
||||
this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
|
||||
}
|
||||
|
||||
void xen_set_iopl_mask(unsigned mask)
|
||||
|
|
|
@ -2272,7 +2272,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
|
|||
#endif
|
||||
case FIX_TEXT_POKE0:
|
||||
case FIX_TEXT_POKE1:
|
||||
case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END:
|
||||
case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM:
|
||||
/* All local page mappings */
|
||||
pte = pfn_pte(phys, prot);
|
||||
break;
|
||||
|
|
Loading…
Reference in New Issue