Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 fixes from Ingo Molnar:
"Various fixes:
 - Fix the PAT performance regression that downgraded write-combining device memory regions to uncached.
 - There's been a number of bugs in 32-bit double fault handling - hopefully all fixed now.
 - Fix an LDT crash
 - Fix an FPU over-optimization that broke with GCC9 code optimizations.
 - Misc cleanups"

* 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mm/pat: Fix off-by-one bugs in interval tree search
  x86/ioperm: Save an indentation level in tss_update_io_bitmap()
  x86/fpu: Don't cache access to fpu_fpregs_owner_ctx
  x86/entry/32: Remove unused 'restore_all_notrace' local label
  x86/ptrace: Document FSBASE and GSBASE ABI oddities
  x86/ptrace: Remove set_segment_reg() implementations for current
  x86/traps: die() instead of panicking on a double fault
  x86/doublefault/32: Rewrite the x86_32 #DF handler and unify with 64-bit
  x86/doublefault/32: Move #DF stack and TSS to cpu_entry_area
  x86/doublefault/32: Rename doublefault.c to doublefault_32.c
  x86/traps: Disentangle the 32-bit and 64-bit doublefault code
  lkdtm: Add a DOUBLE_FAULT crash type on x86
  selftests/x86/single_step_syscall: Check SYSENTER directly
  x86/mm/32: Sync only to VMALLOC_END in vmalloc_sync_all()
This commit is contained in:
commit
e5b3fc125d
|
@ -117,7 +117,7 @@ config DEBUG_WX
|
|||
|
||||
config DOUBLEFAULT
|
||||
default y
|
||||
bool "Enable doublefault exception handler" if EXPERT
|
||||
bool "Enable doublefault exception handler" if EXPERT && X86_32
|
||||
---help---
|
||||
This option allows trapping of rare doublefault exceptions that
|
||||
would otherwise cause a system to silently reboot. Disabling this
|
||||
|
|
|
@ -1090,7 +1090,6 @@ SYM_FUNC_START(entry_INT80_32)
|
|||
restore_all:
|
||||
TRACE_IRQS_IRET
|
||||
SWITCH_TO_ENTRY_STACK
|
||||
.Lrestore_all_notrace:
|
||||
CHECK_AND_APPLY_ESPFIX
|
||||
.Lrestore_nocheck:
|
||||
/* Switch back to user CR3 */
|
||||
|
@ -1537,6 +1536,48 @@ SYM_CODE_START(debug)
|
|||
jmp common_exception
|
||||
SYM_CODE_END(debug)
|
||||
|
||||
#ifdef CONFIG_DOUBLEFAULT
|
||||
SYM_CODE_START(double_fault)
|
||||
1:
|
||||
/*
|
||||
* This is a task gate handler, not an interrupt gate handler.
|
||||
* The error code is on the stack, but the stack is otherwise
|
||||
* empty. Interrupts are off. Our state is sane with the following
|
||||
* exceptions:
|
||||
*
|
||||
* - CR0.TS is set. "TS" literally means "task switched".
|
||||
* - EFLAGS.NT is set because we're a "nested task".
|
||||
* - The doublefault TSS has back_link set and has been marked busy.
|
||||
* - TR points to the doublefault TSS and the normal TSS is busy.
|
||||
* - CR3 is the normal kernel PGD. This would be delightful, except
|
||||
* that the CPU didn't bother to save the old CR3 anywhere. This
|
||||
* would make it very awkward to return back to the context we came
|
||||
* from.
|
||||
*
|
||||
* The rest of EFLAGS is sanitized for us, so we don't need to
|
||||
* worry about AC or DF.
|
||||
*
|
||||
* Don't even bother popping the error code. It's always zero,
|
||||
* and ignoring it makes us a bit more robust against buggy
|
||||
* hypervisor task gate implementations.
|
||||
*
|
||||
* We will manually undo the task switch instead of doing a
|
||||
* task-switching IRET.
|
||||
*/
|
||||
|
||||
clts /* clear CR0.TS */
|
||||
pushl $X86_EFLAGS_FIXED
|
||||
popfl /* clear EFLAGS.NT */
|
||||
|
||||
call doublefault_shim
|
||||
|
||||
/* We don't support returning, so we have no IRET here. */
|
||||
1:
|
||||
hlt
|
||||
jmp 1b
|
||||
SYM_CODE_END(double_fault)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* NMI is doubly nasty. It can happen on the first instruction of
|
||||
* entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
|
||||
|
|
|
@ -65,6 +65,13 @@ enum exception_stack_ordering {
|
|||
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
struct doublefault_stack {
|
||||
unsigned long stack[(PAGE_SIZE - sizeof(struct x86_hw_tss)) / sizeof(unsigned long)];
|
||||
struct x86_hw_tss tss;
|
||||
} __aligned(PAGE_SIZE);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* cpu_entry_area is a percpu region that contains things needed by the CPU
|
||||
* and early entry/exit code. Real types aren't used for all fields here
|
||||
|
@ -86,6 +93,11 @@ struct cpu_entry_area {
|
|||
#endif
|
||||
struct entry_stack_page entry_stack_page;
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
char guard_doublefault_stack[PAGE_SIZE];
|
||||
struct doublefault_stack doublefault_stack;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because
|
||||
* we need task switches to work, and task switches write to the TSS.
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _ASM_X86_DOUBLEFAULT_H
|
||||
#define _ASM_X86_DOUBLEFAULT_H
|
||||
|
||||
#if defined(CONFIG_X86_32) && defined(CONFIG_DOUBLEFAULT)
|
||||
extern void doublefault_init_cpu_tss(void);
|
||||
#else
|
||||
static inline void doublefault_init_cpu_tss(void)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _ASM_X86_DOUBLEFAULT_H */
|
|
@ -509,7 +509,7 @@ static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
|
|||
|
||||
static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
|
||||
{
|
||||
return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
|
||||
return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -41,10 +41,11 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
|
|||
#endif
|
||||
|
||||
/*
|
||||
* Define this here and validate with BUILD_BUG_ON() in pgtable_32.c
|
||||
* to avoid include recursion hell
|
||||
* This is an upper bound on sizeof(struct cpu_entry_area) / PAGE_SIZE.
|
||||
* Define this here and validate with BUILD_BUG_ON() in cpu_entry_area.c
|
||||
* to avoid include recursion hell.
|
||||
*/
|
||||
#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 41)
|
||||
#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 43)
|
||||
|
||||
/* The +1 is for the readonly IDT page: */
|
||||
#define CPU_ENTRY_AREA_BASE \
|
||||
|
|
|
@ -166,7 +166,6 @@ enum cpuid_regs_idx {
|
|||
extern struct cpuinfo_x86 boot_cpu_data;
|
||||
extern struct cpuinfo_x86 new_cpu_data;
|
||||
|
||||
extern struct x86_hw_tss doublefault_tss;
|
||||
extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
|
||||
extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS];
|
||||
|
||||
|
@ -997,7 +996,6 @@ bool xen_set_default_idle(void);
|
|||
#endif
|
||||
|
||||
void stop_this_cpu(void *dummy);
|
||||
void df_debug(struct pt_regs *regs, long error_code);
|
||||
void microcode_check(void);
|
||||
|
||||
enum l1tf_mitigations {
|
||||
|
|
|
@ -69,6 +69,9 @@ dotraplinkage void do_overflow(struct pt_regs *regs, long error_code);
|
|||
dotraplinkage void do_bounds(struct pt_regs *regs, long error_code);
|
||||
dotraplinkage void do_invalid_op(struct pt_regs *regs, long error_code);
|
||||
dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code);
|
||||
#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT)
|
||||
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2);
|
||||
#endif
|
||||
dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *regs, long error_code);
|
||||
dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code);
|
||||
dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code);
|
||||
|
|
|
@ -100,7 +100,9 @@ obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o
|
|||
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
|
||||
obj-y += kprobes/
|
||||
obj-$(CONFIG_MODULES) += module.o
|
||||
obj-$(CONFIG_DOUBLEFAULT) += doublefault.o
|
||||
ifeq ($(CONFIG_X86_32),y)
|
||||
obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
|
||||
endif
|
||||
obj-$(CONFIG_KGDB) += kgdb.o
|
||||
obj-$(CONFIG_VM86) += vm86_32.o
|
||||
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
|
||||
|
|
|
@ -24,6 +24,7 @@
|
|||
#include <asm/stackprotector.h>
|
||||
#include <asm/perf_event.h>
|
||||
#include <asm/mmu_context.h>
|
||||
#include <asm/doublefault.h>
|
||||
#include <asm/archrandom.h>
|
||||
#include <asm/hypervisor.h>
|
||||
#include <asm/processor.h>
|
||||
|
@ -1814,8 +1815,6 @@ static inline void tss_setup_ist(struct tss_struct *tss)
|
|||
tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
|
||||
}
|
||||
|
||||
static inline void gdt_setup_doublefault_tss(int cpu) { }
|
||||
|
||||
#else /* CONFIG_X86_64 */
|
||||
|
||||
static inline void setup_getcpu(int cpu) { }
|
||||
|
@ -1827,13 +1826,6 @@ static inline void ucode_cpu_init(int cpu)
|
|||
|
||||
static inline void tss_setup_ist(struct tss_struct *tss) { }
|
||||
|
||||
static inline void gdt_setup_doublefault_tss(int cpu)
|
||||
{
|
||||
#ifdef CONFIG_DOUBLEFAULT
|
||||
/* Set up the doublefault TSS pointer in the GDT */
|
||||
__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
|
||||
#endif
|
||||
}
|
||||
#endif /* !CONFIG_X86_64 */
|
||||
|
||||
static inline void tss_setup_io_bitmap(struct tss_struct *tss)
|
||||
|
@ -1923,7 +1915,7 @@ void cpu_init(void)
|
|||
clear_all_debug_regs();
|
||||
dbg_restore_debug_regs();
|
||||
|
||||
gdt_setup_doublefault_tss(cpu);
|
||||
doublefault_init_cpu_tss();
|
||||
|
||||
fpu__init_cpu();
|
||||
|
||||
|
|
|
@ -1,86 +0,0 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/mm.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/sched/debug.h>
|
||||
#include <linux/init_task.h>
|
||||
#include <linux/fs.h>
|
||||
|
||||
#include <linux/uaccess.h>
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/processor.h>
|
||||
#include <asm/desc.h>
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
|
||||
#define DOUBLEFAULT_STACKSIZE (1024)
|
||||
static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
|
||||
#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
|
||||
|
||||
#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
|
||||
|
||||
static void doublefault_fn(void)
|
||||
{
|
||||
struct desc_ptr gdt_desc = {0, 0};
|
||||
unsigned long gdt, tss;
|
||||
|
||||
native_store_gdt(&gdt_desc);
|
||||
gdt = gdt_desc.address;
|
||||
|
||||
printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
|
||||
|
||||
if (ptr_ok(gdt)) {
|
||||
gdt += GDT_ENTRY_TSS << 3;
|
||||
tss = get_desc_base((struct desc_struct *)gdt);
|
||||
printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
|
||||
|
||||
if (ptr_ok(tss)) {
|
||||
struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
|
||||
|
||||
printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
|
||||
t->ip, t->sp);
|
||||
|
||||
printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
|
||||
t->ax, t->bx, t->cx, t->dx);
|
||||
printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
|
||||
t->si, t->di);
|
||||
}
|
||||
}
|
||||
|
||||
for (;;)
|
||||
cpu_relax();
|
||||
}
|
||||
|
||||
struct x86_hw_tss doublefault_tss __cacheline_aligned = {
|
||||
.sp0 = STACK_START,
|
||||
.ss0 = __KERNEL_DS,
|
||||
.ldt = 0,
|
||||
.io_bitmap_base = IO_BITMAP_OFFSET_INVALID,
|
||||
|
||||
.ip = (unsigned long) doublefault_fn,
|
||||
/* 0x2 bit is always set */
|
||||
.flags = X86_EFLAGS_SF | 0x2,
|
||||
.sp = STACK_START,
|
||||
.es = __USER_DS,
|
||||
.cs = __KERNEL_CS,
|
||||
.ss = __KERNEL_DS,
|
||||
.ds = __USER_DS,
|
||||
.fs = __KERNEL_PERCPU,
|
||||
#ifndef CONFIG_X86_32_LAZY_GS
|
||||
.gs = __KERNEL_STACK_CANARY,
|
||||
#endif
|
||||
|
||||
.__cr3 = __pa_nodebug(swapper_pg_dir),
|
||||
};
|
||||
|
||||
/* dummy for do_double_fault() call */
|
||||
void df_debug(struct pt_regs *regs, long error_code) {}
|
||||
|
||||
#else /* !CONFIG_X86_32 */
|
||||
|
||||
void df_debug(struct pt_regs *regs, long error_code)
|
||||
{
|
||||
pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
|
||||
show_regs(regs);
|
||||
panic("Machine halted.");
|
||||
}
|
||||
#endif
|
|
@ -0,0 +1,136 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/mm.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/sched/debug.h>
|
||||
#include <linux/init_task.h>
|
||||
#include <linux/fs.h>
|
||||
|
||||
#include <linux/uaccess.h>
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/processor.h>
|
||||
#include <asm/desc.h>
|
||||
#include <asm/traps.h>
|
||||
|
||||
extern void double_fault(void);
|
||||
#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
|
||||
|
||||
#define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x)
|
||||
|
||||
static void set_df_gdt_entry(unsigned int cpu);
|
||||
|
||||
/*
|
||||
* Called by double_fault with CR0.TS and EFLAGS.NT cleared. The CPU thinks
|
||||
* we're running the doublefault task. Cannot return.
|
||||
*/
|
||||
asmlinkage notrace void __noreturn doublefault_shim(void)
|
||||
{
|
||||
unsigned long cr2;
|
||||
struct pt_regs regs;
|
||||
|
||||
BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE);
|
||||
|
||||
cr2 = native_read_cr2();
|
||||
|
||||
/* Reset back to the normal kernel task. */
|
||||
force_reload_TR();
|
||||
set_df_gdt_entry(smp_processor_id());
|
||||
|
||||
trace_hardirqs_off();
|
||||
|
||||
/*
|
||||
* Fill in pt_regs. A downside of doing this in C is that the unwinder
|
||||
* won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump
|
||||
* won't successfully unwind to the source of the double fault.
|
||||
* The main dump from do_double_fault() is fine, though, since it
|
||||
* uses these regs directly.
|
||||
*
|
||||
* If anyone ever cares, this could be moved to asm.
|
||||
*/
|
||||
regs.ss = TSS(ss);
|
||||
regs.__ssh = 0;
|
||||
regs.sp = TSS(sp);
|
||||
regs.flags = TSS(flags);
|
||||
regs.cs = TSS(cs);
|
||||
/* We won't go through the entry asm, so we can leave __csh as 0. */
|
||||
regs.__csh = 0;
|
||||
regs.ip = TSS(ip);
|
||||
regs.orig_ax = 0;
|
||||
regs.gs = TSS(gs);
|
||||
regs.__gsh = 0;
|
||||
regs.fs = TSS(fs);
|
||||
regs.__fsh = 0;
|
||||
regs.es = TSS(es);
|
||||
regs.__esh = 0;
|
||||
regs.ds = TSS(ds);
|
||||
regs.__dsh = 0;
|
||||
regs.ax = TSS(ax);
|
||||
regs.bp = TSS(bp);
|
||||
regs.di = TSS(di);
|
||||
regs.si = TSS(si);
|
||||
regs.dx = TSS(dx);
|
||||
regs.cx = TSS(cx);
|
||||
regs.bx = TSS(bx);
|
||||
|
||||
do_double_fault(&regs, 0, cr2);
|
||||
|
||||
/*
|
||||
* x86_32 does not save the original CR3 anywhere on a task switch.
|
||||
* This means that, even if we wanted to return, we would need to find
|
||||
* some way to reconstruct CR3. We could make a credible guess based
|
||||
* on cpu_tlbstate, but that would be racy and would not account for
|
||||
* PTI.
|
||||
*
|
||||
* Instead, don't bother. We can return through
|
||||
* rewind_stack_do_exit() instead.
|
||||
*/
|
||||
panic("cannot return from double fault\n");
|
||||
}
|
||||
NOKPROBE_SYMBOL(doublefault_shim);
|
||||
|
||||
DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
|
||||
.tss = {
|
||||
/*
|
||||
* No sp0 or ss0 -- we never run CPL != 0 with this TSS
|
||||
* active. sp is filled in later.
|
||||
*/
|
||||
.ldt = 0,
|
||||
.io_bitmap_base = IO_BITMAP_OFFSET_INVALID,
|
||||
|
||||
.ip = (unsigned long) double_fault,
|
||||
.flags = X86_EFLAGS_FIXED,
|
||||
.es = __USER_DS,
|
||||
.cs = __KERNEL_CS,
|
||||
.ss = __KERNEL_DS,
|
||||
.ds = __USER_DS,
|
||||
.fs = __KERNEL_PERCPU,
|
||||
#ifndef CONFIG_X86_32_LAZY_GS
|
||||
.gs = __KERNEL_STACK_CANARY,
|
||||
#endif
|
||||
|
||||
.__cr3 = __pa_nodebug(swapper_pg_dir),
|
||||
},
|
||||
};
|
||||
|
||||
static void set_df_gdt_entry(unsigned int cpu)
|
||||
{
|
||||
/* Set up doublefault TSS pointer in the GDT */
|
||||
__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS,
|
||||
&get_cpu_entry_area(cpu)->doublefault_stack.tss);
|
||||
|
||||
}
|
||||
|
||||
void doublefault_init_cpu_tss(void)
|
||||
{
|
||||
unsigned int cpu = smp_processor_id();
|
||||
struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
|
||||
|
||||
/*
|
||||
* The linker isn't smart enough to initialize percpu variables that
|
||||
* point to other places in percpu space.
|
||||
*/
|
||||
this_cpu_write(doublefault_stack.tss.sp,
|
||||
(unsigned long)&cea->doublefault_stack.stack +
|
||||
sizeof(doublefault_stack.stack));
|
||||
|
||||
set_df_gdt_entry(cpu);
|
||||
}
|
|
@ -29,6 +29,9 @@ const char *stack_type_name(enum stack_type type)
|
|||
if (type == STACK_TYPE_ENTRY)
|
||||
return "ENTRY_TRAMPOLINE";
|
||||
|
||||
if (type == STACK_TYPE_EXCEPTION)
|
||||
return "#DF";
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -82,6 +85,30 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
|
|||
return true;
|
||||
}
|
||||
|
||||
static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info)
|
||||
{
|
||||
#ifdef CONFIG_DOUBLEFAULT
|
||||
struct cpu_entry_area *cea = get_cpu_entry_area(raw_smp_processor_id());
|
||||
struct doublefault_stack *ss = &cea->doublefault_stack;
|
||||
|
||||
void *begin = ss->stack;
|
||||
void *end = begin + sizeof(ss->stack);
|
||||
|
||||
if ((void *)stack < begin || (void *)stack >= end)
|
||||
return false;
|
||||
|
||||
info->type = STACK_TYPE_EXCEPTION;
|
||||
info->begin = begin;
|
||||
info->end = end;
|
||||
info->next_sp = (unsigned long *)this_cpu_read(cpu_tss_rw.x86_tss.sp);
|
||||
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
int get_stack_info(unsigned long *stack, struct task_struct *task,
|
||||
struct stack_info *info, unsigned long *visit_mask)
|
||||
{
|
||||
|
@ -105,6 +132,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
|
|||
if (in_softirq_stack(stack, info))
|
||||
goto recursion_check;
|
||||
|
||||
if (in_doublefault_stack(stack, info))
|
||||
goto recursion_check;
|
||||
|
||||
goto unknown;
|
||||
|
||||
recursion_check:
|
||||
|
|
|
@ -377,37 +377,37 @@ static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm)
|
|||
void tss_update_io_bitmap(void)
|
||||
{
|
||||
struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
|
||||
struct thread_struct *t = &current->thread;
|
||||
u16 *base = &tss->x86_tss.io_bitmap_base;
|
||||
|
||||
if (test_thread_flag(TIF_IO_BITMAP)) {
|
||||
struct thread_struct *t = &current->thread;
|
||||
|
||||
if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) {
|
||||
*base = IO_BITMAP_OFFSET_VALID_ALL;
|
||||
} else {
|
||||
struct io_bitmap *iobm = t->io_bitmap;
|
||||
/*
|
||||
* Only copy bitmap data when the sequence number
|
||||
* differs. The update time is accounted to the
|
||||
* incoming task.
|
||||
*/
|
||||
if (tss->io_bitmap.prev_sequence != iobm->sequence)
|
||||
tss_copy_io_bitmap(tss, iobm);
|
||||
|
||||
/* Enable the bitmap */
|
||||
*base = IO_BITMAP_OFFSET_VALID_MAP;
|
||||
}
|
||||
/*
|
||||
* Make sure that the TSS limit is covering the io bitmap.
|
||||
* It might have been cut down by a VMEXIT to 0x67 which
|
||||
* would cause a subsequent I/O access from user space to
|
||||
* trigger a #GP because the bitmap is outside the TSS
|
||||
* limit.
|
||||
*/
|
||||
refresh_tss_limit();
|
||||
} else {
|
||||
if (!test_thread_flag(TIF_IO_BITMAP)) {
|
||||
tss_invalidate_io_bitmap(tss);
|
||||
return;
|
||||
}
|
||||
|
||||
if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) {
|
||||
*base = IO_BITMAP_OFFSET_VALID_ALL;
|
||||
} else {
|
||||
struct io_bitmap *iobm = t->io_bitmap;
|
||||
|
||||
/*
|
||||
* Only copy bitmap data when the sequence number differs. The
|
||||
* update time is accounted to the incoming task.
|
||||
*/
|
||||
if (tss->io_bitmap.prev_sequence != iobm->sequence)
|
||||
tss_copy_io_bitmap(tss, iobm);
|
||||
|
||||
/* Enable the bitmap */
|
||||
*base = IO_BITMAP_OFFSET_VALID_MAP;
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure that the TSS limit is covering the IO bitmap. It might have
|
||||
* been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O
|
||||
* access from user space to trigger a #GP because the bitmap is outside
|
||||
* the TSS limit.
|
||||
*/
|
||||
refresh_tss_limit();
|
||||
}
|
||||
#else /* CONFIG_X86_IOPL_IOPERM */
|
||||
static inline void switch_to_bitmap(unsigned long tifp) { }
|
||||
|
|
|
@ -182,6 +182,9 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
|
|||
static int set_segment_reg(struct task_struct *task,
|
||||
unsigned long offset, u16 value)
|
||||
{
|
||||
if (WARN_ON_ONCE(task == current))
|
||||
return -EIO;
|
||||
|
||||
/*
|
||||
* The value argument was already truncated to 16 bits.
|
||||
*/
|
||||
|
@ -209,10 +212,7 @@ static int set_segment_reg(struct task_struct *task,
|
|||
break;
|
||||
|
||||
case offsetof(struct user_regs_struct, gs):
|
||||
if (task == current)
|
||||
set_user_gs(task_pt_regs(task), value);
|
||||
else
|
||||
task_user_gs(task) = value;
|
||||
task_user_gs(task) = value;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -272,32 +272,41 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
|
|||
static int set_segment_reg(struct task_struct *task,
|
||||
unsigned long offset, u16 value)
|
||||
{
|
||||
if (WARN_ON_ONCE(task == current))
|
||||
return -EIO;
|
||||
|
||||
/*
|
||||
* The value argument was already truncated to 16 bits.
|
||||
*/
|
||||
if (invalid_selector(value))
|
||||
return -EIO;
|
||||
|
||||
/*
|
||||
* This function has some ABI oddities.
|
||||
*
|
||||
* A 32-bit ptracer probably expects that writing FS or GS will change
|
||||
* FSBASE or GSBASE respectively. In the absence of FSGSBASE support,
|
||||
* this code indeed has that effect. When FSGSBASE is added, this
|
||||
* will require a special case.
|
||||
*
|
||||
* For existing 64-bit ptracers, writing FS or GS *also* currently
|
||||
* changes the base if the selector is nonzero the next time the task
|
||||
* is run. This behavior may not be needed, and trying to preserve it
|
||||
* when FSGSBASE is added would be complicated at best.
|
||||
*/
|
||||
|
||||
switch (offset) {
|
||||
case offsetof(struct user_regs_struct,fs):
|
||||
task->thread.fsindex = value;
|
||||
if (task == current)
|
||||
loadsegment(fs, task->thread.fsindex);
|
||||
break;
|
||||
case offsetof(struct user_regs_struct,gs):
|
||||
task->thread.gsindex = value;
|
||||
if (task == current)
|
||||
load_gs_index(task->thread.gsindex);
|
||||
break;
|
||||
case offsetof(struct user_regs_struct,ds):
|
||||
task->thread.ds = value;
|
||||
if (task == current)
|
||||
loadsegment(ds, task->thread.ds);
|
||||
break;
|
||||
case offsetof(struct user_regs_struct,es):
|
||||
task->thread.es = value;
|
||||
if (task == current)
|
||||
loadsegment(es, task->thread.es);
|
||||
break;
|
||||
|
||||
/*
|
||||
|
@ -375,6 +384,9 @@ static int putreg(struct task_struct *child,
|
|||
* When changing the FS base, use do_arch_prctl_64()
|
||||
* to set the index to zero and to set the base
|
||||
* as requested.
|
||||
*
|
||||
* NB: This behavior is nonsensical and likely needs to
|
||||
* change when FSGSBASE support is added.
|
||||
*/
|
||||
if (child->thread.fsbase != value)
|
||||
return do_arch_prctl_64(child, ARCH_SET_FS, value);
|
||||
|
|
|
@ -306,8 +306,23 @@ __visible void __noreturn handle_stack_overflow(const char *message,
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
/* Runs on IST stack */
|
||||
#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT)
|
||||
/*
|
||||
* Runs on an IST stack for x86_64 and on a special task stack for x86_32.
|
||||
*
|
||||
* On x86_64, this is more or less a normal kernel entry. Notwithstanding the
|
||||
* SDM's warnings about double faults being unrecoverable, returning works as
|
||||
* expected. Presumably what the SDM actually means is that the CPU may get
|
||||
* the register state wrong on entry, so returning could be a bad idea.
|
||||
*
|
||||
* Various CPU engineers have promised that double faults due to an IRET fault
|
||||
* while the stack is read-only are, in fact, recoverable.
|
||||
*
|
||||
* On x86_32, this is entered through a task gate, and regs are synthesized
|
||||
* from the TSS. Returning is, in principle, okay, but changes to regs will
|
||||
* be lost. If, for some reason, we need to return to a context with modified
|
||||
* regs, the shim code could be adjusted to synchronize the registers.
|
||||
*/
|
||||
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2)
|
||||
{
|
||||
static const char str[] = "double fault";
|
||||
|
@ -411,15 +426,9 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign
|
|||
handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_DOUBLEFAULT
|
||||
df_debug(regs, error_code);
|
||||
#endif
|
||||
/*
|
||||
* This is always a kernel trap and never fixable (and thus must
|
||||
* never return).
|
||||
*/
|
||||
for (;;)
|
||||
die(str, regs, error_code);
|
||||
pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
|
||||
die("double fault", regs, error_code);
|
||||
panic("Machine halted.");
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -17,6 +17,10 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks);
|
|||
DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks);
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_X86_32) && defined(CONFIG_DOUBLEFAULT)
|
||||
DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack);
|
||||
#endif
|
||||
|
||||
struct cpu_entry_area *get_cpu_entry_area(int cpu)
|
||||
{
|
||||
unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
|
||||
|
@ -108,7 +112,15 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu)
|
|||
cea_map_stack(MCE);
|
||||
}
|
||||
#else
|
||||
static inline void percpu_setup_exception_stacks(unsigned int cpu) {}
|
||||
static inline void percpu_setup_exception_stacks(unsigned int cpu)
|
||||
{
|
||||
#ifdef CONFIG_DOUBLEFAULT
|
||||
struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
|
||||
|
||||
cea_map_percpu_pages(&cea->doublefault_stack,
|
||||
&per_cpu(doublefault_stack, cpu), 1, PAGE_KERNEL);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Setup the fixmap mappings only once per-processor */
|
||||
|
|
|
@ -197,7 +197,7 @@ void vmalloc_sync_all(void)
|
|||
return;
|
||||
|
||||
for (address = VMALLOC_START & PMD_MASK;
|
||||
address >= TASK_SIZE_MAX && address < FIXADDR_TOP;
|
||||
address >= TASK_SIZE_MAX && address < VMALLOC_END;
|
||||
address += PMD_SIZE) {
|
||||
struct page *page;
|
||||
|
||||
|
|
|
@ -56,7 +56,7 @@ static struct memtype *memtype_match(u64 start, u64 end, int match_type)
|
|||
{
|
||||
struct memtype *match;
|
||||
|
||||
match = memtype_interval_iter_first(&memtype_rbroot, start, end);
|
||||
match = memtype_interval_iter_first(&memtype_rbroot, start, end-1);
|
||||
while (match != NULL && match->start < end) {
|
||||
if ((match_type == MEMTYPE_EXACT_MATCH) &&
|
||||
(match->start == start) && (match->end == end))
|
||||
|
@ -66,7 +66,7 @@ static struct memtype *memtype_match(u64 start, u64 end, int match_type)
|
|||
(match->start < start) && (match->end == end))
|
||||
return match;
|
||||
|
||||
match = memtype_interval_iter_next(match, start, end);
|
||||
match = memtype_interval_iter_next(match, start, end-1);
|
||||
}
|
||||
|
||||
return NULL; /* Returns NULL if there is no match */
|
||||
|
@ -79,7 +79,7 @@ static int memtype_check_conflict(u64 start, u64 end,
|
|||
struct memtype *match;
|
||||
enum page_cache_mode found_type = reqtype;
|
||||
|
||||
match = memtype_interval_iter_first(&memtype_rbroot, start, end);
|
||||
match = memtype_interval_iter_first(&memtype_rbroot, start, end-1);
|
||||
if (match == NULL)
|
||||
goto success;
|
||||
|
||||
|
@ -89,12 +89,12 @@ static int memtype_check_conflict(u64 start, u64 end,
|
|||
dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end);
|
||||
found_type = match->type;
|
||||
|
||||
match = memtype_interval_iter_next(match, start, end);
|
||||
match = memtype_interval_iter_next(match, start, end-1);
|
||||
while (match) {
|
||||
if (match->type != found_type)
|
||||
goto failure;
|
||||
|
||||
match = memtype_interval_iter_next(match, start, end);
|
||||
match = memtype_interval_iter_next(match, start, end-1);
|
||||
}
|
||||
success:
|
||||
if (newtype)
|
||||
|
@ -160,7 +160,7 @@ struct memtype *memtype_erase(u64 start, u64 end)
|
|||
struct memtype *memtype_lookup(u64 addr)
|
||||
{
|
||||
return memtype_interval_iter_first(&memtype_rbroot, addr,
|
||||
addr + PAGE_SIZE);
|
||||
addr + PAGE_SIZE-1);
|
||||
}
|
||||
|
||||
#if defined(CONFIG_DEBUG_FS)
|
||||
|
|
|
@ -12,6 +12,10 @@
|
|||
#include <linux/sched/task_stack.h>
|
||||
#include <linux/uaccess.h>
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
#include <asm/desc.h>
|
||||
#endif
|
||||
|
||||
struct lkdtm_list {
|
||||
struct list_head node;
|
||||
};
|
||||
|
@ -337,3 +341,38 @@ void lkdtm_UNSET_SMEP(void)
|
|||
pr_err("FAIL: this test is x86_64-only\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
void lkdtm_DOUBLE_FAULT(void)
|
||||
{
|
||||
/*
|
||||
* Trigger #DF by setting the stack limit to zero. This clobbers
|
||||
* a GDT TLS slot, which is okay because the current task will die
|
||||
* anyway due to the double fault.
|
||||
*/
|
||||
struct desc_struct d = {
|
||||
.type = 3, /* expand-up, writable, accessed data */
|
||||
.p = 1, /* present */
|
||||
.d = 1, /* 32-bit */
|
||||
.g = 0, /* limit in bytes */
|
||||
.s = 1, /* not system */
|
||||
};
|
||||
|
||||
local_irq_disable();
|
||||
write_gdt_entry(get_cpu_gdt_rw(smp_processor_id()),
|
||||
GDT_ENTRY_TLS_MIN, &d, DESCTYPE_S);
|
||||
|
||||
/*
|
||||
* Put our zero-limit segment in SS and then trigger a fault. The
|
||||
* 4-byte access to (%esp) will fault with #SS, and the attempt to
|
||||
* deliver the fault will recursively cause #SS and result in #DF.
|
||||
* This whole process happens while NMIs and MCEs are blocked by the
|
||||
* MOV SS window. This is nice because an NMI with an invalid SS
|
||||
* would also double-fault, resulting in the NMI or MCE being lost.
|
||||
*/
|
||||
asm volatile ("movw %0, %%ss; addl $0, (%%esp)" ::
|
||||
"r" ((unsigned short)(GDT_ENTRY_TLS_MIN << 3)));
|
||||
|
||||
panic("tried to double fault but didn't die\n");
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -171,6 +171,9 @@ static const struct crashtype crashtypes[] = {
|
|||
CRASHTYPE(USERCOPY_KERNEL_DS),
|
||||
CRASHTYPE(STACKLEAK_ERASING),
|
||||
CRASHTYPE(CFI_FORWARD_PROTO),
|
||||
#ifdef CONFIG_X86_32
|
||||
CRASHTYPE(DOUBLE_FAULT),
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
|
|
|
@ -28,6 +28,9 @@ void lkdtm_CORRUPT_USER_DS(void);
|
|||
void lkdtm_STACK_GUARD_PAGE_LEADING(void);
|
||||
void lkdtm_STACK_GUARD_PAGE_TRAILING(void);
|
||||
void lkdtm_UNSET_SMEP(void);
|
||||
#ifdef CONFIG_X86_32
|
||||
void lkdtm_DOUBLE_FAULT(void);
|
||||
#endif
|
||||
|
||||
/* lkdtm_heap.c */
|
||||
void __init lkdtm_heap_init(void);
|
||||
|
|
|
@ -43,7 +43,19 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
|
|||
err(1, "sigaction");
|
||||
}
|
||||
|
||||
static volatile sig_atomic_t sig_traps;
|
||||
static void clearhandler(int sig)
|
||||
{
|
||||
struct sigaction sa;
|
||||
memset(&sa, 0, sizeof(sa));
|
||||
sa.sa_handler = SIG_DFL;
|
||||
sigemptyset(&sa.sa_mask);
|
||||
if (sigaction(sig, &sa, 0))
|
||||
err(1, "sigaction");
|
||||
}
|
||||
|
||||
static volatile sig_atomic_t sig_traps, sig_eflags;
|
||||
sigjmp_buf jmpbuf;
|
||||
static unsigned char altstack_data[SIGSTKSZ];
|
||||
|
||||
#ifdef __x86_64__
|
||||
# define REG_IP REG_RIP
|
||||
|
@ -90,6 +102,25 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
|
|||
}
|
||||
}
|
||||
|
||||
static char const * const signames[] = {
|
||||
[SIGSEGV] = "SIGSEGV",
|
||||
[SIGBUS] = "SIGBUS",
|
||||
[SIGTRAP] = "SIGTRAP",
|
||||
[SIGILL] = "SIGILL",
|
||||
};
|
||||
|
||||
static void print_and_longjmp(int sig, siginfo_t *si, void *ctx_void)
|
||||
{
|
||||
ucontext_t *ctx = ctx_void;
|
||||
|
||||
printf("\tGot %s with RIP=%lx, TF=%ld\n", signames[sig],
|
||||
(unsigned long)ctx->uc_mcontext.gregs[REG_IP],
|
||||
(unsigned long)ctx->uc_mcontext.gregs[REG_EFL] & X86_EFLAGS_TF);
|
||||
|
||||
sig_eflags = (unsigned long)ctx->uc_mcontext.gregs[REG_EFL];
|
||||
siglongjmp(jmpbuf, 1);
|
||||
}
|
||||
|
||||
static void check_result(void)
|
||||
{
|
||||
unsigned long new_eflags = get_eflags();
|
||||
|
@ -109,6 +140,22 @@ static void check_result(void)
|
|||
sig_traps = 0;
|
||||
}
|
||||
|
||||
static void fast_syscall_no_tf(void)
|
||||
{
|
||||
sig_traps = 0;
|
||||
printf("[RUN]\tFast syscall with TF cleared\n");
|
||||
fflush(stdout); /* Force a syscall */
|
||||
if (get_eflags() & X86_EFLAGS_TF) {
|
||||
printf("[FAIL]\tTF is now set\n");
|
||||
exit(1);
|
||||
}
|
||||
if (sig_traps) {
|
||||
printf("[FAIL]\tGot SIGTRAP\n");
|
||||
exit(1);
|
||||
}
|
||||
printf("[OK]\tNothing unexpected happened\n");
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
#ifdef CAN_BUILD_32
|
||||
|
@ -163,17 +210,46 @@ int main()
|
|||
check_result();
|
||||
|
||||
/* Now make sure that another fast syscall doesn't set TF again. */
|
||||
printf("[RUN]\tFast syscall with TF cleared\n");
|
||||
fflush(stdout); /* Force a syscall */
|
||||
if (get_eflags() & X86_EFLAGS_TF) {
|
||||
printf("[FAIL]\tTF is now set\n");
|
||||
fast_syscall_no_tf();
|
||||
|
||||
/*
|
||||
* And do a forced SYSENTER to make sure that this works even if
|
||||
* fast syscalls don't use SYSENTER.
|
||||
*
|
||||
* Invoking SYSENTER directly breaks all the rules. Just handle
|
||||
* the SIGSEGV.
|
||||
*/
|
||||
if (sigsetjmp(jmpbuf, 1) == 0) {
|
||||
unsigned long nr = SYS_getpid;
|
||||
printf("[RUN]\tSet TF and check SYSENTER\n");
|
||||
stack_t stack = {
|
||||
.ss_sp = altstack_data,
|
||||
.ss_size = SIGSTKSZ,
|
||||
};
|
||||
if (sigaltstack(&stack, NULL) != 0)
|
||||
err(1, "sigaltstack");
|
||||
sethandler(SIGSEGV, print_and_longjmp,
|
||||
SA_RESETHAND | SA_ONSTACK);
|
||||
sethandler(SIGILL, print_and_longjmp, SA_RESETHAND);
|
||||
set_eflags(get_eflags() | X86_EFLAGS_TF);
|
||||
/* Clear EBP first to make sure we segfault cleanly. */
|
||||
asm volatile ("xorl %%ebp, %%ebp; SYSENTER" : "+a" (nr) :: "flags", "rcx"
|
||||
#ifdef __x86_64__
|
||||
, "r11"
|
||||
#endif
|
||||
);
|
||||
|
||||
/* We're unreachable here. SYSENTER forgets RIP. */
|
||||
}
|
||||
clearhandler(SIGSEGV);
|
||||
clearhandler(SIGILL);
|
||||
if (!(sig_eflags & X86_EFLAGS_TF)) {
|
||||
printf("[FAIL]\tTF was cleared\n");
|
||||
exit(1);
|
||||
}
|
||||
if (sig_traps) {
|
||||
printf("[FAIL]\tGot SIGTRAP\n");
|
||||
exit(1);
|
||||
}
|
||||
printf("[OK]\tNothing unexpected happened\n");
|
||||
|
||||
/* Now make sure that another fast syscall doesn't set TF again. */
|
||||
fast_syscall_no_tf();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue