From 3052636aa9aa2492ccac973449be63cae5b93a67 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Wed, 16 Dec 2020 21:11:59 +0800 Subject: [PATCH 01/23] x86/mtrr: Convert comma to semicolon Replace a comma between expression statements with a semicolon. Signed-off-by: Zheng Yongjun Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20201216131159.14393-1-zhengyongjun3@huawei.com --- arch/x86/kernel/cpu/mtrr/cleanup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 5bd011737272..9231640782fa 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -537,9 +537,9 @@ static void __init print_out_mtrr_range_state(void) if (!size_base) continue; - size_base = to_size_factor(size_base, &size_factor), + size_base = to_size_factor(size_base, &size_factor); start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); - start_base = to_size_factor(start_base, &start_factor), + start_base = to_size_factor(start_base, &start_factor); type = range_state[i].type; pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n", From 167dcfc08b0b1f964ea95d410aa496fd78adf475 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 15 Dec 2020 20:56:41 +0000 Subject: [PATCH 02/23] x86/mm: Increase pgt_buf size for 5-level page tables pgt_buf is used to allocate page tables on initial direct page mapping which bootstraps the kernel into being able to allocate these before the direct mapping makes further pages available. INIT_PGD_PAGE_COUNT is set to 6 pages (doubled for KASLR) - 3 (PUD, PMD, PTE) for the 1 MiB ISA mapping and 3 more for the first direct mapping assignment in each case providing 2 MiB of address space. This has not been updated for 5-level page tables which has an additional P4D page table level above PUD. In most instances, this will not have a material impact as the first 4 page levels allocated for the ISA mapping will provide sufficient address space to encompass all further address mappings. If the first direct mapping is within 512 GiB of the ISA mapping, only a PMD and PTE needs to be added in the instance the kernel is using 4 KiB page tables (e.g. CONFIG_DEBUG_PAGEALLOC is enabled) and only a PMD if the kernel can use 2 MiB pages (the first allocation is limited to PMD_SIZE so a GiB page cannot be used there). However, if the machine has more than 512 GiB of RAM and the kernel is allocating 4 KiB page size, 3 further page tables are required. If the machine has more than 256 TiB of RAM at 4 KiB or 2 MiB page size, further 3 or 4 page tables are required respectively. Update INIT_PGD_PAGE_COUNT to reflect this. [ bp: Sanitize text into passive voice without ambiguous personal pronouns. ] Signed-off-by: Lorenzo Stoakes Signed-off-by: Borislav Petkov Acked-by: Kirill A. Shutemov Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/20201215205641.34096-1-lstoakes@gmail.com --- arch/x86/mm/init.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index e26f5c5c6565..dd694fb93916 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -157,16 +157,25 @@ __ref void *alloc_low_pages(unsigned int num) } /* - * By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS. - * With KASLR memory randomization, depending on the machine e820 memory - * and the PUD alignment. We may need twice more pages when KASLR memory + * By default need to be able to allocate page tables below PGD firstly for + * the 0-ISA_END_ADDRESS range and secondly for the initial PMD_SIZE mapping. + * With KASLR memory randomization, depending on the machine e820 memory and the + * PUD alignment, twice that many pages may be needed when KASLR memory * randomization is enabled. */ -#ifndef CONFIG_RANDOMIZE_MEMORY -#define INIT_PGD_PAGE_COUNT 6 + +#ifndef CONFIG_X86_5LEVEL +#define INIT_PGD_PAGE_TABLES 3 #else -#define INIT_PGD_PAGE_COUNT 12 +#define INIT_PGD_PAGE_TABLES 4 #endif + +#ifndef CONFIG_RANDOMIZE_MEMORY +#define INIT_PGD_PAGE_COUNT (2 * INIT_PGD_PAGE_TABLES) +#else +#define INIT_PGD_PAGE_COUNT (4 * INIT_PGD_PAGE_TABLES) +#endif + #define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE) RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); void __init early_alloc_pgt_buf(void) From 91a8f6cb06b33adc79fbf5f7381d907485767c00 Mon Sep 17 00:00:00 2001 From: Adrian Huang Date: Thu, 17 Dec 2020 13:26:48 +0800 Subject: [PATCH 03/23] x86/mm: Refine mmap syscall implementation It is unnecessary to use the local variable 'error' in the mmap syscall implementation function - just return -EINVAL directly and get rid of the local variable altogether. [ bp: Massage commit message. ] Signed-off-by: Adrian Huang Signed-off-by: Borislav Petkov Reviewed-by: Christoph Hellwig Link: https://lkml.kernel.org/r/20201217052648.24656-1-adrianhuang0701@gmail.com --- arch/x86/kernel/sys_x86_64.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 504fa5425bce..660b78827638 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -90,14 +90,10 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, off) { - long error; - error = -EINVAL; if (off & ~PAGE_MASK) - goto out; + return -EINVAL; - error = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); -out: - return error; + return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); } static void find_start_end(unsigned long addr, unsigned long flags, From 4af0e6e39b7ed77796a41537db91d717fedd0ac3 Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Wed, 11 Nov 2020 11:09:46 -0500 Subject: [PATCH 04/23] x86/mm: Remove duplicate definition of _PAGE_PAT_LARGE _PAGE_PAT_LARGE is already defined next to _PAGE_PAT. Remove the duplicate. Fixes: 4efb56649132 ("x86/mm: Tabulate the page table encoding definitions") Signed-off-by: Arvind Sankar Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/20201111160946.147341-2-nivedita@alum.mit.edu --- arch/x86/include/asm/pgtable_types.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 394757ee030a..f24d7ef8fffa 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -177,8 +177,6 @@ enum page_cache_mode { #define __pgprot(x) ((pgprot_t) { (x) } ) #define __pg(x) __pgprot(x) -#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) - #define PAGE_NONE __pg( 0| 0| 0|___A| 0| 0| 0|___G) #define PAGE_SHARED __pg(__PP|__RW|_USR|___A|__NX| 0| 0| 0) #define PAGE_SHARED_EXEC __pg(__PP|__RW|_USR|___A| 0| 0| 0| 0) From 11aa1415d8bd2920ce884356479eabbd64b1df2a Mon Sep 17 00:00:00 2001 From: Hao Lee Date: Sun, 3 Jan 2021 03:08:34 +0000 Subject: [PATCH 05/23] x86/entry: Remove now unused do_IRQ() declaration do_IRQ() has been replaced by common_interrupt() in fa5e5c409213 ("x86/entry: Use idtentry for interrupts") Remove its now unused declaration. Signed-off-by: Hao Lee Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210103030834.GA15432@haolee.github.io --- arch/x86/include/asm/irq.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 528c8a71fe7f..76d389691b5b 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -40,8 +40,6 @@ extern void native_init_IRQ(void); extern void __handle_irq(struct irq_desc *desc, struct pt_regs *regs); -extern __visible void do_IRQ(struct pt_regs *regs, unsigned long vector); - extern void init_ISA_irqs(void); extern void __init init_IRQ(void); From b86cb29287be07041b81f5611e37ae9ffabff876 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 14 Jan 2021 13:28:27 -0800 Subject: [PATCH 06/23] x86: Remove definition of DEBUG Defining DEBUG should only be done in development. So remove it. Signed-off-by: Tom Rix Signed-off-by: Borislav Petkov Acked-by: Steven Rostedt (VMware) Link: https://lkml.kernel.org/r/20210114212827.47584-1-trix@redhat.com --- arch/x86/kernel/cpu/mtrr/generic.c | 1 - arch/x86/kernel/cpu/mtrr/mtrr.c | 2 -- arch/x86/kernel/pci-iommu_table.c | 3 --- arch/x86/mm/mmio-mod.c | 2 -- 4 files changed, 8 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 23ad8e953dfb..2cf4465d5693 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -3,7 +3,6 @@ * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong * because MTRRs can span up to 40 bits (36bits on most modern x86) */ -#define DEBUG #include #include diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 61eb26edc6d2..28c8a23aa42e 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -31,8 +31,6 @@ System Programming Guide; Section 9.11. (1997 edition - PPro). */ -#define DEBUG - #include /* FIXME: kvm_para.h needs this */ #include diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c index 2e9006c1e240..42e92ec62973 100644 --- a/arch/x86/kernel/pci-iommu_table.c +++ b/arch/x86/kernel/pci-iommu_table.c @@ -4,9 +4,6 @@ #include #include - -#define DEBUG 1 - static struct iommu_table_entry * __init find_dependents_of(struct iommu_table_entry *start, struct iommu_table_entry *finish, diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index bd7aff5c51f7..cd768dafca9e 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -10,8 +10,6 @@ #define pr_fmt(fmt) "mmiotrace: " fmt -#define DEBUG 1 - #include #include #include From 8ece53ef7f428ee3f8eab936268b1a3fe2725e6b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 19 Jan 2021 09:40:55 -0800 Subject: [PATCH 07/23] x86/vm86/32: Remove VM86_SCREEN_BITMAP support The implementation was rather buggy. It unconditionally marked PTEs read-only, even for VM_SHARED mappings. I'm not sure whether this is actually a problem, but it certainly seems unwise. More importantly, it released the mmap lock before flushing the TLB, which could allow a racing CoW operation to falsely believe that the underlying memory was not writable. I can't find any users at all of this mechanism, so just remove it. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Acked-by: Stas Sergeev Link: https://lkml.kernel.org/r/f3086de0babcab36f69949b5780bde851f719bc8.1611078018.git.luto@kernel.org --- arch/x86/include/asm/vm86.h | 1 - arch/x86/include/uapi/asm/vm86.h | 4 +-- arch/x86/kernel/vm86_32.c | 62 ++++++++------------------------ arch/x86/mm/fault.c | 30 ---------------- 4 files changed, 16 insertions(+), 81 deletions(-) diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h index 26efbec94448..9e8ac5073ecb 100644 --- a/arch/x86/include/asm/vm86.h +++ b/arch/x86/include/asm/vm86.h @@ -36,7 +36,6 @@ struct vm86 { unsigned long saved_sp0; unsigned long flags; - unsigned long screen_bitmap; unsigned long cpu_type; struct revectored_struct int_revectored; struct revectored_struct int21_revectored; diff --git a/arch/x86/include/uapi/asm/vm86.h b/arch/x86/include/uapi/asm/vm86.h index d2ee4e307ef8..18909b8050bc 100644 --- a/arch/x86/include/uapi/asm/vm86.h +++ b/arch/x86/include/uapi/asm/vm86.h @@ -97,7 +97,7 @@ struct revectored_struct { struct vm86_struct { struct vm86_regs regs; unsigned long flags; - unsigned long screen_bitmap; + unsigned long screen_bitmap; /* unused, preserved by vm86() */ unsigned long cpu_type; struct revectored_struct int_revectored; struct revectored_struct int21_revectored; @@ -106,7 +106,7 @@ struct vm86_struct { /* * flags masks */ -#define VM86_SCREEN_BITMAP 0x0001 +#define VM86_SCREEN_BITMAP 0x0001 /* no longer supported */ struct vm86plus_info_struct { unsigned long force_return_for_pic:1; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 764573de3996..e5a7a10a0164 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -134,7 +134,11 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) unsafe_put_user(regs->ds, &user->regs.ds, Efault_end); unsafe_put_user(regs->fs, &user->regs.fs, Efault_end); unsafe_put_user(regs->gs, &user->regs.gs, Efault_end); - unsafe_put_user(vm86->screen_bitmap, &user->screen_bitmap, Efault_end); + + /* + * Don't write screen_bitmap in case some user had a value there + * and expected it to remain unchanged. + */ user_access_end(); @@ -160,49 +164,6 @@ Efault: do_exit(SIGSEGV); } -static void mark_screen_rdonly(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - spinlock_t *ptl; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - int i; - - mmap_write_lock(mm); - pgd = pgd_offset(mm, 0xA0000); - if (pgd_none_or_clear_bad(pgd)) - goto out; - p4d = p4d_offset(pgd, 0xA0000); - if (p4d_none_or_clear_bad(p4d)) - goto out; - pud = pud_offset(p4d, 0xA0000); - if (pud_none_or_clear_bad(pud)) - goto out; - pmd = pmd_offset(pud, 0xA0000); - - if (pmd_trans_huge(*pmd)) { - vma = find_vma(mm, 0xA0000); - split_huge_pmd(vma, pmd, 0xA0000); - } - if (pmd_none_or_clear_bad(pmd)) - goto out; - pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); - for (i = 0; i < 32; i++) { - if (pte_present(*pte)) - set_pte(pte, pte_wrprotect(*pte)); - pte++; - } - pte_unmap_unlock(pte, ptl); -out: - mmap_write_unlock(mm); - flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false); -} - - - static int do_vm86_irq_handling(int subfunction, int irqnumber); static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus); @@ -282,6 +243,15 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) offsetof(struct vm86_struct, int_revectored))) return -EFAULT; + + /* VM86_SCREEN_BITMAP had numerous bugs and appears to have no users. */ + if (v.flags & VM86_SCREEN_BITMAP) { + char comm[TASK_COMM_LEN]; + + pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n", get_task_comm(comm, current)); + return -EINVAL; + } + memset(&vm86regs, 0, sizeof(vm86regs)); vm86regs.pt.bx = v.regs.ebx; @@ -302,7 +272,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) vm86regs.gs = v.regs.gs; vm86->flags = v.flags; - vm86->screen_bitmap = v.screen_bitmap; vm86->cpu_type = v.cpu_type; if (copy_from_user(&vm86->int_revectored, @@ -370,9 +339,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) update_task_stack(tsk); preempt_enable(); - if (vm86->flags & VM86_SCREEN_BITMAP) - mark_screen_rdonly(tsk->mm); - memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs)); return regs->ax; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f1f1b5a0956a..106b22d1d189 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -262,25 +262,6 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end) } } -/* - * Did it hit the DOS screen memory VA from vm86 mode? - */ -static inline void -check_v8086_mode(struct pt_regs *regs, unsigned long address, - struct task_struct *tsk) -{ -#ifdef CONFIG_VM86 - unsigned long bit; - - if (!v8086_mode(regs) || !tsk->thread.vm86) - return; - - bit = (address - 0xA0000) >> PAGE_SHIFT; - if (bit < 32) - tsk->thread.vm86->screen_bitmap |= 1 << bit; -#endif -} - static bool low_pfn(unsigned long pfn) { return pfn < max_low_pfn; @@ -335,15 +316,6 @@ KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; #endif -/* - * No vm86 mode in 64-bit mode: - */ -static inline void -check_v8086_mode(struct pt_regs *regs, unsigned long address, - struct task_struct *tsk) -{ -} - static int bad_address(void *p) { unsigned long dummy; @@ -1416,8 +1388,6 @@ good_area: mm_fault_error(regs, hw_error_code, address, fault); return; } - - check_v8086_mode(regs, address, tsk); } NOKPROBE_SYMBOL(do_user_addr_fault); From f22fecaf39c30acce701ffc3e9875020ba31f1f5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 3 Feb 2021 10:09:58 -0800 Subject: [PATCH 08/23] x86/ptrace: Clean up PTRACE_GETREGS/PTRACE_PUTREGS regset selection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit task_user_regset_view() has nonsensical semantics, but those semantics appear to be relied on by existing users of PTRACE_GETREGSET and PTRACE_SETREGSET. (See added comments below for details.) It shouldn't be used for PTRACE_GETREGS or PTRACE_SETREGS, though. A native 64-bit ptrace() call and an x32 ptrace() call using GETREGS or SETREGS wants the 64-bit regset views, and a 32-bit ptrace() call (native or compat) should use the 32-bit regset. task_user_regset_view() almost does this except that it will malfunction if a ptracer is itself ptraced and the outer ptracer modifies CS on entry to a ptrace() syscall. Hopefully that has never happened. (The compat ptrace() code already hardcoded the 32-bit regset, so this change has no effect on that path.) Improve the situation and deobfuscate the code by hardcoding the 64-bit view in the x32 ptrace() and selecting the view based on the kernel config in the native ptrace(). I tried to figure out the history behind this API. I naïvely assumed that PTRAGE_GETREGSET and PTRACE_SETREGSET were ancient APIs that predated compat, but no. They were introduced by 2225a122ae26 ("ptrace: Add support for generic PTRACE_GETREGSET/PTRACE_SETREGSET") in 2010, and they are simply a poor design. ELF core dumps have the ELF e_machine field and a bunch of register sets in ELF notes, and the pair (e_machine, NT_XXX) indicates the format of the regset blob. But the new PTRACE_GET/SETREGSET API coopted the NT_XXX numbering without any way to specify which e_machine was in effect. This is especially bad on x86, where a process can freely switch between 32-bit and 64-bit mode, and, in fact, the PTRAGE_SETREGSET call itself can cause this switch to happen. Oops. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/9daa791d0c7eaebd59c5bc2b2af1b0e7bebe707d.1612375698.git.luto@kernel.org --- arch/x86/kernel/ptrace.c | 46 +++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index bedca011459c..87a4143aa7d7 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -704,6 +704,9 @@ void ptrace_disable(struct task_struct *child) #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION static const struct user_regset_view user_x86_32_view; /* Initialized below. */ #endif +#ifdef CONFIG_X86_64 +static const struct user_regset_view user_x86_64_view; /* Initialized below. */ +#endif long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) @@ -711,6 +714,14 @@ long arch_ptrace(struct task_struct *child, long request, int ret; unsigned long __user *datap = (unsigned long __user *)data; +#ifdef CONFIG_X86_64 + /* This is native 64-bit ptrace() */ + const struct user_regset_view *regset_view = &user_x86_64_view; +#else + /* This is native 32-bit ptrace() */ + const struct user_regset_view *regset_view = &user_x86_32_view; +#endif + switch (request) { /* read the word at location addr in the USER area. */ case PTRACE_PEEKUSR: { @@ -749,28 +760,28 @@ long arch_ptrace(struct task_struct *child, long request, case PTRACE_GETREGS: /* Get all gp regs from the child. */ return copy_regset_to_user(child, - task_user_regset_view(current), + regset_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, - task_user_regset_view(current), + regset_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, - task_user_regset_view(current), + regset_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. */ return copy_regset_from_user(child, - task_user_regset_view(current), + regset_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); @@ -1152,28 +1163,28 @@ static long x32_arch_ptrace(struct task_struct *child, case PTRACE_GETREGS: /* Get all gp regs from the child. */ return copy_regset_to_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. */ return copy_regset_from_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); @@ -1309,6 +1320,25 @@ void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask) xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; } +/* + * This is used by the core dump code to decide which regset to dump. The + * core dump code writes out the resulting .e_machine and the corresponding + * regsets. This is suboptimal if the task is messing around with its CS.L + * field, but at worst the core dump will end up missing some information. + * + * Unfortunately, it is also used by the broken PTRACE_GETREGSET and + * PTRACE_SETREGSET APIs. These APIs look at the .regsets field but have + * no way to make sure that the e_machine they use matches the caller's + * expectations. The result is that the data format returned by + * PTRACE_GETREGSET depends on the returned CS field (and even the offset + * of the returned CS field depends on its value!) and the data format + * accepted by PTRACE_SETREGSET is determined by the old CS value. The + * upshot is that it is basically impossible to use these APIs correctly. + * + * The best way to fix it in the long run would probably be to add new + * improved ptrace() APIs to read and write registers reliably, possibly by + * allowing userspace to select the ELF e_machine variant that they expect. + */ const struct user_regset_view *task_user_regset_view(struct task_struct *task) { #ifdef CONFIG_IA32_EMULATION From 4f63b320afdd9af406f4426b0ff1a2cdb23e5b8d Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 5 Mar 2020 21:17:19 +0300 Subject: [PATCH 09/23] x86/asm: Fixup TASK_SIZE_MAX comment Comment says "by preventing anything executable" which is not true. Even PROT_NONE mapping can't be installed at (1<<47 - 4096). mmap(0x7ffffffff000, 4096, PROT_NONE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = -1 ENOMEM [ bp: Fixup to the moved location in page_64_types.h. ] Signed-off-by: Alexey Dobriyan Signed-off-by: Borislav Petkov Reviewed-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200305181719.GA5490@avx2 --- arch/x86/include/asm/page_64_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 645bd1d0ee07..64297eabad63 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -66,7 +66,7 @@ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical * address, then that syscall will enter the kernel with a * non-canonical return address, and SYSRET will explode dangerously. - * We avoid this particular problem by preventing anything executable + * We avoid this particular problem by preventing anything * from being mapped at the maximum canonical address. * * On AMD CPUs in the Ryzen family, there's a nasty bug in which the From 3228e1dc80983ee1f5d2e533d010b3bd8b50f0e2 Mon Sep 17 00:00:00 2001 From: Anand K Mistry Date: Thu, 4 Feb 2021 18:32:32 +1100 Subject: [PATCH 10/23] x86/Kconfig: Remove HPET_EMULATE_RTC depends on RTC The RTC config option was removed in commit f52ef24be21a ("rtc/alpha: remove legacy rtc driver") Signed-off-by: Anand K Mistry Signed-off-by: Thomas Gleixner Reviewed-by: Randy Dunlap Link: https://lore.kernel.org/r/20210204183205.1.If5c6ded53a00ecad6a02a1e974316291cc0239d1@changeid --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7b6dd10b162a..865c1e7346fe 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -889,7 +889,7 @@ config HPET_TIMER config HPET_EMULATE_RTC def_bool y - depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) + depends on HPET_TIMER && (RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) config APB_TIMER def_bool y if X86_INTEL_MID From 35f1c89b0cce247bf0213df243ed902989b1dcda Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 9 Feb 2021 18:33:33 -0800 Subject: [PATCH 11/23] x86/fault: Fix AMD erratum #91 errata fixup for user code The recent rework of probe_kernel_address() and its conversion to get_kernel_nofault() inadvertently broke is_prefetch(). Before this change, probe_kernel_address() was used as a sloppy "read user or kernel memory" helper, but it doesn't do that any more. The new get_kernel_nofault() reads *kernel* memory only, which completely broke is_prefetch() for user access. Adjust the code to the correct accessor based on access mode. The manual address bounds check is no longer necessary, since the accessor helpers (get_user() / get_kernel_nofault()) do the right thing all by themselves. As a bonus, by using the correct accessor, the open-coded address bounds check is not needed anymore. [ bp: Massage commit message. ] Fixes: eab0c6089b68 ("maccess: unify the probe kernel arch hooks") Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Reviewed-by: Christoph Hellwig Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/b91f7f92f3367d2d3a88eec3b09c6aab1b2dc8ef.1612924255.git.luto@kernel.org --- arch/x86/mm/fault.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f1f1b5a0956a..441c3e9b8971 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -54,7 +54,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr) * 32-bit mode: * * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. - * Check that here and ignore it. + * Check that here and ignore it. This is AMD erratum #91. * * 64-bit mode: * @@ -83,11 +83,7 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, #ifdef CONFIG_X86_64 case 0x40: /* - * In AMD64 long mode 0x40..0x4F are valid REX prefixes - * Need to figure out under what instruction mode the - * instruction was issued. Could check the LDT for lm, - * but for now it's good enough to assume that long - * mode only uses well known segments or kernel. + * In 64-bit mode 0x40..0x4F are valid REX prefixes */ return (!user_mode(regs) || user_64bit_mode(regs)); #endif @@ -127,20 +123,31 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) instr = (void *)convert_ip_to_linear(current, regs); max_instr = instr + 15; - if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX) - return 0; + /* + * This code has historically always bailed out if IP points to a + * not-present page (e.g. due to a race). No one has ever + * complained about this. + */ + pagefault_disable(); while (instr < max_instr) { unsigned char opcode; - if (get_kernel_nofault(opcode, instr)) - break; + if (user_mode(regs)) { + if (get_user(opcode, instr)) + break; + } else { + if (get_kernel_nofault(opcode, instr)) + break; + } instr++; if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) break; } + + pagefault_enable(); return prefetch; } From d24df8ecf9b6f81029f520ae7158a8670a28d70b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 9 Feb 2021 18:33:34 -0800 Subject: [PATCH 12/23] x86/fault: Skip the AMD erratum #91 workaround on unaffected CPUs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to the Revision Guide for AMD Athlon™ 64 and AMD Opteron™ Processors, only early revisions of family 0xF are affected. This will avoid unnecessarily fetching instruction bytes before sending SIGSEGV to user programs. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/477173b7784bc28afb3e53d76ae5ef143917e8dd.1612924255.git.luto@kernel.org --- arch/x86/mm/fault.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 441c3e9b8971..818902b08c52 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -106,6 +106,15 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, } } +static bool is_amd_k8_pre_npt(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) && + c->x86_vendor == X86_VENDOR_AMD && + c->x86 == 0xf && c->x86_model < 0x40); +} + static int is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) { @@ -113,6 +122,10 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) unsigned char *instr; int prefetch = 0; + /* Erratum #91 affects AMD K8, pre-NPT CPUs */ + if (!is_amd_k8_pre_npt()) + return 0; + /* * If it was a exec (instruction fetch) fault on NX page, then * do not ignore the fault: From ec352711ceba890ea3a0c182c2d49c86c1a5e30e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 9 Feb 2021 18:33:35 -0800 Subject: [PATCH 13/23] x86/fault: Fold mm_fault_error() into do_user_addr_fault() mm_fault_error() is logically just the end of do_user_addr_fault(). Combine the functions. This makes the code easier to read. Most of the churn here is from renaming hw_error_code to error_code in do_user_addr_fault(). This makes no difference at all to the generated code (objdump -dr) as compared to changing noinline to __always_inline in the definition of mm_fault_error(). Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/dedc4d9c9b047e51ce38b991bd23971a28af4e7b.1612924255.git.luto@kernel.org --- arch/x86/mm/fault.c | 97 +++++++++++++++++++++------------------------ 1 file changed, 45 insertions(+), 52 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 818902b08c52..91cf7a672c04 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -981,40 +981,6 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); } -static noinline void -mm_fault_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address, vm_fault_t fault) -{ - if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) { - no_context(regs, error_code, address, 0, 0); - return; - } - - if (fault & VM_FAULT_OOM) { - /* Kernel mode? Handle exceptions or die: */ - if (!(error_code & X86_PF_USER)) { - no_context(regs, error_code, address, - SIGSEGV, SEGV_MAPERR); - return; - } - - /* - * We ran out of memory, call the OOM killer, and return the - * userspace (which will retry the fault, or kill us if we got - * oom-killed): - */ - pagefault_out_of_memory(); - } else { - if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| - VM_FAULT_HWPOISON_LARGE)) - do_sigbus(regs, error_code, address, fault); - else if (fault & VM_FAULT_SIGSEGV) - bad_area_nosemaphore(regs, error_code, address); - else - BUG(); - } -} - static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) @@ -1252,7 +1218,7 @@ NOKPROBE_SYMBOL(do_kern_addr_fault); /* Handle faults in the user portion of the address space */ static inline void do_user_addr_fault(struct pt_regs *regs, - unsigned long hw_error_code, + unsigned long error_code, unsigned long address) { struct vm_area_struct *vma; @@ -1272,8 +1238,8 @@ void do_user_addr_fault(struct pt_regs *regs, * Reserved bits are never expected to be set on * entries in the user portion of the page tables. */ - if (unlikely(hw_error_code & X86_PF_RSVD)) - pgtable_bad(regs, hw_error_code, address); + if (unlikely(error_code & X86_PF_RSVD)) + pgtable_bad(regs, error_code, address); /* * If SMAP is on, check for invalid kernel (supervisor) access to user @@ -1283,10 +1249,10 @@ void do_user_addr_fault(struct pt_regs *regs, * enforcement appears to be consistent with the USER bit. */ if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) && - !(hw_error_code & X86_PF_USER) && + !(error_code & X86_PF_USER) && !(regs->flags & X86_EFLAGS_AC))) { - bad_area_nosemaphore(regs, hw_error_code, address); + bad_area_nosemaphore(regs, error_code, address); return; } @@ -1295,7 +1261,7 @@ void do_user_addr_fault(struct pt_regs *regs, * in a region with pagefaults disabled then we must not take the fault */ if (unlikely(faulthandler_disabled() || !mm)) { - bad_area_nosemaphore(regs, hw_error_code, address); + bad_area_nosemaphore(regs, error_code, address); return; } @@ -1316,9 +1282,9 @@ void do_user_addr_fault(struct pt_regs *regs, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - if (hw_error_code & X86_PF_WRITE) + if (error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; - if (hw_error_code & X86_PF_INSTR) + if (error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; #ifdef CONFIG_X86_64 @@ -1334,7 +1300,7 @@ void do_user_addr_fault(struct pt_regs *regs, * to consider the PF_PK bit. */ if (is_vsyscall_vaddr(address)) { - if (emulate_vsyscall(hw_error_code, regs, address)) + if (emulate_vsyscall(error_code, regs, address)) return; } #endif @@ -1357,7 +1323,7 @@ void do_user_addr_fault(struct pt_regs *regs, * Fault from code in kernel from * which we do not expect faults. */ - bad_area_nosemaphore(regs, hw_error_code, address); + bad_area_nosemaphore(regs, error_code, address); return; } retry: @@ -1373,17 +1339,17 @@ retry: vma = find_vma(mm, address); if (unlikely(!vma)) { - bad_area(regs, hw_error_code, address); + bad_area(regs, error_code, address); return; } if (likely(vma->vm_start <= address)) goto good_area; if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { - bad_area(regs, hw_error_code, address); + bad_area(regs, error_code, address); return; } if (unlikely(expand_stack(vma, address))) { - bad_area(regs, hw_error_code, address); + bad_area(regs, error_code, address); return; } @@ -1392,8 +1358,8 @@ retry: * we can handle it.. */ good_area: - if (unlikely(access_error(hw_error_code, vma))) { - bad_area_access_error(regs, hw_error_code, address, vma); + if (unlikely(access_error(error_code, vma))) { + bad_area_access_error(regs, error_code, address, vma); return; } @@ -1415,7 +1381,7 @@ good_area: /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) - no_context(regs, hw_error_code, address, SIGBUS, + no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); return; } @@ -1432,9 +1398,36 @@ good_area: } mmap_read_unlock(mm); - if (unlikely(fault & VM_FAULT_ERROR)) { - mm_fault_error(regs, hw_error_code, address, fault); + if (likely(!(fault & VM_FAULT_ERROR))) return; + + if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) { + no_context(regs, error_code, address, 0, 0); + return; + } + + if (fault & VM_FAULT_OOM) { + /* Kernel mode? Handle exceptions or die: */ + if (!(error_code & X86_PF_USER)) { + no_context(regs, error_code, address, + SIGSEGV, SEGV_MAPERR); + return; + } + + /* + * We ran out of memory, call the OOM killer, and return the + * userspace (which will retry the fault, or kill us if we got + * oom-killed): + */ + pagefault_out_of_memory(); + } else { + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| + VM_FAULT_HWPOISON_LARGE)) + do_sigbus(regs, error_code, address, fault); + else if (fault & VM_FAULT_SIGSEGV) + bad_area_nosemaphore(regs, error_code, address); + else + BUG(); } check_v8086_mode(regs, address, tsk); From f42a40fd53fb5c77bae67d917d66078dbaa46bc2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 9 Feb 2021 18:33:36 -0800 Subject: [PATCH 14/23] x86/fault/32: Move is_f00f_bug() to do_kern_addr_fault() bad_area() and its relatives are called from many places in fault.c, and exactly one of them wants the F00F workaround. __bad_area_nosemaphore() no longer contains any kernel fault code, which prepares for further cleanups. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/e9668729a48ce6754022b0a4415631e8ebdd00e7.1612924255.git.luto@kernel.org --- arch/x86/mm/fault.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 91cf7a672c04..3ffed003f281 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -482,10 +482,12 @@ static int is_errata100(struct pt_regs *regs, unsigned long address) } /* Pentium F0 0F C7 C8 bug workaround: */ -static int is_f00f_bug(struct pt_regs *regs, unsigned long address) +static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { #ifdef CONFIG_X86_F00F_BUG - if (boot_cpu_has_bug(X86_BUG_F00F) && idt_is_f00f_address(address)) { + if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) && + idt_is_f00f_address(address)) { handle_invalid_op(regs); return 1; } @@ -853,9 +855,6 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, return; } - if (is_f00f_bug(regs, address)) - return; - no_context(regs, error_code, address, SIGSEGV, si_code); } @@ -1195,6 +1194,9 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, } #endif + if (is_f00f_bug(regs, hw_error_code, address)) + return; + /* Was the fault spurious, caused by lazy TLB invalidation? */ if (spurious_kernel_fault(hw_error_code, address)) return; From ef2544fb3f6457b79fc73cea39dafd67ee0f2824 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 9 Feb 2021 18:33:37 -0800 Subject: [PATCH 15/23] x86/fault: Document the locking in the fault_signal_pending() path If fault_signal_pending() returns true, then the core mm has unlocked the mm for us. Add a comment to help future readers of this code. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/c56de3d103f40e6304437b150aa7b215530d23f7.1612924255.git.luto@kernel.org --- arch/x86/mm/fault.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 3ffed003f281..013910b7b93f 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1380,8 +1380,11 @@ good_area: */ fault = handle_mm_fault(vma, address, flags, regs); - /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { + /* + * Quick path to respond to signals. The core mm code + * has unlocked the mm for us if we get here. + */ if (!user_mode(regs)) no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); From 56e62cd28aaae2fcbec8af67b05843c47c6da170 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 9 Feb 2021 18:33:38 -0800 Subject: [PATCH 16/23] x86/fault: Correct a few user vs kernel checks wrt WRUSS In general, page fault errors for WRUSS should be just like get_user(), etc. Fix three bugs in this area: There is a comment that says that, if the kernel can't handle a page fault on a user address due to OOM, the OOM-kill-and-retry logic would be skipped. The code checked kernel *privilege*, not kernel mode, so it missed WRUSS. This means that the kernel would malfunction if it got OOM on a WRUSS fault -- this would be a kernel-mode, user-privilege fault, and the OOM killer would be invoked and the handler would retry the faulting instruction. A failed user access from kernel while a fatal signal is pending should fail even if the instruction in question was WRUSS. do_sigbus() should not send SIGBUS for WRUSS -- it should handle it like any other kernel mode failure. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/a7b7bcea730bd4069e6b7e629236bb2cf526c2fb.1612924255.git.luto@kernel.org --- arch/x86/mm/fault.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 013910b7b93f..b1104844260d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -945,7 +945,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, vm_fault_t fault) { /* Kernel mode? Handle exceptions or die: */ - if (!(error_code & X86_PF_USER)) { + if (!user_mode(regs)) { no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); return; } @@ -1217,7 +1217,14 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, } NOKPROBE_SYMBOL(do_kern_addr_fault); -/* Handle faults in the user portion of the address space */ +/* + * Handle faults in the user portion of the address space. Nothing in here + * should check X86_PF_USER without a specific justification: for almost + * all purposes, we should treat a normal kernel access to user memory + * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction. + * The one exception is AC flag handling, which is, per the x86 + * architecture, special for WRUSS. + */ static inline void do_user_addr_fault(struct pt_regs *regs, unsigned long error_code, @@ -1406,14 +1413,14 @@ good_area: if (likely(!(fault & VM_FAULT_ERROR))) return; - if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) { + if (fatal_signal_pending(current) && !user_mode(regs)) { no_context(regs, error_code, address, 0, 0); return; } if (fault & VM_FAULT_OOM) { /* Kernel mode? Handle exceptions or die: */ - if (!(error_code & X86_PF_USER)) { + if (!user_mode(regs)) { no_context(regs, error_code, address, SIGSEGV, SEGV_MAPERR); return; From 03c81ea3331658f613bb2913d33764a4e0410cbd Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 9 Feb 2021 18:33:39 -0800 Subject: [PATCH 17/23] x86/fault: Improve kernel-executing-user-memory handling Right now, the case of the kernel trying to execute from user memory is treated more or less just like the kernel getting a page fault on a user access. In the failure path, it checks for erratum #93, tries to otherwise fix up the error, and then oopses. If it manages to jump to the user address space, with or without SMEP, it should not try to resolve the page fault. This is an error, pure and simple. Rearrange the code so that this case is caught early, check for erratum #93, and bail out. [ bp: Massage commit message. ] Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/ab8719c7afb8bd501c4eee0e36493150fbbe5f6a.1612924255.git.luto@kernel.org --- arch/x86/mm/fault.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index b1104844260d..cbb1a9754473 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -447,6 +447,9 @@ static int is_errata93(struct pt_regs *regs, unsigned long address) || boot_cpu_data.x86 != 0xf) return 0; + if (user_mode(regs)) + return 0; + if (address != regs->ip) return 0; @@ -744,9 +747,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (is_prefetch(regs, error_code, address)) return; - if (is_errata93(regs, address)) - return; - /* * Buggy firmware could access regions which might page fault, try to * recover from such faults. @@ -1239,6 +1239,21 @@ void do_user_addr_fault(struct pt_regs *regs, tsk = current; mm = tsk->mm; + if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) { + /* + * Whoops, this is kernel mode code trying to execute from + * user memory. Unless this is AMD erratum #93, which + * corrupts RIP such that it looks like a user address, + * this is unrecoverable. Don't even try to look up the + * VMA. + */ + if (is_errata93(regs, address)) + return; + + bad_area_nosemaphore(regs, error_code, address); + return; + } + /* kprobes don't want to hook the spurious faults: */ if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF))) return; From 2cc624b0a7e68ba8957b18600181f7d5b0f3e1b6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 9 Feb 2021 18:33:41 -0800 Subject: [PATCH 18/23] x86/fault: Split the OOPS code out from no_context() Not all callers of no_context() want to run exception fixups. Separate the OOPS code out from the fixup code in no_context(). Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/450f8d8eabafb83a5df349108c8e5ea83a2f939d.1612924255.git.luto@kernel.org --- arch/x86/mm/fault.c | 146 +++++++++++++++++++++++--------------------- 1 file changed, 77 insertions(+), 69 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index cbb1a9754473..dbf6a940b03f 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -655,13 +655,84 @@ static void set_signal_archinfo(unsigned long address, } static noinline void -no_context(struct pt_regs *regs, unsigned long error_code, - unsigned long address, int signal, int si_code) +page_fault_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { - struct task_struct *tsk = current; unsigned long flags; int sig; + if (user_mode(regs)) { + /* + * Implicit kernel access from user mode? Skip the stack + * overflow and EFI special cases. + */ + goto oops; + } + +#ifdef CONFIG_VMAP_STACK + /* + * Stack overflow? During boot, we can fault near the initial + * stack in the direct map, but that's not an overflow -- check + * that we're in vmalloc space to avoid this. + */ + if (is_vmalloc_addr((void *)address) && + (((unsigned long)current->stack - 1 - address < PAGE_SIZE) || + address - ((unsigned long)current->stack + THREAD_SIZE) < PAGE_SIZE)) { + unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *); + /* + * We're likely to be running with very little stack space + * left. It's plausible that we'd hit this condition but + * double-fault even before we get this far, in which case + * we're fine: the double-fault handler will deal with it. + * + * We don't want to make it all the way into the oops code + * and then double-fault, though, because we're likely to + * break the console driver and lose most of the stack dump. + */ + asm volatile ("movq %[stack], %%rsp\n\t" + "call handle_stack_overflow\n\t" + "1: jmp 1b" + : ASM_CALL_CONSTRAINT + : "D" ("kernel stack overflow (page fault)"), + "S" (regs), "d" (address), + [stack] "rm" (stack)); + unreachable(); + } +#endif + + /* + * Buggy firmware could access regions which might page fault, try to + * recover from such faults. + */ + if (IS_ENABLED(CONFIG_EFI)) + efi_recover_from_page_fault(address); + +oops: + /* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice: + */ + flags = oops_begin(); + + show_fault_oops(regs, error_code, address); + + if (task_stack_end_corrupted(current)) + printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); + + sig = SIGKILL; + if (__die("Oops", regs, error_code)) + sig = 0; + + /* Executive summary in case the body of the oops scrolled away */ + printk(KERN_DEFAULT "CR2: %016lx\n", address); + + oops_end(flags, regs, sig); +} + +static noinline void +no_context(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int signal, int si_code) +{ if (user_mode(regs)) { /* * This is an implicit supervisor-mode access from user @@ -702,78 +773,15 @@ no_context(struct pt_regs *regs, unsigned long error_code, return; } -#ifdef CONFIG_VMAP_STACK /* - * Stack overflow? During boot, we can fault near the initial - * stack in the direct map, but that's not an overflow -- check - * that we're in vmalloc space to avoid this. - */ - if (is_vmalloc_addr((void *)address) && - (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || - address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { - unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *); - /* - * We're likely to be running with very little stack space - * left. It's plausible that we'd hit this condition but - * double-fault even before we get this far, in which case - * we're fine: the double-fault handler will deal with it. - * - * We don't want to make it all the way into the oops code - * and then double-fault, though, because we're likely to - * break the console driver and lose most of the stack dump. - */ - asm volatile ("movq %[stack], %%rsp\n\t" - "call handle_stack_overflow\n\t" - "1: jmp 1b" - : ASM_CALL_CONSTRAINT - : "D" ("kernel stack overflow (page fault)"), - "S" (regs), "d" (address), - [stack] "rm" (stack)); - unreachable(); - } -#endif - - /* - * 32-bit: - * - * Valid to do another page fault here, because if this fault - * had been triggered by is_prefetch fixup_exception would have - * handled it. - * - * 64-bit: - * - * Hall of shame of CPU/BIOS bugs. + * AMD erratum #91 manifests as a spurious page fault on a PREFETCH + * instruction. */ if (is_prefetch(regs, error_code, address)) return; - /* - * Buggy firmware could access regions which might page fault, try to - * recover from such faults. - */ - if (IS_ENABLED(CONFIG_EFI)) - efi_recover_from_page_fault(address); - oops: - /* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice: - */ - flags = oops_begin(); - - show_fault_oops(regs, error_code, address); - - if (task_stack_end_corrupted(tsk)) - printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); - - sig = SIGKILL; - if (__die("Oops", regs, error_code)) - sig = 0; - - /* Executive summary in case the body of the oops scrolled away */ - printk(KERN_DEFAULT "CR2: %016lx\n", address); - - oops_end(flags, regs, sig); + page_fault_oops(regs, error_code, address); } /* From 5042d40a264c8a508d58ed71e4c07b05175b3635 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 9 Feb 2021 18:33:42 -0800 Subject: [PATCH 19/23] x86/fault: Bypass no_context() for implicit kernel faults from usermode Drop an indentation level and remove the last user_mode(regs) == true caller of no_context() by directly OOPSing for implicit kernel faults from usermode. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/6e3d1129494a8de1e59d28012286e3a292a2296e.1612924255.git.luto@kernel.org --- arch/x86/mm/fault.c | 75 ++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index dbf6a940b03f..187975b1df66 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -826,44 +826,49 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, { struct task_struct *tsk = current; - /* User mode accesses just cause a SIGSEGV */ - if (user_mode(regs) && (error_code & X86_PF_USER)) { - /* - * It's possible to have interrupts off here: - */ - local_irq_enable(); - - /* - * Valid to do another page fault here because this one came - * from user space: - */ - if (is_prefetch(regs, error_code, address)) - return; - - if (is_errata100(regs, address)) - return; - - sanitize_error_code(address, &error_code); - - if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) - return; - - if (likely(show_unhandled_signals)) - show_signal_msg(regs, error_code, address, tsk); - - set_signal_archinfo(address, error_code); - - if (si_code == SEGV_PKUERR) - force_sig_pkuerr((void __user *)address, pkey); - - force_sig_fault(SIGSEGV, si_code, (void __user *)address); - - local_irq_disable(); - + if (!user_mode(regs)) { + no_context(regs, error_code, address, pkey, si_code); return; } - no_context(regs, error_code, address, SIGSEGV, si_code); + if (!(error_code & X86_PF_USER)) { + /* Implicit user access to kernel memory -- just oops */ + page_fault_oops(regs, error_code, address); + return; + } + + /* + * User mode accesses just cause a SIGSEGV. + * It's possible to have interrupts off here: + */ + local_irq_enable(); + + /* + * Valid to do another page fault here because this one came + * from user space: + */ + if (is_prefetch(regs, error_code, address)) + return; + + if (is_errata100(regs, address)) + return; + + sanitize_error_code(address, &error_code); + + if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) + return; + + if (likely(show_unhandled_signals)) + show_signal_msg(regs, error_code, address, tsk); + + set_signal_archinfo(address, error_code); + + if (si_code == SEGV_PKUERR) + force_sig_pkuerr((void __user *)address, pkey); + + force_sig_fault(SIGSEGV, si_code, (void __user *)address); + + local_irq_disable(); } static noinline void From 6456a2a69ee16ad402f26d272d0b67ce1d25061f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 9 Feb 2021 18:33:43 -0800 Subject: [PATCH 20/23] x86/fault: Rename no_context() to kernelmode_fixup_or_oops() The name no_context() has never been very clear. It's only called for faults from kernel mode, so rename it and change the no-longer-useful user_mode(regs) check to a WARN_ON_ONCE. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/c21940efe676024bb4bc721f7d70c29c420e127e.1612924255.git.luto@kernel.org --- arch/x86/mm/fault.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 187975b1df66..3566a594e292 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -730,17 +730,10 @@ oops: } static noinline void -no_context(struct pt_regs *regs, unsigned long error_code, - unsigned long address, int signal, int si_code) +kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int signal, int si_code) { - if (user_mode(regs)) { - /* - * This is an implicit supervisor-mode access from user - * mode. Bypass all the kernel-mode recovery code and just - * OOPS. - */ - goto oops; - } + WARN_ON_ONCE(user_mode(regs)); /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) { @@ -780,7 +773,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (is_prefetch(regs, error_code, address)) return; -oops: page_fault_oops(regs, error_code, address); } @@ -827,7 +819,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; if (!user_mode(regs)) { - no_context(regs, error_code, address, pkey, si_code); + kernelmode_fixup_or_oops(regs, error_code, address, pkey, si_code); return; } @@ -959,7 +951,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, { /* Kernel mode? Handle exceptions or die: */ if (!user_mode(regs)) { - no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); + kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR); return; } @@ -1421,8 +1413,8 @@ good_area: * has unlocked the mm for us if we get here. */ if (!user_mode(regs)) - no_context(regs, error_code, address, SIGBUS, - BUS_ADRERR); + kernelmode_fixup_or_oops(regs, error_code, address, + SIGBUS, BUS_ADRERR); return; } @@ -1442,15 +1434,15 @@ good_area: return; if (fatal_signal_pending(current) && !user_mode(regs)) { - no_context(regs, error_code, address, 0, 0); + kernelmode_fixup_or_oops(regs, error_code, address, 0, 0); return; } if (fault & VM_FAULT_OOM) { /* Kernel mode? Handle exceptions or die: */ if (!user_mode(regs)) { - no_context(regs, error_code, address, - SIGSEGV, SEGV_MAPERR); + kernelmode_fixup_or_oops(regs, error_code, address, + SIGSEGV, SEGV_MAPERR); return; } From 66fcd98883816dba3b66da20b5fc86fa410638b5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 9 Feb 2021 18:33:44 -0800 Subject: [PATCH 21/23] x86/fault: Don't look for extable entries for SMEP violations If the kernel gets a SMEP violation or a fault that would have been a SMEP violation if it had SMEP support, it shouldn't run fixups. Just OOPS. [ bp: Massage commit message. ] Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/46160d8babce2abf1d6daa052146002efa24ac56.1612924255.git.luto@kernel.org --- arch/x86/mm/fault.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 3566a594e292..1a0cfede8822 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1250,12 +1250,12 @@ void do_user_addr_fault(struct pt_regs *regs, * user memory. Unless this is AMD erratum #93, which * corrupts RIP such that it looks like a user address, * this is unrecoverable. Don't even try to look up the - * VMA. + * VMA or look for extable entries. */ if (is_errata93(regs, address)) return; - bad_area_nosemaphore(regs, error_code, address); + page_fault_oops(regs, error_code, address); return; } From ca247283781d754216395a41c5e8be8ec79a5f1c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 9 Feb 2021 18:33:45 -0800 Subject: [PATCH 22/23] x86/fault: Don't run fixups for SMAP violations A SMAP-violating kernel access is not a recoverable condition. Imagine kernel code that, outside of a uaccess region, dereferences a pointer to the user range by accident. If SMAP is on, this will reliably generate as an intentional user access. This makes it easy for bugs to be overlooked if code is inadequately tested both with and without SMAP. This was discovered because BPF can generate invalid accesses to user memory, but those warnings only got printed if SMAP was off. Make it so that this type of error will be discovered with SMAP on as well. [ bp: Massage commit message. ] Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/66a02343624b1ff46f02a838c497fc05c1a871b3.1612924255.git.luto@kernel.org --- arch/x86/mm/fault.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 1a0cfede8822..1c3054bb4a5b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1279,9 +1279,12 @@ void do_user_addr_fault(struct pt_regs *regs, */ if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) && !(error_code & X86_PF_USER) && - !(regs->flags & X86_EFLAGS_AC))) - { - bad_area_nosemaphore(regs, error_code, address); + !(regs->flags & X86_EFLAGS_AC))) { + /* + * No extable entry here. This was a kernel access to an + * invalid pointer. get_kernel_nofault() will not get here. + */ + page_fault_oops(regs, error_code, address); return; } From c46f52231e79af025e2c89e889d69ec20a4c024f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 9 Feb 2021 18:33:46 -0800 Subject: [PATCH 23/23] x86/{fault,efi}: Fix and rename efi_recover_from_page_fault() efi_recover_from_page_fault() doesn't recover -- it does a special EFI mini-oops. Rename it to make it clear that it crashes. While renaming it, I noticed a blatant bug: a page fault oops in a different thread happening concurrently with an EFI runtime service call would be misinterpreted as an EFI page fault. Fix that. This isn't quite exact. The situation could be improved by using a special CS for calls into EFI. [ bp: Massage commit message and simplify in interrupt check. ] Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/f43b1e80830dc78ed60ed8b0826f4f189254570c.1612924255.git.luto@kernel.org --- arch/x86/include/asm/efi.h | 2 +- arch/x86/mm/fault.c | 11 ++++++----- arch/x86/platform/efi/quirks.c | 18 +++++++++++++----- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index c98f78330b09..4b7706ddd8b6 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -150,7 +150,7 @@ extern void __init efi_apply_memmap_quirks(void); extern int __init efi_reuse_config(u64 tables, int nr_tables); extern void efi_delete_dummy_variable(void); extern void efi_switch_mm(struct mm_struct *mm); -extern void efi_recover_from_page_fault(unsigned long phys_addr); +extern void efi_crash_gracefully_on_page_fault(unsigned long phys_addr); extern void efi_free_boot_services(void); /* kexec external ABI */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 1c3054bb4a5b..7b3a125e1e98 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -16,7 +16,7 @@ #include /* prefetchw */ #include /* exception_enter(), ... */ #include /* faulthandler_disabled() */ -#include /* efi_recover_from_page_fault()*/ +#include /* efi_crash_gracefully_on_page_fault()*/ #include #include /* boot_cpu_has, ... */ @@ -25,7 +25,7 @@ #include /* emulate_vsyscall */ #include /* struct vm86 */ #include /* vma_pkey() */ -#include /* efi_recover_from_page_fault()*/ +#include /* efi_crash_gracefully_on_page_fault()*/ #include /* store_idt(), ... */ #include /* exception stack */ #include /* VMALLOC_START, ... */ @@ -701,11 +701,12 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code, #endif /* - * Buggy firmware could access regions which might page fault, try to - * recover from such faults. + * Buggy firmware could access regions which might page fault. If + * this happens, EFI has a special OOPS path that will try to + * avoid hanging the system. */ if (IS_ENABLED(CONFIG_EFI)) - efi_recover_from_page_fault(address); + efi_crash_gracefully_on_page_fault(address); oops: /* diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 5a40fe411ebd..67d93a243c35 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -687,15 +687,25 @@ int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff, * @return: Returns, if the page fault is not handled. This function * will never return if the page fault is handled successfully. */ -void efi_recover_from_page_fault(unsigned long phys_addr) +void efi_crash_gracefully_on_page_fault(unsigned long phys_addr) { if (!IS_ENABLED(CONFIG_X86_64)) return; /* - * Make sure that an efi runtime service caused the page fault. + * If we get an interrupt/NMI while processing an EFI runtime service + * then this is a regular OOPS, not an EFI failure. */ - if (efi_rts_work.efi_rts_id == EFI_NONE) + if (in_interrupt()) + return; + + /* + * Make sure that an efi runtime service caused the page fault. + * READ_ONCE() because we might be OOPSing in a different thread, + * and we don't want to trip KTSAN while trying to OOPS. + */ + if (READ_ONCE(efi_rts_work.efi_rts_id) == EFI_NONE || + current_work() != &efi_rts_work.work) return; /* @@ -747,6 +757,4 @@ void efi_recover_from_page_fault(unsigned long phys_addr) set_current_state(TASK_IDLE); schedule(); } - - return; }