From 326b567f82df0c4c8f50092b9af9a3014616fb3c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:12 +0200 Subject: [PATCH 001/110] x86/extable: Tidy up redundant handler functions No need to have the same code all over the place. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132524.963232825@linutronix.de --- arch/x86/mm/extable.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index e1664e9f969c..d9a1046f3a98 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -39,9 +39,8 @@ __visible bool ex_handler_fault(const struct exception_table_entry *fixup, unsigned long error_code, unsigned long fault_addr) { - regs->ip = ex_fixup_addr(fixup); regs->ax = trapnr; - return true; + return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr); } EXPORT_SYMBOL_GPL(ex_handler_fault); @@ -76,8 +75,7 @@ __visible bool ex_handler_uaccess(const struct exception_table_entry *fixup, unsigned long fault_addr) { WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?"); - regs->ip = ex_fixup_addr(fixup); - return true; + return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr); } EXPORT_SYMBOL(ex_handler_uaccess); @@ -87,9 +85,7 @@ __visible bool ex_handler_copy(const struct exception_table_entry *fixup, unsigned long fault_addr) { WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?"); - regs->ip = ex_fixup_addr(fixup); - regs->ax = trapnr; - return true; + return ex_handler_fault(fixup, regs, trapnr, error_code, fault_addr); } EXPORT_SYMBOL(ex_handler_copy); @@ -103,10 +99,9 @@ __visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup show_stack_regs(regs); /* Pretend that the read succeeded and returned 0. */ - regs->ip = ex_fixup_addr(fixup); regs->ax = 0; regs->dx = 0; - return true; + return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr); } EXPORT_SYMBOL(ex_handler_rdmsr_unsafe); @@ -121,8 +116,7 @@ __visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup show_stack_regs(regs); /* Pretend that the write succeeded. */ - regs->ip = ex_fixup_addr(fixup); - return true; + return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr); } EXPORT_SYMBOL(ex_handler_wrmsr_unsafe); From 32fd8b59f91fcd3bf9459aa72d90345735cc2588 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:13 +0200 Subject: [PATCH 002/110] x86/extable: Get rid of redundant macros No point in defining the identical macros twice depending on C or assembly mode. They are still identical. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.023659534@linutronix.de --- arch/x86/include/asm/asm.h | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 3ad3da9a7d97..719955e658a2 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -132,18 +132,6 @@ .long (handler) - . ; \ .popsection -# define _ASM_EXTABLE(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) - -# define _ASM_EXTABLE_UA(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess) - -# define _ASM_EXTABLE_CPY(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy) - -# define _ASM_EXTABLE_FAULT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) - # ifdef CONFIG_KPROBES # define _ASM_NOKPROBE(entry) \ .pushsection "_kprobe_blacklist","aw" ; \ @@ -164,18 +152,6 @@ " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n" \ " .popsection\n" -# define _ASM_EXTABLE(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) - -# define _ASM_EXTABLE_UA(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess) - -# define _ASM_EXTABLE_CPY(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy) - -# define _ASM_EXTABLE_FAULT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) - /* For C file, we already have NOKPROBE_SYMBOL macro */ /* @@ -188,6 +164,18 @@ register unsigned long current_stack_pointer asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) #endif /* __ASSEMBLY__ */ +#define _ASM_EXTABLE(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) + +#define _ASM_EXTABLE_UA(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess) + +#define _ASM_EXTABLE_CPY(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy) + +#define _ASM_EXTABLE_FAULT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) + #endif /* __KERNEL__ */ #endif /* _ASM_X86_ASM_H */ From e42404afc4ca856c48f1e05752541faa3587c472 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:15 +0200 Subject: [PATCH 003/110] x86/mce: Deduplicate exception handling Prepare code for further simplification. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.096452100@linutronix.de --- arch/x86/kernel/cpu/mce/core.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 8cb7816d03b4..428eed98742b 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -373,13 +373,16 @@ static int msr_to_offset(u32 msr) return -1; } -__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) { - pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", - (unsigned int)regs->cx, regs->ip, (void *)regs->ip); + if (wrmsr) { + pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, + regs->ip, (void *)regs->ip); + } else { + pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, regs->ip, (void *)regs->ip); + } show_stack_regs(regs); @@ -387,7 +390,14 @@ __visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, while (true) cpu_relax(); +} +__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) +{ + ex_handler_msr_mce(regs, false); return true; } @@ -432,17 +442,7 @@ __visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, unsigned long error_code, unsigned long fault_addr) { - pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", - (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, - regs->ip, (void *)regs->ip); - - show_stack_regs(regs); - - panic("MCA architectural violation!\n"); - - while (true) - cpu_relax(); - + ex_handler_msr_mce(regs, true); return true; } From 083b32d6f4fa26abaf585721abeee73c92ea5376 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:16 +0200 Subject: [PATCH 004/110] x86/mce: Get rid of stray semicolons and the random number of tabs. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.154428878@linutronix.de --- arch/x86/kernel/cpu/mce/internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 88dcc79cfb07..95099225defc 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -61,7 +61,7 @@ static inline void cmci_disable_bank(int bank) { } static inline void intel_init_cmci(void) { } static inline void intel_init_lmce(void) { } static inline void intel_clear_lmce(void) { } -static inline bool intel_filter_mce(struct mce *m) { return false; }; +static inline bool intel_filter_mce(struct mce *m) { return false; } #endif void mce_timer_kick(unsigned long interval); @@ -183,7 +183,7 @@ extern bool filter_mce(struct mce *m); #ifdef CONFIG_X86_MCE_AMD extern bool amd_filter_mce(struct mce *m); #else -static inline bool amd_filter_mce(struct mce *m) { return false; }; +static inline bool amd_filter_mce(struct mce *m) { return false; } #endif __visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, From 46d28947d9876fc0f8f93d3c69813ef6e9852595 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:18 +0200 Subject: [PATCH 005/110] x86/extable: Rework the exception table mechanics The exception table entries contain the instruction address, the fixup address and the handler address. All addresses are relative. Storing the handler address has a few downsides: 1) Most handlers need to be exported 2) Handlers can be defined everywhere and there is no overview about the handler types 3) MCE needs to check the handler type to decide whether an in kernel #MC can be recovered. The functionality of the handler itself is not in any way special, but for these checks there need to be separate functions which in the worst case have to be exported. Some of these 'recoverable' exception fixups are pretty obscure and just reuse some other handler to spare code. That obfuscates e.g. the #MC safe copy functions. Cleaning that up would require more handlers and exports Rework the exception fixup mechanics by storing a fixup type number instead of the handler address and invoke the proper handler for each fixup type. Also teach the extable sort to leave the type field alone. This makes most handlers static except for special cases like the MCE MSR fixup and the BPF fixup. This allows to add more types for cleaning up the obscure places without adding more handler code and exports. There is a marginal code size reduction for a production config and it removes _eight_ exported symbols. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Acked-by: Alexei Starovoitov Link: https://lkml.kernel.org/r/20210908132525.211958725@linutronix.de --- arch/x86/include/asm/asm.h | 22 ++-- arch/x86/include/asm/extable.h | 44 +++++--- arch/x86/include/asm/extable_fixup_types.h | 19 ++++ arch/x86/include/asm/fpu/internal.h | 4 +- arch/x86/include/asm/msr.h | 4 +- arch/x86/include/asm/segment.h | 2 +- arch/x86/kernel/cpu/mce/core.c | 24 +--- arch/x86/kernel/cpu/mce/internal.h | 10 -- arch/x86/kernel/cpu/mce/severity.c | 21 ++-- arch/x86/mm/extable.c | 123 +++++++++------------ arch/x86/net/bpf_jit_comp.c | 11 +- scripts/sorttable.c | 4 +- 12 files changed, 133 insertions(+), 155 deletions(-) create mode 100644 arch/x86/include/asm/extable_fixup_types.h diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 719955e658a2..6aadb9a620ee 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -122,14 +122,17 @@ #ifdef __KERNEL__ +# include + /* Exception table entry */ #ifdef __ASSEMBLY__ -# define _ASM_EXTABLE_HANDLE(from, to, handler) \ + +# define _ASM_EXTABLE_TYPE(from, to, type) \ .pushsection "__ex_table","a" ; \ .balign 4 ; \ .long (from) - . ; \ .long (to) - . ; \ - .long (handler) - . ; \ + .long type ; \ .popsection # ifdef CONFIG_KPROBES @@ -143,13 +146,13 @@ # endif #else /* ! __ASSEMBLY__ */ -# define _EXPAND_EXTABLE_HANDLE(x) #x -# define _ASM_EXTABLE_HANDLE(from, to, handler) \ + +# define _ASM_EXTABLE_TYPE(from, to, type) \ " .pushsection \"__ex_table\",\"a\"\n" \ " .balign 4\n" \ " .long (" #from ") - .\n" \ " .long (" #to ") - .\n" \ - " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n" \ + " .long " __stringify(type) " \n" \ " .popsection\n" /* For C file, we already have NOKPROBE_SYMBOL macro */ @@ -165,17 +168,16 @@ register unsigned long current_stack_pointer asm(_ASM_SP); #endif /* __ASSEMBLY__ */ #define _ASM_EXTABLE(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) + _ASM_EXTABLE_TYPE(from, to, EX_TYPE_DEFAULT) #define _ASM_EXTABLE_UA(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess) + _ASM_EXTABLE_TYPE(from, to, EX_TYPE_UACCESS) #define _ASM_EXTABLE_CPY(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy) + _ASM_EXTABLE_TYPE(from, to, EX_TYPE_COPY) #define _ASM_EXTABLE_FAULT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) + _ASM_EXTABLE_TYPE(from, to, EX_TYPE_FAULT) #endif /* __KERNEL__ */ - #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h index 1f0cbc52937c..93f400eb728f 100644 --- a/arch/x86/include/asm/extable.h +++ b/arch/x86/include/asm/extable.h @@ -1,12 +1,18 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_EXTABLE_H #define _ASM_X86_EXTABLE_H + +#include + /* - * The exception table consists of triples of addresses relative to the - * exception table entry itself. The first address is of an instruction - * that is allowed to fault, the second is the target at which the program - * should continue. The third is a handler function to deal with the fault - * caused by the instruction in the first field. + * The exception table consists of two addresses relative to the + * exception table entry itself and a type selector field. + * + * The first address is of an instruction that is allowed to fault, the + * second is the target at which the program should continue. + * + * The type entry is used by fixup_exception() to select the handler to + * deal with the fault caused by the instruction in the first field. * * All the routines below use bits of fixup code that are out of line * with the main instruction path. This means when everything is well, @@ -15,7 +21,7 @@ */ struct exception_table_entry { - int insn, fixup, handler; + int insn, fixup, type; }; struct pt_regs; @@ -25,21 +31,27 @@ struct pt_regs; do { \ (a)->fixup = (b)->fixup + (delta); \ (b)->fixup = (tmp).fixup - (delta); \ - (a)->handler = (b)->handler + (delta); \ - (b)->handler = (tmp).handler - (delta); \ + (a)->type = (b)->type; \ + (b)->type = (tmp).type; \ } while (0) -enum handler_type { - EX_HANDLER_NONE, - EX_HANDLER_FAULT, - EX_HANDLER_UACCESS, - EX_HANDLER_OTHER -}; - extern int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr); extern int fixup_bug(struct pt_regs *regs, int trapnr); -extern enum handler_type ex_get_fault_handler_type(unsigned long ip); +extern int ex_get_fixup_type(unsigned long ip); extern void early_fixup_exception(struct pt_regs *regs, int trapnr); +#ifdef CONFIG_X86_MCE +extern void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr); +#else +static inline void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) { } +#endif + +#if defined(CONFIG_BPF_JIT) && defined(CONFIG_X86_64) +bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs); +#else +static inline bool ex_handler_bpf(const struct exception_table_entry *x, + struct pt_regs *regs) { return false; } +#endif + #endif diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h new file mode 100644 index 000000000000..0adc117618e6 --- /dev/null +++ b/arch/x86/include/asm/extable_fixup_types.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_EXTABLE_FIXUP_TYPES_H +#define _ASM_X86_EXTABLE_FIXUP_TYPES_H + +#define EX_TYPE_NONE 0 +#define EX_TYPE_DEFAULT 1 +#define EX_TYPE_FAULT 2 +#define EX_TYPE_UACCESS 3 +#define EX_TYPE_COPY 4 +#define EX_TYPE_CLEAR_FS 5 +#define EX_TYPE_FPU_RESTORE 6 +#define EX_TYPE_WRMSR 7 +#define EX_TYPE_RDMSR 8 +#define EX_TYPE_BPF 9 + +#define EX_TYPE_WRMSR_IN_MCE 10 +#define EX_TYPE_RDMSR_IN_MCE 11 + +#endif diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 5a18694a89b2..ce6fc4f8d1d1 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -126,7 +126,7 @@ extern void save_fpregs_to_fpstate(struct fpu *fpu); #define kernel_insn(insn, output, input...) \ asm volatile("1:" #insn "\n\t" \ "2:\n" \ - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore) \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FPU_RESTORE) \ : output : input) static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx) @@ -253,7 +253,7 @@ static inline void fxsave(struct fxregs_state *fx) XRSTORS, X86_FEATURE_XSAVES) \ "\n" \ "3:\n" \ - _ASM_EXTABLE_HANDLE(661b, 3b, ex_handler_fprestore)\ + _ASM_EXTABLE_TYPE(661b, 3b, EX_TYPE_FPU_RESTORE) \ : \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index a3f87f1015d3..6b52182e178a 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -92,7 +92,7 @@ static __always_inline unsigned long long __rdmsr(unsigned int msr) asm volatile("1: rdmsr\n" "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_unsafe) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR) : EAX_EDX_RET(val, low, high) : "c" (msr)); return EAX_EDX_VAL(val, low, high); @@ -102,7 +102,7 @@ static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high) { asm volatile("1: wrmsr\n" "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) : : "c" (msr), "a"(low), "d" (high) : "memory"); } diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 72044026eb3c..8dd8e8ec9fa5 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -339,7 +339,7 @@ static inline void __loadsegment_fs(unsigned short value) "1: movw %0, %%fs \n" "2: \n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_clear_fs) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_CLEAR_FS) : : "rm" (value) : "memory"); } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 428eed98742b..cd919fce3ca3 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -373,7 +373,7 @@ static int msr_to_offset(u32 msr) return -1; } -static void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) +void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) { if (wrmsr) { pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", @@ -392,15 +392,6 @@ static void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) cpu_relax(); } -__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) -{ - ex_handler_msr_mce(regs, false); - return true; -} - /* MSR access wrappers used for error injection */ static noinstr u64 mce_rdmsrl(u32 msr) { @@ -430,22 +421,13 @@ static noinstr u64 mce_rdmsrl(u32 msr) */ asm volatile("1: rdmsr\n" "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_fault) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR_IN_MCE) : EAX_EDX_RET(val, low, high) : "c" (msr)); return EAX_EDX_VAL(val, low, high); } -__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) -{ - ex_handler_msr_mce(regs, true); - return true; -} - static noinstr void mce_wrmsrl(u32 msr, u64 v) { u32 low, high; @@ -470,7 +452,7 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) /* See comment in mce_rdmsrl() */ asm volatile("1: wrmsr\n" "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_fault) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE) : : "c" (msr), "a"(low), "d" (high) : "memory"); } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 95099225defc..3463f8cedb32 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -186,14 +186,4 @@ extern bool amd_filter_mce(struct mce *m); static inline bool amd_filter_mce(struct mce *m) { return false; } #endif -__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr); - -__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr); - #endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 17e631443116..74fe763bffda 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -265,25 +265,24 @@ static bool is_copy_from_user(struct pt_regs *regs) */ static int error_context(struct mce *m, struct pt_regs *regs) { - enum handler_type t; - if ((m->cs & 3) == 3) return IN_USER; if (!mc_recoverable(m->mcgstatus)) return IN_KERNEL; - t = ex_get_fault_handler_type(m->ip); - if (t == EX_HANDLER_FAULT) { - m->kflags |= MCE_IN_KERNEL_RECOV; - return IN_KERNEL_RECOV; - } - if (t == EX_HANDLER_UACCESS && regs && is_copy_from_user(regs)) { - m->kflags |= MCE_IN_KERNEL_RECOV; + switch (ex_get_fixup_type(m->ip)) { + case EX_TYPE_UACCESS: + case EX_TYPE_COPY: + if (!regs || !is_copy_from_user(regs)) + return IN_KERNEL; m->kflags |= MCE_IN_KERNEL_COPYIN; + fallthrough; + case EX_TYPE_FAULT: + m->kflags |= MCE_IN_KERNEL_RECOV; return IN_KERNEL_RECOV; + default: + return IN_KERNEL; } - - return IN_KERNEL; } static int mce_severity_amd_smca(struct mce *m, enum context err_ctx) diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index d9a1046f3a98..5db46df409b5 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -9,40 +9,25 @@ #include #include -typedef bool (*ex_handler_t)(const struct exception_table_entry *, - struct pt_regs *, int, unsigned long, - unsigned long); - static inline unsigned long ex_fixup_addr(const struct exception_table_entry *x) { return (unsigned long)&x->fixup + x->fixup; } -static inline ex_handler_t -ex_fixup_handler(const struct exception_table_entry *x) -{ - return (ex_handler_t)((unsigned long)&x->handler + x->handler); -} -__visible bool ex_handler_default(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_default(const struct exception_table_entry *fixup, + struct pt_regs *regs) { regs->ip = ex_fixup_addr(fixup); return true; } -EXPORT_SYMBOL(ex_handler_default); -__visible bool ex_handler_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { regs->ax = trapnr; - return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr); + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL_GPL(ex_handler_fault); /* * Handler for when we fail to restore a task's FPU state. We should never get @@ -54,10 +39,8 @@ EXPORT_SYMBOL_GPL(ex_handler_fault); * of vulnerability by restoring from the initial state (essentially, zeroing * out all the FPU registers) if we can't restore from the task's FPU state. */ -__visible bool ex_handler_fprestore(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_fprestore(const struct exception_table_entry *fixup, + struct pt_regs *regs) { regs->ip = ex_fixup_addr(fixup); @@ -67,32 +50,23 @@ __visible bool ex_handler_fprestore(const struct exception_table_entry *fixup, __restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); return true; } -EXPORT_SYMBOL_GPL(ex_handler_fprestore); -__visible bool ex_handler_uaccess(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_uaccess(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?"); - return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr); + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL(ex_handler_uaccess); -__visible bool ex_handler_copy(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_copy(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?"); - return ex_handler_fault(fixup, regs, trapnr, error_code, fault_addr); + return ex_handler_fault(fixup, regs, trapnr); } -EXPORT_SYMBOL(ex_handler_copy); -__visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, + struct pt_regs *regs) { if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", (unsigned int)regs->cx, regs->ip, (void *)regs->ip)) @@ -101,14 +75,11 @@ __visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup /* Pretend that the read succeeded and returned 0. */ regs->ax = 0; regs->dx = 0; - return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr); + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL(ex_handler_rdmsr_unsafe); -__visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup, + struct pt_regs *regs) { if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", (unsigned int)regs->cx, (unsigned int)regs->dx, @@ -116,44 +87,29 @@ __visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup show_stack_regs(regs); /* Pretend that the write succeeded. */ - return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr); + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL(ex_handler_wrmsr_unsafe); -__visible bool ex_handler_clear_fs(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_clear_fs(const struct exception_table_entry *fixup, + struct pt_regs *regs) { if (static_cpu_has(X86_BUG_NULL_SEG)) asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS)); asm volatile ("mov %0, %%fs" : : "rm" (0)); - return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr); + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL(ex_handler_clear_fs); -enum handler_type ex_get_fault_handler_type(unsigned long ip) +int ex_get_fixup_type(unsigned long ip) { - const struct exception_table_entry *e; - ex_handler_t handler; + const struct exception_table_entry *e = search_exception_tables(ip); - e = search_exception_tables(ip); - if (!e) - return EX_HANDLER_NONE; - handler = ex_fixup_handler(e); - if (handler == ex_handler_fault) - return EX_HANDLER_FAULT; - else if (handler == ex_handler_uaccess || handler == ex_handler_copy) - return EX_HANDLER_UACCESS; - else - return EX_HANDLER_OTHER; + return e ? e->type : EX_TYPE_NONE; } int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr) { const struct exception_table_entry *e; - ex_handler_t handler; #ifdef CONFIG_PNPBIOS if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { @@ -173,8 +129,33 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, if (!e) return 0; - handler = ex_fixup_handler(e); - return handler(e, regs, trapnr, error_code, fault_addr); + switch (e->type) { + case EX_TYPE_DEFAULT: + return ex_handler_default(e, regs); + case EX_TYPE_FAULT: + return ex_handler_fault(e, regs, trapnr); + case EX_TYPE_UACCESS: + return ex_handler_uaccess(e, regs, trapnr); + case EX_TYPE_COPY: + return ex_handler_copy(e, regs, trapnr); + case EX_TYPE_CLEAR_FS: + return ex_handler_clear_fs(e, regs); + case EX_TYPE_FPU_RESTORE: + return ex_handler_fprestore(e, regs); + case EX_TYPE_RDMSR: + return ex_handler_rdmsr_unsafe(e, regs); + case EX_TYPE_WRMSR: + return ex_handler_wrmsr_unsafe(e, regs); + case EX_TYPE_BPF: + return ex_handler_bpf(e, regs); + case EX_TYPE_RDMSR_IN_MCE: + ex_handler_msr_mce(regs, false); + break; + case EX_TYPE_WRMSR_IN_MCE: + ex_handler_msr_mce(regs, true); + break; + } + BUG(); } extern unsigned int early_recursion_flag; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 0fe6aacef3db..703dc6eaf4c7 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -827,9 +827,7 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, return 0; } -static bool ex_handler_bpf(const struct exception_table_entry *x, - struct pt_regs *regs, int trapnr, - unsigned long error_code, unsigned long fault_addr) +bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) { u32 reg = x->fixup >> 8; @@ -1313,12 +1311,7 @@ st: if (is_imm8(insn->off)) } ex->insn = delta; - delta = (u8 *)ex_handler_bpf - (u8 *)&ex->handler; - if (!is_simm32(delta)) { - pr_err("extable->handler doesn't fit into 32-bit\n"); - return -EFAULT; - } - ex->handler = delta; + ex->type = EX_TYPE_BPF; if (dst_reg > BPF_REG_9) { pr_err("verifier error\n"); diff --git a/scripts/sorttable.c b/scripts/sorttable.c index f355869c65cd..a9b3324cacf9 100644 --- a/scripts/sorttable.c +++ b/scripts/sorttable.c @@ -236,7 +236,7 @@ static void x86_sort_relative_table(char *extab_image, int image_size) w(r(loc) + i, loc); w(r(loc + 1) + i + 4, loc + 1); - w(r(loc + 2) + i + 8, loc + 2); + /* Don't touch the fixup type */ i += sizeof(uint32_t) * 3; } @@ -249,7 +249,7 @@ static void x86_sort_relative_table(char *extab_image, int image_size) w(r(loc) - i, loc); w(r(loc + 1) - (i + 4), loc + 1); - w(r(loc + 2) - (i + 8), loc + 2); + /* Don't touch the fixup type */ i += sizeof(uint32_t) * 3; } From 2cadf5248b9316d3c8af876e795d61c55476f6e9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:19 +0200 Subject: [PATCH 006/110] x86/extable: Provide EX_TYPE_DEFAULT_MCE_SAFE and EX_TYPE_FAULT_MCE_SAFE Provide exception fixup types which can be used to identify fixups which allow in kernel #MC recovery and make them invoke the existing handlers. These will be used at places where #MC recovery is handled correctly by the caller. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.269689153@linutronix.de --- arch/x86/include/asm/extable_fixup_types.h | 3 +++ arch/x86/kernel/cpu/mce/severity.c | 2 ++ arch/x86/mm/extable.c | 2 ++ 3 files changed, 7 insertions(+) diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h index 0adc117618e6..409524d5d2eb 100644 --- a/arch/x86/include/asm/extable_fixup_types.h +++ b/arch/x86/include/asm/extable_fixup_types.h @@ -16,4 +16,7 @@ #define EX_TYPE_WRMSR_IN_MCE 10 #define EX_TYPE_RDMSR_IN_MCE 11 +#define EX_TYPE_DEFAULT_MCE_SAFE 12 +#define EX_TYPE_FAULT_MCE_SAFE 13 + #endif diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 74fe763bffda..d9b77a74f8d2 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -278,6 +278,8 @@ static int error_context(struct mce *m, struct pt_regs *regs) m->kflags |= MCE_IN_KERNEL_COPYIN; fallthrough; case EX_TYPE_FAULT: + case EX_TYPE_FAULT_MCE_SAFE: + case EX_TYPE_DEFAULT_MCE_SAFE: m->kflags |= MCE_IN_KERNEL_RECOV; return IN_KERNEL_RECOV; default: diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 5db46df409b5..f37e290e6d0a 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -131,8 +131,10 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, switch (e->type) { case EX_TYPE_DEFAULT: + case EX_TYPE_DEFAULT_MCE_SAFE: return ex_handler_default(e, regs); case EX_TYPE_FAULT: + case EX_TYPE_FAULT_MCE_SAFE: return ex_handler_fault(e, regs, trapnr); case EX_TYPE_UACCESS: return ex_handler_uaccess(e, regs, trapnr); From c1c97d175493ab32325df81133611ce8e4e05088 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:21 +0200 Subject: [PATCH 007/110] x86/copy_mc: Use EX_TYPE_DEFAULT_MCE_SAFE for exception fixups Nothing in that code uses the trap number which was stored by the exception fixup which is instantiated via _ASM_EXTABLE_FAULT(). Use _ASM_EXTABLE(... EX_TYPE_DEFAULT_MCE_SAFE) instead which just handles the IP fixup and the type indicates to the #MC handler that the call site can handle the abort caused by #MC correctly. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.328706042@linutronix.de --- arch/x86/lib/copy_mc_64.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/lib/copy_mc_64.S b/arch/x86/lib/copy_mc_64.S index e5f77e293034..7334055157ba 100644 --- a/arch/x86/lib/copy_mc_64.S +++ b/arch/x86/lib/copy_mc_64.S @@ -107,9 +107,9 @@ SYM_FUNC_END(copy_mc_fragile) .previous - _ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes) - _ASM_EXTABLE_FAULT(.L_read_words, .E_read_words) - _ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes) + _ASM_EXTABLE_TYPE(.L_read_leading_bytes, .E_leading_bytes, EX_TYPE_DEFAULT_MCE_SAFE) + _ASM_EXTABLE_TYPE(.L_read_words, .E_read_words, EX_TYPE_DEFAULT_MCE_SAFE) + _ASM_EXTABLE_TYPE(.L_read_trailing_bytes, .E_trailing_bytes, EX_TYPE_DEFAULT_MCE_SAFE) _ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes) _ASM_EXTABLE(.L_write_words, .E_write_words) _ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes) @@ -149,5 +149,5 @@ SYM_FUNC_END(copy_mc_enhanced_fast_string) .previous - _ASM_EXTABLE_FAULT(.L_copy, .E_copy) + _ASM_EXTABLE_TYPE(.L_copy, .E_copy, EX_TYPE_DEFAULT_MCE_SAFE) #endif /* !CONFIG_UML */ From c6304556f3ae98c943bbb4042a30205c98e4f921 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:23 +0200 Subject: [PATCH 008/110] x86/fpu: Use EX_TYPE_FAULT_MCE_SAFE for exception fixups The macros used for restoring FPU state from a user space buffer can handle all exceptions including #MC. They need to return the trap number in the error case as the code which invokes them needs to distinguish the cause of the failure. It aborts the operation for anything except #PF. Use the new EX_TYPE_FAULT_MCE_SAFE exception table fixup type to document the nature of the fixup. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.387464538@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index ce6fc4f8d1d1..cb1ca602e848 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -102,7 +102,7 @@ extern void save_fpregs_to_fpstate(struct fpu *fpu); "3: negl %%eax\n" \ " jmp 2b\n" \ ".previous\n" \ - _ASM_EXTABLE_FAULT(1b, 3b) \ + _ASM_EXTABLE_TYPE(1b, 3b, EX_TYPE_FAULT_MCE_SAFE) \ : [err] "=a" (err), output \ : "0"(0), input); \ err; \ @@ -209,7 +209,7 @@ static inline void fxsave(struct fxregs_state *fx) "3: negl %%eax\n\t" \ "jmp 2b\n\t" \ ".popsection\n\t" \ - _ASM_EXTABLE_FAULT(1b, 3b) \ + _ASM_EXTABLE_TYPE(1b, 3b, EX_TYPE_FAULT_MCE_SAFE) \ : [err] "=a" (err) \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") From 0c2e62ba04cd0b7194b380bae4fc35c45bb2e46e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:24 +0200 Subject: [PATCH 009/110] x86/extable: Remove EX_TYPE_FAULT from MCE safe fixups Now that the MC safe copy and FPU have been converted to use the MCE safe fixup types remove EX_TYPE_FAULT from the list of types which MCE considers to be safe to be recovered in kernel. This removes the SGX exception handling of ENCLS from the #MC safe handling, but according to the SGX wizards the current SGX implementations cannot survive #MC on ENCLS: https://lore.kernel.org/r/YS+upEmTfpZub3s9@google.com The code relies on the trap number being stored if ENCLS raised an exception. That's still working, but it does no longer trick the MCE code into assuming that #MC is handled correctly for ENCLS. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.445255957@linutronix.de --- arch/x86/kernel/cpu/mce/severity.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index d9b77a74f8d2..f60bbaff9f65 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -277,7 +277,6 @@ static int error_context(struct mce *m, struct pt_regs *regs) return IN_KERNEL; m->kflags |= MCE_IN_KERNEL_COPYIN; fallthrough; - case EX_TYPE_FAULT: case EX_TYPE_FAULT_MCE_SAFE: case EX_TYPE_DEFAULT_MCE_SAFE: m->kflags |= MCE_IN_KERNEL_RECOV; From 4339d0c63c2d5bea1fe6de4091ee2fe9eeea09a7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:26 +0200 Subject: [PATCH 010/110] x86/fpu/signal: Clarify exception handling in restore_fpregs_from_user() FPU restore from a signal frame can trigger various exceptions. The exceptions are caught with an exception table entry. The handler of this entry stores the trap number in EAX. The FPU specific fixup negates that trap number to convert it into an negative error code. Any other exception than #PF is fatal and recovery is not possible. This relies on the fact that the #PF exception number is the same as EFAULT, but that's not really obvious. Remove the negation from the exception fixup as it really has no value and check for X86_TRAP_PF at the call site. There is still confusion due to the return code conversion for the error case which will be cleaned up separately. Suggested-by: Al Viro Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.506192488@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 21 ++++++++------------- arch/x86/kernel/fpu/signal.c | 5 +++-- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index cb1ca602e848..4cfd40dc3cb5 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -88,7 +88,10 @@ static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif extern void save_fpregs_to_fpstate(struct fpu *fpu); -/* Returns 0 or the negated trap number, which results in -EFAULT for #PF */ +/* + * Returns 0 on success or the trap number when the operation raises an + * exception. + */ #define user_insn(insn, output, input...) \ ({ \ int err; \ @@ -98,11 +101,7 @@ extern void save_fpregs_to_fpstate(struct fpu *fpu); asm volatile(ASM_STAC "\n" \ "1: " #insn "\n" \ "2: " ASM_CLAC "\n" \ - ".section .fixup,\"ax\"\n" \ - "3: negl %%eax\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE_TYPE(1b, 3b, EX_TYPE_FAULT_MCE_SAFE) \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ : [err] "=a" (err), output \ : "0"(0), input); \ err; \ @@ -198,18 +197,14 @@ static inline void fxsave(struct fxregs_state *fx) #define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" /* - * After this @err contains 0 on success or the negated trap number when - * the operation raises an exception. For faults this results in -EFAULT. + * After this @err contains 0 on success or the trap number when the + * operation raises an exception. */ #define XSTATE_OP(op, st, lmask, hmask, err) \ asm volatile("1:" op "\n\t" \ "xor %[err], %[err]\n" \ "2:\n\t" \ - ".pushsection .fixup,\"ax\"\n\t" \ - "3: negl %%eax\n\t" \ - "jmp 2b\n\t" \ - ".popsection\n\t" \ - _ASM_EXTABLE_TYPE(1b, 3b, EX_TYPE_FAULT_MCE_SAFE) \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ : [err] "=a" (err) \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 445c57c9c539..9bfffdb64c0d 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -13,6 +13,7 @@ #include #include +#include #include static struct _fpx_sw_bytes fx_sw_reserved __ro_after_init; @@ -275,7 +276,7 @@ retry: fpregs_unlock(); /* Try to handle #PF, but anything else is fatal. */ - if (ret != -EFAULT) + if (ret != X86_TRAP_PF) return -EINVAL; ret = fault_in_pages_readable(buf, size); @@ -405,7 +406,7 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, u64 mask = user_xfeatures | xfeatures_mask_supervisor(); fpu->state.xsave.header.xfeatures &= mask; - ret = os_xrstor_safe(&fpu->state.xsave, xfeatures_mask_all); + ret = os_xrstor_safe(&fpu->state.xsave, xfeatures_mask_all) ? -EINVAL : 0; } else { ret = fxrstor_safe(&fpu->state.fxsave); } From 4164a482a5d92c29eaf53d01755103f6bbce38f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:29 +0200 Subject: [PATCH 011/110] x86/fpu/signal: Move header zeroing out of xsave_to_user_sigframe() There is no reason to have the header zeroing in the pagefault disabled region. Do it upfront once. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.621674721@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 17 ++++++----------- arch/x86/kernel/fpu/signal.c | 12 ++++++++++++ 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 4cfd40dc3cb5..c856ca481546 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -318,9 +318,12 @@ static inline void os_xrstor(struct xregs_state *xstate, u64 mask) * We don't use modified optimization because xrstor/xrstors might track * a different application. * - * We don't use compacted format xsave area for - * backward compatibility for old applications which don't understand - * compacted format of xsave area. + * We don't use compacted format xsave area for backward compatibility for + * old applications which don't understand the compacted format of the + * xsave area. + * + * The caller has to zero buf::header before calling this because XSAVE* + * does not touch the reserved fields in the header. */ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) { @@ -334,14 +337,6 @@ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) u32 hmask = mask >> 32; int err; - /* - * Clear the xsave header first, so that reserved fields are - * initialized to zero. - */ - err = __clear_user(&buf->header, sizeof(buf->header)); - if (unlikely(err)) - return -EFAULT; - stac(); XSTATE_OP(XSAVE, buf, lmask, hmask, err); clac(); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 9bfffdb64c0d..5ca3ce94cf6b 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -189,6 +189,18 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) if (!access_ok(buf, size)) return -EACCES; + + if (use_xsave()) { + struct xregs_state __user *xbuf = buf_fx; + + /* + * Clear the xsave header first, so that reserved fields are + * initialized to zero. + */ + ret = __clear_user(&xbuf->header, sizeof(xbuf->header)); + if (unlikely(ret)) + return ret; + } retry: /* * Load the FPU registers if they are not valid for the current task. From fcfb7163329ce832aafef31f26345ef5e8642a17 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:30 +0200 Subject: [PATCH 012/110] x86/fpu/signal: Move xstate clearing out of copy_fpregs_to_sigframe() When the direct saving of the FPU registers to the user space sigframe fails, copy_fpregs_to_sigframe() attempts to clear the user buffer. The most likely reason for such a fail is a page fault. As copy_fpregs_to_sigframe() is invoked with pagefaults disabled the chance that __clear_user() succeeds is minuscule. Move the clearing out into the caller which replaces the fault_in_pages_writeable() in that error handling path. The return value confusion will be cleaned up separately. Suggested-by: Al Viro Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.679356300@linutronix.de --- arch/x86/kernel/fpu/signal.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 5ca3ce94cf6b..c4abbd97587b 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -136,18 +136,12 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) { - int err; - if (use_xsave()) - err = xsave_to_user_sigframe(buf); - else if (use_fxsr()) - err = fxsave_to_user_sigframe((struct fxregs_state __user *) buf); + return xsave_to_user_sigframe(buf); + if (use_fxsr()) + return fxsave_to_user_sigframe((struct fxregs_state __user *) buf); else - err = fnsave_to_user_sigframe((struct fregs_state __user *) buf); - - if (unlikely(err) && __clear_user(buf, fpu_user_xstate_size)) - err = -EFAULT; - return err; + return fnsave_to_user_sigframe((struct fregs_state __user *) buf); } /* @@ -218,9 +212,9 @@ retry: fpregs_unlock(); if (ret) { - if (!fault_in_pages_writeable(buf_fx, fpu_user_xstate_size)) + if (!__clear_user(buf_fx, fpu_user_xstate_size)) goto retry; - return -EFAULT; + return -1; } /* Save the fsave header for the 32-bit frames. */ From 052adee668284b67105375c0a524f16a423f1424 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:32 +0200 Subject: [PATCH 013/110] x86/fpu/signal: Change return type of copy_fpstate_to_sigframe() to boolean None of the call sites cares about the actual return code. Change the return type to boolean and return 'true' on success. Suggested-by: Al Viro Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.736773588@linutronix.de --- arch/x86/ia32/ia32_signal.c | 4 ++-- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/kernel/fpu/signal.c | 20 ++++++++++---------- arch/x86/kernel/signal.c | 4 +--- 4 files changed, 14 insertions(+), 16 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 5e3d9b7fd5fb..023198edf863 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -220,8 +220,8 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size); *fpstate = (struct _fpstate_32 __user *) sp; - if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, - math_size) < 0) + if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, + math_size)) return (void __user *) -1L; sp -= frame_size; diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index c856ca481546..74aa53eeedf5 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -386,7 +386,7 @@ static inline void restore_fpregs_from_fpstate(union fpregs_state *fpstate) __restore_fpregs_from_fpstate(fpstate, xfeatures_mask_fpstate()); } -extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); +extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); /* * FPU context switch related helper methods: diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index c4abbd97587b..7ce396dcc942 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -165,7 +165,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * For [f]xsave state, update the SW reserved fields in the [f]xsave frame * indicating the absence/presence of the extended state to the user. */ -int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) +bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); @@ -176,13 +176,14 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) if (!static_cpu_has(X86_FEATURE_FPU)) { struct user_i387_ia32_struct fp; + fpregs_soft_get(current, NULL, (struct membuf){.p = &fp, .left = sizeof(fp)}); - return copy_to_user(buf, &fp, sizeof(fp)) ? -EFAULT : 0; + return !copy_to_user(buf, &fp, sizeof(fp)); } if (!access_ok(buf, size)) - return -EACCES; + return false; if (use_xsave()) { struct xregs_state __user *xbuf = buf_fx; @@ -191,9 +192,8 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) * Clear the xsave header first, so that reserved fields are * initialized to zero. */ - ret = __clear_user(&xbuf->header, sizeof(xbuf->header)); - if (unlikely(ret)) - return ret; + if (__clear_user(&xbuf->header, sizeof(xbuf->header))) + return false; } retry: /* @@ -214,17 +214,17 @@ retry: if (ret) { if (!__clear_user(buf_fx, fpu_user_xstate_size)) goto retry; - return -1; + return false; } /* Save the fsave header for the 32-bit frames. */ if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) - return -1; + return false; if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) - return -1; + return false; - return 0; + return true; } static int __restore_fpregs_from_user(void __user *buf, u64 xrestore, diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index f4d21e470083..5f623a1c8075 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -244,7 +244,6 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, unsigned long math_size = 0; unsigned long sp = regs->sp; unsigned long buf_fx = 0; - int ret; /* redzone */ if (IS_ENABLED(CONFIG_X86_64)) @@ -292,8 +291,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, } /* save i387 and extended state */ - ret = copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size); - if (ret < 0) + if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size)) return (void __user *)-1L; return (void __user *)sp; From 2af07f3a6e9fb81331421ca24b26a96180d792dd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:34 +0200 Subject: [PATCH 014/110] x86/fpu/signal: Change return type of copy_fpregs_to_sigframe() helpers to boolean Now that copy_fpregs_to_sigframe() returns boolean the individual return codes in the related helper functions do not make sense anymore. Change them to return boolean success/fail. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.794334915@linutronix.de --- arch/x86/kernel/fpu/signal.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 7ce396dcc942..1d10fe9b5b6b 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -65,7 +65,7 @@ setfx: /* * Signal frame handlers. */ -static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) +static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf) { if (use_fxsr()) { struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; @@ -82,18 +82,19 @@ static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) if (__copy_to_user(buf, &env, sizeof(env)) || __put_user(xsave->i387.swd, &fp->status) || __put_user(X86_FXSR_MAGIC, &fp->magic)) - return -1; + return false; } else { struct fregs_state __user *fp = buf; u32 swd; + if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) - return -1; + return false; } - return 0; + return true; } -static inline int save_xstate_epilog(void __user *buf, int ia32_frame) +static inline bool save_xstate_epilog(void __user *buf, int ia32_frame) { struct xregs_state __user *x = buf; struct _fpx_sw_bytes *sw_bytes; @@ -131,7 +132,7 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures); - return err; + return !err; } static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) @@ -218,10 +219,10 @@ retry: } /* Save the fsave header for the 32-bit frames. */ - if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) + if ((ia32_fxstate || !use_fxsr()) && !save_fsave_header(tsk, buf)) return false; - if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) + if (use_fxsr() && !save_xstate_epilog(buf_fx, ia32_fxstate)) return false; return true; From ee4ecdfbd28954086a09740dc931c10c93e39370 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:35 +0200 Subject: [PATCH 015/110] x86/signal: Change return type of restore_sigcontext() to boolean None of the call sites cares about the return code. All they are interested in is success or fail. Suggested-by: Al Viro Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.851280949@linutronix.de --- arch/x86/ia32/ia32_signal.c | 12 ++++++------ arch/x86/kernel/signal.c | 18 +++++++++--------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 023198edf863..0d6789b6e5ca 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -57,8 +57,8 @@ static inline void reload_segments(struct sigcontext_32 *sc) /* * Do a signal return; undo the signal stack. */ -static int ia32_restore_sigcontext(struct pt_regs *regs, - struct sigcontext_32 __user *usc) +static bool ia32_restore_sigcontext(struct pt_regs *regs, + struct sigcontext_32 __user *usc) { struct sigcontext_32 sc; @@ -66,7 +66,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, current->restart_block.fn = do_no_restart_syscall; if (unlikely(copy_from_user(&sc, usc, sizeof(sc)))) - return -EFAULT; + return false; /* Get only the ia32 registers. */ regs->bx = sc.bx; @@ -94,7 +94,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, * normal case. */ reload_segments(&sc); - return fpu__restore_sig(compat_ptr(sc.fpstate), 1); + return !fpu__restore_sig(compat_ptr(sc.fpstate), 1); } COMPAT_SYSCALL_DEFINE0(sigreturn) @@ -111,7 +111,7 @@ COMPAT_SYSCALL_DEFINE0(sigreturn) set_current_blocked(&set); - if (ia32_restore_sigcontext(regs, &frame->sc)) + if (!ia32_restore_sigcontext(regs, &frame->sc)) goto badframe; return regs->ax; @@ -135,7 +135,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn) set_current_blocked(&set); - if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext)) + if (!ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext)) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 5f623a1c8075..140b7b2dbafe 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -79,9 +79,9 @@ static void force_valid_ss(struct pt_regs *regs) # define CONTEXT_COPY_SIZE sizeof(struct sigcontext) #endif -static int restore_sigcontext(struct pt_regs *regs, - struct sigcontext __user *usc, - unsigned long uc_flags) +static bool restore_sigcontext(struct pt_regs *regs, + struct sigcontext __user *usc, + unsigned long uc_flags) { struct sigcontext sc; @@ -89,7 +89,7 @@ static int restore_sigcontext(struct pt_regs *regs, current->restart_block.fn = do_no_restart_syscall; if (copy_from_user(&sc, usc, CONTEXT_COPY_SIZE)) - return -EFAULT; + return false; #ifdef CONFIG_X86_32 set_user_gs(regs, sc.gs); @@ -136,8 +136,8 @@ static int restore_sigcontext(struct pt_regs *regs, force_valid_ss(regs); #endif - return fpu__restore_sig((void __user *)sc.fpstate, - IS_ENABLED(CONFIG_X86_32)); + return !fpu__restore_sig((void __user *)sc.fpstate, + IS_ENABLED(CONFIG_X86_32)); } static __always_inline int @@ -641,7 +641,7 @@ SYSCALL_DEFINE0(sigreturn) * x86_32 has no uc_flags bits relevant to restore_sigcontext. * Save a few cycles by skipping the __get_user. */ - if (restore_sigcontext(regs, &frame->sc, 0)) + if (!restore_sigcontext(regs, &frame->sc, 0)) goto badframe; return regs->ax; @@ -669,7 +669,7 @@ SYSCALL_DEFINE0(rt_sigreturn) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) @@ -927,7 +927,7 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) From f3305be5feecae62adfa5a6a1441a76493fe7412 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:37 +0200 Subject: [PATCH 016/110] x86/fpu/signal: Change return type of fpu__restore_sig() to boolean None of the call sites cares about the error code. All they need to know is whether the function succeeded or not. Suggested-by: Al Viro Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.909065931@linutronix.de --- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/kernel/fpu/signal.c | 22 ++++++++++------------ arch/x86/kernel/signal.c | 4 ++-- 4 files changed, 14 insertions(+), 16 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 0d6789b6e5ca..828ab0a9239b 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -94,7 +94,7 @@ static bool ia32_restore_sigcontext(struct pt_regs *regs, * normal case. */ reload_segments(&sc); - return !fpu__restore_sig(compat_ptr(sc.fpstate), 1); + return fpu__restore_sig(compat_ptr(sc.fpstate), 1); } COMPAT_SYSCALL_DEFINE0(sigreturn) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 74aa53eeedf5..89960e479f87 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -26,7 +26,7 @@ /* * High level FPU state handling functions: */ -extern int fpu__restore_sig(void __user *buf, int ia32_frame); +extern bool fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__drop(struct fpu *fpu); extern void fpu__clear_user_states(struct fpu *fpu); extern int fpu__exception_code(struct fpu *fpu, int trap_nr); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 1d10fe9b5b6b..d418d28819b9 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -433,17 +433,17 @@ static inline int xstate_sigframe_size(void) /* * Restore FPU state from a sigframe: */ -int fpu__restore_sig(void __user *buf, int ia32_frame) +bool fpu__restore_sig(void __user *buf, int ia32_frame) { unsigned int size = xstate_sigframe_size(); struct fpu *fpu = ¤t->thread.fpu; void __user *buf_fx = buf; bool ia32_fxstate = false; - int ret; + bool success = false; if (unlikely(!buf)) { fpu__clear_user_states(fpu); - return 0; + return true; } ia32_frame &= (IS_ENABLED(CONFIG_X86_32) || @@ -459,23 +459,21 @@ int fpu__restore_sig(void __user *buf, int ia32_frame) ia32_fxstate = true; } - if (!access_ok(buf, size)) { - ret = -EACCES; + if (!access_ok(buf, size)) goto out; - } if (!IS_ENABLED(CONFIG_X86_64) && !cpu_feature_enabled(X86_FEATURE_FPU)) { - ret = fpregs_soft_set(current, NULL, 0, - sizeof(struct user_i387_ia32_struct), - NULL, buf); + success = !fpregs_soft_set(current, NULL, 0, + sizeof(struct user_i387_ia32_struct), + NULL, buf); } else { - ret = __fpu_restore_sig(buf, buf_fx, ia32_fxstate); + success = !__fpu_restore_sig(buf, buf_fx, ia32_fxstate); } out: - if (unlikely(ret)) + if (unlikely(!success)) fpu__clear_user_states(fpu); - return ret; + return success; } unsigned long diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 140b7b2dbafe..02ee68e68184 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -136,8 +136,8 @@ static bool restore_sigcontext(struct pt_regs *regs, force_valid_ss(regs); #endif - return !fpu__restore_sig((void __user *)sc.fpstate, - IS_ENABLED(CONFIG_X86_32)); + return fpu__restore_sig((void __user *)sc.fpstate, + IS_ENABLED(CONFIG_X86_32)); } static __always_inline int From 1193f408cd5140f2cfd38c7e60a2d39d39cd485f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:38 +0200 Subject: [PATCH 017/110] x86/fpu/signal: Change return type of __fpu_restore_sig() to boolean Now that fpu__restore_sig() returns a boolean get rid of the individual error codes in __fpu_restore_sig() as well. Suggested-by: Al Viro Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132525.966197097@linutronix.de --- arch/x86/kernel/fpu/signal.c | 41 ++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index d418d28819b9..912d770363e6 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -309,8 +309,8 @@ retry: return 0; } -static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, - bool ia32_fxstate) +static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, + bool ia32_fxstate) { int state_size = fpu_kernel_xstate_size; struct task_struct *tsk = current; @@ -318,14 +318,14 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, struct user_i387_ia32_struct env; u64 user_xfeatures = 0; bool fx_only = false; - int ret; + bool success; + if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; - ret = check_xstate_in_sigframe(buf_fx, &fx_sw_user); - if (unlikely(ret)) - return ret; + if (check_xstate_in_sigframe(buf_fx, &fx_sw_user)) + return false; fx_only = !fx_sw_user.magic1; state_size = fx_sw_user.xstate_size; @@ -341,8 +341,8 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, * faults. If it does, fall back to the slow path below, going * through the kernel buffer with the enabled pagefault handler. */ - return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only, - state_size); + return !restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only, + state_size); } /* @@ -350,9 +350,8 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, * to be ignored for histerical raisins. The legacy state is folded * in once the larger state has been copied. */ - ret = __copy_from_user(&env, buf, sizeof(env)); - if (ret) - return ret; + if (__copy_from_user(&env, buf, sizeof(env))) + return false; /* * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is @@ -379,17 +378,16 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, fpregs_unlock(); if (use_xsave() && !fx_only) { - ret = copy_sigframe_from_user_to_xstate(&fpu->state.xsave, buf_fx); - if (ret) - return ret; + if (copy_sigframe_from_user_to_xstate(&fpu->state.xsave, buf_fx)) + return false; } else { if (__copy_from_user(&fpu->state.fxsave, buf_fx, sizeof(fpu->state.fxsave))) - return -EFAULT; + return false; /* Reject invalid MXCSR values. */ if (fpu->state.fxsave.mxcsr & ~mxcsr_feature_mask) - return -EINVAL; + return false; /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */ if (use_xsave()) @@ -413,17 +411,18 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, u64 mask = user_xfeatures | xfeatures_mask_supervisor(); fpu->state.xsave.header.xfeatures &= mask; - ret = os_xrstor_safe(&fpu->state.xsave, xfeatures_mask_all) ? -EINVAL : 0; + success = !os_xrstor_safe(&fpu->state.xsave, xfeatures_mask_all); } else { - ret = fxrstor_safe(&fpu->state.fxsave); + success = !fxrstor_safe(&fpu->state.fxsave); } - if (likely(!ret)) + if (likely(success)) fpregs_mark_activate(); fpregs_unlock(); - return ret; + return success; } + static inline int xstate_sigframe_size(void) { return use_xsave() ? fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE : @@ -467,7 +466,7 @@ bool fpu__restore_sig(void __user *buf, int ia32_frame) sizeof(struct user_i387_ia32_struct), NULL, buf); } else { - success = !__fpu_restore_sig(buf, buf_fx, ia32_fxstate); + success = __fpu_restore_sig(buf, buf_fx, ia32_fxstate); } out: From be0040144152ed834c369a7830487e5ee4f27080 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:40 +0200 Subject: [PATCH 018/110] x86/fpu/signal: Change return code of check_xstate_in_sigframe() to boolean __fpu_sig_restore() only needs success/fail information and no detailed error code. Suggested-by: Al Viro Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132526.024024598@linutronix.de --- arch/x86/kernel/fpu/signal.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 912d770363e6..2bd4d51f985e 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -23,8 +23,8 @@ static struct _fpx_sw_bytes fx_sw_reserved_ia32 __ro_after_init; * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. */ -static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, - struct _fpx_sw_bytes *fx_sw) +static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, + struct _fpx_sw_bytes *fx_sw) { int min_xstate_size = sizeof(struct fxregs_state) + sizeof(struct xstate_header); @@ -32,7 +32,7 @@ static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, unsigned int magic2; if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw))) - return -EFAULT; + return false; /* Check for the first magic field and other error scenarios. */ if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || @@ -48,10 +48,10 @@ static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, * in the memory layout. */ if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))) - return -EFAULT; + return false; if (likely(magic2 == FP_XSTATE_MAGIC2)) - return 0; + return true; setfx: trace_x86_fpu_xstate_check_failed(¤t->thread.fpu); @@ -59,7 +59,7 @@ setfx: fx_sw->magic1 = 0; fx_sw->xstate_size = sizeof(struct fxregs_state); fx_sw->xfeatures = XFEATURE_MASK_FPSSE; - return 0; + return true; } /* @@ -324,7 +324,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; - if (check_xstate_in_sigframe(buf_fx, &fx_sw_user)) + if (!check_xstate_in_sigframe(buf_fx, &fx_sw_user)) return false; fx_only = !fx_sw_user.magic1; From a2a8fd9a3efd8d22ee14a441e9e78cf5c998e69a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Sep 2021 15:29:41 +0200 Subject: [PATCH 019/110] x86/fpu/signal: Change return code of restore_fpregs_from_user() to boolean __fpu_sig_restore() only needs information about success or fail and no real error code. This cleans up the confusing conversion of the trap number, which is returned by the *RSTOR() exception fixups, to an error code. Suggested-by: Al Viro Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210908132526.084109938@linutronix.de --- arch/x86/kernel/fpu/signal.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 2bd4d51f985e..68f03da2012e 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -254,8 +254,8 @@ static int __restore_fpregs_from_user(void __user *buf, u64 xrestore, * Attempt to restore the FPU registers directly from user memory. * Pagefaults are handled and any errors returned are fatal. */ -static int restore_fpregs_from_user(void __user *buf, u64 xrestore, - bool fx_only, unsigned int size) +static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, + bool fx_only, unsigned int size) { struct fpu *fpu = ¤t->thread.fpu; int ret; @@ -284,12 +284,11 @@ retry: /* Try to handle #PF, but anything else is fatal. */ if (ret != X86_TRAP_PF) - return -EINVAL; + return false; - ret = fault_in_pages_readable(buf, size); - if (!ret) + if (!fault_in_pages_readable(buf, size)) goto retry; - return ret; + return false; } /* @@ -306,7 +305,7 @@ retry: fpregs_mark_activate(); fpregs_unlock(); - return 0; + return true; } static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, @@ -341,8 +340,8 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, * faults. If it does, fall back to the slow path below, going * through the kernel buffer with the enabled pagefault handler. */ - return !restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only, - state_size); + return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only, + state_size); } /* From 724fc0248d450224b19ef5b5ee41e392348f6704 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Wed, 22 Sep 2021 22:09:01 +0200 Subject: [PATCH 020/110] x86/fpu/signal: Fix missed conversion to correct boolean retval in save_xstate_epilog() Fix the missing return code polarity in save_xstate_epilog(). [ bp: Massage, use the right commit in the Fixes: tag ] Fixes: 2af07f3a6e9f ("x86/fpu/signal: Change return type of copy_fpregs_to_sigframe() helpers to boolean") Reported-by: Remi Duraffort Signed-off-by: Anders Roxell Signed-off-by: Borislav Petkov Tested-by: Nick Desaulniers Link: https://github.com/ClangBuiltLinux/linux/issues/1461 Link: https://lkml.kernel.org/r/20210922200901.1823741-1-anders.roxell@linaro.org --- arch/x86/kernel/fpu/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 68f03da2012e..39c7bae97daf 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -106,7 +106,7 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame) err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); if (!use_xsave()) - return err; + return !err; err |= __put_user(FP_XSTATE_MAGIC2, (__u32 __user *)(buf + fpu_user_xstate_size)); From 9568bfb4f04bd9a280c592879ccd7a26a77c1390 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:15:54 +0200 Subject: [PATCH 021/110] x86/fpu: Remove pointless argument from switch_fpu_finish() Unused since the FPU switching rework. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011538.433135710@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/kernel/process_32.c | 3 +-- arch/x86/kernel/process_64.c | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 89960e479f87..1503750534f7 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -521,7 +521,7 @@ static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) * Delay loading of the complete FPU state until the return to userland. * PKRU is handled separately. */ -static inline void switch_fpu_finish(struct fpu *new_fpu) +static inline void switch_fpu_finish(void) { if (cpu_feature_enabled(X86_FEATURE_FPU)) set_thread_flag(TIF_NEED_FPU_LOAD); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4f2f54e1281c..d008e222a302 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -160,7 +160,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; struct fpu *prev_fpu = &prev->fpu; - struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ @@ -213,7 +212,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); - switch_fpu_finish(next_fpu); + switch_fpu_finish(); /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ec0d836a13b1..39f12ef1c85c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -559,7 +559,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct thread_struct *prev = &prev_p->thread; struct thread_struct *next = &next_p->thread; struct fpu *prev_fpu = &prev->fpu; - struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && @@ -620,7 +619,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); - switch_fpu_finish(next_fpu); + switch_fpu_finish(); /* Reload sp0. */ update_task_stack(next_p); From d2d926482cdfbd5517826eca4e39dcd8757f04d3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:15:56 +0200 Subject: [PATCH 022/110] x86/fpu: Update stale comments copy_fpstate_to_sigframe() does not have a slow path anymore. Neither does the !ia32 restore in __fpu_restore_sig(). Update the comments accordingly. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011538.493570236@linutronix.de --- arch/x86/kernel/fpu/signal.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 51c4915a35f0..e257805d3d3f 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -155,10 +155,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * buf == buf_fx for 64-bit frames and 32-bit fsave frame. * buf != buf_fx for 32-bit frames with fxstate. * - * Try to save it directly to the user frame with disabled page fault handler. - * If this fails then do the slow path where the FPU state is first saved to - * task's fpu->state and then copy it to the user frame pointed to by the - * aligned pointer 'buf_fx'. + * Save it directly to the user frame with disabled page fault handler. If + * that faults, try to clear the frame which handles the page fault. * * If this is a 32-bit frame with fxstate, put a fsave header before * the aligned state at 'buf_fx'. @@ -334,12 +332,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, } if (likely(!ia32_fxstate)) { - /* - * Attempt to restore the FPU registers directly from user - * memory. For that to succeed, the user access cannot cause page - * faults. If it does, fall back to the slow path below, going - * through the kernel buffer with the enabled pagefault handler. - */ + /* Restore the FPU registers directly from user memory. */ return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only, state_size); } From b50854eca0e014c2d3738073b387ab8ec85118ab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:15:57 +0200 Subject: [PATCH 023/110] x86/pkru: Remove useless include PKRU code does not need anything from FPU headers. Include cpufeature.h instead and fixup the resulting fallout in perf. This is a preparation for FPU changes in order to prevent recursive include hell. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011538.551522694@linutronix.de --- arch/x86/events/perf_event.h | 1 + arch/x86/include/asm/pkru.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index e3ac05c97b5e..134c08df7340 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -14,6 +14,7 @@ #include +#include #include #include diff --git a/arch/x86/include/asm/pkru.h b/arch/x86/include/asm/pkru.h index ccc539faa5bb..4cd49afa0ca4 100644 --- a/arch/x86/include/asm/pkru.h +++ b/arch/x86/include/asm/pkru.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_PKRU_H #define _ASM_X86_PKRU_H -#include +#include #define PKRU_AD_BIT 0x1 #define PKRU_WD_BIT 0x2 From f5daf836f292f795f9cf8f36e036bf47adcbc3a3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:15:59 +0200 Subject: [PATCH 024/110] x86/fpu: Restrict xsaves()/xrstors() to independent states These interfaces are really only valid for features which are independently managed and not part of the task context state for various reasons. Tighten the checks and adjust the misleading comments. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011538.608492174@linutronix.de --- arch/x86/kernel/fpu/xstate.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c8def1b7f8fb..5a76df965337 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1175,20 +1175,14 @@ int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, return copy_uabi_to_xstate(xsave, NULL, ubuf); } -static bool validate_xsaves_xrstors(u64 mask) +static bool validate_independent_components(u64 mask) { u64 xchk; if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES))) return false; - /* - * Validate that this is either a task->fpstate related component - * subset or an independent one. - */ - if (mask & xfeatures_mask_independent()) - xchk = ~xfeatures_mask_independent(); - else - xchk = ~xfeatures_mask_all; + + xchk = ~xfeatures_mask_independent(); if (WARN_ON_ONCE(!mask || mask & xchk)) return false; @@ -1206,14 +1200,13 @@ static bool validate_xsaves_xrstors(u64 mask) * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer * can #GP. * - * The feature mask must either be a subset of the independent features or - * a subset of the task->fpstate related features. + * The feature mask must be a subset of the independent features. */ void xsaves(struct xregs_state *xstate, u64 mask) { int err; - if (!validate_xsaves_xrstors(mask)) + if (!validate_independent_components(mask)) return; XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err); @@ -1231,14 +1224,13 @@ void xsaves(struct xregs_state *xstate, u64 mask) * Proper usage is to restore the state which was saved with * xsaves() into @xstate. * - * The feature mask must either be a subset of the independent features or - * a subset of the task->fpstate related features. + * The feature mask must be a subset of the independent features. */ void xrstors(struct xregs_state *xstate, u64 mask) { int err; - if (!validate_xsaves_xrstors(mask)) + if (!validate_independent_components(mask)) return; XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err); From dc2f39fd1bf23eee644d409b84e8e435606997bf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:01 +0200 Subject: [PATCH 025/110] x86/fpu: Cleanup the on_boot_cpu clutter Defensive programming is useful, but this on_boot_cpu debug is really silly. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011538.665080855@linutronix.de --- arch/x86/kernel/fpu/init.c | 16 ---------------- arch/x86/kernel/fpu/xstate.c | 9 --------- 2 files changed, 25 deletions(-) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 64e29927cc32..86bc9759fc8b 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -192,11 +192,6 @@ static void __init fpu__init_task_struct_size(void) */ static void __init fpu__init_system_xstate_size_legacy(void) { - static int on_boot_cpu __initdata = 1; - - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; - /* * Note that xstate sizes might be overwritten later during * fpu__init_system_xstate(). @@ -216,15 +211,6 @@ static void __init fpu__init_system_xstate_size_legacy(void) fpu_user_xstate_size = fpu_kernel_xstate_size; } -/* Legacy code to initialize eager fpu mode. */ -static void __init fpu__init_system_ctx_switch(void) -{ - static bool on_boot_cpu __initdata = 1; - - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; -} - /* * Called on the boot CPU once per system bootup, to set up the initial * FPU state that is later cloned into all processes: @@ -243,6 +229,4 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) fpu__init_system_xstate_size_legacy(); fpu__init_system_xstate(); fpu__init_task_struct_size(); - - fpu__init_system_ctx_switch(); } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 5a76df965337..d6b5f2266143 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -379,15 +379,10 @@ static void __init print_xstate_offset_size(void) */ static void __init setup_init_fpu_buf(void) { - static int on_boot_cpu __initdata = 1; - BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED) != XFEATURES_INIT_FPSTATE_HANDLED); - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; - if (!boot_cpu_has(X86_FEATURE_XSAVE)) return; @@ -721,14 +716,10 @@ static void fpu__init_disable_system_xstate(void) void __init fpu__init_system_xstate(void) { unsigned int eax, ebx, ecx, edx; - static int on_boot_cpu __initdata = 1; u64 xfeatures; int err; int i; - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; - if (!boot_cpu_has(X86_FEATURE_FPU)) { pr_info("x86/fpu: No FPU detected\n"); return; From 01f9f62d3ae75077a54a11d2777082f1e58e2d9f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:02 +0200 Subject: [PATCH 026/110] x86/fpu: Remove pointless memset in fpu_clone() Zeroing the forked task's FPU registers buffer to avoid leaking init optimized stale data into the clone is a pointless exercise for the case where the current task has TIF_NEED_FPU_LOAD set. In that case, the FPU registers state is copied from current's FPU register buffer which can contain stale init optimized data as well. The alledged information leak is non-existant because this stale init optimized data is used nowhere and cannot leak anywhere. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011538.722854569@linutronix.de --- arch/x86/kernel/fpu/core.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 7ada7bd03a32..191269edac97 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -259,12 +259,6 @@ int fpu_clone(struct task_struct *dst) if (!cpu_feature_enabled(X86_FEATURE_FPU)) return 0; - /* - * Don't let 'init optimized' areas of the XSAVE area - * leak into the child task: - */ - memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); - /* * If the FPU registers are not owned by current just memcpy() the * state. Otherwise save the FPU registers directly into the From 2d16a1876f20218f8970ea4b7f679cead1cdb510 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:04 +0200 Subject: [PATCH 027/110] x86/process: Clone FPU in copy_thread() There is no reason to clone FPU in arch_dup_task_struct(). Quite the contrary - it prevents optimizations. Move it to copy_thread(). Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011538.780714235@linutronix.de --- arch/x86/kernel/process.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 1d9463e3096b..d2227c55e683 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -87,7 +87,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) #ifdef CONFIG_VM86 dst->thread.vm86 = NULL; #endif - return fpu_clone(dst); + return 0; } /* @@ -154,6 +154,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, frame->flags = X86_EFLAGS_FIXED; #endif + fpu_clone(p); + /* Kernel thread ? */ if (unlikely(p->flags & PF_KTHREAD)) { p->thread.pkru = pkru_get_init_value(); From 509e7a30cd0a9f38abac4114832d9f69ff0d73b4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:06 +0200 Subject: [PATCH 028/110] x86/fpu: Do not inherit FPU context for kernel and IO worker threads There is no reason why kernel and IO worker threads need a full clone of the parent's FPU state. Both are kernel threads which are not supposed to use FPU. So copying a large state or doing XSAVE() is pointless. Just clean out the minimally required state for those tasks. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011538.839822981@linutronix.de --- arch/x86/kernel/fpu/core.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 191269edac97..9a6b195a8a00 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -212,6 +212,15 @@ static inline void fpstate_init_xstate(struct xregs_state *xsave) xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all; } +static inline unsigned int init_fpstate_copy_size(void) +{ + if (!use_xsave()) + return fpu_kernel_xstate_size; + + /* XSAVE(S) just needs the legacy and the xstate header part */ + return sizeof(init_fpstate.xsave); +} + static inline void fpstate_init_fxstate(struct fxregs_state *fx) { fx->cwd = 0x37f; @@ -259,6 +268,23 @@ int fpu_clone(struct task_struct *dst) if (!cpu_feature_enabled(X86_FEATURE_FPU)) return 0; + /* + * Enforce reload for user space tasks and prevent kernel threads + * from trying to save the FPU registers on context switch. + */ + set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); + + /* + * No FPU state inheritance for kernel threads and IO + * worker threads. + */ + if (dst->flags & (PF_KTHREAD | PF_IO_WORKER)) { + /* Clear out the minimal state */ + memcpy(&dst_fpu->state, &init_fpstate, + init_fpstate_copy_size()); + return 0; + } + /* * If the FPU registers are not owned by current just memcpy() the * state. Otherwise save the FPU registers directly into the @@ -272,8 +298,6 @@ int fpu_clone(struct task_struct *dst) save_fpregs_to_fpstate(dst_fpu); fpregs_unlock(); - set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); - trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); @@ -322,15 +346,6 @@ static inline void restore_fpregs_from_init_fpstate(u64 features_mask) pkru_write_default(); } -static inline unsigned int init_fpstate_copy_size(void) -{ - if (!use_xsave()) - return fpu_kernel_xstate_size; - - /* XSAVE(S) just needs the legacy and the xstate header part */ - return sizeof(init_fpstate.xsave); -} - /* * Reset current->fpu memory state to the init values. */ From 126fe0401883598b45b34dbbd5e0d7d8a0aefa21 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:07 +0200 Subject: [PATCH 029/110] x86/fpu: Cleanup xstate xcomp_bv initialization No point in having this duplicated all over the place with needlessly different defines. Provide a proper initialization function which initializes user buffers properly and make KVM use it. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011538.897664678@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 4 +++- arch/x86/kernel/fpu/core.c | 35 ++++++++++++++++------------- arch/x86/kernel/fpu/init.c | 6 ++--- arch/x86/kernel/fpu/xstate.c | 8 +++---- arch/x86/kernel/fpu/xstate.h | 18 +++++++++++++++ arch/x86/kvm/x86.c | 11 +++------ 6 files changed, 49 insertions(+), 33 deletions(-) create mode 100644 arch/x86/kernel/fpu/xstate.h diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 1503750534f7..df57f1af3a4c 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -80,7 +80,9 @@ static __always_inline __pure bool use_fxsr(void) extern union fpregs_state init_fpstate; -extern void fpstate_init(union fpregs_state *state); +extern void fpstate_init_user(union fpregs_state *state); +extern void fpu_init_fpstate_user(struct fpu *fpu); + #ifdef CONFIG_MATH_EMULATION extern void fpstate_init_soft(struct swregs_state *soft); #else diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 9a6b195a8a00..0789f0c3dca9 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -16,6 +16,8 @@ #include #include +#include "xstate.h" + #define CREATE_TRACE_POINTS #include @@ -203,15 +205,6 @@ void fpu_sync_fpstate(struct fpu *fpu) fpregs_unlock(); } -static inline void fpstate_init_xstate(struct xregs_state *xsave) -{ - /* - * XRSTORS requires these bits set in xcomp_bv, or it will - * trigger #GP: - */ - xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all; -} - static inline unsigned int init_fpstate_copy_size(void) { if (!use_xsave()) @@ -238,23 +231,33 @@ static inline void fpstate_init_fstate(struct fregs_state *fp) fp->fos = 0xffff0000u; } -void fpstate_init(union fpregs_state *state) +/* + * Used in two places: + * 1) Early boot to setup init_fpstate for non XSAVE systems + * 2) fpu_init_fpstate_user() which is invoked from KVM + */ +void fpstate_init_user(union fpregs_state *state) { - if (!static_cpu_has(X86_FEATURE_FPU)) { + if (!cpu_feature_enabled(X86_FEATURE_FPU)) { fpstate_init_soft(&state->soft); return; } - memset(state, 0, fpu_kernel_xstate_size); + xstate_init_xcomp_bv(&state->xsave, xfeatures_mask_uabi()); - if (static_cpu_has(X86_FEATURE_XSAVES)) - fpstate_init_xstate(&state->xsave); - if (static_cpu_has(X86_FEATURE_FXSR)) + if (cpu_feature_enabled(X86_FEATURE_FXSR)) fpstate_init_fxstate(&state->fxsave); else fpstate_init_fstate(&state->fsave); } -EXPORT_SYMBOL_GPL(fpstate_init); + +#if IS_ENABLED(CONFIG_KVM) +void fpu_init_fpstate_user(struct fpu *fpu) +{ + fpstate_init_user(&fpu->state); +} +EXPORT_SYMBOL_GPL(fpu_init_fpstate_user); +#endif /* Clone current's FPU state on fork */ int fpu_clone(struct task_struct *dst) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 86bc9759fc8b..37f872630a0e 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -121,10 +121,10 @@ static void __init fpu__init_system_mxcsr(void) static void __init fpu__init_system_generic(void) { /* - * Set up the legacy init FPU context. (xstate init might overwrite this - * with a more modern format, if the CPU supports it.) + * Set up the legacy init FPU context. Will be updated when the + * CPU supports XSAVE[S]. */ - fpstate_init(&init_fpstate); + fpstate_init_user(&init_fpstate); fpu__init_system_mxcsr(); } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index d6b5f2266143..259951d1eec5 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -15,10 +15,10 @@ #include #include #include -#include #include -#include + +#include "xstate.h" /* * Although we spell it out in here, the Processor Trace @@ -389,9 +389,7 @@ static void __init setup_init_fpu_buf(void) setup_xstate_features(); print_xstate_features(); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - init_fpstate.xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | - xfeatures_mask_all; + xstate_init_xcomp_bv(&init_fpstate.xsave, xfeatures_mask_all); /* * Init all the features state with header.xfeatures being 0x0 diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h new file mode 100644 index 000000000000..0789a04ee705 --- /dev/null +++ b/arch/x86/kernel/fpu/xstate.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_XSTATE_H +#define __X86_KERNEL_FPU_XSTATE_H + +#include +#include + +static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) +{ + /* + * XRSTORS requires these bits set in xcomp_bv, or it will + * trigger #GP: + */ + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; +} + +#endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index aabd3a2ec1bc..74712e5b473a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10626,14 +10626,6 @@ static int sync_regs(struct kvm_vcpu *vcpu) static void fx_init(struct kvm_vcpu *vcpu) { - if (!vcpu->arch.guest_fpu) - return; - - fpstate_init(&vcpu->arch.guest_fpu->state); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv = - host_xcr0 | XSTATE_COMPACTION_ENABLED; - /* * Ensure guest xcr0 is valid for loading */ @@ -10720,6 +10712,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) pr_err("kvm: failed to allocate vcpu's fpu\n"); goto free_user_fpu; } + + fpu_init_fpstate_user(vcpu->arch.user_fpu); + fpu_init_fpstate_user(vcpu->arch.guest_fpu); fx_init(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); From ffd3e504c9e0de8b85755f3c7eabbbdd984cfeed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:09 +0200 Subject: [PATCH 030/110] x86/fpu/xstate: Provide and use for_each_xfeature() These loops evaluating xfeature bits are really hard to read. Create an iterator and use for_each_set_bit_from() inside which already does the right thing. No functional changes. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011538.958107505@linutronix.de --- arch/x86/kernel/fpu/xstate.c | 56 +++++++++++++++--------------------- 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 259951d1eec5..a2bdc0cf8687 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -4,6 +4,7 @@ * * Author: Suresh Siddha */ +#include #include #include #include @@ -20,6 +21,10 @@ #include "xstate.h" +#define for_each_extended_xfeature(bit, mask) \ + (bit) = FIRST_EXTENDED_XFEATURE; \ + for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask)) + /* * Although we spell it out in here, the Processor Trace * xfeature is completely unused. We use other mechanisms @@ -184,10 +189,7 @@ static void __init setup_xstate_features(void) xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, xmm_space); - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; - + for_each_extended_xfeature(i, xfeatures_mask_all) { cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); xstate_sizes[i] = eax; @@ -291,20 +293,15 @@ static void __init setup_xstate_comp_offsets(void) xstate_comp_offsets[XFEATURE_SSE] = offsetof(struct fxregs_state, xmm_space); - if (!boot_cpu_has(X86_FEATURE_XSAVES)) { - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (xfeature_enabled(i)) - xstate_comp_offsets[i] = xstate_offsets[i]; - } + if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) { + for_each_extended_xfeature(i, xfeatures_mask_all) + xstate_comp_offsets[i] = xstate_offsets[i]; return; } next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; - + for_each_extended_xfeature(i, xfeatures_mask_all) { if (xfeature_is_aligned(i)) next_offset = ALIGN(next_offset, 64); @@ -328,8 +325,8 @@ static void __init setup_supervisor_only_offsets(void) next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i) || !xfeature_is_supervisor(i)) + for_each_extended_xfeature(i, xfeatures_mask_all) { + if (!xfeature_is_supervisor(i)) continue; if (xfeature_is_aligned(i)) @@ -347,9 +344,7 @@ static void __init print_xstate_offset_size(void) { int i; - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; + for_each_extended_xfeature(i, xfeatures_mask_all) { pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, xstate_comp_offsets[i], i, xstate_sizes[i]); } @@ -554,10 +549,7 @@ static void do_extra_xstate_size_checks(void) int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; - + for_each_extended_xfeature(i, xfeatures_mask_all) { check_xstate_against_struct(i); /* * Supervisor state components can be managed only by @@ -586,7 +578,6 @@ static void do_extra_xstate_size_checks(void) XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size); } - /* * Get total size of enabled xstates in XCR0 | IA32_XSS. * @@ -969,6 +960,7 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, struct xregs_state *xinit = &init_fpstate.xsave; struct xstate_header header; unsigned int zerofrom; + u64 mask; int i; memset(&header, 0, sizeof(header)); @@ -1022,17 +1014,15 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, zerofrom = offsetof(struct xregs_state, extended_state_area); - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - /* - * The ptrace buffer is in non-compacted XSAVE format. - * In non-compacted format disabled features still occupy - * state space, but there is no state to copy from in the - * compacted init_fpstate. The gap tracking will zero this - * later. - */ - if (!(xfeatures_mask_uabi() & BIT_ULL(i))) - continue; + /* + * The ptrace buffer is in non-compacted XSAVE format. In + * non-compacted format disabled features still occupy state space, + * but there is no state to copy from in the compacted + * init_fpstate. The gap tracking will zero these states. + */ + mask = xfeatures_mask_uabi(); + for_each_extended_xfeature(i, mask) { /* * If there was a feature or alignment gap, zero the space * in the destination buffer. From 63cf05a19a5d3fb6e66b5f7ceb76e77dfc2695f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:10 +0200 Subject: [PATCH 031/110] x86/fpu/xstate: Mark all init only functions __init No point to keep them around after boot. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.017919252@linutronix.de --- arch/x86/kernel/fpu/xstate.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index a2bdc0cf8687..b35ecfa8d450 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -462,7 +462,7 @@ static int validate_user_xstate_header(const struct xstate_header *hdr) return 0; } -static void __xstate_dump_leaves(void) +static void __init __xstate_dump_leaves(void) { int i; u32 eax, ebx, ecx, edx; @@ -502,7 +502,7 @@ static void __xstate_dump_leaves(void) * that our software representation matches what the CPU * tells us about the state's size. */ -static void check_xstate_against_struct(int nr) +static void __init check_xstate_against_struct(int nr) { /* * Ask the CPU for the size of the state. @@ -544,7 +544,7 @@ static void check_xstate_against_struct(int nr) * covered by these checks. Only the size of the buffer for task->fpu * is checked here. */ -static void do_extra_xstate_size_checks(void) +static void __init do_extra_xstate_size_checks(void) { int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; @@ -646,7 +646,7 @@ static unsigned int __init get_xsave_size(void) * Will the runtime-enumerated 'xstate_size' fit in the init * task's statically-allocated buffer? */ -static bool is_supported_xstate_size(unsigned int test_xstate_size) +static bool __init is_supported_xstate_size(unsigned int test_xstate_size) { if (test_xstate_size <= sizeof(union fpregs_state)) return true; @@ -691,7 +691,7 @@ static int __init init_xstate_size(void) * We enabled the XSAVE hardware, but something went wrong and * we can not use it. Disable it. */ -static void fpu__init_disable_system_xstate(void) +static void __init fpu__init_disable_system_xstate(void) { xfeatures_mask_all = 0; cr4_clear_bits(X86_CR4_OSXSAVE); From a0ff0611c2fbde94f6c9db8351939b08f2cb6797 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:12 +0200 Subject: [PATCH 032/110] x86/fpu: Move KVMs FPU swapping to FPU core Swapping the host/guest FPU is directly fiddling with FPU internals which requires 5 exports. The upcoming support of dynamically enabled states would even need more. Implement a swap function in the FPU core code and export that instead. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Paolo Bonzini Cc: kvm@vger.kernel.org Link: https://lkml.kernel.org/r/20211015011539.076072399@linutronix.de --- arch/x86/include/asm/fpu/api.h | 8 +++++ arch/x86/include/asm/fpu/internal.h | 15 ++------- arch/x86/kernel/fpu/core.c | 30 ++++++++++++++--- arch/x86/kernel/fpu/init.c | 1 - arch/x86/kernel/fpu/xstate.c | 1 - arch/x86/kvm/x86.c | 51 +++++++---------------------- arch/x86/mm/extable.c | 2 +- 7 files changed, 48 insertions(+), 60 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 23bef08a8388..d2b8603a9c7e 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -12,6 +12,8 @@ #define _ASM_X86_FPU_API_H #include +#include + /* * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It * disables preemption so be careful if you intend to use it for long periods @@ -108,4 +110,10 @@ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name); static inline void update_pasid(void) { } +/* fpstate-related functions which are exported to KVM */ +extern void fpu_init_fpstate_user(struct fpu *fpu); + +/* KVM specific functions */ +extern void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask); + #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index df57f1af3a4c..3ac55ba55782 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -74,14 +74,8 @@ static __always_inline __pure bool use_fxsr(void) return static_cpu_has(X86_FEATURE_FXSR); } -/* - * fpstate handling functions: - */ - extern union fpregs_state init_fpstate; - extern void fpstate_init_user(union fpregs_state *state); -extern void fpu_init_fpstate_user(struct fpu *fpu); #ifdef CONFIG_MATH_EMULATION extern void fpstate_init_soft(struct swregs_state *soft); @@ -381,12 +375,7 @@ static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask) return err; } -extern void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); - -static inline void restore_fpregs_from_fpstate(union fpregs_state *fpstate) -{ - __restore_fpregs_from_fpstate(fpstate, xfeatures_mask_fpstate()); -} +extern void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); @@ -467,7 +456,7 @@ static inline void fpregs_restore_userregs(void) */ mask = xfeatures_mask_restore_user() | xfeatures_mask_supervisor(); - __restore_fpregs_from_fpstate(&fpu->state, mask); + restore_fpregs_from_fpstate(&fpu->state, mask); fpregs_activate(fpu); fpu->last_cpu = cpu; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 0789f0c3dca9..023bfe857907 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -124,9 +124,8 @@ void save_fpregs_to_fpstate(struct fpu *fpu) asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); frstor(&fpu->state.fsave); } -EXPORT_SYMBOL(save_fpregs_to_fpstate); -void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) +void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) { /* * AMD K7/K8 and later CPUs up to Zen don't save/restore @@ -151,7 +150,31 @@ void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) frstor(&fpstate->fsave); } } -EXPORT_SYMBOL_GPL(__restore_fpregs_from_fpstate); + +#if IS_ENABLED(CONFIG_KVM) +void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask) +{ + fpregs_lock(); + + if (save) { + if (test_thread_flag(TIF_NEED_FPU_LOAD)) { + memcpy(&save->state, ¤t->thread.fpu.state, + fpu_kernel_xstate_size); + } else { + save_fpregs_to_fpstate(save); + } + } + + if (rstor) { + restore_mask &= xfeatures_mask_fpstate(); + restore_fpregs_from_fpstate(&rstor->state, restore_mask); + } + + fpregs_mark_activate(); + fpregs_unlock(); +} +EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpu); +#endif void kernel_fpu_begin_mask(unsigned int kfpu_mask) { @@ -457,7 +480,6 @@ void fpregs_mark_activate(void) fpu->last_cpu = smp_processor_id(); clear_thread_flag(TIF_NEED_FPU_LOAD); } -EXPORT_SYMBOL_GPL(fpregs_mark_activate); /* * x87 math exception handling: diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 37f872630a0e..545c91c723b8 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -136,7 +136,6 @@ static void __init fpu__init_system_generic(void) * components into a single, continuous memory block: */ unsigned int fpu_kernel_xstate_size __ro_after_init; -EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size); /* Get alignment of the TYPE. */ #define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b35ecfa8d450..68355605ca75 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -65,7 +65,6 @@ static short xsave_cpuid_features[] __initdata = { * XSAVE buffer, both supervisor and user xstates. */ u64 xfeatures_mask_all __ro_after_init; -EXPORT_SYMBOL_GPL(xfeatures_mask_all); static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 74712e5b473a..66eea4e314db 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -68,7 +68,9 @@ #include #include #include -#include /* Ugh! */ +#include +#include +#include #include #include #include @@ -9913,58 +9915,27 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) return 0; } -static void kvm_save_current_fpu(struct fpu *fpu) -{ - /* - * If the target FPU state is not resident in the CPU registers, just - * memcpy() from current, else save CPU state directly to the target. - */ - if (test_thread_flag(TIF_NEED_FPU_LOAD)) - memcpy(&fpu->state, ¤t->thread.fpu.state, - fpu_kernel_xstate_size); - else - save_fpregs_to_fpstate(fpu); -} - /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - fpregs_lock(); - - kvm_save_current_fpu(vcpu->arch.user_fpu); - /* - * Guests with protected state can't have it set by the hypervisor, - * so skip trying to set it. + * Guests with protected state have guest_fpu == NULL which makes + * the swap only save the host state. Exclude PKRU from restore as + * it is restored separately in kvm_x86_ops.run(). */ - if (vcpu->arch.guest_fpu) - /* PKRU is separately restored in kvm_x86_ops.run. */ - __restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state, - ~XFEATURE_MASK_PKRU); - - fpregs_mark_activate(); - fpregs_unlock(); - + fpu_swap_kvm_fpu(vcpu->arch.user_fpu, vcpu->arch.guest_fpu, + ~XFEATURE_MASK_PKRU); trace_kvm_fpu(1); } /* When vcpu_run ends, restore user space FPU context. */ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - fpregs_lock(); - /* - * Guests with protected state can't have it read by the hypervisor, - * so skip trying to save it. + * Guests with protected state have guest_fpu == NULL which makes + * swap only restore the host state. */ - if (vcpu->arch.guest_fpu) - kvm_save_current_fpu(vcpu->arch.guest_fpu); - - restore_fpregs_from_fpstate(&vcpu->arch.user_fpu->state); - - fpregs_mark_activate(); - fpregs_unlock(); - + fpu_swap_kvm_fpu(vcpu->arch.guest_fpu, vcpu->arch.user_fpu, ~0ULL); ++vcpu->stat.fpu_reload; trace_kvm_fpu(0); } diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index f37e290e6d0a..043ec385af45 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -47,7 +47,7 @@ static bool ex_handler_fprestore(const struct exception_table_entry *fixup, WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", (void *)instruction_pointer(regs)); - __restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); + restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); return true; } From ea4d6938d4c0761672ff6237964a20db3cb95cc1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:13 +0200 Subject: [PATCH 033/110] x86/fpu: Replace KVMs home brewed FPU copy from user Copying a user space buffer to the memory buffer is already available in the FPU core. The copy mechanism in KVM lacks sanity checks and needs to use cpuid() to lookup the offset of each component, while the FPU core has this information cached. Make the FPU core variant accessible for KVM and replace the home brewed mechanism. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: kvm@vger.kernel.org Link: https://lkml.kernel.org/r/20211015011539.134065207@linutronix.de --- arch/x86/include/asm/fpu/api.h | 2 + arch/x86/kernel/fpu/core.c | 38 ++++++++++++++++- arch/x86/kernel/fpu/xstate.c | 3 +- arch/x86/kvm/x86.c | 74 ++-------------------------------- 4 files changed, 43 insertions(+), 74 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index d2b8603a9c7e..77a732ea4cda 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -116,4 +116,6 @@ extern void fpu_init_fpstate_user(struct fpu *fpu); /* KVM specific functions */ extern void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask); +extern int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, u32 *pkru); + #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 023bfe857907..65fc87760011 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -174,7 +174,43 @@ void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask) fpregs_unlock(); } EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpu); -#endif + +int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, + u32 *vpkru) +{ + union fpregs_state *kstate = &fpu->state; + const union fpregs_state *ustate = buf; + struct pkru_state *xpkru; + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) { + if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE) + return -EINVAL; + if (ustate->fxsave.mxcsr & ~mxcsr_feature_mask) + return -EINVAL; + memcpy(&kstate->fxsave, &ustate->fxsave, sizeof(ustate->fxsave)); + return 0; + } + + if (ustate->xsave.header.xfeatures & ~xcr0) + return -EINVAL; + + ret = copy_uabi_from_kernel_to_xstate(&kstate->xsave, ustate); + if (ret) + return ret; + + /* Retrieve PKRU if not in init state */ + if (kstate->xsave.header.xfeatures & XFEATURE_MASK_PKRU) { + xpkru = get_xsave_addr(&kstate->xsave, XFEATURE_PKRU); + *vpkru = xpkru->pkru; + } + + /* Ensure that XCOMP_BV is set up for XSAVES */ + xstate_init_xcomp_bv(&kstate->xsave, xfeatures_mask_uabi()); + return 0; +} +EXPORT_SYMBOL_GPL(fpu_copy_kvm_uabi_to_fpstate); +#endif /* CONFIG_KVM */ void kernel_fpu_begin_mask(unsigned int kfpu_mask) { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 68355605ca75..eeeb807b9717 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1134,8 +1134,7 @@ static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf, /* * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] - * format and copy to the target thread. This is called from - * xstateregs_set(). + * format and copy to the target thread. Used by ptrace and KVM. */ int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 66eea4e314db..cdc19b1d5775 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4702,8 +4702,6 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, return 0; } -#define XSTATE_COMPACTION_ENABLED (1ULL << 63) - static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) { struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; @@ -4747,50 +4745,6 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) } } -static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) -{ - struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; - u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); - u64 valid; - - /* - * Copy legacy XSAVE area, to avoid complications with CPUID - * leaves 0 and 1 in the loop below. - */ - memcpy(xsave, src, XSAVE_HDR_OFFSET); - - /* Set XSTATE_BV and possibly XCOMP_BV. */ - xsave->header.xfeatures = xstate_bv; - if (boot_cpu_has(X86_FEATURE_XSAVES)) - xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; - - /* - * Copy each region from the non-compacted offset to the - * possibly compacted offset. - */ - valid = xstate_bv & ~XFEATURE_MASK_FPSSE; - while (valid) { - u32 size, offset, ecx, edx; - u64 xfeature_mask = valid & -valid; - int xfeature_nr = fls64(xfeature_mask) - 1; - - cpuid_count(XSTATE_CPUID, xfeature_nr, - &size, &offset, &ecx, &edx); - - if (xfeature_nr == XFEATURE_PKRU) { - memcpy(&vcpu->arch.pkru, src + offset, - sizeof(vcpu->arch.pkru)); - } else { - void *dest = get_xsave_addr(xsave, xfeature_nr); - - if (dest) - memcpy(dest, src + offset, size); - } - - valid -= xfeature_mask; - } -} - static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { @@ -4809,37 +4763,15 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, } } -#define XSAVE_MXCSR_OFFSET 24 - static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - u64 xstate_bv; - u32 mxcsr; - if (!vcpu->arch.guest_fpu) return 0; - xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; - mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)]; - - if (boot_cpu_has(X86_FEATURE_XSAVE)) { - /* - * Here we allow setting states that are not present in - * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility - * with old userspace. - */ - if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask) - return -EINVAL; - load_xsave(vcpu, (u8 *)guest_xsave->region); - } else { - if (xstate_bv & ~XFEATURE_MASK_FPSSE || - mxcsr & ~mxcsr_feature_mask) - return -EINVAL; - memcpy(&vcpu->arch.guest_fpu->state.fxsave, - guest_xsave->region, sizeof(struct fxregs_state)); - } - return 0; + return fpu_copy_kvm_uabi_to_fpstate(vcpu->arch.guest_fpu, + guest_xsave->region, + supported_xcr0, &vcpu->arch.pkru); } static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, From ca834defd33bae9cf9542ff92b15635a84e91946 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:15 +0200 Subject: [PATCH 034/110] x86/fpu: Rework copy_xstate_to_uabi_buf() Prepare for replacing the KVM copy xstate to user function by extending copy_xstate_to_uabi_buf() with a pkru argument which allows the caller to hand in the pkru value, which is required for KVM because the guest PKRU is not accessible via current. Fixup all callsites accordingly. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.191902137@linutronix.de --- arch/x86/kernel/fpu/xstate.c | 34 ++++++++++++++++++++++++++-------- arch/x86/kernel/fpu/xstate.h | 3 +++ 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index eeeb807b9717..b2537a8203ee 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -940,9 +940,10 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, } /** - * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer + * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor - * @tsk: The task from which to copy the saved xstate + * @xsave: The xsave from which to copy + * @pkru_val: The PKRU value to store in the PKRU component * @copy_mode: The requested copy mode * * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming @@ -951,11 +952,10 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, * * It supports partial copy but @to.pos always starts from zero. */ -void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, - enum xstate_copy_mode copy_mode) +void __copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, + u32 pkru_val, enum xstate_copy_mode copy_mode) { const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); - struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; struct xregs_state *xinit = &init_fpstate.xsave; struct xstate_header header; unsigned int zerofrom; @@ -1033,10 +1033,9 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, struct pkru_state pkru = {0}; /* * PKRU is not necessarily up to date in the - * thread's XSAVE buffer. Fill this part from the - * per-thread storage. + * XSAVE buffer. Use the provided value. */ - pkru.pkru = tsk->thread.pkru; + pkru.pkru = pkru_val; membuf_write(&to, &pkru, sizeof(pkru)); } else { copy_feature(header.xfeatures & BIT_ULL(i), &to, @@ -1056,6 +1055,25 @@ out: membuf_zero(&to, to.left); } +/** + * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer + * @to: membuf descriptor + * @tsk: The task from which to copy the saved xstate + * @copy_mode: The requested copy mode + * + * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming + * format, i.e. from the kernel internal hardware dependent storage format + * to the requested @mode. UABI XSTATE is always uncompacted! + * + * It supports partial copy but @to.pos always starts from zero. + */ +void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, + enum xstate_copy_mode copy_mode) +{ + __copy_xstate_to_uabi_buf(to, &tsk->thread.fpu.state.xsave, + tsk->thread.pkru, copy_mode); +} + static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, const void *kbuf, const void __user *ubuf) { diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 0789a04ee705..81f4202781ac 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -15,4 +15,7 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; } +extern void __copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, + u32 pkru_val, enum xstate_copy_mode copy_mode); + #endif From 9603445549dacd7688532a4076c377e43a3ecfce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:18 +0200 Subject: [PATCH 035/110] x86/fpu: Mark fpu__init_prepare_fx_sw_frame() as __init No need to keep it around. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.296435736@linutronix.de --- arch/x86/include/asm/fpu/signal.h | 2 -- arch/x86/kernel/fpu/internal.h | 8 ++++++++ arch/x86/kernel/fpu/signal.c | 4 +++- arch/x86/kernel/fpu/xstate.c | 1 + 4 files changed, 12 insertions(+), 3 deletions(-) create mode 100644 arch/x86/kernel/fpu/internal.h diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 8b6631dffefd..04868a76239a 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -31,6 +31,4 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long fpu__get_fpstate_size(void); -extern void fpu__init_prepare_fx_sw_frame(void); - #endif /* _ASM_X86_FPU_SIGNAL_H */ diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h new file mode 100644 index 000000000000..036f84c236dd --- /dev/null +++ b/arch/x86/kernel/fpu/internal.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_INTERNAL_H +#define __X86_KERNEL_FPU_INTERNAL_H + +/* Init functions */ +extern void fpu__init_prepare_fx_sw_frame(void); + +#endif diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index e257805d3d3f..2a4d1d0b32d4 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -16,6 +16,8 @@ #include #include +#include "internal.h" + static struct _fpx_sw_bytes fx_sw_reserved __ro_after_init; static struct _fpx_sw_bytes fx_sw_reserved_ia32 __ro_after_init; @@ -514,7 +516,7 @@ unsigned long fpu__get_fpstate_size(void) * This will be saved when ever the FP and extended state context is * saved on the user stack during the signal handler delivery to the user. */ -void fpu__init_prepare_fx_sw_frame(void) +void __init fpu__init_prepare_fx_sw_frame(void) { int size = fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b2537a8203ee..1f5a66a38671 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -19,6 +19,7 @@ #include +#include "internal.h" #include "xstate.h" #define for_each_extended_xfeature(bit, mask) \ From 63e81807c1f94e91b9d71c536112a40cd74bab85 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:20 +0200 Subject: [PATCH 036/110] x86/fpu: Move context switch and exit to user inlines into sched.h internal.h is a kitchen sink which needs to get out of the way to prepare for the upcoming changes. Move the context switch and exit to user inlines into a separate header, which is all that code needs. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.349132461@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 60 ------------------------- arch/x86/include/asm/fpu/sched.h | 68 +++++++++++++++++++++++++++++ arch/x86/kernel/fpu/core.c | 1 + arch/x86/kernel/process.c | 2 +- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- 6 files changed, 72 insertions(+), 63 deletions(-) create mode 100644 arch/x86/include/asm/fpu/sched.h diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 3ac55ba55782..398c87c8e199 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -27,16 +27,11 @@ * High level FPU state handling functions: */ extern bool fpu__restore_sig(void __user *buf, int ia32_frame); -extern void fpu__drop(struct fpu *fpu); extern void fpu__clear_user_states(struct fpu *fpu); extern int fpu__exception_code(struct fpu *fpu, int trap_nr); extern void fpu_sync_fpstate(struct fpu *fpu); -/* Clone and exit operations */ -extern int fpu_clone(struct task_struct *dst); -extern void fpu_flush_thread(void); - /* * Boot time FPU initialization functions: */ @@ -82,7 +77,6 @@ extern void fpstate_init_soft(struct swregs_state *soft); #else static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif -extern void save_fpregs_to_fpstate(struct fpu *fpu); /* * Returns 0 on success or the trap number when the operation raises an @@ -464,58 +458,4 @@ static inline void fpregs_restore_userregs(void) clear_thread_flag(TIF_NEED_FPU_LOAD); } -/* - * FPU state switching for scheduling. - * - * This is a two-stage process: - * - * - switch_fpu_prepare() saves the old state. - * This is done within the context of the old process. - * - * - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state - * will get loaded on return to userspace, or when the kernel needs it. - * - * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers - * are saved in the current thread's FPU register state. - * - * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not - * hold current()'s FPU registers. It is required to load the - * registers before returning to userland or using the content - * otherwise. - * - * The FPU context is only stored/restored for a user task and - * PF_KTHREAD is used to distinguish between kernel and user threads. - */ -static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) -{ - if (static_cpu_has(X86_FEATURE_FPU) && !(current->flags & PF_KTHREAD)) { - save_fpregs_to_fpstate(old_fpu); - /* - * The save operation preserved register state, so the - * fpu_fpregs_owner_ctx is still @old_fpu. Store the - * current CPU number in @old_fpu, so the next return - * to user space can avoid the FPU register restore - * when is returns on the same CPU and still owns the - * context. - */ - old_fpu->last_cpu = cpu; - - trace_x86_fpu_regs_deactivated(old_fpu); - } -} - -/* - * Misc helper functions: - */ - -/* - * Delay loading of the complete FPU state until the return to userland. - * PKRU is handled separately. - */ -static inline void switch_fpu_finish(void) -{ - if (cpu_feature_enabled(X86_FEATURE_FPU)) - set_thread_flag(TIF_NEED_FPU_LOAD); -} - #endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h new file mode 100644 index 000000000000..cdb78d590c86 --- /dev/null +++ b/arch/x86/include/asm/fpu/sched.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_FPU_SCHED_H +#define _ASM_X86_FPU_SCHED_H + +#include + +#include +#include + +#include + +extern void save_fpregs_to_fpstate(struct fpu *fpu); +extern void fpu__drop(struct fpu *fpu); +extern int fpu_clone(struct task_struct *dst); +extern void fpu_flush_thread(void); + +/* + * FPU state switching for scheduling. + * + * This is a two-stage process: + * + * - switch_fpu_prepare() saves the old state. + * This is done within the context of the old process. + * + * - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state + * will get loaded on return to userspace, or when the kernel needs it. + * + * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers + * are saved in the current thread's FPU register state. + * + * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not + * hold current()'s FPU registers. It is required to load the + * registers before returning to userland or using the content + * otherwise. + * + * The FPU context is only stored/restored for a user task and + * PF_KTHREAD is used to distinguish between kernel and user threads. + */ +static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) +{ + if (cpu_feature_enabled(X86_FEATURE_FPU) && + !(current->flags & PF_KTHREAD)) { + save_fpregs_to_fpstate(old_fpu); + /* + * The save operation preserved register state, so the + * fpu_fpregs_owner_ctx is still @old_fpu. Store the + * current CPU number in @old_fpu, so the next return + * to user space can avoid the FPU register restore + * when is returns on the same CPU and still owns the + * context. + */ + old_fpu->last_cpu = cpu; + + trace_x86_fpu_regs_deactivated(old_fpu); + } +} + +/* + * Delay loading of the complete FPU state until the return to userland. + * PKRU is handled separately. + */ +static inline void switch_fpu_finish(void) +{ + if (cpu_feature_enabled(X86_FEATURE_FPU)) + set_thread_flag(TIF_NEED_FPU_LOAD); +} + +#endif /* _ASM_X86_FPU_SCHED_H */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 65fc87760011..e6087a61a844 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -8,6 +8,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index d2227c55e683..5cd82082353e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index d008e222a302..26edb1cd07a4 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -41,7 +41,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 39f12ef1c85c..3402edec236c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -42,7 +42,7 @@ #include #include -#include +#include #include #include #include From d06241f52cfe4a0580856ef2cfac90dc7f752cae Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:21 +0200 Subject: [PATCH 037/110] x86/fpu: Clean up CPU feature tests Further disintegration of internal.h: Move the CPU feature tests to a core header and remove the unused one. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.401510559@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 18 ------------------ arch/x86/kernel/fpu/core.c | 1 + arch/x86/kernel/fpu/internal.h | 11 +++++++++++ arch/x86/kernel/fpu/regset.c | 2 ++ 4 files changed, 14 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 398c87c8e199..5da7528b3b2f 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -51,24 +51,6 @@ extern void fpu__resume_cpu(void); # define WARN_ON_FPU(x) ({ (void)(x); 0; }) #endif -/* - * FPU related CPU feature flag helper routines: - */ -static __always_inline __pure bool use_xsaveopt(void) -{ - return static_cpu_has(X86_FEATURE_XSAVEOPT); -} - -static __always_inline __pure bool use_xsave(void) -{ - return static_cpu_has(X86_FEATURE_XSAVE); -} - -static __always_inline __pure bool use_fxsr(void) -{ - return static_cpu_has(X86_FEATURE_FXSR); -} - extern union fpregs_state init_fpstate; extern void fpstate_init_user(union fpregs_state *state); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index e6087a61a844..e9b51c75e2c9 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -17,6 +17,7 @@ #include #include +#include "internal.h" #include "xstate.h" #define CREATE_TRACE_POINTS diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h index 036f84c236dd..a8aac21ba364 100644 --- a/arch/x86/kernel/fpu/internal.h +++ b/arch/x86/kernel/fpu/internal.h @@ -2,6 +2,17 @@ #ifndef __X86_KERNEL_FPU_INTERNAL_H #define __X86_KERNEL_FPU_INTERNAL_H +/* CPU feature check wrappers */ +static __always_inline __pure bool use_xsave(void) +{ + return cpu_feature_enabled(X86_FEATURE_XSAVE); +} + +static __always_inline __pure bool use_fxsr(void) +{ + return cpu_feature_enabled(X86_FEATURE_FXSR); +} + /* Init functions */ extern void fpu__init_prepare_fx_sw_frame(void); diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 66ed317ebc0d..ccf0c59955f1 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -10,6 +10,8 @@ #include #include +#include "internal.h" + /* * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, * as the "regset->n" for the xstate regset will be updated based on the feature From b579d0c3750eedc0dee433edaba88206a8e4348a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:23 +0200 Subject: [PATCH 038/110] x86/fpu: Make os_xrstor_booting() private It's only required in the xstate init code. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.455836597@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 25 ------------------------- arch/x86/kernel/fpu/xstate.c | 23 +++++++++++++++++++++++ 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 5da7528b3b2f..3ad2ae73efa5 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -225,31 +225,6 @@ static inline void fxsave(struct fxregs_state *fx) : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") -/* - * This function is called only during boot time when x86 caps are not set - * up and alternative can not be used yet. - */ -static inline void os_xrstor_booting(struct xregs_state *xstate) -{ - u64 mask = xfeatures_mask_fpstate(); - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - WARN_ON(system_state != SYSTEM_BOOTING); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); - else - XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); - - /* - * We should never fault when copying from a kernel buffer, and the FPU - * state we set at boot time should be valid. - */ - WARN_ON_FPU(err); -} - /* * Save processor xstate to xsave area. * diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 1f5a66a38671..b712c06cbbfb 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -350,6 +350,29 @@ static void __init print_xstate_offset_size(void) } } +/* + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. + */ +static __init void os_xrstor_booting(struct xregs_state *xstate) +{ + u64 mask = xfeatures_mask_fpstate(); + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + else + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + + /* + * We should never fault when copying from a kernel buffer, and the FPU + * state we set at boot time should be valid. + */ + WARN_ON_FPU(err); +} + /* * All supported features have either init state all zeros or are * handled in setup_init_fpu() individually. This is an explicit From df95b0f1aa56dfa71a0ef657e3e62294ee6d9034 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:24 +0200 Subject: [PATCH 039/110] x86/fpu: Move os_xsave() and os_xrstor() to core Nothing outside the core code needs these. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.513368075@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 165 -------------------------- arch/x86/include/asm/fpu/xstate.h | 6 - arch/x86/kernel/fpu/signal.c | 1 + arch/x86/kernel/fpu/xstate.h | 174 ++++++++++++++++++++++++++++ 4 files changed, 175 insertions(+), 171 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 3ad2ae73efa5..b68f9940489f 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -161,171 +161,6 @@ static inline void fxsave(struct fxregs_state *fx) asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); } -/* These macros all use (%edi)/(%rdi) as the single memory argument. */ -#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" -#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" -#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" -#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" -#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" - -/* - * After this @err contains 0 on success or the trap number when the - * operation raises an exception. - */ -#define XSTATE_OP(op, st, lmask, hmask, err) \ - asm volatile("1:" op "\n\t" \ - "xor %[err], %[err]\n" \ - "2:\n\t" \ - _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ - : [err] "=a" (err) \ - : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ - : "memory") - -/* - * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact - * format and supervisor states in addition to modified optimization in - * XSAVEOPT. - * - * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT - * supports modified optimization which is not supported by XSAVE. - * - * We use XSAVE as a fallback. - * - * The 661 label is defined in the ALTERNATIVE* macros as the address of the - * original instruction which gets replaced. We need to use it here as the - * address of the instruction where we might get an exception at. - */ -#define XSTATE_XSAVE(st, lmask, hmask, err) \ - asm volatile(ALTERNATIVE_2(XSAVE, \ - XSAVEOPT, X86_FEATURE_XSAVEOPT, \ - XSAVES, X86_FEATURE_XSAVES) \ - "\n" \ - "xor %[err], %[err]\n" \ - "3:\n" \ - ".pushsection .fixup,\"ax\"\n" \ - "4: movl $-2, %[err]\n" \ - "jmp 3b\n" \ - ".popsection\n" \ - _ASM_EXTABLE(661b, 4b) \ - : [err] "=r" (err) \ - : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ - : "memory") - -/* - * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact - * XSAVE area format. - */ -#define XSTATE_XRESTORE(st, lmask, hmask) \ - asm volatile(ALTERNATIVE(XRSTOR, \ - XRSTORS, X86_FEATURE_XSAVES) \ - "\n" \ - "3:\n" \ - _ASM_EXTABLE_TYPE(661b, 3b, EX_TYPE_FPU_RESTORE) \ - : \ - : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ - : "memory") - -/* - * Save processor xstate to xsave area. - * - * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features - * and command line options. The choice is permanent until the next reboot. - */ -static inline void os_xsave(struct xregs_state *xstate) -{ - u64 mask = xfeatures_mask_all; - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - WARN_ON_FPU(!alternatives_patched); - - XSTATE_XSAVE(xstate, lmask, hmask, err); - - /* We should never fault when copying to a kernel buffer: */ - WARN_ON_FPU(err); -} - -/* - * Restore processor xstate from xsave area. - * - * Uses XRSTORS when XSAVES is used, XRSTOR otherwise. - */ -static inline void os_xrstor(struct xregs_state *xstate, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - - XSTATE_XRESTORE(xstate, lmask, hmask); -} - -/* - * Save xstate to user space xsave area. - * - * We don't use modified optimization because xrstor/xrstors might track - * a different application. - * - * We don't use compacted format xsave area for backward compatibility for - * old applications which don't understand the compacted format of the - * xsave area. - * - * The caller has to zero buf::header before calling this because XSAVE* - * does not touch the reserved fields in the header. - */ -static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) -{ - /* - * Include the features which are not xsaved/rstored by the kernel - * internally, e.g. PKRU. That's user space ABI and also required - * to allow the signal handler to modify PKRU. - */ - u64 mask = xfeatures_mask_uabi(); - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - stac(); - XSTATE_OP(XSAVE, buf, lmask, hmask, err); - clac(); - - return err; -} - -/* - * Restore xstate from user space xsave area. - */ -static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask) -{ - struct xregs_state *xstate = ((__force struct xregs_state *)buf); - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - stac(); - XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); - clac(); - - return err; -} - -/* - * Restore xstate from kernel space xsave area, return an error code instead of - * an exception. - */ -static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - if (cpu_feature_enabled(X86_FEATURE_XSAVES)) - XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); - else - XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); - - return err; -} - extern void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 109dfcc75299..b8cebc0ee420 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -78,12 +78,6 @@ XFEATURE_MASK_INDEPENDENT | \ XFEATURE_MASK_SUPERVISOR_UNSUPPORTED) -#ifdef CONFIG_X86_64 -#define REX_PREFIX "0x48, " -#else -#define REX_PREFIX -#endif - extern u64 xfeatures_mask_all; static inline u64 xfeatures_mask_supervisor(void) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 2a4d1d0b32d4..3b38c59ce3f8 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -17,6 +17,7 @@ #include #include "internal.h" +#include "xstate.h" static struct _fpx_sw_bytes fx_sw_reserved __ro_after_init; static struct _fpx_sw_bytes fx_sw_reserved_ia32 __ro_after_init; diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 81f4202781ac..ae61baa97682 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -18,4 +18,178 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) extern void __copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, u32 pkru_val, enum xstate_copy_mode copy_mode); +/* XSAVE/XRSTOR wrapper functions */ + +#ifdef CONFIG_X86_64 +#define REX_PREFIX "0x48, " +#else +#define REX_PREFIX +#endif + +/* These macros all use (%edi)/(%rdi) as the single memory argument. */ +#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" +#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" +#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" +#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" +#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" + +/* + * After this @err contains 0 on success or the trap number when the + * operation raises an exception. + */ +#define XSTATE_OP(op, st, lmask, hmask, err) \ + asm volatile("1:" op "\n\t" \ + "xor %[err], %[err]\n" \ + "2:\n\t" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ + : [err] "=a" (err) \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +/* + * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact + * format and supervisor states in addition to modified optimization in + * XSAVEOPT. + * + * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT + * supports modified optimization which is not supported by XSAVE. + * + * We use XSAVE as a fallback. + * + * The 661 label is defined in the ALTERNATIVE* macros as the address of the + * original instruction which gets replaced. We need to use it here as the + * address of the instruction where we might get an exception at. + */ +#define XSTATE_XSAVE(st, lmask, hmask, err) \ + asm volatile(ALTERNATIVE_2(XSAVE, \ + XSAVEOPT, X86_FEATURE_XSAVEOPT, \ + XSAVES, X86_FEATURE_XSAVES) \ + "\n" \ + "xor %[err], %[err]\n" \ + "3:\n" \ + ".pushsection .fixup,\"ax\"\n" \ + "4: movl $-2, %[err]\n" \ + "jmp 3b\n" \ + ".popsection\n" \ + _ASM_EXTABLE(661b, 4b) \ + : [err] "=r" (err) \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +/* + * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact + * XSAVE area format. + */ +#define XSTATE_XRESTORE(st, lmask, hmask) \ + asm volatile(ALTERNATIVE(XRSTOR, \ + XRSTORS, X86_FEATURE_XSAVES) \ + "\n" \ + "3:\n" \ + _ASM_EXTABLE_TYPE(661b, 3b, EX_TYPE_FPU_RESTORE) \ + : \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +/* + * Save processor xstate to xsave area. + * + * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features + * and command line options. The choice is permanent until the next reboot. + */ +static inline void os_xsave(struct xregs_state *xstate) +{ + u64 mask = xfeatures_mask_all; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + WARN_ON_FPU(!alternatives_patched); + + XSTATE_XSAVE(xstate, lmask, hmask, err); + + /* We should never fault when copying to a kernel buffer: */ + WARN_ON_FPU(err); +} + +/* + * Restore processor xstate from xsave area. + * + * Uses XRSTORS when XSAVES is used, XRSTOR otherwise. + */ +static inline void os_xrstor(struct xregs_state *xstate, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + + XSTATE_XRESTORE(xstate, lmask, hmask); +} + +/* + * Save xstate to user space xsave area. + * + * We don't use modified optimization because xrstor/xrstors might track + * a different application. + * + * We don't use compacted format xsave area for backward compatibility for + * old applications which don't understand the compacted format of the + * xsave area. + * + * The caller has to zero buf::header before calling this because XSAVE* + * does not touch the reserved fields in the header. + */ +static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) +{ + /* + * Include the features which are not xsaved/rstored by the kernel + * internally, e.g. PKRU. That's user space ABI and also required + * to allow the signal handler to modify PKRU. + */ + u64 mask = xfeatures_mask_uabi(); + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + stac(); + XSTATE_OP(XSAVE, buf, lmask, hmask, err); + clac(); + + return err; +} + +/* + * Restore xstate from user space xsave area. + */ +static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask) +{ + struct xregs_state *xstate = ((__force struct xregs_state *)buf); + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + stac(); + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + clac(); + + return err; +} + +/* + * Restore xstate from kernel space xsave area, return an error code instead of + * an exception. + */ +static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + else + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + + return err; +} + + #endif From 34002571cb4199a446f7582704424d20a01c276e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:26 +0200 Subject: [PATCH 040/110] x86/fpu: Move legacy ASM wrappers to core Nothing outside the core code requires them. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.572439164@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 101 -------------------------- arch/x86/kernel/fpu/core.c | 1 + arch/x86/kernel/fpu/legacy.h | 108 ++++++++++++++++++++++++++++ arch/x86/kernel/fpu/signal.c | 1 + arch/x86/kernel/fpu/xstate.c | 1 + 5 files changed, 111 insertions(+), 101 deletions(-) create mode 100644 arch/x86/kernel/fpu/legacy.h diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index b68f9940489f..7722aadc3278 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -60,107 +60,6 @@ extern void fpstate_init_soft(struct swregs_state *soft); static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif -/* - * Returns 0 on success or the trap number when the operation raises an - * exception. - */ -#define user_insn(insn, output, input...) \ -({ \ - int err; \ - \ - might_fault(); \ - \ - asm volatile(ASM_STAC "\n" \ - "1: " #insn "\n" \ - "2: " ASM_CLAC "\n" \ - _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ - : [err] "=a" (err), output \ - : "0"(0), input); \ - err; \ -}) - -#define kernel_insn_err(insn, output, input...) \ -({ \ - int err; \ - asm volatile("1:" #insn "\n\t" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err), output \ - : "0"(0), input); \ - err; \ -}) - -#define kernel_insn(insn, output, input...) \ - asm volatile("1:" #insn "\n\t" \ - "2:\n" \ - _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FPU_RESTORE) \ - : output : input) - -static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx) -{ - return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); -} - -static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); - else - return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); - -} - -static inline void fxrstor(struct fxregs_state *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else - kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int fxrstor_safe(struct fxregs_state *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else - return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else - return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline void frstor(struct fregs_state *fx) -{ - kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int frstor_safe(struct fregs_state *fx) -{ - return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int frstor_from_user_sigframe(struct fregs_state __user *fx) -{ - return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline void fxsave(struct fxregs_state *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx)); - else - asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); -} - extern void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index e9b51c75e2c9..a009c82336a4 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -18,6 +18,7 @@ #include #include "internal.h" +#include "legacy.h" #include "xstate.h" #define CREATE_TRACE_POINTS diff --git a/arch/x86/kernel/fpu/legacy.h b/arch/x86/kernel/fpu/legacy.h new file mode 100644 index 000000000000..2ff36b0f79e9 --- /dev/null +++ b/arch/x86/kernel/fpu/legacy.h @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_LEGACY_H +#define __X86_KERNEL_FPU_LEGACY_H + +#include + +/* + * Returns 0 on success or the trap number when the operation raises an + * exception. + */ +#define user_insn(insn, output, input...) \ +({ \ + int err; \ + \ + might_fault(); \ + \ + asm volatile(ASM_STAC "\n" \ + "1: " #insn "\n" \ + "2: " ASM_CLAC "\n" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ + : [err] "=a" (err), output \ + : "0"(0), input); \ + err; \ +}) + +#define kernel_insn_err(insn, output, input...) \ +({ \ + int err; \ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err), output \ + : "0"(0), input); \ + err; \ +}) + +#define kernel_insn(insn, output, input...) \ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FPU_RESTORE) \ + : output : input) + +static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx) +{ + return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); +} + +static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); + else + return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); + +} + +static inline void fxrstor(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int fxrstor_safe(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline void frstor(struct fregs_state *fx) +{ + kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int frstor_safe(struct fregs_state *fx) +{ + return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int frstor_from_user_sigframe(struct fregs_state __user *fx) +{ + return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline void fxsave(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx)); + else + asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); +} + +#endif diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 3b38c59ce3f8..e0198b24e28c 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -17,6 +17,7 @@ #include #include "internal.h" +#include "legacy.h" #include "xstate.h" static struct _fpx_sw_bytes fx_sw_reserved __ro_after_init; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b712c06cbbfb..246a7fea06b1 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -20,6 +20,7 @@ #include #include "internal.h" +#include "legacy.h" #include "xstate.h" #define for_each_extended_xfeature(bit, mask) \ From cdcb6fa14e1499ff2b2a3f3e0938c7b3b7ef2cd6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:28 +0200 Subject: [PATCH 041/110] x86/fpu: Make WARN_ON_FPU() private No point in being in global headers. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.628516182@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 9 --------- arch/x86/kernel/fpu/init.c | 2 ++ arch/x86/kernel/fpu/internal.h | 6 ++++++ 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 7722aadc3278..f8413a509ba5 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -42,15 +42,6 @@ extern void fpu__init_system(struct cpuinfo_x86 *c); extern void fpu__init_check_bugs(void); extern void fpu__resume_cpu(void); -/* - * Debugging facility: - */ -#ifdef CONFIG_X86_DEBUG_FPU -# define WARN_ON_FPU(x) WARN_ON_ONCE(x) -#else -# define WARN_ON_FPU(x) ({ (void)(x); 0; }) -#endif - extern union fpregs_state init_fpstate; extern void fpstate_init_user(union fpregs_state *state); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 545c91c723b8..24873dfe2dba 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -10,6 +10,8 @@ #include #include +#include "internal.h" + /* * Initialize the registers found in all CPUs, CR0 and CR4: */ diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h index a8aac21ba364..5ddc09e03c2a 100644 --- a/arch/x86/kernel/fpu/internal.h +++ b/arch/x86/kernel/fpu/internal.h @@ -13,6 +13,12 @@ static __always_inline __pure bool use_fxsr(void) return cpu_feature_enabled(X86_FEATURE_FXSR); } +#ifdef CONFIG_X86_DEBUG_FPU +# define WARN_ON_FPU(x) WARN_ON_ONCE(x) +#else +# define WARN_ON_FPU(x) ({ (void)(x); 0; }) +#endif + /* Init functions */ extern void fpu__init_prepare_fx_sw_frame(void); From 9848fb96839bfd6ad4c00748842ccfd5bd3b0346 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:30 +0200 Subject: [PATCH 042/110] x86/fpu: Move fpregs_restore_userregs() to core Only used internally in the FPU core code. While at it, convert to the percpu accessors which verify preemption is disabled. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.686806639@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 83 ---------------------------- arch/x86/kernel/fpu/context.h | 85 +++++++++++++++++++++++++++++ arch/x86/kernel/fpu/core.c | 1 + arch/x86/kernel/fpu/regset.c | 1 + arch/x86/kernel/fpu/signal.c | 1 + 5 files changed, 88 insertions(+), 83 deletions(-) create mode 100644 arch/x86/kernel/fpu/context.h diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index f8413a509ba5..74b7cc3d2e77 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -55,89 +55,6 @@ extern void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); -/* - * FPU context switch related helper methods: - */ - DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); -/* - * The in-register FPU state for an FPU context on a CPU is assumed to be - * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx - * matches the FPU. - * - * If the FPU register state is valid, the kernel can skip restoring the - * FPU state from memory. - * - * Any code that clobbers the FPU registers or updates the in-memory - * FPU state for a task MUST let the rest of the kernel know that the - * FPU registers are no longer valid for this task. - * - * Either one of these invalidation functions is enough. Invalidate - * a resource you control: CPU if using the CPU for something else - * (with preemption disabled), FPU for the current task, or a task that - * is prevented from running by the current task. - */ -static inline void __cpu_invalidate_fpregs_state(void) -{ - __this_cpu_write(fpu_fpregs_owner_ctx, NULL); -} - -static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) -{ - fpu->last_cpu = -1; -} - -static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) -{ - return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; -} - -/* - * These generally need preemption protection to work, - * do try to avoid using these on their own: - */ -static inline void fpregs_deactivate(struct fpu *fpu) -{ - this_cpu_write(fpu_fpregs_owner_ctx, NULL); - trace_x86_fpu_regs_deactivated(fpu); -} - -static inline void fpregs_activate(struct fpu *fpu) -{ - this_cpu_write(fpu_fpregs_owner_ctx, fpu); - trace_x86_fpu_regs_activated(fpu); -} - -/* Internal helper for switch_fpu_return() and signal frame setup */ -static inline void fpregs_restore_userregs(void) -{ - struct fpu *fpu = ¤t->thread.fpu; - int cpu = smp_processor_id(); - - if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) - return; - - if (!fpregs_state_valid(fpu, cpu)) { - u64 mask; - - /* - * This restores _all_ xstate which has not been - * established yet. - * - * If PKRU is enabled, then the PKRU value is already - * correct because it was either set in switch_to() or in - * flush_thread(). So it is excluded because it might be - * not up to date in current->thread.fpu.xsave state. - */ - mask = xfeatures_mask_restore_user() | - xfeatures_mask_supervisor(); - restore_fpregs_from_fpstate(&fpu->state, mask); - - fpregs_activate(fpu); - fpu->last_cpu = cpu; - } - clear_thread_flag(TIF_NEED_FPU_LOAD); -} - #endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h new file mode 100644 index 000000000000..e652282842c8 --- /dev/null +++ b/arch/x86/kernel/fpu/context.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_CONTEXT_H +#define __X86_KERNEL_FPU_CONTEXT_H + +#include +#include + +/* Functions related to FPU context tracking */ + +/* + * The in-register FPU state for an FPU context on a CPU is assumed to be + * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx + * matches the FPU. + * + * If the FPU register state is valid, the kernel can skip restoring the + * FPU state from memory. + * + * Any code that clobbers the FPU registers or updates the in-memory + * FPU state for a task MUST let the rest of the kernel know that the + * FPU registers are no longer valid for this task. + * + * Either one of these invalidation functions is enough. Invalidate + * a resource you control: CPU if using the CPU for something else + * (with preemption disabled), FPU for the current task, or a task that + * is prevented from running by the current task. + */ +static inline void __cpu_invalidate_fpregs_state(void) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); +} + +static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) +{ + fpu->last_cpu = -1; +} + +static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) +{ + return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; +} + +static inline void fpregs_deactivate(struct fpu *fpu) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); + trace_x86_fpu_regs_deactivated(fpu); +} + +static inline void fpregs_activate(struct fpu *fpu) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, fpu); + trace_x86_fpu_regs_activated(fpu); +} + +/* Internal helper for switch_fpu_return() and signal frame setup */ +static inline void fpregs_restore_userregs(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + int cpu = smp_processor_id(); + + if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) + return; + + if (!fpregs_state_valid(fpu, cpu)) { + u64 mask; + + /* + * This restores _all_ xstate which has not been + * established yet. + * + * If PKRU is enabled, then the PKRU value is already + * correct because it was either set in switch_to() or in + * flush_thread(). So it is excluded because it might be + * not up to date in current->thread.fpu.xsave state. + */ + mask = xfeatures_mask_restore_user() | + xfeatures_mask_supervisor(); + restore_fpregs_from_fpstate(&fpu->state, mask); + + fpregs_activate(fpu); + fpu->last_cpu = cpu; + } + clear_thread_flag(TIF_NEED_FPU_LOAD); +} + +#endif diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index a009c82336a4..739728889b54 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -17,6 +17,7 @@ #include #include +#include "context.h" #include "internal.h" #include "legacy.h" #include "xstate.h" diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index ccf0c59955f1..a40150e350b6 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -10,6 +10,7 @@ #include #include +#include "context.h" #include "internal.h" /* diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index e0198b24e28c..32dbcde72fbe 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -16,6 +16,7 @@ #include #include +#include "context.h" #include "internal.h" #include "legacy.h" #include "xstate.h" From d9d005f32aac7362a1998f4b7fdf8874e91546bd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:31 +0200 Subject: [PATCH 043/110] x86/fpu: Move mxcsr related code to core No need to expose that to code which only needs the XCR0 accessors. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.740012411@linutronix.de --- arch/x86/include/asm/fpu/xcr.h | 11 ----------- arch/x86/kernel/fpu/init.c | 1 + arch/x86/kernel/fpu/legacy.h | 7 +++++++ arch/x86/kernel/fpu/regset.c | 1 + arch/x86/kernel/fpu/xstate.c | 3 ++- arch/x86/kvm/svm/sev.c | 2 +- 6 files changed, 12 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/fpu/xcr.h b/arch/x86/include/asm/fpu/xcr.h index 1c7ab8d95da5..79f95d3787e2 100644 --- a/arch/x86/include/asm/fpu/xcr.h +++ b/arch/x86/include/asm/fpu/xcr.h @@ -2,17 +2,6 @@ #ifndef _ASM_X86_FPU_XCR_H #define _ASM_X86_FPU_XCR_H -/* - * MXCSR and XCR definitions: - */ - -static inline void ldmxcsr(u32 mxcsr) -{ - asm volatile("ldmxcsr %0" :: "m" (mxcsr)); -} - -extern unsigned int mxcsr_feature_mask; - #define XCR_XFEATURE_ENABLED_MASK 0x00000000 static inline u64 xgetbv(u32 index) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 24873dfe2dba..e77084a6ae7c 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -11,6 +11,7 @@ #include #include "internal.h" +#include "legacy.h" /* * Initialize the registers found in all CPUs, CR0 and CR4: diff --git a/arch/x86/kernel/fpu/legacy.h b/arch/x86/kernel/fpu/legacy.h index 2ff36b0f79e9..17c26b164c63 100644 --- a/arch/x86/kernel/fpu/legacy.h +++ b/arch/x86/kernel/fpu/legacy.h @@ -4,6 +4,13 @@ #include +extern unsigned int mxcsr_feature_mask; + +static inline void ldmxcsr(u32 mxcsr) +{ + asm volatile("ldmxcsr %0" :: "m" (mxcsr)); +} + /* * Returns 0 on success or the trap number when the operation raises an * exception. diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index a40150e350b6..3d8ed45da166 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -12,6 +12,7 @@ #include "context.h" #include "internal.h" +#include "legacy.h" /* * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 246a7fea06b1..f0305b2b227f 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -14,8 +14,9 @@ #include #include -#include #include +#include +#include #include diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index c36b5fe4c27c..3c57bd091120 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -17,10 +17,10 @@ #include #include #include -#include #include #include +#include #include "x86.h" #include "svm.h" From 90489f1dee8b703a3301857917c0aba0b22b5d83 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:33 +0200 Subject: [PATCH 044/110] x86/fpu: Move fpstate functions to api.h Move function declarations which need to be globally available to api.h where they belong. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.792363754@linutronix.de --- arch/x86/include/asm/fpu/api.h | 9 +++++++++ arch/x86/include/asm/fpu/internal.h | 9 --------- arch/x86/kernel/fpu/internal.h | 3 +++ arch/x86/math-emu/fpu_entry.c | 2 +- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 77a732ea4cda..56cf884ecdae 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -110,6 +110,15 @@ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name); static inline void update_pasid(void) { } +#ifdef CONFIG_MATH_EMULATION +extern void fpstate_init_soft(struct swregs_state *soft); +#else +static inline void fpstate_init_soft(struct swregs_state *soft) {} +#endif + +/* fpstate */ +extern union fpregs_state init_fpstate; + /* fpstate-related functions which are exported to KVM */ extern void fpu_init_fpstate_user(struct fpu *fpu); diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 74b7cc3d2e77..d8bb49134ebb 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -42,15 +42,6 @@ extern void fpu__init_system(struct cpuinfo_x86 *c); extern void fpu__init_check_bugs(void); extern void fpu__resume_cpu(void); -extern union fpregs_state init_fpstate; -extern void fpstate_init_user(union fpregs_state *state); - -#ifdef CONFIG_MATH_EMULATION -extern void fpstate_init_soft(struct swregs_state *soft); -#else -static inline void fpstate_init_soft(struct swregs_state *soft) {} -#endif - extern void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h index 5ddc09e03c2a..bd7f813242dd 100644 --- a/arch/x86/kernel/fpu/internal.h +++ b/arch/x86/kernel/fpu/internal.h @@ -22,4 +22,7 @@ static __always_inline __pure bool use_fxsr(void) /* Init functions */ extern void fpu__init_prepare_fx_sw_frame(void); +/* Used in init.c */ +extern void fpstate_init_user(union fpregs_state *state); + #endif diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 8679a9d6c47f..50195e249753 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include "fpu_system.h" #include "fpu_emu.h" From 0ae67cc34f765078a63137120e4567ad2f050b75 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:35 +0200 Subject: [PATCH 045/110] x86/fpu: Remove internal.h dependency from fpu/signal.h In order to remove internal.h make signal.h independent of it. Include asm/fpu/xstate.h to fix a missing update_regset_xstate_info() prototype, which is Reported-by: kernel test robot Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.844565975@linutronix.de --- arch/x86/ia32/ia32_signal.c | 1 - arch/x86/include/asm/fpu/api.h | 3 +++ arch/x86/include/asm/fpu/internal.h | 7 ------- arch/x86/include/asm/fpu/signal.h | 13 +++++++++++++ arch/x86/kernel/fpu/signal.c | 1 - arch/x86/kernel/ptrace.c | 2 +- arch/x86/kernel/signal.c | 1 - arch/x86/mm/extable.c | 3 ++- 8 files changed, 19 insertions(+), 12 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 828ab0a9239b..c9c3859322fa 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 56cf884ecdae..17893af0880c 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -116,6 +116,9 @@ extern void fpstate_init_soft(struct swregs_state *soft); static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif +/* State tracking */ +DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); + /* fpstate */ extern union fpregs_state init_fpstate; diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index d8bb49134ebb..8f97d3e375de 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -26,7 +26,6 @@ /* * High level FPU state handling functions: */ -extern bool fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__clear_user_states(struct fpu *fpu); extern int fpu__exception_code(struct fpu *fpu, int trap_nr); @@ -42,10 +41,4 @@ extern void fpu__init_system(struct cpuinfo_x86 *c); extern void fpu__init_check_bugs(void); extern void fpu__resume_cpu(void); -extern void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); - -extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); - -DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); - #endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 04868a76239a..9a63a21c219d 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -5,6 +5,11 @@ #ifndef _ASM_X86_FPU_SIGNAL_H #define _ASM_X86_FPU_SIGNAL_H +#include +#include + +#include + #ifdef CONFIG_X86_64 # include # include @@ -31,4 +36,12 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long fpu__get_fpstate_size(void); +extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); +extern void fpu__clear_user_states(struct fpu *fpu); +extern bool fpu__restore_sig(void __user *buf, int ia32_frame); + +extern void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); + +extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); + #endif /* _ASM_X86_FPU_SIGNAL_H */ diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 32dbcde72fbe..274cd58b3dc8 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -7,7 +7,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 4c208ea3bd9f..6d2244c94799 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -29,9 +29,9 @@ #include #include -#include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 02ee68e68184..58bd07071d14 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -30,7 +30,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 043ec385af45..79c2e30d93ae 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -4,7 +4,8 @@ #include #include -#include +#include +#include #include #include #include From ff0c37e191f2629bf2776dbd95db5d06f704ab93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:36 +0200 Subject: [PATCH 046/110] x86/sev: Include fpu/xcr.h Include the header which only provides the XCR accessors. That's all what is needed here. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.896573039@linutronix.de --- arch/x86/kernel/sev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index a6895e440bc3..50c773c3384c 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include From 6415bb80926379310afd74800415f6ebf4bb5c31 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:38 +0200 Subject: [PATCH 047/110] x86/fpu: Mop up the internal.h leftovers Move the global interfaces to api.h and the rest into the core. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011539.948837194@linutronix.de --- arch/x86/include/asm/fpu/api.h | 10 ++++++++++ arch/x86/include/asm/fpu/internal.h | 18 ------------------ arch/x86/kernel/fpu/init.c | 1 + arch/x86/kernel/fpu/xstate.h | 3 +++ 4 files changed, 14 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 17893af0880c..c691f079a56e 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -110,6 +110,16 @@ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name); static inline void update_pasid(void) { } +/* Trap handling */ +extern int fpu__exception_code(struct fpu *fpu, int trap_nr); +extern void fpu_sync_fpstate(struct fpu *fpu); + +/* Boot, hotplug and resume */ +extern void fpu__init_cpu(void); +extern void fpu__init_system(struct cpuinfo_x86 *c); +extern void fpu__init_check_bugs(void); +extern void fpu__resume_cpu(void); + #ifdef CONFIG_MATH_EMULATION extern void fpstate_init_soft(struct swregs_state *soft); #else diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 8f97d3e375de..8df83e887ff6 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -23,22 +23,4 @@ #include #include -/* - * High level FPU state handling functions: - */ -extern void fpu__clear_user_states(struct fpu *fpu); -extern int fpu__exception_code(struct fpu *fpu, int trap_nr); - -extern void fpu_sync_fpstate(struct fpu *fpu); - -/* - * Boot time FPU initialization functions: - */ -extern void fpu__init_cpu(void); -extern void fpu__init_system_xstate(void); -extern void fpu__init_cpu_xstate(void); -extern void fpu__init_system(struct cpuinfo_x86 *c); -extern void fpu__init_check_bugs(void); -extern void fpu__resume_cpu(void); - #endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index e77084a6ae7c..d420d29e58be 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -12,6 +12,7 @@ #include "internal.h" #include "legacy.h" +#include "xstate.h" /* * Initialize the registers found in all CPUs, CR0 and CR4: diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index ae61baa97682..bb6d7d298d2a 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -18,6 +18,9 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) extern void __copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, u32 pkru_val, enum xstate_copy_mode copy_mode); +extern void fpu__init_cpu_xstate(void); +extern void fpu__init_system_xstate(void); + /* XSAVE/XRSTOR wrapper functions */ #ifdef CONFIG_X86_64 From b56d2795b29792c465cc8ef036abad5127a003fb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:39 +0200 Subject: [PATCH 048/110] x86/fpu: Replace the includes of fpu/internal.h Now that the file is empty, fixup all references with the proper includes and delete the former kitchen sink. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011540.001197214@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 26 -------------------------- arch/x86/kernel/cpu/bugs.c | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/fpu/bugs.c | 2 +- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/fpu/init.c | 2 +- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/fpu/xstate.c | 1 - arch/x86/kernel/smpboot.c | 2 +- arch/x86/kernel/traps.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/power/cpu.c | 2 +- 12 files changed, 10 insertions(+), 37 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 8df83e887ff6..e69de29bb2d1 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -1,26 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 1994 Linus Torvalds - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes , May 2000 - * x86-64 work by Andi Kleen 2002 - */ - -#ifndef _ASM_X86_FPU_INTERNAL_H -#define _ASM_X86_FPU_INTERNAL_H - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ecfca3bbcd96..6c8a86b58020 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b3410f1ac217..486dc8c1d388 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -42,7 +42,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c index 2954fab15e51..794e70151203 100644 --- a/arch/x86/kernel/fpu/bugs.c +++ b/arch/x86/kernel/fpu/bugs.c @@ -2,7 +2,7 @@ /* * x86 FPU bug checks: */ -#include +#include /* * Boot time CPU/FPU FDIV bug detection code: diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 739728889b54..9bb0c1c45e27 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -6,7 +6,7 @@ * General FPU state handling cleanups * Gareth Hughes , May 2000 */ -#include +#include #include #include #include diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index d420d29e58be..23791355ca67 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -2,7 +2,7 @@ /* * x86 FPU boot time init code: */ -#include +#include #include #include diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 3d8ed45da166..01a1d97c3cb6 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -5,7 +5,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index f0305b2b227f..b022df95a302 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -13,7 +13,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 85f6e242b6b4..2577ed3b5e6b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -70,7 +70,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a58800973aed..bae7582c58f5 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -48,7 +48,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 116b08904ac3..9a979279a37b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 6665f8802098..9f2b251e83c5 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include From 079ec41b22b952cdf3126527d735e373c9125f6d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:41 +0200 Subject: [PATCH 049/110] x86/fpu: Provide a proper function for ex_handler_fprestore() To make upcoming changes for support of dynamically enabled features simpler, provide a proper function for the exception handler which removes exposure of FPU internals. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211015011540.053515012@linutronix.de --- arch/x86/include/asm/fpu/api.h | 4 +--- arch/x86/kernel/fpu/core.c | 5 +++++ arch/x86/kernel/fpu/internal.h | 2 ++ arch/x86/mm/extable.c | 5 ++--- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index c691f079a56e..9263d708dff9 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -113,6 +113,7 @@ static inline void update_pasid(void) { } /* Trap handling */ extern int fpu__exception_code(struct fpu *fpu, int trap_nr); extern void fpu_sync_fpstate(struct fpu *fpu); +extern void fpu_reset_from_exception_fixup(void); /* Boot, hotplug and resume */ extern void fpu__init_cpu(void); @@ -129,9 +130,6 @@ static inline void fpstate_init_soft(struct swregs_state *soft) {} /* State tracking */ DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); -/* fpstate */ -extern union fpregs_state init_fpstate; - /* fpstate-related functions which are exported to KVM */ extern void fpu_init_fpstate_user(struct fpu *fpu); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 9bb0c1c45e27..79f2e8ddd10c 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -155,6 +155,11 @@ void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) } } +void fpu_reset_from_exception_fixup(void) +{ + restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); +} + #if IS_ENABLED(CONFIG_KVM) void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask) { diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h index bd7f813242dd..479f2db6e160 100644 --- a/arch/x86/kernel/fpu/internal.h +++ b/arch/x86/kernel/fpu/internal.h @@ -2,6 +2,8 @@ #ifndef __X86_KERNEL_FPU_INTERNAL_H #define __X86_KERNEL_FPU_INTERNAL_H +extern union fpregs_state init_fpstate; + /* CPU feature check wrappers */ static __always_inline __pure bool use_xsave(void) { diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 79c2e30d93ae..5cd2a88930a9 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -4,8 +4,7 @@ #include #include -#include -#include +#include #include #include #include @@ -48,7 +47,7 @@ static bool ex_handler_fprestore(const struct exception_table_entry *fixup, WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", (void *)instruction_pointer(regs)); - restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); + fpu_reset_from_exception_fixup(); return true; } From bf5d00470787067ff27593c6a097b5eb6e01168e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 03:16:17 +0200 Subject: [PATCH 050/110] x86/fpu: Replace KVMs home brewed FPU copy to user Similar to the copy from user function the FPU core has this already implemented with all bells and whistles. Get rid of the duplicated code and use the core functionality. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: kvm@vger.kernel.org Link: https://lkml.kernel.org/r/20211015011539.244101845@linutronix.de --- arch/x86/include/asm/fpu/api.h | 1 + arch/x86/kernel/fpu/core.c | 18 +++++++++++ arch/x86/kvm/x86.c | 56 ++-------------------------------- 3 files changed, 22 insertions(+), 53 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 9263d708dff9..5ac5e4596b53 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -137,5 +137,6 @@ extern void fpu_init_fpstate_user(struct fpu *fpu); extern void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask); extern int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, u32 *pkru); +extern void fpu_copy_fpstate_to_kvm_uabi(struct fpu *fpu, void *buf, unsigned int size, u32 pkru); #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 79f2e8ddd10c..ac540a7d410e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -184,6 +184,24 @@ void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask) } EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpu); +void fpu_copy_fpstate_to_kvm_uabi(struct fpu *fpu, void *buf, + unsigned int size, u32 pkru) +{ + union fpregs_state *kstate = &fpu->state; + union fpregs_state *ustate = buf; + struct membuf mb = { .p = buf, .left = size }; + + if (cpu_feature_enabled(X86_FEATURE_XSAVE)) { + __copy_xstate_to_uabi_buf(mb, &kstate->xsave, pkru, + XSTATE_COPY_XSAVE); + } else { + memcpy(&ustate->fxsave, &kstate->fxsave, sizeof(ustate->fxsave)); + /* Make it restorable on a XSAVE enabled host */ + ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE; + } +} +EXPORT_SYMBOL_GPL(fpu_copy_fpstate_to_kvm_uabi); + int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, u32 *vpkru) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cdc19b1d5775..a18d4670e640 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4702,65 +4702,15 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, return 0; } -static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) -{ - struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; - u64 xstate_bv = xsave->header.xfeatures; - u64 valid; - - /* - * Copy legacy XSAVE area, to avoid complications with CPUID - * leaves 0 and 1 in the loop below. - */ - memcpy(dest, xsave, XSAVE_HDR_OFFSET); - - /* Set XSTATE_BV */ - xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE; - *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv; - - /* - * Copy each region from the possibly compacted offset to the - * non-compacted offset. - */ - valid = xstate_bv & ~XFEATURE_MASK_FPSSE; - while (valid) { - u32 size, offset, ecx, edx; - u64 xfeature_mask = valid & -valid; - int xfeature_nr = fls64(xfeature_mask) - 1; - void *src; - - cpuid_count(XSTATE_CPUID, xfeature_nr, - &size, &offset, &ecx, &edx); - - if (xfeature_nr == XFEATURE_PKRU) { - memcpy(dest + offset, &vcpu->arch.pkru, - sizeof(vcpu->arch.pkru)); - } else { - src = get_xsave_addr(xsave, xfeature_nr); - if (src) - memcpy(dest + offset, src, size); - } - - valid -= xfeature_mask; - } -} - static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { if (!vcpu->arch.guest_fpu) return; - if (boot_cpu_has(X86_FEATURE_XSAVE)) { - memset(guest_xsave, 0, sizeof(struct kvm_xsave)); - fill_xsave((u8 *) guest_xsave->region, vcpu); - } else { - memcpy(guest_xsave->region, - &vcpu->arch.guest_fpu->state.fxsave, - sizeof(struct fxregs_state)); - *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = - XFEATURE_MASK_FPSSE; - } + fpu_copy_fpstate_to_kvm_uabi(vcpu->arch.guest_fpu, guest_xsave->region, + sizeof(guest_xsave->region), + vcpu->arch.pkru); } static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, From 87d0e5be0fac322f4415128def9f16a71a267a40 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:27 +0200 Subject: [PATCH 051/110] x86/fpu: Provide struct fpstate New xfeatures will not longer be automatically stored in the regular XSAVE buffer in thread_struct::fpu. The kernel will provide the default sized buffer for storing the regular features up to AVX512 in thread_struct::fpu and if a task requests to use one of the new features then the register storage has to be extended. The state will be accessed via a pointer in thread_struct::fpu which defaults to the builtin storage and can be switched when extended storage is required. To avoid conditionals all over the code, create a new container for the register storage which will gain other information, e.g. size, feature masks etc., later. For now it just contains the register storage, which gives it exactly the same layout as the exiting fpu::state. Stick fpu::state and the new fpu::__fpstate into an anonymous union and initialize the pointer. Add build time checks to validate that both are at the same place and have the same size. This allows step by step conversion of all users. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.234458659@linutronix.de --- arch/x86/include/asm/fpu/types.h | 20 +++++++++++++++++++- arch/x86/include/asm/processor.h | 4 ++-- arch/x86/kernel/fpu/core.c | 11 ++++++++++- arch/x86/kernel/fpu/init.c | 9 +++++++-- arch/x86/kernel/fpu/internal.h | 1 + 5 files changed, 39 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index f5a38a5f3ae1..3bb6277efbb5 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -309,6 +309,13 @@ union fpregs_state { u8 __padding[PAGE_SIZE]; }; +struct fpstate { + /* @regs: The register state union for all supported formats */ + union fpregs_state regs; + + /* @regs is dynamically sized! Don't add anything after @regs! */ +} __aligned(64); + /* * Highest level per task FPU state data structure that * contains the FPU register state plus various FPU @@ -336,6 +343,14 @@ struct fpu { */ unsigned long avx512_timestamp; + /* + * @fpstate: + * + * Pointer to the active struct fpstate. Initialized to + * point at @__fpstate below. + */ + struct fpstate *fpstate; + /* * @state: * @@ -345,7 +360,10 @@ struct fpu { * copy. If the task context-switches away then they get * saved here and represent the FPU state. */ - union fpregs_state state; + union { + struct fpstate __fpstate; + union fpregs_state state; + }; /* * WARNING: 'state' is dynamically-sized. Do not put * anything after it here. diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 9ad2acaaae9b..4519d334bbdb 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -537,11 +537,11 @@ struct thread_struct { */ }; -/* Whitelist the FPU state from the task_struct for hardened usercopy. */ +/* Whitelist the FPU register state from the task_struct for hardened usercopy. */ static inline void arch_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { - *offset = offsetof(struct thread_struct, fpu.state); + *offset = offsetof(struct thread_struct, fpu.__fpstate.regs); *size = fpu_kernel_xstate_size; } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index ac540a7d410e..d7643115a7ee 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -337,10 +337,17 @@ void fpstate_init_user(union fpregs_state *state) fpstate_init_fstate(&state->fsave); } +void fpstate_reset(struct fpu *fpu) +{ + /* Set the fpstate pointer to the default fpstate */ + fpu->fpstate = &fpu->__fpstate; +} + #if IS_ENABLED(CONFIG_KVM) void fpu_init_fpstate_user(struct fpu *fpu) { - fpstate_init_user(&fpu->state); + fpstate_reset(fpu); + fpstate_init_user(&fpu->fpstate->regs); } EXPORT_SYMBOL_GPL(fpu_init_fpstate_user); #endif @@ -354,6 +361,8 @@ int fpu_clone(struct task_struct *dst) /* The new task's FPU state cannot be valid in the hardware. */ dst_fpu->last_cpu = -1; + fpstate_reset(dst_fpu); + if (!cpu_feature_enabled(X86_FEATURE_FPU)) return 0; diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 23791355ca67..31ecbfba9ff7 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -165,7 +165,7 @@ static void __init fpu__init_task_struct_size(void) * Subtract off the static size of the register state. * It potentially has a bunch of padding. */ - task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state); + task_size -= sizeof(current->thread.fpu.__fpstate.regs); /* * Add back the dynamically-calculated register state @@ -180,10 +180,14 @@ static void __init fpu__init_task_struct_size(void) * you hit a compile error here, check the structure to * see if something got added to the end. */ - CHECK_MEMBER_AT_END_OF(struct fpu, state); + CHECK_MEMBER_AT_END_OF(struct fpu, __fpstate); CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu); CHECK_MEMBER_AT_END_OF(struct task_struct, thread); + BUILD_BUG_ON(sizeof(struct fpstate) != sizeof(union fpregs_state)); + BUILD_BUG_ON(offsetof(struct thread_struct, fpu.state) != + offsetof(struct thread_struct, fpu.__fpstate)); + arch_task_struct_size = task_size; } @@ -220,6 +224,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) */ void __init fpu__init_system(struct cpuinfo_x86 *c) { + fpstate_reset(¤t->thread.fpu); fpu__init_system_early_generic(c); /* diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h index 479f2db6e160..63bd75fe95a8 100644 --- a/arch/x86/kernel/fpu/internal.h +++ b/arch/x86/kernel/fpu/internal.h @@ -26,5 +26,6 @@ extern void fpu__init_prepare_fx_sw_frame(void); /* Used in init.c */ extern void fpstate_init_user(union fpregs_state *state); +extern void fpstate_reset(struct fpu *fpu); #endif From f83ac56acdad0815366bb541b6cc9d24f6cea2b2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:28 +0200 Subject: [PATCH 052/110] x86/fpu: Convert fpstate_init() to struct fpstate Convert fpstate_init() and related code to the new register storage mechanism in preparation for dynamically sized buffers. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.292157401@linutronix.de --- arch/x86/kernel/fpu/core.c | 44 +++++++++++++++++----------------- arch/x86/kernel/fpu/internal.h | 4 ++-- arch/x86/kernel/fpu/signal.c | 2 +- arch/x86/kernel/fpu/xstate.c | 12 +++++----- 4 files changed, 31 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index d7643115a7ee..19e14b5c519d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -29,7 +29,7 @@ * Represents the initial FPU state. It's mostly (but not completely) zeroes, * depending on the FPU hardware format: */ -union fpregs_state init_fpstate __ro_after_init; +struct fpstate init_fpstate __ro_after_init; /* * Track whether the kernel is using the FPU state @@ -157,7 +157,7 @@ void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) void fpu_reset_from_exception_fixup(void) { - restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); + restore_fpregs_from_fpstate(&init_fpstate.regs, xfeatures_mask_fpstate()); } #if IS_ENABLED(CONFIG_KVM) @@ -297,24 +297,24 @@ static inline unsigned int init_fpstate_copy_size(void) return fpu_kernel_xstate_size; /* XSAVE(S) just needs the legacy and the xstate header part */ - return sizeof(init_fpstate.xsave); + return sizeof(init_fpstate.regs.xsave); } -static inline void fpstate_init_fxstate(struct fxregs_state *fx) +static inline void fpstate_init_fxstate(struct fpstate *fpstate) { - fx->cwd = 0x37f; - fx->mxcsr = MXCSR_DEFAULT; + fpstate->regs.fxsave.cwd = 0x37f; + fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT; } /* * Legacy x87 fpstate state init: */ -static inline void fpstate_init_fstate(struct fregs_state *fp) +static inline void fpstate_init_fstate(struct fpstate *fpstate) { - fp->cwd = 0xffff037fu; - fp->swd = 0xffff0000u; - fp->twd = 0xffffffffu; - fp->fos = 0xffff0000u; + fpstate->regs.fsave.cwd = 0xffff037fu; + fpstate->regs.fsave.swd = 0xffff0000u; + fpstate->regs.fsave.twd = 0xffffffffu; + fpstate->regs.fsave.fos = 0xffff0000u; } /* @@ -322,19 +322,19 @@ static inline void fpstate_init_fstate(struct fregs_state *fp) * 1) Early boot to setup init_fpstate for non XSAVE systems * 2) fpu_init_fpstate_user() which is invoked from KVM */ -void fpstate_init_user(union fpregs_state *state) +void fpstate_init_user(struct fpstate *fpstate) { if (!cpu_feature_enabled(X86_FEATURE_FPU)) { - fpstate_init_soft(&state->soft); + fpstate_init_soft(&fpstate->regs.soft); return; } - xstate_init_xcomp_bv(&state->xsave, xfeatures_mask_uabi()); + xstate_init_xcomp_bv(&fpstate->regs.xsave, xfeatures_mask_uabi()); if (cpu_feature_enabled(X86_FEATURE_FXSR)) - fpstate_init_fxstate(&state->fxsave); + fpstate_init_fxstate(fpstate); else - fpstate_init_fstate(&state->fsave); + fpstate_init_fstate(fpstate); } void fpstate_reset(struct fpu *fpu) @@ -347,7 +347,7 @@ void fpstate_reset(struct fpu *fpu) void fpu_init_fpstate_user(struct fpu *fpu) { fpstate_reset(fpu); - fpstate_init_user(&fpu->fpstate->regs); + fpstate_init_user(fpu->fpstate); } EXPORT_SYMBOL_GPL(fpu_init_fpstate_user); #endif @@ -378,7 +378,7 @@ int fpu_clone(struct task_struct *dst) */ if (dst->flags & (PF_KTHREAD | PF_IO_WORKER)) { /* Clear out the minimal state */ - memcpy(&dst_fpu->state, &init_fpstate, + memcpy(&dst_fpu->state, &init_fpstate.regs, init_fpstate_copy_size()); return 0; } @@ -435,11 +435,11 @@ void fpu__drop(struct fpu *fpu) static inline void restore_fpregs_from_init_fpstate(u64 features_mask) { if (use_xsave()) - os_xrstor(&init_fpstate.xsave, features_mask); + os_xrstor(&init_fpstate.regs.xsave, features_mask); else if (use_fxsr()) - fxrstor(&init_fpstate.fxsave); + fxrstor(&init_fpstate.regs.fxsave); else - frstor(&init_fpstate.fsave); + frstor(&init_fpstate.regs.fsave); pkru_write_default(); } @@ -466,7 +466,7 @@ static void fpu_reset_fpstate(void) * user space as PKRU is eagerly written in switch_to() and * flush_thread(). */ - memcpy(&fpu->state, &init_fpstate, init_fpstate_copy_size()); + memcpy(&fpu->state, &init_fpstate.regs, init_fpstate_copy_size()); set_thread_flag(TIF_NEED_FPU_LOAD); fpregs_unlock(); } diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h index 63bd75fe95a8..e1d8a352f12d 100644 --- a/arch/x86/kernel/fpu/internal.h +++ b/arch/x86/kernel/fpu/internal.h @@ -2,7 +2,7 @@ #ifndef __X86_KERNEL_FPU_INTERNAL_H #define __X86_KERNEL_FPU_INTERNAL_H -extern union fpregs_state init_fpstate; +extern struct fpstate init_fpstate; /* CPU feature check wrappers */ static __always_inline __pure bool use_xsave(void) @@ -25,7 +25,7 @@ static __always_inline __pure bool use_fxsr(void) extern void fpu__init_prepare_fx_sw_frame(void); /* Used in init.c */ -extern void fpstate_init_user(union fpregs_state *state); +extern void fpstate_init_user(struct fpstate *fpstate); extern void fpstate_reset(struct fpu *fpu); #endif diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 274cd58b3dc8..416a110f2196 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -243,7 +243,7 @@ static int __restore_fpregs_from_user(void __user *buf, u64 xrestore, ret = fxrstor_from_user_sigframe(buf); if (!ret && unlikely(init_bv)) - os_xrstor(&init_fpstate.xsave, init_bv); + os_xrstor(&init_fpstate.regs.xsave, init_bv); return ret; } else if (use_fxsr()) { return fxrstor_from_user_sigframe(buf); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b022df95a302..937ad5b394ca 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -408,12 +408,12 @@ static void __init setup_init_fpu_buf(void) setup_xstate_features(); print_xstate_features(); - xstate_init_xcomp_bv(&init_fpstate.xsave, xfeatures_mask_all); + xstate_init_xcomp_bv(&init_fpstate.regs.xsave, xfeatures_mask_all); /* * Init all the features state with header.xfeatures being 0x0 */ - os_xrstor_booting(&init_fpstate.xsave); + os_xrstor_booting(&init_fpstate.regs.xsave); /* * All components are now in init state. Read the state back so @@ -431,7 +431,7 @@ static void __init setup_init_fpu_buf(void) * state is all zeroes or if not to add the necessary handling * here. */ - fxsave(&init_fpstate.fxsave); + fxsave(&init_fpstate.regs.fxsave); } static int xfeature_uncompacted_offset(int xfeature_nr) @@ -672,11 +672,11 @@ static unsigned int __init get_xsave_size(void) */ static bool __init is_supported_xstate_size(unsigned int test_xstate_size) { - if (test_xstate_size <= sizeof(union fpregs_state)) + if (test_xstate_size <= sizeof(init_fpstate.regs)) return true; pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n", - sizeof(union fpregs_state), test_xstate_size); + sizeof(init_fpstate.regs), test_xstate_size); return false; } @@ -981,7 +981,7 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, u32 pkru_val, enum xstate_copy_mode copy_mode) { const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); - struct xregs_state *xinit = &init_fpstate.xsave; + struct xregs_state *xinit = &init_fpstate.regs.xsave; struct xstate_header header; unsigned int zerofrom; u64 mask; From 18b3fa1ad15fa8d777ac32f117553cce1a968460 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:30 +0200 Subject: [PATCH 053/110] x86/fpu: Convert restore_fpregs_from_fpstate() to struct fpstate Convert restore_fpregs_from_fpstate() and related code to the new register storage mechanism in preparation for dynamically sized buffers. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.347395546@linutronix.de --- arch/x86/include/asm/fpu/signal.h | 2 +- arch/x86/kernel/fpu/context.h | 2 +- arch/x86/kernel/fpu/core.c | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 9a63a21c219d..22b0273a8bf1 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -40,7 +40,7 @@ extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size extern void fpu__clear_user_states(struct fpu *fpu); extern bool fpu__restore_sig(void __user *buf, int ia32_frame); -extern void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); +extern void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask); extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h index e652282842c8..f8f510519688 100644 --- a/arch/x86/kernel/fpu/context.h +++ b/arch/x86/kernel/fpu/context.h @@ -74,7 +74,7 @@ static inline void fpregs_restore_userregs(void) */ mask = xfeatures_mask_restore_user() | xfeatures_mask_supervisor(); - restore_fpregs_from_fpstate(&fpu->state, mask); + restore_fpregs_from_fpstate(fpu->fpstate, mask); fpregs_activate(fpu); fpu->last_cpu = cpu; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 19e14b5c519d..03926bf00971 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -129,7 +129,7 @@ void save_fpregs_to_fpstate(struct fpu *fpu) frstor(&fpu->state.fsave); } -void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) +void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) { /* * AMD K7/K8 and later CPUs up to Zen don't save/restore @@ -146,18 +146,18 @@ void restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) } if (use_xsave()) { - os_xrstor(&fpstate->xsave, mask); + os_xrstor(&fpstate->regs.xsave, mask); } else { if (use_fxsr()) - fxrstor(&fpstate->fxsave); + fxrstor(&fpstate->regs.fxsave); else - frstor(&fpstate->fsave); + frstor(&fpstate->regs.fsave); } } void fpu_reset_from_exception_fixup(void) { - restore_fpregs_from_fpstate(&init_fpstate.regs, xfeatures_mask_fpstate()); + restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); } #if IS_ENABLED(CONFIG_KVM) @@ -176,7 +176,7 @@ void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask) if (rstor) { restore_mask &= xfeatures_mask_fpstate(); - restore_fpregs_from_fpstate(&rstor->state, restore_mask); + restore_fpregs_from_fpstate(rstor->fpstate, restore_mask); } fpregs_mark_activate(); From 087df48c298c1cb829f4cd468d90f93234b1bc44 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:31 +0200 Subject: [PATCH 054/110] x86/fpu: Replace KVMs xstate component clearing In order to prepare for the support of dynamically enabled FPU features, move the clearing of xstate components to the FPU core code. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: kvm@vger.kernel.org Link: https://lkml.kernel.org/r/20211013145322.399567049@linutronix.de --- arch/x86/include/asm/fpu/api.h | 1 + arch/x86/include/asm/fpu/xstate.h | 1 - arch/x86/kernel/fpu/xstate.c | 12 +++++++++++- arch/x86/kernel/fpu/xstate.h | 2 ++ arch/x86/kvm/x86.c | 14 +++++--------- 5 files changed, 19 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 5ac5e4596b53..a97cf3e5887b 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -132,6 +132,7 @@ DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); /* fpstate-related functions which are exported to KVM */ extern void fpu_init_fpstate_user(struct fpu *fpu); +extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature); /* KVM specific functions */ extern void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask); diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index b8cebc0ee420..fb329bbfe89f 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -128,7 +128,6 @@ extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask); -void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); int xfeature_size(int xfeature_nr); int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 937ad5b394ca..b1409a769ba5 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -908,7 +908,6 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) return __raw_xsave_addr(xsave, xfeature_nr); } -EXPORT_SYMBOL_GPL(get_xsave_addr); #ifdef CONFIG_ARCH_HAS_PKEYS @@ -1257,6 +1256,17 @@ void xrstors(struct xregs_state *xstate, u64 mask) WARN_ON_ONCE(err); } +#if IS_ENABLED(CONFIG_KVM) +void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature) +{ + void *addr = get_xsave_addr(&fps->regs.xsave, xfeature); + + if (addr) + memset(addr, 0, xstate_sizes[xfeature]); +} +EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); +#endif + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index bb6d7d298d2a..99f8cfec719d 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -21,6 +21,8 @@ extern void __copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsav extern void fpu__init_cpu_xstate(void); extern void fpu__init_system_xstate(void); +extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); + /* XSAVE/XRSTOR wrapper functions */ #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a18d4670e640..96936a2e8267 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10705,7 +10705,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.apf.halted = false; if (vcpu->arch.guest_fpu && kvm_mpx_supported()) { - void *mpx_state_buffer; + struct fpstate *fpstate = vcpu->arch.guest_fpu->fpstate; /* * To avoid have the INIT path from kvm_apic_has_events() that be @@ -10713,14 +10713,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) */ if (init_event) kvm_put_guest_fpu(vcpu); - mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_BNDREGS); - if (mpx_state_buffer) - memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state)); - mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_BNDCSR); - if (mpx_state_buffer) - memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); + + fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS); + fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR); + if (init_event) kvm_load_guest_fpu(vcpu); } From 1c57572d754fc54e0b8ac0df5350969ce6292d12 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:33 +0200 Subject: [PATCH 055/110] x86/KVM: Convert to fpstate Convert KVM code to the new register storage mechanism in preparation for dynamically sized buffers. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Acked-by: Paolo Bonzini Cc: kvm@vger.kernel.org Link: https://lkml.kernel.org/r/20211013145322.451439983@linutronix.de --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 96936a2e8267..0eb1021e0275 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10403,7 +10403,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu->state.fxsave; + fxsave = &vcpu->arch.guest_fpu->fpstate->regs.fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; fpu->fsw = fxsave->swd; @@ -10426,7 +10426,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu->state.fxsave; + fxsave = &vcpu->arch.guest_fpu->fpstate->regs.fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; From cceb496420fa11a6e11989abc68b8e7564dc40f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:34 +0200 Subject: [PATCH 056/110] x86/fpu: Convert tracing to fpstate Convert FPU tracing code to the new register storage mechanism in preparation for dynamically sized buffers. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.503327333@linutronix.de --- arch/x86/include/asm/trace/fpu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 879b77792f94..4645a6334063 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -22,8 +22,8 @@ DECLARE_EVENT_CLASS(x86_fpu, __entry->fpu = fpu; __entry->load_fpu = test_thread_flag(TIF_NEED_FPU_LOAD); if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { - __entry->xfeatures = fpu->state.xsave.header.xfeatures; - __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; + __entry->xfeatures = fpu->fpstate->regs.xsave.header.xfeatures; + __entry->xcomp_bv = fpu->fpstate->regs.xsave.header.xcomp_bv; } ), TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx", From caee31a36c33ed7788d0b3d93a663860157f6c55 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:36 +0200 Subject: [PATCH 057/110] x86/fpu/regset: Convert to fpstate Convert regset related code to the new register storage mechanism in preparation for dynamically sized buffers. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.555239736@linutronix.de --- arch/x86/kernel/fpu/regset.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 01a1d97c3cb6..ec777793d890 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -78,8 +78,8 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, sync_fpstate(fpu); if (!use_xsave()) { - return membuf_write(&to, &fpu->state.fxsave, - sizeof(fpu->state.fxsave)); + return membuf_write(&to, &fpu->fpstate->regs.fxsave, + sizeof(fpu->fpstate->regs.fxsave)); } copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_FX); @@ -114,15 +114,15 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, fpu_force_restore(fpu); /* Copy the state */ - memcpy(&fpu->state.fxsave, &newstate, sizeof(newstate)); + memcpy(&fpu->fpstate->regs.fxsave, &newstate, sizeof(newstate)); /* Clear xmm8..15 */ - BUILD_BUG_ON(sizeof(fpu->state.fxsave.xmm_space) != 16 * 16); - memset(&fpu->state.fxsave.xmm_space[8], 0, 8 * 16); + BUILD_BUG_ON(sizeof(fpu->__fpstate.regs.fxsave.xmm_space) != 16 * 16); + memset(&fpu->fpstate->regs.fxsave.xmm_space[8], 0, 8 * 16); /* Mark FP and SSE as in use when XSAVE is enabled */ if (use_xsave()) - fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; + fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; return 0; } @@ -168,7 +168,8 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, } fpu_force_restore(fpu); - ret = copy_uabi_from_kernel_to_xstate(&fpu->state.xsave, kbuf ?: tmpbuf); + ret = copy_uabi_from_kernel_to_xstate(&fpu->fpstate->regs.xsave, + kbuf ?: tmpbuf); out: vfree(tmpbuf); @@ -287,7 +288,7 @@ static void __convert_from_fxsr(struct user_i387_ia32_struct *env, void convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) { - __convert_from_fxsr(env, tsk, &tsk->thread.fpu.state.fxsave); + __convert_from_fxsr(env, tsk, &tsk->thread.fpu.fpstate->regs.fxsave); } void convert_to_fxsr(struct fxregs_state *fxsave, @@ -330,7 +331,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, return fpregs_soft_get(target, regset, to); if (!cpu_feature_enabled(X86_FEATURE_FXSR)) { - return membuf_write(&to, &fpu->state.fsave, + return membuf_write(&to, &fpu->fpstate->regs.fsave, sizeof(struct fregs_state)); } @@ -341,7 +342,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, copy_xstate_to_uabi_buf(mb, target, XSTATE_COPY_FP); fx = &fxsave; } else { - fx = &fpu->state.fxsave; + fx = &fpu->fpstate->regs.fxsave; } __convert_from_fxsr(&env, target, fx); @@ -370,16 +371,16 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, fpu_force_restore(fpu); if (cpu_feature_enabled(X86_FEATURE_FXSR)) - convert_to_fxsr(&fpu->state.fxsave, &env); + convert_to_fxsr(&fpu->fpstate->regs.fxsave, &env); else - memcpy(&fpu->state.fsave, &env, sizeof(env)); + memcpy(&fpu->fpstate->regs.fsave, &env, sizeof(env)); /* * Update the header bit in the xsave header, indicating the * presence of FP. */ if (cpu_feature_enabled(X86_FEATURE_XSAVE)) - fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP; + fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FP; return 0; } From 7e049e8b74591038c831e765585ae9038b7880a1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:37 +0200 Subject: [PATCH 058/110] x86/fpu/signal: Convert to fpstate Convert signal related code to the new register storage mechanism in preparation for dynamically sized buffers. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.607370221@linutronix.de --- arch/x86/kernel/fpu/signal.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 416a110f2196..c54c2a3dda44 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -72,13 +72,13 @@ setfx: static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf) { if (use_fxsr()) { - struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; + struct xregs_state *xsave = &tsk->thread.fpu.fpstate->regs.xsave; struct user_i387_ia32_struct env; struct _fpstate_32 __user *fp = buf; fpregs_lock(); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) - fxsave(&tsk->thread.fpu.state.fxsave); + fxsave(&tsk->thread.fpu.fpstate->regs.fxsave); fpregs_unlock(); convert_from_fxsr(&env, tsk); @@ -303,7 +303,7 @@ retry: * been restored from a user buffer directly. */ if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor()) - os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor()); + os_xrstor(&fpu->fpstate->regs.xsave, xfeatures_mask_supervisor()); fpregs_mark_activate(); fpregs_unlock(); @@ -317,6 +317,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; + union fpregs_state *fpregs; u64 user_xfeatures = 0; bool fx_only = false; bool success; @@ -349,6 +350,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, if (__copy_from_user(&env, buf, sizeof(env))) return false; + fpregs = &fpu->fpstate->regs; /* * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is * not modified on context switch and that the xstate is considered @@ -366,7 +368,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, * the right place in memory. It's ia32 mode. Shrug. */ if (xfeatures_mask_supervisor()) - os_xsave(&fpu->state.xsave); + os_xsave(&fpregs->xsave); set_thread_flag(TIF_NEED_FPU_LOAD); } __fpu_invalidate_fpregs_state(fpu); @@ -374,29 +376,29 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, fpregs_unlock(); if (use_xsave() && !fx_only) { - if (copy_sigframe_from_user_to_xstate(&fpu->state.xsave, buf_fx)) + if (copy_sigframe_from_user_to_xstate(&fpregs->xsave, buf_fx)) return false; } else { - if (__copy_from_user(&fpu->state.fxsave, buf_fx, - sizeof(fpu->state.fxsave))) + if (__copy_from_user(&fpregs->fxsave, buf_fx, + sizeof(fpregs->fxsave))) return false; if (IS_ENABLED(CONFIG_X86_64)) { /* Reject invalid MXCSR values. */ - if (fpu->state.fxsave.mxcsr & ~mxcsr_feature_mask) + if (fpregs->fxsave.mxcsr & ~mxcsr_feature_mask) return false; } else { /* Mask invalid bits out for historical reasons (broken hardware). */ - fpu->state.fxsave.mxcsr &= mxcsr_feature_mask; + fpregs->fxsave.mxcsr &= mxcsr_feature_mask; } /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */ if (use_xsave()) - fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; + fpregs->xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; } /* Fold the legacy FP storage */ - convert_to_fxsr(&fpu->state.fxsave, &env); + convert_to_fxsr(&fpregs->fxsave, &env); fpregs_lock(); if (use_xsave()) { @@ -411,10 +413,10 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, */ u64 mask = user_xfeatures | xfeatures_mask_supervisor(); - fpu->state.xsave.header.xfeatures &= mask; - success = !os_xrstor_safe(&fpu->state.xsave, xfeatures_mask_all); + fpregs->xsave.header.xfeatures &= mask; + success = !os_xrstor_safe(&fpregs->xsave, xfeatures_mask_all); } else { - success = !fxrstor_safe(&fpu->state.fxsave); + success = !fxrstor_safe(&fpregs->fxsave); } if (likely(success)) From c20942ce5128ef92e2c451f943ba33462ad2fbc4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:39 +0200 Subject: [PATCH 059/110] x86/fpu/core: Convert to fpstate Convert the rest of the core code to the new register storage mechanism in preparation for dynamically sized buffers. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.659456185@linutronix.de --- arch/x86/include/asm/fpu/api.h | 4 ++-- arch/x86/kernel/fpu/core.c | 44 ++++++++++++++++++---------------- arch/x86/kernel/fpu/init.c | 2 +- arch/x86/kernel/fpu/xstate.c | 2 +- 4 files changed, 27 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index a97cf3e5887b..9ce83148058f 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -50,9 +50,9 @@ static inline void kernel_fpu_begin(void) } /* - * Use fpregs_lock() while editing CPU's FPU registers or fpu->state. + * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate. * A context switch will (and softirq might) save CPU's FPU registers to - * fpu->state and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in + * fpu->fpstate.regs and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in * a random state. * * local_bh_disable() protects against both preemption and soft interrupts diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 03926bf00971..14560fda15c2 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -89,7 +89,7 @@ bool irq_fpu_usable(void) EXPORT_SYMBOL(irq_fpu_usable); /* - * Save the FPU register state in fpu->state. The register state is + * Save the FPU register state in fpu->fpstate->regs. The register state is * preserved. * * Must be called with fpregs_lock() held. @@ -105,19 +105,19 @@ EXPORT_SYMBOL(irq_fpu_usable); void save_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { - os_xsave(&fpu->state.xsave); + os_xsave(&fpu->fpstate->regs.xsave); /* * AVX512 state is tracked here because its use is * known to slow the max clock speed of the core. */ - if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512) + if (fpu->fpstate->regs.xsave.header.xfeatures & XFEATURE_MASK_AVX512) fpu->avx512_timestamp = jiffies; return; } if (likely(use_fxsr())) { - fxsave(&fpu->state.fxsave); + fxsave(&fpu->fpstate->regs.fxsave); return; } @@ -125,8 +125,8 @@ void save_fpregs_to_fpstate(struct fpu *fpu) * Legacy FPU register saving, FNSAVE always clears FPU registers, * so we have to reload them from the memory state. */ - asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); - frstor(&fpu->state.fsave); + asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave)); + frstor(&fpu->fpstate->regs.fsave); } void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) @@ -167,7 +167,8 @@ void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask) if (save) { if (test_thread_flag(TIF_NEED_FPU_LOAD)) { - memcpy(&save->state, ¤t->thread.fpu.state, + memcpy(&save->fpstate->regs, + ¤t->thread.fpu.fpstate->regs, fpu_kernel_xstate_size); } else { save_fpregs_to_fpstate(save); @@ -187,7 +188,7 @@ EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpu); void fpu_copy_fpstate_to_kvm_uabi(struct fpu *fpu, void *buf, unsigned int size, u32 pkru) { - union fpregs_state *kstate = &fpu->state; + union fpregs_state *kstate = &fpu->fpstate->regs; union fpregs_state *ustate = buf; struct membuf mb = { .p = buf, .left = size }; @@ -205,7 +206,7 @@ EXPORT_SYMBOL_GPL(fpu_copy_fpstate_to_kvm_uabi); int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, u32 *vpkru) { - union fpregs_state *kstate = &fpu->state; + union fpregs_state *kstate = &fpu->fpstate->regs; const union fpregs_state *ustate = buf; struct pkru_state *xpkru; int ret; @@ -378,7 +379,7 @@ int fpu_clone(struct task_struct *dst) */ if (dst->flags & (PF_KTHREAD | PF_IO_WORKER)) { /* Clear out the minimal state */ - memcpy(&dst_fpu->state, &init_fpstate.regs, + memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); return 0; } @@ -389,11 +390,12 @@ int fpu_clone(struct task_struct *dst) * child's FPU context, without any memory-to-memory copying. */ fpregs_lock(); - if (test_thread_flag(TIF_NEED_FPU_LOAD)) - memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size); - - else + if (test_thread_flag(TIF_NEED_FPU_LOAD)) { + memcpy(&dst_fpu->fpstate->regs, &src_fpu->fpstate->regs, + fpu_kernel_xstate_size); + } else { save_fpregs_to_fpstate(dst_fpu); + } fpregs_unlock(); trace_x86_fpu_copy_src(src_fpu); @@ -466,7 +468,7 @@ static void fpu_reset_fpstate(void) * user space as PKRU is eagerly written in switch_to() and * flush_thread(). */ - memcpy(&fpu->state, &init_fpstate.regs, init_fpstate_copy_size()); + memcpy(&fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); set_thread_flag(TIF_NEED_FPU_LOAD); fpregs_unlock(); } @@ -493,7 +495,7 @@ void fpu__clear_user_states(struct fpu *fpu) */ if (xfeatures_mask_supervisor() && !fpregs_state_valid(fpu, smp_processor_id())) { - os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor()); + os_xrstor(&fpu->fpstate->regs.xsave, xfeatures_mask_supervisor()); } /* Reset user states in registers. */ @@ -574,11 +576,11 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) * fully reproduce the context of the exception. */ if (boot_cpu_has(X86_FEATURE_FXSR)) { - cwd = fpu->state.fxsave.cwd; - swd = fpu->state.fxsave.swd; + cwd = fpu->fpstate->regs.fxsave.cwd; + swd = fpu->fpstate->regs.fxsave.swd; } else { - cwd = (unsigned short)fpu->state.fsave.cwd; - swd = (unsigned short)fpu->state.fsave.swd; + cwd = (unsigned short)fpu->fpstate->regs.fsave.cwd; + swd = (unsigned short)fpu->fpstate->regs.fsave.swd; } err = swd & ~cwd; @@ -592,7 +594,7 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) unsigned short mxcsr = MXCSR_DEFAULT; if (boot_cpu_has(X86_FEATURE_XMM)) - mxcsr = fpu->state.fxsave.mxcsr; + mxcsr = fpu->fpstate->regs.fxsave.mxcsr; err = ~(mxcsr >> 7) & mxcsr; } diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 31ecbfba9ff7..b524cd053114 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -38,7 +38,7 @@ static void fpu__init_cpu_generic(void) /* Flush out any pending x87 state: */ #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU)) - fpstate_init_soft(¤t->thread.fpu.state.soft); + fpstate_init_soft(¤t->thread.fpu.fpstate->regs.soft); else #endif asm volatile ("fninit"); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b1409a769ba5..ca72a3e9080c 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1094,7 +1094,7 @@ out: void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode copy_mode) { - __copy_xstate_to_uabi_buf(to, &tsk->thread.fpu.state.xsave, + __copy_xstate_to_uabi_buf(to, &tsk->thread.fpu.fpstate->regs.xsave, tsk->thread.pkru, copy_mode); } From 63d6bdf36ce1541e656966604c12ac4d9fc5d1f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:40 +0200 Subject: [PATCH 060/110] x86/math-emu: Convert to fpstate Convert math emulation code to the new register storage mechanism in preparation for dynamically sized buffers. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.711347464@linutronix.de --- arch/x86/math-emu/fpu_aux.c | 2 +- arch/x86/math-emu/fpu_entry.c | 4 ++-- arch/x86/math-emu/fpu_system.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c index 034748459482..d62662bdd460 100644 --- a/arch/x86/math-emu/fpu_aux.c +++ b/arch/x86/math-emu/fpu_aux.c @@ -53,7 +53,7 @@ void fpstate_init_soft(struct swregs_state *soft) void finit(void) { - fpstate_init_soft(¤t->thread.fpu.state.soft); + fpstate_init_soft(¤t->thread.fpu.fpstate->regs.soft); } /* diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 50195e249753..7fe56c594aa6 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -640,7 +640,7 @@ int fpregs_soft_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct swregs_state *s387 = &target->thread.fpu.state.soft; + struct swregs_state *s387 = &target->thread.fpu.fpstate->regs.soft; void *space = s387->st_space; int ret; int offset, other, i, tags, regnr, tag, newtop; @@ -691,7 +691,7 @@ int fpregs_soft_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - struct swregs_state *s387 = &target->thread.fpu.state.soft; + struct swregs_state *s387 = &target->thread.fpu.fpstate->regs.soft; const void *space = s387->st_space; int offset = (S387->ftop & 7) * 10, other = 80 - offset; diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index 9b41391867dc..eec3e4805c75 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h @@ -73,7 +73,7 @@ static inline bool seg_writable(struct desc_struct *d) return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_WRITABLE; } -#define I387 (¤t->thread.fpu.state) +#define I387 (¤t->thread.fpu.fpstate->regs) #define FPU_info (I387->soft.info) #define FPU_CS (*(unsigned short *) &(FPU_info->regs->cs)) From 2f27b5034244c4ebd70c90066defa771a99a5320 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:42 +0200 Subject: [PATCH 061/110] x86/fpu: Remove fpu::state All users converted. Remove it along with the sanity checks. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.765063318@linutronix.de --- arch/x86/include/asm/fpu/types.h | 18 +++++++----------- arch/x86/kernel/fpu/init.c | 4 ---- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 3bb6277efbb5..297e3b4920cb 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -352,20 +352,16 @@ struct fpu { struct fpstate *fpstate; /* - * @state: + * @__fpstate: * - * In-memory copy of all FPU registers that we save/restore - * over context switches. If the task is using the FPU then - * the registers in the FPU are more recent than this state - * copy. If the task context-switches away then they get - * saved here and represent the FPU state. + * Initial in-memory storage for FPU registers which are saved in + * context switch and when the kernel uses the FPU. The registers + * are restored from this storage on return to user space if they + * are not longer containing the tasks FPU register state. */ - union { - struct fpstate __fpstate; - union fpregs_state state; - }; + struct fpstate __fpstate; /* - * WARNING: 'state' is dynamically-sized. Do not put + * WARNING: '__fpstate' is dynamically-sized. Do not put * anything after it here. */ }; diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index b524cd053114..cffbaf491886 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -184,10 +184,6 @@ static void __init fpu__init_task_struct_size(void) CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu); CHECK_MEMBER_AT_END_OF(struct task_struct, thread); - BUILD_BUG_ON(sizeof(struct fpstate) != sizeof(union fpregs_state)); - BUILD_BUG_ON(offsetof(struct thread_struct, fpu.state) != - offsetof(struct thread_struct, fpu.__fpstate)); - arch_task_struct_size = task_size; } From f0cbc8b3cdf7d1c724155cd9cecffe329bb96119 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:43 +0200 Subject: [PATCH 062/110] x86/fpu: Do not leak fpstate pointer on fork If fork fails early then the copied task struct would carry the fpstate pointer of the parent task. Not a problem right now, but later when dynamically allocated buffers are available, keeping the pointer might result in freeing the parent's buffer. Set it to NULL which prevents that. If fork reaches clone_thread(), the pointer will be correctly set to the new task context. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.817101108@linutronix.de --- arch/x86/kernel/process.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 5cd82082353e..c74c7e889e9d 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -87,6 +87,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) #ifdef CONFIG_VM86 dst->thread.vm86 = NULL; #endif + /* Drop the copied pointer to current's fpstate */ + dst->thread.fpu.fpstate = NULL; return 0; } From 2dd8eedc80b184bb16aad697ae60367c5bf07299 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:45 +0200 Subject: [PATCH 063/110] x86/process: Move arch_thread_struct_whitelist() out of line In preparation for dynamically enabled FPU features move the function out of line as the goal is to expose less and not more information. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.869001791@linutronix.de --- arch/x86/include/asm/processor.h | 9 +++------ arch/x86/kernel/fpu/core.c | 10 ++++++++++ arch/x86/kernel/fpu/internal.h | 2 ++ 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 4519d334bbdb..1bd3e8d05604 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -461,9 +461,6 @@ DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr); #endif /* !X86_64 */ -extern unsigned int fpu_kernel_xstate_size; -extern unsigned int fpu_user_xstate_size; - struct perf_event; struct thread_struct { @@ -537,12 +534,12 @@ struct thread_struct { */ }; -/* Whitelist the FPU register state from the task_struct for hardened usercopy. */ +extern void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size); + static inline void arch_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { - *offset = offsetof(struct thread_struct, fpu.__fpstate.regs); - *size = fpu_kernel_xstate_size; + fpu_thread_struct_whitelist(offset, size); } static inline void diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 14560fda15c2..c6df97517ec8 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -404,6 +404,16 @@ int fpu_clone(struct task_struct *dst) return 0; } +/* + * Whitelist the FPU register state embedded into task_struct for hardened + * usercopy. + */ +void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) +{ + *offset = offsetof(struct thread_struct, fpu.__fpstate.regs); + *size = fpu_kernel_xstate_size; +} + /* * Drops current FPU state: deactivates the fpregs and * the fpstate. NOTE: it still leaves previous contents diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h index e1d8a352f12d..5c4f71ff6ae9 100644 --- a/arch/x86/kernel/fpu/internal.h +++ b/arch/x86/kernel/fpu/internal.h @@ -2,6 +2,8 @@ #ifndef __X86_KERNEL_FPU_INTERNAL_H #define __X86_KERNEL_FPU_INTERNAL_H +extern unsigned int fpu_kernel_xstate_size; +extern unsigned int fpu_user_xstate_size; extern struct fpstate init_fpstate; /* CPU feature check wrappers */ From 248452ce21aeb08da2d2af23d88f890886bd379f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:46 +0200 Subject: [PATCH 064/110] x86/fpu: Add size and mask information to fpstate Add state size and feature mask information to the fpstate container. This will be used for runtime checks with the upcoming support for dynamically enabled features and dynamically sized buffers. That avoids conditionals all over the place as the required information is accessible for both default and extended buffers. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.921388806@linutronix.de --- arch/x86/include/asm/fpu/types.h | 12 ++++++++++++ arch/x86/kernel/fpu/core.c | 6 ++++++ arch/x86/kernel/fpu/init.c | 9 +++++++++ arch/x86/kernel/fpu/xstate.c | 3 +++ 4 files changed, 30 insertions(+) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 297e3b4920cb..3a12e97e475d 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -310,6 +310,18 @@ union fpregs_state { }; struct fpstate { + /* @kernel_size: The size of the kernel register image */ + unsigned int size; + + /* @user_size: The size in non-compacted UABI format */ + unsigned int user_size; + + /* @xfeatures: xfeatures for which the storage is sized */ + u64 xfeatures; + + /* @user_xfeatures: xfeatures valid in UABI buffers */ + u64 user_xfeatures; + /* @regs: The register state union for all supported formats */ union fpregs_state regs; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index c6df97517ec8..a8cc20e90751 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -342,6 +342,12 @@ void fpstate_reset(struct fpu *fpu) { /* Set the fpstate pointer to the default fpstate */ fpu->fpstate = &fpu->__fpstate; + + /* Initialize sizes and feature masks */ + fpu->fpstate->size = fpu_kernel_xstate_size; + fpu->fpstate->user_size = fpu_user_xstate_size; + fpu->fpstate->xfeatures = xfeatures_mask_all; + fpu->fpstate->user_xfeatures = xfeatures_mask_uabi(); } #if IS_ENABLED(CONFIG_KVM) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index cffbaf491886..65d763faace9 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -212,6 +212,14 @@ static void __init fpu__init_system_xstate_size_legacy(void) } fpu_user_xstate_size = fpu_kernel_xstate_size; + fpstate_reset(¤t->thread.fpu); +} + +static void __init fpu__init_init_fpstate(void) +{ + /* Bring init_fpstate size and features up to date */ + init_fpstate.size = fpu_kernel_xstate_size; + init_fpstate.xfeatures = xfeatures_mask_all; } /* @@ -233,4 +241,5 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) fpu__init_system_xstate_size_legacy(); fpu__init_system_xstate(); fpu__init_task_struct_size(); + fpu__init_init_fpstate(); } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index ca72a3e9080c..4beb010d19fc 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -720,6 +720,7 @@ static void __init fpu__init_disable_system_xstate(void) xfeatures_mask_all = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); + fpstate_reset(¤t->thread.fpu); } /* @@ -792,6 +793,8 @@ void __init fpu__init_system_xstate(void) if (err) goto out_disable; + fpstate_reset(¤t->thread.fpu); + /* * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: From be31dfdfd75b172af3ddcfa7511cdc3bb7adb25e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:48 +0200 Subject: [PATCH 065/110] x86/fpu: Use fpstate::size Make use of fpstate::size in various places which require the buffer size information for sanity checks or memcpy() sizing. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145322.973518954@linutronix.de --- arch/x86/kernel/fpu/core.c | 13 ++++++------- arch/x86/kernel/fpu/signal.c | 7 +++---- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index a8cc20e90751..cb48c80ce5e3 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -166,13 +166,12 @@ void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask) fpregs_lock(); if (save) { - if (test_thread_flag(TIF_NEED_FPU_LOAD)) { - memcpy(&save->fpstate->regs, - ¤t->thread.fpu.fpstate->regs, - fpu_kernel_xstate_size); - } else { + struct fpstate *fpcur = current->thread.fpu.fpstate; + + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + memcpy(&save->fpstate->regs, &fpcur->regs, fpcur->size); + else save_fpregs_to_fpstate(save); - } } if (rstor) { @@ -398,7 +397,7 @@ int fpu_clone(struct task_struct *dst) fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) { memcpy(&dst_fpu->fpstate->regs, &src_fpu->fpstate->regs, - fpu_kernel_xstate_size); + dst_fpu->fpstate->size); } else { save_fpregs_to_fpstate(dst_fpu); } diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index c54c2a3dda44..aa9329189864 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -313,15 +313,13 @@ retry: static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, bool ia32_fxstate) { - int state_size = fpu_kernel_xstate_size; struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; + bool success, fx_only = false; union fpregs_state *fpregs; + unsigned int state_size; u64 user_xfeatures = 0; - bool fx_only = false; - bool success; - if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; @@ -334,6 +332,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, user_xfeatures = fx_sw_user.xfeatures; } else { user_xfeatures = XFEATURE_MASK_FPSSE; + state_size = fpu->fpstate->size; } if (likely(!ia32_fxstate)) { From 073e627a4537e682c43a1e8df659ce24cbced40c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:49 +0200 Subject: [PATCH 066/110] x86/fpu/xstate: Use fpstate for os_xsave() With variable feature sets XSAVE[S] requires to know the feature set for which the buffer is valid. Retrieve it from fpstate. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145323.025695590@linutronix.de --- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/fpu/signal.c | 4 ++-- arch/x86/kernel/fpu/xstate.h | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index cb48c80ce5e3..f4db70b64e2e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -105,7 +105,7 @@ EXPORT_SYMBOL(irq_fpu_usable); void save_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { - os_xsave(&fpu->fpstate->regs.xsave); + os_xsave(fpu->fpstate); /* * AVX512 state is tracked here because its use is diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index aa9329189864..5aca418490f0 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -349,7 +349,6 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, if (__copy_from_user(&env, buf, sizeof(env))) return false; - fpregs = &fpu->fpstate->regs; /* * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is * not modified on context switch and that the xstate is considered @@ -367,13 +366,14 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, * the right place in memory. It's ia32 mode. Shrug. */ if (xfeatures_mask_supervisor()) - os_xsave(&fpregs->xsave); + os_xsave(fpu->fpstate); set_thread_flag(TIF_NEED_FPU_LOAD); } __fpu_invalidate_fpregs_state(fpu); __cpu_invalidate_fpregs_state(); fpregs_unlock(); + fpregs = &fpu->fpstate->regs; if (use_xsave() && !fx_only) { if (copy_sigframe_from_user_to_xstate(&fpregs->xsave, buf_fx)) return false; diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 99f8cfec719d..24a1479caea2 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -101,16 +101,16 @@ extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features * and command line options. The choice is permanent until the next reboot. */ -static inline void os_xsave(struct xregs_state *xstate) +static inline void os_xsave(struct fpstate *fpstate) { - u64 mask = xfeatures_mask_all; + u64 mask = fpstate->xfeatures; u32 lmask = mask; u32 hmask = mask >> 32; int err; WARN_ON_FPU(!alternatives_patched); - XSTATE_XSAVE(xstate, lmask, hmask, err); + XSTATE_XSAVE(&fpstate->regs.xsave, lmask, hmask, err); /* We should never fault when copying to a kernel buffer: */ WARN_ON_FPU(err); From 0b2d39aa03574eb401cdfaac2f483a6f68173355 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:51 +0200 Subject: [PATCH 067/110] x86/fpu/xstate: Use fpstate for xsave_to_user_sigframe() With dynamically enabled features the sigframe code must know the features which are enabled for the task. Get them from fpstate. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145323.077781448@linutronix.de --- arch/x86/kernel/fpu/xstate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 24a1479caea2..3e9eaf9f7cf3 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -149,7 +149,7 @@ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) * internally, e.g. PKRU. That's user space ABI and also required * to allow the signal handler to modify PKRU. */ - u64 mask = xfeatures_mask_uabi(); + u64 mask = current->thread.fpu.fpstate->user_xfeatures; u32 lmask = mask; u32 hmask = mask >> 32; int err; From ad6ede407aae01d9617e172b27e179ce1046cbfc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:52 +0200 Subject: [PATCH 068/110] x86/fpu: Use fpstate in fpu_copy_kvm_uabi_to_fpstate() Straight forward conversion. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145323.129699950@linutronix.de --- arch/x86/kernel/fpu/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index f4db70b64e2e..052e5efbf9f4 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -205,7 +205,7 @@ EXPORT_SYMBOL_GPL(fpu_copy_fpstate_to_kvm_uabi); int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, u32 *vpkru) { - union fpregs_state *kstate = &fpu->fpstate->regs; + struct fpstate *kstate = fpu->fpstate; const union fpregs_state *ustate = buf; struct pkru_state *xpkru; int ret; @@ -215,25 +215,25 @@ int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, return -EINVAL; if (ustate->fxsave.mxcsr & ~mxcsr_feature_mask) return -EINVAL; - memcpy(&kstate->fxsave, &ustate->fxsave, sizeof(ustate->fxsave)); + memcpy(&kstate->regs.fxsave, &ustate->fxsave, sizeof(ustate->fxsave)); return 0; } if (ustate->xsave.header.xfeatures & ~xcr0) return -EINVAL; - ret = copy_uabi_from_kernel_to_xstate(&kstate->xsave, ustate); + ret = copy_uabi_from_kernel_to_xstate(&kstate->regs.xsave, ustate); if (ret) return ret; /* Retrieve PKRU if not in init state */ - if (kstate->xsave.header.xfeatures & XFEATURE_MASK_PKRU) { - xpkru = get_xsave_addr(&kstate->xsave, XFEATURE_PKRU); + if (kstate->regs.xsave.header.xfeatures & XFEATURE_MASK_PKRU) { + xpkru = get_xsave_addr(&kstate->regs.xsave, XFEATURE_PKRU); *vpkru = xpkru->pkru; } /* Ensure that XCOMP_BV is set up for XSAVES */ - xstate_init_xcomp_bv(&kstate->xsave, xfeatures_mask_uabi()); + xstate_init_xcomp_bv(&kstate->regs.xsave, xfeatures_mask_uabi()); return 0; } EXPORT_SYMBOL_GPL(fpu_copy_kvm_uabi_to_fpstate); From 3ac8d75778fc8c1c22daad9bc674166b862f6f6e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:54 +0200 Subject: [PATCH 069/110] x86/fpu: Use fpstate in __copy_xstate_to_uabi_buf() With dynamically enabled features the copy function must know the features and the size which is valid for the task. Retrieve them from fpstate. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145323.181495492@linutronix.de --- arch/x86/kernel/fpu/core.c | 8 ++++---- arch/x86/kernel/fpu/xstate.c | 11 ++++++----- arch/x86/kernel/fpu/xstate.h | 2 +- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 052e5efbf9f4..04fef4795211 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -187,15 +187,15 @@ EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpu); void fpu_copy_fpstate_to_kvm_uabi(struct fpu *fpu, void *buf, unsigned int size, u32 pkru) { - union fpregs_state *kstate = &fpu->fpstate->regs; + struct fpstate *kstate = fpu->fpstate; union fpregs_state *ustate = buf; struct membuf mb = { .p = buf, .left = size }; if (cpu_feature_enabled(X86_FEATURE_XSAVE)) { - __copy_xstate_to_uabi_buf(mb, &kstate->xsave, pkru, - XSTATE_COPY_XSAVE); + __copy_xstate_to_uabi_buf(mb, kstate, pkru, XSTATE_COPY_XSAVE); } else { - memcpy(&ustate->fxsave, &kstate->fxsave, sizeof(ustate->fxsave)); + memcpy(&ustate->fxsave, &kstate->regs.fxsave, + sizeof(ustate->fxsave)); /* Make it restorable on a XSAVE enabled host */ ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE; } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 4beb010d19fc..54cc0a4db8e8 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -969,7 +969,7 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, /** * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor - * @xsave: The xsave from which to copy + * @fpstate: The fpstate buffer from which to copy * @pkru_val: The PKRU value to store in the PKRU component * @copy_mode: The requested copy mode * @@ -979,11 +979,12 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, * * It supports partial copy but @to.pos always starts from zero. */ -void __copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, +void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, u32 pkru_val, enum xstate_copy_mode copy_mode) { const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); struct xregs_state *xinit = &init_fpstate.regs.xsave; + struct xregs_state *xsave = &fpstate->regs.xsave; struct xstate_header header; unsigned int zerofrom; u64 mask; @@ -1003,7 +1004,7 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, break; case XSTATE_COPY_XSAVE: - header.xfeatures &= xfeatures_mask_uabi(); + header.xfeatures &= fpstate->user_xfeatures; break; } @@ -1046,7 +1047,7 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, * but there is no state to copy from in the compacted * init_fpstate. The gap tracking will zero these states. */ - mask = xfeatures_mask_uabi(); + mask = fpstate->user_xfeatures; for_each_extended_xfeature(i, mask) { /* @@ -1097,7 +1098,7 @@ out: void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode copy_mode) { - __copy_xstate_to_uabi_buf(to, &tsk->thread.fpu.fpstate->regs.xsave, + __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate, tsk->thread.pkru, copy_mode); } diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 3e9eaf9f7cf3..b74c5953558c 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -15,7 +15,7 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; } -extern void __copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, +extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, u32 pkru_val, enum xstate_copy_mode copy_mode); extern void fpu__init_cpu_xstate(void); From 49e4eb4125d506937e52e10c34c8cafd93ab0ed6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Oct 2021 16:55:55 +0200 Subject: [PATCH 070/110] x86/fpu/xstate: Use fpstate for copy_uabi_to_xstate() Prepare for dynamically enabled states per task. The function needs to retrieve the features and sizes which are valid in a fpstate context. Retrieve them from fpstate. Move the function declarations to the core header as they are not required anywhere else. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211013145323.233529986@linutronix.de --- arch/x86/include/asm/fpu/xstate.h | 12 ------------ arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/fpu/regset.c | 5 ++--- arch/x86/kernel/fpu/signal.c | 2 +- arch/x86/kernel/fpu/xstate.c | 18 ++++++++++-------- arch/x86/kernel/fpu/xstate.h | 12 ++++++++++++ 6 files changed, 26 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index fb329bbfe89f..61fcb15d880a 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -129,20 +129,8 @@ extern void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask); int xfeature_size(int xfeature_nr); -int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); -int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); void xsaves(struct xregs_state *xsave, u64 mask); void xrstors(struct xregs_state *xsave, u64 mask); -enum xstate_copy_mode { - XSTATE_COPY_FP, - XSTATE_COPY_FX, - XSTATE_COPY_XSAVE, -}; - -struct membuf; -void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, - enum xstate_copy_mode mode); - #endif diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 04fef4795211..b497ecae9270 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -222,7 +222,7 @@ int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, if (ustate->xsave.header.xfeatures & ~xcr0) return -EINVAL; - ret = copy_uabi_from_kernel_to_xstate(&kstate->regs.xsave, ustate); + ret = copy_uabi_from_kernel_to_xstate(kstate, ustate); if (ret) return ret; diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index ec777793d890..f8c485ab73f5 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -8,11 +8,11 @@ #include #include #include -#include #include "context.h" #include "internal.h" #include "legacy.h" +#include "xstate.h" /* * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, @@ -168,8 +168,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, } fpu_force_restore(fpu); - ret = copy_uabi_from_kernel_to_xstate(&fpu->fpstate->regs.xsave, - kbuf ?: tmpbuf); + ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf); out: vfree(tmpbuf); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 5aca418490f0..935818b0406e 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -375,7 +375,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, fpregs = &fpu->fpstate->regs; if (use_xsave() && !fx_only) { - if (copy_sigframe_from_user_to_xstate(&fpregs->xsave, buf_fx)) + if (copy_sigframe_from_user_to_xstate(fpu->fpstate, buf_fx)) return false; } else { if (__copy_from_user(&fpregs->fxsave, buf_fx, diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 54cc0a4db8e8..4cfd3bcfe76a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -463,10 +463,11 @@ int xfeature_size(int xfeature_nr) } /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ -static int validate_user_xstate_header(const struct xstate_header *hdr) +static int validate_user_xstate_header(const struct xstate_header *hdr, + struct fpstate *fpstate) { /* No unknown or supervisor features may be set */ - if (hdr->xfeatures & ~xfeatures_mask_uabi()) + if (hdr->xfeatures & ~fpstate->user_xfeatures) return -EINVAL; /* Userspace must use the uncompacted format */ @@ -1115,9 +1116,10 @@ static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, } -static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf, +static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, const void __user *ubuf) { + struct xregs_state *xsave = &fpstate->regs.xsave; unsigned int offset, size; struct xstate_header hdr; u64 mask; @@ -1127,7 +1129,7 @@ static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf, if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf)) return -EFAULT; - if (validate_user_xstate_header(&hdr)) + if (validate_user_xstate_header(&hdr, fpstate)) return -EINVAL; /* Validate MXCSR when any of the related features is in use */ @@ -1182,9 +1184,9 @@ static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf, * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] * format and copy to the target thread. Used by ptrace and KVM. */ -int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) +int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf) { - return copy_uabi_to_xstate(xsave, kbuf, NULL); + return copy_uabi_to_xstate(fpstate, kbuf, NULL); } /* @@ -1192,10 +1194,10 @@ int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) * XSAVE[S] format and copy to the target thread. This is called from the * sigreturn() and rt_sigreturn() system calls. */ -int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, +int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void __user *ubuf) { - return copy_uabi_to_xstate(xsave, NULL, ubuf); + return copy_uabi_to_xstate(fpstate, NULL, ubuf); } static bool validate_independent_components(u64 mask) diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index b74c5953558c..379dbfa4f526 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -15,8 +15,20 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; } +enum xstate_copy_mode { + XSTATE_COPY_FP, + XSTATE_COPY_FX, + XSTATE_COPY_XSAVE, +}; + +struct membuf; extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, u32 pkru_val, enum xstate_copy_mode copy_mode); +extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, + enum xstate_copy_mode mode); +extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf); +extern int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void __user *ubuf); + extern void fpu__init_cpu_xstate(void); extern void fpu__init_system_xstate(void); From 5509cc78080d29b23706dbf076d51691b69f3c79 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 00:51:51 +0200 Subject: [PATCH 071/110] x86/fpu/signal: Use fpstate for size and features For dynamically enabled features it's required to get the features which are enabled for that context when restoring from sigframe. The same applies for all signal frame size calculations. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/87ilxz5iew.ffs@tglx --- arch/x86/kernel/fpu/signal.c | 44 ++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 935818b0406e..f9af1747be6e 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -41,7 +41,7 @@ static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, /* Check for the first magic field and other error scenarios. */ if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || fx_sw->xstate_size < min_xstate_size || - fx_sw->xstate_size > fpu_user_xstate_size || + fx_sw->xstate_size > current->thread.fpu.fpstate->user_size || fx_sw->xstate_size > fx_sw->extended_size) goto setfx; @@ -98,7 +98,8 @@ static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf) return true; } -static inline bool save_xstate_epilog(void __user *buf, int ia32_frame) +static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, + unsigned int usize) { struct xregs_state __user *x = buf; struct _fpx_sw_bytes *sw_bytes; @@ -113,7 +114,7 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame) return !err; err |= __put_user(FP_XSTATE_MAGIC2, - (__u32 __user *)(buf + fpu_user_xstate_size)); + (__u32 __user *)(buf + usize)); /* * Read the xfeatures which we copied (directly from the cpu or @@ -171,6 +172,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { struct task_struct *tsk = current; + struct fpstate *fpstate = tsk->thread.fpu.fpstate; int ia32_fxstate = (buf != buf_fx); int ret; @@ -215,7 +217,7 @@ retry: fpregs_unlock(); if (ret) { - if (!__clear_user(buf_fx, fpu_user_xstate_size)) + if (!__clear_user(buf_fx, fpstate->user_size)) goto retry; return false; } @@ -224,17 +226,18 @@ retry: if ((ia32_fxstate || !use_fxsr()) && !save_fsave_header(tsk, buf)) return false; - if (use_fxsr() && !save_xstate_epilog(buf_fx, ia32_fxstate)) + if (use_fxsr() && + !save_xstate_epilog(buf_fx, ia32_fxstate, fpstate->user_size)) return false; return true; } -static int __restore_fpregs_from_user(void __user *buf, u64 xrestore, - bool fx_only) +static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures, + u64 xrestore, bool fx_only) { if (use_xsave()) { - u64 init_bv = xfeatures_mask_uabi() & ~xrestore; + u64 init_bv = ufeatures & ~xrestore; int ret; if (likely(!fx_only)) @@ -265,7 +268,8 @@ static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, retry: fpregs_lock(); pagefault_disable(); - ret = __restore_fpregs_from_user(buf, xrestore, fx_only); + ret = __restore_fpregs_from_user(buf, fpu->fpstate->user_xfeatures, + xrestore, fx_only); pagefault_enable(); if (unlikely(ret)) { @@ -332,7 +336,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, user_xfeatures = fx_sw_user.xfeatures; } else { user_xfeatures = XFEATURE_MASK_FPSSE; - state_size = fpu->fpstate->size; + state_size = fpu->fpstate->user_size; } if (likely(!ia32_fxstate)) { @@ -425,10 +429,11 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, return success; } -static inline int xstate_sigframe_size(void) +static inline unsigned int xstate_sigframe_size(struct fpstate *fpstate) { - return use_xsave() ? fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE : - fpu_user_xstate_size; + unsigned int size = fpstate->user_size; + + return use_xsave() ? size + FP_XSTATE_MAGIC2_SIZE : size; } /* @@ -436,17 +441,19 @@ static inline int xstate_sigframe_size(void) */ bool fpu__restore_sig(void __user *buf, int ia32_frame) { - unsigned int size = xstate_sigframe_size(); struct fpu *fpu = ¤t->thread.fpu; void __user *buf_fx = buf; bool ia32_fxstate = false; bool success = false; + unsigned int size; if (unlikely(!buf)) { fpu__clear_user_states(fpu); return true; } + size = xstate_sigframe_size(fpu->fpstate); + ia32_frame &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); @@ -481,7 +488,7 @@ unsigned long fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, unsigned long *size) { - unsigned long frame_size = xstate_sigframe_size(); + unsigned long frame_size = xstate_sigframe_size(current->thread.fpu.fpstate); *buf_fx = sp = round_down(sp - frame_size, 64); if (ia32_frame && use_fxsr()) { @@ -494,9 +501,12 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, return sp; } -unsigned long fpu__get_fpstate_size(void) +unsigned long __init fpu__get_fpstate_size(void) { - unsigned long ret = xstate_sigframe_size(); + unsigned long ret = fpu_user_xstate_size; + + if (use_xsave()) + ret += FP_XSTATE_MAGIC2_SIZE; /* * This space is needed on (most) 32-bit kernels, or when a 32-bit From 578971f4e228f386ad4d7ce16e979f2ed922de54 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 01:09:29 +0200 Subject: [PATCH 072/110] x86/fpu: Provide struct fpu_config Provide a struct to store information about the maximum supported and the default feature set and buffer sizes for both user and kernel space. This allows quick retrieval of this information for the upcoming support for dynamically enabled features. [ bp: Add vertical spacing between the struct members. ] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211014230739.126107370@linutronix.de --- arch/x86/include/asm/fpu/types.h | 42 ++++++++++++++++++++++++++++++++ arch/x86/kernel/fpu/core.c | 4 +++ 2 files changed, 46 insertions(+) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 3a12e97e475d..a32be07f1418 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -378,4 +378,46 @@ struct fpu { */ }; +/* + * FPU state configuration data. Initialized at boot time. Read only after init. + */ +struct fpu_state_config { + /* + * @max_size: + * + * The maximum size of the register state buffer. Includes all + * supported features except independent managed features. + */ + unsigned int max_size; + + /* + * @default_size: + * + * The default size of the register state buffer. Includes all + * supported features except independent managed features and + * features which have to be requested by user space before usage. + */ + unsigned int default_size; + + /* + * @max_features: + * + * The maximum supported features bitmap. Does not include + * independent managed features. + */ + u64 max_features; + + /* + * @default_features: + * + * The default supported features bitmap. Does not include + * independent managed features and features which have to + * be requested by user space before usage. + */ + u64 default_features; +}; + +/* FPU state configuration information */ +extern struct fpu_state_config fpu_kernel_cfg, fpu_user_cfg; + #endif /* _ASM_X86_FPU_H */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b497ecae9270..3512bb241d95 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -25,6 +25,10 @@ #define CREATE_TRACE_POINTS #include +/* The FPU state configuration data for kernel and user space */ +struct fpu_state_config fpu_kernel_cfg __ro_after_init; +struct fpu_state_config fpu_user_cfg __ro_after_init; + /* * Represents the initial FPU state. It's mostly (but not completely) zeroes, * depending on the FPU hardware format: From 617473acdfe45aa9aa2be23cd5b02da7cd2717f8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 01:09:31 +0200 Subject: [PATCH 073/110] x86/fpu: Cleanup fpu__init_system_xstate_size_legacy() Clean the function up before making changes. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211014230739.184014242@linutronix.de --- arch/x86/kernel/fpu/init.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 65d763faace9..c9293ade321d 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -199,17 +199,12 @@ static void __init fpu__init_system_xstate_size_legacy(void) * Note that xstate sizes might be overwritten later during * fpu__init_system_xstate(). */ - - if (!boot_cpu_has(X86_FEATURE_FPU)) { + if (!cpu_feature_enabled(X86_FEATURE_FPU)) fpu_kernel_xstate_size = sizeof(struct swregs_state); - } else { - if (boot_cpu_has(X86_FEATURE_FXSR)) - fpu_kernel_xstate_size = - sizeof(struct fxregs_state); - else - fpu_kernel_xstate_size = - sizeof(struct fregs_state); - } + else if (cpu_feature_enabled(X86_FEATURE_FXSR)) + fpu_kernel_xstate_size = sizeof(struct fxregs_state); + else + fpu_kernel_xstate_size = sizeof(struct fregs_state); fpu_user_xstate_size = fpu_kernel_xstate_size; fpstate_reset(¤t->thread.fpu); From cd9ae761744912a96d7fd968b9c0173594e3f6be Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 01:09:32 +0200 Subject: [PATCH 074/110] x86/fpu/xstate: Cleanup size calculations The size calculations are partially unreadable gunk. Clean them up. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211014230739.241223689@linutronix.de --- arch/x86/kernel/fpu/xstate.c | 82 ++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 36 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 4cfd3bcfe76a..c5582bd16f7a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -527,7 +527,7 @@ static void __init __xstate_dump_leaves(void) * that our software representation matches what the CPU * tells us about the state's size. */ -static void __init check_xstate_against_struct(int nr) +static bool __init check_xstate_against_struct(int nr) { /* * Ask the CPU for the size of the state. @@ -557,7 +557,9 @@ static void __init check_xstate_against_struct(int nr) ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_LBR))) { WARN_ONCE(1, "no structure for xstate: %d\n", nr); XSTATE_WARN_ON(1); + return false; } + return true; } /* @@ -569,38 +571,44 @@ static void __init check_xstate_against_struct(int nr) * covered by these checks. Only the size of the buffer for task->fpu * is checked here. */ -static void __init do_extra_xstate_size_checks(void) +static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) { - int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; + bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; for_each_extended_xfeature(i, xfeatures_mask_all) { - check_xstate_against_struct(i); + if (!check_xstate_against_struct(i)) + return false; /* * Supervisor state components can be managed only by * XSAVES. */ - if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) - XSTATE_WARN_ON(xfeature_is_supervisor(i)); + if (!compacted && xfeature_is_supervisor(i)) { + XSTATE_WARN_ON(1); + return false; + } /* Align from the end of the previous feature */ if (xfeature_is_aligned(i)) - paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64); + size = ALIGN(size, 64); /* - * The offset of a given state in the non-compacted - * format is given to us in a CPUID leaf. We check - * them for being ordered (increasing offsets) in - * setup_xstate_features(). XSAVES uses compacted format. + * In compacted format the enabled features are packed, + * i.e. disabled features do not occupy space. + * + * In non-compacted format the offsets are fixed and + * disabled states still occupy space in the memory buffer. */ - if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) - paranoid_xstate_size = xfeature_uncompacted_offset(i); + if (!compacted) + size = xfeature_uncompacted_offset(i); /* - * The compacted-format offset always depends on where - * the previous state ended. + * Add the feature size even for non-compacted format + * to make the end result correct */ - paranoid_xstate_size += xfeature_size(i); + size += xfeature_size(i); } - XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size); + XSTATE_WARN_ON(size != kernel_size); + return size == kernel_size; } /* @@ -653,7 +661,7 @@ static unsigned int __init get_xsaves_size_no_independent(void) return size; } -static unsigned int __init get_xsave_size(void) +static unsigned int __init get_xsave_size_user(void) { unsigned int eax, ebx, ecx, edx; /* @@ -684,31 +692,33 @@ static bool __init is_supported_xstate_size(unsigned int test_xstate_size) static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ - unsigned int possible_xstate_size; - unsigned int xsave_size; + unsigned int user_size, kernel_size; - xsave_size = get_xsave_size(); + /* Uncompacted user space size */ + user_size = get_xsave_size_user(); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - possible_xstate_size = get_xsaves_size_no_independent(); + /* + * XSAVES kernel size includes supervisor states and + * uses compacted format. + * + * XSAVE does not support supervisor states so + * kernel and user size is identical. + */ + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + kernel_size = get_xsaves_size_no_independent(); else - possible_xstate_size = xsave_size; + kernel_size = user_size; - /* Ensure we have the space to store all enabled: */ - if (!is_supported_xstate_size(possible_xstate_size)) + /* Ensure we have the space to store all enabled features. */ + if (!is_supported_xstate_size(kernel_size)) return -EINVAL; - /* - * The size is OK, we are definitely going to use xsave, - * make it known to the world that we need more space. - */ - fpu_kernel_xstate_size = possible_xstate_size; - do_extra_xstate_size_checks(); + if (!paranoid_xstate_size_valid(kernel_size)) + return -EINVAL; + + fpu_kernel_xstate_size = kernel_size; + fpu_user_xstate_size = user_size; - /* - * User space is always in standard format. - */ - fpu_user_xstate_size = xsave_size; return 0; } From 2bd264bce238cedbf00bde1f28ad51ba45b9114e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 01:09:34 +0200 Subject: [PATCH 075/110] x86/fpu: Move xstate size to fpu_*_cfg Use the new kernel and user space config storage to store and retrieve the XSTATE buffer sizes. The default and the maximum size are the same for now, but will change when support for dynamically enabled features is added. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211014230739.296830097@linutronix.de --- arch/x86/kernel/fpu/core.c | 8 ++++---- arch/x86/kernel/fpu/init.c | 31 ++++++++++++++----------------- arch/x86/kernel/fpu/internal.h | 2 -- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/fpu/signal.c | 6 +++--- arch/x86/kernel/fpu/xstate.c | 32 ++++++++++++++++++-------------- arch/x86/kernel/fpu/xstate.h | 2 +- 7 files changed, 41 insertions(+), 42 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 3512bb241d95..69abf3a2299d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -298,7 +298,7 @@ void fpu_sync_fpstate(struct fpu *fpu) static inline unsigned int init_fpstate_copy_size(void) { if (!use_xsave()) - return fpu_kernel_xstate_size; + return fpu_kernel_cfg.default_size; /* XSAVE(S) just needs the legacy and the xstate header part */ return sizeof(init_fpstate.regs.xsave); @@ -347,8 +347,8 @@ void fpstate_reset(struct fpu *fpu) fpu->fpstate = &fpu->__fpstate; /* Initialize sizes and feature masks */ - fpu->fpstate->size = fpu_kernel_xstate_size; - fpu->fpstate->user_size = fpu_user_xstate_size; + fpu->fpstate->size = fpu_kernel_cfg.default_size; + fpu->fpstate->user_size = fpu_user_cfg.default_size; fpu->fpstate->xfeatures = xfeatures_mask_all; fpu->fpstate->user_xfeatures = xfeatures_mask_uabi(); } @@ -420,7 +420,7 @@ int fpu_clone(struct task_struct *dst) void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { *offset = offsetof(struct thread_struct, fpu.__fpstate.regs); - *size = fpu_kernel_xstate_size; + *size = fpu_kernel_cfg.default_size; } /* diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index c9293ade321d..58043ed08662 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -133,14 +133,6 @@ static void __init fpu__init_system_generic(void) fpu__init_system_mxcsr(); } -/* - * Size of the FPU context state. All tasks in the system use the - * same context size, regardless of what portion they use. - * This is inherent to the XSAVE architecture which puts all state - * components into a single, continuous memory block: - */ -unsigned int fpu_kernel_xstate_size __ro_after_init; - /* Get alignment of the TYPE. */ #define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test) @@ -171,7 +163,7 @@ static void __init fpu__init_task_struct_size(void) * Add back the dynamically-calculated register state * size. */ - task_size += fpu_kernel_xstate_size; + task_size += fpu_kernel_cfg.default_size; /* * We dynamically size 'struct fpu', so we require that @@ -195,25 +187,30 @@ static void __init fpu__init_task_struct_size(void) */ static void __init fpu__init_system_xstate_size_legacy(void) { + unsigned int size; + /* - * Note that xstate sizes might be overwritten later during - * fpu__init_system_xstate(). + * Note that the size configuration might be overwritten later + * during fpu__init_system_xstate(). */ if (!cpu_feature_enabled(X86_FEATURE_FPU)) - fpu_kernel_xstate_size = sizeof(struct swregs_state); + size = sizeof(struct swregs_state); else if (cpu_feature_enabled(X86_FEATURE_FXSR)) - fpu_kernel_xstate_size = sizeof(struct fxregs_state); + size = sizeof(struct fxregs_state); else - fpu_kernel_xstate_size = sizeof(struct fregs_state); + size = sizeof(struct fregs_state); - fpu_user_xstate_size = fpu_kernel_xstate_size; + fpu_kernel_cfg.max_size = size; + fpu_kernel_cfg.default_size = size; + fpu_user_cfg.max_size = size; + fpu_user_cfg.default_size = size; fpstate_reset(¤t->thread.fpu); } static void __init fpu__init_init_fpstate(void) { /* Bring init_fpstate size and features up to date */ - init_fpstate.size = fpu_kernel_xstate_size; + init_fpstate.size = fpu_kernel_cfg.max_size; init_fpstate.xfeatures = xfeatures_mask_all; } @@ -234,7 +231,7 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) fpu__init_system_generic(); fpu__init_system_xstate_size_legacy(); - fpu__init_system_xstate(); + fpu__init_system_xstate(fpu_kernel_cfg.max_size); fpu__init_task_struct_size(); fpu__init_init_fpstate(); } diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h index 5c4f71ff6ae9..e1d8a352f12d 100644 --- a/arch/x86/kernel/fpu/internal.h +++ b/arch/x86/kernel/fpu/internal.h @@ -2,8 +2,6 @@ #ifndef __X86_KERNEL_FPU_INTERNAL_H #define __X86_KERNEL_FPU_INTERNAL_H -extern unsigned int fpu_kernel_xstate_size; -extern unsigned int fpu_user_xstate_size; extern struct fpstate init_fpstate; /* CPU feature check wrappers */ diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index f8c485ab73f5..437d7c930c0b 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -153,7 +153,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, /* * A whole standard-format XSAVE buffer is needed: */ - if (pos != 0 || count != fpu_user_xstate_size) + if (pos != 0 || count != fpu_user_cfg.max_size) return -EFAULT; if (!kbuf) { diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index f9af1747be6e..fab440369663 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -503,7 +503,7 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long __init fpu__get_fpstate_size(void) { - unsigned long ret = fpu_user_xstate_size; + unsigned long ret = fpu_user_cfg.max_size; if (use_xsave()) ret += FP_XSTATE_MAGIC2_SIZE; @@ -531,12 +531,12 @@ unsigned long __init fpu__get_fpstate_size(void) */ void __init fpu__init_prepare_fx_sw_frame(void) { - int size = fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE; + int size = fpu_user_cfg.default_size + FP_XSTATE_MAGIC2_SIZE; fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = size; fx_sw_reserved.xfeatures = xfeatures_mask_uabi(); - fx_sw_reserved.xstate_size = fpu_user_xstate_size; + fx_sw_reserved.xstate_size = fpu_user_cfg.default_size; if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c5582bd16f7a..94f5e3739ae0 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -77,13 +77,6 @@ static unsigned int xstate_comp_offsets[XFEATURE_MAX] __ro_after_init = static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; -/* - * The XSAVE area of kernel can be in standard or compacted format; - * it is always in standard format for user mode. This is the user - * mode standard format size used for signal and ptrace frames. - */ -unsigned int fpu_user_xstate_size __ro_after_init; - /* * Return whether the system supports a given xfeature. * @@ -716,8 +709,11 @@ static int __init init_xstate_size(void) if (!paranoid_xstate_size_valid(kernel_size)) return -EINVAL; - fpu_kernel_xstate_size = kernel_size; - fpu_user_xstate_size = user_size; + /* Keep it the same for now */ + fpu_kernel_cfg.max_size = kernel_size; + fpu_kernel_cfg.default_size = kernel_size; + fpu_user_cfg.max_size = user_size; + fpu_user_cfg.default_size = user_size; return 0; } @@ -726,11 +722,18 @@ static int __init init_xstate_size(void) * We enabled the XSAVE hardware, but something went wrong and * we can not use it. Disable it. */ -static void __init fpu__init_disable_system_xstate(void) +static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) { xfeatures_mask_all = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); + + /* Restore the legacy size.*/ + fpu_kernel_cfg.max_size = legacy_size; + fpu_kernel_cfg.default_size = legacy_size; + fpu_user_cfg.max_size = legacy_size; + fpu_user_cfg.default_size = legacy_size; + fpstate_reset(¤t->thread.fpu); } @@ -738,7 +741,7 @@ static void __init fpu__init_disable_system_xstate(void) * Enable and initialize the xsave feature. * Called once per system bootup. */ -void __init fpu__init_system_xstate(void) +void __init fpu__init_system_xstate(unsigned int legacy_size) { unsigned int eax, ebx, ecx, edx; u64 xfeatures; @@ -810,7 +813,8 @@ void __init fpu__init_system_xstate(void) * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: */ - update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_uabi()); + update_regset_xstate_info(fpu_user_cfg.max_size, + xfeatures_mask_uabi()); fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); @@ -830,13 +834,13 @@ void __init fpu__init_system_xstate(void) print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", xfeatures_mask_all, - fpu_kernel_xstate_size, + fpu_kernel_cfg.max_size, boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); return; out_disable: /* something went wrong, try to boot without any XSAVE support */ - fpu__init_disable_system_xstate(); + fpu__init_disable_system_xstate(legacy_size); } /* diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 379dbfa4f526..3d45eb04471b 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -31,7 +31,7 @@ extern int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void extern void fpu__init_cpu_xstate(void); -extern void fpu__init_system_xstate(void); +extern void fpu__init_system_xstate(unsigned int legacy_size); extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); From 1c253ff2287fe31307a67938c4487936db967ff5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 01:09:35 +0200 Subject: [PATCH 076/110] x86/fpu: Move xstate feature masks to fpu_*_cfg Move the feature mask storage to the kernel and user config structs. Default and maximum feature set are the same for now. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211014230739.352041752@linutronix.de --- arch/x86/include/asm/fpu/xstate.h | 10 +++--- arch/x86/kernel/fpu/core.c | 4 +-- arch/x86/kernel/fpu/init.c | 2 +- arch/x86/kernel/fpu/signal.c | 3 +- arch/x86/kernel/fpu/xstate.c | 57 ++++++++++++++++--------------- 5 files changed, 38 insertions(+), 38 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 61fcb15d880a..fe7c9af9ea42 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -78,11 +78,9 @@ XFEATURE_MASK_INDEPENDENT | \ XFEATURE_MASK_SUPERVISOR_UNSUPPORTED) -extern u64 xfeatures_mask_all; - static inline u64 xfeatures_mask_supervisor(void) { - return xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_SUPPORTED; + return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED; } /* @@ -91,7 +89,7 @@ static inline u64 xfeatures_mask_supervisor(void) */ static inline u64 xfeatures_mask_uabi(void) { - return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED; + return fpu_kernel_cfg.max_features & XFEATURE_MASK_USER_SUPPORTED; } /* @@ -102,7 +100,7 @@ static inline u64 xfeatures_mask_uabi(void) */ static inline u64 xfeatures_mask_restore_user(void) { - return xfeatures_mask_all & XFEATURE_MASK_USER_RESTORE; + return fpu_kernel_cfg.max_features & XFEATURE_MASK_USER_RESTORE; } /* @@ -111,7 +109,7 @@ static inline u64 xfeatures_mask_restore_user(void) */ static inline u64 xfeatures_mask_fpstate(void) { - return xfeatures_mask_all & \ + return fpu_kernel_cfg.max_features & \ (XFEATURE_MASK_USER_RESTORE | XFEATURE_MASK_SUPERVISOR_SUPPORTED); } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 69abf3a2299d..501e21c341f1 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -349,8 +349,8 @@ void fpstate_reset(struct fpu *fpu) /* Initialize sizes and feature masks */ fpu->fpstate->size = fpu_kernel_cfg.default_size; fpu->fpstate->user_size = fpu_user_cfg.default_size; - fpu->fpstate->xfeatures = xfeatures_mask_all; - fpu->fpstate->user_xfeatures = xfeatures_mask_uabi(); + fpu->fpstate->xfeatures = fpu_kernel_cfg.default_features; + fpu->fpstate->user_xfeatures = fpu_user_cfg.default_features; } #if IS_ENABLED(CONFIG_KVM) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 58043ed08662..7074154131e6 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -211,7 +211,7 @@ static void __init fpu__init_init_fpstate(void) { /* Bring init_fpstate size and features up to date */ init_fpstate.size = fpu_kernel_cfg.max_size; - init_fpstate.xfeatures = xfeatures_mask_all; + init_fpstate.xfeatures = fpu_kernel_cfg.max_features; } /* diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index fab440369663..c14f477f5651 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -417,7 +417,8 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, u64 mask = user_xfeatures | xfeatures_mask_supervisor(); fpregs->xsave.header.xfeatures &= mask; - success = !os_xrstor_safe(&fpregs->xsave, xfeatures_mask_all); + success = !os_xrstor_safe(&fpregs->xsave, + fpu_kernel_cfg.max_features); } else { success = !fxrstor_safe(&fpregs->fxsave); } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 94f5e3739ae0..8b496c0eb1a5 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -62,12 +62,6 @@ static short xsave_cpuid_features[] __initdata = { X86_FEATURE_ENQCMD, }; -/* - * This represents the full set of bits that should ever be set in a kernel - * XSAVE buffer, both supervisor and user xstates. - */ -u64 xfeatures_mask_all __ro_after_init; - static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = @@ -84,7 +78,7 @@ static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] __ro_after_init */ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) { - u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_all; + u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features; if (unlikely(feature_name)) { long xfeature_idx, max_idx; @@ -134,7 +128,7 @@ static bool xfeature_is_supervisor(int xfeature_nr) */ void fpu__init_cpu_xstate(void) { - if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_all) + if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features) return; cr4_set_bits(X86_CR4_OSXSAVE); @@ -144,7 +138,7 @@ void fpu__init_cpu_xstate(void) * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user * states can be set here. */ - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_uabi()); + xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); /* * MSR_IA32_XSS sets supervisor states managed by XSAVES. @@ -157,7 +151,7 @@ void fpu__init_cpu_xstate(void) static bool xfeature_enabled(enum xfeature xfeature) { - return xfeatures_mask_all & BIT_ULL(xfeature); + return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); } /* @@ -183,7 +177,7 @@ static void __init setup_xstate_features(void) xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, xmm_space); - for_each_extended_xfeature(i, xfeatures_mask_all) { + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); xstate_sizes[i] = eax; @@ -288,14 +282,14 @@ static void __init setup_xstate_comp_offsets(void) xmm_space); if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) { - for_each_extended_xfeature(i, xfeatures_mask_all) + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) xstate_comp_offsets[i] = xstate_offsets[i]; return; } next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for_each_extended_xfeature(i, xfeatures_mask_all) { + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { if (xfeature_is_aligned(i)) next_offset = ALIGN(next_offset, 64); @@ -319,7 +313,7 @@ static void __init setup_supervisor_only_offsets(void) next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for_each_extended_xfeature(i, xfeatures_mask_all) { + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { if (!xfeature_is_supervisor(i)) continue; @@ -338,7 +332,7 @@ static void __init print_xstate_offset_size(void) { int i; - for_each_extended_xfeature(i, xfeatures_mask_all) { + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, xstate_comp_offsets[i], i, xstate_sizes[i]); } @@ -401,7 +395,7 @@ static void __init setup_init_fpu_buf(void) setup_xstate_features(); print_xstate_features(); - xstate_init_xcomp_bv(&init_fpstate.regs.xsave, xfeatures_mask_all); + xstate_init_xcomp_bv(&init_fpstate.regs.xsave, fpu_kernel_cfg.max_features); /* * Init all the features state with header.xfeatures being 0x0 @@ -570,7 +564,7 @@ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; - for_each_extended_xfeature(i, xfeatures_mask_all) { + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { if (!check_xstate_against_struct(i)) return false; /* @@ -724,7 +718,7 @@ static int __init init_xstate_size(void) */ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) { - xfeatures_mask_all = 0; + fpu_kernel_cfg.max_features = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); @@ -768,13 +762,13 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) * Find user xstates supported by the processor. */ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - xfeatures_mask_all = eax + ((u64)edx << 32); + fpu_kernel_cfg.max_features = eax + ((u64)edx << 32); /* * Find supervisor xstates supported by the processor. */ cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); - xfeatures_mask_all |= ecx + ((u64)edx << 32); + fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32); if ((xfeatures_mask_uabi() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* @@ -783,7 +777,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) * booting without it. This is too early to BUG(). */ pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", - xfeatures_mask_all); + fpu_kernel_cfg.max_features); goto out_disable; } @@ -792,14 +786,21 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) */ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { if (!boot_cpu_has(xsave_cpuid_features[i])) - xfeatures_mask_all &= ~BIT_ULL(i); + fpu_kernel_cfg.max_features &= ~BIT_ULL(i); } - xfeatures_mask_all &= XFEATURE_MASK_USER_SUPPORTED | + fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED; + fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; + fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; + + /* Identical for now */ + fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features; + fpu_user_cfg.default_features = fpu_user_cfg.max_features; + /* Store it for paranoia check at the end */ - xfeatures = xfeatures_mask_all; + xfeatures = fpu_kernel_cfg.max_features; /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); @@ -825,15 +826,15 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) * Paranoia check whether something in the setup modified the * xfeatures mask. */ - if (xfeatures != xfeatures_mask_all) { + if (xfeatures != fpu_kernel_cfg.max_features) { pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n", - xfeatures, xfeatures_mask_all); + xfeatures, fpu_kernel_cfg.max_features); goto out_disable; } print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", - xfeatures_mask_all, + fpu_kernel_cfg.max_features, fpu_kernel_cfg.max_size, boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); return; @@ -908,7 +909,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) * We should not ever be requesting features that we * have not enabled. */ - WARN_ONCE(!(xfeatures_mask_all & BIT_ULL(xfeature_nr)), + WARN_ONCE(!(fpu_kernel_cfg.max_features & BIT_ULL(xfeature_nr)), "get of unsupported state"); /* * This assumes the last 'xsave*' instruction to From daddee24731938781b7876d20335ea3754d23484 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 01:09:37 +0200 Subject: [PATCH 077/110] x86/fpu: Mop up xfeatures_mask_uabi() Use the new fpu_user_cfg to retrieve the information instead of xfeatures_mask_uabi() which will be no longer correct when dynamically enabled features become available. Using fpu_user_cfg is appropriate when setting XCOMP_BV in the init_fpstate since it has space allocated for "max_features". But, normal fpstates might only have space for default xfeatures. Since XRSTOR* derives the format of the XSAVE buffer from XCOMP_BV, this can lead to XRSTOR reading out of bounds. So when copying actively used fpstate, simply read the XCOMP_BV features bits directly out of the fpstate instead. This correction courtesy of Dave Hansen Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211014230739.408879849@linutronix.de --- arch/x86/include/asm/fpu/xstate.h | 9 --------- arch/x86/kernel/fpu/core.c | 4 ++-- arch/x86/kernel/fpu/signal.c | 2 +- arch/x86/kernel/fpu/xstate.c | 6 +++--- 4 files changed, 6 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index fe7c9af9ea42..3c890b97f195 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -83,15 +83,6 @@ static inline u64 xfeatures_mask_supervisor(void) return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED; } -/* - * The xfeatures which are enabled in XCR0 and expected to be in ptrace - * buffers and signal frames. - */ -static inline u64 xfeatures_mask_uabi(void) -{ - return fpu_kernel_cfg.max_features & XFEATURE_MASK_USER_SUPPORTED; -} - /* * The xfeatures which are restored by the kernel when returning to user * mode. This is not necessarily the same as xfeatures_mask_uabi() as the diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 501e21c341f1..5acc077cb9f1 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -237,7 +237,7 @@ int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, } /* Ensure that XCOMP_BV is set up for XSAVES */ - xstate_init_xcomp_bv(&kstate->regs.xsave, xfeatures_mask_uabi()); + xstate_init_xcomp_bv(&kstate->regs.xsave, kstate->xfeatures); return 0; } EXPORT_SYMBOL_GPL(fpu_copy_kvm_uabi_to_fpstate); @@ -333,7 +333,7 @@ void fpstate_init_user(struct fpstate *fpstate) return; } - xstate_init_xcomp_bv(&fpstate->regs.xsave, xfeatures_mask_uabi()); + xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures); if (cpu_feature_enabled(X86_FEATURE_FXSR)) fpstate_init_fxstate(fpstate); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index c14f477f5651..3e42e6e8b56c 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -536,7 +536,7 @@ void __init fpu__init_prepare_fx_sw_frame(void) fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = size; - fx_sw_reserved.xfeatures = xfeatures_mask_uabi(); + fx_sw_reserved.xfeatures = fpu_user_cfg.default_features; fx_sw_reserved.xstate_size = fpu_user_cfg.default_size; if (IS_ENABLED(CONFIG_IA32_EMULATION) || diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 8b496c0eb1a5..9f92abd230db 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -770,7 +770,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32); - if ((xfeatures_mask_uabi() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { + if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* * This indicates that something really unexpected happened * with the enumeration. Disable XSAVE and try to continue @@ -815,7 +815,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) * supervisor xstates: */ update_regset_xstate_info(fpu_user_cfg.max_size, - xfeatures_mask_uabi()); + fpu_user_cfg.max_features); fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); @@ -853,7 +853,7 @@ void fpu__resume_cpu(void) * Restore XCR0 on xsave capable CPUs: */ if (cpu_feature_enabled(X86_FEATURE_XSAVE)) - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_uabi()); + xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); /* * Restore IA32_XSS. The same CPUID bit enumerates support From eda32f4f93b452c5fe3c352523e7f7cc085c8205 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 01:09:38 +0200 Subject: [PATCH 078/110] x86/fpu: Rework restore_regs_from_fpstate() xfeatures_mask_fpstate() is no longer valid when dynamically enabled features come into play. Rework restore_regs_from_fpstate() so it takes a constant mask which will then be applied against the maximum feature set so that the restore operation brings all features which are not in the xsave buffer xfeature bitmap into init state. This ensures that if the previous task used a dynamically enabled feature that the task which restores has all unused components properly initialized. Cleanup the last user of xfeatures_mask_fpstate() as well and remove it. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211014230739.461348278@linutronix.de --- arch/x86/include/asm/fpu/xstate.h | 29 ++++++++--------------------- arch/x86/kernel/fpu/context.h | 6 +----- arch/x86/kernel/fpu/core.c | 17 ++++++++++++++--- arch/x86/kernel/fpu/xstate.c | 2 +- 4 files changed, 24 insertions(+), 30 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 3c890b97f195..61ae396bc6e7 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -78,32 +78,19 @@ XFEATURE_MASK_INDEPENDENT | \ XFEATURE_MASK_SUPERVISOR_UNSUPPORTED) +/* + * The feature mask required to restore FPU state: + * - All user states which are not eagerly switched in switch_to()/exec() + * - The suporvisor states + */ +#define XFEATURE_MASK_FPSTATE (XFEATURE_MASK_USER_RESTORE | \ + XFEATURE_MASK_SUPERVISOR_SUPPORTED) + static inline u64 xfeatures_mask_supervisor(void) { return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED; } -/* - * The xfeatures which are restored by the kernel when returning to user - * mode. This is not necessarily the same as xfeatures_mask_uabi() as the - * kernel does not manage all XCR0 enabled features via xsave/xrstor as - * some of them have to be switched eagerly on context switch and exec(). - */ -static inline u64 xfeatures_mask_restore_user(void) -{ - return fpu_kernel_cfg.max_features & XFEATURE_MASK_USER_RESTORE; -} - -/* - * Like xfeatures_mask_restore_user() but additionally restors the - * supported supervisor states. - */ -static inline u64 xfeatures_mask_fpstate(void) -{ - return fpu_kernel_cfg.max_features & \ - (XFEATURE_MASK_USER_RESTORE | XFEATURE_MASK_SUPERVISOR_SUPPORTED); -} - static inline u64 xfeatures_mask_independent(void) { if (!boot_cpu_has(X86_FEATURE_ARCH_LBR)) diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h index f8f510519688..a06ebf315d83 100644 --- a/arch/x86/kernel/fpu/context.h +++ b/arch/x86/kernel/fpu/context.h @@ -61,8 +61,6 @@ static inline void fpregs_restore_userregs(void) return; if (!fpregs_state_valid(fpu, cpu)) { - u64 mask; - /* * This restores _all_ xstate which has not been * established yet. @@ -72,9 +70,7 @@ static inline void fpregs_restore_userregs(void) * flush_thread(). So it is excluded because it might be * not up to date in current->thread.fpu.xsave state. */ - mask = xfeatures_mask_restore_user() | - xfeatures_mask_supervisor(); - restore_fpregs_from_fpstate(fpu->fpstate, mask); + restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE); fpregs_activate(fpu); fpu->last_cpu = cpu; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 5acc077cb9f1..0fb9defaba47 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -150,6 +150,17 @@ void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) } if (use_xsave()) { + /* + * Restoring state always needs to modify all features + * which are in @mask even if the current task cannot use + * extended features. + * + * So fpstate->xfeatures cannot be used here, because then + * a feature for which the task has no permission but was + * used by the previous task would not go into init state. + */ + mask = fpu_kernel_cfg.max_features & mask; + os_xrstor(&fpstate->regs.xsave, mask); } else { if (use_fxsr()) @@ -161,7 +172,7 @@ void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) void fpu_reset_from_exception_fixup(void) { - restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); + restore_fpregs_from_fpstate(&init_fpstate, XFEATURE_MASK_FPSTATE); } #if IS_ENABLED(CONFIG_KVM) @@ -179,7 +190,7 @@ void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask) } if (rstor) { - restore_mask &= xfeatures_mask_fpstate(); + restore_mask &= XFEATURE_MASK_FPSTATE; restore_fpregs_from_fpstate(rstor->fpstate, restore_mask); } @@ -518,7 +529,7 @@ void fpu__clear_user_states(struct fpu *fpu) } /* Reset user states in registers. */ - restore_fpregs_from_init_fpstate(xfeatures_mask_restore_user()); + restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE); /* * Now all FPU registers have their desired values. Inform the FPU diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 9f92abd230db..cbba3812a160 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -344,7 +344,7 @@ static void __init print_xstate_offset_size(void) */ static __init void os_xrstor_booting(struct xregs_state *xstate) { - u64 mask = xfeatures_mask_fpstate(); + u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE; u32 lmask = mask; u32 hmask = mask >> 32; int err; From d72c87018d00782c3ac0a844c372158087debc0a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2021 01:09:40 +0200 Subject: [PATCH 079/110] x86/fpu/xstate: Move remaining xfeature helpers to core Now that everything is mopped up, move all the helpers and prototypes into the core header. They are not required by the outside. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211014230739.514095101@linutronix.de --- arch/x86/include/asm/fpu/xstate.h | 13 ------------- arch/x86/kernel/fpu/xstate.h | 13 +++++++++++++ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 61ae396bc6e7..43ae89d4bcd2 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -86,19 +86,6 @@ #define XFEATURE_MASK_FPSTATE (XFEATURE_MASK_USER_RESTORE | \ XFEATURE_MASK_SUPERVISOR_SUPPORTED) -static inline u64 xfeatures_mask_supervisor(void) -{ - return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED; -} - -static inline u64 xfeatures_mask_independent(void) -{ - if (!boot_cpu_has(X86_FEATURE_ARCH_LBR)) - return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR; - - return XFEATURE_MASK_INDEPENDENT; -} - extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void __init update_regset_xstate_info(unsigned int size, diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 3d45eb04471b..a1aa0bad2c9c 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -35,6 +35,19 @@ extern void fpu__init_system_xstate(unsigned int legacy_size); extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); +static inline u64 xfeatures_mask_supervisor(void) +{ + return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED; +} + +static inline u64 xfeatures_mask_independent(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) + return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR; + + return XFEATURE_MASK_INDEPENDENT; +} + /* XSAVE/XRSTOR wrapper functions */ #ifdef CONFIG_X86_64 From 75c52dad5e327605f1025f399dafdf4aaf5dae9c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Oct 2021 20:55:49 +0200 Subject: [PATCH 080/110] x86/fpu: Prepare for sanitizing KVM FPU code For the upcoming AMX support it's necessary to do a proper integration with KVM. To avoid more nasty hackery in KVM which violate encapsulation extend struct fpu and fpstate so the fpstate switching can be consolidated and simplified. Currently KVM allocates two FPU structs which are used for saving the user state of the vCPU thread and restoring the guest state when entering vcpu_run() and doing the reverse operation before leaving vcpu_run(). With the new fpstate mechanism this can be reduced to one extra buffer by swapping the fpstate pointer in current::thread::fpu. This makes the upcoming support for AMX and XFD simpler because then fpstate information (features, sizes, xfd) are always consistent and it does not require any nasty workarounds. Add fpu::__task_fpstate to save the regular fpstate pointer while the task is inside vcpu_run(). Add some state fields to fpstate to indicate the nature of the state. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211022185312.896403942@linutronix.de --- arch/x86/include/asm/fpu/types.h | 44 +++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index a32be07f1418..c72cb2269adc 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -322,8 +322,32 @@ struct fpstate { /* @user_xfeatures: xfeatures valid in UABI buffers */ u64 user_xfeatures; + /* @is_valloc: Indicator for dynamically allocated state */ + unsigned int is_valloc : 1; + + /* @is_guest: Indicator for guest state (KVM) */ + unsigned int is_guest : 1; + + /* + * @is_confidential: Indicator for KVM confidential mode. + * The FPU registers are restored by the + * vmentry firmware from encrypted guest + * memory. On vmexit the FPU registers are + * saved by firmware to encrypted guest memory + * and the registers are scrubbed before + * returning to the host. So there is no + * content which is worth saving and restoring. + * The fpstate has to be there so that + * preemption and softirq FPU usage works + * without special casing. + */ + unsigned int is_confidential : 1; + + /* @in_use: State is in use */ + unsigned int in_use : 1; + /* @regs: The register state union for all supported formats */ - union fpregs_state regs; + union fpregs_state regs; /* @regs is dynamically sized! Don't add anything after @regs! */ } __aligned(64); @@ -363,6 +387,14 @@ struct fpu { */ struct fpstate *fpstate; + /* + * @__task_fpstate: + * + * Pointer to an inactive struct fpstate. Initialized to NULL. Is + * used only for KVM support to swap out the regular task fpstate. + */ + struct fpstate *__task_fpstate; + /* * @__fpstate: * @@ -378,6 +410,16 @@ struct fpu { */ }; +/* + * Guest pseudo FPU container + */ +struct fpu_guest { + /* + * @fpstate: Pointer to the allocated guest fpstate + */ + struct fpstate *fpstate; +}; + /* * FPU state configuration data. Initialized at boot time. Read only after init. */ From 69f6ed1d14c6bcf712f4bb22a231c15eeab401e7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Oct 2021 20:55:51 +0200 Subject: [PATCH 081/110] x86/fpu: Provide infrastructure for KVM FPU cleanup For the upcoming AMX support it's necessary to do a proper integration with KVM. Currently KVM allocates two FPU structs which are used for saving the user state of the vCPU thread and restoring the guest state when entering vcpu_run() and doing the reverse operation before leaving vcpu_run(). With the new fpstate mechanism this can be reduced to one extra buffer by swapping the fpstate pointer in current::thread::fpu. This makes the upcoming support for AMX and XFD simpler because then fpstate information (features, sizes, xfd) are always consistent and it does not require any nasty workarounds. Provide: - An allocator which initializes the state properly - A replacement for the existing FPU swap mechanim Aside of the reduced memory footprint, this also makes state switching more efficient when TIF_FPU_NEED_LOAD is set. It does not require a memcpy as the state is already correct in the to be swapped out fpstate. The existing interfaces will be removed once KVM is converted over. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211022185312.954684740@linutronix.de --- arch/x86/include/asm/fpu/api.h | 13 ++++++ arch/x86/kernel/fpu/core.c | 85 +++++++++++++++++++++++++++++++--- 2 files changed, 92 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 9ce83148058f..de85bcaae0c3 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -135,9 +135,22 @@ extern void fpu_init_fpstate_user(struct fpu *fpu); extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature); /* KVM specific functions */ +extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu); +extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu); +extern int fpu_swap_kvm_fpstate(struct fpu_guest *gfpu, bool enter_guest); extern void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask); extern int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, u32 *pkru); extern void fpu_copy_fpstate_to_kvm_uabi(struct fpu *fpu, void *buf, unsigned int size, u32 pkru); +static inline void fpstate_set_confidential(struct fpu_guest *gfpu) +{ + gfpu->fpstate->is_confidential = true; +} + +static inline bool fpstate_is_confidential(struct fpu_guest *gfpu) +{ + return gfpu->fpstate->is_confidential; +} + #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 0fb9defaba47..748d7b2fcacb 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -176,6 +176,75 @@ void fpu_reset_from_exception_fixup(void) } #if IS_ENABLED(CONFIG_KVM) +static void __fpstate_reset(struct fpstate *fpstate); + +bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) +{ + struct fpstate *fpstate; + unsigned int size; + + size = fpu_user_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64); + fpstate = vzalloc(size); + if (!fpstate) + return false; + + __fpstate_reset(fpstate); + fpstate_init_user(fpstate); + fpstate->is_valloc = true; + fpstate->is_guest = true; + + gfpu->fpstate = fpstate; + return true; +} +EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate); + +void fpu_free_guest_fpstate(struct fpu_guest *gfpu) +{ + struct fpstate *fps = gfpu->fpstate; + + if (!fps) + return; + + if (WARN_ON_ONCE(!fps->is_valloc || !fps->is_guest || fps->in_use)) + return; + + gfpu->fpstate = NULL; + vfree(fps); +} +EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate); + +int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) +{ + struct fpstate *guest_fps = guest_fpu->fpstate; + struct fpu *fpu = ¤t->thread.fpu; + struct fpstate *cur_fps = fpu->fpstate; + + fpregs_lock(); + if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD)) + save_fpregs_to_fpstate(fpu); + + /* Swap fpstate */ + if (enter_guest) { + fpu->__task_fpstate = cur_fps; + fpu->fpstate = guest_fps; + guest_fps->in_use = true; + } else { + guest_fps->in_use = false; + fpu->fpstate = fpu->__task_fpstate; + fpu->__task_fpstate = NULL; + } + + cur_fps = fpu->fpstate; + + if (!cur_fps->is_confidential) + restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE); + + fpregs_mark_activate(); + fpregs_unlock(); + return 0; +} +EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); + void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask) { fpregs_lock(); @@ -352,16 +421,20 @@ void fpstate_init_user(struct fpstate *fpstate) fpstate_init_fstate(fpstate); } +static void __fpstate_reset(struct fpstate *fpstate) +{ + /* Initialize sizes and feature masks */ + fpstate->size = fpu_kernel_cfg.default_size; + fpstate->user_size = fpu_user_cfg.default_size; + fpstate->xfeatures = fpu_kernel_cfg.default_features; + fpstate->user_xfeatures = fpu_user_cfg.default_features; +} + void fpstate_reset(struct fpu *fpu) { /* Set the fpstate pointer to the default fpstate */ fpu->fpstate = &fpu->__fpstate; - - /* Initialize sizes and feature masks */ - fpu->fpstate->size = fpu_kernel_cfg.default_size; - fpu->fpstate->user_size = fpu_user_cfg.default_size; - fpu->fpstate->xfeatures = fpu_kernel_cfg.default_features; - fpu->fpstate->user_xfeatures = fpu_user_cfg.default_features; + __fpstate_reset(fpu->fpstate); } #if IS_ENABLED(CONFIG_KVM) From d69c1382e1b73a0496a70872a035ca2b22d074e5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Oct 2021 20:55:53 +0200 Subject: [PATCH 082/110] x86/kvm: Convert FPU handling to a single swap buffer For the upcoming AMX support it's necessary to do a proper integration with KVM. Currently KVM allocates two FPU structs which are used for saving the user state of the vCPU thread and restoring the guest state when entering vcpu_run() and doing the reverse operation before leaving vcpu_run(). With the new fpstate mechanism this can be reduced to one extra buffer by swapping the fpstate pointer in current::thread::fpu. This makes the upcoming support for AMX and XFD simpler because then fpstate information (features, sizes, xfd) are always consistent and it does not require any nasty workarounds. Convert the KVM FPU code over to this new scheme. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211022185313.019454292@linutronix.de --- arch/x86/include/asm/fpu/api.h | 4 +- arch/x86/include/asm/kvm_host.h | 7 +-- arch/x86/kernel/fpu/core.c | 16 +++--- arch/x86/kvm/svm/svm.c | 7 +-- arch/x86/kvm/x86.c | 88 +++++++++------------------------ 5 files changed, 40 insertions(+), 82 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index de85bcaae0c3..5e5f172c1a9d 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -140,8 +140,8 @@ extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu); extern int fpu_swap_kvm_fpstate(struct fpu_guest *gfpu, bool enter_guest); extern void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask); -extern int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, u32 *pkru); -extern void fpu_copy_fpstate_to_kvm_uabi(struct fpu *fpu, void *buf, unsigned int size, u32 pkru); +extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u32 pkru); +extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru); static inline void fpstate_set_confidential(struct fpu_guest *gfpu) { diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f8f48a7ec577..eb0d69bb2fc6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -691,11 +691,10 @@ struct kvm_vcpu_arch { * * Note that while the PKRU state lives inside the fpu registers, * it is switched out separately at VMENTER and VMEXIT time. The - * "guest_fpu" state here contains the guest FPU context, with the + * "guest_fpstate" state here contains the guest FPU context, with the * host PRKU bits. */ - struct fpu *user_fpu; - struct fpu *guest_fpu; + struct fpu_guest guest_fpu; u64 xcr0; u64 guest_supported_xcr0; @@ -1685,8 +1684,6 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); -void kvm_free_guest_fpu(struct kvm_vcpu *vcpu); - void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0); void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 748d7b2fcacb..01fbf7c3e799 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -268,10 +268,10 @@ void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask) } EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpu); -void fpu_copy_fpstate_to_kvm_uabi(struct fpu *fpu, void *buf, - unsigned int size, u32 pkru) +void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, + unsigned int size, u32 pkru) { - struct fpstate *kstate = fpu->fpstate; + struct fpstate *kstate = gfpu->fpstate; union fpregs_state *ustate = buf; struct membuf mb = { .p = buf, .left = size }; @@ -284,12 +284,12 @@ void fpu_copy_fpstate_to_kvm_uabi(struct fpu *fpu, void *buf, ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE; } } -EXPORT_SYMBOL_GPL(fpu_copy_fpstate_to_kvm_uabi); +EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi); -int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, - u32 *vpkru) +int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, + u64 xcr0, u32 *vpkru) { - struct fpstate *kstate = fpu->fpstate; + struct fpstate *kstate = gfpu->fpstate; const union fpregs_state *ustate = buf; struct pkru_state *xpkru; int ret; @@ -320,7 +320,7 @@ int fpu_copy_kvm_uabi_to_fpstate(struct fpu *fpu, const void *buf, u64 xcr0, xstate_init_xcomp_bv(&kstate->regs.xsave, kstate->xfeatures); return 0; } -EXPORT_SYMBOL_GPL(fpu_copy_kvm_uabi_to_fpstate); +EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); #endif /* CONFIG_KVM */ void kernel_fpu_begin_mask(unsigned int kfpu_mask) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 989685098b3e..f39c87ddc2e5 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include "trace.h" @@ -1346,10 +1347,10 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) /* * SEV-ES guests maintain an encrypted version of their FPU * state which is restored and saved on VMRUN and VMEXIT. - * Free the fpu structure to prevent KVM from attempting to - * access the FPU state. + * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't + * do xsave/xrstor on it. */ - kvm_free_guest_fpu(vcpu); + fpstate_set_confidential(&vcpu->arch.guest_fpu); } err = avic_init_vcpu(svm); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0eb1021e0275..c953ec24a75c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -295,8 +295,6 @@ u64 __read_mostly host_xcr0; u64 __read_mostly supported_xcr0; EXPORT_SYMBOL_GPL(supported_xcr0); -static struct kmem_cache *x86_fpu_cache; - static struct kmem_cache *x86_emulator_cache; /* @@ -4705,23 +4703,24 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - if (!vcpu->arch.guest_fpu) + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) return; - fpu_copy_fpstate_to_kvm_uabi(vcpu->arch.guest_fpu, guest_xsave->region, - sizeof(guest_xsave->region), - vcpu->arch.pkru); + fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, + guest_xsave->region, + sizeof(guest_xsave->region), + vcpu->arch.pkru); } static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - if (!vcpu->arch.guest_fpu) + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) return 0; - return fpu_copy_kvm_uabi_to_fpstate(vcpu->arch.guest_fpu, - guest_xsave->region, - supported_xcr0, &vcpu->arch.pkru); + return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu, + guest_xsave->region, + supported_xcr0, &vcpu->arch.pkru); } static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, @@ -8301,18 +8300,11 @@ int kvm_arch_init(void *opaque) } r = -ENOMEM; - x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu), - __alignof__(struct fpu), SLAB_ACCOUNT, - NULL); - if (!x86_fpu_cache) { - printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n"); - goto out; - } x86_emulator_cache = kvm_alloc_emulator_cache(); if (!x86_emulator_cache) { pr_err("kvm: failed to allocate cache for x86 emulator\n"); - goto out_free_x86_fpu_cache; + goto out; } user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); @@ -8350,8 +8342,6 @@ out_free_percpu: free_percpu(user_return_msrs); out_free_x86_emulator_cache: kmem_cache_destroy(x86_emulator_cache); -out_free_x86_fpu_cache: - kmem_cache_destroy(x86_fpu_cache); out: return r; } @@ -8378,7 +8368,6 @@ void kvm_arch_exit(void) kvm_mmu_module_exit(); free_percpu(user_return_msrs); kmem_cache_destroy(x86_emulator_cache); - kmem_cache_destroy(x86_fpu_cache); #ifdef CONFIG_KVM_XEN static_key_deferred_flush(&kvm_xen_enabled); WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); @@ -9801,23 +9790,17 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { /* - * Guests with protected state have guest_fpu == NULL which makes - * the swap only save the host state. Exclude PKRU from restore as - * it is restored separately in kvm_x86_ops.run(). + * Exclude PKRU from restore as restored separately in + * kvm_x86_ops.run(). */ - fpu_swap_kvm_fpu(vcpu->arch.user_fpu, vcpu->arch.guest_fpu, - ~XFEATURE_MASK_PKRU); + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true); trace_kvm_fpu(1); } /* When vcpu_run ends, restore user space FPU context. */ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - /* - * Guests with protected state have guest_fpu == NULL which makes - * swap only restore the host state. - */ - fpu_swap_kvm_fpu(vcpu->arch.guest_fpu, vcpu->arch.user_fpu, ~0ULL); + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false); ++vcpu->stat.fpu_reload; trace_kvm_fpu(0); } @@ -10398,12 +10381,12 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave; - if (!vcpu->arch.guest_fpu) + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) return 0; vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu->fpstate->regs.fxsave; + fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; fpu->fsw = fxsave->swd; @@ -10421,12 +10404,12 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave; - if (!vcpu->arch.guest_fpu) + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) return 0; vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu->fpstate->regs.fxsave; + fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -10487,15 +10470,6 @@ static void fx_init(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= X86_CR0_ET; } -void kvm_free_guest_fpu(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.guest_fpu) { - kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); - vcpu->arch.guest_fpu = NULL; - } -} -EXPORT_SYMBOL_GPL(kvm_free_guest_fpu); - int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) @@ -10552,22 +10526,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (!alloc_emulate_ctxt(vcpu)) goto free_wbinvd_dirty_mask; - vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.user_fpu) { - pr_err("kvm: failed to allocate userspace's fpu\n"); + if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) { + pr_err("kvm: failed to allocate vcpu's fpu\n"); goto free_emulate_ctxt; } - vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.guest_fpu) { - pr_err("kvm: failed to allocate vcpu's fpu\n"); - goto free_user_fpu; - } - - fpu_init_fpstate_user(vcpu->arch.user_fpu); - fpu_init_fpstate_user(vcpu->arch.guest_fpu); fx_init(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); @@ -10600,9 +10563,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) return 0; free_guest_fpu: - kvm_free_guest_fpu(vcpu); -free_user_fpu: - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); + fpu_free_guest_fpstate(&vcpu->arch.guest_fpu); free_emulate_ctxt: kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_wbinvd_dirty_mask: @@ -10651,8 +10612,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); - kvm_free_guest_fpu(vcpu); + fpu_free_guest_fpstate(&vcpu->arch.guest_fpu); kvm_hv_vcpu_uninit(vcpu); kvm_pmu_destroy(vcpu); @@ -10704,8 +10664,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; - if (vcpu->arch.guest_fpu && kvm_mpx_supported()) { - struct fpstate *fpstate = vcpu->arch.guest_fpu->fpstate; + if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) { + struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; /* * To avoid have the INIT path from kvm_apic_has_events() that be From 582b01b6ab2714a0a4d554cea7f0d4efeaa2154d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Oct 2021 20:55:54 +0200 Subject: [PATCH 083/110] x86/fpu: Remove old KVM FPU interface No more users. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211022185313.074853631@linutronix.de --- arch/x86/include/asm/fpu/api.h | 2 -- arch/x86/kernel/fpu/core.c | 32 -------------------------------- 2 files changed, 34 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 5e5f172c1a9d..e9379d7e7743 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -131,14 +131,12 @@ static inline void fpstate_init_soft(struct swregs_state *soft) {} DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); /* fpstate-related functions which are exported to KVM */ -extern void fpu_init_fpstate_user(struct fpu *fpu); extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature); /* KVM specific functions */ extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu); extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu); extern int fpu_swap_kvm_fpstate(struct fpu_guest *gfpu, bool enter_guest); -extern void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask); extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u32 pkru); extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 01fbf7c3e799..9c475e2efd4d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -245,29 +245,6 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) } EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); -void fpu_swap_kvm_fpu(struct fpu *save, struct fpu *rstor, u64 restore_mask) -{ - fpregs_lock(); - - if (save) { - struct fpstate *fpcur = current->thread.fpu.fpstate; - - if (test_thread_flag(TIF_NEED_FPU_LOAD)) - memcpy(&save->fpstate->regs, &fpcur->regs, fpcur->size); - else - save_fpregs_to_fpstate(save); - } - - if (rstor) { - restore_mask &= XFEATURE_MASK_FPSTATE; - restore_fpregs_from_fpstate(rstor->fpstate, restore_mask); - } - - fpregs_mark_activate(); - fpregs_unlock(); -} -EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpu); - void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u32 pkru) { @@ -437,15 +414,6 @@ void fpstate_reset(struct fpu *fpu) __fpstate_reset(fpu->fpstate); } -#if IS_ENABLED(CONFIG_KVM) -void fpu_init_fpstate_user(struct fpu *fpu) -{ - fpstate_reset(fpu); - fpstate_init_user(fpu->fpstate); -} -EXPORT_SYMBOL_GPL(fpu_init_fpstate_user); -#endif - /* Clone current's FPU state on fork */ int fpu_clone(struct task_struct *dst) { From 1bdda24c4af64cd2d65dec5192ab624c5fee7ca0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Oct 2021 15:55:05 -0700 Subject: [PATCH 084/110] signal: Add an optional check for altstack size New x86 FPU features will be very large, requiring ~10k of stack in signal handlers. These new features require a new approach called "dynamic features". The kernel currently tries to ensure that altstacks are reasonably sized. Right now, on x86, sys_sigaltstack() requires a size of >=2k. However, that 2k is a constant. Simply raising that 2k requirement to >10k for the new features would break existing apps which have a compiled-in size of 2k. Instead of universally enforcing a larger stack, prohibit a process from using dynamic features without properly-sized altstacks. This must be enforced in two places: * A dynamic feature can not be enabled without an large-enough altstack for each process thread. * Once a dynamic feature is enabled, any request to install a too-small altstack will be rejected The dynamic feature enabling code must examine each thread in a process to ensure that the altstacks are large enough. Add a new lock (sigaltstack_lock()) to ensure that threads can not race and change their altstack after being examined. Add the infrastructure in form of a config option and provide empty stubs for architectures which do not need dynamic altstack size checks. This implementation will be fleshed out for x86 in a future patch called x86/arch_prctl: Add controls for dynamic XSTATE components [dhansen: commit message. ] Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-2-chang.seok.bae@intel.com --- arch/Kconfig | 3 +++ include/linux/signal.h | 6 ++++++ kernel/signal.c | 35 +++++++++++++++++++++++++++++------ 3 files changed, 38 insertions(+), 6 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 8df1c7102643..af5cf3009b4f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1288,6 +1288,9 @@ config ARCH_HAS_ELFCORE_COMPAT config ARCH_HAS_PARANOID_L1D_FLUSH bool +config DYNAMIC_SIGFRAME + bool + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/include/linux/signal.h b/include/linux/signal.h index 3f96a6374e4f..7d34105e20c6 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -464,6 +464,12 @@ int __save_altstack(stack_t __user *, unsigned long); unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \ } while (0); +#ifdef CONFIG_DYNAMIC_SIGFRAME +bool sigaltstack_size_valid(size_t ss_size); +#else +static inline bool sigaltstack_size_valid(size_t size) { return true; } +#endif /* !CONFIG_DYNAMIC_SIGFRAME */ + #ifdef CONFIG_PROC_FS struct seq_file; extern void render_sigset_t(struct seq_file *, const char *, sigset_t *); diff --git a/kernel/signal.c b/kernel/signal.c index 952741f6d0f9..9278f5291ed6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4151,11 +4151,29 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) return 0; } +#ifdef CONFIG_DYNAMIC_SIGFRAME +static inline void sigaltstack_lock(void) + __acquires(¤t->sighand->siglock) +{ + spin_lock_irq(¤t->sighand->siglock); +} + +static inline void sigaltstack_unlock(void) + __releases(¤t->sighand->siglock) +{ + spin_unlock_irq(¤t->sighand->siglock); +} +#else +static inline void sigaltstack_lock(void) { } +static inline void sigaltstack_unlock(void) { } +#endif + static int do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp, size_t min_ss_size) { struct task_struct *t = current; + int ret = 0; if (oss) { memset(oss, 0, sizeof(stack_t)); @@ -4179,19 +4197,24 @@ do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp, ss_mode != 0)) return -EINVAL; + sigaltstack_lock(); if (ss_mode == SS_DISABLE) { ss_size = 0; ss_sp = NULL; } else { if (unlikely(ss_size < min_ss_size)) - return -ENOMEM; + ret = -ENOMEM; + if (!sigaltstack_size_valid(ss_size)) + ret = -ENOMEM; } - - t->sas_ss_sp = (unsigned long) ss_sp; - t->sas_ss_size = ss_size; - t->sas_ss_flags = ss_flags; + if (!ret) { + t->sas_ss_sp = (unsigned long) ss_sp; + t->sas_ss_size = ss_size; + t->sas_ss_flags = ss_flags; + } + sigaltstack_unlock(); } - return 0; + return ret; } SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) From 3aac3ebea08f2d342364f827c8979ab0e1dd591e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Oct 2021 15:55:06 -0700 Subject: [PATCH 085/110] x86/signal: Implement sigaltstack size validation For historical reasons MINSIGSTKSZ is a constant which became already too small with AVX512 support. Add a mechanism to enforce strict checking of the sigaltstack size against the real size of the FPU frame. The strict check can be enabled via a config option and can also be controlled via the kernel command line option 'strict_sas_size' independent of the config switch. Enabling it might break existing applications which allocate a too small sigaltstack but 'work' because they never get a signal delivered. Though it can be handy to filter out binaries which are not yet aware of AT_MINSIGSTKSZ. Also the upcoming support for dynamically enabled FPU features requires a strict sanity check to ensure that: - Enabling of a dynamic feature, which changes the sigframe size fits into an enabled sigaltstack - Installing a too small sigaltstack after a dynamic feature has been added is not possible. Implement the base check which is controlled by config and command line options. Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-3-chang.seok.bae@intel.com --- .../admin-guide/kernel-parameters.txt | 9 +++++ arch/x86/Kconfig | 17 +++++++++ arch/x86/kernel/signal.c | 35 +++++++++++++++++++ 3 files changed, 61 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 43dc35fe5bc0..eb9a73ad91a0 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5497,6 +5497,15 @@ stifb= [HW] Format: bpp:[:[:...]] + strict_sas_size= + [X86] + Format: + Enable or disable strict sigaltstack size checks + against the required signal frame size which + depends on the supported FPU features. This can + be used to filter out binaries which have + not yet been made aware of AT_MINSIGSTKSZ. + sunrpc.min_resvport= sunrpc.max_resvport= [NFS,SUNRPC] diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d9830e7e1060..8584b30e2536 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -125,6 +125,7 @@ config X86 select CLOCKSOURCE_VALIDATE_LAST_CYCLE select CLOCKSOURCE_WATCHDOG select DCACHE_WORD_ACCESS + select DYNAMIC_SIGFRAME select EDAC_ATOMIC_SCRUB select EDAC_SUPPORT select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) @@ -2388,6 +2389,22 @@ config MODIFY_LDT_SYSCALL Saying 'N' here may make sense for embedded or server kernels. +config STRICT_SIGALTSTACK_SIZE + bool "Enforce strict size checking for sigaltstack" + depends on DYNAMIC_SIGFRAME + help + For historical reasons MINSIGSTKSZ is a constant which became + already too small with AVX512 support. Add a mechanism to + enforce strict checking of the sigaltstack size against the + real size of the FPU frame. This option enables the check + by default. It can also be controlled via the kernel command + line option 'strict_sas_size' independent of this config + switch. Enabling it might break existing applications which + allocate a too small sigaltstack but 'work' because they + never get a signal delivered. + + Say 'N' unless you want to really enforce this check. + source "kernel/livepatch/Kconfig" endmenu diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 58bd07071d14..0111a6ae6e60 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -40,6 +41,7 @@ #include #include #include +#include #endif /* CONFIG_X86_64 */ #include @@ -907,6 +909,39 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) force_sig(SIGSEGV); } +#ifdef CONFIG_DYNAMIC_SIGFRAME +#ifdef CONFIG_STRICT_SIGALTSTACK_SIZE +static bool strict_sigaltstack_size __ro_after_init = true; +#else +static bool strict_sigaltstack_size __ro_after_init = false; +#endif + +static int __init strict_sas_size(char *arg) +{ + return kstrtobool(arg, &strict_sigaltstack_size); +} +__setup("strict_sas_size", strict_sas_size); + +/* + * MINSIGSTKSZ is 2048 and can't be changed despite the fact that AVX512 + * exceeds that size already. As such programs might never use the + * sigaltstack they just continued to work. While always checking against + * the real size would be correct, this might be considered a regression. + * + * Therefore avoid the sanity check, unless enforced by kernel config or + * command line option. + */ +bool sigaltstack_size_valid(size_t ss_size) +{ + lockdep_assert_held(¤t->sighand->siglock); + + if (strict_sigaltstack_size) + return ss_size > get_sigframe_size(); + + return true; +} +#endif /* CONFIG_DYNAMIC_SIGFRAME */ + #ifdef CONFIG_X86_X32_ABI COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) { From 84e4dccc8fce20b497388d756e12de5c9006eb48 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:07 -0700 Subject: [PATCH 086/110] x86/fpu/xstate: Provide xstate_calculate_size() Split out the size calculation from the paranoia check so it can be used for recalculating buffer sizes when dynamically enabled features are supported. Signed-off-by: Chang S. Bae [ tglx: Adopted to changed base code ] Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-4-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/xstate.c | 46 ++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index cbba3812a160..310c4201e056 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -549,6 +549,33 @@ static bool __init check_xstate_against_struct(int nr) return true; } +static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) +{ + unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; + int i; + + for_each_extended_xfeature(i, xfeatures) { + /* Align from the end of the previous feature */ + if (xfeature_is_aligned(i)) + size = ALIGN(size, 64); + /* + * In compacted format the enabled features are packed, + * i.e. disabled features do not occupy space. + * + * In non-compacted format the offsets are fixed and + * disabled states still occupy space in the memory buffer. + */ + if (!compacted) + size = xfeature_uncompacted_offset(i); + /* + * Add the feature size even for non-compacted format + * to make the end result correct + */ + size += xfeature_size(i); + } + return size; +} + /* * This essentially double-checks what the cpu told us about * how large the XSAVE buffer needs to be. We are recalculating @@ -575,25 +602,8 @@ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) XSTATE_WARN_ON(1); return false; } - - /* Align from the end of the previous feature */ - if (xfeature_is_aligned(i)) - size = ALIGN(size, 64); - /* - * In compacted format the enabled features are packed, - * i.e. disabled features do not occupy space. - * - * In non-compacted format the offsets are fixed and - * disabled states still occupy space in the memory buffer. - */ - if (!compacted) - size = xfeature_uncompacted_offset(i); - /* - * Add the feature size even for non-compacted format - * to make the end result correct - */ - size += xfeature_size(i); } + size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted); XSTATE_WARN_ON(size != kernel_size); return size == kernel_size; } From 6f6a7c09c4065a5b140194dfcfe4cf7104fec4d2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Oct 2021 15:55:08 -0700 Subject: [PATCH 087/110] x86/fpu: Add members to struct fpu to cache permission information Dynamically enabled features can be requested by any thread of a running process at any time. The request does neither enable the feature nor allocate larger buffers. It just stores the permission to use the feature by adding the features to the permission bitmap and by calculating the required sizes for kernel and user space. The reallocation of the kernel buffer happens when the feature is used for the first time which is caught by an exception. The permission bitmap is then checked and if the feature is permitted, then it becomes fully enabled. If not, the task dies similarly to a task which uses an undefined instruction. The size information is precomputed to allow proper sigaltstack size checks once the feature is permitted, but not yet in use because otherwise this would open race windows where too small stacks could be installed causing a later fail on signal delivery. Initialize them to the default feature set and sizes. Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-5-chang.seok.bae@intel.com --- arch/x86/include/asm/fpu/types.h | 46 ++++++++++++++++++++++++++++++++ arch/x86/kernel/fpu/core.c | 5 ++++ 2 files changed, 51 insertions(+) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index c72cb2269adc..c3ec56279767 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -352,6 +352,45 @@ struct fpstate { /* @regs is dynamically sized! Don't add anything after @regs! */ } __aligned(64); +struct fpu_state_perm { + /* + * @__state_perm: + * + * This bitmap indicates the permission for state components, which + * are available to a thread group. The permission prctl() sets the + * enabled state bits in thread_group_leader()->thread.fpu. + * + * All run time operations use the per thread information in the + * currently active fpu.fpstate which contains the xfeature masks + * and sizes for kernel and user space. + * + * This master permission field is only to be used when + * task.fpu.fpstate based checks fail to validate whether the task + * is allowed to expand it's xfeatures set which requires to + * allocate a larger sized fpstate buffer. + * + * Do not access this field directly. Use the provided helper + * function. Unlocked access is possible for quick checks. + */ + u64 __state_perm; + + /* + * @__state_size: + * + * The size required for @__state_perm. Only valid to access + * with sighand locked. + */ + unsigned int __state_size; + + /* + * @__user_state_size: + * + * The size required for @__state_perm user part. Only valid to + * access with sighand locked. + */ + unsigned int __user_state_size; +}; + /* * Highest level per task FPU state data structure that * contains the FPU register state plus various FPU @@ -395,6 +434,13 @@ struct fpu { */ struct fpstate *__task_fpstate; + /* + * @perm: + * + * Permission related information + */ + struct fpu_state_perm perm; + /* * @__fpstate: * diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 9c475e2efd4d..b05f6a3b2057 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -412,6 +412,11 @@ void fpstate_reset(struct fpu *fpu) /* Set the fpstate pointer to the default fpstate */ fpu->fpstate = &fpu->__fpstate; __fpstate_reset(fpu->fpstate); + + /* Initialize the permission related info in fpu */ + fpu->perm.__state_perm = fpu_kernel_cfg.default_features; + fpu->perm.__state_size = fpu_kernel_cfg.default_size; + fpu->perm.__user_state_size = fpu_user_cfg.default_size; } /* Clone current's FPU state on fork */ From c33f0a81a2cf3920465309ce683534751bb86485 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Oct 2021 15:55:09 -0700 Subject: [PATCH 088/110] x86/fpu: Add fpu_state_config::legacy_features The upcoming prctl() which is required to request the permission for a dynamically enabled feature will also provide an option to retrieve the supported features. If the CPU does not support XSAVE, the supported features would be 0 even when the CPU supports FP and SSE. Provide separate storage for the legacy feature set to avoid that and fill in the bits in the legacy init function. Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-6-chang.seok.bae@intel.com --- arch/x86/include/asm/fpu/types.h | 7 +++++++ arch/x86/kernel/fpu/init.c | 9 ++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index c3ec56279767..595122fcaf51 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -503,6 +503,13 @@ struct fpu_state_config { * be requested by user space before usage. */ u64 default_features; + /* + * @legacy_features: + * + * Features which can be reported back to user space + * even without XSAVE support, i.e. legacy features FP + SSE + */ + u64 legacy_features; }; /* FPU state configuration information */ diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 7074154131e6..621f4b6cac4a 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -193,12 +193,15 @@ static void __init fpu__init_system_xstate_size_legacy(void) * Note that the size configuration might be overwritten later * during fpu__init_system_xstate(). */ - if (!cpu_feature_enabled(X86_FEATURE_FPU)) + if (!cpu_feature_enabled(X86_FEATURE_FPU)) { size = sizeof(struct swregs_state); - else if (cpu_feature_enabled(X86_FEATURE_FXSR)) + } else if (cpu_feature_enabled(X86_FEATURE_FXSR)) { size = sizeof(struct fxregs_state); - else + fpu_user_cfg.legacy_features = XFEATURE_MASK_FPSSE; + } else { size = sizeof(struct fregs_state); + fpu_user_cfg.legacy_features = XFEATURE_MASK_FP; + } fpu_kernel_cfg.max_size = size; fpu_kernel_cfg.default_size = size; From db8268df0983adc2bb1fb48c9e5f7bfbb5f617f3 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:10 -0700 Subject: [PATCH 089/110] x86/arch_prctl: Add controls for dynamic XSTATE components Dynamically enabled XSTATE features are by default disabled for all processes. A process has to request permission to use such a feature. To support this implement a architecture specific prctl() with the options: - ARCH_GET_XCOMP_SUPP Copies the supported feature bitmap into the user space provided u64 storage. The pointer is handed in via arg2 - ARCH_GET_XCOMP_PERM Copies the process wide permitted feature bitmap into the user space provided u64 storage. The pointer is handed in via arg2 - ARCH_REQ_XCOMP_PERM Request permission for a feature set. A feature set can be mapped to a facility, e.g. AMX, and can require one or more XSTATE components to be enabled. The feature argument is the number of the highest XSTATE component which is required for a facility to work. The request argument is not a user supplied bitmap because that makes filtering harder (think seccomp) and even impossible because to support 32bit tasks the argument would have to be a pointer. The permission mechanism works this way: Task asks for permission for a facility and kernel checks whether that's supported. If supported it does: 1) Check whether permission has already been granted 2) Compute the size of the required kernel and user space buffer (sigframe) size. 3) Validate that no task has a sigaltstack installed which is smaller than the resulting sigframe size 4) Add the requested feature bit(s) to the permission bitmap of current->group_leader->fpu and store the sizes in the group leaders fpu struct as well. If that is successful then the feature is still not enabled for any of the tasks. The first usage of a related instruction will result in a #NM trap. The trap handler validates the permission bit of the tasks group leader and if permitted it installs a larger kernel buffer and transfers the permission and size info to the new fpstate container which makes all the FPU functions which require per task information aware of the extended feature set. [ tglx: Adopted to new base code, added missing serialization, massaged namings, comments and changelog ] Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-7-chang.seok.bae@intel.com --- arch/x86/include/asm/fpu/api.h | 4 + arch/x86/include/asm/proto.h | 2 +- arch/x86/include/uapi/asm/prctl.h | 4 + arch/x86/kernel/fpu/xstate.c | 156 ++++++++++++++++++++++++++++++ arch/x86/kernel/fpu/xstate.h | 6 ++ arch/x86/kernel/process.c | 9 +- 6 files changed, 178 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index e9379d7e7743..798ae9225f0e 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -151,4 +151,8 @@ static inline bool fpstate_is_confidential(struct fpu_guest *gfpu) return gfpu->fpstate->is_confidential; } +/* prctl */ +struct task_struct; +extern long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2); + #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 8c5d1910a848..feed36d44d04 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -40,6 +40,6 @@ void x86_report_nx(void); extern int reboot_force; long do_arch_prctl_common(struct task_struct *task, int option, - unsigned long cpuid_enabled); + unsigned long arg2); #endif /* _ASM_X86_PROTO_H */ diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index 5a6aac9fa41f..754a07856817 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -10,6 +10,10 @@ #define ARCH_GET_CPUID 0x1011 #define ARCH_SET_CPUID 0x1012 +#define ARCH_GET_XCOMP_SUPP 0x1021 +#define ARCH_GET_XCOMP_PERM 0x1022 +#define ARCH_REQ_XCOMP_PERM 0x1023 + #define ARCH_MAP_VDSO_X32 0x2001 #define ARCH_MAP_VDSO_32 0x2002 #define ARCH_MAP_VDSO_64 0x2003 diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 310c4201e056..c837cffebd4a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -18,6 +19,8 @@ #include #include +#include +#include #include "internal.h" #include "legacy.h" @@ -1298,6 +1301,159 @@ void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature) EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); #endif +#ifdef CONFIG_X86_64 +static int validate_sigaltstack(unsigned int usize) +{ + struct task_struct *thread, *leader = current->group_leader; + unsigned long framesize = get_sigframe_size(); + + lockdep_assert_held(¤t->sighand->siglock); + + /* get_sigframe_size() is based on fpu_user_cfg.max_size */ + framesize -= fpu_user_cfg.max_size; + framesize += usize; + for_each_thread(leader, thread) { + if (thread->sas_ss_size && thread->sas_ss_size < framesize) + return -ENOSPC; + } + return 0; +} + +static int __xstate_request_perm(u64 permitted, u64 requested) +{ + /* + * This deliberately does not exclude !XSAVES as we still might + * decide to optionally context switch XCR0 or talk the silicon + * vendors into extending XFD for the pre AMX states. + */ + bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + struct fpu *fpu = ¤t->group_leader->thread.fpu; + unsigned int ksize, usize; + u64 mask; + int ret; + + /* Check whether fully enabled */ + if ((permitted & requested) == requested) + return 0; + + /* Calculate the resulting kernel state size */ + mask = permitted | requested; + ksize = xstate_calculate_size(mask, compacted); + + /* Calculate the resulting user state size */ + mask &= XFEATURE_MASK_USER_SUPPORTED; + usize = xstate_calculate_size(mask, false); + + ret = validate_sigaltstack(usize); + if (ret) + return ret; + + /* Pairs with the READ_ONCE() in xstate_get_group_perm() */ + WRITE_ONCE(fpu->perm.__state_perm, requested); + /* Protected by sighand lock */ + fpu->perm.__state_size = ksize; + fpu->perm.__user_state_size = usize; + return ret; +} + +/* + * Permissions array to map facilities with more than one component + */ +static const u64 xstate_prctl_req[XFEATURE_MAX] = { + /* [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE, */ +}; + +static int xstate_request_perm(unsigned long idx) +{ + u64 permitted, requested; + int ret; + + if (idx >= XFEATURE_MAX) + return -EINVAL; + + /* + * Look up the facility mask which can require more than + * one xstate component. + */ + idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req)); + requested = xstate_prctl_req[idx]; + if (!requested) + return -EOPNOTSUPP; + + if ((fpu_user_cfg.max_features & requested) != requested) + return -EOPNOTSUPP; + + /* Lockless quick check */ + permitted = xstate_get_host_group_perm(); + if ((permitted & requested) == requested) + return 0; + + /* Protect against concurrent modifications */ + spin_lock_irq(¤t->sighand->siglock); + permitted = xstate_get_host_group_perm(); + ret = __xstate_request_perm(permitted, requested); + spin_unlock_irq(¤t->sighand->siglock); + return ret; +} +#else /* CONFIG_X86_64 */ +static inline int xstate_request_perm(unsigned long idx) +{ + return -EPERM; +} +#endif /* !CONFIG_X86_64 */ + +/** + * fpu_xstate_prctl - xstate permission operations + * @tsk: Redundant pointer to current + * @option: A subfunction of arch_prctl() + * @arg2: option argument + * Return: 0 if successful; otherwise, an error code + * + * Option arguments: + * + * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info + * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info + * ARCH_REQ_XCOMP_PERM: Facility number requested + * + * For facilities which require more than one XSTATE component, the request + * must be the highest state component number related to that facility, + * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and + * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18). + */ +long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2) +{ + u64 __user *uptr = (u64 __user *)arg2; + u64 permitted, supported; + unsigned long idx = arg2; + + if (tsk != current) + return -EPERM; + + switch (option) { + case ARCH_GET_XCOMP_SUPP: + supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features; + return put_user(supported, uptr); + + case ARCH_GET_XCOMP_PERM: + /* + * Lockless snapshot as it can also change right after the + * dropping the lock. + */ + permitted = xstate_get_host_group_perm(); + permitted &= XFEATURE_MASK_USER_SUPPORTED; + return put_user(permitted, uptr); + + case ARCH_REQ_XCOMP_PERM: + if (!IS_ENABLED(CONFIG_X86_64)) + return -EOPNOTSUPP; + + return xstate_request_perm(idx); + + default: + return -EINVAL; + } +} + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index a1aa0bad2c9c..4ce1dc030f38 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -15,6 +15,12 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; } +static inline u64 xstate_get_host_group_perm(void) +{ + /* Pairs with WRITE_ONCE() in xstate_request_perm() */ + return READ_ONCE(current->group_leader->thread.fpu.perm.__state_perm); +} + enum xstate_copy_mode { XSTATE_COPY_FP, XSTATE_COPY_FX, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c74c7e889e9d..97fea1649a5e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -1003,13 +1004,17 @@ out: } long do_arch_prctl_common(struct task_struct *task, int option, - unsigned long cpuid_enabled) + unsigned long arg2) { switch (option) { case ARCH_GET_CPUID: return get_cpuid_mode(); case ARCH_SET_CPUID: - return set_cpuid_mode(task, cpuid_enabled); + return set_cpuid_mode(task, arg2); + case ARCH_GET_XCOMP_SUPP: + case ARCH_GET_XCOMP_PERM: + case ARCH_REQ_XCOMP_PERM: + return fpu_xstate_prctl(task, option, arg2); } return -EINVAL; From 23686ef25d4ae81bc12fe3994d1905191fcf71f8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Oct 2021 15:55:11 -0700 Subject: [PATCH 090/110] x86/fpu: Add basic helpers for dynamically enabled features To allow building up the infrastructure required to support dynamically enabled FPU features, add: - XFEATURES_MASK_DYNAMIC This constant will hold xfeatures which can be dynamically enabled. - fpu_state_size_dynamic() A static branch for 64-bit and a simple 'return false' for 32-bit. This helper allows to add dynamic-feature-specific changes to common code which is shared between 32-bit and 64-bit without #ifdeffery. Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-8-chang.seok.bae@intel.com --- arch/x86/include/asm/fpu/xstate.h | 21 +++++++++++++++++++++ arch/x86/kernel/fpu/core.c | 4 ++++ 2 files changed, 25 insertions(+) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 43ae89d4bcd2..cf285464eabe 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -43,6 +43,9 @@ #define XFEATURE_MASK_USER_RESTORE \ (XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_PKRU) +/* Features which are dynamically enabled for a process on request */ +#define XFEATURE_MASK_USER_DYNAMIC 0ULL + /* All currently supported supervisor features */ #define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID) @@ -96,4 +99,22 @@ int xfeature_size(int xfeature_nr); void xsaves(struct xregs_state *xsave, u64 mask); void xrstors(struct xregs_state *xsave, u64 mask); +#ifdef CONFIG_X86_64 +DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); +#endif + +#ifdef CONFIG_X86_64 +DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); + +static __always_inline __pure bool fpu_state_size_dynamic(void) +{ + return static_branch_unlikely(&__fpu_state_size_dynamic); +} +#else +static __always_inline __pure bool fpu_state_size_dynamic(void) +{ + return false; +} +#endif + #endif diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b05f6a3b2057..4018083c5b36 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -25,6 +25,10 @@ #define CREATE_TRACE_POINTS #include +#ifdef CONFIG_X86_64 +DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); +#endif + /* The FPU state configuration data for kernel and user space */ struct fpu_state_config fpu_kernel_cfg __ro_after_init; struct fpu_state_config fpu_user_cfg __ro_after_init; From 4b7ca609a33dd8696bcbd2f1ad949e26a591592f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Oct 2021 15:55:12 -0700 Subject: [PATCH 091/110] x86/signal: Use fpu::__state_user_size for sigalt stack validation Use the current->group_leader->fpu to check for pending permissions to use extended features and validate against the resulting user space size which is stored in the group leaders fpu struct as well. This prevents a task from installing a too small sized sigaltstack after permissions to use dynamically enabled features have been granted, but the task has not (yet) used a related instruction. Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-9-chang.seok.bae@intel.com --- arch/x86/kernel/signal.c | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 0111a6ae6e60..ec71e06ae364 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -720,12 +721,15 @@ badframe: /* max_frame_size tells userspace the worst case signal stack size. */ static unsigned long __ro_after_init max_frame_size; +static unsigned int __ro_after_init fpu_default_state_size; void __init init_sigframe_size(void) { + fpu_default_state_size = fpu__get_fpstate_size(); + max_frame_size = MAX_FRAME_SIGINFO_UCTXT_SIZE + MAX_FRAME_PADDING; - max_frame_size += fpu__get_fpstate_size() + MAX_XSAVE_PADDING; + max_frame_size += fpu_default_state_size + MAX_XSAVE_PADDING; /* Userspace expects an aligned size. */ max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT); @@ -928,15 +932,38 @@ __setup("strict_sas_size", strict_sas_size); * sigaltstack they just continued to work. While always checking against * the real size would be correct, this might be considered a regression. * - * Therefore avoid the sanity check, unless enforced by kernel config or - * command line option. + * Therefore avoid the sanity check, unless enforced by kernel + * configuration or command line option. + * + * When dynamic FPU features are supported, the check is also enforced when + * the task has permissions to use dynamic features. Tasks which have no + * permission are checked against the size of the non-dynamic feature set + * if strict checking is enabled. This avoids forcing all tasks on the + * system to allocate large sigaltstacks even if they are never going + * to use a dynamic feature. As this is serialized via sighand::siglock + * any permission request for a dynamic feature either happened already + * or will see the newly install sigaltstack size in the permission checks. */ bool sigaltstack_size_valid(size_t ss_size) { + unsigned long fsize = max_frame_size - fpu_default_state_size; + u64 mask; + lockdep_assert_held(¤t->sighand->siglock); + if (!fpu_state_size_dynamic() && !strict_sigaltstack_size) + return true; + + fsize += current->group_leader->thread.fpu.perm.__user_state_size; + if (likely(ss_size > fsize)) + return true; + if (strict_sigaltstack_size) - return ss_size > get_sigframe_size(); + return ss_size > fsize; + + mask = current->group_leader->thread.fpu.perm.__state_perm; + if (mask & XFEATURE_MASK_USER_DYNAMIC) + return ss_size > fsize; return true; } From 53599b4d54b9b8dda1d537a558946869d2acbddc Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:13 -0700 Subject: [PATCH 092/110] x86/fpu/signal: Prepare for variable sigframe length The software reserved portion of the fxsave frame in the signal frame is copied from structures which have been set up at boot time. With dynamically enabled features the content of these structures is no longer correct because the xfeatures and size can be different per task. Calculate the software reserved portion at runtime and fill in the xfeatures and size values from the tasks active fpstate. Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-10-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/internal.h | 3 -- arch/x86/kernel/fpu/signal.c | 62 ++++++++++++++-------------------- arch/x86/kernel/fpu/xstate.c | 1 - 3 files changed, 26 insertions(+), 40 deletions(-) diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h index e1d8a352f12d..dbdb31f55fc7 100644 --- a/arch/x86/kernel/fpu/internal.h +++ b/arch/x86/kernel/fpu/internal.h @@ -21,9 +21,6 @@ static __always_inline __pure bool use_fxsr(void) # define WARN_ON_FPU(x) ({ (void)(x); 0; }) #endif -/* Init functions */ -extern void fpu__init_prepare_fx_sw_frame(void); - /* Used in init.c */ extern void fpstate_init_user(struct fpstate *fpstate); extern void fpstate_reset(struct fpu *fpu); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 3e42e6e8b56c..3b7f7d07c0b5 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -20,9 +20,6 @@ #include "legacy.h" #include "xstate.h" -static struct _fpx_sw_bytes fx_sw_reserved __ro_after_init; -static struct _fpx_sw_bytes fx_sw_reserved_ia32 __ro_after_init; - /* * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. @@ -98,23 +95,42 @@ static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf) return true; } +/* + * Prepare the SW reserved portion of the fxsave memory layout, indicating + * the presence of the extended state information in the memory layout + * pointed to by the fpstate pointer in the sigcontext. + * This is saved when ever the FP and extended state context is + * saved on the user stack during the signal handler delivery to the user. + */ +static inline void save_sw_bytes(struct _fpx_sw_bytes *sw_bytes, bool ia32_frame, + struct fpstate *fpstate) +{ + sw_bytes->magic1 = FP_XSTATE_MAGIC1; + sw_bytes->extended_size = fpstate->user_size + FP_XSTATE_MAGIC2_SIZE; + sw_bytes->xfeatures = fpstate->user_xfeatures; + sw_bytes->xstate_size = fpstate->user_size; + + if (ia32_frame) + sw_bytes->extended_size += sizeof(struct fregs_state); +} + static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, - unsigned int usize) + struct fpstate *fpstate) { struct xregs_state __user *x = buf; - struct _fpx_sw_bytes *sw_bytes; + struct _fpx_sw_bytes sw_bytes; u32 xfeatures; int err; /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ - sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved; - err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); + save_sw_bytes(&sw_bytes, ia32_frame, fpstate); + err = __copy_to_user(&x->i387.sw_reserved, &sw_bytes, sizeof(sw_bytes)); if (!use_xsave()) return !err; err |= __put_user(FP_XSTATE_MAGIC2, - (__u32 __user *)(buf + usize)); + (__u32 __user *)(buf + fpstate->user_size)); /* * Read the xfeatures which we copied (directly from the cpu or @@ -173,7 +189,7 @@ bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { struct task_struct *tsk = current; struct fpstate *fpstate = tsk->thread.fpu.fpstate; - int ia32_fxstate = (buf != buf_fx); + bool ia32_fxstate = (buf != buf_fx); int ret; ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || @@ -226,8 +242,7 @@ retry: if ((ia32_fxstate || !use_fxsr()) && !save_fsave_header(tsk, buf)) return false; - if (use_fxsr() && - !save_xstate_epilog(buf_fx, ia32_fxstate, fpstate->user_size)) + if (use_fxsr() && !save_xstate_epilog(buf_fx, ia32_fxstate, fpstate)) return false; return true; @@ -523,28 +538,3 @@ unsigned long __init fpu__get_fpstate_size(void) return ret; } -/* - * Prepare the SW reserved portion of the fxsave memory layout, indicating - * the presence of the extended state information in the memory layout - * pointed by the fpstate pointer in the sigcontext. - * This will be saved when ever the FP and extended state context is - * saved on the user stack during the signal handler delivery to the user. - */ -void __init fpu__init_prepare_fx_sw_frame(void) -{ - int size = fpu_user_cfg.default_size + FP_XSTATE_MAGIC2_SIZE; - - fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; - fx_sw_reserved.extended_size = size; - fx_sw_reserved.xfeatures = fpu_user_cfg.default_features; - fx_sw_reserved.xstate_size = fpu_user_cfg.default_size; - - if (IS_ENABLED(CONFIG_IA32_EMULATION) || - IS_ENABLED(CONFIG_X86_32)) { - int fsave_header_size = sizeof(struct fregs_state); - - fx_sw_reserved_ia32 = fx_sw_reserved; - fx_sw_reserved_ia32.extended_size = size + fsave_header_size; - } -} - diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c837cffebd4a..bf42ee22de15 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -830,7 +830,6 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) update_regset_xstate_info(fpu_user_cfg.max_size, fpu_user_cfg.max_features); - fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); setup_xstate_comp_offsets(); setup_supervisor_only_offsets(); From 9e798e9aa14c45fb94e47b30bf6347b369ce9df7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Oct 2021 15:55:14 -0700 Subject: [PATCH 093/110] x86/fpu: Prepare fpu_clone() for dynamically enabled features The default portion of the parent's FPU state is saved in a child task. With dynamic features enabled, the non-default portion is not saved in a child's fpstate because these register states are defined to be caller-saved. The new task's fpstate is therefore the default buffer. Fork inherits the permission of the parent. Also, do not use memcpy() when TIF_NEED_FPU_LOAD is set because it is invalid when the parent has dynamic features. Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-11-chang.seok.bae@intel.com --- arch/x86/include/asm/fpu/sched.h | 2 +- arch/x86/kernel/fpu/core.c | 35 +++++++++++++++++++++++--------- arch/x86/kernel/process.c | 2 +- 3 files changed, 27 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h index cdb78d590c86..99a8820e8cc4 100644 --- a/arch/x86/include/asm/fpu/sched.h +++ b/arch/x86/include/asm/fpu/sched.h @@ -11,7 +11,7 @@ extern void save_fpregs_to_fpstate(struct fpu *fpu); extern void fpu__drop(struct fpu *fpu); -extern int fpu_clone(struct task_struct *dst); +extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags); extern void fpu_flush_thread(void); /* diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 4018083c5b36..1ff6b83094a1 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -423,8 +423,20 @@ void fpstate_reset(struct fpu *fpu) fpu->perm.__user_state_size = fpu_user_cfg.default_size; } +static inline void fpu_inherit_perms(struct fpu *dst_fpu) +{ + if (fpu_state_size_dynamic()) { + struct fpu *src_fpu = ¤t->group_leader->thread.fpu; + + spin_lock_irq(¤t->sighand->siglock); + /* Fork also inherits the permissions of the parent */ + dst_fpu->perm = src_fpu->perm; + spin_unlock_irq(¤t->sighand->siglock); + } +} + /* Clone current's FPU state on fork */ -int fpu_clone(struct task_struct *dst) +int fpu_clone(struct task_struct *dst, unsigned long clone_flags) { struct fpu *src_fpu = ¤t->thread.fpu; struct fpu *dst_fpu = &dst->thread.fpu; @@ -455,17 +467,20 @@ int fpu_clone(struct task_struct *dst) } /* - * If the FPU registers are not owned by current just memcpy() the - * state. Otherwise save the FPU registers directly into the - * child's FPU context, without any memory-to-memory copying. + * Save the default portion of the current FPU state into the + * clone. Assume all dynamic features to be defined as caller- + * saved, which enables skipping both the expansion of fpstate + * and the copying of any dynamic state. + * + * Do not use memcpy() when TIF_NEED_FPU_LOAD is set because + * copying is not valid when current uses non-default states. */ fpregs_lock(); - if (test_thread_flag(TIF_NEED_FPU_LOAD)) { - memcpy(&dst_fpu->fpstate->regs, &src_fpu->fpstate->regs, - dst_fpu->fpstate->size); - } else { - save_fpregs_to_fpstate(dst_fpu); - } + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + fpregs_restore_userregs(); + save_fpregs_to_fpstate(dst_fpu); + if (!(clone_flags & CLONE_THREAD)) + fpu_inherit_perms(dst_fpu); fpregs_unlock(); trace_x86_fpu_copy_src(src_fpu); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 97fea1649a5e..99025e32f105 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -157,7 +157,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, frame->flags = X86_EFLAGS_FIXED; #endif - fpu_clone(p); + fpu_clone(p, clone_flags); /* Kernel thread ? */ if (unlikely(p->flags & PF_KTHREAD)) { From e61d6310a0f80cb986fd2076d432760b3619fb6d Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:15 -0700 Subject: [PATCH 094/110] x86/fpu: Reset permission and fpstate on exec() On exec(), extended register states saved in the buffer is cleared. With dynamic features, each task carries variables besides the register states. The struct fpu has permission information and struct fpstate contains buffer size and feature masks. They are all dynamically updated with dynamic features. Reset the current task's entire FPU data before an exec() so that the new task starts with default permission and fpstate. Rename the register state reset function because the old naming confuses as it does not reset struct fpstate. Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-12-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 1ff6b83094a1..3349068cef7d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -544,7 +544,7 @@ static inline void restore_fpregs_from_init_fpstate(u64 features_mask) /* * Reset current->fpu memory state to the init values. */ -static void fpu_reset_fpstate(void) +static void fpu_reset_fpregs(void) { struct fpu *fpu = ¤t->thread.fpu; @@ -579,7 +579,7 @@ void fpu__clear_user_states(struct fpu *fpu) fpregs_lock(); if (!cpu_feature_enabled(X86_FEATURE_FPU)) { - fpu_reset_fpstate(); + fpu_reset_fpregs(); fpregs_unlock(); return; } @@ -609,7 +609,8 @@ void fpu__clear_user_states(struct fpu *fpu) void fpu_flush_thread(void) { - fpu_reset_fpstate(); + fpstate_reset(¤t->thread.fpu); + fpu_reset_fpregs(); } /* * Load FPU context before returning to userspace. From c351101678ce54492b6e09810ec02efc0df036a9 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:16 -0700 Subject: [PATCH 095/110] x86/cpufeatures: Add eXtended Feature Disabling (XFD) feature bit Intel's eXtended Feature Disable (XFD) feature is an extension of the XSAVE architecture. XFD allows the kernel to enable a feature state in XCR0 and to receive a #NM trap when a task uses instructions accessing that state. This is going to be used to postpone the allocation of a larger XSTATE buffer for a task to the point where it is actually using a related instruction after the permission to use that facility has been granted. XFD is not used by the kernel, but only applied to userspace. This is a matter of policy as the kernel knows how a fpstate is reallocated and the XFD state. The compacted XSAVE format is adjustable for dynamic features. Make XFD depend on XSAVES. Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-13-chang.seok.bae@intel.com --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/cpuid-deps.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index d0ce5cfd3ac1..ab7b3a2de85d 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -277,6 +277,7 @@ #define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */ #define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ #define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ +#define X86_FEATURE_XFD (10*32+ 4) /* "" eXtended Feature Disabling */ /* * Extended auxiliary flags: Linux defined - for features scattered in various diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index defda61f372d..d9ead9c20408 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -75,6 +75,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_SGX_LC, X86_FEATURE_SGX }, { X86_FEATURE_SGX1, X86_FEATURE_SGX }, { X86_FEATURE_SGX2, X86_FEATURE_SGX1 }, + { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, {} }; From dae1bd58389615d401a84aedc38fa075ef8f7de6 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:17 -0700 Subject: [PATCH 096/110] x86/msr-index: Add MSRs for XFD XFD introduces two MSRs: - IA32_XFD to enable/disable a feature controlled by XFD - IA32_XFD_ERR to expose to the #NM trap handler which feature was tried to be used for the first time. Both use the same xstate-component bitmap format, used by XCR0. Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-14-chang.seok.bae@intel.com --- arch/x86/include/asm/msr-index.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index a7c413432b33..01e2650b9585 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -625,6 +625,8 @@ #define MSR_IA32_BNDCFGS_RSVD 0x00000ffc +#define MSR_IA32_XFD 0x000001c4 +#define MSR_IA32_XFD_ERR 0x000001c5 #define MSR_IA32_XSS 0x00000da0 #define MSR_IA32_APICBASE 0x0000001b From 8bf26758ca9659866b844dd51037314b4c0fa6bd Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:18 -0700 Subject: [PATCH 097/110] x86/fpu: Add XFD state to fpstate Add storage for XFD register state to struct fpstate. This will be used to store the XFD MSR state. This will be used for switching the XFD MSR when FPU content is restored. Add a per-CPU variable to cache the current MSR value so the MSR has only to be written when the values are different. Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-15-chang.seok.bae@intel.com --- arch/x86/include/asm/fpu/types.h | 3 +++ arch/x86/kernel/fpu/core.c | 2 ++ arch/x86/kernel/fpu/xstate.h | 4 ++++ 3 files changed, 9 insertions(+) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 595122fcaf51..b1897638d68d 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -322,6 +322,9 @@ struct fpstate { /* @user_xfeatures: xfeatures valid in UABI buffers */ u64 user_xfeatures; + /* @xfd: xfeatures disabled to trap userspace use. */ + u64 xfd; + /* @is_valloc: Indicator for dynamically allocated state */ unsigned int is_valloc : 1; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 3349068cef7d..3b72cddf990d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -27,6 +27,7 @@ #ifdef CONFIG_X86_64 DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); +DEFINE_PER_CPU(u64, xfd_state); #endif /* The FPU state configuration data for kernel and user space */ @@ -409,6 +410,7 @@ static void __fpstate_reset(struct fpstate *fpstate) fpstate->user_size = fpu_user_cfg.default_size; fpstate->xfeatures = fpu_kernel_cfg.default_features; fpstate->user_xfeatures = fpu_user_cfg.default_features; + fpstate->xfd = init_fpstate.xfd; } void fpstate_reset(struct fpu *fpu) diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 4ce1dc030f38..32a4dee4de3b 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -5,6 +5,10 @@ #include #include +#ifdef CONFIG_X86_64 +DECLARE_PER_CPU(u64, xfd_state); +#endif + static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) { /* From 5529acf47ec31ece0815f69d43f5e6a1e485a0f3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Oct 2021 15:55:19 -0700 Subject: [PATCH 098/110] x86/fpu: Add sanity checks for XFD Add debug functionality to ensure that the XFD MSR is up to date for XSAVE* and XRSTOR* operations. [ tglx: Improve comment. ] Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-16-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/core.c | 9 +++--- arch/x86/kernel/fpu/signal.c | 6 ++-- arch/x86/kernel/fpu/xstate.c | 58 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/fpu/xstate.h | 34 ++++++++++++++++++--- 4 files changed, 95 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 3b72cddf990d..b5f5b08b84d7 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -166,7 +166,7 @@ void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) */ mask = fpu_kernel_cfg.max_features & mask; - os_xrstor(&fpstate->regs.xsave, mask); + os_xrstor(fpstate, mask); } else { if (use_fxsr()) fxrstor(&fpstate->regs.fxsave); @@ -534,7 +534,7 @@ void fpu__drop(struct fpu *fpu) static inline void restore_fpregs_from_init_fpstate(u64 features_mask) { if (use_xsave()) - os_xrstor(&init_fpstate.regs.xsave, features_mask); + os_xrstor(&init_fpstate, features_mask); else if (use_fxsr()) fxrstor(&init_fpstate.regs.fxsave); else @@ -591,9 +591,8 @@ void fpu__clear_user_states(struct fpu *fpu) * corresponding registers. */ if (xfeatures_mask_supervisor() && - !fpregs_state_valid(fpu, smp_processor_id())) { - os_xrstor(&fpu->fpstate->regs.xsave, xfeatures_mask_supervisor()); - } + !fpregs_state_valid(fpu, smp_processor_id())) + os_xrstor_supervisor(fpu->fpstate); /* Reset user states in registers. */ restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 3b7f7d07c0b5..16fdecd02341 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -261,7 +261,7 @@ static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures, ret = fxrstor_from_user_sigframe(buf); if (!ret && unlikely(init_bv)) - os_xrstor(&init_fpstate.regs.xsave, init_bv); + os_xrstor(&init_fpstate, init_bv); return ret; } else if (use_fxsr()) { return fxrstor_from_user_sigframe(buf); @@ -322,7 +322,7 @@ retry: * been restored from a user buffer directly. */ if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor()) - os_xrstor(&fpu->fpstate->regs.xsave, xfeatures_mask_supervisor()); + os_xrstor_supervisor(fpu->fpstate); fpregs_mark_activate(); fpregs_unlock(); @@ -432,7 +432,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, u64 mask = user_xfeatures | xfeatures_mask_supervisor(); fpregs->xsave.header.xfeatures &= mask; - success = !os_xrstor_safe(&fpregs->xsave, + success = !os_xrstor_safe(fpu->fpstate, fpu_kernel_cfg.max_features); } else { success = !fxrstor_safe(&fpregs->fxsave); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index bf42ee22de15..603edeb7b913 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1301,6 +1301,64 @@ EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); #endif #ifdef CONFIG_X86_64 + +#ifdef CONFIG_X86_DEBUG_FPU +/* + * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask + * can safely operate on the @fpstate buffer. + */ +static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor) +{ + u64 xfd = __this_cpu_read(xfd_state); + + if (fpstate->xfd == xfd) + return true; + + /* + * The XFD MSR does not match fpstate->xfd. That's invalid when + * the passed in fpstate is current's fpstate. + */ + if (fpstate->xfd == current->thread.fpu.fpstate->xfd) + return false; + + /* + * XRSTOR(S) from init_fpstate are always correct as it will just + * bring all components into init state and not read from the + * buffer. XSAVE(S) raises #PF after init. + */ + if (fpstate == &init_fpstate) + return rstor; + + /* + * XSAVE(S): clone(), fpu_swap_kvm_fpu() + * XRSTORS(S): fpu_swap_kvm_fpu() + */ + + /* + * No XSAVE/XRSTOR instructions (except XSAVE itself) touch + * the buffer area for XFD-disabled state components. + */ + mask &= ~xfd; + + /* + * Remove features which are valid in fpstate. They + * have space allocated in fpstate. + */ + mask &= ~fpstate->xfeatures; + + /* + * Any remaining state components in 'mask' might be written + * by XSAVE/XRSTOR. Fail validation it found. + */ + return !mask; +} + +void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) +{ + WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor)); +} +#endif /* CONFIG_X86_DEBUG_FPU */ + static int validate_sigaltstack(unsigned int usize) { struct task_struct *thread, *leader = current->group_leader; diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 32a4dee4de3b..29024244965b 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -130,6 +130,12 @@ static inline u64 xfeatures_mask_independent(void) : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU) +extern void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor); +#else +static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { } +#endif + /* * Save processor xstate to xsave area. * @@ -144,6 +150,7 @@ static inline void os_xsave(struct fpstate *fpstate) int err; WARN_ON_FPU(!alternatives_patched); + xfd_validate_state(fpstate, mask, false); XSTATE_XSAVE(&fpstate->regs.xsave, lmask, hmask, err); @@ -156,12 +163,23 @@ static inline void os_xsave(struct fpstate *fpstate) * * Uses XRSTORS when XSAVES is used, XRSTOR otherwise. */ -static inline void os_xrstor(struct xregs_state *xstate, u64 mask) +static inline void os_xrstor(struct fpstate *fpstate, u64 mask) { u32 lmask = mask; u32 hmask = mask >> 32; - XSTATE_XRESTORE(xstate, lmask, hmask); + xfd_validate_state(fpstate, mask, true); + XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); +} + +/* Restore of supervisor state. Does not require XFD */ +static inline void os_xrstor_supervisor(struct fpstate *fpstate) +{ + u64 mask = xfeatures_mask_supervisor(); + u32 lmask = mask; + u32 hmask = mask >> 32; + + XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); } /* @@ -184,11 +202,14 @@ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) * internally, e.g. PKRU. That's user space ABI and also required * to allow the signal handler to modify PKRU. */ - u64 mask = current->thread.fpu.fpstate->user_xfeatures; + struct fpstate *fpstate = current->thread.fpu.fpstate; + u64 mask = fpstate->user_xfeatures; u32 lmask = mask; u32 hmask = mask >> 32; int err; + xfd_validate_state(fpstate, mask, false); + stac(); XSTATE_OP(XSAVE, buf, lmask, hmask, err); clac(); @@ -206,6 +227,8 @@ static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 u32 hmask = mask >> 32; int err; + xfd_validate_state(current->thread.fpu.fpstate, mask, true); + stac(); XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); clac(); @@ -217,12 +240,15 @@ static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 * Restore xstate from kernel space xsave area, return an error code instead of * an exception. */ -static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask) +static inline int os_xrstor_safe(struct fpstate *fpstate, u64 mask) { + struct xregs_state *xstate = &fpstate->regs.xsave; u32 lmask = mask; u32 hmask = mask >> 32; int err; + /* Must enforce XFD update here */ + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else From 672365477ae8afca5a1cca98c1deb733235e4525 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:20 -0700 Subject: [PATCH 099/110] x86/fpu: Update XFD state where required The IA32_XFD_MSR allows to arm #NM traps for XSTATE components which are enabled in XCR0. The register has to be restored before the tasks XSTATE is restored. The life time rules are the same as for FPU state. XFD is updated on return to userspace only when the FPU state of the task is not up to date in the registers. It's updated before the XRSTORS so that eventually enabled dynamic features are restored as well and not brought into init state. Also in signal handling for restoring FPU state from user space the correctness of the XFD state has to be ensured. Add it to CPU initialization and resume as well. Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211021225527.10184-17-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/context.h | 2 ++ arch/x86/kernel/fpu/core.c | 28 +++++++++++++++++++++++++++- arch/x86/kernel/fpu/signal.c | 2 ++ arch/x86/kernel/fpu/xstate.c | 12 ++++++++++++ arch/x86/kernel/fpu/xstate.h | 19 ++++++++++++++++++- 5 files changed, 61 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h index a06ebf315d83..958accf2ccf0 100644 --- a/arch/x86/kernel/fpu/context.h +++ b/arch/x86/kernel/fpu/context.h @@ -69,6 +69,8 @@ static inline void fpregs_restore_userregs(void) * correct because it was either set in switch_to() or in * flush_thread(). So it is excluded because it might be * not up to date in current->thread.fpu.xsave state. + * + * XFD state is handled in restore_fpregs_from_fpstate(). */ restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b5f5b08b84d7..12ca174891dc 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -155,6 +155,23 @@ void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) } if (use_xsave()) { + /* + * Dynamically enabled features are enabled in XCR0, but + * usage requires also that the corresponding bits in XFD + * are cleared. If the bits are set then using a related + * instruction will raise #NM. This allows to do the + * allocation of the larger FPU buffer lazy from #NM or if + * the task has no permission to kill it which would happen + * via #UD if the feature is disabled in XCR0. + * + * XFD state is following the same life time rules as + * XSTATE and to restore state correctly XFD has to be + * updated before XRSTORS otherwise the component would + * stay in or go into init state even if the bits are set + * in fpstate::regs::xsave::xfeatures. + */ + xfd_update_state(fpstate); + /* * Restoring state always needs to modify all features * which are in @mask even if the current task cannot use @@ -241,8 +258,17 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) cur_fps = fpu->fpstate; - if (!cur_fps->is_confidential) + if (!cur_fps->is_confidential) { + /* Includes XFD update */ restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE); + } else { + /* + * XSTATE is restored by firmware from encrypted + * memory. Make sure XFD state is correct while + * running with guest fpstate + */ + xfd_update_state(cur_fps); + } fpregs_mark_activate(); fpregs_unlock(); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 16fdecd02341..cc977da6e128 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -282,6 +282,8 @@ static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, retry: fpregs_lock(); + /* Ensure that XFD is up to date */ + xfd_update_state(fpu->fpstate); pagefault_disable(); ret = __restore_fpregs_from_user(buf, fpu->fpstate->user_xfeatures, xrestore, fx_only); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 603edeb7b913..77739b0a56d5 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -136,6 +136,15 @@ void fpu__init_cpu_xstate(void) cr4_set_bits(X86_CR4_OSXSAVE); + /* + * Must happen after CR4 setup and before xsetbv() to allow KVM + * lazy passthrough. Write independent of the dynamic state static + * key as that does not work on the boot CPU. This also ensures + * that any stale state is wiped out from XFD. + */ + if (cpu_feature_enabled(X86_FEATURE_XFD)) + wrmsrl(MSR_IA32_XFD, init_fpstate.xfd); + /* * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user @@ -875,6 +884,9 @@ void fpu__resume_cpu(void) wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } + + if (fpu_state_size_dynamic()) + wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd); } /* diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 29024244965b..e18210dff88c 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -136,6 +136,22 @@ extern void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor); static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { } #endif +#ifdef CONFIG_X86_64 +static inline void xfd_update_state(struct fpstate *fpstate) +{ + if (fpu_state_size_dynamic()) { + u64 xfd = fpstate->xfd; + + if (__this_cpu_read(xfd_state) != xfd) { + wrmsrl(MSR_IA32_XFD, xfd); + __this_cpu_write(xfd_state, xfd); + } + } +} +#else +static inline void xfd_update_state(struct fpstate *fpstate) { } +#endif + /* * Save processor xstate to xsave area. * @@ -247,7 +263,8 @@ static inline int os_xrstor_safe(struct fpstate *fpstate, u64 mask) u32 hmask = mask >> 32; int err; - /* Must enforce XFD update here */ + /* Ensure that XFD is up to date */ + xfd_update_state(fpstate); if (cpu_feature_enabled(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); From 783e87b404956f8958657aed8a6a72aa98d5b7e1 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:21 -0700 Subject: [PATCH 100/110] x86/fpu/xstate: Add XFD #NM handler If the XFD MSR has feature bits set then #NM will be raised when user space attempts to use an instruction related to one of these features. When the task has no permissions to use that feature, raise SIGILL, which is the same behavior as #UD. If the task has permissions, calculate the new buffer size for the extended feature set and allocate a larger fpstate. In the unlikely case that vzalloc() fails, SIGSEGV is raised. The allocation function will be added in the next step. Provide a stub which fails for now. [ tglx: Updated serialization ] Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211021225527.10184-18-chang.seok.bae@intel.com --- arch/x86/include/asm/fpu/xstate.h | 2 ++ arch/x86/kernel/fpu/xstate.c | 47 +++++++++++++++++++++++++++++++ arch/x86/kernel/traps.c | 38 +++++++++++++++++++++++++ 3 files changed, 87 insertions(+) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index cf285464eabe..b7b145cad019 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -99,6 +99,8 @@ int xfeature_size(int xfeature_nr); void xsaves(struct xregs_state *xsave, u64 mask); void xrstors(struct xregs_state *xsave, u64 mask); +int xfd_enable_feature(u64 xfd_err); + #ifdef CONFIG_X86_64 DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); #endif diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 77739b0a56d5..3d38558d594f 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1464,6 +1464,53 @@ static int xstate_request_perm(unsigned long idx) spin_unlock_irq(¤t->sighand->siglock); return ret; } + +/* Place holder for now */ +static int fpstate_realloc(u64 xfeatures, unsigned int ksize, + unsigned int usize) +{ + return -ENOMEM; +} + +int xfd_enable_feature(u64 xfd_err) +{ + u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; + unsigned int ksize, usize; + struct fpu *fpu; + + if (!xfd_event) { + pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); + return 0; + } + + /* Protect against concurrent modifications */ + spin_lock_irq(¤t->sighand->siglock); + + /* If not permitted let it die */ + if ((xstate_get_host_group_perm() & xfd_event) != xfd_event) { + spin_unlock_irq(¤t->sighand->siglock); + return -EPERM; + } + + fpu = ¤t->group_leader->thread.fpu; + ksize = fpu->perm.__state_size; + usize = fpu->perm.__user_state_size; + /* + * The feature is permitted. State size is sufficient. Dropping + * the lock is safe here even if more features are added from + * another task, the retrieved buffer sizes are valid for the + * currently requested feature(s). + */ + spin_unlock_irq(¤t->sighand->siglock); + + /* + * Try to allocate a new fpstate. If that fails there is no way + * out. + */ + if (fpstate_realloc(xfd_event, ksize, usize)) + return -EFAULT; + return 0; +} #else /* CONFIG_X86_64 */ static inline int xstate_request_perm(unsigned long idx) { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index bae7582c58f5..6ca1454a65d4 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -1108,10 +1108,48 @@ DEFINE_IDTENTRY(exc_spurious_interrupt_bug) */ } +static bool handle_xfd_event(struct pt_regs *regs) +{ + u64 xfd_err; + int err; + + if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD)) + return false; + + rdmsrl(MSR_IA32_XFD_ERR, xfd_err); + if (!xfd_err) + return false; + + wrmsrl(MSR_IA32_XFD_ERR, 0); + + /* Die if that happens in kernel space */ + if (WARN_ON(!user_mode(regs))) + return false; + + local_irq_enable(); + + err = xfd_enable_feature(xfd_err); + + switch (err) { + case -EPERM: + force_sig_fault(SIGILL, ILL_ILLOPC, error_get_trap_addr(regs)); + break; + case -EFAULT: + force_sig(SIGSEGV); + break; + } + + local_irq_disable(); + return true; +} + DEFINE_IDTENTRY(exc_device_not_available) { unsigned long cr0 = read_cr0(); + if (handle_xfd_event(regs)) + return; + #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) { struct math_emu_info info = { }; From 500afbf645a040a39e1af0dba2fdf6ebf224bd47 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:22 -0700 Subject: [PATCH 101/110] x86/fpu/xstate: Add fpstate_realloc()/free() The fpstate embedded in struct fpu is the default state for storing the FPU registers. It's sized so that the default supported features can be stored. For dynamically enabled features the register buffer is too small. The #NM handler detects first use of a feature which is disabled in the XFD MSR. After handling permission checks it recalculates the size for kernel space and user space state and invokes fpstate_realloc() which tries to reallocate fpstate and install it. Provide the allocator function which checks whether the current buffer size is sufficient and if not allocates one. If allocation is successful the new fpstate is initialized with the new features and sizes and the now enabled features is removed from the task's XFD mask. realloc_fpstate() uses vzalloc(). If use of this mechanism grows to re-allocate buffers larger than 64KB, a more sophisticated allocation scheme that includes purpose-built reclaim capability might be justified. Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211021225527.10184-19-chang.seok.bae@intel.com --- arch/x86/include/asm/fpu/api.h | 7 +++ arch/x86/kernel/fpu/xstate.c | 97 +++++++++++++++++++++++++++++++--- arch/x86/kernel/process.c | 10 ++++ 3 files changed, 106 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 798ae9225f0e..b7267b9e452f 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -130,6 +130,13 @@ static inline void fpstate_init_soft(struct swregs_state *soft) {} /* State tracking */ DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); +/* Process cleanup */ +#ifdef CONFIG_X86_64 +extern void fpstate_free(struct fpu *fpu); +#else +static inline void fpstate_free(struct fpu *fpu) { } +#endif + /* fpstate-related functions which are exported to KVM */ extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 3d38558d594f..db0bfc2db8bf 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -22,6 +23,7 @@ #include #include +#include "context.h" #include "internal.h" #include "legacy.h" #include "xstate.h" @@ -1371,6 +1373,91 @@ void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) } #endif /* CONFIG_X86_DEBUG_FPU */ +void fpstate_free(struct fpu *fpu) +{ + if (fpu->fpstate || fpu->fpstate != &fpu->__fpstate) + vfree(fpu->fpstate); +} + +/** + * fpu_install_fpstate - Update the active fpstate in the FPU + * + * @fpu: A struct fpu * pointer + * @newfps: A struct fpstate * pointer + * + * Returns: A null pointer if the last active fpstate is the embedded + * one or the new fpstate is already installed; + * otherwise, a pointer to the old fpstate which has to + * be freed by the caller. + */ +static struct fpstate *fpu_install_fpstate(struct fpu *fpu, + struct fpstate *newfps) +{ + struct fpstate *oldfps = fpu->fpstate; + + if (fpu->fpstate == newfps) + return NULL; + + fpu->fpstate = newfps; + return oldfps != &fpu->__fpstate ? oldfps : NULL; +} + +/** + * fpstate_realloc - Reallocate struct fpstate for the requested new features + * + * @xfeatures: A bitmap of xstate features which extend the enabled features + * of that task + * @ksize: The required size for the kernel buffer + * @usize: The required size for user space buffers + * + * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer + * terminates quickly, vfree()-induced IPIs may be a concern, but tasks + * with large states are likely to live longer. + * + * Returns: 0 on success, -ENOMEM on allocation error. + */ +static int fpstate_realloc(u64 xfeatures, unsigned int ksize, + unsigned int usize) +{ + struct fpu *fpu = ¤t->thread.fpu; + struct fpstate *curfps, *newfps = NULL; + unsigned int fpsize; + + curfps = fpu->fpstate; + fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64); + + newfps = vzalloc(fpsize); + if (!newfps) + return -ENOMEM; + newfps->size = ksize; + newfps->user_size = usize; + newfps->is_valloc = true; + + fpregs_lock(); + /* + * Ensure that the current state is in the registers before + * swapping fpstate as that might invalidate it due to layout + * changes. + */ + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + fpregs_restore_userregs(); + + newfps->xfeatures = curfps->xfeatures | xfeatures; + newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; + newfps->xfd = curfps->xfd & ~xfeatures; + + curfps = fpu_install_fpstate(fpu, newfps); + + /* Do the final updates within the locked region */ + xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures); + xfd_update_state(newfps); + + fpregs_unlock(); + + vfree(curfps); + return 0; +} + static int validate_sigaltstack(unsigned int usize) { struct task_struct *thread, *leader = current->group_leader; @@ -1393,7 +1480,8 @@ static int __xstate_request_perm(u64 permitted, u64 requested) /* * This deliberately does not exclude !XSAVES as we still might * decide to optionally context switch XCR0 or talk the silicon - * vendors into extending XFD for the pre AMX states. + * vendors into extending XFD for the pre AMX states, especially + * AVX512. */ bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); struct fpu *fpu = ¤t->group_leader->thread.fpu; @@ -1465,13 +1553,6 @@ static int xstate_request_perm(unsigned long idx) return ret; } -/* Place holder for now */ -static int fpstate_realloc(u64 xfeatures, unsigned int ksize, - unsigned int usize) -{ - return -ENOMEM; -} - int xfd_enable_feature(u64 xfd_err) { u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 99025e32f105..f3f251787b99 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -90,9 +91,18 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) #endif /* Drop the copied pointer to current's fpstate */ dst->thread.fpu.fpstate = NULL; + return 0; } +#ifdef CONFIG_X86_64 +void arch_release_task_struct(struct task_struct *tsk) +{ + if (fpu_state_size_dynamic()) + fpstate_free(&tsk->thread.fpu); +} +#endif + /* * Free thread data structures etc.. */ From 70c3f1671b0cbc386b387f1de33b7837e276a195 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:23 -0700 Subject: [PATCH 102/110] x86/fpu/xstate: Prepare XSAVE feature table for gaps in state component numbers The kernel checks at boot time which features are available by walking a XSAVE feature table which contains the CPUID feature bit numbers which need to be checked whether a feature is available on a CPU or not. So far the feature numbers have been linear, but AMX will create a gap which the current code cannot handle. Make the table entries explicitly indexed and adjust the loop code accordingly to prepare for that. No functional change. Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Reviewed-by: Len Brown Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211021225527.10184-20-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/xstate.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index db0bfc2db8bf..e3d1898a3823 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -53,18 +53,18 @@ static const char *xfeature_names[] = "unknown xstate feature" , }; -static short xsave_cpuid_features[] __initdata = { - X86_FEATURE_FPU, - X86_FEATURE_XMM, - X86_FEATURE_AVX, - X86_FEATURE_MPX, - X86_FEATURE_MPX, - X86_FEATURE_AVX512F, - X86_FEATURE_AVX512F, - X86_FEATURE_AVX512F, - X86_FEATURE_INTEL_PT, - X86_FEATURE_PKU, - X86_FEATURE_ENQCMD, +static unsigned short xsave_cpuid_features[] __initdata = { + [XFEATURE_FP] = X86_FEATURE_FPU, + [XFEATURE_SSE] = X86_FEATURE_XMM, + [XFEATURE_YMM] = X86_FEATURE_AVX, + [XFEATURE_BNDREGS] = X86_FEATURE_MPX, + [XFEATURE_BNDCSR] = X86_FEATURE_MPX, + [XFEATURE_OPMASK] = X86_FEATURE_AVX512F, + [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F, + [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F, + [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, + [XFEATURE_PKRU] = X86_FEATURE_PKU, + [XFEATURE_PASID] = X86_FEATURE_ENQCMD, }; static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = @@ -809,7 +809,10 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) * Clear XSAVE features that are disabled in the normal CPUID. */ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { - if (!boot_cpu_has(xsave_cpuid_features[i])) + unsigned short cid = xsave_cpuid_features[i]; + + /* Careful: X86_FEATURE_FPU is 0! */ + if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid)) fpu_kernel_cfg.max_features &= ~BIT_ULL(i); } From eec2113eabd92b7bfbaf1033fa82dc8eb4951203 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:24 -0700 Subject: [PATCH 103/110] x86/fpu/amx: Define AMX state components and have it used for boot-time checks The XSTATE initialization uses check_xstate_against_struct() to sanity check the size of XSTATE-enabled features. AMX is a XSAVE-enabled feature, and its size is not hard-coded but discoverable at run-time via CPUID. The AMX state is composed of state components 17 and 18, which are all user state components. The first component is the XTILECFG state of a 64-byte tile-related control register. The state component 18, called XTILEDATA, contains the actual tile data, and the state size varies on implementations. The architectural maximum, as defined in the CPUID(0x1d, 1): EAX[15:0], is a byte less than 64KB. The first implementation supports 8KB. Check the XTILEDATA state size dynamically. The feature introduces the new tile register, TMM. Define one register struct only and read the number of registers from CPUID. Cross-check the overall size with CPUID again. Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211021225527.10184-21-chang.seok.bae@intel.com --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/fpu/types.h | 32 ++++++++++++ arch/x86/include/asm/fpu/xstate.h | 2 + arch/x86/kernel/fpu/xstate.c | 80 +++++++++++++++++++++++++++++- 4 files changed, 114 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index ab7b3a2de85d..d5b5f2ab87a0 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -299,6 +299,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ +#define X86_FEATURE_AMX_TILE (18*32+24) /* AMX tile Support */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index b1897638d68d..3c06c82ab355 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -120,6 +120,9 @@ enum xfeature { XFEATURE_RSRVD_COMP_13, XFEATURE_RSRVD_COMP_14, XFEATURE_LBR, + XFEATURE_RSRVD_COMP_16, + XFEATURE_XTILE_CFG, + XFEATURE_XTILE_DATA, XFEATURE_MAX, }; @@ -136,12 +139,21 @@ enum xfeature { #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) #define XFEATURE_MASK_PASID (1 << XFEATURE_PASID) #define XFEATURE_MASK_LBR (1 << XFEATURE_LBR) +#define XFEATURE_MASK_XTILE_CFG (1 << XFEATURE_XTILE_CFG) +#define XFEATURE_MASK_XTILE_DATA (1 << XFEATURE_XTILE_DATA) #define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) #define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \ | XFEATURE_MASK_ZMM_Hi256 \ | XFEATURE_MASK_Hi16_ZMM) +#ifdef CONFIG_X86_64 +# define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILE_DATA \ + | XFEATURE_MASK_XTILE_CFG) +#else +# define XFEATURE_MASK_XTILE (0) +#endif + #define FIRST_EXTENDED_XFEATURE XFEATURE_YMM struct reg_128_bit { @@ -153,6 +165,9 @@ struct reg_256_bit { struct reg_512_bit { u8 regbytes[512/8]; }; +struct reg_1024_byte { + u8 regbytes[1024]; +}; /* * State component 2: @@ -255,6 +270,23 @@ struct arch_lbr_state { u64 ler_to; u64 ler_info; struct lbr_entry entries[]; +}; + +/* + * State component 17: 64-byte tile configuration register. + */ +struct xtile_cfg { + u64 tcfg[8]; +} __packed; + +/* + * State component 18: 1KB tile data register. + * Each register represents 16 64-byte rows of the matrix + * data. But the number of registers depends on the actual + * implementation. + */ +struct xtile_data { + struct reg_1024_byte tmm; } __packed; /* diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index b7b145cad019..10adf1376f0c 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -14,6 +14,8 @@ #define XSTATE_CPUID 0x0000000d +#define TILE_CPUID 0x0000001d + #define FXSAVE_SIZE 512 #define XSAVE_HDR_SIZE 64 diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index e3d1898a3823..3372da871a40 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -51,6 +51,14 @@ static const char *xfeature_names[] = "Protection Keys User registers", "PASID state", "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "AMX Tile config" , + "AMX Tile data" , + "unknown xstate feature" , }; static unsigned short xsave_cpuid_features[] __initdata = { @@ -65,6 +73,8 @@ static unsigned short xsave_cpuid_features[] __initdata = { [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, [XFEATURE_PKRU] = X86_FEATURE_PKU, [XFEATURE_PASID] = X86_FEATURE_ENQCMD, + [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, + [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, }; static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = @@ -240,6 +250,8 @@ static void __init print_xstate_features(void) print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); print_xstate_feature(XFEATURE_MASK_PKRU); print_xstate_feature(XFEATURE_MASK_PASID); + print_xstate_feature(XFEATURE_MASK_XTILE_CFG); + print_xstate_feature(XFEATURE_MASK_XTILE_DATA); } /* @@ -523,6 +535,67 @@ static void __init __xstate_dump_leaves(void) } \ } while (0) +/** + * check_xtile_data_against_struct - Check tile data state size. + * + * Calculate the state size by multiplying the single tile size which is + * recorded in a C struct, and the number of tiles that the CPU informs. + * Compare the provided size with the calculation. + * + * @size: The tile data state size + * + * Returns: 0 on success, -EINVAL on mismatch. + */ +static int __init check_xtile_data_against_struct(int size) +{ + u32 max_palid, palid, state_size; + u32 eax, ebx, ecx, edx; + u16 max_tile; + + /* + * Check the maximum palette id: + * eax: the highest numbered palette subleaf. + */ + cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx); + + /* + * Cross-check each tile size and find the maximum number of + * supported tiles. + */ + for (palid = 1, max_tile = 0; palid <= max_palid; palid++) { + u16 tile_size, max; + + /* + * Check the tile size info: + * eax[31:16]: bytes per title + * ebx[31:16]: the max names (or max number of tiles) + */ + cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx); + tile_size = eax >> 16; + max = ebx >> 16; + + if (tile_size != sizeof(struct xtile_data)) { + pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n", + __stringify(XFEATURE_XTILE_DATA), + sizeof(struct xtile_data), tile_size); + __xstate_dump_leaves(); + return -EINVAL; + } + + if (max > max_tile) + max_tile = max; + } + + state_size = sizeof(struct xtile_data) * max_tile; + if (size != state_size) { + pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n", + __stringify(XFEATURE_XTILE_DATA), state_size, size); + __xstate_dump_leaves(); + return -EINVAL; + } + return 0; +} + /* * We have a C struct for each 'xstate'. We need to ensure * that our software representation matches what the CPU @@ -546,6 +619,11 @@ static bool __init check_xstate_against_struct(int nr) XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state); XCHECK_SZ(sz, nr, XFEATURE_PASID, struct ia32_pasid_state); + XCHECK_SZ(sz, nr, XFEATURE_XTILE_CFG, struct xtile_cfg); + + /* The tile data size varies between implementations. */ + if (nr == XFEATURE_XTILE_DATA) + check_xtile_data_against_struct(sz); /* * Make *SURE* to add any feature numbers in below if @@ -555,7 +633,7 @@ static bool __init check_xstate_against_struct(int nr) if ((nr < XFEATURE_YMM) || (nr >= XFEATURE_MAX) || (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || - ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_LBR))) { + ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) { WARN_ONCE(1, "no structure for xstate: %d\n", nr); XSTATE_WARN_ON(1); return false; From 2ae996e0c1a38ca57a52438ab9deec6761dcba62 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:25 -0700 Subject: [PATCH 104/110] x86/fpu: Calculate the default sizes independently When dynamically enabled states are supported the maximum and default sizes for the kernel buffers and user space interfaces are not longer identical. Put the necessary calculations in place which only take the default enabled features into account. Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211021225527.10184-22-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/xstate.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 3372da871a40..b0f6e9a0cf2e 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -781,35 +781,40 @@ static bool __init is_supported_xstate_size(unsigned int test_xstate_size) static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ - unsigned int user_size, kernel_size; + unsigned int user_size, kernel_size, kernel_default_size; + bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); /* Uncompacted user space size */ user_size = get_xsave_size_user(); /* * XSAVES kernel size includes supervisor states and - * uses compacted format. + * uses compacted format when available. * * XSAVE does not support supervisor states so * kernel and user size is identical. */ - if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + if (compacted) kernel_size = get_xsaves_size_no_independent(); else kernel_size = user_size; - /* Ensure we have the space to store all enabled features. */ - if (!is_supported_xstate_size(kernel_size)) + kernel_default_size = + xstate_calculate_size(fpu_kernel_cfg.default_features, compacted); + + /* Ensure we have the space to store all default enabled features. */ + if (!is_supported_xstate_size(kernel_default_size)) return -EINVAL; if (!paranoid_xstate_size_valid(kernel_size)) return -EINVAL; - /* Keep it the same for now */ fpu_kernel_cfg.max_size = kernel_size; - fpu_kernel_cfg.default_size = kernel_size; fpu_user_cfg.max_size = user_size; - fpu_user_cfg.default_size = user_size; + + fpu_kernel_cfg.default_size = kernel_default_size; + fpu_user_cfg.default_size = + xstate_calculate_size(fpu_user_cfg.default_features, false); return 0; } @@ -894,15 +899,21 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) fpu_kernel_cfg.max_features &= ~BIT_ULL(i); } + if (!cpu_feature_enabled(X86_FEATURE_XFD)) + fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC; + fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED; fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; - /* Identical for now */ + /* Clean out dynamic features from default */ fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features; + fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; + fpu_user_cfg.default_features = fpu_user_cfg.max_features; + fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; /* Store it for paranoia check at the end */ xfeatures = fpu_kernel_cfg.max_features; @@ -913,6 +924,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) if (err) goto out_disable; + /* Reset the state for the current task */ fpstate_reset(¤t->thread.fpu); /* From db3e7321b4b84b1cb39598ff79b90d1252481378 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:26 -0700 Subject: [PATCH 105/110] x86/fpu: Add XFD handling for dynamic states To handle the dynamic sizing of buffers on first use the XFD MSR has to be armed. Store the delta between the maximum available and the default feature bits in init_fpstate where it can be retrieved for task creation. If the delta is non zero then dynamic features are enabled. This needs also to enable the static key which guards the XFD updates. This is delayed to an initcall because the FPU setup runs before jump labels are initialized. Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211021225527.10184-23-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/xstate.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b0f6e9a0cf2e..987a07bc668b 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -835,6 +835,12 @@ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) fpu_user_cfg.max_size = legacy_size; fpu_user_cfg.default_size = legacy_size; + /* + * Prevent enabling the static branch which enables writes to the + * XFD MSR. + */ + init_fpstate.xfd = 0; + fpstate_reset(¤t->thread.fpu); } @@ -918,6 +924,14 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) /* Store it for paranoia check at the end */ xfeatures = fpu_kernel_cfg.max_features; + /* + * Initialize the default XFD state in initfp_state and enable the + * dynamic sizing mechanism if dynamic states are available. The + * static key cannot be enabled here because this runs before + * jump_label_init(). This is delayed to an initcall. + */ + init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC; + /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); err = init_xstate_size(); @@ -1466,9 +1480,21 @@ void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) } #endif /* CONFIG_X86_DEBUG_FPU */ +static int __init xfd_update_static_branch(void) +{ + /* + * If init_fpstate.xfd has bits set then dynamic features are + * available and the dynamic sizing must be enabled. + */ + if (init_fpstate.xfd) + static_branch_enable(&__fpu_state_size_dynamic); + return 0; +} +arch_initcall(xfd_update_static_branch) + void fpstate_free(struct fpu *fpu) { - if (fpu->fpstate || fpu->fpstate != &fpu->__fpstate) + if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate) vfree(fpu->fpstate); } From 2308ee57d93d896618dd65c996429c9d3e469fe0 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 21 Oct 2021 15:55:27 -0700 Subject: [PATCH 106/110] x86/fpu/amx: Enable the AMX feature in 64-bit mode Add the AMX state components in XFEATURE_MASK_USER_SUPPORTED and the TILE_DATA component to the dynamic states and update the permission check table accordingly. This is only effective on 64 bit kernels as for 32bit kernels XFEATURE_MASK_TILE is defined as 0. TILE_DATA is caller-saved state and the only dynamic state. Add build time sanity check to ensure the assumption that every dynamic feature is caller- saved. Make AMX state depend on XFD as it is dynamic feature. Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20211021225527.10184-24-chang.seok.bae@intel.com --- arch/x86/include/asm/fpu/xstate.h | 5 +++-- arch/x86/kernel/cpu/cpuid-deps.c | 1 + arch/x86/kernel/fpu/core.c | 6 ++++++ arch/x86/kernel/fpu/xstate.c | 5 +++-- 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 10adf1376f0c..0f8b90ab18c9 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -35,7 +35,8 @@ XFEATURE_MASK_Hi16_ZMM | \ XFEATURE_MASK_PKRU | \ XFEATURE_MASK_BNDREGS | \ - XFEATURE_MASK_BNDCSR) + XFEATURE_MASK_BNDCSR | \ + XFEATURE_MASK_XTILE) /* * Features which are restored when returning to user space. @@ -46,7 +47,7 @@ (XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_PKRU) /* Features which are dynamically enabled for a process on request */ -#define XFEATURE_MASK_USER_DYNAMIC 0ULL +#define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA /* All currently supported supervisor features */ #define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID) diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index d9ead9c20408..cb2fdd130aae 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -76,6 +76,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_SGX1, X86_FEATURE_SGX }, { X86_FEATURE_SGX2, X86_FEATURE_SGX1 }, { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, + { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, {} }; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 12ca174891dc..290836d1f2a7 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -494,6 +494,12 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags) return 0; } + /* + * If a new feature is added, ensure all dynamic features are + * caller-saved from here! + */ + BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA); + /* * Save the default portion of the current FPU state into the * clone. Assume all dynamic features to be defined as caller- diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 987a07bc668b..d28829403ed0 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -404,7 +404,8 @@ static __init void os_xrstor_booting(struct xregs_state *xstate) XFEATURE_MASK_PKRU | \ XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR | \ - XFEATURE_MASK_PASID) + XFEATURE_MASK_PASID | \ + XFEATURE_MASK_XTILE) /* * setup the xstate image representing the init state @@ -1636,7 +1637,7 @@ static int __xstate_request_perm(u64 permitted, u64 requested) * Permissions array to map facilities with more than one component */ static const u64 xstate_prctl_req[XFEATURE_MAX] = { - /* [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE, */ + [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA, }; static int xstate_request_perm(unsigned long idx) From 6a3e0651b4a00daa314c59d6e4228dfa7a986983 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 26 Oct 2021 05:25:24 -0700 Subject: [PATCH 107/110] selftests/x86/amx: Add test cases for AMX state management AMX TILEDATA is a very large XSAVE feature. It could have caused nasty XSAVE buffer space waste in two places: * Signal stacks * Kernel task_struct->fpu buffers To avoid this waste, neither of these buffers have AMX state by default. The non-default features are called "dynamic" features. There is an arch_prctl(ARCH_REQ_XCOMP_PERM) which allows a task to declare that it wants to use AMX or other "dynamic" XSAVE features. This arch_prctl() ensures that sufficient sigaltstack space is available before it will succeed. It also expands the task_struct buffer. Functions of this test: * Test arch_prctl(ARCH_REQ_XCOMP_PERM). Ensure that it checks for proper sigaltstack sizing and that the sizing is enforced for future sigaltstack calls. * Ensure that ARCH_REQ_XCOMP_PERM is inherited across fork() * Ensure that TILEDATA use before the prctl() is fatal * Ensure that TILEDATA is cleared across fork() Note: Generally, compiler support is needed to do something with AMX. Instead, directly load AMX state from userspace with a plain XSAVE. Do not depend on the compiler. [ dhansen: bunches of cleanups ] Signed-off-by: Chang S. Bae Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211026122524.7BEDAA95@davehans-spike.ostc.intel.com --- tools/testing/selftests/x86/Makefile | 2 +- tools/testing/selftests/x86/amx.c | 697 +++++++++++++++++++++++++++ 2 files changed, 698 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/x86/amx.c diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index b4142cd1c5c2..8a1f62ab3c8e 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -18,7 +18,7 @@ TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering \ - corrupt_xstate_header + corrupt_xstate_header amx # Some selftests require 32bit support enabled also on 64bit systems TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall diff --git a/tools/testing/selftests/x86/amx.c b/tools/testing/selftests/x86/amx.c new file mode 100644 index 000000000000..ce012ad15fa5 --- /dev/null +++ b/tools/testing/selftests/x86/amx.c @@ -0,0 +1,697 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +#ifndef __x86_64__ +# error This test is 64-bit only +#endif + +#define XSAVE_HDR_OFFSET 512 +#define XSAVE_HDR_SIZE 64 + +struct xsave_buffer { + union { + struct { + char legacy[XSAVE_HDR_OFFSET]; + char header[XSAVE_HDR_SIZE]; + char extended[0]; + }; + char bytes[0]; + }; +}; + +static inline uint64_t xgetbv(uint32_t index) +{ + uint32_t eax, edx; + + asm volatile("xgetbv;" + : "=a" (eax), "=d" (edx) + : "c" (index)); + return eax + ((uint64_t)edx << 32); +} + +static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) +{ + asm volatile("cpuid;" + : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) + : "0" (*eax), "2" (*ecx)); +} + +static inline void xsave(struct xsave_buffer *xbuf, uint64_t rfbm) +{ + uint32_t rfbm_lo = rfbm; + uint32_t rfbm_hi = rfbm >> 32; + + asm volatile("xsave (%%rdi)" + : : "D" (xbuf), "a" (rfbm_lo), "d" (rfbm_hi) + : "memory"); +} + +static inline void xrstor(struct xsave_buffer *xbuf, uint64_t rfbm) +{ + uint32_t rfbm_lo = rfbm; + uint32_t rfbm_hi = rfbm >> 32; + + asm volatile("xrstor (%%rdi)" + : : "D" (xbuf), "a" (rfbm_lo), "d" (rfbm_hi)); +} + +/* err() exits and will not return */ +#define fatal_error(msg, ...) err(1, "[FAIL]\t" msg, ##__VA_ARGS__) + +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) +{ + struct sigaction sa; + + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + fatal_error("sigaction"); +} + +static void clearhandler(int sig) +{ + struct sigaction sa; + + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = SIG_DFL; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + fatal_error("sigaction"); +} + +#define XFEATURE_XTILECFG 17 +#define XFEATURE_XTILEDATA 18 +#define XFEATURE_MASK_XTILECFG (1 << XFEATURE_XTILECFG) +#define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA) +#define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA) + +#define CPUID_LEAF1_ECX_XSAVE_MASK (1 << 26) +#define CPUID_LEAF1_ECX_OSXSAVE_MASK (1 << 27) +static inline void check_cpuid_xsave(void) +{ + uint32_t eax, ebx, ecx, edx; + + /* + * CPUID.1:ECX.XSAVE[bit 26] enumerates general + * support for the XSAVE feature set, including + * XGETBV. + */ + eax = 1; + ecx = 0; + cpuid(&eax, &ebx, &ecx, &edx); + if (!(ecx & CPUID_LEAF1_ECX_XSAVE_MASK)) + fatal_error("cpuid: no CPU xsave support"); + if (!(ecx & CPUID_LEAF1_ECX_OSXSAVE_MASK)) + fatal_error("cpuid: no OS xsave support"); +} + +static uint32_t xbuf_size; + +static struct { + uint32_t xbuf_offset; + uint32_t size; +} xtiledata; + +#define CPUID_LEAF_XSTATE 0xd +#define CPUID_SUBLEAF_XSTATE_USER 0x0 +#define TILE_CPUID 0x1d +#define TILE_PALETTE_ID 0x1 + +static void check_cpuid_xtiledata(void) +{ + uint32_t eax, ebx, ecx, edx; + + eax = CPUID_LEAF_XSTATE; + ecx = CPUID_SUBLEAF_XSTATE_USER; + cpuid(&eax, &ebx, &ecx, &edx); + + /* + * EBX enumerates the size (in bytes) required by the XSAVE + * instruction for an XSAVE area containing all the user state + * components corresponding to bits currently set in XCR0. + * + * Stash that off so it can be used to allocate buffers later. + */ + xbuf_size = ebx; + + eax = CPUID_LEAF_XSTATE; + ecx = XFEATURE_XTILEDATA; + + cpuid(&eax, &ebx, &ecx, &edx); + /* + * eax: XTILEDATA state component size + * ebx: XTILEDATA state component offset in user buffer + */ + if (!eax || !ebx) + fatal_error("xstate cpuid: invalid tile data size/offset: %d/%d", + eax, ebx); + + xtiledata.size = eax; + xtiledata.xbuf_offset = ebx; +} + +/* The helpers for managing XSAVE buffer and tile states: */ + +struct xsave_buffer *alloc_xbuf(void) +{ + struct xsave_buffer *xbuf; + + /* XSAVE buffer should be 64B-aligned. */ + xbuf = aligned_alloc(64, xbuf_size); + if (!xbuf) + fatal_error("aligned_alloc()"); + return xbuf; +} + +static inline void clear_xstate_header(struct xsave_buffer *buffer) +{ + memset(&buffer->header, 0, sizeof(buffer->header)); +} + +static inline uint64_t get_xstatebv(struct xsave_buffer *buffer) +{ + /* XSTATE_BV is at the beginning of the header: */ + return *(uint64_t *)&buffer->header; +} + +static inline void set_xstatebv(struct xsave_buffer *buffer, uint64_t bv) +{ + /* XSTATE_BV is at the beginning of the header: */ + *(uint64_t *)(&buffer->header) = bv; +} + +static void set_rand_tiledata(struct xsave_buffer *xbuf) +{ + int *ptr = (int *)&xbuf->bytes[xtiledata.xbuf_offset]; + int data; + int i; + + /* + * Ensure that 'data' is never 0. This ensures that + * the registers are never in their initial configuration + * and thus never tracked as being in the init state. + */ + data = rand() | 1; + + for (i = 0; i < xtiledata.size / sizeof(int); i++, ptr++) + *ptr = data; +} + +struct xsave_buffer *stashed_xsave; + +static void init_stashed_xsave(void) +{ + stashed_xsave = alloc_xbuf(); + if (!stashed_xsave) + fatal_error("failed to allocate stashed_xsave\n"); + clear_xstate_header(stashed_xsave); +} + +static void free_stashed_xsave(void) +{ + free(stashed_xsave); +} + +/* See 'struct _fpx_sw_bytes' at sigcontext.h */ +#define SW_BYTES_OFFSET 464 +/* N.B. The struct's field name varies so read from the offset. */ +#define SW_BYTES_BV_OFFSET (SW_BYTES_OFFSET + 8) + +static inline struct _fpx_sw_bytes *get_fpx_sw_bytes(void *buffer) +{ + return (struct _fpx_sw_bytes *)(buffer + SW_BYTES_OFFSET); +} + +static inline uint64_t get_fpx_sw_bytes_features(void *buffer) +{ + return *(uint64_t *)(buffer + SW_BYTES_BV_OFFSET); +} + +/* Work around printf() being unsafe in signals: */ +#define SIGNAL_BUF_LEN 1000 +char signal_message_buffer[SIGNAL_BUF_LEN]; +void sig_print(char *msg) +{ + int left = SIGNAL_BUF_LEN - strlen(signal_message_buffer) - 1; + + strncat(signal_message_buffer, msg, left); +} + +static volatile bool noperm_signaled; +static int noperm_errs; + +/* + * Signal handler for when AMX is used but + * permission has not been obtained. + */ +static void handle_noperm(int sig, siginfo_t *si, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t *)ctx_void; + void *xbuf = ctx->uc_mcontext.fpregs; + struct _fpx_sw_bytes *sw_bytes; + uint64_t features; + + /* Reset the signal message buffer: */ + signal_message_buffer[0] = '\0'; + sig_print("\tAt SIGILL handler,\n"); + + if (si->si_code != ILL_ILLOPC) { + noperm_errs++; + sig_print("[FAIL]\tInvalid signal code.\n"); + } else { + sig_print("[OK]\tValid signal code (ILL_ILLOPC).\n"); + } + + sw_bytes = get_fpx_sw_bytes(xbuf); + /* + * Without permission, the signal XSAVE buffer should not + * have room for AMX register state (aka. xtiledata). + * Check that the size does not overlap with where xtiledata + * will reside. + * + * This also implies that no state components *PAST* + * XTILEDATA (features >=19) can be present in the buffer. + */ + if (sw_bytes->xstate_size <= xtiledata.xbuf_offset) { + sig_print("[OK]\tValid xstate size\n"); + } else { + noperm_errs++; + sig_print("[FAIL]\tInvalid xstate size\n"); + } + + features = get_fpx_sw_bytes_features(xbuf); + /* + * Without permission, the XTILEDATA feature + * bit should not be set. + */ + if ((features & XFEATURE_MASK_XTILEDATA) == 0) { + sig_print("[OK]\tValid xstate mask\n"); + } else { + noperm_errs++; + sig_print("[FAIL]\tInvalid xstate mask\n"); + } + + noperm_signaled = true; + ctx->uc_mcontext.gregs[REG_RIP] += 3; /* Skip the faulting XRSTOR */ +} + +/* Return true if XRSTOR is successful; otherwise, false. */ +static inline bool xrstor_safe(struct xsave_buffer *xbuf, uint64_t mask) +{ + noperm_signaled = false; + xrstor(xbuf, mask); + + /* Print any messages produced by the signal code: */ + printf("%s", signal_message_buffer); + /* + * Reset the buffer to make sure any future printing + * only outputs new messages: + */ + signal_message_buffer[0] = '\0'; + + if (noperm_errs) + fatal_error("saw %d errors in noperm signal handler\n", noperm_errs); + + return !noperm_signaled; +} + +/* + * Use XRSTOR to populate the XTILEDATA registers with + * random data. + * + * Return true if successful; otherwise, false. + */ +static inline bool load_rand_tiledata(struct xsave_buffer *xbuf) +{ + clear_xstate_header(xbuf); + set_xstatebv(xbuf, XFEATURE_MASK_XTILEDATA); + set_rand_tiledata(xbuf); + return xrstor_safe(xbuf, XFEATURE_MASK_XTILEDATA); +} + +/* Return XTILEDATA to its initial configuration. */ +static inline void init_xtiledata(void) +{ + clear_xstate_header(stashed_xsave); + xrstor_safe(stashed_xsave, XFEATURE_MASK_XTILEDATA); +} + +enum expected_result { FAIL_EXPECTED, SUCCESS_EXPECTED }; + +/* arch_prctl() and sigaltstack() test */ + +#define ARCH_GET_XCOMP_PERM 0x1022 +#define ARCH_REQ_XCOMP_PERM 0x1023 + +static void req_xtiledata_perm(void) +{ + syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA); +} + +static void validate_req_xcomp_perm(enum expected_result exp) +{ + unsigned long bitmask; + long rc; + + rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA); + if (exp == FAIL_EXPECTED) { + if (rc) { + printf("[OK]\tARCH_REQ_XCOMP_PERM saw expected failure..\n"); + return; + } + + fatal_error("ARCH_REQ_XCOMP_PERM saw unexpected success.\n"); + } else if (rc) { + fatal_error("ARCH_REQ_XCOMP_PERM saw unexpected failure.\n"); + } + + rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &bitmask); + if (rc) { + fatal_error("prctl(ARCH_GET_XCOMP_PERM) error: %ld", rc); + } else if (bitmask & XFEATURE_MASK_XTILE) { + printf("\tARCH_REQ_XCOMP_PERM is successful.\n"); + } +} + +static void validate_xcomp_perm(enum expected_result exp) +{ + bool load_success = load_rand_tiledata(stashed_xsave); + + if (exp == FAIL_EXPECTED) { + if (load_success) { + noperm_errs++; + printf("[FAIL]\tLoad tiledata succeeded.\n"); + } else { + printf("[OK]\tLoad tiledata failed.\n"); + } + } else if (exp == SUCCESS_EXPECTED) { + if (load_success) { + printf("[OK]\tLoad tiledata succeeded.\n"); + } else { + noperm_errs++; + printf("[FAIL]\tLoad tiledata failed.\n"); + } + } +} + +#ifndef AT_MINSIGSTKSZ +# define AT_MINSIGSTKSZ 51 +#endif + +static void *alloc_altstack(unsigned int size) +{ + void *altstack; + + altstack = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); + + if (altstack == MAP_FAILED) + fatal_error("mmap() for altstack"); + + return altstack; +} + +static void setup_altstack(void *addr, unsigned long size, enum expected_result exp) +{ + stack_t ss; + int rc; + + memset(&ss, 0, sizeof(ss)); + ss.ss_size = size; + ss.ss_sp = addr; + + rc = sigaltstack(&ss, NULL); + + if (exp == FAIL_EXPECTED) { + if (rc) { + printf("[OK]\tsigaltstack() failed.\n"); + } else { + fatal_error("sigaltstack() succeeded unexpectedly.\n"); + } + } else if (rc) { + fatal_error("sigaltstack()"); + } +} + +static void test_dynamic_sigaltstack(void) +{ + unsigned int small_size, enough_size; + unsigned long minsigstksz; + void *altstack; + + minsigstksz = getauxval(AT_MINSIGSTKSZ); + printf("\tAT_MINSIGSTKSZ = %lu\n", minsigstksz); + /* + * getauxval() itself can return 0 for failure or + * success. But, in this case, AT_MINSIGSTKSZ + * will always return a >=0 value if implemented. + * Just check for 0. + */ + if (minsigstksz == 0) { + printf("no support for AT_MINSIGSTKSZ, skipping sigaltstack tests\n"); + return; + } + + enough_size = minsigstksz * 2; + + altstack = alloc_altstack(enough_size); + printf("\tAllocate memory for altstack (%u bytes).\n", enough_size); + + /* + * Try setup_altstack() with a size which can not fit + * XTILEDATA. ARCH_REQ_XCOMP_PERM should fail. + */ + small_size = minsigstksz - xtiledata.size; + printf("\tAfter sigaltstack() with small size (%u bytes).\n", small_size); + setup_altstack(altstack, small_size, SUCCESS_EXPECTED); + validate_req_xcomp_perm(FAIL_EXPECTED); + + /* + * Try setup_altstack() with a size derived from + * AT_MINSIGSTKSZ. It should be more than large enough + * and thus ARCH_REQ_XCOMP_PERM should succeed. + */ + printf("\tAfter sigaltstack() with enough size (%u bytes).\n", enough_size); + setup_altstack(altstack, enough_size, SUCCESS_EXPECTED); + validate_req_xcomp_perm(SUCCESS_EXPECTED); + + /* + * Try to coerce setup_altstack() to again accept a + * too-small altstack. This ensures that big-enough + * sigaltstacks can not shrink to a too-small value + * once XTILEDATA permission is established. + */ + printf("\tThen, sigaltstack() with small size (%u bytes).\n", small_size); + setup_altstack(altstack, small_size, FAIL_EXPECTED); +} + +static void test_dynamic_state(void) +{ + pid_t parent, child, grandchild; + + parent = fork(); + if (parent < 0) { + /* fork() failed */ + fatal_error("fork"); + } else if (parent > 0) { + int status; + /* fork() succeeded. Now in the parent. */ + + wait(&status); + if (!WIFEXITED(status) || WEXITSTATUS(status)) + fatal_error("arch_prctl test parent exit"); + return; + } + /* fork() succeeded. Now in the child . */ + + printf("[RUN]\tCheck ARCH_REQ_XCOMP_PERM around process fork() and sigaltack() test.\n"); + + printf("\tFork a child.\n"); + child = fork(); + if (child < 0) { + fatal_error("fork"); + } else if (child > 0) { + int status; + + wait(&status); + if (!WIFEXITED(status) || WEXITSTATUS(status)) + fatal_error("arch_prctl test child exit"); + _exit(0); + } + + /* + * The permission request should fail without an + * XTILEDATA-compatible signal stack + */ + printf("\tTest XCOMP_PERM at child.\n"); + validate_xcomp_perm(FAIL_EXPECTED); + + /* + * Set up an XTILEDATA-compatible signal stack and + * also obtain permission to populate XTILEDATA. + */ + printf("\tTest dynamic sigaltstack at child:\n"); + test_dynamic_sigaltstack(); + + /* Ensure that XTILEDATA can be populated. */ + printf("\tTest XCOMP_PERM again at child.\n"); + validate_xcomp_perm(SUCCESS_EXPECTED); + + printf("\tFork a grandchild.\n"); + grandchild = fork(); + if (grandchild < 0) { + /* fork() failed */ + fatal_error("fork"); + } else if (!grandchild) { + /* fork() succeeded. Now in the (grand)child. */ + printf("\tTest XCOMP_PERM at grandchild.\n"); + + /* + * Ensure that the grandchild inherited + * permission and a compatible sigaltstack: + */ + validate_xcomp_perm(SUCCESS_EXPECTED); + } else { + int status; + /* fork() succeeded. Now in the parent. */ + + wait(&status); + if (!WIFEXITED(status) || WEXITSTATUS(status)) + fatal_error("fork test grandchild"); + } + + _exit(0); +} + +/* + * Save current register state and compare it to @xbuf1.' + * + * Returns false if @xbuf1 matches the registers. + * Returns true if @xbuf1 differs from the registers. + */ +static inline bool __validate_tiledata_regs(struct xsave_buffer *xbuf1) +{ + struct xsave_buffer *xbuf2; + int ret; + + xbuf2 = alloc_xbuf(); + if (!xbuf2) + fatal_error("failed to allocate XSAVE buffer\n"); + + xsave(xbuf2, XFEATURE_MASK_XTILEDATA); + ret = memcmp(&xbuf1->bytes[xtiledata.xbuf_offset], + &xbuf2->bytes[xtiledata.xbuf_offset], + xtiledata.size); + + free(xbuf2); + + if (ret == 0) + return false; + return true; +} + +static inline void validate_tiledata_regs_same(struct xsave_buffer *xbuf) +{ + int ret = __validate_tiledata_regs(xbuf); + + if (ret != 0) + fatal_error("TILEDATA registers changed"); +} + +static inline void validate_tiledata_regs_changed(struct xsave_buffer *xbuf) +{ + int ret = __validate_tiledata_regs(xbuf); + + if (ret == 0) + fatal_error("TILEDATA registers did not change"); +} + +/* tiledata inheritance test */ + +static void test_fork(void) +{ + pid_t child, grandchild; + + child = fork(); + if (child < 0) { + /* fork() failed */ + fatal_error("fork"); + } else if (child > 0) { + /* fork() succeeded. Now in the parent. */ + int status; + + wait(&status); + if (!WIFEXITED(status) || WEXITSTATUS(status)) + fatal_error("fork test child"); + return; + } + /* fork() succeeded. Now in the child. */ + printf("[RUN]\tCheck tile data inheritance.\n\tBefore fork(), load tiledata\n"); + + load_rand_tiledata(stashed_xsave); + + grandchild = fork(); + if (grandchild < 0) { + /* fork() failed */ + fatal_error("fork"); + } else if (grandchild > 0) { + /* fork() succeeded. Still in the first child. */ + int status; + + wait(&status); + if (!WIFEXITED(status) || WEXITSTATUS(status)) + fatal_error("fork test grand child"); + _exit(0); + } + /* fork() succeeded. Now in the (grand)child. */ + + /* + * TILEDATA registers are not preserved across fork(). + * Ensure that their value has changed: + */ + validate_tiledata_regs_changed(stashed_xsave); + + _exit(0); +} + +int main(void) +{ + /* Check hardware availability at first */ + check_cpuid_xsave(); + check_cpuid_xtiledata(); + + init_stashed_xsave(); + sethandler(SIGILL, handle_noperm, 0); + + test_dynamic_state(); + + /* Request permission for the following tests */ + req_xtiledata_perm(); + + test_fork(); + + clearhandler(SIGILL); + free_stashed_xsave(); + + return 0; +} From 101c669d165d341b8c35424eb3878138044394ef Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 26 Oct 2021 05:25:25 -0700 Subject: [PATCH 108/110] selftests/x86/amx: Add context switch test XSAVE state is thread-local. The kernel switches between thread state at context switch time. Generally, running a selftest for a while will naturally expose it to some context switching and and will test the XSAVE code. Instead of just hoping that the tests get context-switched at random times, force context-switches on purpose. Spawn off a few userspace threads and force context-switches between them. Ensure that the kernel correctly context switches each thread's unique AMX state. [ dhansen: bunches of cleanups ] Signed-off-by: Chang S. Bae Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211026122525.6EFD5758@davehans-spike.ostc.intel.com --- tools/testing/selftests/x86/amx.c | 160 +++++++++++++++++++++++++++++- 1 file changed, 157 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/x86/amx.c b/tools/testing/selftests/x86/amx.c index ce012ad15fa5..3615ef4a48bb 100644 --- a/tools/testing/selftests/x86/amx.c +++ b/tools/testing/selftests/x86/amx.c @@ -3,6 +3,7 @@ #define _GNU_SOURCE #include #include +#include #include #include #include @@ -10,8 +11,6 @@ #include #include -#include - #include #include #include @@ -259,7 +258,6 @@ void sig_print(char *msg) static volatile bool noperm_signaled; static int noperm_errs; - /* * Signal handler for when AMX is used but * permission has not been obtained. @@ -674,6 +672,158 @@ static void test_fork(void) _exit(0); } +/* Context switching test */ + +static struct _ctxtswtest_cfg { + unsigned int iterations; + unsigned int num_threads; +} ctxtswtest_config; + +struct futex_info { + pthread_t thread; + int nr; + pthread_mutex_t mutex; + struct futex_info *next; +}; + +static void *check_tiledata(void *info) +{ + struct futex_info *finfo = (struct futex_info *)info; + struct xsave_buffer *xbuf; + int i; + + xbuf = alloc_xbuf(); + if (!xbuf) + fatal_error("unable to allocate XSAVE buffer"); + + /* + * Load random data into 'xbuf' and then restore + * it to the tile registers themselves. + */ + load_rand_tiledata(xbuf); + for (i = 0; i < ctxtswtest_config.iterations; i++) { + pthread_mutex_lock(&finfo->mutex); + + /* + * Ensure the register values have not + * diverged from those recorded in 'xbuf'. + */ + validate_tiledata_regs_same(xbuf); + + /* Load new, random values into xbuf and registers */ + load_rand_tiledata(xbuf); + + /* + * The last thread's last unlock will be for + * thread 0's mutex. However, thread 0 will + * have already exited the loop and the mutex + * will already be unlocked. + * + * Because this is not an ERRORCHECK mutex, + * that inconsistency will be silently ignored. + */ + pthread_mutex_unlock(&finfo->next->mutex); + } + + free(xbuf); + /* + * Return this thread's finfo, which is + * a unique value for this thread. + */ + return finfo; +} + +static int create_threads(int num, struct futex_info *finfo) +{ + int i; + + for (i = 0; i < num; i++) { + int next_nr; + + finfo[i].nr = i; + /* + * Thread 'i' will wait on this mutex to + * be unlocked. Lock it immediately after + * initialization: + */ + pthread_mutex_init(&finfo[i].mutex, NULL); + pthread_mutex_lock(&finfo[i].mutex); + + next_nr = (i + 1) % num; + finfo[i].next = &finfo[next_nr]; + + if (pthread_create(&finfo[i].thread, NULL, check_tiledata, &finfo[i])) + fatal_error("pthread_create()"); + } + return 0; +} + +static void affinitize_cpu0(void) +{ + cpu_set_t cpuset; + + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) + fatal_error("sched_setaffinity to CPU 0"); +} + +static void test_context_switch(void) +{ + struct futex_info *finfo; + int i; + + /* Affinitize to one CPU to force context switches */ + affinitize_cpu0(); + + req_xtiledata_perm(); + + printf("[RUN]\tCheck tiledata context switches, %d iterations, %d threads.\n", + ctxtswtest_config.iterations, + ctxtswtest_config.num_threads); + + + finfo = malloc(sizeof(*finfo) * ctxtswtest_config.num_threads); + if (!finfo) + fatal_error("malloc()"); + + create_threads(ctxtswtest_config.num_threads, finfo); + + /* + * This thread wakes up thread 0 + * Thread 0 will wake up 1 + * Thread 1 will wake up 2 + * ... + * the last thread will wake up 0 + * + * ... this will repeat for the configured + * number of iterations. + */ + pthread_mutex_unlock(&finfo[0].mutex); + + /* Wait for all the threads to finish: */ + for (i = 0; i < ctxtswtest_config.num_threads; i++) { + void *thread_retval; + int rc; + + rc = pthread_join(finfo[i].thread, &thread_retval); + + if (rc) + fatal_error("pthread_join() failed for thread %d err: %d\n", + i, rc); + + if (thread_retval != &finfo[i]) + fatal_error("unexpected thread retval for thread %d: %p\n", + i, thread_retval); + + } + + printf("[OK]\tNo incorrect case was found.\n"); + + free(finfo); +} + int main(void) { /* Check hardware availability at first */ @@ -690,6 +840,10 @@ int main(void) test_fork(); + ctxtswtest_config.iterations = 10; + ctxtswtest_config.num_threads = 5; + test_context_switch(); + clearhandler(SIGILL); free_stashed_xsave(); From 868c250bb4639531ff33b2d879fbef39c1d9ed39 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 25 Oct 2021 15:04:13 +1100 Subject: [PATCH 109/110] x86/fpu: Include vmalloc.h for vzalloc() Explicitly include that header to avoid build errors when vzalloc() becomes "invisible" to the compiler due to header reorganizations. This is not a problem in the tip tree but occurred when integrating linux-next. [ bp: Commit message. ] Link: https://lore.kernel.org/r/20211025151144.552c60ca@canb.auug.org.au Fixes: 69f6ed1d14c6 ("x86/fpu: Provide infrastructure for KVM FPU cleanup") Signed-off-by: Stephen Rothwell Signed-off-by: Borislav Petkov --- arch/x86/kernel/fpu/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 290836d1f2a7..8ea306b1bf8e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -16,6 +16,7 @@ #include #include +#include #include "context.h" #include "internal.h" From d7a9590f608dbedd917eb0857a074accdf0d3919 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 26 Oct 2021 02:11:57 -0700 Subject: [PATCH 110/110] Documentation/x86: Add documentation for using dynamic XSTATE features Explain how dynamic XSTATE features can be enabled via the architecture-specific prctl() along with dynamic sigframe size and first use trap handling. Fix: Documentation/x86/xstate.rst:15: WARNING: Title underline too short. as reported by Stephen Rothwell Originally-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211026091157.16711-1-chang.seok.bae@intel.com --- Documentation/x86/index.rst | 1 + Documentation/x86/xstate.rst | 65 ++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) create mode 100644 Documentation/x86/xstate.rst diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst index 383048396336..f498f1d36cd3 100644 --- a/Documentation/x86/index.rst +++ b/Documentation/x86/index.rst @@ -37,3 +37,4 @@ x86-specific Documentation sgx features elf_auxvec + xstate diff --git a/Documentation/x86/xstate.rst b/Documentation/x86/xstate.rst new file mode 100644 index 000000000000..65de3f054ba5 --- /dev/null +++ b/Documentation/x86/xstate.rst @@ -0,0 +1,65 @@ +Using XSTATE features in user space applications +================================================ + +The x86 architecture supports floating-point extensions which are +enumerated via CPUID. Applications consult CPUID and use XGETBV to +evaluate which features have been enabled by the kernel XCR0. + +Up to AVX-512 and PKRU states, these features are automatically enabled by +the kernel if available. Features like AMX TILE_DATA (XSTATE component 18) +are enabled by XCR0 as well, but the first use of related instruction is +trapped by the kernel because by default the required large XSTATE buffers +are not allocated automatically. + +Using dynamically enabled XSTATE features in user space applications +-------------------------------------------------------------------- + +The kernel provides an arch_prctl(2) based mechanism for applications to +request the usage of such features. The arch_prctl(2) options related to +this are: + +-ARCH_GET_XCOMP_SUPP + + arch_prctl(ARCH_GET_XCOMP_SUPP, &features); + + ARCH_GET_XCOMP_SUPP stores the supported features in userspace storage of + type uint64_t. The second argument is a pointer to that storage. + +-ARCH_GET_XCOMP_PERM + + arch_prctl(ARCH_GET_XCOMP_PERM, &features); + + ARCH_GET_XCOMP_PERM stores the features for which the userspace process + has permission in userspace storage of type uint64_t. The second argument + is a pointer to that storage. + +-ARCH_REQ_XCOMP_PERM + + arch_prctl(ARCH_REQ_XCOMP_PERM, feature_nr); + + ARCH_REQ_XCOMP_PERM allows to request permission for a dynamically enabled + feature or a feature set. A feature set can be mapped to a facility, e.g. + AMX, and can require one or more XSTATE components to be enabled. + + The feature argument is the number of the highest XSTATE component which + is required for a facility to work. + +When requesting permission for a feature, the kernel checks the +availability. The kernel ensures that sigaltstacks in the process's tasks +are large enough to accommodate the resulting large signal frame. It +enforces this both during ARCH_REQ_XCOMP_SUPP and during any subsequent +sigaltstack(2) calls. If an installed sigaltstack is smaller than the +resulting sigframe size, ARCH_REQ_XCOMP_SUPP results in -ENOSUPP. Also, +sigaltstack(2) results in -ENOMEM if the requested altstack is too small +for the permitted features. + +Permission, when granted, is valid per process. Permissions are inherited +on fork(2) and cleared on exec(3). + +The first use of an instruction related to a dynamically enabled feature is +trapped by the kernel. The trap handler checks whether the process has +permission to use the feature. If the process has no permission then the +kernel sends SIGILL to the application. If the process has permission then +the handler allocates a larger xstate buffer for the task so the large +state can be context switched. In the unlikely cases that the allocation +fails, the kernel sends SIGSEGV.