From fddf8ba1e48860211c9639d00883833b42fcc1e0 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 28 May 2020 16:13:47 -0400 Subject: [PATCH 01/21] x86/ptrace: Prevent ptrace from clearing the FS/GS selector When a ptracer writes a ptracee's FS/GSBASE with a different value, the selector is also cleared. This behavior is not correct as the selector should be preserved. Update only the base value and leave the selector intact. To simplify the code further remove the conditional checking for the same value as this code is not performance critical. The only recognizable downside of this change is when the selector is already nonzero on write. The base will be reloaded according to the selector. But the case is highly unexpected in real usages. [ tglx: Massage changelog ] Suggested-by: Andy Lutomirski Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Sasha Levin Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/9040CFCD-74BD-4C17-9A01-B9B713CF6B10@intel.com Link: https://lkml.kernel.org/r/20200528201402.1708239-2-sashal@kernel.org --- arch/x86/kernel/ptrace.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 44130588987f..1c7646c8d0fe 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -379,25 +379,12 @@ static int putreg(struct task_struct *child, case offsetof(struct user_regs_struct,fs_base): if (value >= TASK_SIZE_MAX) return -EIO; - /* - * When changing the FS base, use do_arch_prctl_64() - * to set the index to zero and to set the base - * as requested. - * - * NB: This behavior is nonsensical and likely needs to - * change when FSGSBASE support is added. - */ - if (child->thread.fsbase != value) - return do_arch_prctl_64(child, ARCH_SET_FS, value); + x86_fsbase_write_task(child, value); return 0; case offsetof(struct user_regs_struct,gs_base): - /* - * Exactly the same here as the %fs handling above. - */ if (value >= TASK_SIZE_MAX) return -EIO; - if (child->thread.gsbase != value) - return do_arch_prctl_64(child, ARCH_SET_GS, value); + x86_gsbase_write_task(child, value); return 0; #endif } From dd649bd0b3aa012740059b1ba31ecad28a408f7f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 May 2020 16:13:48 -0400 Subject: [PATCH 02/21] x86/cpu: Add 'unsafe_fsgsbase' to enable CR4.FSGSBASE This is temporary. It will allow the next few patches to be tested incrementally. Setting unsafe_fsgsbase is a root hole. Don't do it. Signed-off-by: Andy Lutomirski Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Sasha Levin Signed-off-by: Thomas Gleixner Reviewed-by: Andi Kleen Reviewed-by: Andy Lutomirski Link: https://lkml.kernel.org/r/1557309753-24073-4-git-send-email-chang.seok.bae@intel.com Link: https://lkml.kernel.org/r/20200528201402.1708239-3-sashal@kernel.org --- .../admin-guide/kernel-parameters.txt | 3 +++ arch/x86/kernel/cpu/common.c | 24 +++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index fb95fad81c79..7308db7308ba 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3079,6 +3079,9 @@ no5lvl [X86-64] Disable 5-level paging mode. Forces kernel to use 4-level paging instead. + unsafe_fsgsbase [X86] Allow FSGSBASE instructions. This will be + replaced with a nofsgsbase flag. + no_console_suspend [HW] Never suspend the console Disable suspending of consoles during suspend and diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 043d93cdcaad..7438a318e5aa 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -441,6 +441,22 @@ static void __init setup_cr_pinning(void) static_key_enable(&cr_pinning.key); } +/* + * Temporary hack: FSGSBASE is unsafe until a few kernel code paths are + * updated. This allows us to get the kernel ready incrementally. + * + * Once all the pieces are in place, these will go away and be replaced with + * a nofsgsbase chicken flag. + */ +static bool unsafe_fsgsbase; + +static __init int setup_unsafe_fsgsbase(char *arg) +{ + unsafe_fsgsbase = true; + return 1; +} +__setup("unsafe_fsgsbase", setup_unsafe_fsgsbase); + /* * Protection Keys are not available in 32-bit mode. */ @@ -1495,6 +1511,14 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_smap(c); setup_umip(c); + /* Enable FSGSBASE instructions if available. */ + if (cpu_has(c, X86_FEATURE_FSGSBASE)) { + if (unsafe_fsgsbase) + cr4_set_bits(X86_CR4_FSGSBASE); + else + clear_cpu_cap(c, X86_FEATURE_FSGSBASE); + } + /* * The vendor-specific functions might have changed features. * Now we do "generic changes." From b15378ca50810c1350086cafad3fe19979262a83 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 28 May 2020 16:13:49 -0400 Subject: [PATCH 03/21] x86/fsgsbase/64: Add intrinsics for FSGSBASE instructions [ luto: Rename the variables from FS and GS to FSBASE and GSBASE and make safe to include on 32-bit kernels. ] Signed-off-by: Andi Kleen Signed-off-by: Andy Lutomirski Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Sasha Levin Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1557309753-24073-6-git-send-email-chang.seok.bae@intel.com Link: https://lkml.kernel.org/r/20200528201402.1708239-4-sashal@kernel.org --- arch/x86/include/asm/fsgsbase.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h index bca4c743de77..fdd1177499b4 100644 --- a/arch/x86/include/asm/fsgsbase.h +++ b/arch/x86/include/asm/fsgsbase.h @@ -19,6 +19,36 @@ extern unsigned long x86_gsbase_read_task(struct task_struct *task); extern void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase); extern void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase); +/* Must be protected by X86_FEATURE_FSGSBASE check. */ + +static __always_inline unsigned long rdfsbase(void) +{ + unsigned long fsbase; + + asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory"); + + return fsbase; +} + +static __always_inline unsigned long rdgsbase(void) +{ + unsigned long gsbase; + + asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory"); + + return gsbase; +} + +static __always_inline void wrfsbase(unsigned long fsbase) +{ + asm volatile("wrfsbase %0" :: "r" (fsbase) : "memory"); +} + +static __always_inline void wrgsbase(unsigned long gsbase) +{ + asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory"); +} + /* Helper functions for reading/writing FS/GS base */ static inline unsigned long x86_fsbase_read_cpu(void) From 58edfd2e0a93c9adc2f29902a0335af0584041a0 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 28 May 2020 16:13:50 -0400 Subject: [PATCH 04/21] x86/fsgsbase/64: Enable FSGSBASE instructions in helper functions Add cpu feature conditional FSGSBASE access to the relevant helper functions. That allows to accelerate certain FS/GS base operations in subsequent changes. Note, that while possible, the user space entry/exit GSBASE operations are not going to use the new FSGSBASE instructions. The reason is that it would require additional storage for the user space value which adds more complexity to the low level code and experiments have shown marginal benefit. This may be revisited later but for now the SWAPGS based handling in the entry code is preserved except for the paranoid entry/exit code. To preserve the SWAPGS entry mechanism introduce __[rd|wr]gsbase_inactive() helpers. Note, for Xen PV, paravirt hooks can be added later as they might allow a very efficient but different implementation. [ tglx: Massaged changelog, convert it to noinstr and force inline native_swapgs() ] Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Sasha Levin Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1557309753-24073-7-git-send-email-chang.seok.bae@intel.com Link: https://lkml.kernel.org/r/20200528201402.1708239-5-sashal@kernel.org --- arch/x86/include/asm/fsgsbase.h | 27 ++++++------- arch/x86/include/asm/processor.h | 2 +- arch/x86/kernel/process_64.c | 68 ++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h index fdd1177499b4..aefd53767a5d 100644 --- a/arch/x86/include/asm/fsgsbase.h +++ b/arch/x86/include/asm/fsgsbase.h @@ -49,35 +49,32 @@ static __always_inline void wrgsbase(unsigned long gsbase) asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory"); } +#include + /* Helper functions for reading/writing FS/GS base */ static inline unsigned long x86_fsbase_read_cpu(void) { unsigned long fsbase; - rdmsrl(MSR_FS_BASE, fsbase); + if (static_cpu_has(X86_FEATURE_FSGSBASE)) + fsbase = rdfsbase(); + else + rdmsrl(MSR_FS_BASE, fsbase); return fsbase; } -static inline unsigned long x86_gsbase_read_cpu_inactive(void) -{ - unsigned long gsbase; - - rdmsrl(MSR_KERNEL_GS_BASE, gsbase); - - return gsbase; -} - static inline void x86_fsbase_write_cpu(unsigned long fsbase) { - wrmsrl(MSR_FS_BASE, fsbase); + if (static_cpu_has(X86_FEATURE_FSGSBASE)) + wrfsbase(fsbase); + else + wrmsrl(MSR_FS_BASE, fsbase); } -static inline void x86_gsbase_write_cpu_inactive(unsigned long gsbase) -{ - wrmsrl(MSR_KERNEL_GS_BASE, gsbase); -} +extern unsigned long x86_gsbase_read_cpu_inactive(void); +extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase); #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 42cd333616c4..f66202d6121a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -575,7 +575,7 @@ native_load_sp0(unsigned long sp0) this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); } -static inline void native_swapgs(void) +static __always_inline void native_swapgs(void) { #ifdef CONFIG_X86_64 asm volatile("swapgs" ::: "memory"); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9a97415b2139..c41e0aae8426 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -149,6 +149,44 @@ enum which_selector { GS }; +/* + * Out of line to be protected from kprobes and tracing. If this would be + * traced or probed than any access to a per CPU variable happens with + * the wrong GS. + * + * It is not used on Xen paravirt. When paravirt support is needed, it + * needs to be renamed with native_ prefix. + */ +static noinstr unsigned long __rdgsbase_inactive(void) +{ + unsigned long gsbase; + + lockdep_assert_irqs_disabled(); + + native_swapgs(); + gsbase = rdgsbase(); + native_swapgs(); + + return gsbase; +} + +/* + * Out of line to be protected from kprobes and tracing. If this would be + * traced or probed than any access to a per CPU variable happens with + * the wrong GS. + * + * It is not used on Xen paravirt. When paravirt support is needed, it + * needs to be renamed with native_ prefix. + */ +static noinstr void __wrgsbase_inactive(unsigned long gsbase) +{ + lockdep_assert_irqs_disabled(); + + native_swapgs(); + wrgsbase(gsbase); + native_swapgs(); +} + /* * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are * not available. The goal is to be reasonably fast on non-FSGSBASE systems. @@ -327,6 +365,36 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task, return base; } +unsigned long x86_gsbase_read_cpu_inactive(void) +{ + unsigned long gsbase; + + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + unsigned long flags; + + local_irq_save(flags); + gsbase = __rdgsbase_inactive(); + local_irq_restore(flags); + } else { + rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + } + + return gsbase; +} + +void x86_gsbase_write_cpu_inactive(unsigned long gsbase) +{ + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + unsigned long flags; + + local_irq_save(flags); + __wrgsbase_inactive(gsbase); + local_irq_restore(flags); + } else { + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + } +} + unsigned long x86_fsbase_read_task(struct task_struct *task) { unsigned long fsbase; From 6758034e4d6a7f0e26b748789ab1f83f3116d1b9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 28 May 2020 16:13:52 -0400 Subject: [PATCH 05/21] x86/process/64: Make save_fsgs_for_kvm() ready for FSGSBASE save_fsgs_for_kvm() is invoked via vcpu_enter_guest() kvm_x86_ops.prepare_guest_switch(vcpu) vmx_prepare_switch_to_guest() save_fsgs_for_kvm() with preemption disabled, but interrupts enabled. The upcoming FSGSBASE based GS safe needs interrupts to be disabled. This could be done in the helper function, but that function is also called from switch_to() which has interrupts disabled already. Disable interrupts inside save_fsgs_for_kvm() and rename the function to current_save_fsgs() so it can be invoked from other places. Signed-off-by: Thomas Gleixner Signed-off-by: Sasha Levin Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200528201402.1708239-7-sashal@kernel.org --- arch/x86/include/asm/processor.h | 4 +--- arch/x86/kernel/process_64.c | 15 +++++++++------ arch/x86/kvm/vmx/vmx.c | 2 +- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f66202d6121a..7c2ecdf7e064 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -457,10 +457,8 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu) DECLARE_PER_CPU(unsigned int, irq_count); extern asmlinkage void ignore_sysret(void); -#if IS_ENABLED(CONFIG_KVM) /* Save actual FS/GS selectors and bases to current->thread */ -void save_fsgs_for_kvm(void); -#endif +void current_save_fsgs(void); #else /* X86_64 */ #ifdef CONFIG_STACKPROTECTOR /* diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c41e0aae8426..ef2f75588f41 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -240,18 +240,21 @@ static __always_inline void save_fsgs(struct task_struct *task) save_base_legacy(task, task->thread.gsindex, GS); } -#if IS_ENABLED(CONFIG_KVM) /* * While a process is running,current->thread.fsbase and current->thread.gsbase - * may not match the corresponding CPU registers (see save_base_legacy()). KVM - * wants an efficient way to save and restore FSBASE and GSBASE. - * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE. + * may not match the corresponding CPU registers (see save_base_legacy()). */ -void save_fsgs_for_kvm(void) +void current_save_fsgs(void) { + unsigned long flags; + + /* Interrupts need to be off for FSGSBASE */ + local_irq_save(flags); save_fsgs(current); + local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(save_fsgs_for_kvm); +#if IS_ENABLED(CONFIG_KVM) +EXPORT_SYMBOL_GPL(current_save_fsgs); #endif static __always_inline void loadseg(enum which_selector which, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 36c771728c8c..ccd5b7bb617c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1172,7 +1172,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) gs_base = cpu_kernelmode_gs_base(cpu); if (likely(is_64bit_mm(current->mm))) { - save_fsgs_for_kvm(); + current_save_fsgs(); fs_sel = current->thread.fsindex; gs_sel = current->thread.gsindex; fs_base = current->thread.fsbase; From 673903495c85137791d5820d690229efe09c8f7b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 May 2020 16:13:51 -0400 Subject: [PATCH 06/21] x86/process/64: Use FSBSBASE in switch_to() if available With the new FSGSBASE instructions, FS and GSABSE can be efficiently read and writen in __switch_to(). Use that capability to preserve the full state. This will enable user code to do whatever it wants with the new instructions without any kernel-induced gotchas. (There can still be architectural gotchas: movl %gs,%eax; movl %eax,%gs may change GSBASE if WRGSBASE was used, but users are expected to read the CPU manual before doing things like that.) This is a considerable speedup. It seems to save about 100 cycles per context switch compared to the baseline 4.6-rc1 behavior on a Skylake laptop. This is mostly due to avoiding the WRMSR operation. [ chang: 5~10% performance improvements were seen with a context switch benchmark that ran threads with different FS/GSBASE values (to the baseline 4.16). Minor edit on the changelog. ] [ tglx: Masaage changelog ] Signed-off-by: Andy Lutomirski Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Sasha Levin Signed-off-by: Thomas Gleixner Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1557309753-24073-8-git-send-email-chang.seok.bae@intel.com Link: https://lkml.kernel.org/r/20200528201402.1708239-6-sashal@kernel.org --- arch/x86/kernel/process_64.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ef2f75588f41..8ccc587c311b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -236,8 +236,18 @@ static __always_inline void save_fsgs(struct task_struct *task) { savesegment(fs, task->thread.fsindex); savesegment(gs, task->thread.gsindex); - save_base_legacy(task, task->thread.fsindex, FS); - save_base_legacy(task, task->thread.gsindex, GS); + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + /* + * If FSGSBASE is enabled, we can't make any useful guesses + * about the base, and user code expects us to save the current + * value. Fortunately, reading the base directly is efficient. + */ + task->thread.fsbase = rdfsbase(); + task->thread.gsbase = __rdgsbase_inactive(); + } else { + save_base_legacy(task, task->thread.fsindex, FS); + save_base_legacy(task, task->thread.gsindex, GS); + } } /* @@ -319,10 +329,22 @@ static __always_inline void load_seg_legacy(unsigned short prev_index, static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, struct thread_struct *next) { - load_seg_legacy(prev->fsindex, prev->fsbase, - next->fsindex, next->fsbase, FS); - load_seg_legacy(prev->gsindex, prev->gsbase, - next->gsindex, next->gsbase, GS); + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + /* Update the FS and GS selectors if they could have changed. */ + if (unlikely(prev->fsindex || next->fsindex)) + loadseg(FS, next->fsindex); + if (unlikely(prev->gsindex || next->gsindex)) + loadseg(GS, next->gsindex); + + /* Update the bases. */ + wrfsbase(next->fsbase); + __wrgsbase_inactive(next->gsbase); + } else { + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); + } } static unsigned long x86_fsgsbase_read_task(struct task_struct *task, From 005f141e5d5e05d3986539567d0bc5aa2f4dc640 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 28 May 2020 16:13:53 -0400 Subject: [PATCH 07/21] x86/process/64: Use FSGSBASE instructions on thread copy and ptrace When FSGSBASE is enabled, copying threads and reading fsbase and gsbase using ptrace must read the actual values. When copying a thread, use save_fsgs() and copy the saved values. For ptrace, the bases must be read from memory regardless of the selector if FSGSBASE is enabled. [ tglx: Invoke __rdgsbase_inactive() with interrupts disabled ] [ luto: Massage changelog ] Suggested-by: Andy Lutomirski Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Sasha Levin Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1557309753-24073-9-git-send-email-chang.seok.bae@intel.com Link: https://lkml.kernel.org/r/20200528201402.1708239-8-sashal@kernel.org --- arch/x86/kernel/process.c | 10 ++++++---- arch/x86/kernel/process_64.c | 6 ++++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index f362ce0d5ac0..216c88df1919 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -140,10 +140,12 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); #ifdef CONFIG_X86_64 - savesegment(gs, p->thread.gsindex); - p->thread.gsbase = p->thread.gsindex ? 0 : current->thread.gsbase; - savesegment(fs, p->thread.fsindex); - p->thread.fsbase = p->thread.fsindex ? 0 : current->thread.fsbase; + current_save_fsgs(); + p->thread.fsindex = current->thread.fsindex; + p->thread.fsbase = current->thread.fsbase; + p->thread.gsindex = current->thread.gsindex; + p->thread.gsbase = current->thread.gsbase; + savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); #else diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 8ccc587c311b..d618969cea66 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -426,7 +426,8 @@ unsigned long x86_fsbase_read_task(struct task_struct *task) if (task == current) fsbase = x86_fsbase_read_cpu(); - else if (task->thread.fsindex == 0) + else if (static_cpu_has(X86_FEATURE_FSGSBASE) || + (task->thread.fsindex == 0)) fsbase = task->thread.fsbase; else fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex); @@ -440,7 +441,8 @@ unsigned long x86_gsbase_read_task(struct task_struct *task) if (task == current) gsbase = x86_gsbase_read_cpu_inactive(); - else if (task->thread.gsindex == 0) + else if (static_cpu_has(X86_FEATURE_FSGSBASE) || + (task->thread.gsindex == 0)) gsbase = task->thread.gsbase; else gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex); From 978e1342c3c4d7b20808fd5875d9ac0d57db22ee Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 28 May 2020 16:13:54 -0400 Subject: [PATCH 08/21] x86/speculation/swapgs: Check FSGSBASE in enabling SWAPGS mitigation Before enabling FSGSBASE the kernel could safely assume that the content of GS base was a user address. Thus any speculative access as the result of a mispredicted branch controlling the execution of SWAPGS would be to a user address. So systems with speculation-proof SMAP did not need to add additional LFENCE instructions to mitigate. With FSGSBASE enabled a hostile user can set GS base to a kernel address. So they can make the kernel speculatively access data they wish to leak via a side channel. This means that SMAP provides no protection. Add FSGSBASE as an additional condition to enable the fence-based SWAPGS mitigation. Signed-off-by: Tony Luck Signed-off-by: Chang S. Bae Signed-off-by: Sasha Levin Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200528201402.1708239-9-sashal@kernel.org --- arch/x86/kernel/cpu/bugs.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 0b71970d2d3d..5ea5fbd76ad0 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -543,14 +543,12 @@ static void __init spectre_v1_select_mitigation(void) * If FSGSBASE is enabled, the user can put a kernel address in * GS, in which case SMAP provides no protection. * - * [ NOTE: Don't check for X86_FEATURE_FSGSBASE until the - * FSGSBASE enablement patches have been merged. ] - * * If FSGSBASE is disabled, the user can only put a user space * address in GS. That makes an attack harder, but still * possible if there's no SMAP protection. */ - if (!smap_works_speculatively()) { + if (boot_cpu_has(X86_FEATURE_FSGSBASE) || + !smap_works_speculatively()) { /* * Mitigation can be provided from SWAPGS itself or * PTI as the CR3 write in the Meltdown mitigation From 96b2371413e8f636a5f25c42a933af21c35a2a41 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 28 May 2020 16:13:55 -0400 Subject: [PATCH 09/21] x86/entry/64: Switch CR3 before SWAPGS in paranoid entry When FSGSBASE is enabled, the GSBASE handling in paranoid entry will need to retrieve the kernel GSBASE which requires that the kernel page table is active. As the CR3 switch to the kernel page tables (PTI is active) does not depend on kernel GSBASE, move the CR3 switch in front of the GSBASE handling. Comment the EBX content while at it. No functional change. [ tglx: Rewrote changelog and comments ] Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Sasha Levin Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1557309753-24073-11-git-send-email-chang.seok.bae@intel.com Link: https://lkml.kernel.org/r/20200528201402.1708239-10-sashal@kernel.org --- arch/x86/entry/entry_64.S | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d2a00c97e53f..04d1eea3ac3b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -807,13 +807,6 @@ SYM_CODE_START_LOCAL(paranoid_entry) cld PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 - movl $1, %ebx - movl $MSR_GS_BASE, %ecx - rdmsr - testl %edx, %edx - js 1f /* negative -> in kernel */ - SWAPGS - xorl %ebx, %ebx 1: /* @@ -825,9 +818,29 @@ SYM_CODE_START_LOCAL(paranoid_entry) * This is also why CS (stashed in the "iret frame" by the * hardware at entry) can not be used: this may be a return * to kernel code, but with a user CR3 value. + * + * Switching CR3 does not depend on kernel GSBASE so it can + * be done before switching to the kernel GSBASE. This is + * required for FSGSBASE because the kernel GSBASE has to + * be retrieved from a kernel internal table. */ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 + /* EBX = 1 -> kernel GSBASE active, no restore required */ + movl $1, %ebx + /* + * The kernel-enforced convention is a negative GSBASE indicates + * a kernel value. No SWAPGS needed on entry and exit. + */ + movl $MSR_GS_BASE, %ecx + rdmsr + testl %edx, %edx + jns .Lparanoid_entry_swapgs + ret + +.Lparanoid_entry_swapgs: + SWAPGS + /* * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an * unconditional CR3 write, even in the PTI case. So do an lfence @@ -835,6 +848,8 @@ SYM_CODE_START_LOCAL(paranoid_entry) */ FENCE_SWAPGS_KERNEL_ENTRY + /* EBX = 0 -> SWAPGS required on exit */ + xorl %ebx, %ebx ret SYM_CODE_END(paranoid_entry) @@ -852,7 +867,8 @@ SYM_CODE_END(paranoid_entry) */ SYM_CODE_START_LOCAL(paranoid_exit) UNWIND_HINT_REGS - testl %ebx, %ebx /* swapgs needed? */ + /* If EBX is 0, SWAPGS is required */ + testl %ebx, %ebx jnz .Lparanoid_exit_no_swapgs /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 From eaad981291ee36efee15a5e515d4598ae94ace07 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 28 May 2020 16:13:56 -0400 Subject: [PATCH 10/21] x86/entry/64: Introduce the FIND_PERCPU_BASE macro GSBASE is used to find per-CPU data in the kernel. But when GSBASE is unknown, the per-CPU base can be found from the per_cpu_offset table with a CPU NR. The CPU NR is extracted from the limit field of the CPUNODE entry in GDT, or by the RDPID instruction. This is a prerequisite for using FSGSBASE in the low level entry code. Also, add the GAS-compatible RDPID macro as binutils 2.23 do not support it. Support is added in version 2.27. [ tglx: Massaged changelog ] Suggested-by: H. Peter Anvin Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Sasha Levin Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1557309753-24073-12-git-send-email-chang.seok.bae@intel.com Link: https://lkml.kernel.org/r/20200528201402.1708239-11-sashal@kernel.org --- arch/x86/entry/calling.h | 34 ++++++++++++++++++++++++++++++++++ arch/x86/include/asm/inst.h | 15 +++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 4208c1e3f601..5c0cbb443030 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -6,6 +6,7 @@ #include #include #include +#include /* @@ -351,3 +352,36 @@ For 32-bit we have the following conventions - kernel is built with call stackleak_erase #endif .endm + +#ifdef CONFIG_SMP + +/* + * CPU/node NR is loaded from the limit (size) field of a special segment + * descriptor entry in GDT. + */ +.macro LOAD_CPU_AND_NODE_SEG_LIMIT reg:req + movq $__CPUNODE_SEG, \reg + lsl \reg, \reg +.endm + +/* + * Fetch the per-CPU GSBASE value for this processor and put it in @reg. + * We normally use %gs for accessing per-CPU data, but we are setting up + * %gs here and obviously can not use %gs itself to access per-CPU data. + */ +.macro GET_PERCPU_BASE reg:req + ALTERNATIVE \ + "LOAD_CPU_AND_NODE_SEG_LIMIT \reg", \ + "RDPID \reg", \ + X86_FEATURE_RDPID + andq $VDSO_CPUNODE_MASK, \reg + movq __per_cpu_offset(, \reg, 8), \reg +.endm + +#else + +.macro GET_PERCPU_BASE reg:req + movq pcpu_unit_offsets(%rip), \reg +.endm + +#endif /* CONFIG_SMP */ diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h index f5a796da07f8..d063841a17e3 100644 --- a/arch/x86/include/asm/inst.h +++ b/arch/x86/include/asm/inst.h @@ -306,6 +306,21 @@ .endif MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2 .endm + +.macro RDPID opd + REG_TYPE rdpid_opd_type \opd + .if rdpid_opd_type == REG_TYPE_R64 + R64_NUM rdpid_opd \opd + .else + R32_NUM rdpid_opd \opd + .endif + .byte 0xf3 + .if rdpid_opd > 7 + PFX_REX rdpid_opd 0 + .endif + .byte 0x0f, 0xc7 + MODRM 0xc0 rdpid_opd 0x7 +.endm #endif #endif From c82965f9e53005c1c62632c468968293262056cb Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 28 May 2020 16:13:57 -0400 Subject: [PATCH 11/21] x86/entry/64: Handle FSGSBASE enabled paranoid entry/exit Without FSGSBASE, user space cannot change GSBASE other than through a PRCTL. The kernel enforces that the user space GSBASE value is postive as negative values are used for detecting the kernel space GSBASE value in the paranoid entry code. If FSGSBASE is enabled, user space can set arbitrary GSBASE values without kernel intervention, including negative ones, which breaks the paranoid entry assumptions. To avoid this, paranoid entry needs to unconditionally save the current GSBASE value independent of the interrupted context, retrieve and write the kernel GSBASE and unconditionally restore the saved value on exit. The restore happens either in paranoid_exit or in the special exit path of the NMI low level code. All other entry code pathes which use unconditional SWAPGS are not affected as they do not depend on the actual content. [ tglx: Massaged changelogs and comments ] Suggested-by: H. Peter Anvin Suggested-by: Andy Lutomirski Suggested-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Sasha Levin Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1557309753-24073-13-git-send-email-chang.seok.bae@intel.com Link: https://lkml.kernel.org/r/20200528201402.1708239-12-sashal@kernel.org --- arch/x86/entry/calling.h | 6 +++ arch/x86/entry/entry_64.S | 111 +++++++++++++++++++++++++++++--------- 2 files changed, 91 insertions(+), 26 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 5c0cbb443030..98e4d8886f11 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -342,6 +342,12 @@ For 32-bit we have the following conventions - kernel is built with #endif .endm +.macro SAVE_AND_SET_GSBASE scratch_reg:req save_reg:req + rdgsbase \save_reg + GET_PERCPU_BASE \scratch_reg + wrgsbase \scratch_reg +.endm + #else /* CONFIG_X86_64 */ # undef UNWIND_HINT_IRET_REGS # define UNWIND_HINT_IRET_REGS diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 04d1eea3ac3b..fb729f4c4fbc 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -38,6 +38,7 @@ #include #include #include +#include #include #include "calling.h" @@ -426,10 +427,7 @@ SYM_CODE_START(\asmsym) testb $3, CS-ORIG_RAX(%rsp) jnz .Lfrom_usermode_switch_stack_\@ - /* - * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX. - * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS - */ + /* paranoid_entry returns GS information for paranoid_exit in EBX. */ call paranoid_entry UNWIND_HINT_REGS @@ -458,10 +456,7 @@ SYM_CODE_START(\asmsym) UNWIND_HINT_IRET_REGS offset=8 ASM_CLAC - /* - * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX. - * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS - */ + /* paranoid_entry returns GS information for paranoid_exit in EBX. */ call paranoid_entry UNWIND_HINT_REGS @@ -798,9 +793,14 @@ SYM_CODE_END(xen_failsafe_callback) #endif /* CONFIG_XEN_PV */ /* - * Save all registers in pt_regs, and switch gs if needed. - * Use slow, but surefire "are we in kernel?" check. - * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + * Save all registers in pt_regs. Return GSBASE related information + * in EBX depending on the availability of the FSGSBASE instructions: + * + * FSGSBASE R/EBX + * N 0 -> SWAPGS on exit + * 1 -> no SWAPGS on exit + * + * Y GSBASE value at entry, must be restored in paranoid_exit */ SYM_CODE_START_LOCAL(paranoid_entry) UNWIND_HINT_FUNC @@ -808,7 +808,6 @@ SYM_CODE_START_LOCAL(paranoid_entry) PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 -1: /* * Always stash CR3 in %r14. This value will be restored, * verbatim, at exit. Needed if paranoid_entry interrupted @@ -826,6 +825,28 @@ SYM_CODE_START_LOCAL(paranoid_entry) */ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 + /* + * Handling GSBASE depends on the availability of FSGSBASE. + * + * Without FSGSBASE the kernel enforces that negative GSBASE + * values indicate kernel GSBASE. With FSGSBASE no assumptions + * can be made about the GSBASE value when entering from user + * space. + */ + ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE + + /* + * Read the current GSBASE and store it in %rbx unconditionally, + * retrieve and set the current CPUs kernel GSBASE. The stored value + * has to be restored in paranoid_exit unconditionally. + * + * The MSR write ensures that no subsequent load is based on a + * mispredicted GSBASE. No extra FENCE required. + */ + SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx + ret + +.Lparanoid_entry_checkgs: /* EBX = 1 -> kernel GSBASE active, no restore required */ movl $1, %ebx /* @@ -860,24 +881,45 @@ SYM_CODE_END(paranoid_entry) * * We may be returning to very strange contexts (e.g. very early * in syscall entry), so checking for preemption here would - * be complicated. Fortunately, we there's no good reason - * to try to handle preemption here. + * be complicated. Fortunately, there's no good reason to try + * to handle preemption here. * - * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) + * R/EBX contains the GSBASE related information depending on the + * availability of the FSGSBASE instructions: + * + * FSGSBASE R/EBX + * N 0 -> SWAPGS on exit + * 1 -> no SWAPGS on exit + * + * Y User space GSBASE, must be restored unconditionally */ SYM_CODE_START_LOCAL(paranoid_exit) UNWIND_HINT_REGS - /* If EBX is 0, SWAPGS is required */ - testl %ebx, %ebx - jnz .Lparanoid_exit_no_swapgs - /* Always restore stashed CR3 value (see paranoid_entry) */ - RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 + /* + * The order of operations is important. RESTORE_CR3 requires + * kernel GSBASE. + * + * NB to anyone to try to optimize this code: this code does + * not execute at all for exceptions from user mode. Those + * exceptions go through error_exit instead. + */ + RESTORE_CR3 scratch_reg=%rax save_reg=%r14 + + /* Handle the three GSBASE cases */ + ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE + + /* With FSGSBASE enabled, unconditionally restore GSBASE */ + wrgsbase %rbx + jmp restore_regs_and_return_to_kernel + +.Lparanoid_exit_checkgs: + /* On non-FSGSBASE systems, conditionally do SWAPGS */ + testl %ebx, %ebx + jnz restore_regs_and_return_to_kernel + + /* We are returning to a context with user GSBASE */ SWAPGS_UNSAFE_STACK - jmp restore_regs_and_return_to_kernel -.Lparanoid_exit_no_swapgs: - /* Always restore stashed CR3 value (see paranoid_entry) */ - RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 - jmp restore_regs_and_return_to_kernel + jmp restore_regs_and_return_to_kernel SYM_CODE_END(paranoid_exit) /* @@ -1282,10 +1324,27 @@ end_repeat_nmi: /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 - testl %ebx, %ebx /* swapgs needed? */ + /* + * The above invocation of paranoid_entry stored the GSBASE + * related information in R/EBX depending on the availability + * of FSGSBASE. + * + * If FSGSBASE is enabled, restore the saved GSBASE value + * unconditionally, otherwise take the conditional SWAPGS path. + */ + ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE + + wrgsbase %rbx + jmp nmi_restore + +nmi_no_fsgsbase: + /* EBX == 0 -> invoke SWAPGS */ + testl %ebx, %ebx jnz nmi_restore + nmi_swapgs: SWAPGS_UNSAFE_STACK + nmi_restore: POP_REGS From b745cfba44c152c34363eea9e052367b6b1d652b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 May 2020 16:13:58 -0400 Subject: [PATCH 12/21] x86/cpu: Enable FSGSBASE on 64bit by default and add a chicken bit Now that FSGSBASE is fully supported, remove unsafe_fsgsbase, enable FSGSBASE by default, and add nofsgsbase to disable it. Signed-off-by: Andy Lutomirski Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Sasha Levin Signed-off-by: Thomas Gleixner Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1557309753-24073-17-git-send-email-chang.seok.bae@intel.com Link: https://lkml.kernel.org/r/20200528201402.1708239-13-sashal@kernel.org --- .../admin-guide/kernel-parameters.txt | 3 +- arch/x86/kernel/cpu/common.c | 32 ++++++++----------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 7308db7308ba..8c0d045f1dc7 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3079,8 +3079,7 @@ no5lvl [X86-64] Disable 5-level paging mode. Forces kernel to use 4-level paging instead. - unsafe_fsgsbase [X86] Allow FSGSBASE instructions. This will be - replaced with a nofsgsbase flag. + nofsgsbase [X86] Disables FSGSBASE instructions. no_console_suspend [HW] Never suspend the console diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7438a318e5aa..18857ceb6bce 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -441,21 +441,21 @@ static void __init setup_cr_pinning(void) static_key_enable(&cr_pinning.key); } -/* - * Temporary hack: FSGSBASE is unsafe until a few kernel code paths are - * updated. This allows us to get the kernel ready incrementally. - * - * Once all the pieces are in place, these will go away and be replaced with - * a nofsgsbase chicken flag. - */ -static bool unsafe_fsgsbase; - -static __init int setup_unsafe_fsgsbase(char *arg) +static __init int x86_nofsgsbase_setup(char *arg) { - unsafe_fsgsbase = true; + /* Require an exact match without trailing characters. */ + if (strlen(arg)) + return 0; + + /* Do not emit a message if the feature is not present. */ + if (!boot_cpu_has(X86_FEATURE_FSGSBASE)) + return 1; + + setup_clear_cpu_cap(X86_FEATURE_FSGSBASE); + pr_info("FSGSBASE disabled via kernel command line\n"); return 1; } -__setup("unsafe_fsgsbase", setup_unsafe_fsgsbase); +__setup("nofsgsbase", x86_nofsgsbase_setup); /* * Protection Keys are not available in 32-bit mode. @@ -1512,12 +1512,8 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_umip(c); /* Enable FSGSBASE instructions if available. */ - if (cpu_has(c, X86_FEATURE_FSGSBASE)) { - if (unsafe_fsgsbase) - cr4_set_bits(X86_CR4_FSGSBASE); - else - clear_cpu_cap(c, X86_FEATURE_FSGSBASE); - } + if (cpu_has(c, X86_FEATURE_FSGSBASE)) + cr4_set_bits(X86_CR4_FSGSBASE); /* * The vendor-specific functions might have changed features. From 742c45c3ecc9255e15eddbbcee44fd8de401cf1c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 28 May 2020 16:13:59 -0400 Subject: [PATCH 13/21] x86/elf: Enumerate kernel FSGSBASE capability in AT_HWCAP2 The kernel needs to explicitly enable FSGSBASE. So, the application needs to know if it can safely use these instructions. Just looking at the CPUID bit is not enough because it may be running in a kernel that does not enable the instructions. One way for the application would be to just try and catch the SIGILL. But that is difficult to do in libraries which may not want to overwrite the signal handlers of the main application. Enumerate the enabled FSGSBASE capability in bit 1 of AT_HWCAP2 in the ELF aux vector. AT_HWCAP2 is already used by PPC for similar purposes. The application can access it open coded or by using the getauxval() function in newer versions of glibc. [ tglx: Massaged changelog ] Signed-off-by: Andi Kleen Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Signed-off-by: Sasha Levin Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1557309753-24073-18-git-send-email-chang.seok.bae@intel.com Link: https://lkml.kernel.org/r/20200528201402.1708239-14-sashal@kernel.org --- arch/x86/include/uapi/asm/hwcap2.h | 3 +++ arch/x86/kernel/cpu/common.c | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h index 8b2effe6efb8..5fdfcb47000f 100644 --- a/arch/x86/include/uapi/asm/hwcap2.h +++ b/arch/x86/include/uapi/asm/hwcap2.h @@ -5,4 +5,7 @@ /* MONITOR/MWAIT enabled in Ring 3 */ #define HWCAP2_RING3MWAIT (1 << 0) +/* Kernel allows FSGSBASE instructions available in Ring 3 */ +#define HWCAP2_FSGSBASE BIT(1) + #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 18857ceb6bce..fca56129ddc7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1512,8 +1512,10 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_umip(c); /* Enable FSGSBASE instructions if available. */ - if (cpu_has(c, X86_FEATURE_FSGSBASE)) + if (cpu_has(c, X86_FEATURE_FSGSBASE)) { cr4_set_bits(X86_CR4_FSGSBASE); + elf_hwcap2 |= HWCAP2_FSGSBASE; + } /* * The vendor-specific functions might have changed features. From 82c0c7d24c1a034d94af4595dbd3910d7336a6dc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 28 May 2020 16:14:00 -0400 Subject: [PATCH 14/21] Documentation/x86/64: Add documentation for GS/FS addressing mode Explain how the GS/FS based addressing can be utilized in user space applications along with the differences between the generic prctl() based GS/FS base control and the FSGSBASE version available on newer CPUs. Originally-by: Andi Kleen Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Sasha Levin Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/20200528201402.1708239-15-sashal@kernel.org --- Documentation/x86/x86_64/fsgs.rst | 199 +++++++++++++++++++++++++++++ Documentation/x86/x86_64/index.rst | 1 + 2 files changed, 200 insertions(+) create mode 100644 Documentation/x86/x86_64/fsgs.rst diff --git a/Documentation/x86/x86_64/fsgs.rst b/Documentation/x86/x86_64/fsgs.rst new file mode 100644 index 000000000000..50960e09e1f6 --- /dev/null +++ b/Documentation/x86/x86_64/fsgs.rst @@ -0,0 +1,199 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Using FS and GS segments in user space applications +=================================================== + +The x86 architecture supports segmentation. Instructions which access +memory can use segment register based addressing mode. The following +notation is used to address a byte within a segment: + + Segment-register:Byte-address + +The segment base address is added to the Byte-address to compute the +resulting virtual address which is accessed. This allows to access multiple +instances of data with the identical Byte-address, i.e. the same code. The +selection of a particular instance is purely based on the base-address in +the segment register. + +In 32-bit mode the CPU provides 6 segments, which also support segment +limits. The limits can be used to enforce address space protections. + +In 64-bit mode the CS/SS/DS/ES segments are ignored and the base address is +always 0 to provide a full 64bit address space. The FS and GS segments are +still functional in 64-bit mode. + +Common FS and GS usage +------------------------------ + +The FS segment is commonly used to address Thread Local Storage (TLS). FS +is usually managed by runtime code or a threading library. Variables +declared with the '__thread' storage class specifier are instantiated per +thread and the compiler emits the FS: address prefix for accesses to these +variables. Each thread has its own FS base address so common code can be +used without complex address offset calculations to access the per thread +instances. Applications should not use FS for other purposes when they use +runtimes or threading libraries which manage the per thread FS. + +The GS segment has no common use and can be used freely by +applications. GCC and Clang support GS based addressing via address space +identifiers. + +Reading and writing the FS/GS base address +------------------------------------------ + +There exist two mechanisms to read and write the FS/GS base address: + + - the arch_prctl() system call + + - the FSGSBASE instruction family + +Accessing FS/GS base with arch_prctl() +-------------------------------------- + + The arch_prctl(2) based mechanism is available on all 64-bit CPUs and all + kernel versions. + + Reading the base: + + arch_prctl(ARCH_GET_FS, &fsbase); + arch_prctl(ARCH_GET_GS, &gsbase); + + Writing the base: + + arch_prctl(ARCH_SET_FS, fsbase); + arch_prctl(ARCH_SET_GS, gsbase); + + The ARCH_SET_GS prctl may be disabled depending on kernel configuration + and security settings. + +Accessing FS/GS base with the FSGSBASE instructions +--------------------------------------------------- + + With the Ivy Bridge CPU generation Intel introduced a new set of + instructions to access the FS and GS base registers directly from user + space. These instructions are also supported on AMD Family 17H CPUs. The + following instructions are available: + + =============== =========================== + RDFSBASE %reg Read the FS base register + RDGSBASE %reg Read the GS base register + WRFSBASE %reg Write the FS base register + WRGSBASE %reg Write the GS base register + =============== =========================== + + The instructions avoid the overhead of the arch_prctl() syscall and allow + more flexible usage of the FS/GS addressing modes in user space + applications. This does not prevent conflicts between threading libraries + and runtimes which utilize FS and applications which want to use it for + their own purpose. + +FSGSBASE instructions enablement +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + The instructions are enumerated in CPUID leaf 7, bit 0 of EBX. If + available /proc/cpuinfo shows 'fsgsbase' in the flag entry of the CPUs. + + The availability of the instructions does not enable them + automatically. The kernel has to enable them explicitly in CR4. The + reason for this is that older kernels make assumptions about the values in + the GS register and enforce them when GS base is set via + arch_prctl(). Allowing user space to write arbitrary values to GS base + would violate these assumptions and cause malfunction. + + On kernels which do not enable FSGSBASE the execution of the FSGSBASE + instructions will fault with a #UD exception. + + The kernel provides reliable information about the enabled state in the + ELF AUX vector. If the HWCAP2_FSGSBASE bit is set in the AUX vector, the + kernel has FSGSBASE instructions enabled and applications can use them. + The following code example shows how this detection works:: + + #include + #include + + /* Will be eventually in asm/hwcap.h */ + #ifndef HWCAP2_FSGSBASE + #define HWCAP2_FSGSBASE (1 << 1) + #endif + + .... + + unsigned val = getauxval(AT_HWCAP2); + + if (val & HWCAP2_FSGSBASE) + printf("FSGSBASE enabled\n"); + +FSGSBASE instructions compiler support +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +GCC version 4.6.4 and newer provide instrinsics for the FSGSBASE +instructions. Clang 5 supports them as well. + + =================== =========================== + _readfsbase_u64() Read the FS base register + _readfsbase_u64() Read the GS base register + _writefsbase_u64() Write the FS base register + _writegsbase_u64() Write the GS base register + =================== =========================== + +To utilize these instrinsics must be included in the source +code and the compiler option -mfsgsbase has to be added. + +Compiler support for FS/GS based addressing +------------------------------------------- + +GCC version 6 and newer provide support for FS/GS based addressing via +Named Address Spaces. GCC implements the following address space +identifiers for x86: + + ========= ==================================== + __seg_fs Variable is addressed relative to FS + __seg_gs Variable is addressed relative to GS + ========= ==================================== + +The preprocessor symbols __SEG_FS and __SEG_GS are defined when these +address spaces are supported. Code which implements fallback modes should +check whether these symbols are defined. Usage example:: + + #ifdef __SEG_GS + + long data0 = 0; + long data1 = 1; + + long __seg_gs *ptr; + + /* Check whether FSGSBASE is enabled by the kernel (HWCAP2_FSGSBASE) */ + .... + + /* Set GS base to point to data0 */ + _writegsbase_u64(&data0); + + /* Access offset 0 of GS */ + ptr = 0; + printf("data0 = %ld\n", *ptr); + + /* Set GS base to point to data1 */ + _writegsbase_u64(&data1); + /* ptr still addresses offset 0! */ + printf("data1 = %ld\n", *ptr); + + +Clang does not provide the GCC address space identifiers, but it provides +address spaces via an attribute based mechanism in Clang 2.6 and newer +versions: + + ==================================== ===================================== + __attribute__((address_space(256)) Variable is addressed relative to GS + __attribute__((address_space(257)) Variable is addressed relative to FS + ==================================== ===================================== + +FS/GS based addressing with inline assembly +------------------------------------------- + +In case the compiler does not support address spaces, inline assembly can +be used for FS/GS based addressing mode:: + + mov %fs:offset, %reg + mov %gs:offset, %reg + + mov %reg, %fs:offset + mov %reg, %gs:offset diff --git a/Documentation/x86/x86_64/index.rst b/Documentation/x86/x86_64/index.rst index d6eaaa5a35fc..a56070fc8e77 100644 --- a/Documentation/x86/x86_64/index.rst +++ b/Documentation/x86/x86_64/index.rst @@ -14,3 +14,4 @@ x86_64 Support fake-numa-for-cpusets cpu-hotplug-spec machinecheck + fsgs From 291fd83569e10f3d305cd8adb62f6ec00f759dc6 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 28 May 2020 16:14:01 -0400 Subject: [PATCH 15/21] selftests/x86/fsgsbase: Test GS selector on ptracer-induced GS base write The test validates that the selector is not changed when a ptracer writes the ptracee's GS base. Originally-by: Andy Lutomirski Signed-off-by: Chang S. Bae Signed-off-by: Sasha Levin Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/20200528201402.1708239-16-sashal@kernel.org --- tools/testing/selftests/x86/fsgsbase.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c index 15a329da59fa..950a48b2e366 100644 --- a/tools/testing/selftests/x86/fsgsbase.c +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -465,7 +465,7 @@ static void test_ptrace_write_gsbase(void) wait(&status); if (WSTOPSIG(status) == SIGTRAP) { - unsigned long gs, base; + unsigned long gs; unsigned long gs_offset = USER_REGS_OFFSET(gs); unsigned long base_offset = USER_REGS_OFFSET(gs_base); @@ -481,7 +481,6 @@ static void test_ptrace_write_gsbase(void) err(1, "PTRACE_POKEUSER"); gs = ptrace(PTRACE_PEEKUSER, child, gs_offset, NULL); - base = ptrace(PTRACE_PEEKUSER, child, base_offset, NULL); /* * In a non-FSGSBASE system, the nonzero selector will load @@ -489,11 +488,21 @@ static void test_ptrace_write_gsbase(void) * selector value is changed or not by the GSBASE write in * a ptracer. */ - if (gs == 0 && base == 0xFF) { - printf("[OK]\tGS was reset as expected\n"); - } else { + if (gs != *shared_scratch) { nerrs++; - printf("[FAIL]\tGS=0x%lx, GSBASE=0x%lx (should be 0, 0xFF)\n", gs, base); + printf("[FAIL]\tGS changed to %lx\n", gs); + + /* + * On older kernels, poking a nonzero value into the + * base would zero the selector. On newer kernels, + * this behavior has changed -- poking the base + * changes only the base and, if FSGSBASE is not + * available, this may not effect. + */ + if (gs == 0) + printf("\tNote: this is expected behavior on older kernels.\n"); + } else { + printf("[OK]\tGS remained 0x%hx\n", *shared_scratch); } } From 5e7ec8578fa3dada50c50f5b234fa8d154b76349 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 28 May 2020 16:14:02 -0400 Subject: [PATCH 16/21] selftests/x86/fsgsbase: Test ptracer-induced GS base write with FSGSBASE This validates that GS selector and base are independently preserved in ptrace commands. Suggested-by: Andy Lutomirski Signed-off-by: Chang S. Bae Signed-off-by: Sasha Levin Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/20200528201402.1708239-17-sashal@kernel.org --- tools/testing/selftests/x86/fsgsbase.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c index 950a48b2e366..9a4349813a30 100644 --- a/tools/testing/selftests/x86/fsgsbase.c +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -465,7 +465,7 @@ static void test_ptrace_write_gsbase(void) wait(&status); if (WSTOPSIG(status) == SIGTRAP) { - unsigned long gs; + unsigned long gs, base; unsigned long gs_offset = USER_REGS_OFFSET(gs); unsigned long base_offset = USER_REGS_OFFSET(gs_base); @@ -481,6 +481,7 @@ static void test_ptrace_write_gsbase(void) err(1, "PTRACE_POKEUSER"); gs = ptrace(PTRACE_PEEKUSER, child, gs_offset, NULL); + base = ptrace(PTRACE_PEEKUSER, child, base_offset, NULL); /* * In a non-FSGSBASE system, the nonzero selector will load @@ -501,8 +502,14 @@ static void test_ptrace_write_gsbase(void) */ if (gs == 0) printf("\tNote: this is expected behavior on older kernels.\n"); + } else if (have_fsgsbase && (base != 0xFF)) { + nerrs++; + printf("[FAIL]\tGSBASE changed to %lx\n", base); } else { - printf("[OK]\tGS remained 0x%hx\n", *shared_scratch); + printf("[OK]\tGS remained 0x%hx", *shared_scratch); + if (have_fsgsbase) + printf(" and GSBASE changed to 0xFF"); + printf("\n"); } } From a5d25e01c8146ad8846da4760422e12242fceafe Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 27 May 2020 16:02:36 -0700 Subject: [PATCH 17/21] selftests/x86: Add a syscall_arg_fault_64 test for negative GSBASE If the kernel erroneously allows WRGSBASE and user code writes a negative value, paranoid_entry will get confused. Check for this by writing a negative value to GSBASE and doing SYSENTER with TF set. A successful run looks like: [RUN] SYSENTER with TF, invalid state, and GSBASE < 0 [SKIP] Illegal instruction A failed run causes a kernel hang, and I believe it's because we double-fault and then get a never ending series of page faults and, when we exhaust the double fault stack we double fault again, starting the process over. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/f4f71efc91b9eae5e3dae21c9aee1c70cf5f370e.1590620529.git.luto@kernel.org --- .../testing/selftests/x86/syscall_arg_fault.c | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tools/testing/selftests/x86/syscall_arg_fault.c b/tools/testing/selftests/x86/syscall_arg_fault.c index bc0ecc2e862e..62fba40866d5 100644 --- a/tools/testing/selftests/x86/syscall_arg_fault.c +++ b/tools/testing/selftests/x86/syscall_arg_fault.c @@ -72,6 +72,7 @@ static void sigsegv_or_sigbus(int sig, siginfo_t *info, void *ctx_void) if (ax != -EFAULT && ax != -ENOSYS) { printf("[FAIL]\tAX had the wrong value: 0x%lx\n", (unsigned long)ax); + printf("\tIP = 0x%lx\n", (unsigned long)ctx->uc_mcontext.gregs[REG_IP]); n_errs++; } else { printf("[OK]\tSeems okay\n"); @@ -226,5 +227,30 @@ int main() } set_eflags(get_eflags() & ~X86_EFLAGS_TF); +#ifdef __x86_64__ + printf("[RUN]\tSYSENTER with TF, invalid state, and GSBASE < 0\n"); + + if (sigsetjmp(jmpbuf, 1) == 0) { + sigtrap_consecutive_syscalls = 0; + + asm volatile ("wrgsbase %%rax\n\t" + :: "a" (0xffffffffffff0000UL)); + + set_eflags(get_eflags() | X86_EFLAGS_TF); + asm volatile ( + "movl $-1, %%eax\n\t" + "movl $-1, %%ebx\n\t" + "movl $-1, %%ecx\n\t" + "movl $-1, %%edx\n\t" + "movl $-1, %%esi\n\t" + "movl $-1, %%edi\n\t" + "movl $-1, %%ebp\n\t" + "movl $-1, %%esp\n\t" + "sysenter" + : : : "memory", "flags"); + } + set_eflags(get_eflags() & ~X86_EFLAGS_TF); +#endif + return 0; } From 979c2c4247cafd8a91628a7306b6871efbd12fdb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 26 Jun 2020 10:24:27 -0700 Subject: [PATCH 18/21] selftests/x86/fsgsbase: Fix a comment in the ptrace_write_gsbase test A comment was unclear. Fix it. Fixes: 5e7ec8578fa3 ("selftests/x86/fsgsbase: Test ptracer-induced GS base write with FSGSBASE") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/901034a91a40169ec84f1f699ea86704dff762e4.1593192140.git.luto@kernel.org --- tools/testing/selftests/x86/fsgsbase.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c index 9a4349813a30..f47495d2f070 100644 --- a/tools/testing/selftests/x86/fsgsbase.c +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -498,7 +498,8 @@ static void test_ptrace_write_gsbase(void) * base would zero the selector. On newer kernels, * this behavior has changed -- poking the base * changes only the base and, if FSGSBASE is not - * available, this may not effect. + * available, this may have no effect once the tracee + * is resumed. */ if (gs == 0) printf("\tNote: this is expected behavior on older kernels.\n"); From 8e259031c67a5ea0666428edb64c89e8c6ebd18e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 26 Jun 2020 10:24:28 -0700 Subject: [PATCH 19/21] selftests/x86/fsgsbase: Add a missing memory constraint The manual call to set_thread_area() via int $0x80 was missing any indication that the descriptor was a pointer, causing gcc to occasionally generate wrong code. Add the missing constraint. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/432968af67259ca92d68b774a731aff468eae610.1593192140.git.luto@kernel.org --- tools/testing/selftests/x86/fsgsbase.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c index f47495d2f070..998319553523 100644 --- a/tools/testing/selftests/x86/fsgsbase.c +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -285,7 +285,8 @@ static unsigned short load_gs(void) /* 32-bit set_thread_area */ long ret; asm volatile ("int $0x80" - : "=a" (ret) : "a" (243), "b" (low_desc) + : "=a" (ret), "+m" (*low_desc) + : "a" (243), "b" (low_desc) : "r8", "r9", "r10", "r11"); memcpy(&desc, low_desc, sizeof(desc)); munmap(low_desc, sizeof(desc)); From 40c45904f818c1f6555294ca27afc5fda4f09e68 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 26 Jun 2020 10:24:29 -0700 Subject: [PATCH 20/21] x86/ptrace: Fix 32-bit PTRACE_SETREGS vs fsbase and gsbase Debuggers expect that doing PTRACE_GETREGS, then poking at a tracee and maybe letting it run for a while, then doing PTRACE_SETREGS will put the tracee back where it was. In the specific case of a 32-bit tracer and tracee, the PTRACE_GETREGS/SETREGS data structure doesn't have fs_base or gs_base fields, so FSBASE and GSBASE fields are never stored anywhere. Everything used to still work because nonzero FS or GS would result full reloads of the segment registers when the tracee resumes, and the bases associated with FS==0 or GS==0 are irrelevant to 32-bit code. Adding FSGSBASE support broke this: when FSGSBASE is enabled, FSBASE and GSBASE are now restored independently of FS and GS for all tasks when context-switched in. This means that, if a 32-bit tracer restores a previous state using PTRACE_SETREGS but the tracee's pre-restore and post-restore bases don't match, then the tracee is resumed with the wrong base. Fix it by explicitly loading the base when a 32-bit tracer pokes FS or GS on a 64-bit kernel. Also add a test case. Fixes: 673903495c85 ("x86/process/64: Use FSBSBASE in switch_to() if available") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/229cc6a50ecbb701abd50fe4ddaf0eda888898cd.1593192140.git.luto@kernel.org --- arch/x86/include/asm/fsgsbase.h | 2 + arch/x86/kernel/process_64.c | 4 +- arch/x86/kernel/ptrace.c | 43 ++- tools/testing/selftests/x86/Makefile | 2 +- .../testing/selftests/x86/fsgsbase_restore.c | 245 ++++++++++++++++++ 5 files changed, 280 insertions(+), 16 deletions(-) create mode 100644 tools/testing/selftests/x86/fsgsbase_restore.c diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h index aefd53767a5d..d552646411a9 100644 --- a/arch/x86/include/asm/fsgsbase.h +++ b/arch/x86/include/asm/fsgsbase.h @@ -75,6 +75,8 @@ static inline void x86_fsbase_write_cpu(unsigned long fsbase) extern unsigned long x86_gsbase_read_cpu_inactive(void); extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase); +extern unsigned long x86_fsgsbase_read_task(struct task_struct *task, + unsigned short selector); #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index d618969cea66..cb8e37d3acaa 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -347,8 +347,8 @@ static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, } } -static unsigned long x86_fsgsbase_read_task(struct task_struct *task, - unsigned short selector) +unsigned long x86_fsgsbase_read_task(struct task_struct *task, + unsigned short selector) { unsigned short idx = selector >> 3; unsigned long base; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 1c7646c8d0fe..3f006489087f 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -281,17 +281,9 @@ static int set_segment_reg(struct task_struct *task, return -EIO; /* - * This function has some ABI oddities. - * - * A 32-bit ptracer probably expects that writing FS or GS will change - * FSBASE or GSBASE respectively. In the absence of FSGSBASE support, - * this code indeed has that effect. When FSGSBASE is added, this - * will require a special case. - * - * For existing 64-bit ptracers, writing FS or GS *also* currently - * changes the base if the selector is nonzero the next time the task - * is run. This behavior may not be needed, and trying to preserve it - * when FSGSBASE is added would be complicated at best. + * Writes to FS and GS will change the stored selector. Whether + * this changes the segment base as well depends on whether + * FSGSBASE is enabled. */ switch (offset) { @@ -867,14 +859,39 @@ long arch_ptrace(struct task_struct *child, long request, static int putreg32(struct task_struct *child, unsigned regno, u32 value) { struct pt_regs *regs = task_pt_regs(child); + int ret; switch (regno) { SEG32(cs); SEG32(ds); SEG32(es); - SEG32(fs); - SEG32(gs); + + /* + * A 32-bit ptracer on a 64-bit kernel expects that writing + * FS or GS will also update the base. This is needed for + * operations like PTRACE_SETREGS to fully restore a saved + * CPU state. + */ + + case offsetof(struct user32, regs.fs): + ret = set_segment_reg(child, + offsetof(struct user_regs_struct, fs), + value); + if (ret == 0) + child->thread.fsbase = + x86_fsgsbase_read_task(child, value); + return ret; + + case offsetof(struct user32, regs.gs): + ret = set_segment_reg(child, + offsetof(struct user_regs_struct, gs), + value); + if (ret == 0) + child->thread.gsbase = + x86_fsgsbase_read_task(child, value); + return ret; + SEG32(ss); R32(ebx, bx); diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 5f16821c7f63..3ff94575d02d 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -13,7 +13,7 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) trivial_program.c -no-pie) TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \ check_initial_reg_state sigreturn iopl ioperm \ test_vdso test_vsyscall mov_ss_trap \ - syscall_arg_fault + syscall_arg_fault fsgsbase_restore TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer diff --git a/tools/testing/selftests/x86/fsgsbase_restore.c b/tools/testing/selftests/x86/fsgsbase_restore.c new file mode 100644 index 000000000000..6fffadc51579 --- /dev/null +++ b/tools/testing/selftests/x86/fsgsbase_restore.c @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * fsgsbase_restore.c, test ptrace vs fsgsbase + * Copyright (c) 2020 Andy Lutomirski + * + * This test case simulates a tracer redirecting tracee execution to + * a function and then restoring tracee state using PTRACE_GETREGS and + * PTRACE_SETREGS. This is similar to what gdb does when doing + * 'p func()'. The catch is that this test has the called function + * modify a segment register. This makes sure that ptrace correctly + * restores segment state when using PTRACE_SETREGS. + * + * This is not part of fsgsbase.c, because that test is 64-bit only. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define EXPECTED_VALUE 0x1337f00d + +#ifdef __x86_64__ +# define SEG "%gs" +#else +# define SEG "%fs" +#endif + +static unsigned int dereference_seg_base(void) +{ + int ret; + asm volatile ("mov %" SEG ":(0), %0" : "=rm" (ret)); + return ret; +} + +static void init_seg(void) +{ + unsigned int *target = mmap( + NULL, sizeof(unsigned int), + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0); + if (target == MAP_FAILED) + err(1, "mmap"); + + *target = EXPECTED_VALUE; + + printf("\tsegment base address = 0x%lx\n", (unsigned long)target); + + struct user_desc desc = { + .entry_number = 0, + .base_addr = (unsigned int)(uintptr_t)target, + .limit = sizeof(unsigned int) - 1, + .seg_32bit = 1, + .contents = 0, /* Data, grow-up */ + .read_exec_only = 0, + .limit_in_pages = 0, + .seg_not_present = 0, + .useable = 0 + }; + if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) == 0) { + printf("\tusing LDT slot 0\n"); + asm volatile ("mov %0, %" SEG :: "rm" ((unsigned short)0x7)); + } else { + /* No modify_ldt for us (configured out, perhaps) */ + + struct user_desc *low_desc = mmap( + NULL, sizeof(desc), + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0); + memcpy(low_desc, &desc, sizeof(desc)); + + low_desc->entry_number = -1; + + /* 32-bit set_thread_area */ + long ret; + asm volatile ("int $0x80" + : "=a" (ret), "+m" (*low_desc) + : "a" (243), "b" (low_desc) +#ifdef __x86_64__ + : "r8", "r9", "r10", "r11" +#endif + ); + memcpy(&desc, low_desc, sizeof(desc)); + munmap(low_desc, sizeof(desc)); + + if (ret != 0) { + printf("[NOTE]\tcould not create a segment -- can't test anything\n"); + exit(0); + } + printf("\tusing GDT slot %d\n", desc.entry_number); + + unsigned short sel = (unsigned short)((desc.entry_number << 3) | 0x3); + asm volatile ("mov %0, %" SEG :: "rm" (sel)); + } +} + +static void tracee_zap_segment(void) +{ + /* + * The tracer will redirect execution here. This is meant to + * work like gdb's 'p func()' feature. The tricky bit is that + * we modify a segment register in order to make sure that ptrace + * can correctly restore segment registers. + */ + printf("\tTracee: in tracee_zap_segment()\n"); + + /* + * Write a nonzero selector with base zero to the segment register. + * Using a null selector would defeat the test on AMD pre-Zen2 + * CPUs, as such CPUs don't clear the base when loading a null + * selector. + */ + unsigned short sel; + asm volatile ("mov %%ss, %0\n\t" + "mov %0, %" SEG + : "=rm" (sel)); + + pid_t pid = getpid(), tid = syscall(SYS_gettid); + + printf("\tTracee is going back to sleep\n"); + syscall(SYS_tgkill, pid, tid, SIGSTOP); + + /* Should not get here. */ + while (true) { + printf("[FAIL]\tTracee hit unreachable code\n"); + pause(); + } +} + +int main() +{ + printf("\tSetting up a segment\n"); + init_seg(); + + unsigned int val = dereference_seg_base(); + if (val != EXPECTED_VALUE) { + printf("[FAIL]\tseg[0] == %x; should be %x\n", val, EXPECTED_VALUE); + return 1; + } + printf("[OK]\tThe segment points to the right place.\n"); + + pid_t chld = fork(); + if (chld < 0) + err(1, "fork"); + + if (chld == 0) { + prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0, 0); + + if (ptrace(PTRACE_TRACEME, 0, 0, 0) != 0) + err(1, "PTRACE_TRACEME"); + + pid_t pid = getpid(), tid = syscall(SYS_gettid); + + printf("\tTracee will take a nap until signaled\n"); + syscall(SYS_tgkill, pid, tid, SIGSTOP); + + printf("\tTracee was resumed. Will re-check segment.\n"); + + val = dereference_seg_base(); + if (val != EXPECTED_VALUE) { + printf("[FAIL]\tseg[0] == %x; should be %x\n", val, EXPECTED_VALUE); + exit(1); + } + + printf("[OK]\tThe segment points to the right place.\n"); + exit(0); + } + + int status; + + /* Wait for SIGSTOP. */ + if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status)) + err(1, "waitpid"); + + struct user_regs_struct regs; + + if (ptrace(PTRACE_GETREGS, chld, NULL, ®s) != 0) + err(1, "PTRACE_GETREGS"); + +#ifdef __x86_64__ + printf("\tChild GS=0x%lx, GSBASE=0x%lx\n", (unsigned long)regs.gs, (unsigned long)regs.gs_base); +#else + printf("\tChild FS=0x%lx\n", (unsigned long)regs.xfs); +#endif + + struct user_regs_struct regs2 = regs; +#ifdef __x86_64__ + regs2.rip = (unsigned long)tracee_zap_segment; + regs2.rsp -= 128; /* Don't clobber the redzone. */ +#else + regs2.eip = (unsigned long)tracee_zap_segment; +#endif + + printf("\tTracer: redirecting tracee to tracee_zap_segment()\n"); + if (ptrace(PTRACE_SETREGS, chld, NULL, ®s2) != 0) + err(1, "PTRACE_GETREGS"); + if (ptrace(PTRACE_CONT, chld, NULL, NULL) != 0) + err(1, "PTRACE_GETREGS"); + + /* Wait for SIGSTOP. */ + if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status)) + err(1, "waitpid"); + + printf("\tTracer: restoring tracee state\n"); + if (ptrace(PTRACE_SETREGS, chld, NULL, ®s) != 0) + err(1, "PTRACE_GETREGS"); + if (ptrace(PTRACE_DETACH, chld, NULL, NULL) != 0) + err(1, "PTRACE_GETREGS"); + + /* Wait for SIGSTOP. */ + if (waitpid(chld, &status, 0) != chld) + err(1, "waitpid"); + + if (WIFSIGNALED(status)) { + printf("[FAIL]\tTracee crashed\n"); + return 1; + } + + if (!WIFEXITED(status)) { + printf("[FAIL]\tTracee stopped for an unexpected reason: %d\n", status); + return 1; + } + + int exitcode = WEXITSTATUS(status); + if (exitcode != 0) { + printf("[FAIL]\tTracee reported failure\n"); + return 1; + } + + printf("[OK]\tAll is well.\n"); + return 0; +} From d029bff60aa6c7eab281d52602b6a7a971615324 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 26 Jun 2020 10:24:30 -0700 Subject: [PATCH 21/21] x86/fsgsbase: Fix Xen PV support On Xen PV, SWAPGS doesn't work. Teach __rdfsbase_inactive() and __wrgsbase_inactive() to use rdmsrl()/wrmsrl() on Xen PV. The Xen pvop code will understand this and issue the correct hypercalls. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/f07c08f178fe9711915862b656722a207cd52c28.1593192140.git.luto@kernel.org --- arch/x86/kernel/process_64.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index cb8e37d3acaa..e14476f0c533 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -163,9 +163,15 @@ static noinstr unsigned long __rdgsbase_inactive(void) lockdep_assert_irqs_disabled(); - native_swapgs(); - gsbase = rdgsbase(); - native_swapgs(); + if (!static_cpu_has(X86_FEATURE_XENPV)) { + native_swapgs(); + gsbase = rdgsbase(); + native_swapgs(); + } else { + instrumentation_begin(); + rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + instrumentation_end(); + } return gsbase; } @@ -182,9 +188,15 @@ static noinstr void __wrgsbase_inactive(unsigned long gsbase) { lockdep_assert_irqs_disabled(); - native_swapgs(); - wrgsbase(gsbase); - native_swapgs(); + if (!static_cpu_has(X86_FEATURE_XENPV)) { + native_swapgs(); + wrgsbase(gsbase); + native_swapgs(); + } else { + instrumentation_begin(); + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + instrumentation_end(); + } } /*