Merge branch 'x86/cpu' into perf/core, to pick up revert

perf/core has an earlier version of the x86/cpu tree merged, to avoid
conflicts, and due to this we want to pick up this ABI-impacting revert
as well:

  049331f277fe: ("x86/fsgsbase: Revert FSGSBASE support")

Signed-off-by: Ingo Molnar <mingo@kernel.org>
commit f584dd32ed
Documentation/admin-guide/kernel-parameters.txt:

@@ -2857,8 +2857,6 @@
 	no5lvl		[X86-64] Disable 5-level paging mode. Forces
 			kernel to use 4-level paging instead.
 
-	nofsgsbase	[X86] Disables FSGSBASE instructions.
-
 	no_console_suspend
 			[HW] Never suspend the console
 			Disable suspending of consoles during suspend and
Documentation/x86/entry_64.rst:

@@ -108,12 +108,3 @@ We try to only use IST entries and the paranoid entry code for vectors
 that absolutely need the more expensive check for the GS base - and we
 generate all 'normal' entry points with the regular (faster) paranoid=0
 variant.
-
-On a FSGSBASE system, however, user space can set GS without kernel
-interaction. It means the value of GS base itself does not imply anything,
-whether a kernel value or a user space value. So, there is no longer a safe
-way to check whether the exception is entering from user mode or kernel
-mode in the paranoid entry code path. So the GSBASE value needs to be read
-out, saved and the kernel GSBASE value written. On exit the saved GSBASE
-value needs to be restored unconditionally. The non paranoid entry/exit
-code still uses SWAPGS unconditionally as the state is known.
Documentation/x86/x86_64/fsgs.rst (entire file removed by the revert; its content follows):

@@ -1,199 +0,0 @@

.. SPDX-License-Identifier: GPL-2.0

Using FS and GS segments in user space applications
===================================================

The x86 architecture supports segmentation. Instructions which access
memory can use segment register based addressing mode. The following
notation is used to address a byte within a segment:

  Segment-register:Byte-address

The segment base address is added to the Byte-address to compute the
resulting virtual address which is accessed. This allows accessing multiple
instances of data with the identical Byte-address, i.e. the same code. The
selection of a particular instance is purely based on the base-address in
the segment register.

In 32-bit mode the CPU provides 6 segments, which also support segment
limits. The limits can be used to enforce address space protections.

In 64-bit mode the CS/SS/DS/ES segments are ignored and the base address is
always 0 to provide a full 64-bit address space. The FS and GS segments are
still functional in 64-bit mode.

Common FS and GS usage
----------------------

The FS segment is commonly used to address Thread Local Storage (TLS). FS
is usually managed by runtime code or a threading library. Variables
declared with the '__thread' storage class specifier are instantiated per
thread and the compiler emits the FS: address prefix for accesses to these
variables. Each thread has its own FS base address so common code can be
used without complex address offset calculations to access the per thread
instances. Applications should not use FS for other purposes when they use
runtimes or threading libraries which manage the per thread FS.

The GS segment has no common use and can be used freely by
applications. GCC and Clang support GS based addressing via address space
identifiers.

Reading and writing the FS/GS base address
------------------------------------------

There exist two mechanisms to read and write the FS/GS base address:

 - the arch_prctl() system call

 - the FSGSBASE instruction family

Accessing FS/GS base with arch_prctl()
--------------------------------------

The arch_prctl(2) based mechanism is available on all 64-bit CPUs and all
kernel versions.

Reading the base::

  arch_prctl(ARCH_GET_FS, &fsbase);
  arch_prctl(ARCH_GET_GS, &gsbase);

Writing the base::

  arch_prctl(ARCH_SET_FS, fsbase);
  arch_prctl(ARCH_SET_GS, gsbase);

The ARCH_SET_GS prctl may be disabled depending on kernel configuration
and security settings.
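As an illustration of the arch_prctl() mechanism described above (an
editorial sketch, not part of the diff), a minimal self-contained program;
it assumes an x86-64 Linux system and uses the raw syscall so it does not
depend on a libc wrapper::

  /* Read the FS/GS base and move the GS base via arch_prctl(2).
   * ARCH_SET_GS may be refused depending on kernel config/security. */
  #include <stdio.h>
  #include <stdlib.h>
  #include <asm/prctl.h>      /* ARCH_GET_FS, ARCH_GET_GS, ARCH_SET_GS */
  #include <sys/syscall.h>
  #include <unistd.h>

  int main(void)
  {
          unsigned long fsbase, gsbase;

          if (syscall(SYS_arch_prctl, ARCH_GET_FS, &fsbase))
                  perror("ARCH_GET_FS");
          if (syscall(SYS_arch_prctl, ARCH_GET_GS, &gsbase))
                  perror("ARCH_GET_GS");
          printf("FS base: %#lx, GS base: %#lx\n", fsbase, gsbase);

          /* Point GS at a heap buffer; the kernel performs the write. */
          void *buf = malloc(128);
          if (syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)buf))
                  perror("ARCH_SET_GS (may be disabled)");
          return 0;
  }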
Accessing FS/GS base with the FSGSBASE instructions
---------------------------------------------------

With the Ivy Bridge CPU generation Intel introduced a new set of
instructions to access the FS and GS base registers directly from user
space. These instructions are also supported on AMD Family 17H CPUs. The
following instructions are available:

  =============== ===========================
  RDFSBASE %reg   Read the FS base register
  RDGSBASE %reg   Read the GS base register
  WRFSBASE %reg   Write the FS base register
  WRGSBASE %reg   Write the GS base register
  =============== ===========================

The instructions avoid the overhead of the arch_prctl() syscall and allow
more flexible usage of the FS/GS addressing modes in user space
applications. This does not prevent conflicts between threading libraries
and runtimes which utilize FS and applications which want to use it for
their own purpose.

FSGSBASE instructions enablement
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The instructions are enumerated in CPUID leaf 7, bit 0 of EBX. If
available, /proc/cpuinfo shows 'fsgsbase' in the flag entry of the CPUs.

The availability of the instructions does not enable them
automatically. The kernel has to enable them explicitly in CR4. The
reason for this is that older kernels make assumptions about the values in
the GS register and enforce them when GS base is set via
arch_prctl(). Allowing user space to write arbitrary values to GS base
would violate these assumptions and cause malfunction.

On kernels which do not enable FSGSBASE the execution of the FSGSBASE
instructions will fault with a #UD exception.

The kernel provides reliable information about the enabled state in the
ELF AUX vector. If the HWCAP2_FSGSBASE bit is set in the AUX vector, the
kernel has FSGSBASE instructions enabled and applications can use them.
The following code example shows how this detection works::

  #include <sys/auxv.h>
  #include <elf.h>

  /* Will be eventually in asm/hwcap.h */
  #ifndef HWCAP2_FSGSBASE
  #define HWCAP2_FSGSBASE (1 << 1)
  #endif

  ....

  unsigned val = getauxval(AT_HWCAP2);

  if (val & HWCAP2_FSGSBASE)
          printf("FSGSBASE enabled\n");

FSGSBASE instructions compiler support
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

GCC version 4.6.4 and newer provide intrinsics for the FSGSBASE
instructions. Clang supports them as well.

  =================== ===========================
  _readfsbase_u64()   Read the FS base register
  _readgsbase_u64()   Read the GS base register
  _writefsbase_u64()  Write the FS base register
  _writegsbase_u64()  Write the GS base register
  =================== ===========================

To utilize these intrinsics <immintrin.h> must be included in the source
code and the compiler option -mfsgsbase has to be added.
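Putting the two pieces above together (an illustrative sketch, not part of
the diff): HWCAP2 detection plus the intrinsics, with an assumed
arch_prctl() fallback for CPUs or kernels without FSGSBASE. Build with
``gcc -O2 -mfsgsbase``::

  #include <stdio.h>
  #include <sys/auxv.h>
  #include <immintrin.h>      /* _readfsbase_u64() */
  #include <asm/prctl.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  #ifndef HWCAP2_FSGSBASE
  #define HWCAP2_FSGSBASE (1 << 1)
  #endif

  int main(void)
  {
          unsigned long fsbase;

          if (getauxval(AT_HWCAP2) & HWCAP2_FSGSBASE) {
                  /* Direct read, no syscall overhead. */
                  fsbase = _readfsbase_u64();
                  printf("RDFSBASE:   %#lx\n", fsbase);
          } else {
                  /* Fallback: ask the kernel. */
                  syscall(SYS_arch_prctl, ARCH_GET_FS, &fsbase);
                  printf("arch_prctl: %#lx\n", fsbase);
          }
          return 0;
  }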
Compiler support for FS/GS based addressing
-------------------------------------------

GCC version 6 and newer provide support for FS/GS based addressing via
Named Address Spaces. GCC implements the following address space
identifiers for x86:

  ========= ====================================
  __seg_fs  Variable is addressed relative to FS
  __seg_gs  Variable is addressed relative to GS
  ========= ====================================

The preprocessor symbols __SEG_FS and __SEG_GS are defined when these
address spaces are supported. Code which implements fallback modes should
check whether these symbols are defined. Usage example::

  #ifdef __SEG_GS

  long data0 = 0;
  long data1 = 1;

  long __seg_gs *ptr;

  /* Check whether FSGSBASE is enabled by the kernel (HWCAP2_FSGSBASE) */
  ....

  /* Set GS to point to data0 */
  _writegsbase_u64(&data0);

  /* Access offset 0 of GS */
  ptr = 0;
  printf("data0 = %ld\n", *ptr);

  /* Set GS to point to data1 */
  _writegsbase_u64(&data1);
  /* ptr still addresses offset 0! */
  printf("data1 = %ld\n", *ptr);

Clang does not provide the GCC address space identifiers, but it provides
address spaces via an attribute based mechanism in Clang 5 and newer
versions:

  ===================================== =====================================
  __attribute__((address_space(256)))   Variable is addressed relative to GS
  __attribute__((address_space(257)))   Variable is addressed relative to FS
  ===================================== =====================================

FS/GS based addressing with inline assembly
-------------------------------------------

In case the compiler does not support address spaces, inline assembly can
be used for FS/GS based addressing mode::

	mov %fs:offset, %reg
	mov %gs:offset, %reg

	mov %reg, %fs:offset
	mov %reg, %gs:offset
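To make the inline assembly variant above concrete (an illustrative
sketch, not part of the diff; the helper names are made up), GS-relative
accesses can be wrapped like this, assuming GSBASE already points at a
valid object (set e.g. via _writegsbase_u64() or ARCH_SET_GS)::

  #include <stdint.h>

  /* Load a 64-bit value from GS:offset. */
  static inline uint64_t gs_read_u64(unsigned long offset)
  {
          uint64_t val;

          asm volatile ("movq %%gs:(%1), %0" : "=r" (val) : "r" (offset));
          return val;
  }

  /* Store a 64-bit value to GS:offset. */
  static inline void gs_write_u64(unsigned long offset, uint64_t val)
  {
          asm volatile ("movq %0, %%gs:(%1)"
                        : : "r" (val), "r" (offset) : "memory");
  }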
Documentation/x86/x86_64/index.rst:

@@ -14,4 +14,3 @@ x86_64 Support
    fake-numa-for-cpusets
    cpu-hotplug-spec
    machinecheck
-   fsgs
arch/x86/entry/calling.h:

@@ -6,7 +6,6 @@
 #include <asm/percpu.h>
 #include <asm/asm-offsets.h>
 #include <asm/processor-flags.h>
-#include <asm/inst.h>
 
 /*
@@ -338,12 +337,6 @@ For 32-bit we have the following conventions - kernel is built with
 #endif
 .endm
 
-.macro SAVE_AND_SET_GSBASE scratch_reg:req save_reg:req
-	rdgsbase \save_reg
-	GET_PERCPU_BASE \scratch_reg
-	wrgsbase \scratch_reg
-.endm
-
 #endif /* CONFIG_X86_64 */
 
 .macro STACKLEAK_ERASE
@@ -352,39 +345,6 @@ For 32-bit we have the following conventions - kernel is built with
 #endif
 .endm
 
-#ifdef CONFIG_SMP
-
-/*
- * CPU/node NR is loaded from the limit (size) field of a special segment
- * descriptor entry in GDT.
- */
-.macro LOAD_CPU_AND_NODE_SEG_LIMIT reg:req
-	movq	$__CPUNODE_SEG, \reg
-	lsl	\reg, \reg
-.endm
-
-/*
- * Fetch the per-CPU GSBASE value for this processor and put it in @reg.
- * We normally use %gs for accessing per-CPU data, but we are setting up
- * %gs here and obviously can not use %gs itself to access per-CPU data.
- */
-.macro GET_PERCPU_BASE reg:req
-	ALTERNATIVE \
-		"LOAD_CPU_AND_NODE_SEG_LIMIT \reg", \
-		"RDPID	\reg", \
-		X86_FEATURE_RDPID
-	andq	$VDSO_CPUNODE_MASK, \reg
-	movq	__per_cpu_offset(, \reg, 8), \reg
-.endm
-
-#else
-
-.macro GET_PERCPU_BASE reg:req
-	movq	pcpu_unit_offsets(%rip), \reg
-.endm
-
-#endif /* CONFIG_SMP */
-
 /*
  * This does 'call enter_from_user_mode' unless we can avoid it based on
  * kernel config or using the static jump infrastructure.
arch/x86/entry/entry_64.S:

@@ -38,7 +38,6 @@
 #include <asm/export.h>
 #include <asm/frame.h>
 #include <asm/nospec-branch.h>
-#include <asm/fsgsbase.h>
 #include <linux/err.h>
 
 #include "calling.h"
@@ -948,6 +947,7 @@ ENTRY(\sym)
 	addq	$\ist_offset, CPU_TSS_IST(\shift_ist)
 	.endif
 
+	/* these procedures expect "no swapgs" flag in ebx */
 	.if \paranoid
 	jmp	paranoid_exit
 	.else
@@ -1164,21 +1164,24 @@ idtentry machine_check do_mce has_error_code=0 paranoid=1
 #endif
 
 /*
- * Save all registers in pt_regs. Return GSBASE related information
- * in EBX depending on the availability of the FSGSBASE instructions:
- *
- * FSGSBASE	R/EBX
- *     N        0 -> SWAPGS on exit
- *              1 -> no SWAPGS on exit
- *
- *     Y        GSBASE value at entry, must be restored in paranoid_exit
+ * Save all registers in pt_regs, and switch gs if needed.
+ * Use slow, but surefire "are we in kernel?" check.
+ * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
  */
 ENTRY(paranoid_entry)
 	UNWIND_HINT_FUNC
 	cld
 	PUSH_AND_CLEAR_REGS save_ret=1
 	ENCODE_FRAME_POINTER 8
+	movl	$1, %ebx
+	movl	$MSR_GS_BASE, %ecx
+	rdmsr
+	testl	%edx, %edx
+	js	1f				/* negative -> in kernel */
+	SWAPGS
+	xorl	%ebx, %ebx
+
+1:
 	/*
 	 * Always stash CR3 in %r14. This value will be restored,
 	 * verbatim, at exit. Needed if paranoid_entry interrupted
@@ -1188,49 +1191,9 @@ ENTRY(paranoid_entry)
 	 * This is also why CS (stashed in the "iret frame" by the
 	 * hardware at entry) can not be used: this may be a return
 	 * to kernel code, but with a user CR3 value.
-	 *
-	 * Switching CR3 does not depend on kernel GSBASE so it can
-	 * be done before switching to the kernel GSBASE. This is
-	 * required for FSGSBASE because the kernel GSBASE has to
-	 * be retrieved from a kernel internal table.
 	 */
 	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
-
-	/*
-	 * Handling GSBASE depends on the availability of FSGSBASE.
-	 *
-	 * Without FSGSBASE the kernel enforces that negative GSBASE
-	 * values indicate kernel GSBASE. With FSGSBASE no assumptions
-	 * can be made about the GSBASE value when entering from user
-	 * space.
-	 */
-	ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE
-
-	/*
-	 * Read the current GSBASE and store it in %rbx unconditionally,
-	 * retrieve and set the current CPUs kernel GSBASE. The stored value
-	 * has to be restored in paranoid_exit unconditionally.
-	 */
-	SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
-	ret
-
-.Lparanoid_entry_checkgs:
-	/* EBX = 1 -> kernel GSBASE active, no restore required */
-	movl	$1, %ebx
-	/*
-	 * The kernel-enforced convention is a negative GSBASE indicates
-	 * a kernel value. No SWAPGS needed on entry and exit.
-	 */
-	movl	$MSR_GS_BASE, %ecx
-	rdmsr
-	testl	%edx, %edx
-	jns	.Lparanoid_entry_swapgs
-	ret
-
-.Lparanoid_entry_swapgs:
-	SWAPGS
-	/* EBX = 0 -> SWAPGS required on exit */
-	xorl	%ebx, %ebx
 	ret
 END(paranoid_entry)
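Both the restored check above and the deleted .Lparanoid_entry_checkgs path
rely on the same convention: a negative GSBASE value (an address in the
upper canonical half, top bit set) identifies a kernel GSBASE. As an
illustrative C sketch of what the `testl %edx, %edx` sign test does (an
editorial aside, not part of the diff)::

  #include <stdint.h>

  /* %edx holds the high 32 bits of MSR_GS_BASE after rdmsr;
   * the "js" branch being taken corresponds to this returning true. */
  static int gsbase_is_kernel(uint64_t gsbase)
  {
          return (int32_t)(gsbase >> 32) < 0;
  }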
@@ -1241,47 +1204,28 @@ END(paranoid_entry)
  *
  * We may be returning to very strange contexts (e.g. very early
  * in syscall entry), so checking for preemption here would
- * be complicated. Fortunately, there's no good reason to try
- * to handle preemption here.
+ * be complicated. Fortunately, we there's no good reason
+ * to try to handle preemption here.
  *
- * R/EBX contains the GSBASE related information depending on the
- * availability of the FSGSBASE instructions:
- *
- * FSGSBASE	R/EBX
- *     N        0 -> SWAPGS on exit
- *              1 -> no SWAPGS on exit
- *
- *     Y        User space GSBASE, must be restored unconditionally
+ * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
  */
 ENTRY(paranoid_exit)
 	UNWIND_HINT_REGS
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF_DEBUG
-	/* Handle GS depending on FSGSBASE availability */
-	ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "nop",X86_FEATURE_FSGSBASE
-
-	/* With FSGSBASE enabled, unconditionally restore GSBASE */
-	wrgsbase	%rbx
-	jmp	.Lparanoid_exit_no_swapgs;
-
-.Lparanoid_exit_checkgs:
-	/* On non-FSGSBASE systems, conditionally do SWAPGS */
-	testl	%ebx, %ebx
+	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	.Lparanoid_exit_no_swapgs
 	TRACE_IRQS_IRETQ
 	/* Always restore stashed CR3 value (see paranoid_entry) */
 	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
 	SWAPGS_UNSAFE_STACK
 	jmp	.Lparanoid_exit_restore
 
 .Lparanoid_exit_no_swapgs:
 	TRACE_IRQS_IRETQ_DEBUG
 	/* Always restore stashed CR3 value (see paranoid_entry) */
 	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
 
 .Lparanoid_exit_restore:
 	jmp restore_regs_and_return_to_kernel
 END(paranoid_exit)
 
 /*
@@ -1692,27 +1636,10 @@ end_repeat_nmi:
 	/* Always restore stashed CR3 value (see paranoid_entry) */
 	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
 
-	/*
-	 * The above invocation of paranoid_entry stored the GSBASE
-	 * related information in R/EBX depending on the availability
-	 * of FSGSBASE.
-	 *
-	 * If FSGSBASE is enabled, restore the saved GSBASE value
-	 * unconditionally, otherwise take the conditional SWAPGS path.
-	 */
-	ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE
-
-	wrgsbase	%rbx
-	jmp	nmi_restore
-
-nmi_no_fsgsbase:
-	/* EBX == 0 -> invoke SWAPGS */
-	testl	%ebx, %ebx
+	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	nmi_restore
 
 nmi_swapgs:
 	SWAPGS_UNSAFE_STACK
 
 nmi_restore:
 	POP_REGS
@@ -1743,11 +1670,17 @@ nmi_restore:
 	iretq
 END(nmi)
 
+#ifndef CONFIG_IA32_EMULATION
+/*
+ * This handles SYSCALL from 32-bit code. There is no way to program
+ * MSRs to fully disable 32-bit SYSCALL.
+ */
 ENTRY(ignore_sysret)
 	UNWIND_HINT_EMPTY
 	mov	$-ENOSYS, %eax
 	sysret
 END(ignore_sysret)
+#endif
 
 ENTRY(rewind_stack_do_exit)
 	UNWIND_HINT_FUNC
arch/x86/include/asm/fsgsbase.h:

@@ -19,62 +19,35 @@ extern unsigned long x86_gsbase_read_task(struct task_struct *task);
 extern void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase);
 extern void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase);
 
-/* Must be protected by X86_FEATURE_FSGSBASE check. */
-
-static __always_inline unsigned long rdfsbase(void)
-{
-	unsigned long fsbase;
-
-	asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory");
-
-	return fsbase;
-}
-
-static __always_inline unsigned long rdgsbase(void)
-{
-	unsigned long gsbase;
-
-	asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory");
-
-	return gsbase;
-}
-
-static __always_inline void wrfsbase(unsigned long fsbase)
-{
-	asm volatile("wrfsbase %0" :: "r" (fsbase) : "memory");
-}
-
-static __always_inline void wrgsbase(unsigned long gsbase)
-{
-	asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory");
-}
-
-#include <asm/cpufeature.h>
-
 /* Helper functions for reading/writing FS/GS base */
 
 static inline unsigned long x86_fsbase_read_cpu(void)
 {
 	unsigned long fsbase;
 
-	if (static_cpu_has(X86_FEATURE_FSGSBASE))
-		fsbase = rdfsbase();
-	else
-		rdmsrl(MSR_FS_BASE, fsbase);
+	rdmsrl(MSR_FS_BASE, fsbase);
 
 	return fsbase;
 }
 
-static inline void x86_fsbase_write_cpu(unsigned long fsbase)
+static inline unsigned long x86_gsbase_read_cpu_inactive(void)
 {
-	if (static_cpu_has(X86_FEATURE_FSGSBASE))
-		wrfsbase(fsbase);
-	else
-		wrmsrl(MSR_FS_BASE, fsbase);
+	unsigned long gsbase;
+
+	rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
+
+	return gsbase;
+}
+
+static inline void x86_fsbase_write_cpu(unsigned long fsbase)
+{
+	wrmsrl(MSR_FS_BASE, fsbase);
+}
+
+static inline void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
+{
+	wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
 }
-
-extern unsigned long x86_gsbase_read_cpu_inactive(void);
-extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase);
 
 #endif /* CONFIG_X86_64 */
arch/x86/include/asm/inst.h:

@@ -306,21 +306,6 @@
 	.endif
 	MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2
 	.endm
-
-	.macro RDPID opd
-	REG_TYPE rdpid_opd_type \opd
-	.if rdpid_opd_type == REG_TYPE_R64
-	R64_NUM rdpid_opd \opd
-	.else
-	R32_NUM rdpid_opd \opd
-	.endif
-	.byte 0xf3
-	.if rdpid_opd > 7
-	PFX_REX rdpid_opd 0
-	.endif
-	.byte 0x0f, 0xc7
-	MODRM 0xc0 rdpid_opd 0x7
-	.endm
 #endif
 
 #endif
arch/x86/include/uapi/asm/hwcap2.h:

@@ -5,7 +5,4 @@
 /* MONITOR/MWAIT enabled in Ring 3 */
 #define HWCAP2_RING3MWAIT	(1 << 0)
 
-/* Kernel allows FSGSBASE instructions available in Ring 3 */
-#define HWCAP2_FSGSBASE	BIT(1)
-
 #endif
arch/x86/kernel/cpu/common.c:

@@ -366,22 +366,6 @@ out:
 	cr4_clear_bits(X86_CR4_UMIP);
 }
 
-static __init int x86_nofsgsbase_setup(char *arg)
-{
-	/* Require an exact match without trailing characters. */
-	if (strlen(arg))
-		return 0;
-
-	/* Do not emit a message if the feature is not present. */
-	if (!boot_cpu_has(X86_FEATURE_FSGSBASE))
-		return 1;
-
-	setup_clear_cpu_cap(X86_FEATURE_FSGSBASE);
-	pr_info("FSGSBASE disabled via kernel command line\n");
-	return 1;
-}
-__setup("nofsgsbase", x86_nofsgsbase_setup);
-
 /*
  * Protection Keys are not available in 32-bit mode.
  */
@@ -1387,12 +1371,6 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 	setup_smap(c);
 	setup_umip(c);
 
-	/* Enable FSGSBASE instructions if available. */
-	if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
-		cr4_set_bits(X86_CR4_FSGSBASE);
-		elf_hwcap2 |= HWCAP2_FSGSBASE;
-	}
-
 	/*
 	 * The vendor-specific functions might have changed features.
 	 * Now we do "generic changes."
arch/x86/kernel/cpu/intel.c:

@@ -66,6 +66,32 @@ void check_mpx_erratum(struct cpuinfo_x86 *c)
 	}
 }
 
+/*
+ * Processors which have self-snooping capability can handle conflicting
+ * memory type across CPUs by snooping its own cache. However, there exists
+ * CPU models in which having conflicting memory types still leads to
+ * unpredictable behavior, machine check errors, or hangs. Clear this
+ * feature to prevent its use on machines with known errata.
+ */
+static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c)
+{
+	switch (c->x86_model) {
+	case INTEL_FAM6_CORE_YONAH:
+	case INTEL_FAM6_CORE2_MEROM:
+	case INTEL_FAM6_CORE2_MEROM_L:
+	case INTEL_FAM6_CORE2_PENRYN:
+	case INTEL_FAM6_CORE2_DUNNINGTON:
+	case INTEL_FAM6_NEHALEM:
+	case INTEL_FAM6_NEHALEM_G:
+	case INTEL_FAM6_NEHALEM_EP:
+	case INTEL_FAM6_NEHALEM_EX:
+	case INTEL_FAM6_WESTMERE:
+	case INTEL_FAM6_WESTMERE_EP:
+	case INTEL_FAM6_SANDYBRIDGE:
+		setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP);
+	}
+}
+
 static bool ring3mwait_disabled __read_mostly;
 
 static int __init ring3mwait_disable(char *__unused)
@@ -304,6 +330,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 	}
 
 	check_mpx_erratum(c);
+	check_memory_type_self_snoop_errata(c);
 
 	/*
 	 * Get the number of SMT siblings early from the extended topology
arch/x86/kernel/cpu/mtrr/generic.c:

@@ -743,7 +743,15 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
 	/* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
 	cr0 = read_cr0() | X86_CR0_CD;
 	write_cr0(cr0);
-	wbinvd();
+
+	/*
+	 * Cache flushing is the most time-consuming step when programming
+	 * the MTRRs. Fortunately, as per the Intel Software Development
+	 * Manual, we can skip it if the processor supports cache self-
+	 * snooping.
+	 */
+	if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+		wbinvd();
 
 	/* Save value of CR4 and clear Page Global Enable (bit 7) */
 	if (boot_cpu_has(X86_FEATURE_PGE)) {
@@ -760,7 +768,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
 
 	/* Disable MTRRs, and set the default type to uncached */
 	mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
-	wbinvd();
+
+	/* Again, only flush caches if we have to. */
+	if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+		wbinvd();
 }
 
 static void post_set(void) __releases(set_atomicity_lock)
arch/x86/kernel/process_64.c:

@@ -161,40 +161,6 @@ enum which_selector {
 	GS
 };
 
-/*
- * Out of line to be protected from kprobes. It is not used on Xen
- * paravirt. When paravirt support is needed, it needs to be renamed
- * with native_ prefix.
- */
-static noinline unsigned long __rdgsbase_inactive(void)
-{
-	unsigned long gsbase;
-
-	lockdep_assert_irqs_disabled();
-
-	native_swapgs();
-	gsbase = rdgsbase();
-	native_swapgs();
-
-	return gsbase;
-}
-NOKPROBE_SYMBOL(__rdgsbase_inactive);
-
-/*
- * Out of line to be protected from kprobes. It is not used on Xen
- * paravirt. When paravirt support is needed, it needs to be renamed
- * with native_ prefix.
- */
-static noinline void __wrgsbase_inactive(unsigned long gsbase)
-{
-	lockdep_assert_irqs_disabled();
-
-	native_swapgs();
-	wrgsbase(gsbase);
-	native_swapgs();
-}
-NOKPROBE_SYMBOL(__wrgsbase_inactive);
-
 /*
  * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
  * not available. The goal is to be reasonably fast on non-FSGSBASE systems.
@@ -244,22 +210,8 @@ static __always_inline void save_fsgs(struct task_struct *task)
 {
 	savesegment(fs, task->thread.fsindex);
 	savesegment(gs, task->thread.gsindex);
-	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
-		unsigned long flags;
-
-		/*
-		 * If FSGSBASE is enabled, we can't make any useful guesses
-		 * about the base, and user code expects us to save the current
-		 * value. Fortunately, reading the base directly is efficient.
-		 */
-		task->thread.fsbase = rdfsbase();
-		local_irq_save(flags);
-		task->thread.gsbase = __rdgsbase_inactive();
-		local_irq_restore(flags);
-	} else {
-		save_base_legacy(task, task->thread.fsindex, FS);
-		save_base_legacy(task, task->thread.gsindex, GS);
-	}
+	save_base_legacy(task, task->thread.fsindex, FS);
+	save_base_legacy(task, task->thread.gsindex, GS);
 }
 
 #if IS_ENABLED(CONFIG_KVM)
@@ -338,22 +290,10 @@ static __always_inline void load_seg_legacy(unsigned short prev_index,
 static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
 					      struct thread_struct *next)
 {
-	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
-		/* Update the FS and GS selectors if they could have changed. */
-		if (unlikely(prev->fsindex || next->fsindex))
-			loadseg(FS, next->fsindex);
-		if (unlikely(prev->gsindex || next->gsindex))
-			loadseg(GS, next->gsindex);
-
-		/* Update the bases. */
-		wrfsbase(next->fsbase);
-		__wrgsbase_inactive(next->gsbase);
-	} else {
-		load_seg_legacy(prev->fsindex, prev->fsbase,
-				next->fsindex, next->fsbase, FS);
-		load_seg_legacy(prev->gsindex, prev->gsbase,
-				next->gsindex, next->gsbase, GS);
-	}
+	load_seg_legacy(prev->fsindex, prev->fsbase,
+			next->fsindex, next->fsbase, FS);
+	load_seg_legacy(prev->gsindex, prev->gsbase,
+			next->gsindex, next->gsbase, GS);
 }
 
 static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
@@ -399,46 +339,13 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
 	return base;
 }
 
-unsigned long x86_gsbase_read_cpu_inactive(void)
-{
-	unsigned long gsbase;
-
-	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
-		unsigned long flags;
-
-		/* Interrupts are disabled here. */
-		local_irq_save(flags);
-		gsbase = __rdgsbase_inactive();
-		local_irq_restore(flags);
-	} else {
-		rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
-	}
-
-	return gsbase;
-}
-
-void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
-{
-	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
-		unsigned long flags;
-
-		/* Interrupts are disabled here. */
-		local_irq_save(flags);
-		__wrgsbase_inactive(gsbase);
-		local_irq_restore(flags);
-	} else {
-		wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
-	}
-}
-
 unsigned long x86_fsbase_read_task(struct task_struct *task)
 {
 	unsigned long fsbase;
 
 	if (task == current)
 		fsbase = x86_fsbase_read_cpu();
-	else if (static_cpu_has(X86_FEATURE_FSGSBASE) ||
-		 (task->thread.fsindex == 0))
+	else if (task->thread.fsindex == 0)
 		fsbase = task->thread.fsbase;
 	else
 		fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
@@ -452,8 +359,7 @@ unsigned long x86_gsbase_read_task(struct task_struct *task)
 
 	if (task == current)
 		gsbase = x86_gsbase_read_cpu_inactive();
-	else if (static_cpu_has(X86_FEATURE_FSGSBASE) ||
-		 (task->thread.gsindex == 0))
+	else if (task->thread.gsindex == 0)
 		gsbase = task->thread.gsbase;
 	else
 		gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
@@ -493,11 +399,10 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
 	p->thread.sp = (unsigned long) fork_frame;
 	p->thread.io_bitmap_ptr = NULL;
 
-	save_fsgs(me);
-	p->thread.fsindex = me->thread.fsindex;
-	p->thread.fsbase = me->thread.fsbase;
-	p->thread.gsindex = me->thread.gsindex;
-	p->thread.gsbase = me->thread.gsbase;
+	savesegment(gs, p->thread.gsindex);
+	p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase;
+	savesegment(fs, p->thread.fsindex);
+	p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase;
 
 	savesegment(es, p->thread.es);
 	savesegment(ds, p->thread.ds);
 	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
tools/testing/selftests/x86/Makefile:

@@ -12,8 +12,9 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) trivial_program.c -no-pie)
 
 TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \
 			check_initial_reg_state sigreturn iopl mpx-mini-test ioperm \
-			protection_keys test_vdso test_vsyscall mov_ss_trap
-TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
+			protection_keys test_vdso test_vsyscall mov_ss_trap \
+			syscall_arg_fault
+TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
 			test_FCMOV test_FCOMI test_FISTTP \
 			vdso_restorer
 TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip
tools/testing/selftests/x86/fsgsbase.c:

@@ -35,6 +35,8 @@
 static volatile sig_atomic_t want_segv;
 static volatile unsigned long segv_addr;
 
+static unsigned short *shared_scratch;
+
 static int nerrs;
 
 static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
@@ -242,16 +244,11 @@ static void do_remote_base()
 
 static __thread int set_thread_area_entry_number = -1;
 
-static void do_unexpected_base(void)
+static unsigned short load_gs(void)
 {
 	/*
-	 * The goal here is to try to arrange for GS == 0, GSBASE !=
-	 * 0, and for the kernel to think that GSBASE == 0.
-	 *
-	 * To make the test as reliable as possible, this uses
-	 * explicit descriptors. (This is not the only way. This
-	 * could use ARCH_SET_GS with a low, nonzero base, but the
-	 * relevant side effect of ARCH_SET_GS could change.)
+	 * Sets GS != 0 and GSBASE != 0 but arranges for the kernel to think
+	 * that GSBASE == 0 (i.e. thread.gsbase == 0).
 	 */
 
 	/* Step 1: tell the kernel that we have GSBASE == 0. */
@@ -271,8 +268,9 @@
 		.useable = 0
 	};
 	if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) == 0) {
-		printf("\tother thread: using LDT slot 0\n");
+		printf("\tusing LDT slot 0\n");
 		asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0x7));
+		return 0x7;
 	} else {
 		/* No modify_ldt for us (configured out, perhaps) */
@@ -294,20 +292,15 @@
 
 	if (ret != 0) {
 		printf("[NOTE]\tcould not create a segment -- test won't do anything\n");
-		return;
+		return 0;
 	}
-	printf("\tother thread: using GDT slot %d\n", desc.entry_number);
+	printf("\tusing GDT slot %d\n", desc.entry_number);
 	set_thread_area_entry_number = desc.entry_number;
 
-	asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)((desc.entry_number << 3) | 0x3)));
-
-	/*
-	 * Step 3: set the selector back to zero. On AMD chips, this will
-	 * preserve GSBASE.
-	 */
-
-	asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0));
+	unsigned short gs = (unsigned short)((desc.entry_number << 3) | 0x3);
+	asm volatile ("mov %0, %%gs" : : "rm" (gs));
+	return gs;
 }
 
 void test_wrbase(unsigned short index, unsigned long base)
@@ -346,12 +339,19 @@ static void *threadproc(void *ctx)
 	if (ftx == 3)
 		return NULL;
 
-	if (ftx == 1)
+	if (ftx == 1) {
 		do_remote_base();
-	else if (ftx == 2)
-		do_unexpected_base();
-	else
+	} else if (ftx == 2) {
+		/*
+		 * On AMD chips, this causes GSBASE != 0, GS == 0, and
+		 * thread.gsbase == 0.
+		 */
+
+		load_gs();
+		asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0));
+	} else {
 		errx(1, "helper thread got bad command");
+	}
 
 	ftx = 0;
 	syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
@@ -453,12 +453,7 @@ static void test_ptrace_write_gsbase(void)
 	if (child == 0) {
 		printf("[RUN]\tPTRACE_POKE(), write GSBASE from ptracer\n");
 
-		/*
-		 * Use the LDT setup and fetch the GSBASE from the LDT
-		 * by switching to the (nonzero) selector (again)
-		 */
-		do_unexpected_base();
-		asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0x7));
+		*shared_scratch = load_gs();
 
 		if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) != 0)
 			err(1, "PTRACE_TRACEME");
@@ -476,7 +471,7 @@ static void test_ptrace_write_gsbase(void)
 
 	gs = ptrace(PTRACE_PEEKUSER, child, gs_offset, NULL);
 
-	if (gs != 0x7) {
+	if (gs != *shared_scratch) {
 		nerrs++;
 		printf("[FAIL]\tGS is not prepared with nonzero\n");
 		goto END;
@@ -494,16 +489,24 @@ static void test_ptrace_write_gsbase(void)
 	 * selector value is changed or not by the GSBASE write in
 	 * a ptracer.
 	 */
-	if (gs != 0x7) {
+	if (gs != *shared_scratch) {
 		nerrs++;
 		printf("[FAIL]\tGS changed to %lx\n", gs);
+
+		/*
+		 * On older kernels, poking a nonzero value into the
+		 * base would zero the selector. On newer kernels,
+		 * this behavior has changed -- poking the base
+		 * changes only the base and, if FSGSBASE is not
+		 * available, this may have no effect.
+		 */
+		if (gs == 0)
+			printf("\tNote: this is expected behavior on older kernels.\n");
 	} else if (have_fsgsbase && (base != 0xFF)) {
 		nerrs++;
 		printf("[FAIL]\tGSBASE changed to %lx\n", base);
 	} else {
-		printf("[OK]\tGS remained 0x7 %s");
-		if (have_fsgsbase)
-			printf("and GSBASE changed to 0xFF");
+		printf("[OK]\tGS remained 0x%hx%s", *shared_scratch,
+		       have_fsgsbase ? " and GSBASE changed to 0xFF" : "");
 		printf("\n");
 	}
 }
@@ -516,6 +519,9 @@ int main()
 {
 	pthread_t thread;
 
+	shared_scratch = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
+			      MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+
 	/* Probe FSGSBASE */
 	sethandler(SIGILL, sigill, 0);
 	if (sigsetjmp(jmpbuf, 1) == 0) {
tools/testing/selftests/x86/syscall_arg_fault.c:

@@ -15,9 +15,30 @@
 #include <setjmp.h>
 #include <errno.h>
 
+#ifdef __x86_64__
+# define WIDTH "q"
+#else
+# define WIDTH "l"
+#endif
+
 /* Our sigaltstack scratch space. */
 static unsigned char altstack_data[SIGSTKSZ];
 
+static unsigned long get_eflags(void)
+{
+	unsigned long eflags;
+	asm volatile ("pushf" WIDTH "\n\tpop" WIDTH " %0" : "=rm" (eflags));
+	return eflags;
+}
+
+static void set_eflags(unsigned long eflags)
+{
+	asm volatile ("push" WIDTH " %0\n\tpopf" WIDTH
+		      : : "rm" (eflags) : "flags");
+}
+
+#define X86_EFLAGS_TF (1UL << 8)
+
 static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
 		       int flags)
 {
@@ -35,13 +56,22 @@ static sigjmp_buf jmpbuf;
 
 static volatile sig_atomic_t n_errs;
 
+#ifdef __x86_64__
+#define REG_AX	REG_RAX
+#define REG_IP	REG_RIP
+#else
+#define REG_AX	REG_EAX
+#define REG_IP	REG_EIP
+#endif
+
 static void sigsegv_or_sigbus(int sig, siginfo_t *info, void *ctx_void)
 {
 	ucontext_t *ctx = (ucontext_t*)ctx_void;
+	long ax = (long)ctx->uc_mcontext.gregs[REG_AX];
 
-	if (ctx->uc_mcontext.gregs[REG_EAX] != -EFAULT) {
-		printf("[FAIL]\tAX had the wrong value: 0x%x\n",
-		       ctx->uc_mcontext.gregs[REG_EAX]);
+	if (ax != -EFAULT && ax != -ENOSYS) {
+		printf("[FAIL]\tAX had the wrong value: 0x%lx\n",
+		       (unsigned long)ax);
 		n_errs++;
 	} else {
 		printf("[OK]\tSeems okay\n");
@@ -50,9 +80,42 @@ static void sigsegv_or_sigbus(int sig, siginfo_t *info, void *ctx_void)
 	siglongjmp(jmpbuf, 1);
 }
 
+static volatile sig_atomic_t sigtrap_consecutive_syscalls;
+
+static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
+{
+	/*
+	 * KVM has some bugs that can cause us to stop making progress.
+	 * detect them and complain, but don't infinite loop or fail the
+	 * test.
+	 */
+
+	ucontext_t *ctx = (ucontext_t*)ctx_void;
+	unsigned short *ip = (unsigned short *)ctx->uc_mcontext.gregs[REG_IP];
+
+	if (*ip == 0x340f || *ip == 0x050f) {
+		/* The trap was on SYSCALL or SYSENTER */
+		sigtrap_consecutive_syscalls++;
+		if (sigtrap_consecutive_syscalls > 3) {
+			printf("[WARN]\tGot stuck single-stepping -- you probably have a KVM bug\n");
+			siglongjmp(jmpbuf, 1);
+		}
+	} else {
+		sigtrap_consecutive_syscalls = 0;
+	}
+}
+
 static void sigill(int sig, siginfo_t *info, void *ctx_void)
 {
-	printf("[SKIP]\tIllegal instruction\n");
+	ucontext_t *ctx = (ucontext_t*)ctx_void;
+	unsigned short *ip = (unsigned short *)ctx->uc_mcontext.gregs[REG_IP];
+
+	if (*ip == 0x0b0f) {
+		/* one of the ud2 instructions faulted */
+		printf("[OK]\tSYSCALL returned normally\n");
+	} else {
+		printf("[SKIP]\tIllegal instruction\n");
+	}
 	siglongjmp(jmpbuf, 1);
 }
@@ -120,9 +183,48 @@ int main()
 		"movl $-1, %%ebp\n\t"
 		"movl $-1, %%esp\n\t"
 		"syscall\n\t"
-		"pushl $0"	/* make sure we segfault cleanly */
+		"ud2"		/* make sure we recover cleanly */
 		: : : "memory", "flags");
 	}
 
+	printf("[RUN]\tSYSENTER with TF and invalid state\n");
+	sethandler(SIGTRAP, sigtrap, SA_ONSTACK);
+
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		sigtrap_consecutive_syscalls = 0;
+		set_eflags(get_eflags() | X86_EFLAGS_TF);
+		asm volatile (
+			"movl $-1, %%eax\n\t"
+			"movl $-1, %%ebx\n\t"
+			"movl $-1, %%ecx\n\t"
+			"movl $-1, %%edx\n\t"
+			"movl $-1, %%esi\n\t"
+			"movl $-1, %%edi\n\t"
+			"movl $-1, %%ebp\n\t"
+			"movl $-1, %%esp\n\t"
+			"sysenter"
+			: : : "memory", "flags");
+	}
+	set_eflags(get_eflags() & ~X86_EFLAGS_TF);
+
+	printf("[RUN]\tSYSCALL with TF and invalid state\n");
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		sigtrap_consecutive_syscalls = 0;
+		set_eflags(get_eflags() | X86_EFLAGS_TF);
+		asm volatile (
+			"movl $-1, %%eax\n\t"
+			"movl $-1, %%ebx\n\t"
+			"movl $-1, %%ecx\n\t"
+			"movl $-1, %%edx\n\t"
+			"movl $-1, %%esi\n\t"
+			"movl $-1, %%edi\n\t"
+			"movl $-1, %%ebp\n\t"
+			"movl $-1, %%esp\n\t"
+			"syscall\n\t"
+			"ud2"		/* make sure we recover cleanly */
+			: : : "memory", "flags");
+	}
+	set_eflags(get_eflags() & ~X86_EFLAGS_TF);
+
 	return 0;
 }