x86/asm, sched/x86: Rewrite the FS and GS context switch code
The old code was incomprehensible and was buggy on AMD CPUs. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Borislav Petkov <bp@suse.de> Cc: Andy Lutomirski <luto@amacapital.net> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rudolf Marek <r.marek@assembler.cz> Cc: Thomas Gleixner <tglx@linutronix.de> Link: http://lkml.kernel.org/r/5f6bde874c6fe6831c6711b5b1522a238ba035b4.1460075211.git.luto@kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
parent
7a5d670487
commit
3e2b68d752
|
@ -282,7 +282,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
|
|||
struct fpu *next_fpu = &next->fpu;
|
||||
int cpu = smp_processor_id();
|
||||
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
|
||||
unsigned fsindex, gsindex;
|
||||
unsigned prev_fsindex, prev_gsindex;
|
||||
fpu_switch_t fpu_switch;
|
||||
|
||||
fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
|
||||
|
@ -292,8 +292,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
|
|||
*
|
||||
* (e.g. xen_load_tls())
|
||||
*/
|
||||
savesegment(fs, fsindex);
|
||||
savesegment(gs, gsindex);
|
||||
savesegment(fs, prev_fsindex);
|
||||
savesegment(gs, prev_gsindex);
|
||||
|
||||
/*
|
||||
* Load TLS before restoring any segments so that segment loads
|
||||
|
@ -336,66 +336,104 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
|
|||
* Switch FS and GS.
|
||||
*
|
||||
* These are even more complicated than DS and ES: they have
|
||||
* 64-bit bases are that controlled by arch_prctl. Those bases
|
||||
* only differ from the values in the GDT or LDT if the selector
|
||||
* is 0.
|
||||
* 64-bit bases are that controlled by arch_prctl. The bases
|
||||
* don't necessarily match the selectors, as user code can do
|
||||
* any number of things to cause them to be inconsistent.
|
||||
*
|
||||
* Loading the segment register resets the hidden base part of
|
||||
* the register to 0 or the value from the GDT / LDT. If the
|
||||
* next base address zero, writing 0 to the segment register is
|
||||
* much faster than using wrmsr to explicitly zero the base.
|
||||
* We don't promise to preserve the bases if the selectors are
|
||||
* nonzero. We also don't promise to preserve the base if the
|
||||
* selector is zero and the base doesn't match whatever was
|
||||
* most recently passed to ARCH_SET_FS/GS. (If/when the
|
||||
* FSGSBASE instructions are enabled, we'll need to offer
|
||||
* stronger guarantees.)
|
||||
*
|
||||
* The thread_struct.fs and thread_struct.gs values are 0
|
||||
* if the fs and gs bases respectively are not overridden
|
||||
* from the values implied by fsindex and gsindex. They
|
||||
* are nonzero, and store the nonzero base addresses, if
|
||||
* the bases are overridden.
|
||||
*
|
||||
* (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should
|
||||
* be impossible.
|
||||
*
|
||||
* Therefore we need to reload the segment registers if either
|
||||
* the old or new selector is nonzero, and we need to override
|
||||
* the base address if next thread expects it to be overridden.
|
||||
*
|
||||
* This code is unnecessarily slow in the case where the old and
|
||||
* new indexes are zero and the new base is nonzero -- it will
|
||||
* unnecessarily write 0 to the selector before writing the new
|
||||
* base address.
|
||||
*
|
||||
* Note: This all depends on arch_prctl being the only way that
|
||||
* user code can override the segment base. Once wrfsbase and
|
||||
* wrgsbase are enabled, most of this code will need to change.
|
||||
* As an invariant,
|
||||
* (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) is
|
||||
* impossible.
|
||||
*/
|
||||
if (unlikely(fsindex | next->fsindex | prev->fs)) {
|
||||
if (next->fsindex) {
|
||||
/* Loading a nonzero value into FS sets the index and base. */
|
||||
loadsegment(fs, next->fsindex);
|
||||
|
||||
/*
|
||||
* If user code wrote a nonzero value to FS, then it also
|
||||
* cleared the overridden base address.
|
||||
*
|
||||
* XXX: if user code wrote 0 to FS and cleared the base
|
||||
* address itself, we won't notice and we'll incorrectly
|
||||
* restore the prior base address next time we reschdule
|
||||
* the process.
|
||||
*/
|
||||
if (fsindex)
|
||||
prev->fs = 0;
|
||||
} else {
|
||||
if (next->fs) {
|
||||
/* Next index is zero but next base is nonzero. */
|
||||
if (prev_fsindex)
|
||||
loadsegment(fs, 0);
|
||||
wrmsrl(MSR_FS_BASE, next->fs);
|
||||
} else {
|
||||
/* Next base and index are both zero. */
|
||||
if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
|
||||
/*
|
||||
* We don't know the previous base and can't
|
||||
* find out without RDMSR. Forcibly clear it.
|
||||
*/
|
||||
loadsegment(fs, __USER_DS);
|
||||
loadsegment(fs, 0);
|
||||
} else {
|
||||
/*
|
||||
* If the previous index is zero and ARCH_SET_FS
|
||||
* didn't change the base, then the base is
|
||||
* also zero and we don't need to do anything.
|
||||
*/
|
||||
if (prev->fs || prev_fsindex)
|
||||
loadsegment(fs, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (next->fs)
|
||||
wrmsrl(MSR_FS_BASE, next->fs);
|
||||
prev->fsindex = fsindex;
|
||||
/*
|
||||
* Save the old state and preserve the invariant.
|
||||
* NB: if prev_fsindex == 0, then we can't reliably learn the base
|
||||
* without RDMSR because Intel user code can zero it without telling
|
||||
* us and AMD user code can program any 32-bit value without telling
|
||||
* us.
|
||||
*/
|
||||
if (prev_fsindex)
|
||||
prev->fs = 0;
|
||||
prev->fsindex = prev_fsindex;
|
||||
|
||||
if (unlikely(gsindex | next->gsindex | prev->gs)) {
|
||||
if (next->gsindex) {
|
||||
/* Loading a nonzero value into GS sets the index and base. */
|
||||
load_gs_index(next->gsindex);
|
||||
|
||||
/* This works (and fails) the same way as fsindex above. */
|
||||
if (gsindex)
|
||||
prev->gs = 0;
|
||||
} else {
|
||||
if (next->gs) {
|
||||
/* Next index is zero but next base is nonzero. */
|
||||
if (prev_gsindex)
|
||||
load_gs_index(0);
|
||||
wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
|
||||
} else {
|
||||
/* Next base and index are both zero. */
|
||||
if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
|
||||
/*
|
||||
* We don't know the previous base and can't
|
||||
* find out without RDMSR. Forcibly clear it.
|
||||
*
|
||||
* This contains a pointless SWAPGS pair.
|
||||
* Fixing it would involve an explicit check
|
||||
* for Xen or a new pvop.
|
||||
*/
|
||||
load_gs_index(__USER_DS);
|
||||
load_gs_index(0);
|
||||
} else {
|
||||
/*
|
||||
* If the previous index is zero and ARCH_SET_GS
|
||||
* didn't change the base, then the base is
|
||||
* also zero and we don't need to do anything.
|
||||
*/
|
||||
if (prev->gs || prev_gsindex)
|
||||
load_gs_index(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (next->gs)
|
||||
wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
|
||||
prev->gsindex = gsindex;
|
||||
/*
|
||||
* Save the old state and preserve the invariant.
|
||||
* NB: if prev_gsindex == 0, then we can't reliably learn the base
|
||||
* without RDMSR because Intel user code can zero it without telling
|
||||
* us and AMD user code can program any 32-bit value without telling
|
||||
* us.
|
||||
*/
|
||||
if (prev_gsindex)
|
||||
prev->gs = 0;
|
||||
prev->gsindex = prev_gsindex;
|
||||
|
||||
switch_fpu_finish(next_fpu, fpu_switch);
|
||||
|
||||
|
|
Loading…
Reference in New Issue