x86/nmi/64: Switch stacks on userspace NMI entry
Returning to userspace is tricky: IRET can fail, and ESPFIX can rearrange the stack prior to IRET.

The NMI nesting fixup relies on a precise stack layout and atomic IRET. Rather than trying to teach the NMI nesting fixup to handle ESPFIX and failed IRET, punt: run NMIs that came from user mode on the normal kernel stack.

This will make some nested NMIs visible to C code, but the C code is okay with that.

As a side effect, this should speed up perf: it eliminates an RDMSR when NMIs come from user mode.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
parent
0e181bb581
commit
9b6e6a8334
|
@ -1250,18 +1250,72 @@ ENTRY(nmi)
|
|||
* a nested NMI that updated the copy interrupt stack frame, a
|
||||
* jump will be made to the repeat_nmi code that will handle the second
|
||||
* NMI.
|
||||
*
|
||||
* However, espfix prevents us from directly returning to userspace
|
||||
* with a single IRET instruction. Similarly, IRET to user mode
|
||||
* can fault. We therefore handle NMIs from user space like
|
||||
* other IST entries.
|
||||
*/
|
||||
|
||||
/* Use %rdx as our temp variable throughout */
|
||||
pushq %rdx
|
||||
|
||||
/*
|
||||
* If %cs was not the kernel segment, then the NMI triggered in user
|
||||
* space, which means it is definitely not nested.
|
||||
*/
|
||||
cmpl $__KERNEL_CS, 16(%rsp)
|
||||
jne first_nmi
|
||||
testb $3, CS-RIP+8(%rsp)
|
||||
jz .Lnmi_from_kernel
|
||||
|
||||
/*
|
||||
* NMI from user mode. We need to run on the thread stack, but we
|
||||
* can't go through the normal entry paths: NMIs are masked, and
|
||||
* we don't want to enable interrupts, because then we'll end
|
||||
* up in an awkward situation in which IRQs are on but NMIs
|
||||
* are off.
|
||||
*/
|
||||
|
||||
SWAPGS
|
||||
cld
|
||||
movq %rsp, %rdx
|
||||
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
|
||||
pushq 5*8(%rdx) /* pt_regs->ss */
|
||||
pushq 4*8(%rdx) /* pt_regs->rsp */
|
||||
pushq 3*8(%rdx) /* pt_regs->flags */
|
||||
pushq 2*8(%rdx) /* pt_regs->cs */
|
||||
pushq 1*8(%rdx) /* pt_regs->rip */
|
||||
pushq $-1 /* pt_regs->orig_ax */
|
||||
pushq %rdi /* pt_regs->di */
|
||||
pushq %rsi /* pt_regs->si */
|
||||
pushq (%rdx) /* pt_regs->dx */
|
||||
pushq %rcx /* pt_regs->cx */
|
||||
pushq %rax /* pt_regs->ax */
|
||||
pushq %r8 /* pt_regs->r8 */
|
||||
pushq %r9 /* pt_regs->r9 */
|
||||
pushq %r10 /* pt_regs->r10 */
|
||||
pushq %r11 /* pt_regs->r11 */
|
||||
pushq %rbx /* pt_regs->rbx */
|
||||
pushq %rbp /* pt_regs->rbp */
|
||||
pushq %r12 /* pt_regs->r12 */
|
||||
pushq %r13 /* pt_regs->r13 */
|
||||
pushq %r14 /* pt_regs->r14 */
|
||||
pushq %r15 /* pt_regs->r15 */
|
||||
|
||||
/*
|
||||
* At this point we no longer need to worry about stack damage
|
||||
* due to nesting -- we're on the normal thread stack and we're
|
||||
* done with the NMI stack.
|
||||
*/
|
||||
|
||||
movq %rsp, %rdi
|
||||
movq $-1, %rsi
|
||||
call do_nmi
|
||||
|
||||
/*
|
||||
* Return back to user mode. We must *not* do the normal exit
|
||||
* work, because we don't want to enable interrupts. Fortunately,
|
||||
* do_nmi doesn't modify pt_regs.
|
||||
*/
|
||||
SWAPGS
|
||||
jmp restore_c_regs_and_iret
|
||||
|
||||
.Lnmi_from_kernel:
|
||||
/*
|
||||
* Check the special variable on the stack to see if NMIs are
|
||||
* executing.
|
||||
|
|
Loading…
Reference in New Issue