2019-05-23 17:14:39 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
2006-12-07 09:14:07 +08:00
|
|
|
/* Paravirtualization interfaces
|
|
|
|
Copyright (C) 2006 Rusty Russell IBM Corporation
|
|
|
|
|
2008-01-30 20:32:04 +08:00
|
|
|
|
|
|
|
2007 - x86_64 support added by Glauber de Oliveira Costa, Red Hat Inc
|
2006-12-07 09:14:07 +08:00
|
|
|
*/
|
2008-01-30 20:32:04 +08:00
|
|
|
|
2006-12-07 09:14:07 +08:00
|
|
|
#include <linux/errno.h>
|
2016-07-14 08:18:56 +08:00
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/export.h>
|
2006-12-07 09:14:07 +08:00
|
|
|
#include <linux/efi.h>
|
|
|
|
#include <linux/bcd.h>
|
2007-05-03 01:27:15 +08:00
|
|
|
#include <linux/highmem.h>
|
2014-04-17 16:17:05 +08:00
|
|
|
#include <linux/kprobes.h>
|
2006-12-07 09:14:07 +08:00
|
|
|
|
|
|
|
#include <asm/bug.h>
|
|
|
|
#include <asm/paravirt.h>
|
2012-01-21 07:35:53 +08:00
|
|
|
#include <asm/debugreg.h>
|
2006-12-07 09:14:07 +08:00
|
|
|
#include <asm/desc.h>
|
|
|
|
#include <asm/setup.h>
|
2008-07-09 06:06:23 +08:00
|
|
|
#include <asm/pgtable.h>
|
2006-12-07 09:14:07 +08:00
|
|
|
#include <asm/time.h>
|
2008-06-25 12:19:12 +08:00
|
|
|
#include <asm/pgalloc.h>
|
2006-12-07 09:14:07 +08:00
|
|
|
#include <asm/irq.h>
|
|
|
|
#include <asm/delay.h>
|
2006-12-07 09:14:08 +08:00
|
|
|
#include <asm/fixmap.h>
|
|
|
|
#include <asm/apic.h>
|
2006-12-07 09:14:08 +08:00
|
|
|
#include <asm/tlbflush.h>
|
2007-03-05 16:30:35 +08:00
|
|
|
#include <asm/timer.h>
|
2012-03-29 01:11:12 +08:00
|
|
|
#include <asm/special_insns.h>
|
2018-08-22 23:30:16 +08:00
|
|
|
#include <asm/tlb.h>
|
2020-02-18 23:47:12 +08:00
|
|
|
#include <asm/io_bitmap.h>
|
2006-12-07 09:14:07 +08:00
|
|
|
|
x86/paravirt: Replace the paravirt nop with a bona fide empty function
PARAVIRT_ADJUST_EXCEPTION_FRAME generates this code (using nmi as an
example, trimmed for readability):
ff 15 00 00 00 00 callq *0x0(%rip) # 2796 <nmi+0x6>
2792: R_X86_64_PC32 pv_irq_ops+0x2c
That's a call through a function pointer to regular C function that
does nothing on native boots, but that function isn't protected
against kprobes, isn't marked notrace, and is certainly not
guaranteed to preserve any registers if the compiler is feeling
perverse. This is bad news for a CLBR_NONE operation.
Of course, if everything works correctly, once paravirt ops are
patched, it gets nopped out, but what if we hit this code before
paravirt ops are patched in? This can potentially cause breakage
that is very difficult to debug.
A more subtle failure is possible here, too: if _paravirt_nop uses
the stack at all (even just to push RBP), it will overwrite the "NMI
executing" variable if it's called in the NMI prologue.
The Xen case, perhaps surprisingly, is fine, because it's already
written in asm.
Fix all of the cases that default to paravirt_nop (including
adjust_exception_frame) with a big hammer: replace paravirt_nop with
an asm function that is just a ret instruction.
The Xen case may have other problems, so document them.
This is part of a fix for some random crashes that Sasha saw.
Reported-and-tested-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/8f5d2ba295f9d73751c33d97fda03e0495d9ade0.1442791737.git.luto@kernel.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-09-21 07:32:04 +08:00
|
|
|
/*
|
|
|
|
* nop stub, which must not clobber anything *including the stack* to
|
|
|
|
* avoid confusing the entry prologues.
|
|
|
|
*/
|
|
|
|
extern void _paravirt_nop(void);
|
|
|
|
asm (".pushsection .entry.text, \"ax\"\n"
|
|
|
|
".global _paravirt_nop\n"
|
|
|
|
"_paravirt_nop:\n\t"
|
|
|
|
"ret\n\t"
|
|
|
|
".size _paravirt_nop, . - _paravirt_nop\n\t"
|
|
|
|
".type _paravirt_nop, @function\n\t"
|
|
|
|
".popsection");
|
2006-12-07 09:14:07 +08:00
|
|
|
|
2009-08-20 19:19:57 +08:00
|
|
|
void __init default_banner(void)
|
2006-12-07 09:14:07 +08:00
|
|
|
{
|
|
|
|
printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
|
2007-10-17 02:51:29 +08:00
|
|
|
pv_info.name);
|
2006-12-07 09:14:07 +08:00
|
|
|
}
|
|
|
|
|
2007-10-17 02:51:29 +08:00
|
|
|
/* Undefined instruction for dealing with missing ops pointers. */
|
|
|
|
static const unsigned char ud2a[] = { 0x0f, 0x0b };
|
2006-12-07 09:14:08 +08:00
|
|
|
|
2007-07-22 17:12:31 +08:00
|
|
|
struct branch {
|
|
|
|
unsigned char opcode;
|
|
|
|
u32 delta;
|
|
|
|
} __attribute__((packed));
|
|
|
|
|
2019-04-25 19:03:31 +08:00
|
|
|
static unsigned paravirt_patch_call(void *insn_buff, const void *target,
|
2018-08-28 15:40:17 +08:00
|
|
|
unsigned long addr, unsigned len)
|
2007-05-03 01:27:14 +08:00
|
|
|
{
|
2019-04-25 17:50:39 +08:00
|
|
|
const int call_len = 5;
|
2019-04-25 19:03:31 +08:00
|
|
|
struct branch *b = insn_buff;
|
2019-04-25 17:50:39 +08:00
|
|
|
unsigned long delta = (unsigned long)target - (addr+call_len);
|
2007-05-03 01:27:14 +08:00
|
|
|
|
2019-04-25 17:50:39 +08:00
|
|
|
if (len < call_len) {
|
|
|
|
pr_warn("paravirt: Failed to patch indirect CALL at %ps\n", (void *)addr);
|
|
|
|
/* Kernel might not be viable if patching fails, bail out: */
|
|
|
|
BUG_ON(1);
|
x86/paravirt: Fix spectre-v2 mitigations for paravirt guests
Nadav reported that on guests we're failing to rewrite the indirect
calls to CALLEE_SAVE paravirt functions. In particular the
pv_queued_spin_unlock() call is left unpatched and that is all over the
place. This obviously wrecks Spectre-v2 mitigation (for paravirt
guests) which relies on not actually having indirect calls around.
The reason is an incorrect clobber test in paravirt_patch_call(); this
function rewrites an indirect call with a direct call to the _SAME_
function, there is no possible way the clobbers can be different
because of this.
Therefore remove this clobber check. Also put WARNs on the other patch
failure case (not enough room for the instruction) which I've not seen
trigger in my (limited) testing.
Three live kernel image disassemblies for lock_sock_nested (as a small
function that illustrates the problem nicely). PRE is the current
situation for guests, POST is with this patch applied and NATIVE is with
or without the patch for !guests.
PRE:
(gdb) disassemble lock_sock_nested
Dump of assembler code for function lock_sock_nested:
0xffffffff817be970 <+0>: push %rbp
0xffffffff817be971 <+1>: mov %rdi,%rbp
0xffffffff817be974 <+4>: push %rbx
0xffffffff817be975 <+5>: lea 0x88(%rbp),%rbx
0xffffffff817be97c <+12>: callq 0xffffffff819f7160 <_cond_resched>
0xffffffff817be981 <+17>: mov %rbx,%rdi
0xffffffff817be984 <+20>: callq 0xffffffff819fbb00 <_raw_spin_lock_bh>
0xffffffff817be989 <+25>: mov 0x8c(%rbp),%eax
0xffffffff817be98f <+31>: test %eax,%eax
0xffffffff817be991 <+33>: jne 0xffffffff817be9ba <lock_sock_nested+74>
0xffffffff817be993 <+35>: movl $0x1,0x8c(%rbp)
0xffffffff817be99d <+45>: mov %rbx,%rdi
0xffffffff817be9a0 <+48>: callq *0xffffffff822299e8
0xffffffff817be9a7 <+55>: pop %rbx
0xffffffff817be9a8 <+56>: pop %rbp
0xffffffff817be9a9 <+57>: mov $0x200,%esi
0xffffffff817be9ae <+62>: mov $0xffffffff817be993,%rdi
0xffffffff817be9b5 <+69>: jmpq 0xffffffff81063ae0 <__local_bh_enable_ip>
0xffffffff817be9ba <+74>: mov %rbp,%rdi
0xffffffff817be9bd <+77>: callq 0xffffffff817be8c0 <__lock_sock>
0xffffffff817be9c2 <+82>: jmp 0xffffffff817be993 <lock_sock_nested+35>
End of assembler dump.
POST:
(gdb) disassemble lock_sock_nested
Dump of assembler code for function lock_sock_nested:
0xffffffff817be970 <+0>: push %rbp
0xffffffff817be971 <+1>: mov %rdi,%rbp
0xffffffff817be974 <+4>: push %rbx
0xffffffff817be975 <+5>: lea 0x88(%rbp),%rbx
0xffffffff817be97c <+12>: callq 0xffffffff819f7160 <_cond_resched>
0xffffffff817be981 <+17>: mov %rbx,%rdi
0xffffffff817be984 <+20>: callq 0xffffffff819fbb00 <_raw_spin_lock_bh>
0xffffffff817be989 <+25>: mov 0x8c(%rbp),%eax
0xffffffff817be98f <+31>: test %eax,%eax
0xffffffff817be991 <+33>: jne 0xffffffff817be9ba <lock_sock_nested+74>
0xffffffff817be993 <+35>: movl $0x1,0x8c(%rbp)
0xffffffff817be99d <+45>: mov %rbx,%rdi
0xffffffff817be9a0 <+48>: callq 0xffffffff810a0c20 <__raw_callee_save___pv_queued_spin_unlock>
0xffffffff817be9a5 <+53>: xchg %ax,%ax
0xffffffff817be9a7 <+55>: pop %rbx
0xffffffff817be9a8 <+56>: pop %rbp
0xffffffff817be9a9 <+57>: mov $0x200,%esi
0xffffffff817be9ae <+62>: mov $0xffffffff817be993,%rdi
0xffffffff817be9b5 <+69>: jmpq 0xffffffff81063aa0 <__local_bh_enable_ip>
0xffffffff817be9ba <+74>: mov %rbp,%rdi
0xffffffff817be9bd <+77>: callq 0xffffffff817be8c0 <__lock_sock>
0xffffffff817be9c2 <+82>: jmp 0xffffffff817be993 <lock_sock_nested+35>
End of assembler dump.
NATIVE:
(gdb) disassemble lock_sock_nested
Dump of assembler code for function lock_sock_nested:
0xffffffff817be970 <+0>: push %rbp
0xffffffff817be971 <+1>: mov %rdi,%rbp
0xffffffff817be974 <+4>: push %rbx
0xffffffff817be975 <+5>: lea 0x88(%rbp),%rbx
0xffffffff817be97c <+12>: callq 0xffffffff819f7160 <_cond_resched>
0xffffffff817be981 <+17>: mov %rbx,%rdi
0xffffffff817be984 <+20>: callq 0xffffffff819fbb00 <_raw_spin_lock_bh>
0xffffffff817be989 <+25>: mov 0x8c(%rbp),%eax
0xffffffff817be98f <+31>: test %eax,%eax
0xffffffff817be991 <+33>: jne 0xffffffff817be9ba <lock_sock_nested+74>
0xffffffff817be993 <+35>: movl $0x1,0x8c(%rbp)
0xffffffff817be99d <+45>: mov %rbx,%rdi
0xffffffff817be9a0 <+48>: movb $0x0,(%rdi)
0xffffffff817be9a3 <+51>: nopl 0x0(%rax)
0xffffffff817be9a7 <+55>: pop %rbx
0xffffffff817be9a8 <+56>: pop %rbp
0xffffffff817be9a9 <+57>: mov $0x200,%esi
0xffffffff817be9ae <+62>: mov $0xffffffff817be993,%rdi
0xffffffff817be9b5 <+69>: jmpq 0xffffffff81063ae0 <__local_bh_enable_ip>
0xffffffff817be9ba <+74>: mov %rbp,%rdi
0xffffffff817be9bd <+77>: callq 0xffffffff817be8c0 <__lock_sock>
0xffffffff817be9c2 <+82>: jmp 0xffffffff817be993 <lock_sock_nested+35>
End of assembler dump.
Fixes: 63f70270ccd9 ("[PATCH] i386: PARAVIRT: add common patching machinery")
Fixes: 3010a0663fd9 ("x86/paravirt, objtool: Annotate indirect calls")
Reported-by: Nadav Amit <namit@vmware.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: stable@vger.kernel.org
2018-08-03 22:41:39 +08:00
|
|
|
}
|
2006-12-07 09:14:08 +08:00
|
|
|
|
2007-08-11 04:31:03 +08:00
|
|
|
b->opcode = 0xe8; /* call */
|
|
|
|
b->delta = delta;
|
2019-04-25 17:50:39 +08:00
|
|
|
BUILD_BUG_ON(sizeof(*b) != call_len);
|
2006-12-07 09:14:08 +08:00
|
|
|
|
2019-04-25 17:50:39 +08:00
|
|
|
return call_len;
|
2007-05-03 01:27:14 +08:00
|
|
|
}
|
|
|
|
|
2018-08-28 15:40:23 +08:00
|
|
|
#ifdef CONFIG_PARAVIRT_XXL
|
2018-10-30 14:33:01 +08:00
|
|
|
/* identity function, which can be inlined */
|
|
|
|
u64 notrace _paravirt_ident_64(u64 x)
|
|
|
|
{
|
|
|
|
return x;
|
|
|
|
}
|
|
|
|
|
2019-04-25 19:03:31 +08:00
|
|
|
static unsigned paravirt_patch_jmp(void *insn_buff, const void *target,
|
2018-08-28 15:40:16 +08:00
|
|
|
unsigned long addr, unsigned len)
|
2007-05-03 01:27:14 +08:00
|
|
|
{
|
2019-04-25 19:03:31 +08:00
|
|
|
struct branch *b = insn_buff;
|
2007-08-11 04:31:03 +08:00
|
|
|
unsigned long delta = (unsigned long)target - (addr+5);
|
2007-05-03 01:27:14 +08:00
|
|
|
|
x86/paravirt: Fix spectre-v2 mitigations for paravirt guests
Nadav reported that on guests we're failing to rewrite the indirect
calls to CALLEE_SAVE paravirt functions. In particular the
pv_queued_spin_unlock() call is left unpatched and that is all over the
place. This obviously wrecks Spectre-v2 mitigation (for paravirt
guests) which relies on not actually having indirect calls around.
The reason is an incorrect clobber test in paravirt_patch_call(); this
function rewrites an indirect call with a direct call to the _SAME_
function, there is no possible way the clobbers can be different
because of this.
Therefore remove this clobber check. Also put WARNs on the other patch
failure case (not enough room for the instruction) which I've not seen
trigger in my (limited) testing.
Three live kernel image disassemblies for lock_sock_nested (as a small
function that illustrates the problem nicely). PRE is the current
situation for guests, POST is with this patch applied and NATIVE is with
or without the patch for !guests.
PRE:
(gdb) disassemble lock_sock_nested
Dump of assembler code for function lock_sock_nested:
0xffffffff817be970 <+0>: push %rbp
0xffffffff817be971 <+1>: mov %rdi,%rbp
0xffffffff817be974 <+4>: push %rbx
0xffffffff817be975 <+5>: lea 0x88(%rbp),%rbx
0xffffffff817be97c <+12>: callq 0xffffffff819f7160 <_cond_resched>
0xffffffff817be981 <+17>: mov %rbx,%rdi
0xffffffff817be984 <+20>: callq 0xffffffff819fbb00 <_raw_spin_lock_bh>
0xffffffff817be989 <+25>: mov 0x8c(%rbp),%eax
0xffffffff817be98f <+31>: test %eax,%eax
0xffffffff817be991 <+33>: jne 0xffffffff817be9ba <lock_sock_nested+74>
0xffffffff817be993 <+35>: movl $0x1,0x8c(%rbp)
0xffffffff817be99d <+45>: mov %rbx,%rdi
0xffffffff817be9a0 <+48>: callq *0xffffffff822299e8
0xffffffff817be9a7 <+55>: pop %rbx
0xffffffff817be9a8 <+56>: pop %rbp
0xffffffff817be9a9 <+57>: mov $0x200,%esi
0xffffffff817be9ae <+62>: mov $0xffffffff817be993,%rdi
0xffffffff817be9b5 <+69>: jmpq 0xffffffff81063ae0 <__local_bh_enable_ip>
0xffffffff817be9ba <+74>: mov %rbp,%rdi
0xffffffff817be9bd <+77>: callq 0xffffffff817be8c0 <__lock_sock>
0xffffffff817be9c2 <+82>: jmp 0xffffffff817be993 <lock_sock_nested+35>
End of assembler dump.
POST:
(gdb) disassemble lock_sock_nested
Dump of assembler code for function lock_sock_nested:
0xffffffff817be970 <+0>: push %rbp
0xffffffff817be971 <+1>: mov %rdi,%rbp
0xffffffff817be974 <+4>: push %rbx
0xffffffff817be975 <+5>: lea 0x88(%rbp),%rbx
0xffffffff817be97c <+12>: callq 0xffffffff819f7160 <_cond_resched>
0xffffffff817be981 <+17>: mov %rbx,%rdi
0xffffffff817be984 <+20>: callq 0xffffffff819fbb00 <_raw_spin_lock_bh>
0xffffffff817be989 <+25>: mov 0x8c(%rbp),%eax
0xffffffff817be98f <+31>: test %eax,%eax
0xffffffff817be991 <+33>: jne 0xffffffff817be9ba <lock_sock_nested+74>
0xffffffff817be993 <+35>: movl $0x1,0x8c(%rbp)
0xffffffff817be99d <+45>: mov %rbx,%rdi
0xffffffff817be9a0 <+48>: callq 0xffffffff810a0c20 <__raw_callee_save___pv_queued_spin_unlock>
0xffffffff817be9a5 <+53>: xchg %ax,%ax
0xffffffff817be9a7 <+55>: pop %rbx
0xffffffff817be9a8 <+56>: pop %rbp
0xffffffff817be9a9 <+57>: mov $0x200,%esi
0xffffffff817be9ae <+62>: mov $0xffffffff817be993,%rdi
0xffffffff817be9b5 <+69>: jmpq 0xffffffff81063aa0 <__local_bh_enable_ip>
0xffffffff817be9ba <+74>: mov %rbp,%rdi
0xffffffff817be9bd <+77>: callq 0xffffffff817be8c0 <__lock_sock>
0xffffffff817be9c2 <+82>: jmp 0xffffffff817be993 <lock_sock_nested+35>
End of assembler dump.
NATIVE:
(gdb) disassemble lock_sock_nested
Dump of assembler code for function lock_sock_nested:
0xffffffff817be970 <+0>: push %rbp
0xffffffff817be971 <+1>: mov %rdi,%rbp
0xffffffff817be974 <+4>: push %rbx
0xffffffff817be975 <+5>: lea 0x88(%rbp),%rbx
0xffffffff817be97c <+12>: callq 0xffffffff819f7160 <_cond_resched>
0xffffffff817be981 <+17>: mov %rbx,%rdi
0xffffffff817be984 <+20>: callq 0xffffffff819fbb00 <_raw_spin_lock_bh>
0xffffffff817be989 <+25>: mov 0x8c(%rbp),%eax
0xffffffff817be98f <+31>: test %eax,%eax
0xffffffff817be991 <+33>: jne 0xffffffff817be9ba <lock_sock_nested+74>
0xffffffff817be993 <+35>: movl $0x1,0x8c(%rbp)
0xffffffff817be99d <+45>: mov %rbx,%rdi
0xffffffff817be9a0 <+48>: movb $0x0,(%rdi)
0xffffffff817be9a3 <+51>: nopl 0x0(%rax)
0xffffffff817be9a7 <+55>: pop %rbx
0xffffffff817be9a8 <+56>: pop %rbp
0xffffffff817be9a9 <+57>: mov $0x200,%esi
0xffffffff817be9ae <+62>: mov $0xffffffff817be993,%rdi
0xffffffff817be9b5 <+69>: jmpq 0xffffffff81063ae0 <__local_bh_enable_ip>
0xffffffff817be9ba <+74>: mov %rbp,%rdi
0xffffffff817be9bd <+77>: callq 0xffffffff817be8c0 <__lock_sock>
0xffffffff817be9c2 <+82>: jmp 0xffffffff817be993 <lock_sock_nested+35>
End of assembler dump.
Fixes: 63f70270ccd9 ("[PATCH] i386: PARAVIRT: add common patching machinery")
Fixes: 3010a0663fd9 ("x86/paravirt, objtool: Annotate indirect calls")
Reported-by: Nadav Amit <namit@vmware.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: stable@vger.kernel.org
2018-08-03 22:41:39 +08:00
|
|
|
if (len < 5) {
|
|
|
|
#ifdef CONFIG_RETPOLINE
|
2018-09-19 18:35:53 +08:00
|
|
|
WARN_ONCE(1, "Failing to patch indirect JMP in %ps\n", (void *)addr);
|
x86/paravirt: Fix spectre-v2 mitigations for paravirt guests
Nadav reported that on guests we're failing to rewrite the indirect
calls to CALLEE_SAVE paravirt functions. In particular the
pv_queued_spin_unlock() call is left unpatched and that is all over the
place. This obviously wrecks Spectre-v2 mitigation (for paravirt
guests) which relies on not actually having indirect calls around.
The reason is an incorrect clobber test in paravirt_patch_call(); this
function rewrites an indirect call with a direct call to the _SAME_
function, there is no possible way the clobbers can be different
because of this.
Therefore remove this clobber check. Also put WARNs on the other patch
failure case (not enough room for the instruction) which I've not seen
trigger in my (limited) testing.
Three live kernel image disassemblies for lock_sock_nested (as a small
function that illustrates the problem nicely). PRE is the current
situation for guests, POST is with this patch applied and NATIVE is with
or without the patch for !guests.
PRE:
(gdb) disassemble lock_sock_nested
Dump of assembler code for function lock_sock_nested:
0xffffffff817be970 <+0>: push %rbp
0xffffffff817be971 <+1>: mov %rdi,%rbp
0xffffffff817be974 <+4>: push %rbx
0xffffffff817be975 <+5>: lea 0x88(%rbp),%rbx
0xffffffff817be97c <+12>: callq 0xffffffff819f7160 <_cond_resched>
0xffffffff817be981 <+17>: mov %rbx,%rdi
0xffffffff817be984 <+20>: callq 0xffffffff819fbb00 <_raw_spin_lock_bh>
0xffffffff817be989 <+25>: mov 0x8c(%rbp),%eax
0xffffffff817be98f <+31>: test %eax,%eax
0xffffffff817be991 <+33>: jne 0xffffffff817be9ba <lock_sock_nested+74>
0xffffffff817be993 <+35>: movl $0x1,0x8c(%rbp)
0xffffffff817be99d <+45>: mov %rbx,%rdi
0xffffffff817be9a0 <+48>: callq *0xffffffff822299e8
0xffffffff817be9a7 <+55>: pop %rbx
0xffffffff817be9a8 <+56>: pop %rbp
0xffffffff817be9a9 <+57>: mov $0x200,%esi
0xffffffff817be9ae <+62>: mov $0xffffffff817be993,%rdi
0xffffffff817be9b5 <+69>: jmpq 0xffffffff81063ae0 <__local_bh_enable_ip>
0xffffffff817be9ba <+74>: mov %rbp,%rdi
0xffffffff817be9bd <+77>: callq 0xffffffff817be8c0 <__lock_sock>
0xffffffff817be9c2 <+82>: jmp 0xffffffff817be993 <lock_sock_nested+35>
End of assembler dump.
POST:
(gdb) disassemble lock_sock_nested
Dump of assembler code for function lock_sock_nested:
0xffffffff817be970 <+0>: push %rbp
0xffffffff817be971 <+1>: mov %rdi,%rbp
0xffffffff817be974 <+4>: push %rbx
0xffffffff817be975 <+5>: lea 0x88(%rbp),%rbx
0xffffffff817be97c <+12>: callq 0xffffffff819f7160 <_cond_resched>
0xffffffff817be981 <+17>: mov %rbx,%rdi
0xffffffff817be984 <+20>: callq 0xffffffff819fbb00 <_raw_spin_lock_bh>
0xffffffff817be989 <+25>: mov 0x8c(%rbp),%eax
0xffffffff817be98f <+31>: test %eax,%eax
0xffffffff817be991 <+33>: jne 0xffffffff817be9ba <lock_sock_nested+74>
0xffffffff817be993 <+35>: movl $0x1,0x8c(%rbp)
0xffffffff817be99d <+45>: mov %rbx,%rdi
0xffffffff817be9a0 <+48>: callq 0xffffffff810a0c20 <__raw_callee_save___pv_queued_spin_unlock>
0xffffffff817be9a5 <+53>: xchg %ax,%ax
0xffffffff817be9a7 <+55>: pop %rbx
0xffffffff817be9a8 <+56>: pop %rbp
0xffffffff817be9a9 <+57>: mov $0x200,%esi
0xffffffff817be9ae <+62>: mov $0xffffffff817be993,%rdi
0xffffffff817be9b5 <+69>: jmpq 0xffffffff81063aa0 <__local_bh_enable_ip>
0xffffffff817be9ba <+74>: mov %rbp,%rdi
0xffffffff817be9bd <+77>: callq 0xffffffff817be8c0 <__lock_sock>
0xffffffff817be9c2 <+82>: jmp 0xffffffff817be993 <lock_sock_nested+35>
End of assembler dump.
NATIVE:
(gdb) disassemble lock_sock_nested
Dump of assembler code for function lock_sock_nested:
0xffffffff817be970 <+0>: push %rbp
0xffffffff817be971 <+1>: mov %rdi,%rbp
0xffffffff817be974 <+4>: push %rbx
0xffffffff817be975 <+5>: lea 0x88(%rbp),%rbx
0xffffffff817be97c <+12>: callq 0xffffffff819f7160 <_cond_resched>
0xffffffff817be981 <+17>: mov %rbx,%rdi
0xffffffff817be984 <+20>: callq 0xffffffff819fbb00 <_raw_spin_lock_bh>
0xffffffff817be989 <+25>: mov 0x8c(%rbp),%eax
0xffffffff817be98f <+31>: test %eax,%eax
0xffffffff817be991 <+33>: jne 0xffffffff817be9ba <lock_sock_nested+74>
0xffffffff817be993 <+35>: movl $0x1,0x8c(%rbp)
0xffffffff817be99d <+45>: mov %rbx,%rdi
0xffffffff817be9a0 <+48>: movb $0x0,(%rdi)
0xffffffff817be9a3 <+51>: nopl 0x0(%rax)
0xffffffff817be9a7 <+55>: pop %rbx
0xffffffff817be9a8 <+56>: pop %rbp
0xffffffff817be9a9 <+57>: mov $0x200,%esi
0xffffffff817be9ae <+62>: mov $0xffffffff817be993,%rdi
0xffffffff817be9b5 <+69>: jmpq 0xffffffff81063ae0 <__local_bh_enable_ip>
0xffffffff817be9ba <+74>: mov %rbp,%rdi
0xffffffff817be9bd <+77>: callq 0xffffffff817be8c0 <__lock_sock>
0xffffffff817be9c2 <+82>: jmp 0xffffffff817be993 <lock_sock_nested+35>
End of assembler dump.
Fixes: 63f70270ccd9 ("[PATCH] i386: PARAVIRT: add common patching machinery")
Fixes: 3010a0663fd9 ("x86/paravirt, objtool: Annotate indirect calls")
Reported-by: Nadav Amit <namit@vmware.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: stable@vger.kernel.org
2018-08-03 22:41:39 +08:00
|
|
|
#endif
|
2007-05-03 01:27:14 +08:00
|
|
|
return len; /* call too long for patch site */
|
x86/paravirt: Fix spectre-v2 mitigations for paravirt guests
Nadav reported that on guests we're failing to rewrite the indirect
calls to CALLEE_SAVE paravirt functions. In particular the
pv_queued_spin_unlock() call is left unpatched and that is all over the
place. This obviously wrecks Spectre-v2 mitigation (for paravirt
guests) which relies on not actually having indirect calls around.
The reason is an incorrect clobber test in paravirt_patch_call(); this
function rewrites an indirect call with a direct call to the _SAME_
function, there is no possible way the clobbers can be different
because of this.
Therefore remove this clobber check. Also put WARNs on the other patch
failure case (not enough room for the instruction) which I've not seen
trigger in my (limited) testing.
Three live kernel image disassemblies for lock_sock_nested (as a small
function that illustrates the problem nicely). PRE is the current
situation for guests, POST is with this patch applied and NATIVE is with
or without the patch for !guests.
PRE:
(gdb) disassemble lock_sock_nested
Dump of assembler code for function lock_sock_nested:
0xffffffff817be970 <+0>: push %rbp
0xffffffff817be971 <+1>: mov %rdi,%rbp
0xffffffff817be974 <+4>: push %rbx
0xffffffff817be975 <+5>: lea 0x88(%rbp),%rbx
0xffffffff817be97c <+12>: callq 0xffffffff819f7160 <_cond_resched>
0xffffffff817be981 <+17>: mov %rbx,%rdi
0xffffffff817be984 <+20>: callq 0xffffffff819fbb00 <_raw_spin_lock_bh>
0xffffffff817be989 <+25>: mov 0x8c(%rbp),%eax
0xffffffff817be98f <+31>: test %eax,%eax
0xffffffff817be991 <+33>: jne 0xffffffff817be9ba <lock_sock_nested+74>
0xffffffff817be993 <+35>: movl $0x1,0x8c(%rbp)
0xffffffff817be99d <+45>: mov %rbx,%rdi
0xffffffff817be9a0 <+48>: callq *0xffffffff822299e8
0xffffffff817be9a7 <+55>: pop %rbx
0xffffffff817be9a8 <+56>: pop %rbp
0xffffffff817be9a9 <+57>: mov $0x200,%esi
0xffffffff817be9ae <+62>: mov $0xffffffff817be993,%rdi
0xffffffff817be9b5 <+69>: jmpq 0xffffffff81063ae0 <__local_bh_enable_ip>
0xffffffff817be9ba <+74>: mov %rbp,%rdi
0xffffffff817be9bd <+77>: callq 0xffffffff817be8c0 <__lock_sock>
0xffffffff817be9c2 <+82>: jmp 0xffffffff817be993 <lock_sock_nested+35>
End of assembler dump.
POST:
(gdb) disassemble lock_sock_nested
Dump of assembler code for function lock_sock_nested:
0xffffffff817be970 <+0>: push %rbp
0xffffffff817be971 <+1>: mov %rdi,%rbp
0xffffffff817be974 <+4>: push %rbx
0xffffffff817be975 <+5>: lea 0x88(%rbp),%rbx
0xffffffff817be97c <+12>: callq 0xffffffff819f7160 <_cond_resched>
0xffffffff817be981 <+17>: mov %rbx,%rdi
0xffffffff817be984 <+20>: callq 0xffffffff819fbb00 <_raw_spin_lock_bh>
0xffffffff817be989 <+25>: mov 0x8c(%rbp),%eax
0xffffffff817be98f <+31>: test %eax,%eax
0xffffffff817be991 <+33>: jne 0xffffffff817be9ba <lock_sock_nested+74>
0xffffffff817be993 <+35>: movl $0x1,0x8c(%rbp)
0xffffffff817be99d <+45>: mov %rbx,%rdi
0xffffffff817be9a0 <+48>: callq 0xffffffff810a0c20 <__raw_callee_save___pv_queued_spin_unlock>
0xffffffff817be9a5 <+53>: xchg %ax,%ax
0xffffffff817be9a7 <+55>: pop %rbx
0xffffffff817be9a8 <+56>: pop %rbp
0xffffffff817be9a9 <+57>: mov $0x200,%esi
0xffffffff817be9ae <+62>: mov $0xffffffff817be993,%rdi
0xffffffff817be9b5 <+69>: jmpq 0xffffffff81063aa0 <__local_bh_enable_ip>
0xffffffff817be9ba <+74>: mov %rbp,%rdi
0xffffffff817be9bd <+77>: callq 0xffffffff817be8c0 <__lock_sock>
0xffffffff817be9c2 <+82>: jmp 0xffffffff817be993 <lock_sock_nested+35>
End of assembler dump.
NATIVE:
(gdb) disassemble lock_sock_nested
Dump of assembler code for function lock_sock_nested:
0xffffffff817be970 <+0>: push %rbp
0xffffffff817be971 <+1>: mov %rdi,%rbp
0xffffffff817be974 <+4>: push %rbx
0xffffffff817be975 <+5>: lea 0x88(%rbp),%rbx
0xffffffff817be97c <+12>: callq 0xffffffff819f7160 <_cond_resched>
0xffffffff817be981 <+17>: mov %rbx,%rdi
0xffffffff817be984 <+20>: callq 0xffffffff819fbb00 <_raw_spin_lock_bh>
0xffffffff817be989 <+25>: mov 0x8c(%rbp),%eax
0xffffffff817be98f <+31>: test %eax,%eax
0xffffffff817be991 <+33>: jne 0xffffffff817be9ba <lock_sock_nested+74>
0xffffffff817be993 <+35>: movl $0x1,0x8c(%rbp)
0xffffffff817be99d <+45>: mov %rbx,%rdi
0xffffffff817be9a0 <+48>: movb $0x0,(%rdi)
0xffffffff817be9a3 <+51>: nopl 0x0(%rax)
0xffffffff817be9a7 <+55>: pop %rbx
0xffffffff817be9a8 <+56>: pop %rbp
0xffffffff817be9a9 <+57>: mov $0x200,%esi
0xffffffff817be9ae <+62>: mov $0xffffffff817be993,%rdi
0xffffffff817be9b5 <+69>: jmpq 0xffffffff81063ae0 <__local_bh_enable_ip>
0xffffffff817be9ba <+74>: mov %rbp,%rdi
0xffffffff817be9bd <+77>: callq 0xffffffff817be8c0 <__lock_sock>
0xffffffff817be9c2 <+82>: jmp 0xffffffff817be993 <lock_sock_nested+35>
End of assembler dump.
Fixes: 63f70270ccd9 ("[PATCH] i386: PARAVIRT: add common patching machinery")
Fixes: 3010a0663fd9 ("x86/paravirt, objtool: Annotate indirect calls")
Reported-by: Nadav Amit <namit@vmware.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: stable@vger.kernel.org
2018-08-03 22:41:39 +08:00
|
|
|
}
|
2007-05-03 01:27:14 +08:00
|
|
|
|
2007-08-11 04:31:03 +08:00
|
|
|
b->opcode = 0xe9; /* jmp */
|
|
|
|
b->delta = delta;
|
2007-05-03 01:27:14 +08:00
|
|
|
|
|
|
|
return 5;
|
|
|
|
}
|
2018-08-28 15:40:23 +08:00
|
|
|
#endif
|
2007-05-03 01:27:14 +08:00
|
|
|
|
2017-09-07 01:36:24 +08:00
|
|
|
DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
|
|
|
|
|
|
|
|
void __init native_pv_lock_init(void)
|
|
|
|
{
|
2019-03-30 02:52:59 +08:00
|
|
|
if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
|
2017-09-07 01:36:24 +08:00
|
|
|
static_branch_disable(&virt_spin_lock_key);
|
|
|
|
}
|
|
|
|
|
2019-04-25 19:03:31 +08:00
|
|
|
unsigned paravirt_patch_default(u8 type, void *insn_buff,
|
2007-08-11 04:31:03 +08:00
|
|
|
unsigned long addr, unsigned len)
|
2007-05-03 01:27:14 +08:00
|
|
|
{
|
2018-08-28 15:40:19 +08:00
|
|
|
/*
|
|
|
|
* Neat trick to map patch type back to the call within the
|
|
|
|
* corresponding structure.
|
|
|
|
*/
|
|
|
|
void *opfunc = *((void **)&pv_ops + type);
|
2007-05-03 01:27:14 +08:00
|
|
|
unsigned ret;
|
|
|
|
|
|
|
|
if (opfunc == NULL)
|
|
|
|
/* If there's no function, patch it with a ud2a (BUG) */
|
2019-04-25 19:03:31 +08:00
|
|
|
ret = paravirt_patch_insns(insn_buff, len, ud2a, ud2a+sizeof(ud2a));
|
2009-01-29 06:35:02 +08:00
|
|
|
else if (opfunc == _paravirt_nop)
|
2015-11-03 17:18:49 +08:00
|
|
|
ret = 0;
|
2009-01-29 06:35:02 +08:00
|
|
|
|
2018-10-30 14:33:01 +08:00
|
|
|
#ifdef CONFIG_PARAVIRT_XXL
|
2009-01-29 06:35:02 +08:00
|
|
|
/* identity functions just return their single argument */
|
|
|
|
else if (opfunc == _paravirt_ident_64)
|
2019-04-25 19:03:31 +08:00
|
|
|
ret = paravirt_patch_ident_64(insn_buff, len);
|
2009-01-29 06:35:02 +08:00
|
|
|
|
2018-08-28 15:40:19 +08:00
|
|
|
else if (type == PARAVIRT_PATCH(cpu.iret) ||
|
|
|
|
type == PARAVIRT_PATCH(cpu.usergs_sysret64))
|
2007-05-03 01:27:14 +08:00
|
|
|
/* If operation requires a jmp, then jmp */
|
2019-04-25 19:03:31 +08:00
|
|
|
ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len);
|
2018-08-28 15:40:23 +08:00
|
|
|
#endif
|
2007-05-03 01:27:14 +08:00
|
|
|
else
|
2018-08-28 15:40:17 +08:00
|
|
|
/* Otherwise call the function. */
|
2019-04-25 19:03:31 +08:00
|
|
|
ret = paravirt_patch_call(insn_buff, opfunc, addr, len);
|
2007-05-03 01:27:14 +08:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2019-04-25 19:03:31 +08:00
|
|
|
unsigned paravirt_patch_insns(void *insn_buff, unsigned len,
|
2007-05-03 01:27:14 +08:00
|
|
|
const char *start, const char *end)
|
|
|
|
{
|
|
|
|
unsigned insn_len = end - start;
|
2006-12-07 09:14:08 +08:00
|
|
|
|
x86/paravirt: Detect over-sized patching bugs in paravirt_patch_insns()
So paravirt_patch_insns() contains this gem of logic:
unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
const char *start, const char *end)
{
unsigned insn_len = end - start;
if (insn_len > len || start == NULL)
insn_len = len;
else
memcpy(insnbuf, start, insn_len);
return insn_len;
}
Note how 'len' (size of the original instruction) is checked against the new
instruction, and silently discarded with no warning printed whatsoever.
This crashes the kernel in funny ways if the patching template is buggy,
and usually in much later places.
Instead do a direct BUG_ON(), there's no way to continue successfully at that point.
I've tested this patch, with the vanilla kernel check never triggers, and
if I intentionally increase the size of one of the patch templates to a
too high value the assert triggers:
[ 0.164385] kernel BUG at arch/x86/kernel/paravirt.c:167!
Without this patch a broken kernel randomly crashes in later places,
after the silent patching failure.
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20190425091717.GA72229@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2019-04-25 17:17:17 +08:00
|
|
|
/* Alternative instruction is too large for the patch site and we cannot continue: */
|
|
|
|
BUG_ON(insn_len > len || start == NULL);
|
|
|
|
|
2019-04-25 19:03:31 +08:00
|
|
|
memcpy(insn_buff, start, insn_len);
|
2006-12-07 09:14:08 +08:00
|
|
|
|
|
|
|
return insn_len;
|
|
|
|
}
|
|
|
|
|
2007-02-13 20:26:25 +08:00
|
|
|
static void native_flush_tlb(void)
|
2006-12-07 09:14:08 +08:00
|
|
|
{
|
|
|
|
__native_flush_tlb();
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Global pages have to be flushed a bit differently. Not a real
|
|
|
|
* performance problem because this does not happen often.
|
|
|
|
*/
|
2007-02-13 20:26:25 +08:00
|
|
|
static void native_flush_tlb_global(void)
|
2006-12-07 09:14:08 +08:00
|
|
|
{
|
|
|
|
__native_flush_tlb_global();
|
|
|
|
}
|
|
|
|
|
2018-02-01 00:03:10 +08:00
|
|
|
static void native_flush_tlb_one_user(unsigned long addr)
|
2006-12-07 09:14:08 +08:00
|
|
|
{
|
2018-02-01 00:03:10 +08:00
|
|
|
__native_flush_tlb_one_user(addr);
|
2006-12-07 09:14:08 +08:00
|
|
|
}
|
|
|
|
|
2012-02-24 15:31:31 +08:00
|
|
|
struct static_key paravirt_steal_enabled;
|
|
|
|
struct static_key paravirt_steal_rq_enabled;
|
2011-07-12 03:28:15 +08:00
|
|
|
|
|
|
|
static u64 native_steal_clock(int cpu)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2006-12-07 09:14:07 +08:00
|
|
|
/* These are in entry.S */
|
2007-02-13 20:26:25 +08:00
|
|
|
extern void native_iret(void);
|
2008-06-25 12:19:28 +08:00
|
|
|
extern void native_usergs_sysret64(void);
|
2006-12-07 09:14:07 +08:00
|
|
|
|
2007-07-18 09:37:04 +08:00
|
|
|
static struct resource reserve_ioports = {
|
|
|
|
.start = 0,
|
|
|
|
.end = IO_SPACE_LIMIT,
|
|
|
|
.name = "paravirt-ioport",
|
|
|
|
.flags = IORESOURCE_IO | IORESOURCE_BUSY,
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Reserve the whole legacy IO space to prevent any legacy drivers
|
|
|
|
* from wasting time probing for their hardware. This is a fairly
|
|
|
|
* brute-force approach to disabling all non-virtual drivers.
|
|
|
|
*
|
|
|
|
* Note that this must be called very early to have any effect.
|
|
|
|
*/
|
|
|
|
int paravirt_disable_iospace(void)
|
|
|
|
{
|
2008-03-28 08:28:40 +08:00
|
|
|
return request_resource(&ioport_resource, &reserve_ioports);
|
2007-07-18 09:37:04 +08:00
|
|
|
}
|
|
|
|
|
paravirt: clean up lazy mode handling
Currently, the set_lazy_mode pv_op is overloaded with 5 functions:
1. enter lazy cpu mode
2. leave lazy cpu mode
3. enter lazy mmu mode
4. leave lazy mmu mode
5. flush pending batched operations
This complicates each paravirt backend, since it needs to deal with
all the possible state transitions, handling flushing, etc. In
particular, flushing is quite distinct from the other 4 functions, and
seems to just cause complication.
This patch removes the set_lazy_mode operation, and adds "enter" and
"leave" lazy mode operations on mmu_ops and cpu_ops. All the logic
associated with enter and leaving lazy states is now in common code
(basically BUG_ONs to make sure that no mode is current when entering
a lazy mode, and make sure that the mode is current when leaving).
Also, flush is handled in a common way, by simply leaving and
re-entering the lazy mode.
The result is that the Xen, lguest and VMI lazy mode implementations
are much simpler.
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Zach Amsden <zach@vmware.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Anthony Liguory <aliguori@us.ibm.com>
Cc: "Glauber de Oliveira Costa" <glommer@gmail.com>
Cc: Jun Nakajima <jun.nakajima@intel.com>
2007-10-17 02:51:29 +08:00
|
|
|
static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
|
|
|
|
|
|
|
|
static inline void enter_lazy(enum paravirt_lazy_mode mode)
|
|
|
|
{
|
2012-05-11 15:35:27 +08:00
|
|
|
BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
|
paravirt: clean up lazy mode handling
Currently, the set_lazy_mode pv_op is overloaded with 5 functions:
1. enter lazy cpu mode
2. leave lazy cpu mode
3. enter lazy mmu mode
4. leave lazy mmu mode
5. flush pending batched operations
This complicates each paravirt backend, since it needs to deal with
all the possible state transitions, handling flushing, etc. In
particular, flushing is quite distinct from the other 4 functions, and
seems to just cause complication.
This patch removes the set_lazy_mode operation, and adds "enter" and
"leave" lazy mode operations on mmu_ops and cpu_ops. All the logic
associated with enter and leaving lazy states is now in common code
(basically BUG_ONs to make sure that no mode is current when entering
a lazy mode, and make sure that the mode is current when leaving).
Also, flush is handled in a common way, by simply leaving and
re-entering the lazy mode.
The result is that the Xen, lguest and VMI lazy mode implementations
are much simpler.
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Zach Amsden <zach@vmware.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Anthony Liguory <aliguori@us.ibm.com>
Cc: "Glauber de Oliveira Costa" <glommer@gmail.com>
Cc: Jun Nakajima <jun.nakajima@intel.com>
2007-10-17 02:51:29 +08:00
|
|
|
|
2012-05-11 15:35:27 +08:00
|
|
|
this_cpu_write(paravirt_lazy_mode, mode);
|
paravirt: clean up lazy mode handling
Currently, the set_lazy_mode pv_op is overloaded with 5 functions:
1. enter lazy cpu mode
2. leave lazy cpu mode
3. enter lazy mmu mode
4. leave lazy mmu mode
5. flush pending batched operations
This complicates each paravirt backend, since it needs to deal with
all the possible state transitions, handling flushing, etc. In
particular, flushing is quite distinct from the other 4 functions, and
seems to just cause complication.
This patch removes the set_lazy_mode operation, and adds "enter" and
"leave" lazy mode operations on mmu_ops and cpu_ops. All the logic
associated with enter and leaving lazy states is now in common code
(basically BUG_ONs to make sure that no mode is current when entering
a lazy mode, and make sure that the mode is current when leaving).
Also, flush is handled in a common way, by simply leaving and
re-entering the lazy mode.
The result is that the Xen, lguest and VMI lazy mode implementations
are much simpler.
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Zach Amsden <zach@vmware.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Anthony Liguory <aliguori@us.ibm.com>
Cc: "Glauber de Oliveira Costa" <glommer@gmail.com>
Cc: Jun Nakajima <jun.nakajima@intel.com>
2007-10-17 02:51:29 +08:00
|
|
|
}
|
|
|
|
|
2009-02-18 15:46:21 +08:00
|
|
|
static void leave_lazy(enum paravirt_lazy_mode mode)
|
paravirt: clean up lazy mode handling
Currently, the set_lazy_mode pv_op is overloaded with 5 functions:
1. enter lazy cpu mode
2. leave lazy cpu mode
3. enter lazy mmu mode
4. leave lazy mmu mode
5. flush pending batched operations
This complicates each paravirt backend, since it needs to deal with
all the possible state transitions, handling flushing, etc. In
particular, flushing is quite distinct from the other 4 functions, and
seems to just cause complication.
This patch removes the set_lazy_mode operation, and adds "enter" and
"leave" lazy mode operations on mmu_ops and cpu_ops. All the logic
associated with enter and leaving lazy states is now in common code
(basically BUG_ONs to make sure that no mode is current when entering
a lazy mode, and make sure that the mode is current when leaving).
Also, flush is handled in a common way, by simply leaving and
re-entering the lazy mode.
The result is that the Xen, lguest and VMI lazy mode implementations
are much simpler.
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Zach Amsden <zach@vmware.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Anthony Liguory <aliguori@us.ibm.com>
Cc: "Glauber de Oliveira Costa" <glommer@gmail.com>
Cc: Jun Nakajima <jun.nakajima@intel.com>
2007-10-17 02:51:29 +08:00
|
|
|
{
|
2012-05-11 15:35:27 +08:00
|
|
|
BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode);
|
paravirt: clean up lazy mode handling
Currently, the set_lazy_mode pv_op is overloaded with 5 functions:
1. enter lazy cpu mode
2. leave lazy cpu mode
3. enter lazy mmu mode
4. leave lazy mmu mode
5. flush pending batched operations
This complicates each paravirt backend, since it needs to deal with
all the possible state transitions, handling flushing, etc. In
particular, flushing is quite distinct from the other 4 functions, and
seems to just cause complication.
This patch removes the set_lazy_mode operation, and adds "enter" and
"leave" lazy mode operations on mmu_ops and cpu_ops. All the logic
associated with enter and leaving lazy states is now in common code
(basically BUG_ONs to make sure that no mode is current when entering
a lazy mode, and make sure that the mode is current when leaving).
Also, flush is handled in a common way, by simply leaving and
re-entering the lazy mode.
The result is that the Xen, lguest and VMI lazy mode implementations
are much simpler.
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Zach Amsden <zach@vmware.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Anthony Liguory <aliguori@us.ibm.com>
Cc: "Glauber de Oliveira Costa" <glommer@gmail.com>
Cc: Jun Nakajima <jun.nakajima@intel.com>
2007-10-17 02:51:29 +08:00
|
|
|
|
2012-05-11 15:35:27 +08:00
|
|
|
this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
|
paravirt: clean up lazy mode handling
Currently, the set_lazy_mode pv_op is overloaded with 5 functions:
1. enter lazy cpu mode
2. leave lazy cpu mode
3. enter lazy mmu mode
4. leave lazy mmu mode
5. flush pending batched operations
This complicates each paravirt backend, since it needs to deal with
all the possible state transitions, handling flushing, etc. In
particular, flushing is quite distinct from the other 4 functions, and
seems to just cause complication.
This patch removes the set_lazy_mode operation, and adds "enter" and
"leave" lazy mode operations on mmu_ops and cpu_ops. All the logic
associated with enter and leaving lazy states is now in common code
(basically BUG_ONs to make sure that no mode is current when entering
a lazy mode, and make sure that the mode is current when leaving).
Also, flush is handled in a common way, by simply leaving and
re-entering the lazy mode.
The result is that the Xen, lguest and VMI lazy mode implementations
are much simpler.
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Zach Amsden <zach@vmware.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Anthony Liguory <aliguori@us.ibm.com>
Cc: "Glauber de Oliveira Costa" <glommer@gmail.com>
Cc: Jun Nakajima <jun.nakajima@intel.com>
2007-10-17 02:51:29 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void paravirt_enter_lazy_mmu(void)
|
|
|
|
{
|
|
|
|
enter_lazy(PARAVIRT_LAZY_MMU);
|
|
|
|
}
|
|
|
|
|
|
|
|
void paravirt_leave_lazy_mmu(void)
|
|
|
|
{
|
2009-02-18 15:46:21 +08:00
|
|
|
leave_lazy(PARAVIRT_LAZY_MMU);
|
paravirt: clean up lazy mode handling
Currently, the set_lazy_mode pv_op is overloaded with 5 functions:
1. enter lazy cpu mode
2. leave lazy cpu mode
3. enter lazy mmu mode
4. leave lazy mmu mode
5. flush pending batched operations
This complicates each paravirt backend, since it needs to deal with
all the possible state transitions, handling flushing, etc. In
particular, flushing is quite distinct from the other 4 functions, and
seems to just cause complication.
This patch removes the set_lazy_mode operation, and adds "enter" and
"leave" lazy mode operations on mmu_ops and cpu_ops. All the logic
associated with enter and leaving lazy states is now in common code
(basically BUG_ONs to make sure that no mode is current when entering
a lazy mode, and make sure that the mode is current when leaving).
Also, flush is handled in a common way, by simply leaving and
re-entering the lazy mode.
The result is that the Xen, lguest and VMI lazy mode implementations
are much simpler.
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Zach Amsden <zach@vmware.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Anthony Liguory <aliguori@us.ibm.com>
Cc: "Glauber de Oliveira Costa" <glommer@gmail.com>
Cc: Jun Nakajima <jun.nakajima@intel.com>
2007-10-17 02:51:29 +08:00
|
|
|
}
|
|
|
|
|
2013-03-23 21:36:36 +08:00
|
|
|
void paravirt_flush_lazy_mmu(void)
|
|
|
|
{
|
|
|
|
preempt_disable();
|
|
|
|
|
|
|
|
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
|
|
|
|
arch_leave_lazy_mmu_mode();
|
|
|
|
arch_enter_lazy_mmu_mode();
|
|
|
|
}
|
|
|
|
|
|
|
|
preempt_enable();
|
|
|
|
}
|
|
|
|
|
2018-08-28 15:40:23 +08:00
|
|
|
#ifdef CONFIG_PARAVIRT_XXL
|
2009-02-19 03:18:57 +08:00
|
|
|
void paravirt_start_context_switch(struct task_struct *prev)
|
paravirt: clean up lazy mode handling
Currently, the set_lazy_mode pv_op is overloaded with 5 functions:
1. enter lazy cpu mode
2. leave lazy cpu mode
3. enter lazy mmu mode
4. leave lazy mmu mode
5. flush pending batched operations
This complicates each paravirt backend, since it needs to deal with
all the possible state transitions, handling flushing, etc. In
particular, flushing is quite distinct from the other 4 functions, and
seems to just cause complication.
This patch removes the set_lazy_mode operation, and adds "enter" and
"leave" lazy mode operations on mmu_ops and cpu_ops. All the logic
associated with enter and leaving lazy states is now in common code
(basically BUG_ONs to make sure that no mode is current when entering
a lazy mode, and make sure that the mode is current when leaving).
Also, flush is handled in a common way, by simply leaving and
re-entering the lazy mode.
The result is that the Xen, lguest and VMI lazy mode implementations
are much simpler.
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Zach Amsden <zach@vmware.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Anthony Liguory <aliguori@us.ibm.com>
Cc: "Glauber de Oliveira Costa" <glommer@gmail.com>
Cc: Jun Nakajima <jun.nakajima@intel.com>
2007-10-17 02:51:29 +08:00
|
|
|
{
|
2009-02-18 15:53:19 +08:00
|
|
|
BUG_ON(preemptible());
|
|
|
|
|
2012-05-11 15:35:27 +08:00
|
|
|
if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
|
2009-02-18 15:46:21 +08:00
|
|
|
arch_leave_lazy_mmu_mode();
|
2009-02-19 03:18:57 +08:00
|
|
|
set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
|
2009-02-18 15:46:21 +08:00
|
|
|
}
|
paravirt: clean up lazy mode handling
Currently, the set_lazy_mode pv_op is overloaded with 5 functions:
1. enter lazy cpu mode
2. leave lazy cpu mode
3. enter lazy mmu mode
4. leave lazy mmu mode
5. flush pending batched operations
This complicates each paravirt backend, since it needs to deal with
all the possible state transitions, handling flushing, etc. In
particular, flushing is quite distinct from the other 4 functions, and
seems to just cause complication.
This patch removes the set_lazy_mode operation, and adds "enter" and
"leave" lazy mode operations on mmu_ops and cpu_ops. All the logic
associated with enter and leaving lazy states is now in common code
(basically BUG_ONs to make sure that no mode is current when entering
a lazy mode, and make sure that the mode is current when leaving).
Also, flush is handled in a common way, by simply leaving and
re-entering the lazy mode.
The result is that the Xen, lguest and VMI lazy mode implementations
are much simpler.
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Zach Amsden <zach@vmware.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Anthony Liguory <aliguori@us.ibm.com>
Cc: "Glauber de Oliveira Costa" <glommer@gmail.com>
Cc: Jun Nakajima <jun.nakajima@intel.com>
2007-10-17 02:51:29 +08:00
|
|
|
enter_lazy(PARAVIRT_LAZY_CPU);
|
|
|
|
}
|
|
|
|
|
2009-02-19 03:18:57 +08:00
|
|
|
void paravirt_end_context_switch(struct task_struct *next)
|
paravirt: clean up lazy mode handling
Currently, the set_lazy_mode pv_op is overloaded with 5 functions:
1. enter lazy cpu mode
2. leave lazy cpu mode
3. enter lazy mmu mode
4. leave lazy mmu mode
5. flush pending batched operations
This complicates each paravirt backend, since it needs to deal with
all the possible state transitions, handling flushing, etc. In
particular, flushing is quite distinct from the other 4 functions, and
seems to just cause complication.
This patch removes the set_lazy_mode operation, and adds "enter" and
"leave" lazy mode operations on mmu_ops and cpu_ops. All the logic
associated with enter and leaving lazy states is now in common code
(basically BUG_ONs to make sure that no mode is current when entering
a lazy mode, and make sure that the mode is current when leaving).
Also, flush is handled in a common way, by simply leaving and
re-entering the lazy mode.
The result is that the Xen, lguest and VMI lazy mode implementations
are much simpler.
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Zach Amsden <zach@vmware.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Anthony Liguory <aliguori@us.ibm.com>
Cc: "Glauber de Oliveira Costa" <glommer@gmail.com>
Cc: Jun Nakajima <jun.nakajima@intel.com>
2007-10-17 02:51:29 +08:00
|
|
|
{
|
2009-02-18 15:53:19 +08:00
|
|
|
BUG_ON(preemptible());
|
|
|
|
|
2009-02-18 15:46:21 +08:00
|
|
|
leave_lazy(PARAVIRT_LAZY_CPU);
|
|
|
|
|
2009-02-19 03:18:57 +08:00
|
|
|
if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
|
2009-02-18 15:46:21 +08:00
|
|
|
arch_enter_lazy_mmu_mode();
|
paravirt: clean up lazy mode handling
Currently, the set_lazy_mode pv_op is overloaded with 5 functions:
1. enter lazy cpu mode
2. leave lazy cpu mode
3. enter lazy mmu mode
4. leave lazy mmu mode
5. flush pending batched operations
This complicates each paravirt backend, since it needs to deal with
all the possible state transitions, handling flushing, etc. In
particular, flushing is quite distinct from the other 4 functions, and
seems to just cause complication.
This patch removes the set_lazy_mode operation, and adds "enter" and
"leave" lazy mode operations on mmu_ops and cpu_ops. All the logic
associated with enter and leaving lazy states is now in common code
(basically BUG_ONs to make sure that no mode is current when entering
a lazy mode, and make sure that the mode is current when leaving).
Also, flush is handled in a common way, by simply leaving and
re-entering the lazy mode.
The result is that the Xen, lguest and VMI lazy mode implementations
are much simpler.
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Zach Amsden <zach@vmware.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Anthony Liguory <aliguori@us.ibm.com>
Cc: "Glauber de Oliveira Costa" <glommer@gmail.com>
Cc: Jun Nakajima <jun.nakajima@intel.com>
2007-10-17 02:51:29 +08:00
|
|
|
}
|
2018-08-28 15:40:23 +08:00
|
|
|
#endif
|
paravirt: clean up lazy mode handling
Currently, the set_lazy_mode pv_op is overloaded with 5 functions:
1. enter lazy cpu mode
2. leave lazy cpu mode
3. enter lazy mmu mode
4. leave lazy mmu mode
5. flush pending batched operations
This complicates each paravirt backend, since it needs to deal with
all the possible state transitions, handling flushing, etc. In
particular, flushing is quite distinct from the other 4 functions, and
seems to just cause complication.
This patch removes the set_lazy_mode operation, and adds "enter" and
"leave" lazy mode operations on mmu_ops and cpu_ops. All the logic
associated with enter and leaving lazy states is now in common code
(basically BUG_ONs to make sure that no mode is current when entering
a lazy mode, and make sure that the mode is current when leaving).
Also, flush is handled in a common way, by simply leaving and
re-entering the lazy mode.
The result is that the Xen, lguest and VMI lazy mode implementations
are much simpler.
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Zach Amsden <zach@vmware.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Anthony Liguory <aliguori@us.ibm.com>
Cc: "Glauber de Oliveira Costa" <glommer@gmail.com>
Cc: Jun Nakajima <jun.nakajima@intel.com>
2007-10-17 02:51:29 +08:00
|
|
|
|
|
|
|
enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
|
|
|
|
{
|
2009-02-18 15:05:19 +08:00
|
|
|
if (in_interrupt())
|
|
|
|
return PARAVIRT_LAZY_NONE;
|
|
|
|
|
2012-05-11 15:35:27 +08:00
|
|
|
return this_cpu_read(paravirt_lazy_mode);
|
paravirt: clean up lazy mode handling
Currently, the set_lazy_mode pv_op is overloaded with 5 functions:
1. enter lazy cpu mode
2. leave lazy cpu mode
3. enter lazy mmu mode
4. leave lazy mmu mode
5. flush pending batched operations
This complicates each paravirt backend, since it needs to deal with
all the possible state transitions, handling flushing, etc. In
particular, flushing is quite distinct from the other 4 functions, and
seems to just cause complication.
This patch removes the set_lazy_mode operation, and adds "enter" and
"leave" lazy mode operations on mmu_ops and cpu_ops. All the logic
associated with enter and leaving lazy states is now in common code
(basically BUG_ONs to make sure that no mode is current when entering
a lazy mode, and make sure that the mode is current when leaving).
Also, flush is handled in a common way, by simply leaving and
re-entering the lazy mode.
The result is that the Xen, lguest and VMI lazy mode implementations
are much simpler.
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Zach Amsden <zach@vmware.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Anthony Liguory <aliguori@us.ibm.com>
Cc: "Glauber de Oliveira Costa" <glommer@gmail.com>
Cc: Jun Nakajima <jun.nakajima@intel.com>
2007-10-17 02:51:29 +08:00
|
|
|
}
|
|
|
|
|
2007-10-17 02:51:29 +08:00
|
|
|
struct pv_info pv_info = {
|
2006-12-07 09:14:07 +08:00
|
|
|
.name = "bare hardware",
|
2018-08-28 15:40:22 +08:00
|
|
|
#ifdef CONFIG_PARAVIRT_XXL
|
2006-12-07 09:14:07 +08:00
|
|
|
.kernel_rpl = 0,
|
[PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing
Normally when running in PAE mode, the 4th PMD maps the kernel address space,
which can be shared among all processes (since they all need the same kernel
mappings).
Xen, however, does not allow guests to have the kernel pmd shared between page
tables, so parameterize pgtable.c to allow both modes of operation.
There are several side-effects of this. One is that vmalloc will update the
kernel address space mappings, and those updates need to be propagated into
all processes if the kernel mappings are not intrinsically shared. In the
non-PAE case, this is done by maintaining a pgd_list of all processes; this
list is used when all process pagetables must be updated. pgd_list is
threaded via otherwise unused entries in the page structure for the pgd, which
means that the pgd must be page-sized for this to work.
Normally the PAE pgd is only 4x64 byte entries large, but Xen requires the PAE
pgd to page aligned anyway, so this patch forces the pgd to be page
aligned+sized when the kernel pmd is unshared, to accomodate both these
requirements.
Also, since there may be several distinct kernel pmds (if the user/kernel
split is below 3G), there's no point in allocating them from a slab cache;
they're just allocated with get_free_page and initialized appropriately. (Of
course the could be cached if there is just a single kernel pmd - which is the
default with a 3G user/kernel split - but it doesn't seem worthwhile to add
yet another case into this code).
[ Many thanks to wli for review comments. ]
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-03 01:27:13 +08:00
|
|
|
.shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
|
2011-08-03 21:31:53 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_X86_64
|
|
|
|
.extra_user_64bit_cs = __USER_CS,
|
|
|
|
#endif
|
2008-01-30 20:33:19 +08:00
|
|
|
#endif
|
2007-10-17 02:51:29 +08:00
|
|
|
};
|
2006-12-07 09:14:07 +08:00
|
|
|
|
2009-01-29 06:35:02 +08:00
|
|
|
/* 64-bit pagetable entries */
|
2009-01-29 06:35:07 +08:00
|
|
|
#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
|
2009-01-29 06:35:02 +08:00
|
|
|
|
2018-08-28 15:40:19 +08:00
|
|
|
struct paravirt_patch_template pv_ops = {
|
|
|
|
/* Init ops. */
|
|
|
|
.init.patch = native_patch,
|
[PATCH] i386: PARAVIRT: Hooks to set up initial pagetable
This patch introduces paravirt_ops hooks to control how the kernel's
initial pagetable is set up.
In the case of a native boot, the very early bootstrap code creates a
simple non-PAE pagetable to map the kernel and physical memory. When
the VM subsystem is initialized, it creates a proper pagetable which
respects the PAE mode, large pages, etc.
When booting under a hypervisor, there are many possibilities for what
paging environment the hypervisor establishes for the guest kernel, so
the constructon of the kernel's pagetable depends on the hypervisor.
In the case of Xen, the hypervisor boots the kernel with a fully
constructed pagetable, which is already using PAE if necessary. Also,
Xen requires particular care when constructing pagetables to make sure
all pagetables are always mapped read-only.
In order to make this easier, kernel's initial pagetable construction
has been changed to only allocate and initialize a pagetable page if
there's no page already present in the pagetable. This allows the Xen
paravirt backend to make a copy of the hypervisor-provided pagetable,
allowing the kernel to establish any more mappings it needs while
keeping the existing ones.
A slightly subtle point which is worth highlighting here is that Xen
requires all kernel mappings to share the same pte_t pages between all
pagetables, so that updating a kernel page's mapping in one pagetable
is reflected in all other pagetables. This makes it possible to
allocate a page and attach it to a pagetable without having to
explicitly enumerate that page's mapping in all pagetables.
And:
+From: "Eric W. Biederman" <ebiederm@xmission.com>
If we don't set the leaf page table entries it is quite possible that
will inherit and incorrect page table entry from the initial boot
page table setup in head.S. So we need to redo the effort here,
so we pick up PSE, PGE and the like.
Hypervisors like Xen require that their page tables be read-only,
which is slightly incompatible with our low identity mappings, however
I discussed this with Jeremy he has modified the Xen early set_pte
function to avoid problems in this area.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Acked-by: William Irwin <bill.irwin@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
2007-05-03 01:27:13 +08:00
|
|
|
|
2018-08-28 15:40:19 +08:00
|
|
|
/* Time ops. */
|
|
|
|
.time.sched_clock = native_sched_clock,
|
|
|
|
.time.steal_clock = native_steal_clock,
|
2007-10-17 02:51:29 +08:00
|
|
|
|
2018-08-28 15:40:19 +08:00
|
|
|
/* Cpu ops. */
|
2018-08-28 15:40:23 +08:00
|
|
|
.cpu.io_delay = native_io_delay,
|
2006-12-07 09:14:08 +08:00
|
|
|
|
2018-08-28 15:40:23 +08:00
|
|
|
#ifdef CONFIG_PARAVIRT_XXL
|
2018-08-28 15:40:19 +08:00
|
|
|
.cpu.cpuid = native_cpuid,
|
|
|
|
.cpu.get_debugreg = native_get_debugreg,
|
|
|
|
.cpu.set_debugreg = native_set_debugreg,
|
|
|
|
.cpu.read_cr0 = native_read_cr0,
|
|
|
|
.cpu.write_cr0 = native_write_cr0,
|
|
|
|
.cpu.write_cr4 = native_write_cr4,
|
|
|
|
.cpu.wbinvd = native_wbinvd,
|
|
|
|
.cpu.read_msr = native_read_msr,
|
|
|
|
.cpu.write_msr = native_write_msr,
|
|
|
|
.cpu.read_msr_safe = native_read_msr_safe,
|
|
|
|
.cpu.write_msr_safe = native_write_msr_safe,
|
|
|
|
.cpu.read_pmc = native_read_pmc,
|
|
|
|
.cpu.load_tr_desc = native_load_tr_desc,
|
|
|
|
.cpu.set_ldt = native_set_ldt,
|
|
|
|
.cpu.load_gdt = native_load_gdt,
|
|
|
|
.cpu.load_idt = native_load_idt,
|
|
|
|
.cpu.store_tr = native_store_tr,
|
|
|
|
.cpu.load_tls = native_load_tls,
|
|
|
|
#ifdef CONFIG_X86_64
|
|
|
|
.cpu.load_gs_index = native_load_gs_index,
|
|
|
|
#endif
|
|
|
|
.cpu.write_ldt_entry = native_write_ldt_entry,
|
|
|
|
.cpu.write_gdt_entry = native_write_gdt_entry,
|
|
|
|
.cpu.write_idt_entry = native_write_idt_entry,
|
2008-06-25 12:19:12 +08:00
|
|
|
|
2018-08-28 15:40:19 +08:00
|
|
|
.cpu.alloc_ldt = paravirt_nop,
|
|
|
|
.cpu.free_ldt = paravirt_nop,
|
2007-02-13 20:26:21 +08:00
|
|
|
|
2018-08-28 15:40:19 +08:00
|
|
|
.cpu.load_sp0 = native_load_sp0,
|
2007-05-03 01:27:13 +08:00
|
|
|
|
2018-08-28 15:40:19 +08:00
|
|
|
#ifdef CONFIG_X86_64
|
|
|
|
.cpu.usergs_sysret64 = native_usergs_sysret64,
|
|
|
|
#endif
|
|
|
|
.cpu.iret = native_iret,
|
|
|
|
.cpu.swapgs = native_swapgs,
|
|
|
|
|
2020-02-18 23:47:12 +08:00
|
|
|
#ifdef CONFIG_X86_IOPL_IOPERM
|
|
|
|
.cpu.update_io_bitmap = native_tss_update_io_bitmap,
|
|
|
|
#endif
|
|
|
|
|
2018-08-28 15:40:19 +08:00
|
|
|
.cpu.start_context_switch = paravirt_nop,
|
|
|
|
.cpu.end_context_switch = paravirt_nop,
|
|
|
|
|
|
|
|
/* Irq ops. */
|
|
|
|
.irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
|
|
|
|
.irq.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
|
|
|
|
.irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
|
|
|
|
.irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
|
|
|
|
.irq.safe_halt = native_safe_halt,
|
|
|
|
.irq.halt = native_halt,
|
2018-08-28 15:40:24 +08:00
|
|
|
#endif /* CONFIG_PARAVIRT_XXL */
|
2018-08-28 15:40:19 +08:00
|
|
|
|
|
|
|
/* Mmu ops. */
|
|
|
|
.mmu.flush_tlb_user = native_flush_tlb,
|
|
|
|
.mmu.flush_tlb_kernel = native_flush_tlb_global,
|
|
|
|
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
|
|
|
|
.mmu.flush_tlb_others = native_flush_tlb_others,
|
|
|
|
.mmu.tlb_remove_table =
|
|
|
|
(void (*)(struct mmu_gather *, void *))tlb_remove_page,
|
|
|
|
|
2018-08-28 15:40:25 +08:00
|
|
|
.mmu.exit_mmap = paravirt_nop,
|
|
|
|
|
|
|
|
#ifdef CONFIG_PARAVIRT_XXL
|
2019-07-11 19:40:55 +08:00
|
|
|
.mmu.read_cr2 = __PV_IS_CALLEE_SAVE(native_read_cr2),
|
2018-08-28 15:40:25 +08:00
|
|
|
.mmu.write_cr2 = native_write_cr2,
|
|
|
|
.mmu.read_cr3 = __native_read_cr3,
|
|
|
|
.mmu.write_cr3 = native_write_cr3,
|
|
|
|
|
2018-08-28 15:40:19 +08:00
|
|
|
.mmu.pgd_alloc = __paravirt_pgd_alloc,
|
|
|
|
.mmu.pgd_free = paravirt_nop,
|
|
|
|
|
|
|
|
.mmu.alloc_pte = paravirt_nop,
|
|
|
|
.mmu.alloc_pmd = paravirt_nop,
|
|
|
|
.mmu.alloc_pud = paravirt_nop,
|
|
|
|
.mmu.alloc_p4d = paravirt_nop,
|
|
|
|
.mmu.release_pte = paravirt_nop,
|
|
|
|
.mmu.release_pmd = paravirt_nop,
|
|
|
|
.mmu.release_pud = paravirt_nop,
|
|
|
|
.mmu.release_p4d = paravirt_nop,
|
|
|
|
|
|
|
|
.mmu.set_pte = native_set_pte,
|
|
|
|
.mmu.set_pte_at = native_set_pte_at,
|
|
|
|
.mmu.set_pmd = native_set_pmd,
|
|
|
|
|
|
|
|
.mmu.ptep_modify_prot_start = __ptep_modify_prot_start,
|
|
|
|
.mmu.ptep_modify_prot_commit = __ptep_modify_prot_commit,
|
2008-06-16 19:30:01 +08:00
|
|
|
|
2015-04-15 06:46:14 +08:00
|
|
|
#if CONFIG_PGTABLE_LEVELS >= 3
|
2006-12-07 09:14:08 +08:00
|
|
|
#ifdef CONFIG_X86_PAE
|
2018-08-28 15:40:19 +08:00
|
|
|
.mmu.set_pte_atomic = native_set_pte_atomic,
|
|
|
|
.mmu.pte_clear = native_pte_clear,
|
|
|
|
.mmu.pmd_clear = native_pmd_clear,
|
2008-01-30 20:33:20 +08:00
|
|
|
#endif
|
2018-08-28 15:40:19 +08:00
|
|
|
.mmu.set_pud = native_set_pud,
|
2009-01-29 06:35:07 +08:00
|
|
|
|
2018-08-28 15:40:19 +08:00
|
|
|
.mmu.pmd_val = PTE_IDENT,
|
|
|
|
.mmu.make_pmd = PTE_IDENT,
|
2008-01-30 20:33:20 +08:00
|
|
|
|
2017-03-18 02:55:15 +08:00
|
|
|
#if CONFIG_PGTABLE_LEVELS >= 4
|
2018-08-28 15:40:19 +08:00
|
|
|
.mmu.pud_val = PTE_IDENT,
|
|
|
|
.mmu.make_pud = PTE_IDENT,
|
2009-01-29 06:35:07 +08:00
|
|
|
|
2018-08-28 15:40:19 +08:00
|
|
|
.mmu.set_p4d = native_set_p4d,
|
2017-03-18 02:55:15 +08:00
|
|
|
|
|
|
|
#if CONFIG_PGTABLE_LEVELS >= 5
|
2018-08-28 15:40:19 +08:00
|
|
|
.mmu.p4d_val = PTE_IDENT,
|
|
|
|
.mmu.make_p4d = PTE_IDENT,
|
2017-03-30 16:07:28 +08:00
|
|
|
|
2018-08-28 15:40:19 +08:00
|
|
|
.mmu.set_pgd = native_set_pgd,
|
2017-03-30 16:07:28 +08:00
|
|
|
#endif /* CONFIG_PGTABLE_LEVELS >= 5 */
|
2017-03-18 02:55:15 +08:00
|
|
|
#endif /* CONFIG_PGTABLE_LEVELS >= 4 */
|
2015-04-15 06:46:14 +08:00
|
|
|
#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
|
2006-12-07 09:14:08 +08:00
|
|
|
|
2018-08-28 15:40:19 +08:00
|
|
|
.mmu.pte_val = PTE_IDENT,
|
|
|
|
.mmu.pgd_val = PTE_IDENT,
|
2007-05-03 01:27:13 +08:00
|
|
|
|
2018-08-28 15:40:19 +08:00
|
|
|
.mmu.make_pte = PTE_IDENT,
|
|
|
|
.mmu.make_pgd = PTE_IDENT,
|
2007-05-03 01:27:13 +08:00
|
|
|
|
2018-08-28 15:40:19 +08:00
|
|
|
.mmu.dup_mmap = paravirt_nop,
|
|
|
|
.mmu.activate_mm = paravirt_nop,
|
paravirt: clean up lazy mode handling
Currently, the set_lazy_mode pv_op is overloaded with 5 functions:
1. enter lazy cpu mode
2. leave lazy cpu mode
3. enter lazy mmu mode
4. leave lazy mmu mode
5. flush pending batched operations
This complicates each paravirt backend, since it needs to deal with
all the possible state transitions, handling flushing, etc. In
particular, flushing is quite distinct from the other 4 functions, and
seems to just cause complication.
This patch removes the set_lazy_mode operation, and adds "enter" and
"leave" lazy mode operations on mmu_ops and cpu_ops. All the logic
associated with enter and leaving lazy states is now in common code
(basically BUG_ONs to make sure that no mode is current when entering
a lazy mode, and make sure that the mode is current when leaving).
Also, flush is handled in a common way, by simply leaving and
re-entering the lazy mode.
The result is that the Xen, lguest and VMI lazy mode implementations
are much simpler.
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Zach Amsden <zach@vmware.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Anthony Liguory <aliguori@us.ibm.com>
Cc: "Glauber de Oliveira Costa" <glommer@gmail.com>
Cc: Jun Nakajima <jun.nakajima@intel.com>
2007-10-17 02:51:29 +08:00
|
|
|
|
2018-08-28 15:40:19 +08:00
|
|
|
.mmu.lazy_mode = {
|
|
|
|
.enter = paravirt_nop,
|
|
|
|
.leave = paravirt_nop,
|
|
|
|
.flush = paravirt_nop,
|
paravirt: clean up lazy mode handling
Currently, the set_lazy_mode pv_op is overloaded with 5 functions:
1. enter lazy cpu mode
2. leave lazy cpu mode
3. enter lazy mmu mode
4. leave lazy mmu mode
5. flush pending batched operations
This complicates each paravirt backend, since it needs to deal with
all the possible state transitions, handling flushing, etc. In
particular, flushing is quite distinct from the other 4 functions, and
seems to just cause complication.
This patch removes the set_lazy_mode operation, and adds "enter" and
"leave" lazy mode operations on mmu_ops and cpu_ops. All the logic
associated with enter and leaving lazy states is now in common code
(basically BUG_ONs to make sure that no mode is current when entering
a lazy mode, and make sure that the mode is current when leaving).
Also, flush is handled in a common way, by simply leaving and
re-entering the lazy mode.
The result is that the Xen, lguest and VMI lazy mode implementations
are much simpler.
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Zach Amsden <zach@vmware.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Anthony Liguory <aliguori@us.ibm.com>
Cc: "Glauber de Oliveira Costa" <glommer@gmail.com>
Cc: Jun Nakajima <jun.nakajima@intel.com>
2007-10-17 02:51:29 +08:00
|
|
|
},
|
2008-06-18 02:42:01 +08:00
|
|
|
|
2018-08-28 15:40:19 +08:00
|
|
|
.mmu.set_fixmap = native_set_fixmap,
|
2018-08-28 15:40:25 +08:00
|
|
|
#endif /* CONFIG_PARAVIRT_XXL */
|
2018-08-28 15:40:19 +08:00
|
|
|
|
|
|
|
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
|
|
|
|
/* Lock ops. */
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
.lock.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
|
|
|
|
.lock.queued_spin_unlock =
|
|
|
|
PV_CALLEE_SAVE(__native_queued_spin_unlock),
|
|
|
|
.lock.wait = paravirt_nop,
|
|
|
|
.lock.kick = paravirt_nop,
|
|
|
|
.lock.vcpu_is_preempted =
|
|
|
|
PV_CALLEE_SAVE(__native_vcpu_is_preempted),
|
|
|
|
#endif /* SMP */
|
|
|
|
#endif
|
2006-12-07 09:14:07 +08:00
|
|
|
};
|
2007-01-23 12:40:36 +08:00
|
|
|
|
2018-08-28 15:40:23 +08:00
|
|
|
#ifdef CONFIG_PARAVIRT_XXL
|
2018-08-28 15:40:19 +08:00
|
|
|
/* At this point, native_get/set_debugreg has real function entries */
|
|
|
|
NOKPROBE_SYMBOL(native_get_debugreg);
|
|
|
|
NOKPROBE_SYMBOL(native_set_debugreg);
|
|
|
|
NOKPROBE_SYMBOL(native_load_idt);
|
2018-08-28 15:40:23 +08:00
|
|
|
#endif
|
2018-08-28 15:40:19 +08:00
|
|
|
|
2018-10-29 23:01:16 +08:00
|
|
|
EXPORT_SYMBOL(pv_ops);
|
2007-10-17 02:51:29 +08:00
|
|
|
EXPORT_SYMBOL_GPL(pv_info);
|