system/xen: Updated for version 4.13.0.
Signed-off-by: Mario Preksavec <mario@slackware.hr>
Signed-off-by: Willy Sudiarto Raharjo <willysr@slackbuilds.org>
parent bddbdfdccb
commit 08a409483c
@@ -7,7 +7,8 @@ Solaris, and various versions of the BSD operating systems.
This script has a few optional dependencies:

mbootpack - creates LILO compatible kernel images
-libssh2 - mostly used by libvirt, enable with USE_LIBSSH2=yes
+libssh - mostly used by libvirt, enable with USE_LIBSSH=yes
+  (previously known as USE_LIBSSH2)
ocaml-findlib - autodetected, builds oxenstored binary
spice - enable with USE_SPICE=yes

@@ -15,6 +16,8 @@ Linking with the stock libraries:

bluez - enable with USE_BLUEZ=yes
gtk - enable with USE_GTK=yes
+audio - enable with USE_AUDIO=yes
+  (or a comma-delimited list: oss alsa sdl pa)

Reading material:

@@ -46,7 +46,7 @@ Xen EFI binary.

To make things a bit easier, a copy of Xen EFI binary can be found here:

-  http://slackware.hr/~mario/xen/xen-4.12.1.efi.gz
+  http://slackware.hr/~mario/xen/xen-4.13.0.efi.gz

If an automatic boot to Xen kernel is desired, the binary should be renamed and
copied to the following location: /boot/efi/EFI/BOOT/bootx64.efi

@@ -1,6 +1,6 @@
#
# Automatically generated file; DO NOT EDIT.
-# Linux/x86 4.4.202 Kernel Configuration
+# Linux/x86 4.4.217 Kernel Configuration
#
# CONFIG_64BIT is not set
CONFIG_X86_32=y
@@ -7113,6 +7113,7 @@ CONFIG_DOUBLEFAULT=y
# CONFIG_DEBUG_TLBFLUSH is not set
# CONFIG_IOMMU_STRESS is not set
CONFIG_HAVE_MMIOTRACE_SUPPORT=y
# CONFIG_X86_DECODER_SELFTEST is not set
CONFIG_IO_DELAY_TYPE_0X80=0
CONFIG_IO_DELAY_TYPE_0XED=1
CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -1,6 +1,6 @@
#
# Automatically generated file; DO NOT EDIT.
-# Linux/x86 4.4.202 Kernel Configuration
+# Linux/x86 4.4.217 Kernel Configuration
#
CONFIG_64BIT=y
CONFIG_X86_64=y
@@ -6925,6 +6925,7 @@ CONFIG_DOUBLEFAULT=y
# CONFIG_IOMMU_DEBUG is not set
# CONFIG_IOMMU_STRESS is not set
CONFIG_HAVE_MMIOTRACE_SUPPORT=y
# CONFIG_X86_DECODER_SELFTEST is not set
CONFIG_IO_DELAY_TYPE_0X80=0
CONFIG_IO_DELAY_TYPE_0XED=1
CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -5,8 +5,8 @@
# Written by Chris Abela <chris.abela@maltats.com>, 20100515
# Modified by Mario Preksavec <mario@slackware.hr>

-KERNEL=${KERNEL:-4.4.202}
-XEN=${XEN:-4.12.1}
+KERNEL=${KERNEL:-4.4.217}
+XEN=${XEN:-4.13.0}
BOOTLOADER=${BOOTLOADER:-lilo}

ROOTMOD=${ROOTMOD:-ext4}
@@ -7,7 +7,7 @@

set -e

-KERNEL=${KERNEL:-4.4.202}
+KERNEL=${KERNEL:-4.4.217}

# Build an image for the root file system and another for the swap
# Default values : 8GB and 500MB respectively.
@@ -2,7 +2,7 @@

# Slackware build script for xen

-# Copyright 2010, 2011, 2013, 2014, 2015, 2016, 2017, 2018, 2019 Mario Preksavec, Zagreb, Croatia
+# Copyright 2010, 2011, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020 Mario Preksavec, Zagreb, Croatia
# All rights reserved.
#
# Redistribution and use of this script, with or without modification, is
@@ -23,13 +23,13 @@
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

PRGNAM=xen
-VERSION=${VERSION:-4.12.1}
+VERSION=${VERSION:-4.13.0}
BUILD=${BUILD:-1}
TAG=${TAG:-_SBo}

-SEABIOS=${SEABIOS:-1.12.0}
-OVMF=${OVMF:-20180725_ef529e6ab7}
-IPXE=${IPXE:-d2063b7693e0e35db97b2264aa987eb6341ae779}
+SEABIOS=${SEABIOS:-1.12.1}
+OVMF=${OVMF:-20190606_20d2e5a125}
+IPXE=${IPXE:-1dd56dbd11082fb622c2ed21cfaced4f47d798a6}

if [ -z "$ARCH" ]; then
  case "$( uname -m )" in
@@ -90,9 +90,9 @@ EOF
  esac
esac

-case "${USE_LIBSSH2:-no}" in
-  yes) CONF_QEMUU+=" --enable-libssh2" ;;
-  *) CONF_QEMUU+=" --disable-libssh2" ;;
+case "${USE_LIBSSH:-no}" in
+  yes) CONF_QEMUU+=" --enable-libssh" ;;
+  *) CONF_QEMUU+=" --disable-libssh" ;;
esac

case "${USE_BLUEZ:-no}" in
@@ -110,6 +110,12 @@ case "${USE_SPICE:-no}" in
  *) CONF_QEMUU+=" --disable-spice" ;;
esac

+case "${USE_AUDIO:-no}" in
+  yes) CONF_QEMUU+="" ;;
+  no) CONF_QEMUU+=" --audio-drv-list=" ;;
+  *) CONF_QEMUU+=" --audio-drv-list=$USE_AUDIO" ;;
+esac
+
set -e

rm -rf $PKG
@@ -142,8 +148,8 @@ cp $CWD/ipxe-git-$IPXE.tar.gz tools/firmware/etherboot/_ipxe.tar.gz
(
  # Seabios
  cd tools/firmware
-  tar -xf $CWD/seabios-$SEABIOS.tar.?z
-  mv seabios-$SEABIOS seabios-dir-remote
+  tar -xf $CWD/seabios-$SEABIOS.tar.?z || tar -xf $CWD/seabios-rel-$SEABIOS.tar.?z
+  mv seabios-$SEABIOS seabios-dir-remote || mv seabios-rel-$SEABIOS seabios-dir-remote
  ln -s seabios-dir-remote seabios-dir
  make -C seabios-dir defconfig
  # OVMF
@@ -1,8 +1,8 @@
PRGNAM="xen"
-VERSION="4.12.1"
+VERSION="4.13.0"
HOMEPAGE="http://www.xenproject.org/"
-DOWNLOAD="http://mirror.slackware.hr/sources/xen/xen-4.12.1.tar.gz \
-          http://mirror.slackware.hr/sources/xen-extfiles/ipxe-git-d2063b7693e0e35db97b2264aa987eb6341ae779.tar.gz \
+DOWNLOAD="http://mirror.slackware.hr/sources/xen/xen-4.13.0.tar.gz \
+          http://mirror.slackware.hr/sources/xen-extfiles/ipxe-git-1dd56dbd11082fb622c2ed21cfaced4f47d798a6.tar.gz \
          http://mirror.slackware.hr/sources/xen-extfiles/lwip-1.3.0.tar.gz \
          http://mirror.slackware.hr/sources/xen-extfiles/zlib-1.2.3.tar.gz \
          http://mirror.slackware.hr/sources/xen-extfiles/newlib-1.16.0.tar.gz \
@@ -11,10 +11,10 @@ DOWNLOAD="http://mirror.slackware.hr/sources/xen/xen-4.12.1.tar.gz \
          http://mirror.slackware.hr/sources/xen-extfiles/polarssl-1.1.4-gpl.tgz \
          http://mirror.slackware.hr/sources/xen-extfiles/gmp-4.3.2.tar.bz2 \
          http://mirror.slackware.hr/sources/xen-extfiles/tpm_emulator-0.7.4.tar.gz \
-          http://mirror.slackware.hr/sources/xen-seabios/seabios-1.12.0.tar.gz \
-          http://mirror.slackware.hr/sources/xen-ovmf/xen-ovmf-20180725_ef529e6ab7.tar.bz2"
-MD5SUM="3f96ae93a5d6a3dd89bdf1398e30895e \
-        0de05da7aec358881bb1dff815ecca14 \
+          http://mirror.slackware.hr/sources/xen-seabios/seabios-1.12.1.tar.gz \
+          http://mirror.slackware.hr/sources/xen-ovmf/xen-ovmf-20190606_20d2e5a125.tar.bz2"
+MD5SUM="d3b13c4c785601be2f104eaddd7c6a00 \
+        b3ab0488a989a089207302111d12e1a0 \
        36cc57650cffda9a0269493be2a169bb \
        debc62758716a169df9f62e6ab2bc634 \
        bf8f1f9e3ca83d732c00a79a6ef29bc4 \
@@ -23,8 +23,8 @@ MD5SUM="3f96ae93a5d6a3dd89bdf1398e30895e \
        7b72caf22b01464ee7d6165f2fd85f44 \
        dd60683d7057917e34630b4a787932e8 \
        e26becb8a6a2b6695f6b3e8097593db8 \
-        2fd637b323d247a0948556104a2121c9 \
-        75df1ed5ad9e08b1fb0be78c8b5234b9"
+        6cb6cba431fd725126ddb5ec529ab85c \
+        a6063a0d3d45e6f77deea8c80569653e"
DOWNLOAD_x86_64=""
MD5SUM_x86_64=""
REQUIRES="acpica yajl"
@@ -1,195 +0,0 @@
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: xen/hypercall: Don't use BUG() for parameter checking in hypercall_create_continuation()

Since c/s 1d429034 "hypercall: update vcpu_op to take an unsigned vcpuid",
which incorrectly swapped 'i' for 'u' in the parameter type list, guests have
been able to hit the BUG() in next_args()'s default case.

Correct these back to 'i'.

In addition, make adjustments to prevent this class of issue from occurring in
the future - crashing Xen is not an appropriate form of parameter checking.

Capitalise NEXT_ARG() to catch all uses, to highlight that it is a macro doing
non-function-like things behind the scenes, and undef it when appropriate.
Implement a bad_fmt: block which prints an error, asserts unreachable, and
crashes the guest.

On the ARM side, drop all parameter checking of p. It is asymmetric with the
x86 side, and akin to expecting memcpy() or sprintf() to check their src/fmt
parameter before use. A caller passing "" or something other than a string
literal will be obvious during code review.

This is XSA-296.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Julien Grall <julien.grall@arm.com>

diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 941bbff4fe..a3da8e9c08 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -383,14 +383,15 @@ void sync_vcpu_execstate(struct vcpu *v)
/* Nothing to do -- no lazy switching */
}

-#define next_arg(fmt, args) ({ \
+#define NEXT_ARG(fmt, args) \
+({ \
unsigned long __arg; \
switch ( *(fmt)++ ) \
{ \
case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \
case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \
case 'h': __arg = (unsigned long)va_arg(args, void *); break; \
- default: __arg = 0; BUG(); \
+ default: goto bad_fmt; \
} \
__arg; \
})
@@ -405,9 +406,6 @@ unsigned long hypercall_create_continuation(
unsigned int i;
va_list args;

- /* All hypercalls take at least one argument */
- BUG_ON( !p || *p == '\0' );
-
current->hcall_preempted = true;

va_start(args, format);
@@ -415,7 +413,7 @@ unsigned long hypercall_create_continuation(
if ( mcs->flags & MCSF_in_multicall )
{
for ( i = 0; *p != '\0'; i++ )
- mcs->call.args[i] = next_arg(p, args);
+ mcs->call.args[i] = NEXT_ARG(p, args);

/* Return value gets written back to mcs->call.result */
rc = mcs->call.result;
@@ -431,7 +429,7 @@ unsigned long hypercall_create_continuation(

for ( i = 0; *p != '\0'; i++ )
{
- arg = next_arg(p, args);
+ arg = NEXT_ARG(p, args);

switch ( i )
{
@@ -454,7 +452,7 @@ unsigned long hypercall_create_continuation(

for ( i = 0; *p != '\0'; i++ )
{
- arg = next_arg(p, args);
+ arg = NEXT_ARG(p, args);

switch ( i )
{
@@ -475,8 +473,16 @@ unsigned long hypercall_create_continuation(
va_end(args);

return rc;
+
+ bad_fmt:
+ gprintk(XENLOG_ERR, "Bad hypercall continuation format '%c'\n", *p);
+ ASSERT_UNREACHABLE();
+ domain_crash(current->domain);
+ return 0;
}

+#undef NEXT_ARG
+
void startup_cpu_idle_loop(void)
{
struct vcpu *v = current;
diff --git a/xen/arch/x86/hypercall.c b/xen/arch/x86/hypercall.c
index d483dbaa6b..4643e5eb43 100644
--- a/xen/arch/x86/hypercall.c
+++ b/xen/arch/x86/hypercall.c
@@ -80,14 +80,15 @@ const hypercall_args_t hypercall_args_table[NR_hypercalls] =
#undef COMP
#undef ARGS

-#define next_arg(fmt, args) ({ \
+#define NEXT_ARG(fmt, args) \
+({ \
unsigned long __arg; \
switch ( *(fmt)++ ) \
{ \
case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \
case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \
case 'h': __arg = (unsigned long)va_arg(args, void *); break; \
- default: __arg = 0; BUG(); \
+ default: goto bad_fmt; \
} \
__arg; \
})
@@ -109,7 +110,7 @@ unsigned long hypercall_create_continuation(
if ( mcs->flags & MCSF_in_multicall )
{
for ( i = 0; *p != '\0'; i++ )
- mcs->call.args[i] = next_arg(p, args);
+ mcs->call.args[i] = NEXT_ARG(p, args);
}
else
{
@@ -121,7 +122,7 @@ unsigned long hypercall_create_continuation(
{
for ( i = 0; *p != '\0'; i++ )
{
- arg = next_arg(p, args);
+ arg = NEXT_ARG(p, args);
switch ( i )
{
case 0: regs->rdi = arg; break;
@@ -137,7 +138,7 @@ unsigned long hypercall_create_continuation(
{
for ( i = 0; *p != '\0'; i++ )
{
- arg = next_arg(p, args);
+ arg = NEXT_ARG(p, args);
switch ( i )
{
case 0: regs->rbx = arg; break;
@@ -154,8 +155,16 @@ unsigned long hypercall_create_continuation(
va_end(args);

return op;
+
+ bad_fmt:
+ gprintk(XENLOG_ERR, "Bad hypercall continuation format '%c'\n", *p);
+ ASSERT_UNREACHABLE();
+ domain_crash(curr->domain);
+ return 0;
}

+#undef NEXT_ARG
+
int hypercall_xlat_continuation(unsigned int *id, unsigned int nr,
unsigned int mask, ...)
{
diff --git a/xen/common/compat/domain.c b/xen/common/compat/domain.c
index 39877b3ab2..2531fa7421 100644
--- a/xen/common/compat/domain.c
+++ b/xen/common/compat/domain.c
@@ -81,7 +81,7 @@ int compat_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) ar
}

if ( rc == -ERESTART )
- rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iuh",
+ rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih",
cmd, vcpuid, arg);

break;
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 2308588052..65bcd85e34 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -1411,7 +1411,7 @@ long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)

rc = arch_initialise_vcpu(v, arg);
if ( rc == -ERESTART )
- rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iuh",
+ rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih",
cmd, vcpuid, arg);

break;
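
The format-string convention this patch hardens can be illustrated with a minimal standalone sketch (ordinary user-space C, not Xen code; pack_args() is a hypothetical stand-in for the real NEXT_ARG() walk), showing why an unrecognised format character must bail out to an error path rather than BUG():

    /*
     * Sketch of the format-string argument walk performed by
     * hypercall_create_continuation(), with the post-XSA-296
     * behaviour: an unknown format character jumps to an error
     * path instead of crashing the host.
     */
    #include <stdarg.h>
    #include <stdio.h>

    static int pack_args(unsigned long *out, const char *fmt, ...)
    {
        va_list args;
        unsigned int i = 0;

        va_start(args, fmt);
        for ( ; *fmt != '\0'; fmt++, i++ )
        {
            switch ( *fmt )
            {
            case 'i': out[i] = (unsigned long)va_arg(args, unsigned int); break;
            case 'l': out[i] = va_arg(args, unsigned long); break;
            case 'h': out[i] = (unsigned long)va_arg(args, void *); break;
            default:  goto bad_fmt;            /* was BUG() before the fix */
            }
        }
        va_end(args);
        return 0;

     bad_fmt:
        va_end(args);
        fprintf(stderr, "bad continuation format '%c'\n", *fmt);
        return -1;                             /* Xen crashes the guest here */
    }

    int main(void)
    {
        unsigned long a[3];
        /* "iih" matches (int, int, pointer): the corrected vcpu_op case */
        return pack_args(a, "iih", 1, 2u, (void *)a);
    }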

@@ -1,89 +0,0 @@
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/PV: check GDT/LDT limits during emulation

Accesses beyond the LDT limit originating from emulation would trigger
the ASSERT() in pv_map_ldt_shadow_page(). On production builds such
accesses would cause an attempt to promote the touched page (offset from
the present LDT base address) to a segment descriptor one. If this
happens to succeed, guest user mode would be able to elevate its
privileges to that of the guest kernel. This is particularly easy when
there's no LDT at all, in which case the LDT base stored internally to
Xen is simply zero.

Also adjust the ASSERT() that was triggering: It was off by one to
begin with, and for production builds we also better use
ASSERT_UNREACHABLE() instead with suitable recovery code afterwards.

This is XSA-298.

Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v2: Correct 64-bit-only limit check (by folding into the common one).

--- a/xen/arch/x86/pv/emul-gate-op.c
+++ b/xen/arch/x86/pv/emul-gate-op.c
@@ -51,7 +51,13 @@ static int read_gate_descriptor(unsigned
const seg_desc_t *pdesc = gdt_ldt_desc_ptr(gate_sel);

if ( (gate_sel < 4) ||
- ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
+ /*
+ * We're interested in call gates only, which occupy a single
+ * seg_desc_t for 32-bit and a consecutive pair of them for 64-bit.
+ */
+ ((gate_sel >> 3) + !is_pv_32bit_vcpu(v) >=
+ (gate_sel & 4 ? v->arch.pv.ldt_ents
+ : v->arch.pv.gdt_ents)) ||
__get_user(desc, pdesc) )
return 0;

@@ -70,7 +76,7 @@ static int read_gate_descriptor(unsigned
if ( !is_pv_32bit_vcpu(v) )
{
if ( (*ar & 0x1f00) != 0x0c00 ||
- (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
+ /* Limit check done above already. */
__get_user(desc, pdesc + 1) ||
(desc.b & 0x1f00) )
return 0;
--- a/xen/arch/x86/pv/emulate.c
+++ b/xen/arch/x86/pv/emulate.c
@@ -31,7 +31,14 @@ int pv_emul_read_descriptor(unsigned int
{
seg_desc_t desc;

- if ( sel < 4)
+ if ( sel < 4 ||
+ /*
+ * Don't apply the GDT limit here, as the selector may be a Xen
+ * provided one. __get_user() will fail (without taking further
+ * action) for ones falling in the gap between guest populated
+ * and Xen ones.
+ */
+ ((sel & 4) && (sel >> 3) >= v->arch.pv.ldt_ents) )
desc.b = desc.a = 0;
else if ( __get_user(desc, gdt_ldt_desc_ptr(sel)) )
return 0;
--- a/xen/arch/x86/pv/mm.c
+++ b/xen/arch/x86/pv/mm.c
@@ -92,12 +92,16 @@ bool pv_map_ldt_shadow_page(unsigned int
BUG_ON(unlikely(in_irq()));

/*
- * Hardware limit checking should guarantee this property. NB. This is
+ * Prior limit checking should guarantee this property. NB. This is
* safe as updates to the LDT can only be made by MMUEXT_SET_LDT to the
* current vcpu, and vcpu_reset() will block until this vcpu has been
* descheduled before continuing.
*/
- ASSERT((offset >> 3) <= curr->arch.pv.ldt_ents);
+ if ( unlikely((offset >> 3) >= curr->arch.pv.ldt_ents) )
+ {
+ ASSERT_UNREACHABLE();
+ return false;
+ }

if ( is_pv_32bit_domain(currd) )
linear = (uint32_t)linear;
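
The selector arithmetic behind the new limit checks, as a standalone sketch (simplified, not Xen code; the real check also accounts for 64-bit call gates occupying a consecutive descriptor pair):

    /*
     * Bit 2 of a selector (sel & 4) picks LDT vs GDT, and the
     * descriptor index is sel >> 3, which must be below the
     * corresponding table limit before dereferencing.
     */
    #include <stdbool.h>
    #include <stdio.h>

    struct vcpu_tables { unsigned int ldt_ents, gdt_ents; };

    static bool sel_in_limits(unsigned int sel, const struct vcpu_tables *t)
    {
        unsigned int idx = sel >> 3;          /* descriptor index */
        bool is_ldt = sel & 4;                /* table-indicator bit */

        if ( sel < 4 )                        /* null selector range */
            return false;

        return idx < (is_ldt ? t->ldt_ents : t->gdt_ents);
    }

    int main(void)
    {
        struct vcpu_tables t = { .ldt_ents = 0, .gdt_ents = 16 };

        /* With no LDT at all, any LDT selector must be rejected. */
        printf("%d\n", sel_in_limits(0x0c /* idx 1, LDT */, &t)); /* 0 */
        printf("%d\n", sel_in_limits(0x08 /* idx 1, GDT */, &t)); /* 1 */
        return 0;
    }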

@@ -1,94 +0,0 @@
From 33d051917d5ef38f678b507a3c832afde48b9b49 Mon Sep 17 00:00:00 2001
From: George Dunlap <george.dunlap@citrix.com>
Date: Thu, 10 Oct 2019 17:57:49 +0100
Subject: [PATCH 01/11] x86/mm: L1TF checks don't leave a partial entry

On detection of a potential L1TF issue, most validation code returns
-ERESTART to allow the switch to shadow mode to happen and cause the
original operation to be restarted.

However, in the validation code, the return value -ERESTART has been
repurposed to indicate 1) the function has partially completed
something which needs to be undone, and 2) calling put_page_type()
should cleanly undo it. This causes problems in several places.

For L1 tables, on receiving an -ERESTART return from alloc_l1_table(),
alloc_page_type() will set PGT_partial on the page. If for some
reason the original operation never restarts, then on domain
destruction, relinquish_memory() will call free_page_type() on the
page.

Unfortunately, alloc_ and free_l1_table() aren't set up to deal with
PGT_partial. When returning a failure, alloc_l1_table() always
de-validates whatever it's validated so far, and free_l1_table()
always devalidates the whole page. This means that if
relinquish_memory() calls free_page_type() on an L1 that didn't
complete due to an L1TF, it will call put_page_from_l1e() on "page
entries" that have never been validated.

For L2+ tables, setting rc to ERESTART causes the rest of the
alloc_lN_table() function to *think* that the entry in question will
have PGT_partial set. This will cause it to set partial_pte = 1. If
relinquish_memory() then calls free_page_type() on one of those pages,
then free_lN_table() will call put_page_from_lNe() on the entry when
it shouldn't.

Rather than indicating -ERESTART, indicate -EINTR. This is the code
to indicate that nothing has changed from when you started the call
(which is effectively how alloc_l1_table() handles errors).

mod_lN_entry() shouldn't have any of these types of problems, so leave
potential changes there for a clean-up patch later.

This is part of XSA-299.

Reported-by: George Dunlap <george.dunlap@citrix.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
 xen/arch/x86/mm.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 3557cd1178..a1b55c10ff 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1409,7 +1409,7 @@ static int alloc_l1_table(struct page_info *page)
{
if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) )
{
- ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -ERESTART : 0;
+ ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -EINTR : 0;
if ( ret )
goto out;
}
@@ -1517,7 +1517,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
{
if ( !pv_l1tf_check_l2e(d, l2e) )
continue;
- rc = -ERESTART;
+ rc = -EINTR;
}
else
rc = get_page_from_l2e(l2e, pfn, d, partial);
@@ -1603,7 +1603,7 @@ static int alloc_l3_table(struct page_info *page)
{
if ( !pv_l1tf_check_l3e(d, l3e) )
continue;
- rc = -ERESTART;
+ rc = -EINTR;
}
else
rc = get_page_from_l3e(l3e, pfn, d, partial);
@@ -1783,7 +1783,7 @@ static int alloc_l4_table(struct page_info *page)
{
if ( !pv_l1tf_check_l4e(d, l4e) )
continue;
- rc = -EINTR;
+ rc = -EINTR;
}
else
rc = get_page_from_l4e(l4e, pfn, d, partial);
--
2.23.0
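
The error-code contract this patch relies on, as a minimal sketch (not Xen code; the stub stands in for pv_l1tf_check_l1e()):

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical stand-in: flags a suspect (L1TF-vulnerable) entry. */
    static bool l1tf_check_stub(unsigned long l1e) { return l1e != 0; }

    /*
     * A failed L1TF check on a not-present entry changed *nothing*,
     * so the right code is -EINTR ("retry from scratch"), not
     * -ERESTART ("partially done, undo via put_page_type()").
     */
    static int validate_l1e(unsigned long l1e)
    {
        if ( l1tf_check_stub(l1e) )
            return -EINTR;  /* was -ERESTART, wrongly implying PGT_partial */
        return 0;
    }

    int main(void)
    {
        printf("%d %d\n", validate_l1e(0), validate_l1e(0x80));
        return 0;
    }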

@@ -1,99 +0,0 @@
From b490792c18f74b76ec8161721c1e07f810e36309 Mon Sep 17 00:00:00 2001
From: George Dunlap <george.dunlap@citrix.com>
Date: Thu, 10 Oct 2019 17:57:49 +0100
Subject: [PATCH 02/11] x86/mm: Don't re-set PGT_pinned on a partially
 de-validated page

When unpinning pagetables, if an operation is interrupted,
relinquish_memory() re-sets PGT_pinned so that the un-pin will be
picked up again when the hypercall restarts.

This is appropriate when put_page_and_type_preemptible() returns
-EINTR, which indicates that the page is back in its initial state
(i.e., completely validated). However, for -ERESTART, this leads to a
state where a page has both PGT_pinned and PGT_partial set.

This happens to work at the moment, although it's not really a
"canonical" state; but in subsequent patches, where we need to make a
distinction in handling between PGT_validated and PGT_partial pages,
this causes issues.

Move to a "canonical" state by:
- Only re-setting PGT_pinned on -EINTR
- Re-dropping the refcount held by PGT_pinned on -ERESTART

In the latter case, the PGT_partial bit will be cleared further down
with the rest of the other PGT_partial pages.

While here, clean up some trailing whitespace.

This is part of XSA-299.

Reported-by: George Dunlap <george.dunlap@citrix.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
 xen/arch/x86/domain.c | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 2585327834..59df8a6d8d 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -114,7 +114,7 @@ static void play_dead(void)
* this case, heap corruption or #PF can occur (when heap debugging is
* enabled). For example, even printk() can involve tasklet scheduling,
* which touches per-cpu vars.
- *
+ *
* Consider very carefully when adding code to *dead_idle. Most hypervisor
* subsystems are unsafe to call.
*/
@@ -1909,9 +1909,34 @@ static int relinquish_memory(
break;
case -ERESTART:
case -EINTR:
+ /*
+ * -EINTR means PGT_validated has been re-set; re-set
+ * PGT_pinned again so that it gets picked up next time
+ * around.
+ *
+ * -ERESTART, OTOH, means PGT_partial is set instead. Put
+ * it back on the list, but don't set PGT_pinned; the
+ * section below will finish off de-validation. But we do
+ * need to drop the general ref associated with
+ * PGT_pinned, since put_page_and_type_preemptible()
+ * didn't do it.
+ *
+ * NB we can do an ASSERT for PGT_validated, since we
+ * "own" the type ref; but theoretically, the PGT_partial
+ * could be cleared by someone else.
+ */
+ if ( ret == -EINTR )
+ {
+ ASSERT(page->u.inuse.type_info & PGT_validated);
+ set_bit(_PGT_pinned, &page->u.inuse.type_info);
+ }
+ else
+ put_page(page);
+
ret = -ERESTART;
+
+ /* Put the page back on the list and drop the ref we grabbed above */
page_list_add(page, list);
- set_bit(_PGT_pinned, &page->u.inuse.type_info);
put_page(page);
goto out;
default:
@@ -2161,7 +2186,7 @@ void vcpu_kick(struct vcpu *v)
* pending flag. These values may fluctuate (after all, we hold no
* locks) but the key insight is that each change will cause
* evtchn_upcall_pending to be polled.
- *
+ *
* NB2. We save the running flag across the unblock to avoid a needless
* IPI for domains that we IPI'd to unblock.
*/
--
2.23.0
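
A minimal sketch of the canonical-state rule above (not Xen code; flag values and helpers are hypothetical stand-ins):

    #include <assert.h>
    #include <errno.h>

    #define PGT_validated 0x1UL
    #define PGT_pinned    0x2UL

    static void put_page_stub(void) { /* drop a general reference */ }

    /*
     * Only -EINTR (page back to fully validated) re-sets PGT_pinned;
     * -ERESTART instead drops the ref that PGT_pinned held, leaving
     * PGT_partial to be finished off later.
     */
    static void handle_unpin_error(int ret, unsigned long *type_info)
    {
        if ( ret == -EINTR )
        {
            assert(*type_info & PGT_validated);
            *type_info |= PGT_pinned;
        }
        else /* -ERESTART */
            put_page_stub();
    }

    int main(void)
    {
        unsigned long ti = PGT_validated;
        handle_unpin_error(-EINTR, &ti);
        handle_unpin_error(-ERESTART, &ti);
        return !(ti & PGT_pinned);
    }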

@@ -1,618 +0,0 @@
From 0f9f61e5737fdd346550ec6e30161fa99e4653fa Mon Sep 17 00:00:00 2001
From: George Dunlap <george.dunlap@citrix.com>
Date: Thu, 10 Oct 2019 17:57:49 +0100
Subject: [PATCH 03/11] x86/mm: Separate out partial_pte tristate into
 individual flags

At the moment, partial_pte is a tri-state that contains two distinct bits
of information:

1. If zero, the pte at index [nr_validated_ptes] is un-validated. If
non-zero, the pte was last seen with PGT_partial set.

2. If positive, the pte at index [nr_validated_ptes] does not hold a
general reference count. If negative, it does.

To make future patches more clear, separate out this functionality
into two distinct, named bits: PTF_partial_set (for #1) and
PTF_partial_general_ref (for #2).

Additionally, a number of functions which need this information also
take other flags to control behavior (such as `preemptible` and
`defer`). These are hard to read in the caller (since you only see
'true' or 'false'), and ugly when many are added together. In
preparation for adding yet another flag in a future patch, collapse
all of these into a single `flag` variable.

NB that this does make checking for what was previously the '-1'
condition a bit more ugly in the put_page_from_lNe functions (since
you have to check for both partial_set and general ref); but this
clause will go away in a future patch.

Also note that the original comment had an off-by-one error:
partial_flags (like partial_pte before it) concerns
plNe[nr_validated_ptes], not plNe[nr_validated_ptes+1].

No functional change intended.

This is part of XSA-299.

Reported-by: George Dunlap <george.dunlap@citrix.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
 xen/arch/x86/mm.c | 165 ++++++++++++++++++++++++---------------
 xen/include/asm-x86/mm.h | 41 +++++++---
 2 files changed, 128 insertions(+), 78 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index a1b55c10ff..3f6f8cc9b8 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1094,20 +1094,35 @@ get_page_from_l1e(
}

#ifdef CONFIG_PV
+
+/*
+ * The following flags are used to specify behavior of various get and
+ * put commands. The first two are also stored in page->partial_flags
+ * to indicate the state of the page pointed to by
+ * page->pte[page->nr_validated_entries]. See the comment in mm.h for
+ * more information.
+ */
+#define PTF_partial_set (1 << 0)
+#define PTF_partial_general_ref (1 << 1)
+#define PTF_preemptible (1 << 2)
+#define PTF_defer (1 << 3)
+
static int get_page_and_type_from_mfn(
mfn_t mfn, unsigned long type, struct domain *d,
- int partial, int preemptible)
+ unsigned int flags)
{
struct page_info *page = mfn_to_page(mfn);
int rc;
+ bool preemptible = flags & PTF_preemptible,
+ partial_ref = flags & PTF_partial_general_ref;

- if ( likely(partial >= 0) &&
+ if ( likely(!partial_ref) &&
unlikely(!get_page_from_mfn(mfn, d)) )
return -EINVAL;

rc = _get_page_type(page, type, preemptible);

- if ( unlikely(rc) && partial >= 0 &&
+ if ( unlikely(rc) && !partial_ref &&
(!preemptible || page != current->arch.old_guest_table) )
put_page(page);

@@ -1117,7 +1132,7 @@ static int get_page_and_type_from_mfn(
define_get_linear_pagetable(l2);
static int
get_page_from_l2e(
- l2_pgentry_t l2e, unsigned long pfn, struct domain *d, int partial)
+ l2_pgentry_t l2e, unsigned long pfn, struct domain *d, unsigned int flags)
{
unsigned long mfn = l2e_get_pfn(l2e);
int rc;
@@ -1129,8 +1144,9 @@ get_page_from_l2e(
return -EINVAL;
}

- rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d,
- partial, false);
+ ASSERT(!(flags & PTF_preemptible));
+
+ rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, flags);
if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
rc = 0;

@@ -1140,7 +1156,7 @@ get_page_from_l2e(
define_get_linear_pagetable(l3);
static int
get_page_from_l3e(
- l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial)
+ l3_pgentry_t l3e, unsigned long pfn, struct domain *d, unsigned int flags)
{
int rc;

@@ -1152,7 +1168,7 @@ get_page_from_l3e(
}

rc = get_page_and_type_from_mfn(
- l3e_get_mfn(l3e), PGT_l2_page_table, d, partial, 1);
+ l3e_get_mfn(l3e), PGT_l2_page_table, d, flags | PTF_preemptible);
if ( unlikely(rc == -EINVAL) &&
!is_pv_32bit_domain(d) &&
get_l3_linear_pagetable(l3e, pfn, d) )
@@ -1164,7 +1180,7 @@ get_page_from_l3e(
define_get_linear_pagetable(l4);
static int
get_page_from_l4e(
- l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial)
+ l4_pgentry_t l4e, unsigned long pfn, struct domain *d, unsigned int flags)
{
int rc;

@@ -1176,7 +1192,7 @@ get_page_from_l4e(
}

rc = get_page_and_type_from_mfn(
- l4e_get_mfn(l4e), PGT_l3_page_table, d, partial, 1);
+ l4e_get_mfn(l4e), PGT_l3_page_table, d, flags | PTF_preemptible);
if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
rc = 0;

@@ -1277,7 +1293,7 @@ static void put_data_page(struct page_info *page, bool writeable)
* Note also that this automatically deals correctly with linear p.t.'s.
*/
static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
- int partial, bool defer)
+ unsigned int flags)
{
int rc = 0;

@@ -1300,12 +1316,13 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
struct page_info *pg = l2e_get_page(l2e);
struct page_info *ptpg = mfn_to_page(_mfn(pfn));

- if ( unlikely(partial > 0) )
+ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
+ PTF_partial_set )
{
- ASSERT(!defer);
+ ASSERT(!(flags & PTF_defer));
rc = _put_page_type(pg, true, ptpg);
}
- else if ( defer )
+ else if ( flags & PTF_defer )
{
current->arch.old_guest_ptpg = ptpg;
current->arch.old_guest_table = pg;
@@ -1322,7 +1339,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
}

static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
- int partial, bool defer)
+ unsigned int flags)
{
struct page_info *pg;
int rc;
@@ -1345,13 +1362,14 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,

pg = l3e_get_page(l3e);

- if ( unlikely(partial > 0) )
+ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
+ PTF_partial_set )
{
- ASSERT(!defer);
+ ASSERT(!(flags & PTF_defer));
return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
}

- if ( defer )
+ if ( flags & PTF_defer )
{
current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
current->arch.old_guest_table = pg;
@@ -1366,7 +1384,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
}

static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
- int partial, bool defer)
+ unsigned int flags)
{
int rc = 1;

@@ -1375,13 +1393,14 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
{
struct page_info *pg = l4e_get_page(l4e);

- if ( unlikely(partial > 0) )
+ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
+ PTF_partial_set )
{
- ASSERT(!defer);
+ ASSERT(!(flags & PTF_defer));
return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
}

- if ( defer )
+ if ( flags & PTF_defer )
{
current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
current->arch.old_guest_table = pg;
@@ -1492,12 +1511,13 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
unsigned long pfn = mfn_x(page_to_mfn(page));
l2_pgentry_t *pl2e;
unsigned int i;
- int rc = 0, partial = page->partial_pte;
+ int rc = 0;
+ unsigned int partial_flags = page->partial_flags;

pl2e = map_domain_page(_mfn(pfn));

for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES;
- i++, partial = 0 )
+ i++, partial_flags = 0 )
{
l2_pgentry_t l2e;

@@ -1520,17 +1540,18 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
rc = -EINTR;
}
else
- rc = get_page_from_l2e(l2e, pfn, d, partial);
+ rc = get_page_from_l2e(l2e, pfn, d, partial_flags);

if ( rc == -ERESTART )
{
page->nr_validated_ptes = i;
- page->partial_pte = partial ?: 1;
+ /* Set 'set', retain 'general ref' */
+ page->partial_flags = partial_flags | PTF_partial_set;
}
else if ( rc == -EINTR && i )
{
page->nr_validated_ptes = i;
- page->partial_pte = 0;
+ page->partial_flags = 0;
rc = -ERESTART;
}
else if ( rc < 0 && rc != -EINTR )
@@ -1539,7 +1560,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
if ( i )
{
page->nr_validated_ptes = i;
- page->partial_pte = 0;
+ page->partial_flags = 0;
current->arch.old_guest_ptpg = NULL;
current->arch.old_guest_table = page;
}
@@ -1563,7 +1584,8 @@ static int alloc_l3_table(struct page_info *page)
unsigned long pfn = mfn_x(page_to_mfn(page));
l3_pgentry_t *pl3e;
unsigned int i;
- int rc = 0, partial = page->partial_pte;
+ int rc = 0;
+ unsigned int partial_flags = page->partial_flags;

pl3e = map_domain_page(_mfn(pfn));

@@ -1578,7 +1600,7 @@ static int alloc_l3_table(struct page_info *page)
memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));

for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
- i++, partial = 0 )
+ i++, partial_flags = 0 )
{
l3_pgentry_t l3e = pl3e[i];

@@ -1597,7 +1619,8 @@ static int alloc_l3_table(struct page_info *page)
else
rc = get_page_and_type_from_mfn(
l3e_get_mfn(l3e),
- PGT_l2_page_table | PGT_pae_xen_l2, d, partial, 1);
+ PGT_l2_page_table | PGT_pae_xen_l2, d,
+ partial_flags | PTF_preemptible);
}
else if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
{
@@ -1606,17 +1629,18 @@ static int alloc_l3_table(struct page_info *page)
rc = -EINTR;
}
else
- rc = get_page_from_l3e(l3e, pfn, d, partial);
+ rc = get_page_from_l3e(l3e, pfn, d, partial_flags);

if ( rc == -ERESTART )
{
page->nr_validated_ptes = i;
- page->partial_pte = partial ?: 1;
+ /* Set 'set', leave 'general ref' set if this entry was set */
+ page->partial_flags = partial_flags | PTF_partial_set;
}
else if ( rc == -EINTR && i )
{
page->nr_validated_ptes = i;
- page->partial_pte = 0;
+ page->partial_flags = 0;
rc = -ERESTART;
}
if ( rc < 0 )
@@ -1633,7 +1657,7 @@ static int alloc_l3_table(struct page_info *page)
if ( i )
{
page->nr_validated_ptes = i;
- page->partial_pte = 0;
+ page->partial_flags = 0;
current->arch.old_guest_ptpg = NULL;
current->arch.old_guest_table = page;
}
@@ -1767,10 +1791,11 @@ static int alloc_l4_table(struct page_info *page)
unsigned long pfn = mfn_x(page_to_mfn(page));
l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn));
unsigned int i;
- int rc = 0, partial = page->partial_pte;
+ int rc = 0;
+ unsigned int partial_flags = page->partial_flags;

for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
- i++, partial = 0 )
+ i++, partial_flags = 0 )
{
l4_pgentry_t l4e;

@@ -1786,12 +1811,13 @@ static int alloc_l4_table(struct page_info *page)
rc = -EINTR;
}
else
- rc = get_page_from_l4e(l4e, pfn, d, partial);
+ rc = get_page_from_l4e(l4e, pfn, d, partial_flags);

if ( rc == -ERESTART )
{
page->nr_validated_ptes = i;
- page->partial_pte = partial ?: 1;
+ /* Set 'set', leave 'general ref' set if this entry was set */
+ page->partial_flags = partial_flags | PTF_partial_set;
}
else if ( rc < 0 )
{
@@ -1801,7 +1827,7 @@ static int alloc_l4_table(struct page_info *page)
if ( i )
{
page->nr_validated_ptes = i;
- page->partial_pte = 0;
+ page->partial_flags = 0;
if ( rc == -EINTR )
rc = -ERESTART;
else
@@ -1853,19 +1879,20 @@ static int free_l2_table(struct page_info *page)
struct domain *d = page_get_owner(page);
unsigned long pfn = mfn_x(page_to_mfn(page));
l2_pgentry_t *pl2e;
- int rc = 0, partial = page->partial_pte;
- unsigned int i = page->nr_validated_ptes - !partial;
+ int rc = 0;
+ unsigned int partial_flags = page->partial_flags,
+ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set);

pl2e = map_domain_page(_mfn(pfn));

for ( ; ; )
{
if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
- rc = put_page_from_l2e(pl2e[i], pfn, partial, false);
+ rc = put_page_from_l2e(pl2e[i], pfn, partial_flags);
if ( rc < 0 )
break;

- partial = 0;
+ partial_flags = 0;

if ( !i-- )
break;
@@ -1887,12 +1914,14 @@ static int free_l2_table(struct page_info *page)
else if ( rc == -ERESTART )
{
page->nr_validated_ptes = i;
- page->partial_pte = partial ?: -1;
+ page->partial_flags = (partial_flags & PTF_partial_set) ?
+ partial_flags :
+ (PTF_partial_set | PTF_partial_general_ref);
}
else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 )
{
page->nr_validated_ptes = i + 1;
- page->partial_pte = 0;
+ page->partial_flags = 0;
rc = -ERESTART;
}

@@ -1904,18 +1933,19 @@ static int free_l3_table(struct page_info *page)
struct domain *d = page_get_owner(page);
unsigned long pfn = mfn_x(page_to_mfn(page));
l3_pgentry_t *pl3e;
- int rc = 0, partial = page->partial_pte;
- unsigned int i = page->nr_validated_ptes - !partial;
+ int rc = 0;
+ unsigned int partial_flags = page->partial_flags,
+ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set);

pl3e = map_domain_page(_mfn(pfn));

for ( ; ; )
{
- rc = put_page_from_l3e(pl3e[i], pfn, partial, 0);
+ rc = put_page_from_l3e(pl3e[i], pfn, partial_flags);
if ( rc < 0 )
break;

- partial = 0;
+ partial_flags = 0;
if ( rc == 0 )
pl3e[i] = unadjust_guest_l3e(pl3e[i], d);

@@ -1934,12 +1964,14 @@ static int free_l3_table(struct page_info *page)
if ( rc == -ERESTART )
{
page->nr_validated_ptes = i;
- page->partial_pte = partial ?: -1;
+ page->partial_flags = (partial_flags & PTF_partial_set) ?
+ partial_flags :
+ (PTF_partial_set | PTF_partial_general_ref);
}
else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
{
page->nr_validated_ptes = i + 1;
- page->partial_pte = 0;
+ page->partial_flags = 0;
rc = -ERESTART;
}
return rc > 0 ? 0 : rc;
@@ -1950,26 +1982,29 @@ static int free_l4_table(struct page_info *page)
struct domain *d = page_get_owner(page);
unsigned long pfn = mfn_x(page_to_mfn(page));
l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn));
- int rc = 0, partial = page->partial_pte;
- unsigned int i = page->nr_validated_ptes - !partial;
+ int rc = 0;
+ unsigned partial_flags = page->partial_flags,
+ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set);

do {
if ( is_guest_l4_slot(d, i) )
- rc = put_page_from_l4e(pl4e[i], pfn, partial, 0);
+ rc = put_page_from_l4e(pl4e[i], pfn, partial_flags);
if ( rc < 0 )
break;
- partial = 0;
+ partial_flags = 0;
} while ( i-- );

if ( rc == -ERESTART )
{
page->nr_validated_ptes = i;
- page->partial_pte = partial ?: -1;
+ page->partial_flags = (partial_flags & PTF_partial_set) ?
+ partial_flags :
+ (PTF_partial_set | PTF_partial_general_ref);
}
else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
{
page->nr_validated_ptes = i + 1;
- page->partial_pte = 0;
+ page->partial_flags = 0;
rc = -ERESTART;
}

@@ -2247,7 +2282,7 @@ static int mod_l2_entry(l2_pgentry_t *pl2e,
return -EBUSY;
}

- put_page_from_l2e(ol2e, pfn, 0, true);
+ put_page_from_l2e(ol2e, pfn, PTF_defer);

return rc;
}
@@ -2315,7 +2350,7 @@ static int mod_l3_entry(l3_pgentry_t *pl3e,
if ( !create_pae_xen_mappings(d, pl3e) )
BUG();

- put_page_from_l3e(ol3e, pfn, 0, 1);
+ put_page_from_l3e(ol3e, pfn, PTF_defer);
return rc;
}

@@ -2378,7 +2413,7 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
return -EFAULT;
}

- put_page_from_l4e(ol4e, pfn, 0, 1);
+ put_page_from_l4e(ol4e, pfn, PTF_defer);
return rc;
}
#endif /* CONFIG_PV */
@@ -2649,7 +2684,7 @@ int free_page_type(struct page_info *page, unsigned long type,
if ( !(type & PGT_partial) )
{
page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
- page->partial_pte = 0;
+ page->partial_flags = 0;
}

switch ( type & PGT_type_mask )
@@ -2946,7 +2981,7 @@ static int _get_page_type(struct page_info *page, unsigned long type,
if ( !(x & PGT_partial) )
{
page->nr_validated_ptes = 0;
- page->partial_pte = 0;
+ page->partial_flags = 0;
}
page->linear_pt_count = 0;
rc = alloc_page_type(page, type, preemptible);
@@ -3122,7 +3157,7 @@ int new_guest_cr3(mfn_t mfn)
return 0;
}

- rc = get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, 0, 1);
+ rc = get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, PTF_preemptible);
switch ( rc )
{
case 0:
@@ -3473,7 +3508,7 @@ long do_mmuext_op(
if ( op.arg1.mfn != 0 )
{
rc = get_page_and_type_from_mfn(
- _mfn(op.arg1.mfn), PGT_root_page_table, currd, 0, 1);
+ _mfn(op.arg1.mfn), PGT_root_page_table, currd, PTF_preemptible);

if ( unlikely(rc) )
{
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 6faa563167..8406ac3c37 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -228,19 +228,34 @@ struct page_info
* setting the flag must not drop that reference, whereas the instance
* clearing it will have to.
*
- * If @partial_pte is positive then PTE at @nr_validated_ptes+1 has
- * been partially validated. This implies that the general reference
- * to the page (acquired from get_page_from_lNe()) would be dropped
- * (again due to the apparent failure) and hence must be re-acquired
- * when resuming the validation, but must not be dropped when picking
- * up the page for invalidation.
+ * If partial_flags & PTF_partial_set is set, then the page
+ * at @nr_validated_ptes had PGT_partial set as a result of an
+ * operation on the current page. (That page may or may not
+ * still have PGT_partial set.)
*
- * If @partial_pte is negative then PTE at @nr_validated_ptes+1 has
- * been partially invalidated. This is basically the opposite case of
- * above, i.e. the general reference to the page was not dropped in
- * put_page_from_lNe() (due to the apparent failure), and hence it
- * must be dropped when the put operation is resumed (and completes),
- * but it must not be acquired if picking up the page for validation.
+ * If PTF_partial_general_ref is set, then the PTE at
+ * @nr_validated_ptes holds a general reference count for the
+ * page.
+ *
+ * This happens:
+ * - During de-validation, if de-validation of the page was
+ * interrupted
+ * - During validation, if an invalid entry is encountered and
+ * validation is preemptible
+ * - During validation, if PTF_partial_general_ref was set on
+ * this entry to begin with (perhaps because we're picking
+ * up from a partial de-validation).
+ *
+ * When resuming validation, if PTF_partial_general_ref is clear,
+ * then a general reference must be re-acquired; if it is set, no
+ * reference should be acquired.
+ *
+ * When resuming de-validation, if PTF_partial_general_ref is
+ * clear, no reference should be dropped; if it is set, a
+ * reference should be dropped.
+ *
+ * NB that PTF_partial_set and PTF_partial_general_ref are
+ * defined in mm.c, the only place where they are used.
*
* The 3rd field, @linear_pt_count, indicates
* - by a positive value, how many same-level page table entries a page
@@ -251,7 +266,7 @@ struct page_info
struct {
u16 nr_validated_ptes:PAGETABLE_ORDER + 1;
u16 :16 - PAGETABLE_ORDER - 1 - 2;
- s16 partial_pte:2;
+ u16 partial_flags:2;
s16 linear_pt_count;
};

--
2.23.0
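
How the old tristate maps onto the two new bits, as a standalone sketch (not Xen code; semantics taken from the description above):

    #include <stdio.h>

    #define PTF_partial_set         (1 << 0)
    #define PTF_partial_general_ref (1 << 1)

    /* Translate the old partial_pte tristate into the new named flags. */
    static unsigned int from_tristate(int partial_pte)
    {
        if ( partial_pte == 0 )   /* entry not validated, no ref held */
            return 0;
        if ( partial_pte > 0 )    /* partially validated, no general ref */
            return PTF_partial_set;
        /* partial_pte < 0: partially devalidated, general ref still held */
        return PTF_partial_set | PTF_partial_general_ref;
    }

    int main(void)
    {
        printf("%u %u %u\n",
               from_tristate(0), from_tristate(1), from_tristate(-1));
        return 0;
    }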

@@ -1,140 +0,0 @@
From db1d801aa8dcb918a27486a6e8d9cf5d7307dec3 Mon Sep 17 00:00:00 2001
From: George Dunlap <george.dunlap@citrix.com>
Date: Thu, 10 Oct 2019 17:57:49 +0100
Subject: [PATCH 04/11] x86/mm: Use flags for _put_page_type rather than a
 boolean

This is mainly in preparation for _put_page_type taking the
partial_flags value in the future. It also makes it easier to read in
the caller (since you see a flag name rather than `true` or `false`).

No functional change intended.

This is part of XSA-299.

Reported-by: George Dunlap <george.dunlap@citrix.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
 xen/arch/x86/mm.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 3f6f8cc9b8..0740b61af8 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1200,7 +1200,7 @@ get_page_from_l4e(
}
#endif /* CONFIG_PV */

-static int _put_page_type(struct page_info *page, bool preemptible,
+static int _put_page_type(struct page_info *page, unsigned int flags,
struct page_info *ptpg);

void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
@@ -1320,7 +1320,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
PTF_partial_set )
{
ASSERT(!(flags & PTF_defer));
- rc = _put_page_type(pg, true, ptpg);
+ rc = _put_page_type(pg, PTF_preemptible, ptpg);
}
else if ( flags & PTF_defer )
{
@@ -1329,7 +1329,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
}
else
{
- rc = _put_page_type(pg, true, ptpg);
+ rc = _put_page_type(pg, PTF_preemptible, ptpg);
if ( likely(!rc) )
put_page(pg);
}
@@ -1366,7 +1366,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
PTF_partial_set )
{
ASSERT(!(flags & PTF_defer));
- return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
+ return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
}

if ( flags & PTF_defer )
@@ -1376,7 +1376,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
return 0;
}

- rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
+ rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
if ( likely(!rc) )
put_page(pg);

@@ -1397,7 +1397,7 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
PTF_partial_set )
{
ASSERT(!(flags & PTF_defer));
- return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
+ return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
}

if ( flags & PTF_defer )
@@ -1407,7 +1407,7 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
return 0;
}

- rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
+ rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
if ( likely(!rc) )
put_page(pg);
}
@@ -2757,10 +2757,11 @@ static int _put_final_page_type(struct page_info *page, unsigned long type,
}


-static int _put_page_type(struct page_info *page, bool preemptible,
+static int _put_page_type(struct page_info *page, unsigned int flags,
struct page_info *ptpg)
{
unsigned long nx, x, y = page->u.inuse.type_info;
+ bool preemptible = flags & PTF_preemptible;

ASSERT(current_locked_page_ne_check(page));

@@ -2969,7 +2970,7 @@ static int _get_page_type(struct page_info *page, unsigned long type,

if ( unlikely(iommu_ret) )
{
- _put_page_type(page, false, NULL);
+ _put_page_type(page, 0, NULL);
rc = iommu_ret;
goto out;
}
@@ -2996,7 +2997,7 @@ static int _get_page_type(struct page_info *page, unsigned long type,

void put_page_type(struct page_info *page)
{
- int rc = _put_page_type(page, false, NULL);
+ int rc = _put_page_type(page, 0, NULL);
ASSERT(rc == 0);
(void)rc;
}
@@ -3013,7 +3014,7 @@ int get_page_type(struct page_info *page, unsigned long type)

int put_page_type_preemptible(struct page_info *page)
{
- return _put_page_type(page, true, NULL);
+ return _put_page_type(page, PTF_preemptible, NULL);
}

int get_page_type_preemptible(struct page_info *page, unsigned long type)
@@ -3030,7 +3031,7 @@ int put_old_guest_table(struct vcpu *v)
if ( !v->arch.old_guest_table )
return 0;

- switch ( rc = _put_page_type(v->arch.old_guest_table, true,
+ switch ( rc = _put_page_type(v->arch.old_guest_table, PTF_preemptible,
v->arch.old_guest_ptpg) )
{
case -EINTR:
--
2.23.0
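
A minimal sketch of the boolean-to-flags conversion (not Xen code; the stub mirrors how the patched _put_page_type() derives its old boolean from the flags word):

    #include <stdbool.h>

    /* Flag value mirrors the one defined in mm.c by the previous patch. */
    #define PTF_preemptible (1 << 2)

    static int put_page_type_stub(unsigned int flags)
    {
        /* Callee recovers the old boolean internally, as in the patch. */
        bool preemptible = flags & PTF_preemptible;

        return preemptible ? 1 : 0;
    }

    int main(void)
    {
        /* Before: _put_page_type(pg, true, ...);  After: a named flag. */
        return put_page_type_stub(PTF_preemptible) - 1;
    }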

@@ -1,79 +0,0 @@
From 6f257854c8778774210281c5c21028c4b7739b44 Mon Sep 17 00:00:00 2001
From: George Dunlap <george.dunlap@citrix.com>
Date: Thu, 10 Oct 2019 17:57:49 +0100
Subject: [PATCH 05/11] x86/mm: Rework get_page_and_type_from_mfn conditional

Make it easier to read by declaring the conditions in which we will
retain the ref, rather than the conditions under which we release it.

The only way (page == current->arch.old_guest_table) can be true is if
preemptible is true; so remove this from the query itself, and add an
ASSERT() to that effect on the opposite path.

No functional change intended.

NB that alloc_lN_table() mishandles the "linear pt failure" situation
described in the comment; this will be addressed in a future patch.

This is part of XSA-299.

Reported-by: George Dunlap <george.dunlap@citrix.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
 xen/arch/x86/mm.c | 39 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 0740b61af8..0a4d39a2c3 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1122,8 +1122,43 @@ static int get_page_and_type_from_mfn(

rc = _get_page_type(page, type, preemptible);

- if ( unlikely(rc) && !partial_ref &&
- (!preemptible || page != current->arch.old_guest_table) )
+ /*
+ * Retain the refcount if:
+ * - page is fully validated (rc == 0)
+ * - page is not validated (rc < 0) but:
+ * - We came in with a reference (partial_ref)
+ * - page is partially validated but there's been an error
+ * (page == current->arch.old_guest_table)
+ *
+ * The partial_ref-on-error clause is worth an explanation. There
+ * are two scenarios where partial_ref might be true coming in:
+ * - mfn has been partially demoted as type `type`; i.e. has
+ * PGT_partial set
+ * - mfn has been partially demoted as L(type+1) (i.e., a linear
+ * page; e.g. we're being called from get_page_from_l2e with
+ * type == PGT_l1_table, but the mfn is PGT_l2_table)
+ *
+ * If there's an error, in the first case, _get_page_type will
+ * either return -ERESTART, in which case we want to retain the
+ * ref (as the caller will consider it retained), or -EINVAL, in
+ * which case old_guest_table will be set; in both cases, we need
+ * to retain the ref.
+ *
+ * In the second case, if there's an error, _get_page_type() can
+ * *only* return -EINVAL, and *never* set old_guest_table. In
+ * that case we also want to retain the reference, to allow the
+ * page to continue to be torn down (i.e., PGT_partial cleared)
+ * safely.
+ *
+ * Also note that we shouldn't be able to leave with the reference
+ * count retained unless we succeeded, or the operation was
+ * preemptible.
+ */
+ if ( likely(!rc) || partial_ref )
+ /* nothing */;
+ else if ( page == current->arch.old_guest_table )
+ ASSERT(preemptible);
+ else
put_page(page);

return rc;
--
2.23.0
|
||||
|
|
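For illustration, a standalone C sketch of the "state when to retain, not when to release" restructuring in the hunk above (a simplified model under invented names; retain_ref() and the -22/-EINVAL stand-ins are not Xen code):

    #include <assert.h>
    #include <stdbool.h>

    /* Hypothetical model of the rewritten conditional in
     * get_page_and_type_from_mfn(): enumerate the cases in which the
     * general reference is retained, instead of those where it drops. */
    static bool retain_ref(int rc, bool partial_ref,
                           bool is_old_guest_table, bool preemptible)
    {
        if ( rc == 0 || partial_ref )
            return true;          /* validated, or ref came in with us */
        else if ( is_old_guest_table )
        {
            assert(preemptible);  /* only reachable on preemptible paths */
            return true;          /* error path keeps the ref */
        }
        return false;             /* otherwise drop it (put_page) */
    }

    int main(void)
    {
        assert(retain_ref(0, false, false, false));    /* rc == 0 */
        assert(retain_ref(-22, true, false, false));   /* partial_ref */
        assert(!retain_ref(-22, false, false, true));  /* plain failure */
        return 0;
    }
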
@@ -1,111 +0,0 @@
From 4ad70553611a7a4e4494d5a3b51b5cc295a488e0 Mon Sep 17 00:00:00 2001
From: George Dunlap <george.dunlap@citrix.com>
Date: Thu, 10 Oct 2019 17:57:49 +0100
Subject: [PATCH 06/11] x86/mm: Have alloc_l[23]_table clear partial_flags when
 preempting

In order to allow recursive pagetable promotions and demotions to be
interrupted, Xen must keep track of the state of the sub-pages
promoted or demoted.  This is stored in two elements in the page
struct: nr_entries_validated and partial_flags.

The rule is that entries [0, nr_entries_validated) should always be
validated and hold a general reference count.  If partial_flags is
zero, then [nr_entries_validated] is not validated and no reference
count is held.  If PTF_partial_set is set, then [nr_entries_validated]
is partially validated.

At the moment, a distinction is made between promotion and demotion
with regard to whether the entry itself "holds" a general reference
count: when entry promotion is interrupted (i.e., returns -ERESTART),
the entry is not considered to hold a reference; when entry demotion
is interrupted, the entry is still considered to hold a general
reference.

PTF_partial_general_ref is used to distinguish between these cases.
If clear, it's a partial promotion => no general reference count held
by the entry; if set, it's partial demotion, so a general reference
count held.  Because promotions and demotions can be interleaved, this
value is passed to get_page_and_type_from_mfn and put_page_from_l*e,
to be able to properly handle reference counts.

Unfortunately, when alloc_l[23]_table check hypercall_preempt_check()
and return -ERESTART, they set nr_entries_validated, but don't clear
partial_flags.

If we were picking up from a previously-interrupted promotion, that
means that PTF_partial_set would be set even though
[nr_entries_validated] was not partially validated.  This means that
if the page in this state were de-validated, put_page_type() would
erroneously be called on that entry.

Perhaps worse, if we were racing with a de-validation, then we might
leave both PTF_partial_set and PTF_partial_general_ref; and when
de-validation picked up again, both the type and the general ref would
be erroneously dropped from [nr_entries_validated].

In a sense, the real issue here is code duplication.  Rather than
duplicate the interruption code, set rc to -EINTR and fall through to
the code which already handles that case correctly.

Given the logic at this point, it should be impossible for
partial_flags to be non-zero; add an ASSERT() to catch any changes.

This is part of XSA-299.

Reported-by: George Dunlap <george.dunlap@citrix.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
 xen/arch/x86/mm.c | 25 ++++++-------------------
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 0a4d39a2c3..bbd29a68f4 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1554,21 +1554,13 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
     for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES;
           i++, partial_flags = 0 )
     {
-        l2_pgentry_t l2e;
+        l2_pgentry_t l2e = pl2e[i];
 
         if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
-        {
-            page->nr_validated_ptes = i;
-            rc = -ERESTART;
-            break;
-        }
-
-        if ( !is_guest_l2_slot(d, type, i) )
+            rc = -EINTR;
+        else if ( !is_guest_l2_slot(d, type, i) )
             continue;
-
-        l2e = pl2e[i];
-
-        if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
+        else if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
         {
             if ( !pv_l1tf_check_l2e(d, l2e) )
                 continue;
@@ -1640,13 +1632,8 @@ static int alloc_l3_table(struct page_info *page)
         l3_pgentry_t l3e = pl3e[i];
 
         if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
-        {
-            page->nr_validated_ptes = i;
-            rc = -ERESTART;
-            break;
-        }
-
-        if ( is_pv_32bit_domain(d) && (i == 3) )
+            rc = -EINTR;
+        else if ( is_pv_32bit_domain(d) && (i == 3) )
         {
             if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ||
                  (l3e_get_flags(l3e) & l3_disallow_mask(d)) )
--
2.23.0

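For illustration, a runnable C sketch of the deduplication pattern the patch above applies: instead of open-coding "record progress and break" at the preemption check, set rc to -EINTR and let the single shared exit path handle it (the entries, validate() and preempt_at are invented for the demo):

    #include <stdio.h>

    #define EINTR_DEMO 4
    #define N 8

    static int validate(int e) { return e >= 0 ? 0 : -1; }

    int main(void)
    {
        int entries[N] = { 1, 2, 3, 4, 5, 6, 7, 8 };
        int i, rc = 0, preempt_at = 5;

        for ( i = 0; i < N; i++ )
        {
            if ( i == preempt_at )  /* stands in for hypercall_preempt_check() */
                rc = -EINTR_DEMO;
            else
                rc = validate(entries[i]);
            if ( rc < 0 )
                break;              /* one exit path for all failure kinds */
        }
        if ( rc == -EINTR_DEMO )    /* shared interruption handling */
            printf("interrupted at %d; partial_flags cleared, resume later\n", i);
        return 0;
    }
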
@@ -1,378 +0,0 @@
From 51fe4e67d954649fcf103116be6206a769f0db1e Mon Sep 17 00:00:00 2001
From: George Dunlap <george.dunlap@citrix.com>
Date: Thu, 10 Oct 2019 17:57:49 +0100
Subject: [PATCH 07/11] x86/mm: Always retain a general ref on partial

In order to allow recursive pagetable promotions and demotions to be
interrupted, Xen must keep track of the state of the sub-pages
promoted or demoted.  This is stored in two elements in the page struct:
nr_entries_validated and partial_flags.

The rule is that entries [0, nr_entries_validated) should always be
validated and hold a general reference count.  If partial_flags is
zero, then [nr_entries_validated] is not validated and no reference
count is held.  If PTF_partial_set is set, then [nr_entries_validated]
is partially validated.

At the moment, a distinction is made between promotion and demotion
with regard to whether the entry itself "holds" a general reference
count: when entry promotion is interrupted (i.e., returns -ERESTART),
the entry is not considered to hold a reference; when entry demotion
is interrupted, the entry is still considered to hold a general
reference.

PTF_partial_general_ref is used to distinguish between these cases.
If clear, it's a partial promotion => no general reference count held
by the entry; if set, it's partial demotion, so a general reference
count held.  Because promotions and demotions can be interleaved, this
value is passed to get_page_and_type_from_mfn and put_page_from_l*e,
to be able to properly handle reference counts.

Unfortunately, because a refcount is not held, it is possible to
engineer a situation where PTF_partial_set is set but the page in
question has been assigned to another domain.  A sketch is provided in
the appendix.

Fix this by having the parent page table entry hold a general
reference count whenever PTF_partial_set is set.  (For clarity of
change, keep two separate flags.  These will be collapsed in a
subsequent changeset.)

This has two basic implications.  On the put_page_from_lNe() side,
this means that the (partial_set && !partial_ref) case can never happen,
and no longer needs to be special-cased.

Secondly, because both flags are set together, there's no need to carry over
existing bits from partial_pte.

(NB there is still another issue with calling _put_page_type() on a
page which had PGT_partial set; that will be handled in a subsequent
patch.)

On the get_page_and_type_from_mfn() side, we need to distinguish
between callers which hold a reference on partial (i.e.,
alloc_lN_table()), and those which do not (new_cr3, PIN_LN_TABLE, and
so on): pass a flag if the type should be retained on interruption.

NB that since l1 promotion can't be preempted, that get_page_from_l2e
can't return -ERESTART.

This is part of XSA-299.

Reported-by: George Dunlap <george.dunlap@citrix.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
-----
* Appendix: Engineering PTF_partial_set while a page belongs to a
  foreign domain

Suppose A is a page which can be promoted to an l3, and B is a page
which can be promoted to an l2, and A[x] points to B.  B has
PGC_allocated set but no other general references.

V1: PIN_L3 A.
  A is validated, B is validated.
  A.type_count = 1 | PGT_validated | PGT_pinned
  B.type_count = 1 | PGT_validated
  B.count = 2 | PGC_allocated (A[x] holds a general ref)

V1: UNPIN A.
  A begins de-validation.
  Arrange to be interrupted when i < x
  V1->old_guest_table = A
  V1->old_guest_table_ref_held = false
  A.type_count = 1 | PGT_partial
  A.nr_validated_entries = i < x
  B.type_count = 0
  B.count = 1 | PGC_allocated

V2: MOD_L4_ENTRY to point some l4e to A.
  Picks up re-validation of A.
  Arrange to be interrupted halfway through B's validation
  B.type_count = 1 | PGT_partial
  B.count = 2 | PGC_allocated (PGT_partial holds a general ref)
  A.type_count = 1 | PGT_partial
  A.nr_validated_entries = x
  A.partial_pte = PTF_partial_set

V3: MOD_L3_ENTRY to point some other l3e (not in A) to B.
  Validates B.
  B.type_count = 1 | PGT_validated
  B.count = 2 | PGC_allocated ("other l3e" holds a general ref)

V3: MOD_L3_ENTRY to clear l3e pointing to B.
  Devalidates B.
  B.type_count = 0
  B.count = 1 | PGC_allocated

V3: decrease_reservation(B)
  Clears PGC_allocated
  B.count = 0 => B is freed

B gets assigned to a different domain

V1: Restarts UNPIN of A
  put_old_guest_table(A)
    ...
    free_l3_table(A)

Now since A.partial_flags has PTF_partial_set, free_l3_table() will
call put_page_from_l3e() on A[x], which points to B, while B is owned
by another domain.

If A[x] held a general refcount for B on partial validation, as it does
for partial de-validation, then B would still have a reference count of
1 after PGC_allocated was freed; so B wouldn't be freed until after
put_page_from_l3e() had happened on A[x].
---
 xen/arch/x86/mm.c        | 84 +++++++++++++++++++++++-----------------
 xen/include/asm-x86/mm.h | 15 ++++---
 2 files changed, 58 insertions(+), 41 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index bbd29a68f4..4d3ebf341d 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1102,10 +1102,11 @@ get_page_from_l1e(
  * page->pte[page->nr_validated_entries].  See the comment in mm.h for
  * more information.
  */
-#define PTF_partial_set           (1 << 0)
-#define PTF_partial_general_ref   (1 << 1)
-#define PTF_preemptible           (1 << 2)
-#define PTF_defer                 (1 << 3)
+#define PTF_partial_set             (1 << 0)
+#define PTF_partial_general_ref     (1 << 1)
+#define PTF_preemptible             (1 << 2)
+#define PTF_defer                   (1 << 3)
+#define PTF_retain_ref_on_restart   (1 << 4)
 
 static int get_page_and_type_from_mfn(
     mfn_t mfn, unsigned long type, struct domain *d,
@@ -1114,7 +1115,11 @@ static int get_page_and_type_from_mfn(
     struct page_info *page = mfn_to_page(mfn);
     int rc;
     bool preemptible = flags & PTF_preemptible,
-         partial_ref = flags & PTF_partial_general_ref;
+         partial_ref = flags & PTF_partial_general_ref,
+         partial_set = flags & PTF_partial_set,
+         retain_ref  = flags & PTF_retain_ref_on_restart;
+
+    ASSERT(partial_ref == partial_set);
 
     if ( likely(!partial_ref) &&
          unlikely(!get_page_from_mfn(mfn, d)) )
@@ -1127,13 +1132,15 @@ static int get_page_and_type_from_mfn(
      * - page is fully validated (rc == 0)
      * - page is not validated (rc < 0) but:
      *   - We came in with a reference (partial_ref)
+     *   - page is partially validated (rc == -ERESTART), and the
+     *     caller has asked the ref to be retained in that case
      *   - page is partially validated but there's been an error
      *     (page == current->arch.old_guest_table)
      *
      * The partial_ref-on-error clause is worth an explanation.  There
      * are two scenarios where partial_ref might be true coming in:
-     * - mfn has been partially demoted as type `type`; i.e. has
-     *   PGT_partial set
+     * - mfn has been partially promoted / demoted as type `type`;
+     *   i.e. has PGT_partial set
      * - mfn has been partially demoted as L(type+1) (i.e., a linear
      *   page; e.g. we're being called from get_page_from_l2e with
      *   type == PGT_l1_table, but the mfn is PGT_l2_table)
@@ -1156,7 +1163,8 @@ static int get_page_and_type_from_mfn(
      */
     if ( likely(!rc) || partial_ref )
         /* nothing */;
-    else if ( page == current->arch.old_guest_table )
+    else if ( page == current->arch.old_guest_table ||
+              (retain_ref && rc == -ERESTART) )
         ASSERT(preemptible);
     else
         put_page(page);
@@ -1354,8 +1362,8 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
     if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
          PTF_partial_set )
     {
-        ASSERT(!(flags & PTF_defer));
-        rc = _put_page_type(pg, PTF_preemptible, ptpg);
+        /* partial_set should always imply partial_ref */
+        BUG();
     }
     else if ( flags & PTF_defer )
     {
@@ -1400,8 +1408,8 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
     if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
          PTF_partial_set )
     {
-        ASSERT(!(flags & PTF_defer));
-        return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
+        /* partial_set should always imply partial_ref */
+        BUG();
     }
 
     if ( flags & PTF_defer )
@@ -1431,8 +1439,8 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
     if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
          PTF_partial_set )
     {
-        ASSERT(!(flags & PTF_defer));
-        return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
+        /* partial_set should always imply partial_ref */
+        BUG();
     }
 
     if ( flags & PTF_defer )
@@ -1569,13 +1577,22 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
         else
             rc = get_page_from_l2e(l2e, pfn, d, partial_flags);
 
-        if ( rc == -ERESTART )
-        {
-            page->nr_validated_ptes = i;
-            /* Set 'set', retain 'general ref' */
-            page->partial_flags = partial_flags | PTF_partial_set;
-        }
-        else if ( rc == -EINTR && i )
+        /*
+         * It shouldn't be possible for get_page_from_l2e to return
+         * -ERESTART, since we never call this with PTF_preemptible.
+         * (alloc_l1_table may return -EINTR on an L1TF-vulnerable
+         * entry.)
+         *
+         * NB that while on a "clean" promotion, we can never get
+         * PGT_partial.  It is possible to arrange for an l2e to
+         * contain a partially-devalidated l2; but in that case, both
+         * of the following functions will fail anyway (the first
+         * because the page in question is not an l1; the second
+         * because the page is not fully validated).
+         */
+        ASSERT(rc != -ERESTART);
+
+        if ( rc == -EINTR && i )
         {
             page->nr_validated_ptes = i;
             page->partial_flags = 0;
@@ -1584,6 +1601,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
         else if ( rc < 0 && rc != -EINTR )
         {
             gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i);
+            ASSERT(current->arch.old_guest_table == NULL);
             if ( i )
             {
                 page->nr_validated_ptes = i;
@@ -1642,7 +1660,7 @@ static int alloc_l3_table(struct page_info *page)
             rc = get_page_and_type_from_mfn(
                 l3e_get_mfn(l3e),
                 PGT_l2_page_table | PGT_pae_xen_l2, d,
-                partial_flags | PTF_preemptible);
+                partial_flags | PTF_preemptible | PTF_retain_ref_on_restart);
         }
         else if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
         {
@@ -1651,13 +1669,14 @@ static int alloc_l3_table(struct page_info *page)
             rc = -EINTR;
         }
         else
-            rc = get_page_from_l3e(l3e, pfn, d, partial_flags);
+            rc = get_page_from_l3e(l3e, pfn, d,
+                                   partial_flags | PTF_retain_ref_on_restart);
 
         if ( rc == -ERESTART )
         {
             page->nr_validated_ptes = i;
             /* Set 'set', leave 'general ref' set if this entry was set */
-            page->partial_flags = partial_flags | PTF_partial_set;
+            page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
         }
         else if ( rc == -EINTR && i )
         {
@@ -1833,13 +1852,14 @@ static int alloc_l4_table(struct page_info *page)
             rc = -EINTR;
         }
         else
-            rc = get_page_from_l4e(l4e, pfn, d, partial_flags);
+            rc = get_page_from_l4e(l4e, pfn, d,
+                                   partial_flags | PTF_retain_ref_on_restart);
 
         if ( rc == -ERESTART )
        {
             page->nr_validated_ptes = i;
             /* Set 'set', leave 'general ref' set if this entry was set */
-            page->partial_flags = partial_flags | PTF_partial_set;
+            page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
         }
         else if ( rc < 0 )
         {
@@ -1936,9 +1956,7 @@ static int free_l2_table(struct page_info *page)
     else if ( rc == -ERESTART )
     {
         page->nr_validated_ptes = i;
-        page->partial_flags = (partial_flags & PTF_partial_set) ?
-            partial_flags :
-            (PTF_partial_set | PTF_partial_general_ref);
+        page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
     }
     else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 )
     {
@@ -1986,9 +2004,7 @@ static int free_l3_table(struct page_info *page)
     if ( rc == -ERESTART )
     {
         page->nr_validated_ptes = i;
-        page->partial_flags = (partial_flags & PTF_partial_set) ?
-            partial_flags :
-            (PTF_partial_set | PTF_partial_general_ref);
+        page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
     }
     else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
     {
@@ -2019,9 +2035,7 @@ static int free_l4_table(struct page_info *page)
     if ( rc == -ERESTART )
     {
         page->nr_validated_ptes = i;
-        page->partial_flags = (partial_flags & PTF_partial_set) ?
-            partial_flags :
-            (PTF_partial_set | PTF_partial_general_ref);
+        page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
     }
     else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
     {
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 8406ac3c37..02079e1324 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -238,22 +238,25 @@ struct page_info
      * page.
      *
      * This happens:
-     * - During de-validation, if de-validation of the page was
+     * - During validation or de-validation, if the operation was
      *   interrupted
      * - During validation, if an invalid entry is encountered and
      *   validation is preemptible
      * - During validation, if PTF_partial_general_ref was set on
-     *   this entry to begin with (perhaps because we're picking
-     *   up from a partial de-validation).
+     *   this entry to begin with (perhaps because it picked up a
+     *   previous operation)
      *
-     * When resuming validation, if PTF_partial_general_ref is clear,
-     * then a general reference must be re-acquired; if it is set, no
-     * reference should be acquired.
+     * When resuming validation, if PTF_partial_general_ref is
+     * clear, then a general reference must be re-acquired; if it
+     * is set, no reference should be acquired.
      *
      * When resuming de-validation, if PTF_partial_general_ref is
      * clear, no reference should be dropped; if it is set, a
      * reference should be dropped.
      *
+     * NB at the moment, PTF_partial_set should be set if and only if
+     * PTF_partial_general_ref is set.
+     *
      * NB that PTF_partial_set and PTF_partial_general_ref are
      * defined in mm.c, the only place where they are used.
      *
--
2.23.0

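For illustration, a small C model of the bookkeeping rule this patch enforces: entries [0, nr_validated) are fully validated and hold a general ref, and after this change the partial entry at [nr_validated] holds one too. This is a toy under invented names (struct progress, entry_holds_ref), not the Xen structures:

    #include <assert.h>
    #include <stdbool.h>

    struct progress {
        unsigned int nr_validated;
        bool partial_set;  /* entry [nr_validated] partially de/validated */
    };

    static bool entry_holds_ref(const struct progress *p, unsigned int idx)
    {
        if ( idx < p->nr_validated )
            return true;               /* fully validated entries */
        if ( idx == p->nr_validated )
            return p->partial_set;     /* partial entry now keeps a ref too */
        return false;                  /* untouched entries */
    }

    int main(void)
    {
        struct progress p = { .nr_validated = 3, .partial_set = true };
        assert(entry_holds_ref(&p, 0));
        assert(entry_holds_ref(&p, 3));   /* the partially validated entry */
        assert(!entry_holds_ref(&p, 4));
        return 0;
    }
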
@@ -1,227 +0,0 @@
From 8a8d836f7f7418e659d37817a66cd7a6b115042b Mon Sep 17 00:00:00 2001
From: George Dunlap <george.dunlap@citrix.com>
Date: Thu, 10 Oct 2019 17:57:49 +0100
Subject: [PATCH 08/11] x86/mm: Collapse PTF_partial_set and
 PTF_partial_general_ref into one

...now that they are equivalent.  No functional change intended.

Reported-by: George Dunlap <george.dunlap@citrix.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
 xen/arch/x86/mm.c        | 50 +++++++++++-----------------
 xen/include/asm-x86/mm.h | 29 +++++++++++------------
 2 files changed, 26 insertions(+), 53 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 4d3ebf341d..886e93b8aa 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1097,13 +1097,12 @@ get_page_from_l1e(
 
 /*
  * The following flags are used to specify behavior of various get and
- * put commands.  The first two are also stored in page->partial_flags
- * to indicate the state of the page pointed to by
+ * put commands.  The first is also stored in page->partial_flags to
+ * indicate the state of the page pointed to by
  * page->pte[page->nr_validated_entries].  See the comment in mm.h for
  * more information.
  */
 #define PTF_partial_set             (1 << 0)
-#define PTF_partial_general_ref     (1 << 1)
 #define PTF_preemptible             (1 << 2)
 #define PTF_defer                   (1 << 3)
 #define PTF_retain_ref_on_restart   (1 << 4)
@@ -1115,13 +1114,10 @@ static int get_page_and_type_from_mfn(
     struct page_info *page = mfn_to_page(mfn);
     int rc;
     bool preemptible = flags & PTF_preemptible,
-         partial_ref = flags & PTF_partial_general_ref,
          partial_set = flags & PTF_partial_set,
          retain_ref  = flags & PTF_retain_ref_on_restart;
 
-    ASSERT(partial_ref == partial_set);
-
-    if ( likely(!partial_ref) &&
+    if ( likely(!partial_set) &&
          unlikely(!get_page_from_mfn(mfn, d)) )
         return -EINVAL;
 
@@ -1131,14 +1127,14 @@ static int get_page_and_type_from_mfn(
      * Retain the refcount if:
      * - page is fully validated (rc == 0)
      * - page is not validated (rc < 0) but:
-     *   - We came in with a reference (partial_ref)
+     *   - We came in with a reference (partial_set)
      *   - page is partially validated (rc == -ERESTART), and the
      *     caller has asked the ref to be retained in that case
      *   - page is partially validated but there's been an error
      *     (page == current->arch.old_guest_table)
      *
-     * The partial_ref-on-error clause is worth an explanation.  There
-     * are two scenarios where partial_ref might be true coming in:
+     * The partial_set-on-error clause is worth an explanation.  There
+     * are two scenarios where partial_set might be true coming in:
      * - mfn has been partially promoted / demoted as type `type`;
      *   i.e. has PGT_partial set
      * - mfn has been partially demoted as L(type+1) (i.e., a linear
@@ -1161,7 +1157,7 @@ static int get_page_and_type_from_mfn(
      * count retained unless we succeeded, or the operation was
      * preemptible.
      */
-    if ( likely(!rc) || partial_ref )
+    if ( likely(!rc) || partial_set )
         /* nothing */;
     else if ( page == current->arch.old_guest_table ||
               (retain_ref && rc == -ERESTART) )
@@ -1359,13 +1355,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
     struct page_info *pg = l2e_get_page(l2e);
     struct page_info *ptpg = mfn_to_page(_mfn(pfn));
 
-    if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
-         PTF_partial_set )
-    {
-        /* partial_set should always imply partial_ref */
-        BUG();
-    }
-    else if ( flags & PTF_defer )
+    if ( flags & PTF_defer )
     {
         current->arch.old_guest_ptpg = ptpg;
         current->arch.old_guest_table = pg;
@@ -1405,13 +1395,6 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
 
     pg = l3e_get_page(l3e);
 
-    if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
-         PTF_partial_set )
-    {
-        /* partial_set should always imply partial_ref */
-        BUG();
-    }
-
     if ( flags & PTF_defer )
     {
         current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
@@ -1436,13 +1419,6 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
 {
     struct page_info *pg = l4e_get_page(l4e);
 
-    if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
-         PTF_partial_set )
-    {
-        /* partial_set should always imply partial_ref */
-        BUG();
-    }
-
     if ( flags & PTF_defer )
     {
         current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
@@ -1676,7 +1652,7 @@ static int alloc_l3_table(struct page_info *page)
         {
             page->nr_validated_ptes = i;
             /* Set 'set', leave 'general ref' set if this entry was set */
-            page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
+            page->partial_flags = PTF_partial_set;
         }
         else if ( rc == -EINTR && i )
         {
@@ -1859,7 +1835,7 @@ static int alloc_l4_table(struct page_info *page)
         {
             page->nr_validated_ptes = i;
             /* Set 'set', leave 'general ref' set if this entry was set */
-            page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
+            page->partial_flags = PTF_partial_set;
         }
         else if ( rc < 0 )
         {
@@ -1956,7 +1932,7 @@ static int free_l2_table(struct page_info *page)
     else if ( rc == -ERESTART )
     {
         page->nr_validated_ptes = i;
-        page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
+        page->partial_flags = PTF_partial_set;
     }
     else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 )
     {
@@ -2004,7 +1980,7 @@ static int free_l3_table(struct page_info *page)
     if ( rc == -ERESTART )
     {
         page->nr_validated_ptes = i;
-        page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
+        page->partial_flags = PTF_partial_set;
     }
     else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
     {
@@ -2035,7 +2011,7 @@ static int free_l4_table(struct page_info *page)
     if ( rc == -ERESTART )
     {
         page->nr_validated_ptes = i;
-        page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
+        page->partial_flags = PTF_partial_set;
     }
     else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
     {
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 02079e1324..f0fd35bf6b 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -233,7 +233,7 @@ struct page_info
      * operation on the current page.  (That page may or may not
      * still have PGT_partial set.)
      *
-     * If PTF_partial_general_ref is set, then the PTE at
+     * Additionally, if PTF_partial_set is set, then the PTE at
      * @nr_validated_ptef holds a general reference count for the
      * page.
      *
@@ -242,23 +242,20 @@ struct page_info
      *   interrupted
      * - During validation, if an invalid entry is encountered and
      *   validation is preemptible
-     * - During validation, if PTF_partial_general_ref was set on
-     *   this entry to begin with (perhaps because it picked up a
+     * - During validation, if PTF_partial_set was set on this
+     *   entry to begin with (perhaps because it picked up a
      *   previous operation)
      *
-     * When resuming validation, if PTF_partial_general_ref is
-     * clear, then a general reference must be re-acquired; if it
-     * is set, no reference should be acquired.
+     * When resuming validation, if PTF_partial_set is clear, then
+     * a general reference must be re-acquired; if it is set, no
+     * reference should be acquired.
      *
-     * When resuming de-validation, if PTF_partial_general_ref is
-     * clear, no reference should be dropped; if it is set, a
-     * reference should be dropped.
+     * When resuming de-validation, if PTF_partial_set is clear,
+     * no reference should be dropped; if it is set, a reference
+     * should be dropped.
      *
-     * NB at the moment, PTF_partial_set should be set if and only if
-     * PTF_partial_general_ref is set.
-     *
-     * NB that PTF_partial_set and PTF_partial_general_ref are
-     * defined in mm.c, the only place where they are used.
+     * NB that PTF_partial_set is defined in mm.c, the only place
+     * where it is used.
      *
      * The 3rd field, @linear_pt_count, indicates
      * - by a positive value, how many same-level page table entries a page
@@ -268,8 +265,8 @@ struct page_info
      */
     struct {
         u16 nr_validated_ptes:PAGETABLE_ORDER + 1;
-        u16 :16 - PAGETABLE_ORDER - 1 - 2;
-        u16 partial_flags:2;
+        u16 :16 - PAGETABLE_ORDER - 1 - 1;
+        u16 partial_flags:1;
         s16 linear_pt_count;
     };
 
--
2.23.0

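For illustration, a compilable copy of the bit layout the final mm.h hunk above arrives at: the anonymous pad widens by one bit because partial_flags now needs only the single PTF_partial_set bit. The struct name is invented; PAGETABLE_ORDER is the x86 value from the Xen headers:

    #include <stdio.h>

    #define PAGETABLE_ORDER 9

    struct demo_pt_info {
        unsigned short nr_validated_ptes:PAGETABLE_ORDER + 1; /* 10 bits */
        unsigned short :16 - PAGETABLE_ORDER - 1 - 1;         /*  5 bits pad */
        unsigned short partial_flags:1;                       /*  1 bit  */
        short linear_pt_count;
    };

    int main(void)
    {
        /* Everything still packs into two 16-bit words. */
        printf("sizeof = %zu\n", sizeof(struct demo_pt_info)); /* typically 4 */
        return 0;
    }
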
@@ -1,106 +0,0 @@
From da3d1d258e54fe600f7f75287183b74d957ec63b Mon Sep 17 00:00:00 2001
From: George Dunlap <george.dunlap@citrix.com>
Date: Thu, 10 Oct 2019 17:57:49 +0100
Subject: [PATCH 09/11] x86/mm: Properly handle linear pagetable promotion
 failures

In order to allow recursive pagetable promotions and demotions to be
interrupted, Xen must keep track of the state of the sub-pages
promoted or demoted.  This is stored in two elements in the page
struct: nr_entries_validated and partial_flags.

The rule is that entries [0, nr_entries_validated) should always be
validated and hold a general reference count.  If partial_flags is
zero, then [nr_entries_validated] is not validated and no reference
count is held.  If PTF_partial_set is set, then [nr_entries_validated]
is partially validated, and a general reference count is held.

Unfortunately, in cases where an entry began with PTF_partial_set set,
and get_page_from_lNe() returns -EINVAL, the PTF_partial_set bit is
erroneously dropped.  (This scenario can be engineered mainly by the
use of interleaving of promoting and demoting a page which has "linear
pagetable" entries; see the appendix for a sketch.)  This means that
we will "leak" a general reference count on the page in question,
preventing the page from being freed.

Fix this by setting page->partial_flags to the partial_flags local
variable.

This is part of XSA-299.

Reported-by: George Dunlap <george.dunlap@citrix.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
-----
Appendix

Suppose A and B can both be promoted to L2 pages, and A[x] points to B.

V1: PIN_L2 B.
  B.type_count = 1 | PGT_validated
  B.count = 2 | PGC_allocated

V1: MOD_L3_ENTRY pointing something to A.
  In the process of validating A[x], grab an extra type / ref on B:
  B.type_count = 2 | PGT_validated
  B.count = 3 | PGC_allocated
  A.type_count = 1 | PGT_validated
  A.count = 2 | PGC_allocated

V1: UNPIN B.
  B.type_count = 1 | PGT_validated
  B.count = 2 | PGC_allocated

V1: MOD_L3_ENTRY removing the reference to A.
  De-validate A, down to A[x], which points to B.
  Drop the final type on B.  Arrange to be interrupted.
  B.type_count = 1 | PGT_partial
  B.count = 2 | PGC_allocated
  A.type_count = 1 | PGT_partial
  A.nr_validated_entries = x
  A.partial_pte = -1

V2: MOD_L3_ENTRY adds a reference to A.

At this point, get_page_from_l2e(A[x]) tries
get_page_and_type_from_mfn(), which fails because it's the wrong type;
and get_l2_linear_pagetable() also fails, because B isn't validated as
an l2 anymore.
---
 xen/arch/x86/mm.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 886e93b8aa..0a094291da 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1581,7 +1581,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
             if ( i )
             {
                 page->nr_validated_ptes = i;
-                page->partial_flags = 0;
+                page->partial_flags = partial_flags;
                 current->arch.old_guest_ptpg = NULL;
                 current->arch.old_guest_table = page;
             }
@@ -1674,7 +1674,7 @@ static int alloc_l3_table(struct page_info *page)
         if ( i )
         {
             page->nr_validated_ptes = i;
-            page->partial_flags = 0;
+            page->partial_flags = partial_flags;
             current->arch.old_guest_ptpg = NULL;
             current->arch.old_guest_table = page;
         }
@@ -1845,7 +1845,7 @@ static int alloc_l4_table(struct page_info *page)
         if ( i )
         {
             page->nr_validated_ptes = i;
-            page->partial_flags = 0;
+            page->partial_flags = partial_flags;
             if ( rc == -EINTR )
                 rc = -ERESTART;
             else
--
2.23.0

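For illustration, the one-line fix above in miniature: on the error path, the saved partial_flags value must be written back rather than hard-coded to 0, or a PTF_partial_set entry loses its flag and leaks its general ref. Types and names here (demo_page, record_failure) are stand-ins, not the Xen ones:

    #include <assert.h>

    #define PTF_partial_set (1 << 0)

    struct demo_page { unsigned int partial_flags; };

    static void record_failure(struct demo_page *pg, unsigned int partial_flags)
    {
        /* was: pg->partial_flags = 0;  -- dropped PTF_partial_set */
        pg->partial_flags = partial_flags;
    }

    int main(void)
    {
        struct demo_page pg = { 0 };
        record_failure(&pg, PTF_partial_set);
        assert(pg.partial_flags & PTF_partial_set); /* flag survives the error */
        return 0;
    }
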
@@ -1,166 +0,0 @@
From b3e169dc8daeae85b0b51c25fdb142e2e552ec7f Mon Sep 17 00:00:00 2001
From: George Dunlap <george.dunlap@citrix.com>
Date: Thu, 10 Oct 2019 17:57:49 +0100
Subject: [PATCH 10/11] x86/mm: Fix nested de-validation on error

If an invalid entry is discovered when validating a page-table tree,
the entire tree which has so far been validated must be de-validated.
Since this may take a long time, alloc_l[2-4]_table() set current
vcpu's old_guest_table immediately; put_old_guest_table() will make
sure that put_page_type() will be called to finish off the
de-validation before any other MMU operations can happen on the vcpu.

The invariant for partial pages should be:

* Entries [0, nr_validated_ptes) should be completely validated;
  put_page_type() will de-validate these.

* If [nr_validated_ptes] is partially validated, partial_flags should
  set PTF_partial_set.  put_page_type() will be called on this page to
  finish off devalidation, and the appropriate refcount adjustments
  will be done.

alloc_l[2-3]_table() indicates partial validation to its callers by
setting current->old_guest_table.

Unfortunately, this is mishandled.

Take the case where validating lNe[x] returns an error.

First, alloc_l3_table() doesn't check old_guest_table at all; as a
result, partial_flags is not set when it should be.  nr_validated_ptes
is set to x; and since PTF_partial_set is clear, de-validation resumes
at nr_validated_ptes-1.  This means that the l2 page at pl3e[x] will
not have put_page_type() called on it when de-validating the rest of
the l3: it will be stuck in the PGT_partial state until the domain is
destroyed, or until it is re-used as an l2.  (Any other page type will
fail.)

Worse, alloc_l4_table(), rather than setting PTF_partial_set as it
should, sets nr_validated_ptes to x+1.  When de-validating, since
partial is 0, this will correctly resume calling put_page_type at [x];
but, if the put_page_type() is never called, but instead
get_page_type() is called, validation will pick up at [x+1],
neglecting to validate [x].  If the rest of the validation succeeds,
the l4 will be validated even though [x] is invalid.

Fix this in both cases by setting PTF_partial_set if old_guest_table
is set.

While here, add some safety catches:
- old_guest_table must point to the page contained in
  [nr_validated_ptes].
- alloc_l1_page shouldn't set old_guest_table

If we experience one of these situations in production builds, it's
safer to avoid calling put_page_type for the pages in question.  If
they have PGT_partial set, they will be cleaned up on domain
destruction; if not, we have no idea whether a type count is safe to
drop.  Retaining an extra type ref that should have been dropped may
trigger a BUG() on the free_domain_page() path, but dropping a type
count that shouldn't be dropped may cause a privilege escalation.

This is part of XSA-299.

Reported-by: George Dunlap <george.dunlap@citrix.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
 xen/arch/x86/mm.c | 53 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 51 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 0a094291da..a432e69c74 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1580,6 +1580,20 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
             ASSERT(current->arch.old_guest_table == NULL);
             if ( i )
             {
+                /*
+                 * alloc_l1_table() doesn't set old_guest_table; it does
+                 * its own tear-down immediately on failure.  If it
+                 * did we'd need to check it and set partial_flags as we
+                 * do in alloc_l[34]_table().
+                 *
+                 * Note on the use of ASSERT: if it's non-null and
+                 * hasn't been cleaned up yet, it should have
+                 * PGT_partial set; and so the type will be cleaned up
+                 * on domain destruction.  Unfortunately, we would
+                 * leak the general ref held by old_guest_table; but
+                 * leaking a page is less bad than a host crash.
+                 */
+                ASSERT(current->arch.old_guest_table == NULL);
                 page->nr_validated_ptes = i;
                 page->partial_flags = partial_flags;
                 current->arch.old_guest_ptpg = NULL;
@@ -1607,6 +1621,7 @@ static int alloc_l3_table(struct page_info *page)
     unsigned int i;
     int rc = 0;
     unsigned int partial_flags = page->partial_flags;
+    l3_pgentry_t l3e = l3e_empty();
 
     pl3e = map_domain_page(_mfn(pfn));
 
@@ -1623,7 +1638,7 @@ static int alloc_l3_table(struct page_info *page)
     for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
           i++, partial_flags = 0 )
     {
-        l3_pgentry_t l3e = pl3e[i];
+        l3e = pl3e[i];
 
         if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
             rc = -EINTR;
@@ -1675,6 +1690,24 @@ static int alloc_l3_table(struct page_info *page)
         {
             page->nr_validated_ptes = i;
             page->partial_flags = partial_flags;
+            if ( current->arch.old_guest_table )
+            {
+                /*
+                 * We've experienced a validation failure.  If
+                 * old_guest_table is set, "transfer" the general
+                 * reference count to pl3e[nr_validated_ptes] by
+                 * setting PTF_partial_set.
+                 *
+                 * As a precaution, check that old_guest_table is the
+                 * page pointed to by pl3e[nr_validated_ptes].  If
+                 * not, it's safer to leak a type ref on production
+                 * builds.
+                 */
+                if ( current->arch.old_guest_table == l3e_get_page(l3e) )
+                    page->partial_flags = PTF_partial_set;
+                else
+                    ASSERT_UNREACHABLE();
+            }
             current->arch.old_guest_ptpg = NULL;
             current->arch.old_guest_table = page;
         }
@@ -1851,7 +1884,23 @@ static int alloc_l4_table(struct page_info *page)
         else
         {
             if ( current->arch.old_guest_table )
-                page->nr_validated_ptes++;
+            {
+                /*
+                 * We've experienced a validation failure.  If
+                 * old_guest_table is set, "transfer" the general
+                 * reference count to pl3e[nr_validated_ptes] by
+                 * setting PTF_partial_set.
+                 *
+                 * As a precaution, check that old_guest_table is the
+                 * page pointed to by pl4e[nr_validated_ptes].  If
+                 * not, it's safer to leak a type ref on production
+                 * builds.
+                 */
+                if ( current->arch.old_guest_table == l4e_get_page(l4e) )
+                    page->partial_flags = PTF_partial_set;
+                else
+                    ASSERT_UNREACHABLE();
+            }
             current->arch.old_guest_ptpg = NULL;
             current->arch.old_guest_table = page;
         }
--
2.23.0

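For illustration, a toy version of the safety catch added above: only "transfer" the general ref to the partial entry when old_guest_table really is the page that entry points at; otherwise prefer leaking a type ref to mis-accounting it. All names (demo_page, transfer_ref) are illustrative, not Xen code:

    #include <assert.h>
    #include <stddef.h>

    #define PTF_partial_set (1 << 0)

    struct demo_page { unsigned int partial_flags; };

    static void transfer_ref(struct demo_page *pt, struct demo_page *entry_pg,
                             struct demo_page *old_guest_table)
    {
        if ( old_guest_table == NULL )
            return;                      /* nothing pending */
        if ( old_guest_table == entry_pg )
            pt->partial_flags = PTF_partial_set;
        /* else: mismatch; leak the type ref rather than mis-account it */
    }

    int main(void)
    {
        struct demo_page pt = { 0 }, sub = { 0 }, other = { 0 };
        transfer_ref(&pt, &sub, &sub);
        assert(pt.partial_flags == PTF_partial_set);
        pt.partial_flags = 0;
        transfer_ref(&pt, &sub, &other); /* mismatch: no transfer */
        assert(pt.partial_flags == 0);
        return 0;
    }
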
@ -1,413 +0,0 @@
|
|||
From ea3dc624c5e6325a9c2f079e52a85965d4ab6ce8 Mon Sep 17 00:00:00 2001
|
||||
From: George Dunlap <george.dunlap@citrix.com>
|
||||
Date: Thu, 10 Oct 2019 17:57:50 +0100
|
||||
Subject: [PATCH 11/11] x86/mm: Don't drop a type ref unless you held a ref to
|
||||
begin with
|
||||
|
||||
Validation and de-validation of pagetable trees may take arbitrarily
|
||||
large amounts of time, and so must be preemptible. This is indicated
|
||||
by setting the PGT_partial bit in the type_info, and setting
|
||||
nr_validated_entries and partial_flags appropriately. Specifically,
|
||||
if the entry at [nr_validated_entries] is partially validated,
|
||||
partial_flags should have the PGT_partial_set bit set, and the entry
|
||||
should hold a general reference count. During de-validation,
|
||||
put_page_type() is called on partially validated entries.
|
||||
|
||||
Unfortunately, there are a number of issues with the current algorithm.
|
||||
|
||||
First, doing a "normal" put_page_type() is not safe when no type ref
|
||||
is held: there is nothing to stop another vcpu from coming along and
|
||||
picking up validation again: at which point the put_page_type may drop
|
||||
the only page ref on an in-use page. Some examples are listed in the
|
||||
appendix.
|
||||
|
||||
The core issue is that put_page_type() is being called both to clean
|
||||
up PGT_partial, and to drop a type count; and has no way of knowing
|
||||
which is which; and so if in between, PGT_partial is cleared,
|
||||
put_page_type() will drop the type ref erroneously.
|
||||
|
||||
What is needed is to distinguish between two states:
|
||||
- Dropping a type ref which is held
|
||||
- Cleaning up a page which has been partially de/validated
|
||||
|
||||
Fix this by telling put_page_type() which of the two activities you
|
||||
intend.
|
||||
|
||||
When cleaning up a partial de/validation, take no action unless you
|
||||
find a page partially validated.
|
||||
|
||||
If put_page_type() is called without PTF_partial_set, and finds the
|
||||
page in a PGT_partial state anyway, then there's certainly been a
|
||||
misaccounting somewhere, and carrying on would almost certainly cause
|
||||
a security issue, so crash the host instead.
|
||||
|
||||
In put_page_from_lNe, pass partial_flags on to _put_page_type().
|
||||
|
||||
old_guest_table may be set either with a fully validated page (when
|
||||
using the "deferred put" pattern), or with a partially validated page
|
||||
(when a normal "de-validation" is interrupted, or when a validation
|
||||
fails part-way through due to invalid entries). Add a flag,
|
||||
old_guest_table_partial, to indicate which of these it is, and use
|
||||
that to pass the appropriate flag to _put_page_type().
|
||||
|
||||
While here, delete stray trailing whitespace.
|
||||
|
||||
This is part of XSA-299.
|
||||
|
||||
Reported-by: George Dunlap <george.dunlap@citrix.com>
|
||||
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
|
||||
Reviewed-by: Jan Beulich <jbeulich@suse.com>
|
||||
-----
|
||||
Appendix:
|
||||
|
||||
Suppose page A, when interpreted as an l3 pagetable, contains all
|
||||
valid entries; and suppose A[x] points to page B, which when
|
||||
interpreted as an l2 pagetable, contains all valid entries.
|
||||
|
||||
P1: PIN_L3_TABLE
|
||||
A -> PGT_l3_table | 1 | valid
|
||||
B -> PGT_l2_table | 1 | valid
|
||||
|
||||
P1: UNPIN_TABLE
|
||||
> Arrange to interrupt after B has been de-validated
|
||||
B:
|
||||
type_info -> PGT_l2_table | 0
|
||||
A:
|
||||
type_info -> PGT_l3_table | 1 | partial
|
||||
nr_validated_enties -> (less than x)
|
||||
|
||||
P2: mod_l4_entry to point to A
|
||||
> Arrange for this to be interrupted while B is being validated
|
||||
B:
|
||||
type_info -> PGT_l2_table | 1 | partial
|
||||
(nr_validated_entires &c set as appropriate)
|
||||
A:
|
||||
type_info -> PGT_l3_table | 1 | partial
|
||||
nr_validated_entries -> x
|
||||
partial_pte = 1
|
||||
|
||||
P3: mod_l3_entry some other unrelated l3 to point to B:
|
||||
B:
|
||||
type_info -> PGT_l2_table | 1
|
||||
|
||||
P1: Restart UNPIN_TABLE
|
||||
|
||||
At this point, since A.nr_validate_entries == x and A.partial_pte !=
|
||||
0, free_l3_table() will call put_page_from_l3e() on pl3e[x], dropping
|
||||
its type count to 0 while it's still being pointed to by some other l3
|
||||
|
||||
A similar issue arises with old_guest_table. Consider the following
|
||||
scenario:
|
||||
|
||||
Suppose A is a page which, when interpreted as an l2, has valid entries
|
||||
until entry x, which is invalid.
|
||||
|
||||
V1: PIN_L2_TABLE(A)
|
||||
<Validate until we try to validate [x], get -EINVAL>
|
||||
A -> PGT_l2_table | 1 | PGT_partial
|
||||
V1 -> old_guest_table = A
|
||||
<delayed>
|
||||
|
||||
V2: PIN_L2_TABLE(A)
|
||||
<Pick up where V1 left off, try to re-validate [x], get -EINVAL>
|
||||
A -> PGT_l2_table | 1 | PGT_partial
|
||||
V2 -> old_guest_table = A
|
||||
<restart>
|
||||
put_old_guest_table()
|
||||
_put_page_type(A)
|
||||
A -> PGT_l2_table | 0
|
||||
|
||||
V1: <restart>
|
||||
put_old_guest_table()
|
||||
_put_page_type(A) # UNDERFLOW
|
||||
|
||||
Indeed, it is possible to engineer for old_guest_table for every vcpu
|
||||
a guest has to point to the same page.
|
||||
---
|
||||
xen/arch/x86/domain.c | 6 +++
|
||||
xen/arch/x86/mm.c | 99 +++++++++++++++++++++++++++++++-----
|
||||
xen/include/asm-x86/domain.h | 4 +-
|
||||
3 files changed, 95 insertions(+), 14 deletions(-)
|
||||
|
||||
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
|
||||
index 59df8a6d8d..f1ae5f89f5 100644
|
||||
--- a/xen/arch/x86/domain.c
|
||||
+++ b/xen/arch/x86/domain.c
|
||||
@@ -1104,9 +1104,15 @@ int arch_set_info_guest(
|
||||
rc = -ERESTART;
|
||||
/* Fallthrough */
|
||||
case -ERESTART:
|
||||
+ /*
|
||||
+ * NB that we're putting the kernel-mode table
|
||||
+ * here, which we've already successfully
|
||||
+ * validated above; hence partial = false;
|
||||
+ */
|
||||
v->arch.old_guest_ptpg = NULL;
|
||||
v->arch.old_guest_table =
|
||||
pagetable_get_page(v->arch.guest_table);
|
||||
+ v->arch.old_guest_table_partial = false;
|
||||
v->arch.guest_table = pagetable_null();
|
||||
break;
|
||||
default:
|
||||
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
|
||||
index a432e69c74..81774368a0 100644
|
||||
--- a/xen/arch/x86/mm.c
|
||||
+++ b/xen/arch/x86/mm.c
|
||||
@@ -1359,10 +1359,11 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
|
||||
{
|
||||
current->arch.old_guest_ptpg = ptpg;
|
||||
current->arch.old_guest_table = pg;
|
||||
+ current->arch.old_guest_table_partial = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
- rc = _put_page_type(pg, PTF_preemptible, ptpg);
|
||||
+ rc = _put_page_type(pg, flags | PTF_preemptible, ptpg);
|
||||
if ( likely(!rc) )
|
||||
put_page(pg);
|
||||
}
|
||||
@@ -1385,6 +1386,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
|
||||
unsigned long mfn = l3e_get_pfn(l3e);
|
||||
bool writeable = l3e_get_flags(l3e) & _PAGE_RW;
|
||||
|
||||
+ ASSERT(!(flags & PTF_partial_set));
|
||||
ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
|
||||
do {
|
||||
put_data_page(mfn_to_page(_mfn(mfn)), writeable);
|
||||
@@ -1397,12 +1399,14 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
|
||||
|
||||
if ( flags & PTF_defer )
|
||||
{
|
||||
+ ASSERT(!(flags & PTF_partial_set));
|
||||
current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
|
||||
current->arch.old_guest_table = pg;
|
||||
+ current->arch.old_guest_table_partial = false;
|
||||
return 0;
|
||||
}
|
||||
|
||||
- rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
|
||||
+ rc = _put_page_type(pg, flags | PTF_preemptible, mfn_to_page(_mfn(pfn)));
|
||||
if ( likely(!rc) )
|
||||
put_page(pg);
|
||||
|
||||
@@ -1421,12 +1425,15 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
|
||||
|
||||
if ( flags & PTF_defer )
|
||||
{
|
||||
+ ASSERT(!(flags & PTF_partial_set));
|
||||
current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
|
||||
current->arch.old_guest_table = pg;
|
||||
+ current->arch.old_guest_table_partial = false;
|
||||
return 0;
|
||||
}
|
||||
|
||||
- rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
|
||||
+ rc = _put_page_type(pg, flags | PTF_preemptible,
|
||||
+ mfn_to_page(_mfn(pfn)));
|
||||
if ( likely(!rc) )
|
||||
put_page(pg);
|
||||
}
|
||||
@@ -1535,6 +1542,14 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
|
||||
|
||||
pl2e = map_domain_page(_mfn(pfn));
|
||||
|
||||
+ /*
|
||||
+ * NB that alloc_l2_table will never set partial_pte on an l2; but
|
||||
+ * free_l2_table might if a linear_pagetable entry is interrupted
|
||||
+ * partway through de-validation. In that circumstance,
|
||||
+ * get_page_from_l2e() will always return -EINVAL; and we must
|
||||
+ * retain the type ref by doing the normal partial_flags tracking.
|
||||
+ */
|
||||
+
|
||||
for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES;
|
||||
i++, partial_flags = 0 )
|
||||
{
|
||||
@@ -1598,6 +1613,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
|
||||
page->partial_flags = partial_flags;
|
||||
current->arch.old_guest_ptpg = NULL;
|
||||
current->arch.old_guest_table = page;
|
||||
+ current->arch.old_guest_table_partial = true;
|
||||
}
|
||||
}
|
||||
if ( rc < 0 )
|
||||
@@ -1704,12 +1720,16 @@ static int alloc_l3_table(struct page_info *page)
|
||||
* builds.
|
||||
*/
|
||||
if ( current->arch.old_guest_table == l3e_get_page(l3e) )
|
||||
+ {
|
||||
+ ASSERT(current->arch.old_guest_table_partial);
|
||||
page->partial_flags = PTF_partial_set;
|
||||
+ }
|
||||
else
|
||||
ASSERT_UNREACHABLE();
|
||||
}
|
||||
current->arch.old_guest_ptpg = NULL;
|
||||
current->arch.old_guest_table = page;
|
||||
+ current->arch.old_guest_table_partial = true;
|
||||
}
|
||||
while ( i-- > 0 )
|
||||
pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
|
||||
@@ -1897,12 +1917,16 @@ static int alloc_l4_table(struct page_info *page)
|
||||
* builds.
|
||||
*/
|
||||
                 if ( current->arch.old_guest_table == l4e_get_page(l4e) )
+                {
+                    ASSERT(current->arch.old_guest_table_partial);
                     page->partial_flags = PTF_partial_set;
+                }
                 else
                     ASSERT_UNREACHABLE();
             }
             current->arch.old_guest_ptpg = NULL;
             current->arch.old_guest_table = page;
+            current->arch.old_guest_table_partial = true;
         }
     }
 }
@@ -2831,6 +2855,28 @@ static int _put_page_type(struct page_info *page, unsigned int flags,
         x  = y;
         nx = x - 1;

+        /*
+         * Is this expected to do a full reference drop, or only
+         * cleanup partial validation / devalidation?
+         *
+         * If the former, the caller must hold a "full" type ref;
+         * which means the page must be validated.  If the page is
+         * *not* fully validated, continuing would almost certainly
+         * open up a security hole.  An exception to this is during
+         * domain destruction, where PGT_validated can be dropped
+         * without dropping a type ref.
+         *
+         * If the latter, do nothing unless type PGT_partial is set.
+         * If it is set, the type count must be 1.
+         */
+        if ( !(flags & PTF_partial_set) )
+            BUG_ON((x & PGT_partial) ||
+                   !((x & PGT_validated) || page_get_owner(page)->is_dying));
+        else if ( !(x & PGT_partial) )
+            return 0;
+        else
+            BUG_ON((x & PGT_count_mask) != 1);
+
         ASSERT((x & PGT_count_mask) != 0);

         switch ( nx & (PGT_locked | PGT_count_mask) )
@@ -3092,17 +3138,34 @@ int put_old_guest_table(struct vcpu *v)
     if ( !v->arch.old_guest_table )
         return 0;

-    switch ( rc = _put_page_type(v->arch.old_guest_table, PTF_preemptible,
-                                 v->arch.old_guest_ptpg) )
+    rc = _put_page_type(v->arch.old_guest_table,
+                        PTF_preemptible |
+                        ( v->arch.old_guest_table_partial ?
+                          PTF_partial_set : 0 ),
+                        v->arch.old_guest_ptpg);
+
+    if ( rc == -ERESTART || rc == -EINTR )
     {
-    case -EINTR:
-    case -ERESTART:
+        v->arch.old_guest_table_partial = (rc == -ERESTART);
         return -ERESTART;
-    case 0:
-        put_page(v->arch.old_guest_table);
     }

+    /*
+     * It shouldn't be possible for _put_page_type() to return
+     * anything else at the moment; but if it does happen in
+     * production, leaking the type ref is probably the best thing to
+     * do.  Either way, drop the general ref held by old_guest_table.
+     */
+    ASSERT(rc == 0);
+
+    put_page(v->arch.old_guest_table);
     v->arch.old_guest_table = NULL;
+    v->arch.old_guest_ptpg = NULL;
+    /*
+     * Safest default if someone sets old_guest_table without
+     * explicitly setting old_guest_table_partial.
+     */
+    v->arch.old_guest_table_partial = true;

     return rc;
 }
@@ -3253,11 +3316,11 @@ int new_guest_cr3(mfn_t mfn)
         switch ( rc = put_page_and_type_preemptible(page) )
         {
         case -EINTR:
-            rc = -ERESTART;
-            /* fallthrough */
         case -ERESTART:
             curr->arch.old_guest_ptpg = NULL;
             curr->arch.old_guest_table = page;
+            curr->arch.old_guest_table_partial = (rc == -ERESTART);
+            rc = -ERESTART;
             break;
         default:
             BUG_ON(rc);
@@ -3494,6 +3557,7 @@ long do_mmuext_op(
                 {
                     curr->arch.old_guest_ptpg = NULL;
                     curr->arch.old_guest_table = page;
+                    curr->arch.old_guest_table_partial = false;
                 }
             }
         }
@@ -3528,6 +3592,11 @@ long do_mmuext_op(
             case -ERESTART:
                 curr->arch.old_guest_ptpg = NULL;
                 curr->arch.old_guest_table = page;
+                /*
+                 * EINTR means we still hold the type ref; ERESTART
+                 * means PGT_partial holds the type ref
+                 */
+                curr->arch.old_guest_table_partial = (rc == -ERESTART);
                 rc = 0;
                 break;
             default:
@@ -3596,11 +3665,15 @@ long do_mmuext_op(
             switch ( rc = put_page_and_type_preemptible(page) )
             {
             case -EINTR:
-                rc = -ERESTART;
-                /* fallthrough */
             case -ERESTART:
                 curr->arch.old_guest_ptpg = NULL;
                 curr->arch.old_guest_table = page;
+                /*
+                 * EINTR means we still hold the type ref;
+                 * ERESTART means PGT_partial holds the ref
+                 */
+                curr->arch.old_guest_table_partial = (rc == -ERESTART);
+                rc = -ERESTART;
                 break;
             default:
                 BUG_ON(rc);
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 214e44ce1c..2cfce7b36b 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -307,7 +307,7 @@ struct arch_domain

     struct paging_domain paging;
     struct p2m_domain *p2m;
-    /* To enforce lock ordering in the pod code wrt the 
+    /* To enforce lock ordering in the pod code wrt the
      * page_alloc lock */
     int page_alloc_unlock_level;

@@ -581,6 +581,8 @@ struct arch_vcpu
     struct page_info *old_guest_table; /* partially destructed pagetable */
     struct page_info *old_guest_ptpg;  /* containing page table of the */
                                        /* former, if any */
+    bool old_guest_table_partial;      /* Are we dropping a type ref, or just
+                                        * finishing up a partial de-validation? */
     /* guest_table holds a ref to the page, and also a type-count unless
      * shadow refcounts are in use */
     pagetable_t shadow_table[4];        /* (MFN) shadow(s) of guest */
--
2.23.0

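The new check at the top of _put_page_type() is the heart of the fix, and it is a three-way decision. The sketch below mirrors that decision table in isolation; the enum, helper, and boolean parameters are hypothetical stand-ins for Xen's PTF_/PGT_ bits, written only to make the branch structure easy to test:

    #include <assert.h>
    #include <stdbool.h>

    /* Outcomes of the precondition check in _put_page_type(). */
    enum action { PROCEED, RETURN_0, CRASH };

    /* ptf_partial_set: caller only finishes partial (de)validation.
     * pgt_partial: the page itself carries PGT_partial.
     * validated_or_dying: PGT_validated set, or owner being destroyed. */
    static enum action check(bool ptf_partial_set, bool pgt_partial,
                             bool validated_or_dying, unsigned int type_count)
    {
        if (!ptf_partial_set)
            /* Full ref drop: page must be validated (or the domain dying). */
            return (pgt_partial || !validated_or_dying) ? CRASH : PROCEED;
        if (!pgt_partial)
            return RETURN_0;                 /* nothing left to clean up */
        /* The partial ref must be the only outstanding type ref. */
        return type_count == 1 ? PROCEED : CRASH;
    }

    int main(void)
    {
        assert(check(false, false, true, 2) == PROCEED);
        assert(check(true, false, true, 1) == RETURN_0);
        assert(check(true, true, true, 1) == PROCEED);
        return 0;
    }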
@@ -1,80 +0,0 @@
From 19d6330f142cb941b6340a88592e8a294de0ff8c Mon Sep 17 00:00:00 2001
From: Julien Grall <julien.grall@arm.com>
Date: Tue, 15 Oct 2019 17:10:40 +0100
Subject: [PATCH 1/3] xen/arm: p2m: Avoid aliasing guest physical frame

The P2M helpers implementation is quite lax and will end up ignoring
the unused top bits of a guest physical frame.

This effectively means that p2m_set_entry() will create a mapping for a
different frame (it is always equal to gfn & (mask unused bits)). Yet
p2m->max_mapped_gfn will be updated using the original frame.

At the moment, p2m_get_entry() and p2m_resolve_translation_fault()
assume that p2m_get_root_pointer() will always return a non-NULL pointer
when the GFN is smaller than p2m->max_mapped_gfn.

Unfortunately, because of the aliasing described above, it would be
possible to set p2m->max_mapped_gfn high enough so it covers frames that
would lead p2m_get_root_pointer() to return NULL.

As we don't sanity check the guest physical frame provided by a guest, a
malicious guest could craft a series of hypercalls that will hit the
BUG_ON() and therefore DoS Xen.

To prevent aliasing, the function p2m_get_root_pointer() is now reworked
to return NULL if any of the unused top bits are not zero. The caller
can then decide what the appropriate action is. Since the two paths
(i.e. P2M_ROOT_PAGES == 1 and P2M_ROOT_PAGES != 1) are now very
similar, take the opportunity to consolidate them, making the code a
bit simpler.

With this change, p2m_get_entry() will not try to insert a mapping as
the root pointer is invalid.

Note that root_table is now switched to unsigned long, as unsigned int
is not enough to hold part of a GFN.

This is part of XSA-301.

Reported-by: Julien Grall <Julien.Grall@arm.com>
Signed-off-by: Julien Grall <julien.grall@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
 xen/arch/arm/p2m.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index a2749d9b6f..d0045a8b28 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -229,21 +229,14 @@ void p2m_tlb_flush_sync(struct p2m_domain *p2m)
 static lpae_t *p2m_get_root_pointer(struct p2m_domain *p2m,
                                     gfn_t gfn)
 {
-    unsigned int root_table;
-
-    if ( P2M_ROOT_PAGES == 1 )
-        return __map_domain_page(p2m->root);
+    unsigned long root_table;

     /*
-     * Concatenated root-level tables. The table number will be the
-     * offset at the previous level. It is not possible to
-     * concatenate a level-0 root.
+     * While the root table index is the offset from the previous level,
+     * we can't use (P2M_ROOT_LEVEL - 1) because the root level might be
+     * 0. Yet we still want to check if all the unused bits are zeroed.
      */
-    ASSERT(P2M_ROOT_LEVEL > 0);
-
-    root_table = gfn_x(gfn) >> (level_orders[P2M_ROOT_LEVEL - 1]);
-    root_table &= LPAE_ENTRY_MASK;
-
+    root_table = gfn_x(gfn) >> (level_orders[P2M_ROOT_LEVEL] + LPAE_SHIFT);
     if ( root_table >= P2M_ROOT_PAGES )
         return NULL;

--
2.23.0

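The guard added above works because the root-table index is now computed from every remaining GFN bit: any set bit above the addressable range inflates the index past P2M_ROOT_PAGES, so the lookup fails instead of silently aliasing a lower frame. A minimal sketch of that arithmetic follows; the constants are illustrative only, not Xen's real P2M configuration:

    #include <stdint.h>
    #include <stdio.h>

    #define LPAE_SHIFT       9
    #define ROOT_LEVEL_ORDER 18          /* GFN bits covered below the root */
    #define P2M_ROOT_PAGES   2           /* concatenated root pages */

    /* Returns the root page index, or -1 when unused top bits are set. */
    static int root_index(uint64_t gfn)
    {
        uint64_t idx = gfn >> (ROOT_LEVEL_ORDER + LPAE_SHIFT);

        return idx >= P2M_ROOT_PAGES ? -1 : (int)idx;
    }

    int main(void)
    {
        printf("%d\n", root_index(0xffffffULL)); /* in range -> 0 */
        printf("%d\n", root_index(1ULL << 40));  /* aliased before the fix -> -1 */
        return 0;
    }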
@@ -1,92 +0,0 @@
From 3b896936f7505e929dd869d14afcb185d0ee75f8 Mon Sep 17 00:00:00 2001
From: Julien Grall <julien.grall@arm.com>
Date: Tue, 15 Oct 2019 17:10:41 +0100
Subject: [PATCH 2/3] xen/arm: p2m: Avoid off-by-one check on
 p2m->max_mapped_gfn

The code base is using the field p2m->max_mapped_gfn inconsistently.
Some of the users expect that p2m->max_guest_gfn contains the highest
mapped GFN, while others expect highest + 1.

p2m->max_guest_gfn is set as highest + 1; because of that, the sanity
check on the GFN in p2m_resolved_translation_fault() and
p2m_get_entry() can be bypassed when GFN == p2m->max_guest_gfn.

p2m_get_root_pointer(p2m->max_guest_gfn) may return NULL if it is
outside of the supported address range, and therefore the BUG_ON()
could be hit.

The current value held in p2m->max_mapped_gfn is inconsistent with the
expectation of the common code (see domain_get_maximum_gpfn()) and also
the documentation of the field.

Rather than changing the check in p2m_translation_fault() and
p2m_get_entry(), p2m->max_mapped_gfn now contains the highest
mapped GFN, and the callers assuming "highest + 1" are adjusted.

Take the opportunity to use 1UL rather than 1, as page_order could
theoretically be big enough to overflow a 32-bit integer.

Lastly, the documentation of the field max_guest_gfn is updated to
reflect how it is computed.

This is part of XSA-301.

Reported-by: Julien Grall <Julien.Grall@arm.com>
Signed-off-by: Julien Grall <julien.grall@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
 xen/arch/arm/p2m.c        | 6 +++---
 xen/include/asm-arm/p2m.h | 5 +----
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index d0045a8b28..8d20d27961 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -1041,7 +1041,7 @@ static int __p2m_set_entry(struct p2m_domain *p2m,
         p2m_write_pte(entry, pte, p2m->clean_pte);

     p2m->max_mapped_gfn = gfn_max(p2m->max_mapped_gfn,
-                                  gfn_add(sgfn, 1 << page_order));
+                                  gfn_add(sgfn, (1UL << page_order) - 1));
     p2m->lowest_mapped_gfn = gfn_min(p2m->lowest_mapped_gfn, sgfn);
 }

@@ -1572,7 +1572,7 @@ int relinquish_p2m_mapping(struct domain *d)
     p2m_write_lock(p2m);

     start = p2m->lowest_mapped_gfn;
-    end = p2m->max_mapped_gfn;
+    end = gfn_add(p2m->max_mapped_gfn, 1);

     for ( ; gfn_x(start) < gfn_x(end);
           start = gfn_next_boundary(start, order) )
@@ -1641,7 +1641,7 @@ int p2m_cache_flush_range(struct domain *d, gfn_t *pstart, gfn_t end)
     p2m_read_lock(p2m);

     start = gfn_max(start, p2m->lowest_mapped_gfn);
-    end = gfn_min(end, p2m->max_mapped_gfn);
+    end = gfn_min(end, gfn_add(p2m->max_mapped_gfn, 1));

     next_block_gfn = start;

diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
index 89f82df380..5fdb6e8183 100644
--- a/xen/include/asm-arm/p2m.h
+++ b/xen/include/asm-arm/p2m.h
@@ -36,10 +36,7 @@ struct p2m_domain {
     /* Current Translation Table Base Register for the p2m */
     uint64_t vttbr;

-    /*
-     * Highest guest frame that's ever been mapped in the p2m
-     * Only takes into account ram and foreign mapping
-     */
+    /* Highest guest frame that's ever been mapped in the p2m */
    gfn_t max_mapped_gfn;

     /*
--
2.23.0

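The switch to an inclusive max_mapped_gfn removes the off-by-one, and the 1UL guards the shift itself. Below is a small sketch of the patched update, with a hypothetical helper in place of Xen's gfn_* wrappers:

    #include <assert.h>
    #include <stdint.h>

    /* Record the highest mapped GFN inclusively: the last frame of a
     * 2^page_order block starting at sgfn is sgfn + 2^page_order - 1. */
    static uint64_t update_max(uint64_t cur_max, uint64_t sgfn,
                               unsigned int page_order)
    {
        uint64_t last = sgfn + (1UL << page_order) - 1;

        return last > cur_max ? last : cur_max;
    }

    int main(void)
    {
        /* A 512-frame mapping at GFN 0 ends at GFN 511, not 512. */
        assert(update_max(0, 0, 9) == 511);
        return 0;
    }

Consumers that want the old exclusive bound now add 1 back explicitly, as the relinquish and cache-flush hunks in the patch do.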
@@ -1,67 +0,0 @@
From 060c2dd3b7c2674a019d94afb2b4ebf3663f6c6e Mon Sep 17 00:00:00 2001
From: Julien Grall <julien.grall@arm.com>
Date: Tue, 15 Oct 2019 17:10:42 +0100
Subject: [PATCH 3/3] xen/arm: p2m: Don't check the return of
 p2m_get_root_pointer() with BUG_ON()

It turns out that the BUG_ON() was actually reachable with well-crafted
hypercalls. The BUG_ON() is there to catch logical errors, so crashing
Xen is a bit over the top.

While all the holes should now be fixed, it would be better to downgrade
the BUG_ON() to something less fatal to prevent any more DoS.

The BUG_ON() in p2m_get_entry() is now replaced by ASSERT_UNREACHABLE()
to catch mistakes in debug builds and return INVALID_MFN for production
builds. The interface also requires page_order to be set to give an idea
of the size of the "hole". So 'level' is now set such that we report a
hole the size of an entry of the root page-table. This stays in line
with what happens when the GFN is higher than p2m->max_mapped_gfn.

The BUG_ON() in p2m_resolve_translation_fault() is now replaced by
ASSERT_UNREACHABLE() to catch mistakes in debug builds and just report a
fault for production builds.

This is part of XSA-301.

Reported-by: Julien Grall <Julien.Grall@arm.com>
Signed-off-by: Julien Grall <julien.grall@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
 xen/arch/arm/p2m.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index 8d20d27961..ce59f2b503 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -395,7 +395,12 @@ mfn_t p2m_get_entry(struct p2m_domain *p2m, gfn_t gfn,
      * the table should always be non-NULL because the gfn is below
      * p2m->max_mapped_gfn and the root table pages are always present.
      */
-    BUG_ON(table == NULL);
+    if ( !table )
+    {
+        ASSERT_UNREACHABLE();
+        level = P2M_ROOT_LEVEL;
+        goto out;
+    }

     for ( level = P2M_ROOT_LEVEL; level < 3; level++ )
     {
@@ -1196,7 +1201,11 @@ bool p2m_resolve_translation_fault(struct domain *d, gfn_t gfn)
      * The table should always be non-NULL because the gfn is below
      * p2m->max_mapped_gfn and the root table pages are always present.
      */
-    BUG_ON(table == NULL);
+    if ( !table )
+    {
+        ASSERT_UNREACHABLE();
+        goto out;
+    }

     /*
      * Go down the page-tables until an entry has the valid bit unset or
--
2.23.0

@@ -1,37 +0,0 @@
From 0c9c0fbb356e3210cb77b3d738be50981b26058a Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@suse.com>
Date: Wed, 2 Oct 2019 13:36:59 +0200
Subject: [PATCH 1/2] IOMMU: add missing HVM check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix an unguarded d->arch.hvm access in assign_device().

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

(cherry picked from commit 41fd1009cd7416b73d745a77c24b4e8d1a296fe6)
Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>
---
 xen/drivers/passthrough/pci.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 8108ed5f9a..d7420bd8bf 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -1452,7 +1452,8 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)

     /* Prevent device assign if mem paging or mem sharing have been
      * enabled for this domain */
-    if ( unlikely(d->arch.hvm.mem_sharing_enabled ||
+    if ( unlikely((is_hvm_domain(d) &&
+                   d->arch.hvm.mem_sharing_enabled) ||
                   vm_event_check_ring(d->vm_event_paging) ||
                   p2m_get_hostp2m(d)->global_logdirty) )
         return -EXDEV;
--
2.11.0

@@ -1,499 +0,0 @@
From 278d8e585a9f110a1af0bd92a9fc43733c9c7227 Mon Sep 17 00:00:00 2001
From: Paul Durrant <paul.durrant@citrix.com>
Date: Mon, 14 Oct 2019 17:52:59 +0100
Subject: [PATCH 2/2] passthrough: quarantine PCI devices

When a PCI device is assigned to an untrusted domain, it is possible for
that domain to program the device to DMA to an arbitrary address. The
IOMMU is used to protect the host from malicious DMA by making sure that
the device addresses can only target memory assigned to the guest. However,
when the guest domain is torn down the device is assigned back to dom0,
thus allowing any in-flight DMA to potentially target critical host data.

This patch introduces a 'quarantine' for PCI devices using dom_io. When
the toolstack makes a device assignable (by binding it to pciback), it
will now also assign it to DOMID_IO and the device will only be assigned
back to dom0 when the device is made unassignable again. Whilst the device
is assignable it will only ever transfer between dom_io and guest domains.
dom_io is actually only used as a sentinel domain for quarantining purposes;
it is not configured with any IOMMU mappings. Assignment to dom_io simply
means that the device's initiator (requestor) identifier is not present in
the IOMMU's device table and thus any DMA transactions issued will be
terminated with a fault condition.

In addition, a fix to assignment handling is made for VT-d. Failure
during the assignment step should not lead to a device still being
associated with its prior owner. Hand the device to DomIO temporarily,
until the assignment step has completed successfully. Remove the PI
hooks from the source domain then earlier as well.

Failure of the recovery reassign_device_ownership() may not go silent:
there may, for example, still be left-over RMRR mappings in the domain
whose assignment has failed, and hence we can't allow that domain to
continue executing.

NOTE: This patch also includes one printk() cleanup; the
"XEN_DOMCTL_assign_device: " tag is dropped in iommu_do_pci_domctl(),
since similar printk()-s elsewhere also don't log such a tag.

This is XSA-302.

Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>
(cherry picked from commit ec99857f59f7f06236f11ca8b0b2303e5e745cc4)
---
 tools/libxl/libxl_pci.c                     | 25 +++++++++++-
 xen/arch/x86/mm.c                           |  2 +
 xen/common/domctl.c                         | 14 ++++++-
 xen/drivers/passthrough/amd/pci_amd_iommu.c | 10 ++++-
 xen/drivers/passthrough/iommu.c             |  9 +++++
 xen/drivers/passthrough/pci.c               | 59 ++++++++++++++++++++-------
 xen/drivers/passthrough/vtd/iommu.c         | 40 ++++++++++++++++---
 xen/include/xen/pci.h                       |  3 ++
 8 files changed, 138 insertions(+), 24 deletions(-)

diff --git a/tools/libxl/libxl_pci.c b/tools/libxl/libxl_pci.c
index 88c324ea23..d6a23fb5f8 100644
--- a/tools/libxl/libxl_pci.c
+++ b/tools/libxl/libxl_pci.c
@@ -754,6 +754,7 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc,
                                             libxl_device_pci *pcidev,
                                             int rebind)
 {
+    libxl_ctx *ctx = libxl__gc_owner(gc);
     unsigned dom, bus, dev, func;
     char *spath, *driver_path = NULL;
     int rc;
@@ -779,7 +780,7 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc,
     }
     if ( rc ) {
         LOG(WARN, PCI_BDF" already assigned to pciback", dom, bus, dev, func);
-        return 0;
+        goto quarantine;
     }

     /* Check to see if there's already a driver that we need to unbind from */
@@ -810,6 +811,19 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc,
         return ERROR_FAIL;
     }

+quarantine:
+    /*
+     * DOMID_IO is just a sentinel domain, without any actual mappings,
+     * so always pass XEN_DOMCTL_DEV_RDM_RELAXED to avoid assignment being
+     * unnecessarily denied.
+     */
+    rc = xc_assign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev),
+                          XEN_DOMCTL_DEV_RDM_RELAXED);
+    if ( rc < 0 ) {
+        LOG(ERROR, "failed to quarantine "PCI_BDF, dom, bus, dev, func);
+        return ERROR_FAIL;
+    }
+
     return 0;
 }

@@ -817,9 +831,18 @@ static int libxl__device_pci_assignable_remove(libxl__gc *gc,
                                                libxl_device_pci *pcidev,
                                                int rebind)
 {
+    libxl_ctx *ctx = libxl__gc_owner(gc);
     int rc;
     char *driver_path;

+    /* De-quarantine */
+    rc = xc_deassign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev));
+    if ( rc < 0 ) {
+        LOG(ERROR, "failed to de-quarantine "PCI_BDF, pcidev->domain, pcidev->bus,
+            pcidev->dev, pcidev->func);
+        return ERROR_FAIL;
+    }
+
     /* Unbind from pciback */
     if ( (rc=pciback_dev_is_assigned(gc, pcidev)) < 0 ) {
         return ERROR_FAIL;
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 3557cd1178..11d753d8d2 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -295,9 +295,11 @@ void __init arch_init_memory(void)
      * Initialise our DOMID_IO domain.
      * This domain owns I/O pages that are within the range of the page_info
      * array. Mappings occur at the priv of the caller.
+     * Quarantined PCI devices will be associated with this domain.
      */
     dom_io = domain_create(DOMID_IO, NULL, false);
     BUG_ON(IS_ERR(dom_io));
+    INIT_LIST_HEAD(&dom_io->arch.pdev_list);

     /*
      * Initialise our COW domain.
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
index d08b6274e2..e3c4be2b48 100644
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -391,6 +391,16 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)

     switch ( op->cmd )
     {
+    case XEN_DOMCTL_assign_device:
+    case XEN_DOMCTL_deassign_device:
+        if ( op->domain == DOMID_IO )
+        {
+            d = dom_io;
+            break;
+        }
+        else if ( op->domain == DOMID_INVALID )
+            return -ESRCH;
+        /* fall through */
     case XEN_DOMCTL_test_assign_device:
         if ( op->domain == DOMID_INVALID )
         {
@@ -412,7 +422,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)

     if ( !domctl_lock_acquire() )
     {
-        if ( d )
+        if ( d && d != dom_io )
             rcu_unlock_domain(d);
         return hypercall_create_continuation(
             __HYPERVISOR_domctl, "h", u_domctl);
@@ -1074,7 +1084,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
     domctl_lock_release();

 domctl_out_unlock_domonly:
-    if ( d )
+    if ( d && d != dom_io )
         rcu_unlock_domain(d);

     if ( copyback && __copy_to_guest(u_domctl, op, 1) )
diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c
index 33a3798f36..15c13e1163 100644
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -120,6 +120,10 @@ static void amd_iommu_setup_domain_device(
     u8 bus = pdev->bus;
     const struct domain_iommu *hd = dom_iommu(domain);

+    /* dom_io is used as a sentinel for quarantined devices */
+    if ( domain == dom_io )
+        return;
+
     BUG_ON( !hd->arch.root_table || !hd->arch.paging_mode ||
             !iommu->dev_table.buffer );

@@ -277,6 +281,10 @@ void amd_iommu_disable_domain_device(struct domain *domain,
     int req_id;
     u8 bus = pdev->bus;

+    /* dom_io is used as a sentinel for quarantined devices */
+    if ( domain == dom_io )
+        return;
+
     BUG_ON ( iommu->dev_table.buffer == NULL );
     req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn));
     dte = iommu->dev_table.buffer + (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
@@ -363,7 +371,7 @@ static int amd_iommu_assign_device(struct domain *d, u8 devfn,
                                          ivrs_mappings[req_id].read_permission);
     }

-    return reassign_device(hardware_domain, d, devfn, pdev);
+    return reassign_device(pdev->domain, d, devfn, pdev);
 }

 static void deallocate_next_page_table(struct page_info *pg, int level)
diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c
index a6697d58fb..2762e1342f 100644
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
@@ -232,6 +232,9 @@ void iommu_teardown(struct domain *d)
 {
     struct domain_iommu *hd = dom_iommu(d);

+    if ( d == dom_io )
+        return;
+
     hd->status = IOMMU_STATUS_disabled;
     hd->platform_ops->teardown(d);
     tasklet_schedule(&iommu_pt_cleanup_tasklet);
@@ -241,6 +244,9 @@ int iommu_construct(struct domain *d)
 {
     struct domain_iommu *hd = dom_iommu(d);

+    if ( d == dom_io )
+        return 0;
+
     if ( hd->status == IOMMU_STATUS_initialized )
         return 0;

@@ -521,6 +527,9 @@ int __init iommu_setup(void)
     printk("I/O virtualisation %sabled\n", iommu_enabled ? "en" : "dis");
     if ( iommu_enabled )
     {
+        if ( iommu_domain_init(dom_io) )
+            panic("Could not set up quarantine\n");
+
         printk(" - Dom0 mode: %s\n",
                iommu_hwdom_passthrough ? "Passthrough" :
                iommu_hwdom_strict ? "Strict" : "Relaxed");
diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index d7420bd8bf..d66a8a1daf 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -1426,19 +1426,29 @@ static int iommu_remove_device(struct pci_dev *pdev)
     return hd->platform_ops->remove_device(pdev->devfn, pci_to_dev(pdev));
 }

-/*
- * If the device isn't owned by the hardware domain, it means it already
- * has been assigned to other domain, or it doesn't exist.
- */
 static int device_assigned(u16 seg, u8 bus, u8 devfn)
 {
     struct pci_dev *pdev;
+    int rc = 0;

     pcidevs_lock();
-    pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn);
+
+    pdev = pci_get_pdev(seg, bus, devfn);
+
+    if ( !pdev )
+        rc = -ENODEV;
+    /*
+     * If the device exists and it is not owned by either the hardware
+     * domain or dom_io then it must be assigned to a guest, or be
+     * hidden (owned by dom_xen).
+     */
+    else if ( pdev->domain != hardware_domain &&
+              pdev->domain != dom_io )
+        rc = -EBUSY;
+
     pcidevs_unlock();

-    return pdev ? 0 : -EBUSY;
+    return rc;
 }

 static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
@@ -1452,7 +1462,8 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)

     /* Prevent device assign if mem paging or mem sharing have been
      * enabled for this domain */
-    if ( unlikely((is_hvm_domain(d) &&
+    if ( d != dom_io &&
+         unlikely((is_hvm_domain(d) &&
                    d->arch.hvm.mem_sharing_enabled) ||
                   vm_event_check_ring(d->vm_event_paging) ||
                   p2m_get_hostp2m(d)->global_logdirty) )
@@ -1468,12 +1479,20 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
         return rc;
     }

-    pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn);
+    pdev = pci_get_pdev(seg, bus, devfn);
+
+    rc = -ENODEV;
     if ( !pdev )
-    {
-        rc = pci_get_pdev(seg, bus, devfn) ? -EBUSY : -ENODEV;
         goto done;
-    }
+
+    rc = 0;
+    if ( d == pdev->domain )
+        goto done;
+
+    rc = -EBUSY;
+    if ( pdev->domain != hardware_domain &&
+         pdev->domain != dom_io )
+        goto done;

     if ( pdev->msix )
         msixtbl_init(d);
@@ -1496,6 +1515,10 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
     }

  done:
+    /* The device is assigned to dom_io so mark it as quarantined */
+    if ( !rc && d == dom_io )
+        pdev->quarantine = true;
+
     if ( !has_arch_pdevs(d) && has_iommu_pt(d) )
         iommu_teardown(d);
     pcidevs_unlock();
@@ -1508,6 +1531,7 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
 {
     const struct domain_iommu *hd = dom_iommu(d);
     struct pci_dev *pdev = NULL;
+    struct domain *target;
     int ret = 0;

     if ( !iommu_enabled || !hd->platform_ops )
@@ -1518,12 +1542,16 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
     if ( !pdev )
         return -ENODEV;

+    /* De-assignment from dom_io should de-quarantine the device */
+    target = (pdev->quarantine && pdev->domain != dom_io) ?
+        dom_io : hardware_domain;
+
     while ( pdev->phantom_stride )
     {
         devfn += pdev->phantom_stride;
         if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) )
             break;
-        ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn,
+        ret = hd->platform_ops->reassign_device(d, target, devfn,
                                                 pci_to_dev(pdev));
         if ( !ret )
             continue;
@@ -1534,7 +1562,7 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
     }

     devfn = pdev->devfn;
-    ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn,
+    ret = hd->platform_ops->reassign_device(d, target, devfn,
                                             pci_to_dev(pdev));
     if ( ret )
     {
@@ -1544,6 +1572,9 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
         return ret;
     }

+    if ( pdev->domain == hardware_domain )
+        pdev->quarantine = false;
+
     pdev->fault.count = 0;

     if ( !has_arch_pdevs(d) && has_iommu_pt(d) )
@@ -1722,7 +1753,7 @@ int iommu_do_pci_domctl(
             ret = hypercall_create_continuation(__HYPERVISOR_domctl,
                                                 "h", u_domctl);
         else if ( ret )
-            printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: "
+            printk(XENLOG_G_ERR
                    "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n",
                    seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
                    d->domain_id, ret);
diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
index 1db1cd9f2d..a8d1baa064 100644
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -1338,6 +1338,10 @@ int domain_context_mapping_one(
     int agaw, rc, ret;
     bool_t flush_dev_iotlb;

+    /* dom_io is used as a sentinel for quarantined devices */
+    if ( domain == dom_io )
+        return 0;
+
     ASSERT(pcidevs_locked());
     spin_lock(&iommu->lock);
     maddr = bus_to_context_maddr(iommu, bus);
@@ -1573,6 +1577,10 @@ int domain_context_unmap_one(
     int iommu_domid, rc, ret;
     bool_t flush_dev_iotlb;

+    /* dom_io is used as a sentinel for quarantined devices */
+    if ( domain == dom_io )
+        return 0;
+
     ASSERT(pcidevs_locked());
     spin_lock(&iommu->lock);

@@ -1705,6 +1713,10 @@ static int domain_context_unmap(struct domain *domain, u8 devfn,
         goto out;
     }

+    /* dom_io is used as a sentinel for quarantined devices */
+    if ( domain == dom_io )
+        goto out;
+
     /*
      * if no other devices under the same iommu owned by this domain,
      * clear iommu in iommu_bitmap and clear domain_id in domid_bitmp
@@ -2441,6 +2453,15 @@ static int reassign_device_ownership(
         if ( ret )
             return ret;

+        if ( devfn == pdev->devfn )
+        {
+            list_move(&pdev->domain_list, &dom_io->arch.pdev_list);
+            pdev->domain = dom_io;
+        }
+
+        if ( !has_arch_pdevs(source) )
+            vmx_pi_hooks_deassign(source);
+
         if ( !has_arch_pdevs(target) )
             vmx_pi_hooks_assign(target);

@@ -2459,15 +2480,13 @@ static int reassign_device_ownership(
         pdev->domain = target;
     }

-    if ( !has_arch_pdevs(source) )
-        vmx_pi_hooks_deassign(source);
-
     return ret;
 }

 static int intel_iommu_assign_device(
     struct domain *d, u8 devfn, struct pci_dev *pdev, u32 flag)
 {
+    struct domain *s = pdev->domain;
     struct acpi_rmrr_unit *rmrr;
     int ret = 0, i;
     u16 bdf, seg;
@@ -2510,8 +2529,8 @@ static int intel_iommu_assign_device(
         }
     }

-    ret = reassign_device_ownership(hardware_domain, d, devfn, pdev);
-    if ( ret )
+    ret = reassign_device_ownership(s, d, devfn, pdev);
+    if ( ret || d == dom_io )
         return ret;

     /* Setup rmrr identity mapping */
@@ -2524,11 +2543,20 @@ static int intel_iommu_assign_device(
             ret = rmrr_identity_mapping(d, 1, rmrr, flag);
             if ( ret )
             {
-                reassign_device_ownership(d, hardware_domain, devfn, pdev);
+                int rc;
+
+                rc = reassign_device_ownership(d, s, devfn, pdev);
                 printk(XENLOG_G_ERR VTDPREFIX
                        " cannot map reserved region (%"PRIx64",%"PRIx64"] for Dom%d (%d)\n",
                        rmrr->base_address, rmrr->end_address,
                        d->domain_id, ret);
+                if ( rc )
+                {
+                    printk(XENLOG_ERR VTDPREFIX
+                           " failed to reclaim %04x:%02x:%02x.%u from %pd (%d)\n",
+                           seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), d, rc);
+                    domain_crash(d);
+                }
                 break;
             }
         }
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index 8b21e8dc84..a031fd6020 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -88,6 +88,9 @@ struct pci_dev {

     nodeid_t node; /* NUMA node */

+    /* Device to be quarantined, don't automatically re-assign to dom0 */
+    bool quarantine;
+
     /* Device with errata, ignore the BARs. */
     bool ignore_bars;

--
2.11.0

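The deassign path above decides where a device goes next based on the quarantine flag. The rule is small enough to isolate; the enum and helper below are hypothetical stand-ins for Xen's domain pointers, showing only the selection logic from deassign_device():

    #include <stdbool.h>
    #include <stdio.h>

    typedef enum { DOM0, DOM_IO, DOM_GUEST } domid;

    /* A quarantined device never returns straight to dom0: it parks in
     * dom_io first, and only a second deassign (from dom_io) releases it. */
    static domid deassign_target(bool quarantined, domid owner)
    {
        return (quarantined && owner != DOM_IO) ? DOM_IO : DOM0;
    }

    int main(void)
    {
        printf("%d\n", deassign_target(true, DOM_GUEST));  /* 1: park in dom_io */
        printf("%d\n", deassign_target(true, DOM_IO));     /* 0: de-quarantine */
        printf("%d\n", deassign_target(false, DOM_GUEST)); /* 0: legacy path */
        return 0;
    }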
@@ -1,74 +0,0 @@
From c8cb33fa64c9ccbfa2a494a9dad2e0a763c09176 Mon Sep 17 00:00:00 2001
From: Julien Grall <julien.grall@arm.com>
Date: Tue, 1 Oct 2019 13:07:53 +0100
Subject: [PATCH 1/4] xen/arm32: entry: Split __DEFINE_ENTRY_TRAP in two

The preprocessing macro __DEFINE_ENTRY_TRAP is used to generate trap
entry functions. While the macro is fairly small today, follow-up patches
will increase its size significantly.

In general, assembly macros are more readable as they allow you to name
parameters and avoid '\'. So the actual implementation of the trap is
now switched to an assembly macro.

This is part of XSA-303.

Reported-by: Julien Grall <Julien.Grall@arm.com>
Signed-off-by: Julien Grall <julien.grall@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Reviewed-by: Andre Przywara <andre.przywara@arm.com>
---
 xen/arch/arm/arm32/entry.S | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/xen/arch/arm/arm32/entry.S b/xen/arch/arm/arm32/entry.S
index 0b4cd19abd..4a762e04f1 100644
--- a/xen/arch/arm/arm32/entry.S
+++ b/xen/arch/arm/arm32/entry.S
@@ -126,24 +126,28 @@ abort_guest_exit_end:
 skip_check:
     mov pc, lr

-/*
- * Macro to define trap entry. The iflags corresponds to the list of
- * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask.
- */
+        /*
+         * Macro to define trap entry. The iflags corresponds to the list of
+         * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask.
+         */
+        .macro vector trap, iflags
+        SAVE_ALL
+        cpsie   \iflags
+        adr     lr, return_from_trap
+        mov     r0, sp
+        /*
+         * Save the stack pointer in r11. It will be restored after the
+         * trap has been handled (see return_from_trap).
+         */
+        mov     r11, sp
+        bic     sp, #7 /* Align the stack pointer (noop on guest trap) */
+        b       do_trap_\trap
+        .endm
+
 #define __DEFINE_TRAP_ENTRY(trap, iflags) \
         ALIGN; \
 trap_##trap: \
-        SAVE_ALL; \
-        cpsie iflags; \
-        adr lr, return_from_trap; \
-        mov r0, sp; \
-        /* \
-         * Save the stack pointer in r11. It will be restored after the \
-         * trap has been handled (see return_from_trap). \
-         */ \
-        mov r11, sp; \
-        bic sp, #7; /* Align the stack pointer (noop on guest trap) */ \
-        b do_trap_##trap
+        vector trap, iflags

 /* Trap handler which unmask IRQ/Abort, keep FIQ masked */
 #define DEFINE_TRAP_ENTRY(trap) __DEFINE_TRAP_ENTRY(trap, ai)
--
2.11.0

@@ -1,97 +0,0 @@
From be7379207c83fa74f8a6c22a8ea213f02714776f Mon Sep 17 00:00:00 2001
From: Julien Grall <julien.grall@arm.com>
Date: Tue, 1 Oct 2019 13:15:48 +0100
Subject: [PATCH 2/4] xen/arm32: entry: Fold the macro SAVE_ALL in the macro
 vector

Follow-up rework will require the macro vector to distinguish between
a trap from the guest and one taken while in the hypervisor.

The macro SAVE_ALL already has code to distinguish between the two and
it is only called by the vector macro. So fold the former into the
latter. This will help to avoid duplicating the check.

This is part of XSA-303.

Reported-by: Julien Grall <Julien.Grall@arm.com>
Signed-off-by: Julien Grall <julien.grall@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Reviewed-by: Andre Przywara <andre.przywara@arm.com>
---
 xen/arch/arm/arm32/entry.S | 46 +++++++++++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/xen/arch/arm/arm32/entry.S b/xen/arch/arm/arm32/entry.S
index 4a762e04f1..150cbc0b4b 100644
--- a/xen/arch/arm/arm32/entry.S
+++ b/xen/arch/arm/arm32/entry.S
@@ -13,27 +13,6 @@
 #define RESTORE_BANKED(mode) \
     RESTORE_ONE_BANKED(SP_##mode) ; RESTORE_ONE_BANKED(LR_##mode) ; RESTORE_ONE_BANKED(SPSR_##mode)

-#define SAVE_ALL \
-        sub sp, #(UREGS_SP_usr - UREGS_sp); /* SP, LR, SPSR, PC */ \
-        push {r0-r12}; /* Save R0-R12 */ \
- \
-        mrs r11, ELR_hyp; /* ELR_hyp is return address. */ \
-        str r11, [sp, #UREGS_pc]; \
- \
-        str lr, [sp, #UREGS_lr]; \
- \
-        add r11, sp, #UREGS_kernel_sizeof+4; \
-        str r11, [sp, #UREGS_sp]; \
- \
-        mrc CP32(r11, HSR); /* Save exception syndrome */ \
-        str r11, [sp, #UREGS_hsr]; \
- \
-        mrs r11, SPSR_hyp; \
-        str r11, [sp, #UREGS_cpsr]; \
-        and r11, #PSR_MODE_MASK; \
-        cmp r11, #PSR_MODE_HYP; \
-        blne save_guest_regs
-
 save_guest_regs:
 #ifdef CONFIG_ARM32_HARDEN_BRANCH_PREDICTOR
     /*
@@ -52,7 +31,7 @@ save_guest_regs:
     ldr r11, =0xffffffff  /* Clobber SP which is only valid for hypervisor frames. */
     str r11, [sp, #UREGS_sp]
     SAVE_ONE_BANKED(SP_usr)
-    /* LR_usr is the same physical register as lr and is saved in SAVE_ALL */
+    /* LR_usr is the same physical register as lr and is saved by the caller */
     SAVE_BANKED(svc)
     SAVE_BANKED(abt)
     SAVE_BANKED(und)
@@ -131,7 +110,28 @@ skip_check:
      * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask.
      */
         .macro vector trap, iflags
-        SAVE_ALL
+        /* Save registers in the stack */
+        sub     sp, #(UREGS_SP_usr - UREGS_sp)  /* SP, LR, SPSR, PC */
+        push    {r0-r12}                        /* Save R0-R12 */
+        mrs     r11, ELR_hyp                    /* ELR_hyp is return address */
+        str     r11, [sp, #UREGS_pc]
+
+        str     lr, [sp, #UREGS_lr]
+
+        add     r11, sp, #(UREGS_kernel_sizeof + 4)
+
+        str     r11, [sp, #UREGS_sp]
+
+        mrc     CP32(r11, HSR)                  /* Save exception syndrome */
+        str     r11, [sp, #UREGS_hsr]
+
+        mrs     r11, SPSR_hyp
+        str     r11, [sp, #UREGS_cpsr]
+        and     r11, #PSR_MODE_MASK
+        cmp     r11, #PSR_MODE_HYP
+        blne    save_guest_regs
+
+        /* We are ready to handle the trap, setup the registers and jump. */
         cpsie   \iflags
         adr     lr, return_from_trap
         mov     r0, sp
--
2.11.0

@@ -1,226 +0,0 @@
From 098fe877967870ffda2dfd9629a5fd272f6aacdc Mon Sep 17 00:00:00 2001
From: Julien Grall <julien.grall@arm.com>
Date: Fri, 11 Oct 2019 17:49:28 +0100
Subject: [PATCH 3/4] xen/arm32: Don't blindly unmask interrupts on trap
 without a change of level

Exception vectors will unmask interrupts regardless of their state in
the interrupted context.

One of the consequences is IRQ will be unmasked when receiving an
undefined instruction exception (used by WARN*) from the hypervisor.
This could result in unexpected behavior such as a deadlock (if a lock
was shared with interrupts).

In a nutshell, interrupts should only be unmasked when it is safe to do
so. Xen only unmasks IRQ and Abort interrupts, so the logic can stay
simple.

As exception vectors may be shared between guest and hypervisor, we now
need to have a different policy for the interrupts.

On exception from the hypervisor, each vector will select the list of
interrupts to inherit from the interrupted context. Any interrupts not
listed will be kept masked.

On exception from the guest, the Abort and IRQ will be unmasked
depending on the exact vector.

The interrupts will be kept unmasked when the vector cannot be used by
either guest or hypervisor.

Note that each vector is no longer preceded by ALIGN. This is fine
because the alignment is already bigger than what we need.

This is part of XSA-303.

Reported-by: Julien Grall <Julien.Grall@arm.com>
Signed-off-by: Julien Grall <julien.grall@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Reviewed-by: Andre Przywara <andre.przywara@arm.com>
---
 xen/arch/arm/arm32/entry.S | 138 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 109 insertions(+), 29 deletions(-)

diff --git a/xen/arch/arm/arm32/entry.S b/xen/arch/arm/arm32/entry.S
index 150cbc0b4b..ec90cca093 100644
--- a/xen/arch/arm/arm32/entry.S
+++ b/xen/arch/arm/arm32/entry.S
@@ -4,6 +4,17 @@
 #include <asm/alternative.h>
 #include <public/xen.h>

+/*
+ * Short-hands to defined the interrupts (A, I, F)
+ *
+ * _ means the interrupt state will not change
+ * X means the state of interrupt X will change
+ *
+ * To be used with msr cpsr_* only
+ */
+#define IFLAGS_AIF    PSR_ABT_MASK | PSR_IRQ_MASK | PSR_FIQ_MASK
+#define IFLAGS_A_F    PSR_ABT_MASK | PSR_FIQ_MASK
+
 #define SAVE_ONE_BANKED(reg)    mrs r11, reg; str r11, [sp, #UREGS_##reg]
 #define RESTORE_ONE_BANKED(reg) ldr r11, [sp, #UREGS_##reg]; msr reg, r11

@@ -106,10 +117,18 @@ skip_check:
     mov pc, lr

 /*
- * Macro to define trap entry. The iflags corresponds to the list of
- * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask.
+ * Macro to define a trap entry.
+ *
+ *  @guest_iflags: Optional list of interrupts to unmask when
+ *      entering from guest context. As this is used with cpsie,
+ *      the letter (a, i, f) should be used.
+ *
+ *  @hyp_iflags: Optional list of interrupts to inherit when
+ *      entering from hypervisor context. Any interrupts not
+ *      listed will be kept unchanged. As this is used with cpsr_*,
+ *      IFLAGS_* short-hands should be used.
  */
-        .macro vector trap, iflags
+        .macro vector trap, guest_iflags=n, hyp_iflags=0
         /* Save registers in the stack */
         sub     sp, #(UREGS_SP_usr - UREGS_sp)  /* SP, LR, SPSR, PC */
         push    {r0-r12}                        /* Save R0-R12 */
@@ -127,12 +146,39 @@ skip_check:

         mrs     r11, SPSR_hyp
         str     r11, [sp, #UREGS_cpsr]
-        and     r11, #PSR_MODE_MASK
-        cmp     r11, #PSR_MODE_HYP
-        blne    save_guest_regs

+        /*
+         * We need to distinguish whether we came from guest or
+         * hypervisor context.
+         */
+        and     r0, r11, #PSR_MODE_MASK
+        cmp     r0, #PSR_MODE_HYP
+
+        bne     1f
+        /*
+         * Trap from the hypervisor
+         *
+         * Inherit the state of the interrupts from the hypervisor
+         * context. For that we need to use SPSR (stored in r11) and
+         * modify CPSR accordingly.
+         *
+         * CPSR = (CPSR & ~hyp_iflags) | (SPSR & hyp_iflags)
+         */
+        mrs     r10, cpsr
+        bic     r10, r10, #\hyp_iflags
+        and     r11, r11, #\hyp_iflags
+        orr     r10, r10, r11
+        msr     cpsr_cx, r10
+        b       2f
+
+1:
+        /* Trap from the guest */
+        bl      save_guest_regs
+        .if     \guest_iflags != n
+        cpsie   \guest_iflags
+        .endif
+2:
         /* We are ready to handle the trap, setup the registers and jump. */
-        cpsie   \iflags
         adr     lr, return_from_trap
         mov     r0, sp
         /*
@@ -144,20 +190,6 @@ skip_check:
         b       do_trap_\trap
         .endm

-#define __DEFINE_TRAP_ENTRY(trap, iflags) \
-        ALIGN; \
-trap_##trap: \
-        vector trap, iflags
-
-/* Trap handler which unmask IRQ/Abort, keep FIQ masked */
-#define DEFINE_TRAP_ENTRY(trap) __DEFINE_TRAP_ENTRY(trap, ai)
-
-/* Trap handler which unmask Abort, keep IRQ/FIQ masked */
-#define DEFINE_TRAP_ENTRY_NOIRQ(trap) __DEFINE_TRAP_ENTRY(trap, a)
-
-/* Trap handler which unmask IRQ, keep Abort/FIQ masked */
-#define DEFINE_TRAP_ENTRY_NOABORT(trap) __DEFINE_TRAP_ENTRY(trap, i)
-
         .align 5
 GLOBAL(hyp_traps_vector)
         b trap_reset                 /* 0x00 - Reset */
@@ -228,14 +260,62 @@ decode_vectors:

 #endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */

-DEFINE_TRAP_ENTRY(reset)
-DEFINE_TRAP_ENTRY(undefined_instruction)
-DEFINE_TRAP_ENTRY(hypervisor_call)
-DEFINE_TRAP_ENTRY(prefetch_abort)
-DEFINE_TRAP_ENTRY(guest_sync)
-DEFINE_TRAP_ENTRY_NOIRQ(irq)
-DEFINE_TRAP_ENTRY_NOIRQ(fiq)
-DEFINE_TRAP_ENTRY_NOABORT(data_abort)
+/* Vector not used by the Hypervisor. */
+trap_reset:
+        vector reset
+
+/*
+ * Vector only used by the Hypervisor.
+ *
+ * While the exception can be executed with all the interrupts (e.g.
+ * IRQ) unmasked, the interrupted context may have purposefully masked
+ * some of them. So we want to inherit the state from the interrupted
+ * context.
+ */
+trap_undefined_instruction:
+        vector undefined_instruction, hyp_iflags=IFLAGS_AIF
+
+/* We should never reach this trap */
+trap_hypervisor_call:
+        vector hypervisor_call
+
+/*
+ * Vector only used by the hypervisor.
+ *
+ * While the exception can be executed with all the interrupts (e.g.
+ * IRQ) unmasked, the interrupted context may have purposefully masked
+ * some of them. So we want to inherit the state from the interrupted
+ * context.
+ */
+trap_prefetch_abort:
+        vector prefetch_abort, hyp_iflags=IFLAGS_AIF
+
+/*
+ * Vector only used by the hypervisor.
+ *
+ * Data Abort should be rare and most likely fatal. It is best to not
+ * unmask any interrupts to limit the amount of code that can run before
+ * the Data Abort is treated.
+ */
+trap_data_abort:
+        vector data_abort
+
+/* Vector only used by the guest. We can unmask Abort/IRQ. */
+trap_guest_sync:
+        vector guest_sync, guest_iflags=ai
+
+
+/* Vector used by the hypervisor and the guest. */
+trap_irq:
+        vector irq, guest_iflags=a, hyp_iflags=IFLAGS_A_F
+
+/*
+ * Vector used by the hypervisor and the guest.
+ *
+ * FIQ are not meant to happen, so we don't unmask any interrupts.
+ */
+trap_fiq:
+        vector fiq

 return_from_trap:
     /*
--
2.11.0

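The inheritance formula in the comment above, CPSR = (CPSR & ~hyp_iflags) | (SPSR & hyp_iflags), is ordinary bit merging. A standalone sketch with the AArch32 A/I/F mask positions; the helper is illustrative, not Xen code:

    #include <assert.h>
    #include <stdint.h>

    #define PSR_ABT_MASK (1u << 8)
    #define PSR_IRQ_MASK (1u << 7)
    #define PSR_FIQ_MASK (1u << 6)

    /* Keep the CPSR bits outside the inherit set; take the rest from
     * the interrupted context's SPSR. */
    static uint32_t inherit_iflags(uint32_t cpsr, uint32_t spsr,
                                   uint32_t inherit)
    {
        return (cpsr & ~inherit) | (spsr & inherit);
    }

    int main(void)
    {
        /* An IRQ masked in the interrupted context stays masked after
         * the trap is taken. */
        uint32_t inherit = PSR_ABT_MASK | PSR_IRQ_MASK | PSR_FIQ_MASK;

        assert(inherit_iflags(0, PSR_IRQ_MASK, inherit) & PSR_IRQ_MASK);
        return 0;
    }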
@@ -1,114 +0,0 @@
From c6d290ce157a044dec417fdda8db71e41a37d744 Mon Sep 17 00:00:00 2001
From: Julien Grall <julien.grall@arm.com>
Date: Mon, 7 Oct 2019 18:10:56 +0100
Subject: [PATCH 4/4] xen/arm64: Don't blindly unmask interrupts on trap
 without a change of level

Some of the traps without a change of the level (i.e. hypervisor ->
hypervisor) will unmask interrupts regardless of their state in the
interrupted context.

One of the consequences is IRQ will be unmasked when receiving a
synchronous exception (used by WARN*()). This could result in unexpected
behavior such as a deadlock (if a lock was shared with interrupts).

In a nutshell, interrupts should only be unmasked when it is safe to
do so. Xen only unmasks IRQ and Abort interrupts, so the logic can stay
simple:
    - hyp_error: All the interrupts are now kept masked. SError should
      be pretty rare and if it ever happens then we most likely want to
      avoid any other interrupts being generated. The potential main
      "caller" is during virtual SError synchronization on the exit
      path from the guest (see check_pending_vserror).

    - hyp_sync: The interrupt state is inherited from the interrupted
      context.

    - hyp_irq: All the interrupts but IRQ state are inherited from the
      interrupted context. IRQ is kept masked.

This is part of XSA-303.

Reported-by: Julien Grall <Julien.Grall@arm.com>
Signed-off-by: Julien Grall <julien.grall@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Reviewed-by: Andre Przywara <andre.przywara@arm.com>
---
 xen/arch/arm/arm64/entry.S | 47 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 43 insertions(+), 4 deletions(-)

diff --git a/xen/arch/arm/arm64/entry.S b/xen/arch/arm/arm64/entry.S
index 2d9a2713a1..3e41ba65b6 100644
--- a/xen/arch/arm/arm64/entry.S
+++ b/xen/arch/arm/arm64/entry.S
@@ -188,24 +188,63 @@ hyp_error_invalid:
         entry   hyp=1
         invalid BAD_ERROR

+/*
+ * SError received while running in the hypervisor mode.
+ *
+ * Technically, we could unmask the IRQ if it were unmasked in the
+ * interrupted context. However, this requires checking the PSTATE. For
+ * simplicity, as SError should be rare and potentially fatal,
+ * all interrupts are kept masked.
+ */
 hyp_error:
         entry   hyp=1
-        msr     daifclr, #2
         mov     x0, sp
         bl      do_trap_hyp_serror
         exit    hyp=1

-/* Traps taken in Current EL with SP_ELx */
+/*
+ * Synchronous exception received while running in the hypervisor mode.
+ *
+ * While the exception could be executed with all the interrupts (e.g.
+ * IRQ) unmasked, the interrupted context may have purposefully masked
+ * some of them. So we want to inherit the state from the interrupted
+ * context.
+ */
 hyp_sync:
         entry   hyp=1
-        msr     daifclr, #6
+
+        /* Inherit interrupts */
+        mrs     x0, SPSR_el2
+        and     x0, x0, #(PSR_DBG_MASK | PSR_ABT_MASK | PSR_IRQ_MASK | PSR_FIQ_MASK)
+        msr     daif, x0
+
         mov     x0, sp
         bl      do_trap_hyp_sync
         exit    hyp=1

+/*
+ * IRQ received while running in the hypervisor mode.
+ *
+ * While the exception could be executed with all the interrupts but IRQ
+ * unmasked, the interrupted context may have purposefully masked some
+ * of them. So we want to inherit the state from the interrupted context
+ * and keep IRQ masked.
+ *
+ * XXX: We may want to consider an ordering between interrupts (e.g. if
+ * SError are masked, then IRQ should be masked too). However, this
+ * would require some rework in some paths (e.g. panic, livepatch) to
+ * ensure the ordering is enforced everywhere.
+ */
 hyp_irq:
         entry   hyp=1
-        msr     daifclr, #4
+
+        /* Inherit D, A, F interrupts and keep I masked */
+        mrs     x0, SPSR_el2
+        mov     x1, #(PSR_DBG_MASK | PSR_ABT_MASK | PSR_FIQ_MASK)
+        and     x0, x0, x1
+        orr     x0, x0, #PSR_IRQ_MASK
+        msr     daif, x0
+
         mov     x0, sp
         bl      do_trap_irq
         exit    hyp=1
--
2.11.0

@@ -1,71 +0,0 @@
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/vtd: Hide superpage support for SandyBridge IOMMUs

Something causes SandyBridge IOMMUs to choke when sharing EPT pagetables, and
an EPT superpage gets shattered. The root cause is still under investigation,
but the end result is unusable in combination with CVE-2018-12207 protections.

This is part of XSA-304 / CVE-2018-12207

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>

diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
index 16eada9fa2..a71c8b0f84 100644
--- a/xen/drivers/passthrough/vtd/extern.h
+++ b/xen/drivers/passthrough/vtd/extern.h
@@ -97,6 +97,8 @@ void vtd_ops_postamble_quirk(struct iommu* iommu);
 int __must_check me_wifi_quirk(struct domain *domain,
                                u8 bus, u8 devfn, int map);
 void pci_vtd_quirk(const struct pci_dev *);
+void quirk_iommu_caps(struct iommu *iommu);
+
 bool_t platform_supports_intremap(void);
 bool_t platform_supports_x2apic(void);

diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
index b3664ecbe0..5d34f75306 100644
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -1215,6 +1215,8 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd)
     if ( !(iommu->cap + 1) || !(iommu->ecap + 1) )
         return -ENODEV;

+    quirk_iommu_caps(iommu);
+
     if ( cap_fault_reg_offset(iommu->cap) +
          cap_num_fault_regs(iommu->cap) * PRIMARY_FAULT_REG_LEN >= PAGE_SIZE ||
          ecap_iotlb_offset(iommu->ecap) >= PAGE_SIZE )
diff --git a/xen/drivers/passthrough/vtd/quirks.c b/xen/drivers/passthrough/vtd/quirks.c
index d6db862678..b02688e316 100644
--- a/xen/drivers/passthrough/vtd/quirks.c
+++ b/xen/drivers/passthrough/vtd/quirks.c
@@ -540,3 +540,28 @@ void pci_vtd_quirk(const struct pci_dev *pdev)
         break;
     }
 }
+
+void __init quirk_iommu_caps(struct iommu *iommu)
+{
+    /*
+     * IOMMU Quirks:
+     *
+     * SandyBridge IOMMUs claim support for 2M and 1G superpages, but don't
+     * implement superpages internally.
+     *
+     * There are issues changing the walk length under in-flight DMA, which
+     * has manifested as incompatibility between EPT/IOMMU sharing and the
+     * workaround for CVE-2018-12207 / XSA-304.  Hide the superpage
+     * capabilities in the IOMMU, which will prevent Xen from sharing the EPT
+     * and IOMMU pagetables.
+     *
+     * Detection of SandyBridge unfortunately has to be done by processor
+     * model because the client parts don't expose their IOMMUs as PCI devices
+     * we could match with a Device ID.
+     */
+    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+         boot_cpu_data.x86 == 6 &&
+         (boot_cpu_data.x86_model == 0x2a ||
+          boot_cpu_data.x86_model == 0x2d) )
+        iommu->cap &= ~(0xful << 34);
+}
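The quirk above works by clearing one capability field. Assuming, per the VT-d specification, that bits 34-37 of the capability register advertise second-level large-page (2M/1G) support, the masking reduces to the sketch below; the helper is a standalone stand-in, not the Xen function:

    #include <stdint.h>
    #include <stdio.h>

    /* Clear the superpage-support field so the rest of the IOMMU code
     * never attempts 2M/1G mappings (and so never shares EPT tables). */
    static uint64_t hide_superpages(uint64_t cap)
    {
        return cap & ~(0xfULL << 34);
    }

    int main(void)
    {
        uint64_t cap = (1ULL << 34) | 1;   /* pretend 2M support is claimed */

        printf("%#llx\n", (unsigned long long)hide_superpages(cap)); /* 0x1 */
        return 0;
    }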
@@ -1,272 +0,0 @@
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/vtx: Disable executable EPT superpages to work around
 CVE-2018-12207

CVE-2018-12207 covers a set of errata on various Intel processors, whereby a
machine check exception can be generated in a corner case when an executable
mapping changes size or cacheability without TLB invalidation.  HVM guest
kernels can trigger this to DoS the host.

To mitigate, in affected hardware, all EPT superpages are marked NX.  When an
instruction fetch violation is observed against the superpage, the superpage
is shattered to 4k and has execute permissions restored.  This prevents the
guest kernel from being able to create the necessary preconditions in the iTLB
to exploit the vulnerability.

This does come with a workload-dependent performance overhead, caused by
increased TLB pressure.  Performance can be restored, if guest kernels are
trusted not to mount an attack, by specifying ept=exec-sp on the command line.

This is part of XSA-304 / CVE-2018-12207

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>

diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
index 85081fdc94..e283017015 100644
--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -895,7 +895,7 @@ Controls for interacting with the system Extended Firmware Interface.
 uncacheable.
 
 ### ept
-> `= List of [ ad=<bool>, pml=<bool> ]`
+> `= List of [ ad=<bool>, pml=<bool>, exec-sp=<bool> ]`
 
 > Applicability: Intel
 
@@ -926,6 +926,16 @@ introduced with the Nehalem architecture.
 disable PML. `pml=0` can be used to prevent the use of PML on otherwise
 capable hardware.
 
+* The `exec-sp` boolean controls whether EPT superpages with execute
+  permissions are permitted.  In general this is good for performance.
+
+  However, on processors vulnerable to CVE-2018-12207, HVM guest kernels can
+  use executable superpages to crash the host.  By default, executable
+  superpages are disabled on affected hardware.
+
+  If HVM guest kernels are trusted not to mount a DoS against the system,
+  this option can be enabled to regain performance.
+
 ### extra_guest_irqs
 > `= [<domU number>][,<dom0 number>]`
 
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 2089a77270..84191d4e4b 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1814,6 +1814,24 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
         break;
     }
 
+    /*
+     * Workaround for XSA-304 / CVE-2018-12207.  If we take an execution
+     * fault against a non-executable superpage, shatter it to regain
+     * execute permissions.
+     */
+    if ( page_order > 0 && npfec.insn_fetch && npfec.present && !violation )
+    {
+        int res = p2m_set_entry(p2m, _gfn(gfn), mfn, PAGE_ORDER_4K,
+                                p2mt, p2ma);
+
+        if ( res )
+            printk(XENLOG_ERR "Failed to shatter gfn %"PRI_gfn": %d\n",
+                   gfn, res);
+
+        rc = !res;
+        goto out_put_gfn;
+    }
+
     if ( violation )
     {
         /* Should #VE be emulated for this fault? */
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 56519fee84..ec5ab860ad 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -67,6 +67,7 @@ integer_param("ple_window", ple_window);
 
 static bool __read_mostly opt_ept_pml = true;
 static s8 __read_mostly opt_ept_ad = -1;
+int8_t __read_mostly opt_ept_exec_sp = -1;
 
 static int __init parse_ept_param(const char *s)
 {
@@ -82,6 +83,8 @@ static int __init parse_ept_param(const char *s)
             opt_ept_ad = val;
         else if ( (val = parse_boolean("pml", s, ss)) >= 0 )
             opt_ept_pml = val;
+        else if ( (val = parse_boolean("exec-sp", s, ss)) >= 0 )
+            opt_ept_exec_sp = val;
         else
             rc = -EINVAL;
 
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 26b7ddb5fe..28cba8ec28 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2445,6 +2445,102 @@ static void pi_notification_interrupt(struct cpu_user_regs *regs)
 static void __init lbr_tsx_fixup_check(void);
 static void __init bdw_erratum_bdf14_fixup_check(void);
 
+/*
+ * Calculate whether the CPU is vulnerable to Instruction Fetch page
+ * size-change MCEs.
+ */
+static bool __init has_if_pschange_mc(void)
+{
+    uint64_t caps = 0;
+
+    /*
+     * If we are virtualised, there is nothing we can do.  Our EPT tables are
+     * shadowed by our hypervisor, and not walked by hardware.
+     */
+    if ( cpu_has_hypervisor )
+        return false;
+
+    if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
+        rdmsrl(MSR_ARCH_CAPABILITIES, caps);
+
+    if ( caps & ARCH_CAPS_IF_PSCHANGE_MC_NO )
+        return false;
+
+    /*
+     * IF_PSCHANGE_MC is only known to affect Intel Family 6 processors at
+     * this time.
+     */
+    if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+         boot_cpu_data.x86 != 6 )
+        return false;
+
+    switch ( boot_cpu_data.x86_model )
+    {
+        /*
+         * Core processors since at least Nehalem are vulnerable.
+         */
+    case 0x1f: /* Auburndale / Havendale */
+    case 0x1e: /* Nehalem */
+    case 0x1a: /* Nehalem EP */
+    case 0x2e: /* Nehalem EX */
+    case 0x25: /* Westmere */
+    case 0x2c: /* Westmere EP */
+    case 0x2f: /* Westmere EX */
+    case 0x2a: /* SandyBridge */
+    case 0x2d: /* SandyBridge EP/EX */
+    case 0x3a: /* IvyBridge */
+    case 0x3e: /* IvyBridge EP/EX */
+    case 0x3c: /* Haswell */
+    case 0x3f: /* Haswell EX/EP */
+    case 0x45: /* Haswell D */
+    case 0x46: /* Haswell H */
+    case 0x3d: /* Broadwell */
+    case 0x47: /* Broadwell H */
+    case 0x4f: /* Broadwell EP/EX */
+    case 0x56: /* Broadwell D */
+    case 0x4e: /* Skylake M */
+    case 0x5e: /* Skylake D */
+    case 0x55: /* Skylake-X / Cascade Lake */
+    case 0x8e: /* Kaby / Coffee / Whiskey Lake M */
+    case 0x9e: /* Kaby / Coffee / Whiskey Lake D */
+        return true;
+
+        /*
+         * Atom processors are not vulnerable.
+         */
+    case 0x1c: /* Pineview */
+    case 0x26: /* Lincroft */
+    case 0x27: /* Penwell */
+    case 0x35: /* Cloverview */
+    case 0x36: /* Cedarview */
+    case 0x37: /* Baytrail / Valleyview (Silvermont) */
+    case 0x4d: /* Avaton / Rangely (Silvermont) */
+    case 0x4c: /* Cherrytrail / Brasswell */
+    case 0x4a: /* Merrifield */
+    case 0x5a: /* Moorefield */
+    case 0x5c: /* Goldmont */
+    case 0x5d: /* SoFIA 3G Granite/ES2.1 */
+    case 0x65: /* SoFIA LTE AOSP */
+    case 0x5f: /* Denverton */
+    case 0x6e: /* Cougar Mountain */
+    case 0x75: /* Lightning Mountain */
+    case 0x7a: /* Gemini Lake */
+    case 0x86: /* Jacobsville */
+
+        /*
+         * Knights processors are not vulnerable.
+         */
+    case 0x57: /* Knights Landing */
+    case 0x85: /* Knights Mill */
+        return false;
+
+    default:
+        printk("Unrecognised CPU model %#x - assuming vulnerable to IF_PSCHANGE_MC\n",
+               boot_cpu_data.x86_model);
+        return true;
+    }
+}
+
 const struct hvm_function_table * __init start_vmx(void)
 {
     set_in_cr4(X86_CR4_VMXE);
@@ -2465,6 +2561,17 @@ const struct hvm_function_table * __init start_vmx(void)
      */
     if ( cpu_has_vmx_ept && (cpu_has_vmx_pat || opt_force_ept) )
     {
+        bool cpu_has_bug_pschange_mc = has_if_pschange_mc();
+
+        if ( opt_ept_exec_sp == -1 )
+        {
+            /* Default to non-executable superpages on vulnerable hardware. */
+            opt_ept_exec_sp = !cpu_has_bug_pschange_mc;
+
+            if ( cpu_has_bug_pschange_mc )
+                printk("VMX: Disabling executable EPT superpages due to CVE-2018-12207\n");
+        }
+
         vmx_function_table.hap_supported = 1;
         vmx_function_table.altp2m_supported = 1;
 
diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index 952ebad82f..834d4798c8 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -174,6 +174,12 @@ static void ept_p2m_type_to_flags(struct p2m_domain *p2m, ept_entry_t *entry,
         break;
     }
 
+    /*
+     * Don't create executable superpages if we need to shatter them to
+     * protect against CVE-2018-12207.
+     */
+    if ( !opt_ept_exec_sp && is_epte_superpage(entry) )
+        entry->x = 0;
 }
 
 #define GUEST_TABLE_MAP_FAILED 0
diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h
index ebaa74449b..371b912887 100644
--- a/xen/include/asm-x86/hvm/vmx/vmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
@@ -28,6 +28,8 @@
 #include <asm/hvm/trace.h>
 #include <asm/hvm/vmx/vmcs.h>
 
+extern int8_t opt_ept_exec_sp;
+
 typedef union {
     struct {
         u64 r : 1, /* bit 0 - Read permission */
diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
index 637259bd1f..32746aa8ae 100644
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -52,6 +52,7 @@
 #define ARCH_CAPS_SKIP_L1DFL            (_AC(1, ULL) << 3)
 #define ARCH_CAPS_SSB_NO                (_AC(1, ULL) << 4)
 #define ARCH_CAPS_MDS_NO                (_AC(1, ULL) << 5)
+#define ARCH_CAPS_IF_PSCHANGE_MC_NO     (_AC(1, ULL) << 6)
 
 #define MSR_FLUSH_CMD                   0x0000010b
 #define FLUSH_CMD_L1D                   (_AC(1, ULL) << 0)
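The interaction between the NX superpages and the fault handler above can be pictured with a toy model (hypothetical types, not the Xen p2m): superpage entries lose execute permission when created, and the first instruction fetch through one shatters it back to executable 4KiB mappings.

    #include <stdbool.h>
    #include <stdio.h>

    struct ept_entry {
        unsigned int order;  /* 0 = 4KiB, 9 = 2MiB, 18 = 1GiB */
        bool x;              /* execute permission */
    };

    static bool opt_exec_sp = false;     /* mirrors ept=no-exec-sp */

    static void create_mapping(struct ept_entry *e, unsigned int order)
    {
        e->order = order;
        e->x = true;
        if ( !opt_exec_sp && order > 0 ) /* superpages start non-executable */
            e->x = false;
    }

    /* Instruction-fetch fault: shatter to 4KiB and restore execute. */
    static void insn_fetch_fault(struct ept_entry *e)
    {
        if ( e->order > 0 && !e->x )
        {
            e->order = 0;
            e->x = true;
            printf("shattered superpage, execute restored\n");
        }
    }

    int main(void)
    {
        struct ept_entry e;

        create_mapping(&e, 9);           /* 2MiB mapping, created NX */
        insn_fetch_fault(&e);            /* guest fetches through it */
        printf("order=%u x=%d\n", e.order, e.x);
        return 0;
    }

The guest thus never holds an executable superpage translation in the iTLB, which is the precondition the erratum requires.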
@@ -1,108 +0,0 @@
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/vtx: Allow runtime modification of the exec-sp setting

See patch for details.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>

diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
index e283017015..84221fe60a 100644
--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -936,6 +936,21 @@ introduced with the Nehalem architecture.
   If HVM guest kernels are trusted not to mount a DoS against the system,
   this option can be enabled to regain performance.
 
+  This boolean may be modified at runtime using `xl set-parameters
+  ept=[no-]exec-sp` to switch between fast and secure.
+
+   * When switching from secure to fast, preexisting HVM domains will run
+     at their current performance until they are rebooted; new domains will
+     run without any overhead.
+
+   * When switching from fast to secure, all HVM domains will immediately
+     suffer a performance penalty.
+
+  **Warning: No guarantee is made that this runtime option will be retained
+  indefinitely, or that it will retain this exact behaviour.  It is
+  intended as an emergency option for people who first chose fast, then
+  change their minds to secure, and wish not to reboot.**
+
 ### extra_guest_irqs
 > `= [<domU number>][,<dom0 number>]`
 
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index ec5ab860ad..c4d8a5ba78 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -95,6 +95,41 @@ static int __init parse_ept_param(const char *s)
 }
 custom_param("ept", parse_ept_param);
 
+static int parse_ept_param_runtime(const char *s)
+{
+    int val;
+
+    if ( !cpu_has_vmx_ept || !hvm_funcs.hap_supported ||
+         !(hvm_funcs.hap_capabilities &
+           (HVM_HAP_SUPERPAGE_2MB | HVM_HAP_SUPERPAGE_1GB)) )
+    {
+        printk("VMX: EPT not available, or not in use - ignoring\n");
+        return 0;
+    }
+
+    if ( (val = parse_boolean("exec-sp", s, NULL)) < 0 )
+        return -EINVAL;
+
+    if ( val != opt_ept_exec_sp )
+    {
+        struct domain *d;
+
+        opt_ept_exec_sp = val;
+
+        rcu_read_lock(&domlist_read_lock);
+        for_each_domain ( d )
+            if ( paging_mode_hap(d) )
+                p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_rw);
+        rcu_read_unlock(&domlist_read_lock);
+    }
+
+    printk("VMX: EPT executable superpages %sabled\n",
+           val ? "en" : "dis");
+
+    return 0;
+}
+custom_runtime_only_param("ept", parse_ept_param_runtime);
+
 /* Dynamic (run-time adjusted) execution control flags. */
 u32 vmx_pin_based_exec_control __read_mostly;
 u32 vmx_cpu_based_exec_control __read_mostly;
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index f518f86493..16608098b1 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -289,15 +289,20 @@ static void change_entry_type_global(struct p2m_domain *p2m,
                                      p2m_type_t ot, p2m_type_t nt)
 {
     p2m->change_entry_type_global(p2m, ot, nt);
-    p2m->global_logdirty = (nt == p2m_ram_logdirty);
+    /* Don't allow 'recalculate' operations to change the logdirty state. */
+    if ( ot != nt )
+        p2m->global_logdirty = (nt == p2m_ram_logdirty);
 }
 
+/*
+ * May be called with ot = nt = p2m_ram_rw for its side effect of
+ * recalculating all PTEs in the p2m.
+ */
 void p2m_change_entry_type_global(struct domain *d,
                                   p2m_type_t ot, p2m_type_t nt)
 {
     struct p2m_domain *hostp2m = p2m_get_hostp2m(d);
 
-    ASSERT(ot != nt);
     ASSERT(p2m_is_changeable(ot) && p2m_is_changeable(nt));
 
     p2m_lock(hostp2m);
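In practice the runtime control added above is driven from dom0. Based on the documentation hunk, usage would look like the following (inferred from the patch text rather than tested output):

    xl set-parameters ept=no-exec-sp    # secure: superpages become NX again
    xl set-parameters ept=exec-sp       # fast: allow executable superpages

Switching to secure takes effect immediately for all HVM domains because every HAP domain's p2m is recalculated via p2m_change_entry_type_global(), which is why the ot == nt "recalculate" case had to be legalised in p2m.c.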
@@ -1,288 +0,0 @@
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/tsx: Introduce tsx= to use MSR_TSX_CTRL when available

To protect against the TSX Async Abort speculative vulnerability, Intel have
released new microcode for affected parts which introduces the MSR_TSX_CTRL
control, which allows TSX to be turned off.  This will be architectural on
future parts.

Introduce tsx= to provide a global on/off for TSX, including its enumeration
via CPUID.  Provide stub virtualisation of this MSR, as it is not exposed to
guests at the moment.

VMs may have booted before microcode is loaded, or before hosts have rebooted,
and they still want to migrate freely.  A VM which booted seeing TSX can
migrate safely to hosts with TSX disabled - TSX will start unconditionally
aborting, but still behave in a manner compatible with the ABI.

The guest-visible behaviour is equivalent to late loading the microcode and
setting the RTM_DISABLE bit in the course of live patching.

This is part of XSA-305 / CVE-2019-11135

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>

diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
index e283017015..b7e1bf8e8b 100644
--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -2033,6 +2033,20 @@ Xen version.
 ### tsc (x86)
 > `= unstable | skewed | stable:socket`
 
+### tsx
+    = <bool>
+
+    Applicability: x86
+    Default: true
+
+Controls for the use of Transactional Synchronization eXtensions.
+
+On Intel parts released in Q3 2019 (with updated microcode), and future parts,
+a control has been introduced which allows TSX to be turned off.
+
+On systems with the ability to turn TSX off, this boolean offers system wide
+control of whether TSX is enabled or disabled.
+
 ### ucode (x86)
 > `= [<integer> | scan]`
 
diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
index 8a8d8f060f..9b9a4435fb 100644
--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -66,6 +66,7 @@ obj-y += sysctl.o
 obj-y += time.o
 obj-y += trace.o
 obj-y += traps.o
+obj-y += tsx.o
 obj-y += usercopy.o
 obj-y += x86_emulate.o
 obj-$(CONFIG_TBOOT) += tboot.o
diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
index 57e80694f2..1727497459 100644
--- a/xen/arch/x86/cpuid.c
+++ b/xen/arch/x86/cpuid.c
@@ -524,6 +524,20 @@ void recalculate_cpuid_policy(struct domain *d)
     if ( cpu_has_itsc && (d->disable_migrate || d->arch.vtsc) )
         __set_bit(X86_FEATURE_ITSC, max_fs);
 
+    /*
+     * On hardware with MSR_TSX_CTRL, the admin may have elected to disable
+     * TSX and hide the feature bits.  Migrating-in VMs may have been booted
+     * pre-mitigation when the TSX features were visible.
+     *
+     * This situation is compatible (albeit with a perf hit to any TSX code in
+     * the guest), so allow the feature bits to remain set.
+     */
+    if ( cpu_has_tsx_ctrl )
+    {
+        __set_bit(X86_FEATURE_HLE, max_fs);
+        __set_bit(X86_FEATURE_RTM, max_fs);
+    }
+
     /* Clamp the toolstacks choices to reality. */
     for ( i = 0; i < ARRAY_SIZE(fs); i++ )
         fs[i] &= max_fs[i];
diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c
index 56de0fe9e1..c2722d7c73 100644
--- a/xen/arch/x86/msr.c
+++ b/xen/arch/x86/msr.c
@@ -132,6 +132,7 @@ int guest_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
     case MSR_FLUSH_CMD:
         /* Write-only */
     case MSR_TSX_FORCE_ABORT:
+    case MSR_TSX_CTRL:
         /* Not offered to guests. */
         goto gp_fault;
 
@@ -260,6 +261,7 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
     case MSR_ARCH_CAPABILITIES:
         /* Read-only */
     case MSR_TSX_FORCE_ABORT:
+    case MSR_TSX_CTRL:
         /* Not offered to guests. */
         goto gp_fault;
 
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index cf790f36ef..c1c7c44000 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -1594,6 +1594,8 @@ void __init noreturn __start_xen(unsigned long mbi_p)
 
     early_microcode_init();
 
+    tsx_init(); /* Needs microcode.  May change HLE/RTM feature bits. */
+
     identify_cpu(&boot_cpu_data);
 
     set_in_cr4(X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT);
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index 737a44f055..e21cf0a310 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -376,6 +376,8 @@ void start_secondary(void *unused)
     if ( boot_cpu_has(X86_FEATURE_IBRSB) )
         wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl);
 
+    tsx_init(); /* Needs microcode.  May change HLE/RTM feature bits. */
+
     if ( xen_guest )
         hypervisor_ap_setup();
 
diff --git a/xen/arch/x86/tsx.c b/xen/arch/x86/tsx.c
new file mode 100644
index 0000000000..a8ec2ccc69
--- /dev/null
+++ b/xen/arch/x86/tsx.c
@@ -0,0 +1,74 @@
+#include <xen/init.h>
+#include <asm/msr.h>
+
+/*
+ * Valid values:
+ *   1 => Explicit tsx=1
+ *   0 => Explicit tsx=0
+ *  -1 => Default, implicit tsx=1
+ *
+ * This is arranged such that the bottom bit encodes whether TSX is actually
+ * disabled, while identifying various explicit (>=0) and implicit (<0)
+ * conditions.
+ */
+int8_t __read_mostly opt_tsx = -1;
+int8_t __read_mostly cpu_has_tsx_ctrl = -1;
+
+static int __init parse_tsx(const char *s)
+{
+    int rc = 0, val = parse_bool(s, NULL);
+
+    if ( val >= 0 )
+        opt_tsx = val;
+    else
+        rc = -EINVAL;
+
+    return rc;
+}
+custom_param("tsx", parse_tsx);
+
+void tsx_init(void)
+{
+    /*
+     * This function is first called between microcode being loaded, and CPUID
+     * being scanned generally.  Calculate from raw data whether MSR_TSX_CTRL
+     * is available.
+     */
+    if ( unlikely(cpu_has_tsx_ctrl < 0) )
+    {
+        uint64_t caps = 0;
+
+        if ( boot_cpu_data.cpuid_level >= 7 &&
+             (cpuid_count_edx(7, 0) & cpufeat_mask(X86_FEATURE_ARCH_CAPS)) )
+            rdmsrl(MSR_ARCH_CAPABILITIES, caps);
+
+        cpu_has_tsx_ctrl = !!(caps & ARCH_CAPS_TSX_CTRL);
+    }
+
+    if ( cpu_has_tsx_ctrl )
+    {
+        uint64_t val;
+
+        rdmsrl(MSR_TSX_CTRL, val);
+
+        val &= ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR);
+        /* Check bottom bit only.  Higher bits are various sentinels. */
+        if ( !(opt_tsx & 1) )
+            val |= TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR;
+
+        wrmsrl(MSR_TSX_CTRL, val);
+    }
+    else if ( opt_tsx >= 0 )
+        printk_once(XENLOG_WARNING
+                    "MSR_TSX_CTRL not available - Ignoring tsx= setting\n");
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
index 32746aa8ae..d5f3899f73 100644
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -53,6 +53,7 @@
 #define ARCH_CAPS_SSB_NO                (_AC(1, ULL) << 4)
 #define ARCH_CAPS_MDS_NO                (_AC(1, ULL) << 5)
 #define ARCH_CAPS_IF_PSCHANGE_MC_NO     (_AC(1, ULL) << 6)
+#define ARCH_CAPS_TSX_CTRL              (_AC(1, ULL) << 7)
 
 #define MSR_FLUSH_CMD                   0x0000010b
 #define FLUSH_CMD_L1D                   (_AC(1, ULL) << 0)
@@ -60,6 +61,10 @@
 #define MSR_TSX_FORCE_ABORT             0x0000010f
 #define TSX_FORCE_ABORT_RTM             (_AC(1, ULL) << 0)
 
+#define MSR_TSX_CTRL                    0x00000122
+#define TSX_CTRL_RTM_DISABLE            (_AC(1, ULL) << 0)
+#define TSX_CTRL_CPUID_CLEAR            (_AC(1, ULL) << 1)
+
 /* Intel MSRs. Some also available on other CPUs */
 #define MSR_IA32_PERFCTR0               0x000000c1
 #define MSR_IA32_A_PERFCTR0             0x000004c1
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index d33ac34d29..1b52712180 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -263,6 +263,16 @@ static always_inline unsigned int cpuid_count_ebx(
     return ebx;
 }
 
+static always_inline unsigned int cpuid_count_edx(
+    unsigned int leaf, unsigned int subleaf)
+{
+    unsigned int edx, tmp;
+
+    cpuid_count(leaf, subleaf, &tmp, &tmp, &tmp, &edx);
+
+    return edx;
+}
+
 static inline unsigned long read_cr0(void)
 {
     unsigned long cr0;
@@ -609,6 +619,9 @@ static inline uint8_t get_cpu_family(uint32_t raw, uint8_t *model,
     return fam;
 }
 
+extern int8_t opt_tsx, cpu_has_tsx_ctrl;
+void tsx_init(void);
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* __ASM_X86_PROCESSOR_H */
diff --git a/xen/include/xen/lib.h b/xen/include/xen/lib.h
index 89939f43c8..6529f12dae 100644
--- a/xen/include/xen/lib.h
+++ b/xen/include/xen/lib.h
@@ -114,6 +114,16 @@ extern int printk_ratelimit(void);
 #define gprintk(lvl, fmt, args...) \
     printk(XENLOG_GUEST lvl "%pv " fmt, current, ## args)
 
+#define printk_once(fmt, args...)               \
+({                                              \
+    static bool __read_mostly once_;            \
+    if ( unlikely(!once_) )                     \
+    {                                           \
+        once_ = true;                           \
+        printk(fmt, ## args);                   \
+    }                                           \
+})
+
 #ifdef NDEBUG
 
 static inline void
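The sentinel encoding in tsx.c is worth spelling out: every value with the bottom bit set means "TSX stays enabled", which lets the code test opt_tsx & 1 without caring whether the value came from an explicit tsx= or a default. A minimal sketch of the same trick, using the values the patch defines:

    #include <stdint.h>
    #include <stdio.h>

    /* Same encoding as opt_tsx: bit 0 = enabled, sign = explicit vs implicit. */
    static const int8_t samples[] = { 1, 0, -1, -3 };

    int main(void)
    {
        for ( unsigned int i = 0; i < sizeof(samples); ++i )
        {
            int8_t v = samples[i];

            printf("opt_tsx=%d: %s, %s\n", v,
                   (v & 1) ? "TSX enabled" : "TSX disabled",
                   (v >= 0) ? "explicit choice" : "implicit default");
        }
        return 0;
    }

In two's complement, -1 and -3 both have bit 0 set, so the implicit defaults keep TSX on while remaining distinguishable from an explicit tsx=1.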
@@ -1,192 +0,0 @@
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/spec-ctrl: Mitigate the TSX Asynchronous Abort sidechannel

See patch documentation and comments.

This is part of XSA-305 / CVE-2019-11135

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>

diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
index b7e1bf8e8b..74e1e35b88 100644
--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -1920,7 +1920,7 @@ extreme care.**
 An overall boolean value, `spec-ctrl=no`, can be specified to turn off all
 mitigations, including pieces of infrastructure used to virtualise certain
 mitigation features for guests.  This also includes settings which `xpti`,
-`smt`, `pv-l1tf` control, unless the respective option(s) have been
+`smt`, `pv-l1tf`, `tsx` control, unless the respective option(s) have been
 specified earlier on the command line.
 
 Alternatively, a slightly more restricted `spec-ctrl=no-xen` can be used to
@@ -2037,7 +2037,7 @@ Xen version.
     = <bool>
 
     Applicability: x86
-    Default: true
+    Default: false on parts vulnerable to TAA, true otherwise
 
 Controls for the use of Transactional Synchronization eXtensions.
 
@@ -2047,6 +2047,19 @@ a control has been introduced which allows TSX to be turned off.
 On systems with the ability to turn TSX off, this boolean offers system wide
 control of whether TSX is enabled or disabled.
 
+On parts vulnerable to CVE-2019-11135 / TSX Asynchronous Abort, the following
+logic applies:
+
+ * An explicit `tsx=` choice is honoured, even if it is `true` and would
+   result in a vulnerable system.
+
+ * When no explicit `tsx=` choice is given, parts vulnerable to TAA will be
+   mitigated by disabling TSX, as this is the lowest overhead option.
+
+ * If the use of TSX is important, the more expensive TAA mitigations can be
+   opted in to with `smt=0 spec-ctrl=md-clear`, at which point TSX will remain
+   active by default.
+
 ### ucode (x86)
 > `= [<integer> | scan]`
 
diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
index b37d40e643..800139d79c 100644
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -96,6 +96,9 @@ static int __init parse_spec_ctrl(const char *s)
             if ( opt_pv_l1tf_domu < 0 )
                 opt_pv_l1tf_domu = 0;
 
+            if ( opt_tsx == -1 )
+                opt_tsx = -3;
+
         disable_common:
             opt_rsb_pv = false;
             opt_rsb_hvm = false;
@@ -306,7 +309,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
     printk("Speculative mitigation facilities:\n");
 
     /* Hardware features which pertain to speculative mitigations. */
-    printk("  Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n",
+    printk("  Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
            (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "",
            (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP" : "",
            (_7d0 & cpufeat_mask(X86_FEATURE_L1D_FLUSH)) ? " L1D_FLUSH" : "",
@@ -318,7 +321,9 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
            (caps & ARCH_CAPS_RSBA) ? " RSBA" : "",
            (caps & ARCH_CAPS_SKIP_L1DFL) ? " SKIP_L1DFL" : "",
            (caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : "",
-           (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : "");
+           (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : "",
+           (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "",
+           (caps & ARCH_CAPS_TAA_NO) ? " TAA_NO" : "");
 
     /* Compiled-in support which pertains to mitigations. */
     if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) )
@@ -332,7 +337,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
            "\n");
 
     /* Settings for Xen's protection, irrespective of guests. */
-    printk("  Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s%s%s\n",
+    printk("  Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s, Other:%s%s%s\n",
            thunk == THUNK_NONE      ? "N/A" :
            thunk == THUNK_RETPOLINE ? "RETPOLINE" :
            thunk == THUNK_LFENCE    ? "LFENCE" :
@@ -341,6 +346,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
            (default_xen_spec_ctrl & SPEC_CTRL_IBRS) ? "IBRS+" : "IBRS-",
            !boot_cpu_has(X86_FEATURE_SSBD) ? "" :
            (default_xen_spec_ctrl & SPEC_CTRL_SSBD) ? " SSBD+" : " SSBD-",
+           !(caps & ARCH_CAPS_TSX_CTRL) ? "" :
+           (opt_tsx & 1) ? " TSX+" : " TSX-",
            opt_ibpb ? " IBPB" : "",
            opt_l1d_flush ? " L1D_FLUSH" : "",
            opt_md_clear_pv || opt_md_clear_hvm ? " VERW" : "");
@@ -862,6 +869,7 @@ void __init init_speculation_mitigations(void)
 {
     enum ind_thunk thunk = THUNK_DEFAULT;
     bool use_spec_ctrl = false, ibrs = false, hw_smt_enabled;
+    bool cpu_has_bug_taa;
     uint64_t caps = 0;
 
     if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
@@ -1086,6 +1094,53 @@ void __init init_speculation_mitigations(void)
                "enabled.  Mitigations will not be fully effective.  Please\n"
                "choose an explicit smt=<bool> setting.  See XSA-297.\n");
 
+    /*
+     * Vulnerability to TAA is a little complicated to quantify.
+     *
+     * In the pipeline, it is just another way to get speculative access to
+     * stale load port, store buffer or fill buffer data, and therefore can be
+     * considered a superset of MDS (on TSX-capable parts).  On parts which
+     * predate MDS_NO, the existing VERW flushing will mitigate this
+     * sidechannel as well.
+     *
+     * On parts which contain MDS_NO, the lack of VERW flushing means that an
+     * attacker can still use TSX to target microarchitectural buffers to leak
+     * secrets.  Therefore, we consider TAA to be the set of TSX-capable parts
+     * which have MDS_NO but lack TAA_NO.
+     *
+     * Note: cpu_has_rtm (== hle) could already be hidden by `tsx=0` on the
+     *       cmdline.  MSR_TSX_CTRL will only appear on TSX-capable parts, so
+     *       we check both to spot TSX in a microcode/cmdline independent way.
+     */
+    cpu_has_bug_taa =
+        (cpu_has_rtm || (caps & ARCH_CAPS_TSX_CTRL)) &&
+        (caps & (ARCH_CAPS_MDS_NO | ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO;
+
+    /*
+     * On TAA-affected hardware, disabling TSX is the preferred mitigation, vs
+     * the MDS mitigation of disabling HT and using VERW flushing.
+     *
+     * On CPUs which advertise MDS_NO, VERW has no flushing side effect until
+     * the TSX_CTRL microcode is loaded, despite the MD_CLEAR CPUID bit being
+     * advertised, and there isn't a MD_CLEAR_2 flag to use...
+     *
+     * If we're on affected hardware, able to do something about it (which
+     * implies that VERW now works), no explicit TSX choice and traditional
+     * MDS mitigations (no-SMT, VERW) not obviously in use (someone might
+     * plausibly value TSX higher than Hyperthreading...), disable TSX to
+     * mitigate TAA.
+     */
+    if ( opt_tsx == -1 && cpu_has_bug_taa && (caps & ARCH_CAPS_TSX_CTRL) &&
+         ((hw_smt_enabled && opt_smt) ||
+          !boot_cpu_has(X86_FEATURE_SC_VERW_IDLE)) )
+    {
+        setup_clear_cpu_cap(X86_FEATURE_HLE);
+        setup_clear_cpu_cap(X86_FEATURE_RTM);
+
+        opt_tsx = 0;
+        tsx_init();
+    }
+
     print_details(thunk, caps);
 
     /*
diff --git a/xen/arch/x86/tsx.c b/xen/arch/x86/tsx.c
index a8ec2ccc69..2d202a0d4e 100644
--- a/xen/arch/x86/tsx.c
+++ b/xen/arch/x86/tsx.c
@@ -5,7 +5,8 @@
  * Valid values:
  *   1 => Explicit tsx=1
  *   0 => Explicit tsx=0
- *  -1 => Default, implicit tsx=1
+ *  -1 => Default, implicit tsx=1, may change to 0 to mitigate TAA
+ *  -3 => Implicit tsx=1 (feed-through from spec-ctrl=0)
  *
  * This is arranged such that the bottom bit encodes whether TSX is actually
  * disabled, while identifying various explicit (>=0) and implicit (<0)
diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
index d5f3899f73..3971b992d3 100644
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -54,6 +54,7 @@
 #define ARCH_CAPS_MDS_NO                (_AC(1, ULL) << 5)
 #define ARCH_CAPS_IF_PSCHANGE_MC_NO     (_AC(1, ULL) << 6)
 #define ARCH_CAPS_TSX_CTRL              (_AC(1, ULL) << 7)
+#define ARCH_CAPS_TAA_NO                (_AC(1, ULL) << 8)
 
 #define MSR_FLUSH_CMD                   0x0000010b
 #define FLUSH_CMD_L1D                   (_AC(1, ULL) << 0)
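The TAA classification above reduces to a small predicate over three bits of MSR_ARCH_CAPABILITIES plus the presence of TSX. Restated as standalone C, with the sample values invented purely for the demonstration:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define ARCH_CAPS_MDS_NO   (1ULL << 5)
    #define ARCH_CAPS_TSX_CTRL (1ULL << 7)
    #define ARCH_CAPS_TAA_NO   (1ULL << 8)

    /* TAA-affected: TSX-capable, fixed for MDS, but not fixed for TAA. */
    static bool has_bug_taa(bool cpu_has_rtm, uint64_t caps)
    {
        return (cpu_has_rtm || (caps & ARCH_CAPS_TSX_CTRL)) &&
               (caps & (ARCH_CAPS_MDS_NO | ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO;
    }

    int main(void)
    {
        /* Pre-MDS_NO part: VERW flushing already covers TAA. */
        printf("%d\n", has_bug_taa(true, 0));                           /* 0 */
        /* MDS_NO part with TSX but no TAA_NO: affected. */
        printf("%d\n", has_bug_taa(true, ARCH_CAPS_MDS_NO));            /* 1 */
        /* Fixed part advertising TAA_NO: not affected. */
        printf("%d\n", has_bug_taa(true,
                                   ARCH_CAPS_MDS_NO | ARCH_CAPS_TAA_NO)); /* 0 */
        return 0;
    }

Only the middle case triggers the default tsx=0 behaviour, which is why pre-MDS_NO hardware keeps TSX on unless told otherwise.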
@@ -0,0 +1,93 @@
From 9f807cf84a9a7a011cf1df7895c54d6031a7596d Mon Sep 17 00:00:00 2001
From: Julien Grall <julien@xen.org>
Date: Thu, 19 Dec 2019 08:12:21 +0000
Subject: [PATCH] xen/arm: Place a speculation barrier sequence following an
 eret instruction

Some CPUs can speculate past an ERET instruction and potentially perform
speculative accesses to memory before processing the exception return.
Since the register state is often controlled by a lower privilege level
at the point of an ERET, this could potentially be used as part of a
side-channel attack.

Newer CPUs may implement a new SB barrier instruction which acts
as an architected speculation barrier.  For current CPUs, the sequence
DSB; ISB is known to prevent speculation.

The latter sequence is heavier than SB, but it would never be executed
(this is speculation after all!).

Introduce a new macro 'sb' that could be used when a speculation barrier
is required.  For now it is using dsb; isb, but this could easily be
updated to cater for SB in the future.

This is XSA-312.

Signed-off-by: Julien Grall <julien@xen.org>
---
 xen/arch/arm/arm32/entry.S   | 1 +
 xen/arch/arm/arm64/entry.S   | 3 +++
 xen/include/asm-arm/macros.h | 9 +++++++++
 3 files changed, 13 insertions(+)

diff --git a/xen/arch/arm/arm32/entry.S b/xen/arch/arm/arm32/entry.S
index 31ccfb2631..b228d44b19 100644
--- a/xen/arch/arm/arm32/entry.S
+++ b/xen/arch/arm/arm32/entry.S
@@ -426,6 +426,7 @@ return_to_hypervisor:
         add sp, #(UREGS_SP_usr - UREGS_sp); /* SP, LR, SPSR, PC */
         clrex
         eret
+        sb
 
 /*
  * struct vcpu *__context_switch(struct vcpu *prev, struct vcpu *next)
diff --git a/xen/arch/arm/arm64/entry.S b/xen/arch/arm/arm64/entry.S
index d35855af96..175ea2981e 100644
--- a/xen/arch/arm/arm64/entry.S
+++ b/xen/arch/arm/arm64/entry.S
@@ -354,6 +354,7 @@ guest_sync:
          */
         mov     x1, xzr
         eret
+        sb
 
 check_wa2:
         /* ARM_SMCCC_ARCH_WORKAROUND_2 handling */
@@ -393,6 +394,7 @@ wa2_end:
 #endif /* !CONFIG_ARM_SSBD */
         mov     x0, xzr
         eret
+        sb
 guest_sync_slowpath:
         /*
          * x0/x1 may have been scratched by the fast path above, so avoid
@@ -457,6 +459,7 @@ return_from_trap:
         ldr     lr, [sp], #(UREGS_SPSR_el1 - UREGS_LR) /* CPSR, PC, SP, LR */
 
         eret
+        sb
 
 /*
  * Consume pending SError generated by the guest if any.
diff --git a/xen/include/asm-arm/macros.h b/xen/include/asm-arm/macros.h
index 91ea3505e4..4833671f4c 100644
--- a/xen/include/asm-arm/macros.h
+++ b/xen/include/asm-arm/macros.h
@@ -20,4 +20,13 @@
     .endr
     .endm
 
+    /*
+     * Speculation barrier
+     * XXX: Add support for the 'sb' instruction
+     */
+    .macro sb
+    dsb nsh
+    isb
+    .endm
+
 #endif /* __ASM_ARM_MACROS_H */
-- 
2.17.1
@@ -0,0 +1,26 @@
From: Jan Beulich <jbeulich@suse.com>
Subject: xenoprof: clear buffer intended to be shared with guests

alloc_xenheap_pages() making use of MEMF_no_scrub is fine for Xen
internally used allocations, but buffers allocated to be shared with
(unprivileged) guests need to be zapped of their prior content.

This is part of XSA-313.

Reported-by: Ilja Van Sprundel <ivansprundel@ioactive.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Wei Liu <wl@xen.org>

--- a/xen/common/xenoprof.c
+++ b/xen/common/xenoprof.c
@@ -253,6 +253,9 @@ static int alloc_xenoprof_struct(
         return -ENOMEM;
     }
 
+    for ( i = 0; i < npages; ++i )
+        clear_page(d->xenoprof->rawbuf + i * PAGE_SIZE);
+
     d->xenoprof->npages = npages;
     d->xenoprof->nbuf = nvcpu;
     d->xenoprof->bufsize = bufsize;
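The underlying rule is general: memory obtained without scrubbing may hold stale data from a previous owner, so it must be cleared before an untrusted party can see it. A userspace analogue of the fix, where share_with_guest() is a hypothetical stand-in for mapping pages to an unprivileged guest:

    #include <stdlib.h>
    #include <string.h>

    /* Hypothetical: hand a buffer to an unprivileged consumer. */
    static void share_with_guest(void *buf, size_t size)
    {
        (void)buf;
        (void)size;
    }

    int main(void)
    {
        size_t size = 4096 * 4;
        void *buf = malloc(size);  /* like MEMF_no_scrub: contents undefined */

        if ( !buf )
            return 1;

        memset(buf, 0, size);      /* the fix: zap prior content first */
        share_with_guest(buf, size);

        free(buf);
        return 0;
    }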
@@ -0,0 +1,132 @@
From: Jan Beulich <jbeulich@suse.com>
Subject: xenoprof: limit consumption of shared buffer data

Since a shared buffer can be written to by the guest, we may only read
the head and tail pointers from there (all other fields should only ever
be written to).  Furthermore, for any particular operation the two values
must be read exactly once, with both checks and consumption happening
with the thus read values.  (The backtrace related xenoprof_buf_space()
use in xenoprof_log_event() is an exception: The values used there get
re-checked by every subsequent xenoprof_add_sample().)

Since that code needed touching, also fix the double increment of the
lost samples count in case the backtrace related xenoprof_add_sample()
invocation in xenoprof_log_event() fails.

Where code is being touched anyway, add const as appropriate, but take
the opportunity to entirely drop the now unused domain parameter of
xenoprof_buf_space().

This is part of XSA-313.

Reported-by: Ilja Van Sprundel <ivansprundel@ioactive.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Wei Liu <wl@xen.org>

--- a/xen/common/xenoprof.c
+++ b/xen/common/xenoprof.c
@@ -479,25 +479,22 @@ static int add_passive_list(XEN_GUEST_HA
 
 
 /* Get space in the buffer */
-static int xenoprof_buf_space(struct domain *d, xenoprof_buf_t * buf, int size)
+static int xenoprof_buf_space(int head, int tail, int size)
 {
-    int head, tail;
-
-    head = xenoprof_buf(d, buf, event_head);
-    tail = xenoprof_buf(d, buf, event_tail);
-
     return ((tail > head) ? 0 : size) + tail - head - 1;
 }
 
 /* Check for space and add a sample.  Return 1 if successful, 0 otherwise. */
-static int xenoprof_add_sample(struct domain *d, xenoprof_buf_t *buf,
+static int xenoprof_add_sample(const struct domain *d,
+                               const struct xenoprof_vcpu *v,
                                uint64_t eip, int mode, int event)
 {
+    xenoprof_buf_t *buf = v->buffer;
     int head, tail, size;
 
     head = xenoprof_buf(d, buf, event_head);
     tail = xenoprof_buf(d, buf, event_tail);
-    size = xenoprof_buf(d, buf, event_size);
+    size = v->event_size;
 
     /* make sure indexes in shared buffer are sane */
     if ( (head < 0) || (head >= size) || (tail < 0) || (tail >= size) )
@@ -506,7 +503,7 @@ static int xenoprof_add_sample(struct do
         return 0;
     }
 
-    if ( xenoprof_buf_space(d, buf, size) > 0 )
+    if ( xenoprof_buf_space(head, tail, size) > 0 )
     {
         xenoprof_buf(d, buf, event_log[head].eip) = eip;
         xenoprof_buf(d, buf, event_log[head].mode) = mode;
@@ -530,7 +527,6 @@ static int xenoprof_add_sample(struct do
 int xenoprof_add_trace(struct vcpu *vcpu, uint64_t pc, int mode)
 {
     struct domain *d = vcpu->domain;
-    xenoprof_buf_t *buf = d->xenoprof->vcpu[vcpu->vcpu_id].buffer;
 
     /* Do not accidentally write an escape code due to a broken frame. */
     if ( pc == XENOPROF_ESCAPE_CODE )
@@ -539,7 +535,8 @@ int xenoprof_add_trace(struct vcpu *vcpu
         return 0;
     }
 
-    return xenoprof_add_sample(d, buf, pc, mode, 0);
+    return xenoprof_add_sample(d, &d->xenoprof->vcpu[vcpu->vcpu_id],
+                               pc, mode, 0);
 }
 
 void xenoprof_log_event(struct vcpu *vcpu, const struct cpu_user_regs *regs,
@@ -570,17 +567,22 @@ void xenoprof_log_event(struct vcpu *vcp
     /* Provide backtrace if requested. */
     if ( backtrace_depth > 0 )
     {
-        if ( (xenoprof_buf_space(d, buf, v->event_size) < 2) ||
-             !xenoprof_add_sample(d, buf, XENOPROF_ESCAPE_CODE, mode,
-                                  XENOPROF_TRACE_BEGIN) )
+        if ( xenoprof_buf_space(xenoprof_buf(d, buf, event_head),
+                                xenoprof_buf(d, buf, event_tail),
+                                v->event_size) < 2 )
         {
             xenoprof_buf(d, buf, lost_samples)++;
             lost_samples++;
             return;
         }
+
+        /* xenoprof_add_sample() will increment lost_samples on failure */
+        if ( !xenoprof_add_sample(d, v, XENOPROF_ESCAPE_CODE, mode,
+                                  XENOPROF_TRACE_BEGIN) )
+            return;
     }
 
-    if ( xenoprof_add_sample(d, buf, pc, mode, event) )
+    if ( xenoprof_add_sample(d, v, pc, mode, event) )
     {
         if ( is_active(vcpu->domain) )
             active_samples++;
--- a/xen/include/xen/xenoprof.h
+++ b/xen/include/xen/xenoprof.h
@@ -61,12 +61,12 @@ struct xenoprof {
 
 #ifndef CONFIG_COMPAT
 #define XENOPROF_COMPAT(x) 0
-#define xenoprof_buf(d, b, field) ((b)->field)
+#define xenoprof_buf(d, b, field) ACCESS_ONCE((b)->field)
 #else
 #define XENOPROF_COMPAT(x) ((x)->is_compat)
-#define xenoprof_buf(d, b, field) (*(!(d)->xenoprof->is_compat ? \
-                                       &(b)->native.field : \
-                                       &(b)->compat.field))
+#define xenoprof_buf(d, b, field) ACCESS_ONCE(*(!(d)->xenoprof->is_compat \
+                                                ? &(b)->native.field \
+                                                : &(b)->compat.field))
 #endif
 
 struct domain;
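The pattern the patch enforces is worth isolating: read each guest-writable index exactly once per operation, validate the snapshot, and compute with that snapshot only. A self-contained sketch using C11 atomics in place of Xen's ACCESS_ONCE(), with the ring-space formula taken straight from the patch:

    #include <stdatomic.h>
    #include <stdio.h>

    /* Guest-writable ring indices; the guest may change them at any time. */
    static _Atomic int event_head, event_tail;

    static int buf_space(int head, int tail, int size)
    {
        return ((tail > head) ? 0 : size) + tail - head - 1;
    }

    static int add_sample(int size)
    {
        /* Snapshot once; never re-read while checking and consuming. */
        int head = atomic_load(&event_head);
        int tail = atomic_load(&event_tail);

        if ( head < 0 || head >= size || tail < 0 || tail >= size )
            return 0;            /* indexes insane - drop the sample */

        if ( buf_space(head, tail, size) <= 0 )
            return 0;            /* full */

        /* ... write slot 'head' here, then publish head + 1 ... */
        atomic_store(&event_head, (head + 1) % size);
        return 1;
    }

    int main(void)
    {
        printf("added: %d\n", add_sample(8));
        return 0;
    }

Re-reading the indices between the check and the consumption would let the guest move them past the validated range in the meantime, which is exactly the window the patch closes.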
@@ -0,0 +1,121 @@
From ab49f005f7d01d4004d76f2e295d31aca7d4f93a Mon Sep 17 00:00:00 2001
From: Julien Grall <jgrall@amazon.com>
Date: Thu, 20 Feb 2020 20:54:40 +0000
Subject: [PATCH] xen/rwlock: Add missing memory barrier in the unlock path of
 rwlock

The rwlock unlock paths are using atomic_sub() to release the lock.
However the implementation of atomic_sub() rightfully doesn't contain a
memory barrier.  On Arm, this means a processor is allowed to re-order
the memory access with the preceding access.

In other words, the unlock may be seen by another processor before all
the memory accesses within the "critical" section.

The rwlock paths already contain barriers indirectly, but they are not
very useful without the counterpart in the unlock paths.

The memory barriers are not necessary on x86 because loads/stores are
not re-ordered with lock instructions.

So add arch_lock_release_barrier() in the unlock paths that will only
add a memory barrier on Arm.

Take the opportunity to document each lock path, explaining why a
barrier is not necessary.

This is XSA-314.

Signed-off-by: Julien Grall <jgrall@amazon.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>

---
 xen/include/xen/rwlock.h | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/xen/include/xen/rwlock.h b/xen/include/xen/rwlock.h
index 3dfea1ac2a..516486306f 100644
--- a/xen/include/xen/rwlock.h
+++ b/xen/include/xen/rwlock.h
@@ -48,6 +48,10 @@ static inline int _read_trylock(rwlock_t *lock)
     if ( likely(!(cnts & _QW_WMASK)) )
     {
         cnts = (u32)atomic_add_return(_QR_BIAS, &lock->cnts);
+        /*
+         * atomic_add_return() is a full barrier so no need for an
+         * arch_lock_acquire_barrier().
+         */
         if ( likely(!(cnts & _QW_WMASK)) )
             return 1;
         atomic_sub(_QR_BIAS, &lock->cnts);
@@ -64,11 +68,19 @@ static inline void _read_lock(rwlock_t *lock)
     u32 cnts;
 
     cnts = atomic_add_return(_QR_BIAS, &lock->cnts);
+    /*
+     * atomic_add_return() is a full barrier so no need for an
+     * arch_lock_acquire_barrier().
+     */
     if ( likely(!(cnts & _QW_WMASK)) )
         return;
 
     /* The slowpath will decrement the reader count, if necessary. */
     queue_read_lock_slowpath(lock);
+    /*
+     * queue_read_lock_slowpath() is using spinlock and therefore is a
+     * full barrier.  So no need for an arch_lock_acquire_barrier().
+     */
 }
 
 static inline void _read_lock_irq(rwlock_t *lock)
@@ -92,6 +104,7 @@ static inline unsigned long _read_lock_irqsave(rwlock_t *lock)
  */
 static inline void _read_unlock(rwlock_t *lock)
 {
+    arch_lock_release_barrier();
     /*
      * Atomically decrement the reader count
      */
@@ -121,11 +134,20 @@ static inline int _rw_is_locked(rwlock_t *lock)
  */
 static inline void _write_lock(rwlock_t *lock)
 {
-    /* Optimize for the unfair lock case where the fair flag is 0. */
+    /*
+     * Optimize for the unfair lock case where the fair flag is 0.
+     *
+     * atomic_cmpxchg() is a full barrier so no need for an
+     * arch_lock_acquire_barrier().
+     */
     if ( atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0 )
         return;
 
     queue_write_lock_slowpath(lock);
+    /*
+     * queue_write_lock_slowpath() is using spinlock and therefore is a
+     * full barrier.  So no need for an arch_lock_acquire_barrier().
+     */
 }
 
 static inline void _write_lock_irq(rwlock_t *lock)
@@ -157,11 +179,16 @@ static inline int _write_trylock(rwlock_t *lock)
     if ( unlikely(cnts) )
         return 0;
 
+    /*
+     * atomic_cmpxchg() is a full barrier so no need for an
+     * arch_lock_acquire_barrier().
+     */
     return likely(atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0);
 }
 
 static inline void _write_unlock(rwlock_t *lock)
 {
+    arch_lock_release_barrier();
     /*
      * If the writer field is atomic, it can be cleared directly.
      * Otherwise, an atomic subtraction will be used to clear it.
-- 
2.17.1
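In C11 terms, the bug was an unlock performed with relaxed ordering; the fix corresponds to release semantics, so all writes in the critical section become visible before the lock is observed free. A sketch of the reader-count unlock only, with QR_BIAS invented to mirror Xen's _QR_BIAS:

    #include <stdatomic.h>

    #define QR_BIAS 0x100       /* mirrors Xen's _QR_BIAS reader increment */

    static atomic_uint cnts;

    static void read_unlock_broken(void)
    {
        /* Like plain atomic_sub(): no ordering; prior writes may float past. */
        atomic_fetch_sub_explicit(&cnts, QR_BIAS, memory_order_relaxed);
    }

    static void read_unlock_fixed(void)
    {
        /*
         * Release ordering plays the role of arch_lock_release_barrier():
         * a no-op on x86 (strongly ordered), a dmb on Arm.
         */
        atomic_fetch_sub_explicit(&cnts, QR_BIAS, memory_order_release);
    }

    int main(void)
    {
        atomic_fetch_add_explicit(&cnts, QR_BIAS, memory_order_acquire);
        read_unlock_fixed();
        (void)read_unlock_broken;
        return 0;
    }

On x86 both variants compile to the same locked instruction, which is why the bug was only observable on Arm.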
@@ -0,0 +1,30 @@
From: Ross Lagerwall <ross.lagerwall@citrix.com>
Subject: xen/gnttab: Fix error path in map_grant_ref()

Part of XSA-295 (c/s 863e74eb2cffb) inadvertently re-positioned the brackets,
changing the logic.  If the _set_status() call fails, the grant_map hypercall
would fail with a status of 1 (rc != GNTST_okay) instead of the expected
negative GNTST_* error.

This error path can be taken due to bad guest state, and causes net/blk-back
in Linux to crash.

This is XSA-316.

Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Julien Grall <jgrall@amazon.com>

diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
index 9fd6e60416..4b5344dc21 100644
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -1031,7 +1031,7 @@ map_grant_ref(
     {
         if ( (rc = _set_status(shah, status, rd, rgt->gt_version, act,
                                op->flags & GNTMAP_readonly, 1,
-                               ld->domain_id) != GNTST_okay) )
+                               ld->domain_id)) != GNTST_okay )
             goto act_release_out;
 
         if ( !act->pin )
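The one-character nature of the bug deserves a standalone demonstration: with the parenthesis in the wrong place, the != comparison binds before the assignment, so rc receives the boolean result instead of the callee's status.

    #include <stdio.h>

    #define GNTST_okay 0

    static int _set_status(void)
    {
        return -1;  /* stands in for some negative GNTST_* error */
    }

    int main(void)
    {
        int rc;

        /* Broken (pre-fix) bracketing: rc = (status != GNTST_okay) = 1. */
        if ( (rc = _set_status() != GNTST_okay) )
            printf("broken: rc=%d\n", rc);   /* rc=1, error code lost */

        /* Fixed bracketing: rc = status first, then compare. */
        if ( (rc = _set_status()) != GNTST_okay )
            printf("fixed:  rc=%d\n", rc);   /* rc=-1 as intended */

        return 0;
    }

The caller-visible difference is exactly what the description says: a status of 1 rather than the negative GNTST_* value the frontend drivers expect.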
@@ -0,0 +1,39 @@
From: Jan Beulich <jbeulich@suse.com>
Subject: gnttab: fix GNTTABOP_copy continuation handling

The XSA-226 fix was flawed - the backwards transformation on rc was done
too early, causing a continuation to not get invoked when the need for
preemption was determined at the very first iteration of the request.
This in particular means that all of the status fields of the individual
operations would be left untouched, i.e. set to whatever the caller may
or may not have initialized them to.

This is part of XSA-318.

Reported-by: Pawel Wieczorkiewicz <wipawel@amazon.de>
Tested-by: Pawel Wieczorkiewicz <wipawel@amazon.de>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Juergen Gross <jgross@suse.com>

--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -3576,8 +3576,7 @@ do_grant_table_op(
         rc = gnttab_copy(copy, count);
         if ( rc > 0 )
         {
-            rc = count - rc;
-            guest_handle_add_offset(copy, rc);
+            guest_handle_add_offset(copy, count - rc);
             uop = guest_handle_cast(copy, void);
         }
         break;
@@ -3644,6 +3643,9 @@ do_grant_table_op(
  out:
     if ( rc > 0 || opaque_out != 0 )
     {
+        /* Adjust rc, see gnttab_copy() for why this is needed. */
+        if ( cmd == GNTTABOP_copy )
+            rc = count - rc;
         ASSERT(rc < count);
         ASSERT((opaque_out & GNTTABOP_CMD_MASK) == 0);
         rc = hypercall_create_continuation(__HYPERVISOR_grant_table_op, "ihi",