system/xen: Updated for version 4.17.0.

Signed-off-by: Mario Preksavec <mario@slackware.hr>

Signed-off-by: Willy Sudiarto Raharjo <willysr@slackbuilds.org>
Mario Preksavec 2022-12-27 03:46:38 +01:00, committed by Willy Sudiarto Raharjo
parent 45f679b708
commit 4c5d49121f
17 changed files with 63 additions and 1328 deletions

View File

@ -57,7 +57,7 @@ kernel-xen.sh: This script builds the Linux Kernel for a Xen Hypervisor.
* To make things a bit easier, a copy of Xen EFI binary can be found here:
http://slackware.hr/~mario/xen/xen-4.16.1.efi.gz
http://slackware.hr/~mario/xen/xen-4.17.0.efi.gz
!!! Make sure you understand what you are doing at this point; you could
easily lose your data. Always create backups !!!
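For reference, a minimal sketch of installing the prebuilt EFI binary onto the EFI System Partition; the mount point and loader directory below are assumptions, not something the README or this commit prescribes:

  # Fetch, unpack, and copy the prebuilt hypervisor (example paths only)
  wget http://slackware.hr/~mario/xen/xen-4.17.0.efi.gz
  gunzip xen-4.17.0.efi.gz
  install -D -m 0644 xen-4.17.0.efi /boot/efi/EFI/Slackware/xen-4.17.0.efi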

View File

@ -1,6 +1,6 @@
#
# Automatically generated file; DO NOT EDIT.
# Linux/x86 5.15.27 Kernel Configuration
# Linux/x86 5.15.80 Kernel Configuration
#
CONFIG_CC_VERSION_TEXT="gcc (GCC) 11.2.0"
CONFIG_CC_IS_GCC=y
@ -15,6 +15,7 @@ CONFIG_CC_CAN_LINK=y
CONFIG_CC_CAN_LINK_STATIC=y
CONFIG_CC_HAS_ASM_GOTO=y
CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y
CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT=y
CONFIG_CC_HAS_ASM_INLINE=y
CONFIG_CC_HAS_NO_PROFILE_FN_ATTR=y
CONFIG_IRQ_WORK=y
@ -348,7 +349,6 @@ CONFIG_X86_FEATURE_NAMES=y
CONFIG_X86_X2APIC=y
CONFIG_X86_MPPARSE=y
# CONFIG_GOLDFISH is not set
CONFIG_RETPOLINE=y
CONFIG_X86_CPU_RESCTRL=y
# CONFIG_X86_EXTENDED_PLATFORM is not set
CONFIG_X86_INTEL_LPSS=y
@ -517,6 +517,14 @@ CONFIG_HAVE_LIVEPATCH=y
CONFIG_LIVEPATCH=y
# end of Processor type and features
CONFIG_CC_HAS_RETURN_THUNK=y
CONFIG_SPECULATION_MITIGATIONS=y
CONFIG_PAGE_TABLE_ISOLATION=y
CONFIG_RETPOLINE=y
CONFIG_RETHUNK=y
CONFIG_CPU_UNRET_ENTRY=y
CONFIG_CPU_IBPB_ENTRY=y
CONFIG_CPU_IBRS_ENTRY=y
CONFIG_ARCH_HAS_ADD_PAGES=y
CONFIG_ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE=y
CONFIG_USE_PERCPU_NUMA_NODE_ID=y
@ -741,6 +749,7 @@ CONFIG_HAVE_KPROBES_ON_FTRACE=y
CONFIG_HAVE_FUNCTION_ERROR_INJECTION=y
CONFIG_HAVE_NMI=y
CONFIG_TRACE_IRQFLAGS_SUPPORT=y
CONFIG_TRACE_IRQFLAGS_NMI_SUPPORT=y
CONFIG_HAVE_ARCH_TRACEHOOK=y
CONFIG_HAVE_DMA_CONTIGUOUS=y
CONFIG_GENERIC_SMP_IDLE_THREAD=y
@ -2470,6 +2479,7 @@ CONFIG_PNPACPI=y
CONFIG_BLK_DEV=y
CONFIG_BLK_DEV_NULL_BLK=m
CONFIG_BLK_DEV_FD=y
# CONFIG_BLK_DEV_FD_RAWCMD is not set
CONFIG_CDROM=y
CONFIG_PARIDE=m
@ -3131,6 +3141,7 @@ CONFIG_ATL1=m
CONFIG_ATL1E=m
CONFIG_ATL1C=m
CONFIG_ALX=m
CONFIG_CX_ECAT=m
CONFIG_NET_VENDOR_BROADCOM=y
CONFIG_B44=m
CONFIG_B44_PCI_AUTOSELECT=y
@ -3148,8 +3159,6 @@ CONFIG_BNXT=m
CONFIG_BNXT_SRIOV=y
CONFIG_BNXT_FLOWER_OFFLOAD=y
CONFIG_BNXT_HWMON=y
CONFIG_NET_VENDOR_BROCADE=y
CONFIG_BNA=m
CONFIG_NET_VENDOR_CADENCE=y
CONFIG_MACB=m
CONFIG_MACB_USE_HWSTAMP=y
@ -3174,7 +3183,6 @@ CONFIG_CHELSIO_IPSEC_INLINE=m
CONFIG_NET_VENDOR_CISCO=y
CONFIG_ENIC=m
# CONFIG_NET_VENDOR_CORTINA is not set
CONFIG_CX_ECAT=m
CONFIG_DNET=m
CONFIG_NET_VENDOR_DEC=y
CONFIG_NET_TULIP=y
@ -3229,8 +3237,6 @@ CONFIG_I40EVF=m
CONFIG_ICE=m
CONFIG_FM10K=m
CONFIG_IGC=m
CONFIG_NET_VENDOR_MICROSOFT=y
CONFIG_MICROSOFT_MANA=m
CONFIG_JME=m
CONFIG_NET_VENDOR_LITEX=y
CONFIG_NET_VENDOR_MARVELL=y
@ -3280,10 +3286,13 @@ CONFIG_KS8851_MLL=m
CONFIG_KSZ884X_PCI=m
# CONFIG_NET_VENDOR_MICROCHIP is not set
# CONFIG_NET_VENDOR_MICROSEMI is not set
CONFIG_NET_VENDOR_MICROSOFT=y
CONFIG_MICROSOFT_MANA=m
CONFIG_NET_VENDOR_MYRI=y
CONFIG_MYRI10GE=m
CONFIG_MYRI10GE_DCA=y
CONFIG_FEALNX=m
# CONFIG_NET_VENDOR_NI is not set
CONFIG_NET_VENDOR_NATSEMI=y
CONFIG_NATSEMI=m
CONFIG_NS83820=m
@ -3296,7 +3305,6 @@ CONFIG_NFP=m
CONFIG_NFP_APP_FLOWER=y
CONFIG_NFP_APP_ABM_NIC=y
# CONFIG_NFP_DEBUG is not set
# CONFIG_NET_VENDOR_NI is not set
CONFIG_NET_VENDOR_8390=y
CONFIG_PCMCIA_AXNET=m
CONFIG_NE2K_PCI=m
@ -3324,6 +3332,8 @@ CONFIG_QED_RDMA=y
CONFIG_QED_ISCSI=y
CONFIG_QED_FCOE=y
CONFIG_QED_OOO=y
CONFIG_NET_VENDOR_BROCADE=y
CONFIG_BNA=m
CONFIG_NET_VENDOR_QUALCOMM=y
# CONFIG_QCOM_EMAC is not set
# CONFIG_RMNET is not set
@ -3344,6 +3354,11 @@ CONFIG_ROCKER=m
CONFIG_NET_VENDOR_SAMSUNG=y
CONFIG_SXGBE_ETH=m
CONFIG_NET_VENDOR_SEEQ=y
CONFIG_NET_VENDOR_SILAN=y
CONFIG_SC92031=m
CONFIG_NET_VENDOR_SIS=y
CONFIG_SIS900=m
CONFIG_SIS190=m
CONFIG_NET_VENDOR_SOLARFLARE=y
CONFIG_SFC=m
CONFIG_SFC_MTD=y
@ -3352,11 +3367,6 @@ CONFIG_SFC_SRIOV=y
# CONFIG_SFC_MCDI_LOGGING is not set
CONFIG_SFC_FALCON=m
CONFIG_SFC_FALCON_MTD=y
CONFIG_NET_VENDOR_SILAN=y
CONFIG_SC92031=m
CONFIG_NET_VENDOR_SIS=y
CONFIG_SIS900=m
CONFIG_SIS190=m
CONFIG_NET_VENDOR_SMSC=y
CONFIG_PCMCIA_SMC91C92=m
CONFIG_EPIC100=m
@ -7471,6 +7481,7 @@ CONFIG_TYPEC_TCPCI=m
CONFIG_TYPEC_RT1711H=m
CONFIG_TYPEC_TCPCI_MAXIM=m
CONFIG_TYPEC_FUSB302=m
CONFIG_TYPEC_WCOVE=m
CONFIG_TYPEC_UCSI=m
CONFIG_UCSI_CCG=m
CONFIG_UCSI_ACPI=m
@ -9502,7 +9513,6 @@ CONFIG_SECURITY_DMESG_RESTRICT=y
CONFIG_SECURITY=y
CONFIG_SECURITYFS=y
CONFIG_SECURITY_NETWORK=y
CONFIG_PAGE_TABLE_ISOLATION=y
CONFIG_SECURITY_INFINIBAND=y
CONFIG_SECURITY_NETWORK_XFRM=y
# CONFIG_SECURITY_PATH is not set
@ -9648,8 +9658,7 @@ CONFIG_CRYPTO_CRC32=m
CONFIG_CRYPTO_CRC32_PCLMUL=m
CONFIG_CRYPTO_XXHASH=m
CONFIG_CRYPTO_BLAKE2B=y
CONFIG_CRYPTO_BLAKE2S=m
CONFIG_CRYPTO_BLAKE2S_X86=m
CONFIG_CRYPTO_BLAKE2S_X86=y
CONFIG_CRYPTO_CRCT10DIF=y
CONFIG_CRYPTO_CRCT10DIF_PCLMUL=m
CONFIG_CRYPTO_GHASH=y
@ -9741,29 +9750,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m
CONFIG_CRYPTO_USER_API_ENABLE_OBSOLETE=y
CONFIG_CRYPTO_STATS=y
CONFIG_CRYPTO_HASH_INFO=y
#
# Crypto library routines
#
CONFIG_CRYPTO_LIB_AES=y
CONFIG_CRYPTO_LIB_ARC4=y
CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S=m
CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC=m
CONFIG_CRYPTO_LIB_BLAKE2S=m
CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA=m
CONFIG_CRYPTO_LIB_CHACHA_GENERIC=m
CONFIG_CRYPTO_LIB_CHACHA=m
CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519=m
CONFIG_CRYPTO_LIB_CURVE25519_GENERIC=m
CONFIG_CRYPTO_LIB_CURVE25519=m
CONFIG_CRYPTO_LIB_DES=y
CONFIG_CRYPTO_LIB_POLY1305_RSIZE=11
CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305=m
CONFIG_CRYPTO_LIB_POLY1305_GENERIC=m
CONFIG_CRYPTO_LIB_POLY1305=m
CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m
CONFIG_CRYPTO_LIB_SHA256=y
CONFIG_CRYPTO_LIB_SM4=m
CONFIG_CRYPTO_HW=y
CONFIG_CRYPTO_DEV_PADLOCK=m
CONFIG_CRYPTO_DEV_PADLOCK_AES=m
@ -9835,6 +9821,31 @@ CONFIG_GENERIC_IOMAP=y
CONFIG_ARCH_USE_CMPXCHG_LOCKREF=y
CONFIG_ARCH_HAS_FAST_MULTIPLIER=y
CONFIG_ARCH_USE_SYM_ANNOTATIONS=y
#
# Crypto library routines
#
CONFIG_CRYPTO_LIB_AES=y
CONFIG_CRYPTO_LIB_ARC4=y
CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S=y
CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC=y
CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA=m
CONFIG_CRYPTO_LIB_CHACHA_GENERIC=m
CONFIG_CRYPTO_LIB_CHACHA=m
CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519=m
CONFIG_CRYPTO_LIB_CURVE25519_GENERIC=m
CONFIG_CRYPTO_LIB_CURVE25519=m
CONFIG_CRYPTO_LIB_DES=y
CONFIG_CRYPTO_LIB_POLY1305_RSIZE=11
CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305=m
CONFIG_CRYPTO_LIB_POLY1305_GENERIC=m
CONFIG_CRYPTO_LIB_POLY1305=m
CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m
CONFIG_CRYPTO_LIB_SHA256=y
CONFIG_CRYPTO_LIB_SM4=m
# end of Crypto library routines
CONFIG_LIB_MEMNEQ=y
CONFIG_CRC_CCITT=m
CONFIG_CRC16=y
CONFIG_CRC_T10DIF=y
@ -9985,6 +9996,8 @@ CONFIG_SYMBOLIC_ERRNAME=y
CONFIG_DEBUG_BUGVERBOSE=y
# end of printk and dmesg options
CONFIG_AS_HAS_NON_CONST_LEB128=y
#
# Compile-time checks and compiler options
#
@ -10209,7 +10222,6 @@ CONFIG_STRICT_DEVMEM=y
#
# x86 Debugging
#
CONFIG_TRACE_IRQFLAGS_NMI_SUPPORT=y
CONFIG_EARLY_PRINTK_USB=y
CONFIG_X86_VERBOSE_BOOTUP=y
CONFIG_EARLY_PRINTK=y
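Most of the churn above comes from refreshing the config from 5.15.27 to 5.15.80 (options added, removed, or reordered by the kernel's own tooling). A hedged sketch for checking a regenerated config against the shipped one, using the kernel tree's diffconfig helper; the config file path is hypothetical:

  # Compare the shipped dom0 config with a freshly regenerated one
  cd linux-5.15.80
  cp /path/to/shipped-config .config    # hypothetical location of the SlackBuild config
  make olddefconfig                     # fill in new 5.15.80 options with defaults
  ./scripts/diffconfig /path/to/shipped-config .config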

View File

@ -5,8 +5,8 @@
# Written by Chris Abela <chris.abela@maltats.com>, 20100515
# Modified by Mario Preksavec <mario@slackware.hr>
KERNEL=${KERNEL:-5.15.27}
XEN=${XEN:-4.16.1}
KERNEL=${KERNEL:-5.15.80}
XEN=${XEN:-4.17.0}
ROOTMOD=${ROOTMOD:-ext4}
ROOTFS=${ROOTFS:-ext4}
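These are plain ${VAR:-default} environment overrides, so the versions can be pinned at invocation time; a hedged usage example (not a documented invocation, just how such defaults are typically overridden):

  # Build the dom0 kernel against the updated kernel/Xen pair
  KERNEL=5.15.80 XEN=4.17.0 ROOTFS=ext4 ./kernel-xen.sh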

View File

@ -7,7 +7,7 @@
set -e
KERNEL=${KERNEL:-5.15.27}
KERNEL=${KERNEL:-5.15.80}
# Build an image for the root file system and another for the swap
# Default values: 8GB and 500MB respectively.
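The comment above describes the image-creation step; a generic sketch of what such a step typically looks like (this is not the script's actual code, and the file names and sizes are illustrative):

  # Create and format an 8GB root image and a 500MB swap image (illustrative)
  dd if=/dev/zero of=rootfs.img bs=1M count=0 seek=8192   # sparse 8GB file
  dd if=/dev/zero of=swap.img bs=1M count=500
  mkfs.ext4 -F rootfs.img
  mkswap swap.img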

View File

@ -25,12 +25,12 @@
cd $(dirname $0) ; CWD=$(pwd)
PRGNAM=xen
VERSION=${VERSION:-4.16.2}
VERSION=${VERSION:-4.17.0}
BUILD=${BUILD:-1}
TAG=${TAG:-_SBo}
PKGTYPE=${PKGTYPE:-tgz}
SEABIOS=${SEABIOS:-1.14.0}
SEABIOS=${SEABIOS:-1.16.0}
OVMF=${OVMF:-20210824_7b4a99be8a}
IPXE=${IPXE:-3c040ad387099483102708bb1839110bc788cefb}
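As with kernel-xen.sh, these are ${VAR:-default} overrides; a hedged example of pinning them from the environment (standard SlackBuild usage, not something this commit mandates):

  # Build the xen package with the bundled component versions pinned explicitly
  VERSION=4.17.0 SEABIOS=1.16.0 OVMF=20210824_7b4a99be8a ./xen.SlackBuild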

View File

@ -1,9 +1,9 @@
PRGNAM="xen"
VERSION="4.16.2"
VERSION="4.17.0"
HOMEPAGE="http://www.xenproject.org/"
DOWNLOAD="UNSUPPORTED"
MD5SUM=""
DOWNLOAD_x86_64="http://mirror.slackware.hr/sources/xen/xen-4.16.2.tar.gz \
DOWNLOAD_x86_64="http://mirror.slackware.hr/sources/xen/xen-4.17.0.tar.gz \
http://mirror.slackware.hr/sources/xen-extfiles/ipxe-git-3c040ad387099483102708bb1839110bc788cefb.tar.gz \
http://mirror.slackware.hr/sources/xen-extfiles/lwip-1.3.0.tar.gz \
http://mirror.slackware.hr/sources/xen-extfiles/zlib-1.2.3.tar.gz \
@ -13,9 +13,9 @@ DOWNLOAD_x86_64="http://mirror.slackware.hr/sources/xen/xen-4.16.2.tar.gz \
http://mirror.slackware.hr/sources/xen-extfiles/polarssl-1.1.4-gpl.tgz \
http://mirror.slackware.hr/sources/xen-extfiles/gmp-4.3.2.tar.bz2 \
http://mirror.slackware.hr/sources/xen-extfiles/tpm_emulator-0.7.4.tar.gz \
http://mirror.slackware.hr/sources/xen-seabios/seabios-1.14.0.tar.gz \
http://mirror.slackware.hr/sources/xen-seabios/seabios-1.16.0.tar.gz \
http://mirror.slackware.hr/sources/xen-ovmf/xen-ovmf-20210824_7b4a99be8a.tar.bz2"
MD5SUM_x86_64="6bd720f53e3c34a35cb8a8897a561e18 \
MD5SUM_x86_64="b215062ff053378eed41e4a3e05081df \
23ba00d5e2c5b4343d12665af73e1cb5 \
36cc57650cffda9a0269493be2a169bb \
debc62758716a169df9f62e6ab2bc634 \
@ -25,7 +25,7 @@ MD5SUM_x86_64="6bd720f53e3c34a35cb8a8897a561e18 \
7b72caf22b01464ee7d6165f2fd85f44 \
dd60683d7057917e34630b4a787932e8 \
e26becb8a6a2b6695f6b3e8097593db8 \
9df3b7de6376850d09161137e7a9b61f \
1411e7647ef93424fe88fea5d0ef9a82 \
322d42a3378394b5486acc1564651a4f"
REQUIRES="acpica yajl"
MAINTAINER="Mario Preksavec"
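The DOWNLOAD_x86_64 and MD5SUM_x86_64 lists are positionally paired; a small sketch for verifying fetched tarballs against the .info file (sourcing the file this way is an assumption about local usage, not part of the SlackBuild itself):

  # Check each downloaded tarball against its listed MD5 sum (illustrative)
  . ./xen.info
  set -- $MD5SUM_x86_64
  for url in $DOWNLOAD_x86_64; do
    file=$(basename "$url")
    echo "$1  $file" | md5sum -c - || echo "MISMATCH: $file"
    shift
  done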

View File

@ -1,59 +0,0 @@
From 4b4359122a414cc15156e13e3805988b71ff9da0 Mon Sep 17 00:00:00 2001
From: Julien Grall <jgrall@amazon.com>
Date: Mon, 6 Jun 2022 06:17:25 +0000
Subject: [PATCH 1/2] xen/arm: p2m: Prevent adding mapping when domain is dying
During the domain destroy process, the domain will still be accessible
until it is fully destroyed. The same is true of the P2M, because we don't
bail out early if is_dying is non-zero. If a domain has permission to
modify another domain's P2M (i.e. dom0, or a stubdomain), then foreign
mappings can be added past relinquish_p2m_mapping().
Therefore, we need to prevent mappings from being added while the domain
is dying. This commit does so by adding a d->is_dying check to
p2m_set_entry(). It also enhances the check in relinquish_p2m_mapping()
to make sure that no mappings can be added to the P2M after the P2M lock
is released.
This is part of CVE-2022-33746 / XSA-410.
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Tested-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
xen/arch/arm/p2m.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index fb71fa4c1c90..cbeff90f4371 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -1093,6 +1093,15 @@ int p2m_set_entry(struct p2m_domain *p2m,
{
int rc = 0;
+ /*
+ * Any reference taken by the P2M mappings (e.g. foreign mapping) will
+ * be dropped in relinquish_p2m_mapping(). As the P2M will still
+ * be accessible after, we need to prevent mapping to be added when the
+ * domain is dying.
+ */
+ if ( unlikely(p2m->domain->is_dying) )
+ return -ENOMEM;
+
while ( nr )
{
unsigned long mask;
@@ -1610,6 +1619,8 @@ int relinquish_p2m_mapping(struct domain *d)
unsigned int order;
gfn_t start, end;
+ BUG_ON(!d->is_dying);
+ /* No mappings can be added in the P2M after the P2M lock is released. */
p2m_write_lock(p2m);
start = p2m->lowest_mapped_gfn;
--
2.37.1

View File

@ -1,165 +0,0 @@
From 0d5846490348fa09a0d0915d7c795685a016ce10 Mon Sep 17 00:00:00 2001
From: Julien Grall <jgrall@amazon.com>
Date: Mon, 6 Jun 2022 06:17:26 +0000
Subject: [PATCH 2/2] xen/arm: p2m: Handle preemption when freeing intermediate
page tables
At the moment the P2M page tables are freed when the domain structure
is freed, without any preemption. As the P2M is quite large, iterating
through it may take more time than is reasonable without intermediate
preemption (to run softirqs and perhaps the scheduler).
Split p2m_teardown() into two parts: one preemptible and called when
relinquishing the resources, the other non-preemptible and called
when freeing the domain structure.
As we are now freeing the P2M pages early, we also need to prevent
further allocations if someone calls p2m_set_entry() past p2m_teardown()
(I wasn't able to prove this will never happen). This is done by the
domain->is_dying check added to p2m_set_entry() in the previous patch.
Similarly, we want to make sure that no one can access the freed
pages. Therefore the root is cleared before the pages are freed.
This is part of CVE-2022-33746 / XSA-410.
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Tested-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
xen/arch/arm/domain.c | 10 +++++++--
xen/arch/arm/p2m.c | 47 ++++++++++++++++++++++++++++++++++++---
xen/include/asm-arm/p2m.h | 13 +++++++++--
3 files changed, 63 insertions(+), 7 deletions(-)
diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 96e1b235501d..2694c39127c5 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -789,10 +789,10 @@ fail:
void arch_domain_destroy(struct domain *d)
{
/* IOMMU page table is shared with P2M, always call
- * iommu_domain_destroy() before p2m_teardown().
+ * iommu_domain_destroy() before p2m_final_teardown().
*/
iommu_domain_destroy(d);
- p2m_teardown(d);
+ p2m_final_teardown(d);
domain_vgic_free(d);
domain_vuart_free(d);
free_xenheap_page(d->shared_info);
@@ -996,6 +996,7 @@ enum {
PROG_xen,
PROG_page,
PROG_mapping,
+ PROG_p2m,
PROG_done,
};
@@ -1056,6 +1057,11 @@ int domain_relinquish_resources(struct domain *d)
if ( ret )
return ret;
+ PROGRESS(p2m):
+ ret = p2m_teardown(d);
+ if ( ret )
+ return ret;
+
PROGRESS(done):
break;
diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index cbeff90f4371..3bcd1e897e88 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -1527,17 +1527,58 @@ static void p2m_free_vmid(struct domain *d)
spin_unlock(&vmid_alloc_lock);
}
-void p2m_teardown(struct domain *d)
+int p2m_teardown(struct domain *d)
{
struct p2m_domain *p2m = p2m_get_hostp2m(d);
+ unsigned long count = 0;
struct page_info *pg;
+ unsigned int i;
+ int rc = 0;
+
+ p2m_write_lock(p2m);
+
+ /*
+ * We are about to free the intermediate page-tables, so clear the
+ * root to prevent any walk to use them.
+ */
+ for ( i = 0; i < P2M_ROOT_PAGES; i++ )
+ clear_and_clean_page(p2m->root + i);
+
+ /*
+ * The domain will not be scheduled anymore, so in theory we should
+ * not need to flush the TLBs. Do it for safety purpose.
+ *
+ * Note that all the devices have already been de-assigned. So we don't
+ * need to flush the IOMMU TLB here.
+ */
+ p2m_force_tlb_flush_sync(p2m);
+
+ while ( (pg = page_list_remove_head(&p2m->pages)) )
+ {
+ free_domheap_page(pg);
+ count++;
+ /* Arbitrarily preempt every 512 iterations */
+ if ( !(count % 512) && hypercall_preempt_check() )
+ {
+ rc = -ERESTART;
+ break;
+ }
+ }
+
+ p2m_write_unlock(p2m);
+
+ return rc;
+}
+
+void p2m_final_teardown(struct domain *d)
+{
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
/* p2m not actually initialized */
if ( !p2m->domain )
return;
- while ( (pg = page_list_remove_head(&p2m->pages)) )
- free_domheap_page(pg);
+ ASSERT(page_list_empty(&p2m->pages));
if ( p2m->root )
free_domheap_pages(p2m->root, P2M_ROOT_ORDER);
diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
index 8f11d9c97b5d..b3ba83283e11 100644
--- a/xen/include/asm-arm/p2m.h
+++ b/xen/include/asm-arm/p2m.h
@@ -192,8 +192,17 @@ void setup_virt_paging(void);
/* Init the datastructures for later use by the p2m code */
int p2m_init(struct domain *d);
-/* Return all the p2m resources to Xen. */
-void p2m_teardown(struct domain *d);
+/*
+ * The P2M resources are freed in two parts:
+ * - p2m_teardown() will be called when relinquish the resources. It
+ * will free large resources (e.g. intermediate page-tables) that
+ * requires preemption.
+ * - p2m_final_teardown() will be called when domain struct is been
+ * freed. This *cannot* be preempted and therefore one small
+ * resources should be freed here.
+ */
+int p2m_teardown(struct domain *d);
+void p2m_final_teardown(struct domain *d);
/*
* Remove mapping refcount on each mapping page in the p2m
--
2.37.1

View File

@ -1,113 +0,0 @@
From: Roger Pau Monné <roger.pau@citrix.com>
Subject: x86/p2m: add option to skip root pagetable removal in p2m_teardown()
Add a new parameter to p2m_teardown() in order to select whether the
root page table should also be freed. Note that all users are
adjusted to pass the parameter to remove the root page tables, so
behavior is not modified.
No functional change intended.
This is part of CVE-2022-33746 / XSA-410.
Suggested-by: Julien Grall <julien@xen.org>
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -574,7 +574,7 @@ int p2m_init(struct domain *d);
int p2m_alloc_table(struct p2m_domain *p2m);
/* Return all the p2m resources to Xen. */
-void p2m_teardown(struct p2m_domain *p2m);
+void p2m_teardown(struct p2m_domain *p2m, bool remove_root);
void p2m_final_teardown(struct domain *d);
/* Add a page to a domain's p2m table */
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -541,18 +541,18 @@ void hap_final_teardown(struct domain *d
}
for ( i = 0; i < MAX_ALTP2M; i++ )
- p2m_teardown(d->arch.altp2m_p2m[i]);
+ p2m_teardown(d->arch.altp2m_p2m[i], true);
}
/* Destroy nestedp2m's first */
for (i = 0; i < MAX_NESTEDP2M; i++) {
- p2m_teardown(d->arch.nested_p2m[i]);
+ p2m_teardown(d->arch.nested_p2m[i], true);
}
if ( d->arch.paging.hap.total_pages != 0 )
hap_teardown(d, NULL);
- p2m_teardown(p2m_get_hostp2m(d));
+ p2m_teardown(p2m_get_hostp2m(d), true);
/* Free any memory that the p2m teardown released */
paging_lock(d);
hap_set_allocation(d, 0, NULL);
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -749,11 +749,11 @@ int p2m_alloc_table(struct p2m_domain *p
* hvm fixme: when adding support for pvh non-hardware domains, this path must
* cleanup any foreign p2m types (release refcnts on them).
*/
-void p2m_teardown(struct p2m_domain *p2m)
+void p2m_teardown(struct p2m_domain *p2m, bool remove_root)
/* Return all the p2m pages to Xen.
* We know we don't have any extra mappings to these pages */
{
- struct page_info *pg;
+ struct page_info *pg, *root_pg = NULL;
struct domain *d;
if (p2m == NULL)
@@ -763,10 +763,22 @@ void p2m_teardown(struct p2m_domain *p2m
p2m_lock(p2m);
ASSERT(atomic_read(&d->shr_pages) == 0);
- p2m->phys_table = pagetable_null();
+
+ if ( remove_root )
+ p2m->phys_table = pagetable_null();
+ else if ( !pagetable_is_null(p2m->phys_table) )
+ {
+ root_pg = pagetable_get_page(p2m->phys_table);
+ clear_domain_page(pagetable_get_mfn(p2m->phys_table));
+ }
while ( (pg = page_list_remove_head(&p2m->pages)) )
- d->arch.paging.free_page(d, pg);
+ if ( pg != root_pg )
+ d->arch.paging.free_page(d, pg);
+
+ if ( root_pg )
+ page_list_add(root_pg, &p2m->pages);
+
p2m_unlock(p2m);
}
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -2701,7 +2701,7 @@ int shadow_enable(struct domain *d, u32
paging_unlock(d);
out_unlocked:
if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) )
- p2m_teardown(p2m);
+ p2m_teardown(p2m, true);
if ( rv != 0 && pg != NULL )
{
pg->count_info &= ~PGC_count_mask;
@@ -2866,7 +2866,7 @@ void shadow_final_teardown(struct domain
shadow_teardown(d, NULL);
/* It is now safe to pull down the p2m map. */
- p2m_teardown(p2m_get_hostp2m(d));
+ p2m_teardown(p2m_get_hostp2m(d), true);
/* Free any shadow memory that the p2m teardown released */
paging_lock(d);
shadow_set_allocation(d, 0, NULL);

View File

@ -1,62 +0,0 @@
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/HAP: adjust monitor table related error handling
hap_make_monitor_table() will return INVALID_MFN if it encounters an
error condition, but hap_update_paging_modes() wasn't handling this
value, resulting in an inappropriate value being stored in
monitor_table. This would subsequently misguide at least
hap_vcpu_teardown(). Avoid this by bailing early.
Further, when a domain has/was already crashed or (perhaps less
important as there's no such path known to lead here) is already dying,
avoid calling domain_crash() on it again - that's at best confusing.
This is part of CVE-2022-33746 / XSA-410.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -39,6 +39,7 @@
#include <asm/domain.h>
#include <xen/numa.h>
#include <asm/hvm/nestedhvm.h>
+#include <public/sched.h>
#include "private.h"
@@ -405,8 +406,13 @@ static mfn_t hap_make_monitor_table(stru
return m4mfn;
oom:
- printk(XENLOG_G_ERR "out of memory building monitor pagetable\n");
- domain_crash(d);
+ if ( !d->is_dying &&
+ (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) )
+ {
+ printk(XENLOG_G_ERR "%pd: out of memory building monitor pagetable\n",
+ d);
+ domain_crash(d);
+ }
return INVALID_MFN;
}
@@ -766,6 +772,9 @@ static void hap_update_paging_modes(stru
if ( pagetable_is_null(v->arch.hvm.monitor_table) )
{
mfn_t mmfn = hap_make_monitor_table(v);
+
+ if ( mfn_eq(mmfn, INVALID_MFN) )
+ goto unlock;
v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn);
make_cr3(v, mmfn);
hvm_update_host_cr3(v);
@@ -774,6 +783,7 @@ static void hap_update_paging_modes(stru
/* CR3 is effectively updated by a mode change. Flush ASIDs, etc. */
hap_update_cr3(v, 0, false);
+ unlock:
paging_unlock(d);
put_gfn(d, cr3_gfn);
}

View File

@ -1,60 +0,0 @@
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/shadow: tolerate failure of sh_set_toplevel_shadow()
Subsequently sh_set_toplevel_shadow() will be adjusted to install a
blank entry in case prealloc fails. There are, in fact, pre-existing
error paths which would put in place a blank entry. The 4- and 2-level
code in sh_update_cr3(), however, assume the top level entry to be
valid.
Hence bail from the function in the unlikely event that it's not. Note
that 3-level logic works differently: In particular a guest is free to
supply a PDPTR pointing at 4 non-present (or otherwise deemed invalid)
entries. The guest will crash, but we already cope with that.
Really mfn_valid() is likely wrong to use in sh_set_toplevel_shadow(),
and it should instead be !mfn_eq(gmfn, INVALID_MFN). Avoid such a change
in security context, but add a respective assertion.
This is part of CVE-2022-33746 / XSA-410.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -2516,6 +2516,7 @@ void sh_set_toplevel_shadow(struct vcpu
/* Now figure out the new contents: is this a valid guest MFN? */
if ( !mfn_valid(gmfn) )
{
+ ASSERT(mfn_eq(gmfn, INVALID_MFN));
new_entry = pagetable_null();
goto install_new_entry;
}
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -3312,6 +3312,11 @@ sh_update_cr3(struct vcpu *v, int do_loc
if ( sh_remove_write_access(d, gmfn, 4, 0) != 0 )
guest_flush_tlb_mask(d, d->dirty_cpumask);
sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow, sh_make_shadow);
+ if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) )
+ {
+ ASSERT(d->is_dying || d->is_shutting_down);
+ return;
+ }
if ( !shadow_mode_external(d) && !is_pv_32bit_domain(d) )
{
mfn_t smfn = pagetable_get_mfn(v->arch.paging.shadow.shadow_table[0]);
@@ -3370,6 +3375,11 @@ sh_update_cr3(struct vcpu *v, int do_loc
if ( sh_remove_write_access(d, gmfn, 2, 0) != 0 )
guest_flush_tlb_mask(d, d->dirty_cpumask);
sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow, sh_make_shadow);
+ if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) )
+ {
+ ASSERT(d->is_dying || d->is_shutting_down);
+ return;
+ }
#else
#error This should never happen
#endif

View File

@ -1,255 +0,0 @@
From: Roger Pau Monné <roger.pau@citrix.com>
Subject: x86/shadow: tolerate failure in shadow_prealloc()
Prevent _shadow_prealloc() from calling BUG() when unable to fulfill
the pre-allocation and instead return true/false. Modify
shadow_prealloc() to crash the domain on allocation failure (if the
domain is not already dying), as shadow cannot operate normally after
that. Modify callers to also gracefully handle {_,}shadow_prealloc()
failing to fulfill the request.
Note this in turn requires adjusting the callers of
sh_make_monitor_table() also to handle it returning INVALID_MFN.
sh_update_paging_modes() is also modified to add additional error
paths in case of allocation failure, some of those will return with
null monitor page tables (and the domain likely crashed). This is no
different from current error paths, but the newly introduced ones are
more likely to trigger.
The now added failure points in sh_update_paging_modes() also require
that on some error return paths the previous structures are cleared,
and thus the monitor table is null.
While there adjust the 'type' parameter type of shadow_prealloc() to
unsigned int rather than u32.
This is part of CVE-2022-33746 / XSA-410.
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -36,6 +36,7 @@
#include <asm/flushtlb.h>
#include <asm/shadow.h>
#include <xen/numa.h>
+#include <public/sched.h>
#include "private.h"
DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags);
@@ -928,14 +929,15 @@ static inline void trace_shadow_prealloc
/* Make sure there are at least count order-sized pages
* available in the shadow page pool. */
-static void _shadow_prealloc(struct domain *d, unsigned int pages)
+static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages)
{
struct vcpu *v;
struct page_info *sp, *t;
mfn_t smfn;
int i;
- if ( d->arch.paging.shadow.free_pages >= pages ) return;
+ if ( d->arch.paging.shadow.free_pages >= pages )
+ return true;
/* Shouldn't have enabled shadows if we've no vcpus. */
ASSERT(d->vcpu && d->vcpu[0]);
@@ -951,7 +953,8 @@ static void _shadow_prealloc(struct doma
sh_unpin(d, smfn);
/* See if that freed up enough space */
- if ( d->arch.paging.shadow.free_pages >= pages ) return;
+ if ( d->arch.paging.shadow.free_pages >= pages )
+ return true;
}
/* Stage two: all shadow pages are in use in hierarchies that are
@@ -974,7 +977,7 @@ static void _shadow_prealloc(struct doma
if ( d->arch.paging.shadow.free_pages >= pages )
{
guest_flush_tlb_mask(d, d->dirty_cpumask);
- return;
+ return true;
}
}
}
@@ -987,7 +990,12 @@ static void _shadow_prealloc(struct doma
d->arch.paging.shadow.total_pages,
d->arch.paging.shadow.free_pages,
d->arch.paging.shadow.p2m_pages);
- BUG();
+
+ ASSERT(d->is_dying);
+
+ guest_flush_tlb_mask(d, d->dirty_cpumask);
+
+ return false;
}
/* Make sure there are at least count pages of the order according to
@@ -995,9 +1003,19 @@ static void _shadow_prealloc(struct doma
* This must be called before any calls to shadow_alloc(). Since this
* will free existing shadows to make room, it must be called early enough
* to avoid freeing shadows that the caller is currently working on. */
-void shadow_prealloc(struct domain *d, u32 type, unsigned int count)
+bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count)
{
- return _shadow_prealloc(d, shadow_size(type) * count);
+ bool ret = _shadow_prealloc(d, shadow_size(type) * count);
+
+ if ( !ret && !d->is_dying &&
+ (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) )
+ /*
+ * Failing to allocate memory required for shadow usage can only result in
+ * a domain crash, do it here rather that relying on every caller to do it.
+ */
+ domain_crash(d);
+
+ return ret;
}
/* Deliberately free all the memory we can: this will tear down all of
@@ -1218,7 +1236,7 @@ void shadow_free(struct domain *d, mfn_t
static struct page_info *
shadow_alloc_p2m_page(struct domain *d)
{
- struct page_info *pg;
+ struct page_info *pg = NULL;
/* This is called both from the p2m code (which never holds the
* paging lock) and the log-dirty code (which always does). */
@@ -1236,16 +1254,18 @@ shadow_alloc_p2m_page(struct domain *d)
d->arch.paging.shadow.p2m_pages,
shadow_min_acceptable_pages(d));
}
- paging_unlock(d);
- return NULL;
+ goto out;
}
- shadow_prealloc(d, SH_type_p2m_table, 1);
+ if ( !shadow_prealloc(d, SH_type_p2m_table, 1) )
+ goto out;
+
pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
d->arch.paging.shadow.p2m_pages++;
d->arch.paging.shadow.total_pages--;
ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask));
+ out:
paging_unlock(d);
return pg;
@@ -1336,7 +1356,9 @@ int shadow_set_allocation(struct domain
else if ( d->arch.paging.shadow.total_pages > pages )
{
/* Need to return memory to domheap */
- _shadow_prealloc(d, 1);
+ if ( !_shadow_prealloc(d, 1) )
+ return -ENOMEM;
+
sp = page_list_remove_head(&d->arch.paging.shadow.freelist);
ASSERT(sp);
/*
@@ -2334,12 +2356,13 @@ static void sh_update_paging_modes(struc
if ( mfn_eq(v->arch.paging.shadow.oos_snapshot[0], INVALID_MFN) )
{
int i;
+
+ if ( !shadow_prealloc(d, SH_type_oos_snapshot, SHADOW_OOS_PAGES) )
+ return;
+
for(i = 0; i < SHADOW_OOS_PAGES; i++)
- {
- shadow_prealloc(d, SH_type_oos_snapshot, 1);
v->arch.paging.shadow.oos_snapshot[i] =
shadow_alloc(d, SH_type_oos_snapshot, 0);
- }
}
#endif /* OOS */
@@ -2403,6 +2426,9 @@ static void sh_update_paging_modes(struc
mfn_t mmfn = sh_make_monitor_table(
v, v->arch.paging.mode->shadow.shadow_levels);
+ if ( mfn_eq(mmfn, INVALID_MFN) )
+ return;
+
v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn);
make_cr3(v, mmfn);
hvm_update_host_cr3(v);
@@ -2441,6 +2467,12 @@ static void sh_update_paging_modes(struc
v->arch.hvm.monitor_table = pagetable_null();
new_mfn = sh_make_monitor_table(
v, v->arch.paging.mode->shadow.shadow_levels);
+ if ( mfn_eq(new_mfn, INVALID_MFN) )
+ {
+ sh_destroy_monitor_table(v, old_mfn,
+ old_mode->shadow.shadow_levels);
+ return;
+ }
v->arch.hvm.monitor_table = pagetable_from_mfn(new_mfn);
SHADOW_PRINTK("new monitor table %"PRI_mfn "\n",
mfn_x(new_mfn));
@@ -2526,7 +2558,12 @@ void sh_set_toplevel_shadow(struct vcpu
if ( !mfn_valid(smfn) )
{
/* Make sure there's enough free shadow memory. */
- shadow_prealloc(d, root_type, 1);
+ if ( !shadow_prealloc(d, root_type, 1) )
+ {
+ new_entry = pagetable_null();
+ goto install_new_entry;
+ }
+
/* Shadow the page. */
smfn = make_shadow(v, gmfn, root_type);
}
--- a/xen/arch/x86/mm/shadow/hvm.c
+++ b/xen/arch/x86/mm/shadow/hvm.c
@@ -700,7 +700,9 @@ mfn_t sh_make_monitor_table(const struct
ASSERT(!pagetable_get_pfn(v->arch.hvm.monitor_table));
/* Guarantee we can get the memory we need */
- shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS);
+ if ( !shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS) )
+ return INVALID_MFN;
+
m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
mfn_to_page(m4mfn)->shadow_flags = 4;
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -2440,9 +2440,14 @@ static int sh_page_fault(struct vcpu *v,
* Preallocate shadow pages *before* removing writable accesses
* otherwhise an OOS L1 might be demoted and promoted again with
* writable mappings. */
- shadow_prealloc(d,
- SH_type_l1_shadow,
- GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1);
+ if ( !shadow_prealloc(d, SH_type_l1_shadow,
+ GUEST_PAGING_LEVELS < 4
+ ? 1 : GUEST_PAGING_LEVELS - 1) )
+ {
+ paging_unlock(d);
+ put_gfn(d, gfn_x(gfn));
+ return 0;
+ }
rc = gw_remove_write_accesses(v, va, &gw);
--- a/xen/arch/x86/mm/shadow/private.h
+++ b/xen/arch/x86/mm/shadow/private.h
@@ -383,7 +383,8 @@ void shadow_promote(struct domain *d, mf
void shadow_demote(struct domain *d, mfn_t gmfn, u32 type);
/* Shadow page allocation functions */
-void shadow_prealloc(struct domain *d, u32 shadow_type, unsigned int count);
+bool __must_check shadow_prealloc(struct domain *d, unsigned int shadow_type,
+ unsigned int count);
mfn_t shadow_alloc(struct domain *d,
u32 shadow_type,
unsigned long backpointer);

View File

@ -1,82 +0,0 @@
From: Roger Pau Monné <roger.pau@citrix.com>
Subject: x86/p2m: refuse new allocations for dying domains
This will in particular prevent any attempts to add entries to the p2m,
once - in a subsequent change - non-root entries have been removed.
This is part of CVE-2022-33746 / XSA-410.
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -245,6 +245,9 @@ static struct page_info *hap_alloc(struc
ASSERT(paging_locked_by_me(d));
+ if ( unlikely(d->is_dying) )
+ return NULL;
+
pg = page_list_remove_head(&d->arch.paging.hap.freelist);
if ( unlikely(!pg) )
return NULL;
@@ -281,7 +284,7 @@ static struct page_info *hap_alloc_p2m_p
d->arch.paging.hap.p2m_pages++;
ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask));
}
- else if ( !d->arch.paging.p2m_alloc_failed )
+ else if ( !d->arch.paging.p2m_alloc_failed && !d->is_dying )
{
d->arch.paging.p2m_alloc_failed = 1;
dprintk(XENLOG_ERR, "d%i failed to allocate from HAP pool\n",
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -939,6 +939,10 @@ static bool __must_check _shadow_preallo
if ( d->arch.paging.shadow.free_pages >= pages )
return true;
+ if ( unlikely(d->is_dying) )
+ /* No reclaim when the domain is dying, teardown will take care of it. */
+ return false;
+
/* Shouldn't have enabled shadows if we've no vcpus. */
ASSERT(d->vcpu && d->vcpu[0]);
@@ -991,7 +995,7 @@ static bool __must_check _shadow_preallo
d->arch.paging.shadow.free_pages,
d->arch.paging.shadow.p2m_pages);
- ASSERT(d->is_dying);
+ ASSERT_UNREACHABLE();
guest_flush_tlb_mask(d, d->dirty_cpumask);
@@ -1005,10 +1009,13 @@ static bool __must_check _shadow_preallo
* to avoid freeing shadows that the caller is currently working on. */
bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count)
{
- bool ret = _shadow_prealloc(d, shadow_size(type) * count);
+ bool ret;
+
+ if ( unlikely(d->is_dying) )
+ return false;
- if ( !ret && !d->is_dying &&
- (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) )
+ ret = _shadow_prealloc(d, shadow_size(type) * count);
+ if ( !ret && (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) )
/*
* Failing to allocate memory required for shadow usage can only result in
* a domain crash, do it here rather that relying on every caller to do it.
@@ -1238,6 +1245,9 @@ shadow_alloc_p2m_page(struct domain *d)
{
struct page_info *pg = NULL;
+ if ( unlikely(d->is_dying) )
+ return NULL;
+
/* This is called both from the p2m code (which never holds the
* paging lock) and the log-dirty code (which always does). */
paging_lock_recursive(d);

View File

@ -1,96 +0,0 @@
From: Roger Pau Monné <roger.pau@citrix.com>
Subject: x86/p2m: truly free paging pool memory for dying domains
Modify {hap,shadow}_free to free the page immediately if the domain is
dying, so that pages don't accumulate in the pool when
{shadow,hap}_final_teardown() get called. This is to limit the amount of
work which needs to be done there (in a non-preemptable manner).
Note the call to shadow_free() in shadow_free_p2m_page() is moved after
increasing total_pages, so that the decrease done in shadow_free() in
case the domain is dying doesn't underflow the counter, even if just for
a short interval.
This is part of CVE-2022-33746 / XSA-410.
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -265,6 +265,18 @@ static void hap_free(struct domain *d, m
ASSERT(paging_locked_by_me(d));
+ /*
+ * For dying domains, actually free the memory here. This way less work is
+ * left to hap_final_teardown(), which cannot easily have preemption checks
+ * added.
+ */
+ if ( unlikely(d->is_dying) )
+ {
+ free_domheap_page(pg);
+ d->arch.paging.hap.total_pages--;
+ return;
+ }
+
d->arch.paging.hap.free_pages++;
page_list_add_tail(pg, &d->arch.paging.hap.freelist);
}
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -1187,6 +1187,7 @@ mfn_t shadow_alloc(struct domain *d,
void shadow_free(struct domain *d, mfn_t smfn)
{
struct page_info *next = NULL, *sp = mfn_to_page(smfn);
+ bool dying = ACCESS_ONCE(d->is_dying);
struct page_list_head *pin_list;
unsigned int pages;
u32 shadow_type;
@@ -1229,11 +1230,32 @@ void shadow_free(struct domain *d, mfn_t
* just before the allocator hands the page out again. */
page_set_tlbflush_timestamp(sp);
perfc_decr(shadow_alloc_count);
- page_list_add_tail(sp, &d->arch.paging.shadow.freelist);
+
+ /*
+ * For dying domains, actually free the memory here. This way less
+ * work is left to shadow_final_teardown(), which cannot easily have
+ * preemption checks added.
+ */
+ if ( unlikely(dying) )
+ {
+ /*
+ * The backpointer field (sh.back) used by shadow code aliases the
+ * domain owner field, unconditionally clear it here to avoid
+ * free_domheap_page() attempting to parse it.
+ */
+ page_set_owner(sp, NULL);
+ free_domheap_page(sp);
+ }
+ else
+ page_list_add_tail(sp, &d->arch.paging.shadow.freelist);
+
sp = next;
}
- d->arch.paging.shadow.free_pages += pages;
+ if ( unlikely(dying) )
+ d->arch.paging.shadow.total_pages -= pages;
+ else
+ d->arch.paging.shadow.free_pages += pages;
}
/* Divert a page from the pool to be used by the p2m mapping.
@@ -1303,9 +1325,9 @@ shadow_free_p2m_page(struct domain *d, s
* paging lock) and the log-dirty code (which always does). */
paging_lock_recursive(d);
- shadow_free(d, page_to_mfn(pg));
d->arch.paging.shadow.p2m_pages--;
d->arch.paging.shadow.total_pages++;
+ shadow_free(d, page_to_mfn(pg));
paging_unlock(d);
}

View File

@ -1,159 +0,0 @@
From: Roger Pau Monné <roger.pau@citrix.com>
Subject: x86/p2m: free the paging memory pool preemptively
The paging memory pool is currently freed in two different places:
from {shadow,hap}_teardown() via domain_relinquish_resources() and
from {shadow,hap}_final_teardown() via complete_domain_destroy().
While the former does handle preemption, the latter doesn't.
Attempt to move as much p2m related freeing as possible to happen
before the call to {shadow,hap}_teardown(), so that most memory can be
freed in a preemptive way. In order to avoid causing issues to
existing callers, leave the root p2m page tables set and free them in
{hap,shadow}_final_teardown(). Also modify {hap,shadow}_free to free
the page immediately if the domain is dying, so that pages don't
accumulate in the pool when {shadow,hap}_final_teardown() get called.
Move altp2m_vcpu_disable_ve() to be done in hap_teardown(), as that's
the place where altp2m_active gets disabled now.
This is part of CVE-2022-33746 / XSA-410.
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -38,7 +38,6 @@
#include <xen/livepatch.h>
#include <public/sysctl.h>
#include <public/hvm/hvm_vcpu.h>
-#include <asm/altp2m.h>
#include <asm/regs.h>
#include <asm/mc146818rtc.h>
#include <asm/system.h>
@@ -2381,12 +2380,6 @@ int domain_relinquish_resources(struct d
vpmu_destroy(v);
}
- if ( altp2m_active(d) )
- {
- for_each_vcpu ( d, v )
- altp2m_vcpu_disable_ve(v);
- }
-
if ( is_pv_domain(d) )
{
for_each_vcpu ( d, v )
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -28,6 +28,7 @@
#include <xen/domain_page.h>
#include <xen/guest_access.h>
#include <xen/keyhandler.h>
+#include <asm/altp2m.h>
#include <asm/event.h>
#include <asm/page.h>
#include <asm/current.h>
@@ -546,24 +547,8 @@ void hap_final_teardown(struct domain *d
unsigned int i;
if ( hvm_altp2m_supported() )
- {
- d->arch.altp2m_active = 0;
-
- if ( d->arch.altp2m_eptp )
- {
- free_xenheap_page(d->arch.altp2m_eptp);
- d->arch.altp2m_eptp = NULL;
- }
-
- if ( d->arch.altp2m_visible_eptp )
- {
- free_xenheap_page(d->arch.altp2m_visible_eptp);
- d->arch.altp2m_visible_eptp = NULL;
- }
-
for ( i = 0; i < MAX_ALTP2M; i++ )
p2m_teardown(d->arch.altp2m_p2m[i], true);
- }
/* Destroy nestedp2m's first */
for (i = 0; i < MAX_NESTEDP2M; i++) {
@@ -578,6 +563,8 @@ void hap_final_teardown(struct domain *d
paging_lock(d);
hap_set_allocation(d, 0, NULL);
ASSERT(d->arch.paging.hap.p2m_pages == 0);
+ ASSERT(d->arch.paging.hap.free_pages == 0);
+ ASSERT(d->arch.paging.hap.total_pages == 0);
paging_unlock(d);
}
@@ -603,6 +590,7 @@ void hap_vcpu_teardown(struct vcpu *v)
void hap_teardown(struct domain *d, bool *preempted)
{
struct vcpu *v;
+ unsigned int i;
ASSERT(d->is_dying);
ASSERT(d != current->domain);
@@ -611,6 +599,28 @@ void hap_teardown(struct domain *d, bool
for_each_vcpu ( d, v )
hap_vcpu_teardown(v);
+ /* Leave the root pt in case we get further attempts to modify the p2m. */
+ if ( hvm_altp2m_supported() )
+ {
+ if ( altp2m_active(d) )
+ for_each_vcpu ( d, v )
+ altp2m_vcpu_disable_ve(v);
+
+ d->arch.altp2m_active = 0;
+
+ FREE_XENHEAP_PAGE(d->arch.altp2m_eptp);
+ FREE_XENHEAP_PAGE(d->arch.altp2m_visible_eptp);
+
+ for ( i = 0; i < MAX_ALTP2M; i++ )
+ p2m_teardown(d->arch.altp2m_p2m[i], false);
+ }
+
+ /* Destroy nestedp2m's after altp2m. */
+ for ( i = 0; i < MAX_NESTEDP2M; i++ )
+ p2m_teardown(d->arch.nested_p2m[i], false);
+
+ p2m_teardown(p2m_get_hostp2m(d), false);
+
paging_lock(d); /* Keep various asserts happy */
if ( d->arch.paging.hap.total_pages != 0 )
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -2824,8 +2824,17 @@ void shadow_teardown(struct domain *d, b
for_each_vcpu ( d, v )
shadow_vcpu_teardown(v);
+ p2m_teardown(p2m_get_hostp2m(d), false);
+
paging_lock(d);
+ /*
+ * Reclaim all shadow memory so that shadow_set_allocation() doesn't find
+ * in-use pages, as _shadow_prealloc() will no longer try to reclaim pages
+ * because the domain is dying.
+ */
+ shadow_blow_tables(d);
+
#if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC))
/* Free the virtual-TLB array attached to each vcpu */
for_each_vcpu(d, v)
@@ -2946,6 +2955,9 @@ void shadow_final_teardown(struct domain
d->arch.paging.shadow.total_pages,
d->arch.paging.shadow.free_pages,
d->arch.paging.shadow.p2m_pages);
+ ASSERT(!d->arch.paging.shadow.total_pages);
+ ASSERT(!d->arch.paging.shadow.free_pages);
+ ASSERT(!d->arch.paging.shadow.p2m_pages);
paging_unlock(d);
}

View File

@ -1,171 +0,0 @@
From: Julien Grall <jgrall@amazon.com>
Subject: xen/x86: p2m: Add preemption in p2m_teardown()
The list p2m->pages contains all the pages used by the P2M. On a large
instance this list can be quite long, and the time spent calling
d->arch.paging.free_page() will exceed 1ms for an 80GB guest
on Xen running in a nested environment on a c5.metal.
By extrapolation, it would take > 100ms for an 8TB guest (what we
currently security-support). So add some preemption in p2m_teardown()
and propagate it to the callers. Note there are 3 places where
the preemption is not enabled:
- hap_final_teardown()/shadow_final_teardown(): We prevent
updating the P2M once the domain is dying (so
no more pages can be allocated), and most of the P2M pages
will be freed in a preemptible manner when relinquishing the
resources. So it is fine to disable preemption here.
- shadow_enable(): This is fine because it will undo the allocation
that may have been made by p2m_alloc_table() (so only the root
page table).
The preemption is arbitrarily checked every 1024 iterations.
Note that with the current approach, Xen doesn't keep track of whether
the alt/nested P2Ms have been cleared. So there is some redundant work.
However, this is not expected to incur too much overhead (the P2M lock
shouldn't be contended during teardown). So this optimization is
left outside of the security event.
This is part of CVE-2022-33746 / XSA-410.
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -574,7 +574,7 @@ int p2m_init(struct domain *d);
int p2m_alloc_table(struct p2m_domain *p2m);
/* Return all the p2m resources to Xen. */
-void p2m_teardown(struct p2m_domain *p2m, bool remove_root);
+void p2m_teardown(struct p2m_domain *p2m, bool remove_root, bool *preempted);
void p2m_final_teardown(struct domain *d);
/* Add a page to a domain's p2m table */
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -548,17 +548,17 @@ void hap_final_teardown(struct domain *d
if ( hvm_altp2m_supported() )
for ( i = 0; i < MAX_ALTP2M; i++ )
- p2m_teardown(d->arch.altp2m_p2m[i], true);
+ p2m_teardown(d->arch.altp2m_p2m[i], true, NULL);
/* Destroy nestedp2m's first */
for (i = 0; i < MAX_NESTEDP2M; i++) {
- p2m_teardown(d->arch.nested_p2m[i], true);
+ p2m_teardown(d->arch.nested_p2m[i], true, NULL);
}
if ( d->arch.paging.hap.total_pages != 0 )
hap_teardown(d, NULL);
- p2m_teardown(p2m_get_hostp2m(d), true);
+ p2m_teardown(p2m_get_hostp2m(d), true, NULL);
/* Free any memory that the p2m teardown released */
paging_lock(d);
hap_set_allocation(d, 0, NULL);
@@ -612,14 +612,24 @@ void hap_teardown(struct domain *d, bool
FREE_XENHEAP_PAGE(d->arch.altp2m_visible_eptp);
for ( i = 0; i < MAX_ALTP2M; i++ )
- p2m_teardown(d->arch.altp2m_p2m[i], false);
+ {
+ p2m_teardown(d->arch.altp2m_p2m[i], false, preempted);
+ if ( preempted && *preempted )
+ return;
+ }
}
/* Destroy nestedp2m's after altp2m. */
for ( i = 0; i < MAX_NESTEDP2M; i++ )
- p2m_teardown(d->arch.nested_p2m[i], false);
+ {
+ p2m_teardown(d->arch.nested_p2m[i], false, preempted);
+ if ( preempted && *preempted )
+ return;
+ }
- p2m_teardown(p2m_get_hostp2m(d), false);
+ p2m_teardown(p2m_get_hostp2m(d), false, preempted);
+ if ( preempted && *preempted )
+ return;
paging_lock(d); /* Keep various asserts happy */
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -749,12 +749,13 @@ int p2m_alloc_table(struct p2m_domain *p
* hvm fixme: when adding support for pvh non-hardware domains, this path must
* cleanup any foreign p2m types (release refcnts on them).
*/
-void p2m_teardown(struct p2m_domain *p2m, bool remove_root)
+void p2m_teardown(struct p2m_domain *p2m, bool remove_root, bool *preempted)
/* Return all the p2m pages to Xen.
* We know we don't have any extra mappings to these pages */
{
struct page_info *pg, *root_pg = NULL;
struct domain *d;
+ unsigned int i = 0;
if (p2m == NULL)
return;
@@ -773,8 +774,19 @@ void p2m_teardown(struct p2m_domain *p2m
}
while ( (pg = page_list_remove_head(&p2m->pages)) )
- if ( pg != root_pg )
- d->arch.paging.free_page(d, pg);
+ {
+ if ( pg == root_pg )
+ continue;
+
+ d->arch.paging.free_page(d, pg);
+
+ /* Arbitrarily check preemption every 1024 iterations */
+ if ( preempted && !(++i % 1024) && general_preempt_check() )
+ {
+ *preempted = true;
+ break;
+ }
+ }
if ( root_pg )
page_list_add(root_pg, &p2m->pages);
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -2770,8 +2770,12 @@ int shadow_enable(struct domain *d, u32
out_locked:
paging_unlock(d);
out_unlocked:
+ /*
+ * This is fine to ignore the preemption here because only the root
+ * will be allocated by p2m_alloc_table().
+ */
if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) )
- p2m_teardown(p2m, true);
+ p2m_teardown(p2m, true, NULL);
if ( rv != 0 && pg != NULL )
{
pg->count_info &= ~PGC_count_mask;
@@ -2824,7 +2828,9 @@ void shadow_teardown(struct domain *d, b
for_each_vcpu ( d, v )
shadow_vcpu_teardown(v);
- p2m_teardown(p2m_get_hostp2m(d), false);
+ p2m_teardown(p2m_get_hostp2m(d), false, preempted);
+ if ( preempted && *preempted )
+ return;
paging_lock(d);
@@ -2945,7 +2951,7 @@ void shadow_final_teardown(struct domain
shadow_teardown(d, NULL);
/* It is now safe to pull down the p2m map. */
- p2m_teardown(p2m_get_hostp2m(d), true);
+ p2m_teardown(p2m_get_hostp2m(d), true, NULL);
/* Free any shadow memory that the p2m teardown released */
paging_lock(d);
shadow_set_allocation(d, 0, NULL);

View File

@ -1,55 +0,0 @@
From: Jan Beulich <jbeulich@suse.com>
Subject: gnttab: correct locking on transitive grant copy error path
While the comment next to the lock dropping in preparation of
recursively calling acquire_grant_for_copy() mistakenly talks about the
rd == td case (excluded a few lines further up), the same concerns apply
to the calling of release_grant_for_copy() on a subsequent error path.
This is CVE-2022-33748 / XSA-411.
Fixes: ad48fb963dbf ("gnttab: fix transitive grant handling")
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Extend code comment.
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -2622,9 +2622,8 @@ acquire_grant_for_copy(
trans_domid);
/*
- * acquire_grant_for_copy() could take the lock on the
- * remote table (if rd == td), so we have to drop the lock
- * here and reacquire.
+ * acquire_grant_for_copy() will take the lock on the remote table,
+ * so we have to drop the lock here and reacquire.
*/
active_entry_release(act);
grant_read_unlock(rgt);
@@ -2661,11 +2660,25 @@ acquire_grant_for_copy(
act->trans_gref != trans_gref ||
!act->is_sub_page)) )
{
+ /*
+ * Like above for acquire_grant_for_copy() we need to drop and then
+ * re-acquire the locks here to prevent lock order inversion issues.
+ * Unlike for acquire_grant_for_copy() we don't need to re-check
+ * anything, as release_grant_for_copy() doesn't depend on the grant
+ * table entry: It only updates internal state and the status flags.
+ */
+ active_entry_release(act);
+ grant_read_unlock(rgt);
+
release_grant_for_copy(td, trans_gref, readonly);
rcu_unlock_domain(td);
+
+ grant_read_lock(rgt);
+ act = active_entry_acquire(rgt, gref);
reduce_status_for_pin(rd, act, status, readonly);
active_entry_release(act);
grant_read_unlock(rgt);
+
put_page(*page);
*page = NULL;
return ERESTART;