system/xen: Updated for version 4.16.2.

Signed-off-by: Mario Preksavec <mario@slackware.hr>
Signed-off-by: Willy Sudiarto Raharjo <willysr@slackbuilds.org>

parent aae988ef83
commit b75c769415
@@ -25,8 +25,8 @@
 cd $(dirname $0) ; CWD=$(pwd)

 PRGNAM=xen
-VERSION=${VERSION:-4.16.1}
-BUILD=${BUILD:-2}
+VERSION=${VERSION:-4.16.2}
+BUILD=${BUILD:-1}
 TAG=${TAG:-_SBo}
 PKGTYPE=${PKGTYPE:-tgz}

@@ -1,9 +1,9 @@
 PRGNAM="xen"
-VERSION="4.16.1"
+VERSION="4.16.2"
 HOMEPAGE="http://www.xenproject.org/"
 DOWNLOAD="UNSUPPORTED"
 MD5SUM=""
-DOWNLOAD_x86_64="http://mirror.slackware.hr/sources/xen/xen-4.16.1.tar.gz \
+DOWNLOAD_x86_64="http://mirror.slackware.hr/sources/xen/xen-4.16.2.tar.gz \
 http://mirror.slackware.hr/sources/xen-extfiles/ipxe-git-3c040ad387099483102708bb1839110bc788cefb.tar.gz \
 http://mirror.slackware.hr/sources/xen-extfiles/lwip-1.3.0.tar.gz \
 http://mirror.slackware.hr/sources/xen-extfiles/zlib-1.2.3.tar.gz \
@@ -15,7 +15,7 @@ DOWNLOAD_x86_64="http://mirror.slackware.hr/sources/xen/xen-4.16.1.tar.gz \
 http://mirror.slackware.hr/sources/xen-extfiles/tpm_emulator-0.7.4.tar.gz \
 http://mirror.slackware.hr/sources/xen-seabios/seabios-1.14.0.tar.gz \
 http://mirror.slackware.hr/sources/xen-ovmf/xen-ovmf-20210824_7b4a99be8a.tar.bz2"
-MD5SUM_x86_64="1c2cd4f7f966c1d455aab630953e5fad \
+MD5SUM_x86_64="6bd720f53e3c34a35cb8a8897a561e18 \
 23ba00d5e2c5b4343d12665af73e1cb5 \
 36cc57650cffda9a0269493be2a169bb \
 debc62758716a169df9f62e6ab2bc634 \
@@ -1,170 +0,0 @@
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/pv: Clean up _get_page_type()

Various fixes for clarity, ahead of making complicated changes.

* Split the overflow check out of the if/else chain for type handling, as
it's somewhat unrelated.
* Comment the main if/else chain to explain what is going on. Adjust one
ASSERT() and state the bit layout for validate-locked and partial states.
* Correct the comment about TLB flushing, as it's backwards. The problem
case is when writeable mappings are retained to a page becoming read-only,
as it allows the guest to bypass Xen's safety checks for updates.
* Reduce the scope of 'y'. It is an artefact of the cmpxchg loop and not
valid for use by subsequent logic. Switch to using ACCESS_ONCE() to treat
all reads as explicitly volatile. The only thing preventing the validated
wait-loop being infinite is the compiler barrier hidden in cpu_relax().
* Replace one page_get_owner(page) with the already-calculated 'd' already in
scope.

No functional change.

This is part of XSA-401 / CVE-2022-26362.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 796faca64103..ddd32f88c798 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -2935,16 +2935,17 @@ static int _put_page_type(struct page_info *page, unsigned int flags,
static int _get_page_type(struct page_info *page, unsigned long type,
bool preemptible)
{
- unsigned long nx, x, y = page->u.inuse.type_info;
+ unsigned long nx, x;
int rc = 0;

ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
ASSERT(!in_irq());

- for ( ; ; )
+ for ( unsigned long y = ACCESS_ONCE(page->u.inuse.type_info); ; )
{
x = y;
nx = x + 1;
+
if ( unlikely((nx & PGT_count_mask) == 0) )
{
gdprintk(XENLOG_WARNING,
@@ -2952,8 +2953,15 @@ static int _get_page_type(struct page_info *page, unsigned long type,
mfn_x(page_to_mfn(page)));
return -EINVAL;
}
- else if ( unlikely((x & PGT_count_mask) == 0) )
+
+ if ( unlikely((x & PGT_count_mask) == 0) )
{
+ /*
+ * Typeref 0 -> 1.
+ *
+ * Type changes are permitted when the typeref is 0. If the type
+ * actually changes, the page needs re-validating.
+ */
struct domain *d = page_get_owner(page);

if ( d && shadow_mode_enabled(d) )
@@ -2964,8 +2972,8 @@ static int _get_page_type(struct page_info *page, unsigned long type,
{
/*
* On type change we check to flush stale TLB entries. It is
- * vital that no other CPUs are left with mappings of a frame
- * which is about to become writeable to the guest.
+ * vital that no other CPUs are left with writeable mappings
+ * to a frame which is intending to become pgtable/segdesc.
*/
cpumask_t *mask = this_cpu(scratch_cpumask);

@@ -2977,7 +2985,7 @@ static int _get_page_type(struct page_info *page, unsigned long type,

if ( unlikely(!cpumask_empty(mask)) &&
/* Shadow mode: track only writable pages. */
- (!shadow_mode_enabled(page_get_owner(page)) ||
+ (!shadow_mode_enabled(d) ||
((nx & PGT_type_mask) == PGT_writable_page)) )
{
perfc_incr(need_flush_tlb_flush);
@@ -3008,7 +3016,14 @@ static int _get_page_type(struct page_info *page, unsigned long type,
}
else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
{
- /* Don't log failure if it could be a recursive-mapping attempt. */
+ /*
+ * else, we're trying to take a new reference, of the wrong type.
+ *
+ * This (being able to prohibit use of the wrong type) is what the
+ * typeref system exists for, but skip printing the failure if it
+ * looks like a recursive mapping, as subsequent logic might
+ * ultimately permit the attempt.
+ */
if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
(type == PGT_l1_page_table) )
return -EINVAL;
@@ -3027,18 +3042,46 @@ static int _get_page_type(struct page_info *page, unsigned long type,
}
else if ( unlikely(!(x & PGT_validated)) )
{
+ /*
+ * else, the count is non-zero, and we're grabbing the right type;
+ * but the page hasn't been validated yet.
+ *
+ * The page is in one of two states (depending on PGT_partial),
+ * and should have exactly one reference.
+ */
+ ASSERT((x & (PGT_type_mask | PGT_count_mask)) == (type | 1));
+
if ( !(x & PGT_partial) )
{
- /* Someone else is updating validation of this page. Wait... */
+ /*
+ * The page has been left in the "validate locked" state
+ * (i.e. PGT_[type] | 1) which means that a concurrent caller
+ * of _get_page_type() is in the middle of validation.
+ *
+ * Spin waiting for the concurrent user to complete (partial
+ * or fully validated), then restart our attempt to acquire a
+ * type reference.
+ */
do {
if ( preemptible && hypercall_preempt_check() )
return -EINTR;
cpu_relax();
- } while ( (y = page->u.inuse.type_info) == x );
+ } while ( (y = ACCESS_ONCE(page->u.inuse.type_info)) == x );
continue;
}
- /* Type ref count was left at 1 when PGT_partial got set. */
- ASSERT((x & PGT_count_mask) == 1);
+
+ /*
+ * The page has been left in the "partial" state
+ * (i.e., PGT_[type] | PGT_partial | 1).
+ *
+ * Rather than bumping the type count, we need to try to grab the
+ * validation lock; if we succeed, we need to validate the page,
+ * then drop the general ref associated with the PGT_partial bit.
+ *
+ * We grab the validation lock by setting nx to (PGT_[type] | 1)
+ * (i.e., non-zero type count, neither PGT_validated nor
+ * PGT_partial set).
+ */
nx = x & ~PGT_partial;
}

@@ -3087,6 +3130,13 @@ static int _get_page_type(struct page_info *page, unsigned long type,
}

out:
+ /*
+ * Did we drop the PGT_partial bit when acquiring the typeref? If so,
+ * drop the general reference that went along with it.
+ *
+ * N.B. validate_page() may have have re-set PGT_partial, not reflected in
+ * nx, but will have taken an extra ref when doing so.
+ */
if ( (x & PGT_partial) && !(nx & PGT_partial) )
put_page(page);
@@ -1,191 +0,0 @@
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/pv: Fix ABAC cmpxchg() race in _get_page_type()

_get_page_type() suffers from a race condition where it incorrectly assumes
that because 'x' was read and a subsequent a cmpxchg() succeeds, the type
cannot have changed in-between. Consider:

CPU A:
1. Creates an L2e referencing pg
`-> _get_page_type(pg, PGT_l1_page_table), sees count 0, type PGT_writable_page
2. Issues flush_tlb_mask()
CPU B:
3. Creates a writeable mapping of pg
`-> _get_page_type(pg, PGT_writable_page), count increases to 1
4. Writes into new mapping, creating a TLB entry for pg
5. Removes the writeable mapping of pg
`-> _put_page_type(pg), count goes back down to 0
CPU A:
7. Issues cmpxchg(), setting count 1, type PGT_l1_page_table

CPU B now has a writeable mapping to pg, which Xen believes is a pagetable and
suitably protected (i.e. read-only). The TLB flush in step 2 must be deferred
until after the guest is prohibited from creating new writeable mappings,
which is after step 7.

Defer all safety actions until after the cmpxchg() has successfully taken the
intended typeref, because that is what prevents concurrent users from using
the old type.

Also remove the early validation for writeable and shared pages. This removes
race conditions where one half of a parallel mapping attempt can return
successfully before:
* The IOMMU pagetables are in sync with the new page type
* Writeable mappings to shared pages have been torn down

This is part of XSA-401 / CVE-2022-26362.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index ddd32f88c798..1693b580b152 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -2962,56 +2962,12 @@ static int _get_page_type(struct page_info *page, unsigned long type,
* Type changes are permitted when the typeref is 0. If the type
* actually changes, the page needs re-validating.
*/
- struct domain *d = page_get_owner(page);
-
- if ( d && shadow_mode_enabled(d) )
- shadow_prepare_page_type_change(d, page, type);

ASSERT(!(x & PGT_pae_xen_l2));
if ( (x & PGT_type_mask) != type )
{
- /*
- * On type change we check to flush stale TLB entries. It is
- * vital that no other CPUs are left with writeable mappings
- * to a frame which is intending to become pgtable/segdesc.
- */
- cpumask_t *mask = this_cpu(scratch_cpumask);
-
- BUG_ON(in_irq());
- cpumask_copy(mask, d->dirty_cpumask);
-
- /* Don't flush if the timestamp is old enough */
- tlbflush_filter(mask, page->tlbflush_timestamp);
-
- if ( unlikely(!cpumask_empty(mask)) &&
- /* Shadow mode: track only writable pages. */
- (!shadow_mode_enabled(d) ||
- ((nx & PGT_type_mask) == PGT_writable_page)) )
- {
- perfc_incr(need_flush_tlb_flush);
- /*
- * If page was a page table make sure the flush is
- * performed using an IPI in order to avoid changing the
- * type of a page table page under the feet of
- * spurious_page_fault().
- */
- flush_mask(mask,
- (x & PGT_type_mask) &&
- (x & PGT_type_mask) <= PGT_root_page_table
- ? FLUSH_TLB | FLUSH_FORCE_IPI
- : FLUSH_TLB);
- }
-
- /* We lose existing type and validity. */
nx &= ~(PGT_type_mask | PGT_validated);
nx |= type;
-
- /*
- * No special validation needed for writable pages.
- * Page tables and GDT/LDT need to be scanned for validity.
- */
- if ( type == PGT_writable_page || type == PGT_shared_page )
- nx |= PGT_validated;
}
}
else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
@@ -3092,6 +3048,56 @@ static int _get_page_type(struct page_info *page, unsigned long type,
return -EINTR;
}

+ /*
+ * One typeref has been taken and is now globally visible.
+ *
+ * The page is either in the "validate locked" state (PGT_[type] | 1) or
+ * fully validated (PGT_[type] | PGT_validated | >0).
+ */
+
+ if ( unlikely((x & PGT_count_mask) == 0) )
+ {
+ struct domain *d = page_get_owner(page);
+
+ if ( d && shadow_mode_enabled(d) )
+ shadow_prepare_page_type_change(d, page, type);
+
+ if ( (x & PGT_type_mask) != type )
+ {
+ /*
+ * On type change we check to flush stale TLB entries. It is
+ * vital that no other CPUs are left with writeable mappings
+ * to a frame which is intending to become pgtable/segdesc.
+ */
+ cpumask_t *mask = this_cpu(scratch_cpumask);
+
+ BUG_ON(in_irq());
+ cpumask_copy(mask, d->dirty_cpumask);
+
+ /* Don't flush if the timestamp is old enough */
+ tlbflush_filter(mask, page->tlbflush_timestamp);
+
+ if ( unlikely(!cpumask_empty(mask)) &&
+ /* Shadow mode: track only writable pages. */
+ (!shadow_mode_enabled(d) ||
+ ((nx & PGT_type_mask) == PGT_writable_page)) )
+ {
+ perfc_incr(need_flush_tlb_flush);
+ /*
+ * If page was a page table make sure the flush is
+ * performed using an IPI in order to avoid changing the
+ * type of a page table page under the feet of
+ * spurious_page_fault().
+ */
+ flush_mask(mask,
+ (x & PGT_type_mask) &&
+ (x & PGT_type_mask) <= PGT_root_page_table
+ ? FLUSH_TLB | FLUSH_FORCE_IPI
+ : FLUSH_TLB);
+ }
+ }
+ }
+
if ( unlikely(((x & PGT_type_mask) == PGT_writable_page) !=
(type == PGT_writable_page)) )
{
@@ -3120,13 +3126,25 @@ static int _get_page_type(struct page_info *page, unsigned long type,

if ( unlikely(!(nx & PGT_validated)) )
{
- if ( !(x & PGT_partial) )
+ /*
+ * No special validation needed for writable or shared pages. Page
+ * tables and GDT/LDT need to have their contents audited.
+ *
+ * per validate_page(), non-atomic updates are fine here.
+ */
+ if ( type == PGT_writable_page || type == PGT_shared_page )
+ page->u.inuse.type_info |= PGT_validated;
+ else
{
- page->nr_validated_ptes = 0;
- page->partial_flags = 0;
- page->linear_pt_count = 0;
+ if ( !(x & PGT_partial) )
+ {
+ page->nr_validated_ptes = 0;
+ page->partial_flags = 0;
+ page->linear_pt_count = 0;
+ }
+
+ rc = validate_page(page, type, preemptible);
}
- rc = validate_page(page, type, preemptible);
}

out:
@@ -1,43 +0,0 @@
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/page: Introduce _PAGE_* constants for memory types

... rather than opencoding the PAT/PCD/PWT attributes in __PAGE_HYPERVISOR_*
constants. These are going to be needed by forthcoming logic.

No functional change.

This is part of XSA-402.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>

diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index 1d080cffbe84..2e542050f65a 100644
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -331,6 +331,14 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t);

#define PAGE_CACHE_ATTRS (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)

+/* Memory types, encoded under Xen's choice of MSR_PAT. */
+#define _PAGE_WB ( 0)
+#define _PAGE_WT ( _PAGE_PWT)
+#define _PAGE_UCM ( _PAGE_PCD )
+#define _PAGE_UC ( _PAGE_PCD | _PAGE_PWT)
+#define _PAGE_WC (_PAGE_PAT )
+#define _PAGE_WP (_PAGE_PAT | _PAGE_PWT)
+
/*
* Debug option: Ensure that granted mappings are not implicitly unmapped.
* WARNING: This will need to be disabled to run OSes that use the spare PTE
@@ -349,8 +357,8 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t);
#define __PAGE_HYPERVISOR_RX (_PAGE_PRESENT | _PAGE_ACCESSED)
#define __PAGE_HYPERVISOR (__PAGE_HYPERVISOR_RX | \
_PAGE_DIRTY | _PAGE_RW)
-#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_PCD)
-#define __PAGE_HYPERVISOR_UC (__PAGE_HYPERVISOR | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_UCM)
+#define __PAGE_HYPERVISOR_UC (__PAGE_HYPERVISOR | _PAGE_UC)
#define __PAGE_HYPERVISOR_SHSTK (__PAGE_HYPERVISOR_RO | _PAGE_DIRTY)

#define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpages mappings */
@@ -1,213 +0,0 @@
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86: Don't change the cacheability of the directmap

Changeset 55f97f49b7ce ("x86: Change cache attributes of Xen 1:1 page mappings
in response to guest mapping requests") attempted to keep the cacheability
consistent between different mappings of the same page.

The reason wasn't described in the changelog, but it is understood to be in
regards to a concern over machine check exceptions, owing to errata when using
mixed cacheabilities. It did this primarily by updating Xen's mapping of the
page in the direct map when the guest mapped a page with reduced cacheability.

Unfortunately, the logic didn't actually prevent mixed cacheability from
occurring:
* A guest could map a page normally, and then map the same page with
different cacheability; nothing prevented this.
* The cacheability of the directmap was always latest-takes-precedence in
terms of guest requests.
* Grant-mapped frames with lesser cacheability didn't adjust the page's
cacheattr settings.
* The map_domain_page() function still unconditionally created WB mappings,
irrespective of the page's cacheattr settings.

Additionally, update_xen_mappings() had a bug where the alias calculation was
wrong for mfn's which were .init content, which should have been treated as
fully guest pages, not Xen pages.

Worse yet, the logic introduced a vulnerability whereby necessary
pagetable/segdesc adjustments made by Xen in the validation logic could become
non-coherent between the cache and main memory. The CPU could subsequently
operate on the stale value in the cache, rather than the safe value in main
memory.

The directmap contains primarily mappings of RAM. PAT/MTRR conflict
resolution is asymmetric, and generally for MTRR=WB ranges, PAT of lesser
cacheability resolves to being coherent. The special case is WC mappings,
which are non-coherent against MTRR=WB regions (except for fully-coherent
CPUs).

Xen must not have any WC cacheability in the directmap, to prevent Xen's
actions from creating non-coherency. (Guest actions creating non-coherency is
dealt with in subsequent patches.) As all memory types for MTRR=WB ranges
inter-operate coherently, so leave Xen's directmap mappings as WB.

Only PV guests with access to devices can use reduced-cacheability mappings to
begin with, and they're trusted not to mount DoSs against the system anyway.

Drop PGC_cacheattr_{base,mask} entirely, and the logic to manipulate them.
Shift the later PGC_* constants up, to gain 3 extra bits in the main reference
count. Retain the check in get_page_from_l1e() for special_pages() because a
guest has no business using reduced cacheability on these.

This reverts changeset 55f97f49b7ce6c3520c555d19caac6cf3f9a5df0

This is CVE-2022-26363, part of XSA-402.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index c6429b0f749a..ab32d13a1a0d 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -783,28 +783,6 @@ bool is_iomem_page(mfn_t mfn)
return (page_get_owner(page) == dom_io);
}

-static int update_xen_mappings(unsigned long mfn, unsigned int cacheattr)
-{
- int err = 0;
- bool alias = mfn >= PFN_DOWN(xen_phys_start) &&
- mfn < PFN_UP(xen_phys_start + xen_virt_end - XEN_VIRT_START);
- unsigned long xen_va =
- XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT);
-
- if ( boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) )
- return 0;
-
- if ( unlikely(alias) && cacheattr )
- err = map_pages_to_xen(xen_va, _mfn(mfn), 1, 0);
- if ( !err )
- err = map_pages_to_xen((unsigned long)mfn_to_virt(mfn), _mfn(mfn), 1,
- PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
- if ( unlikely(alias) && !cacheattr && !err )
- err = map_pages_to_xen(xen_va, _mfn(mfn), 1, PAGE_HYPERVISOR);
-
- return err;
-}
-
#ifndef NDEBUG
struct mmio_emul_range_ctxt {
const struct domain *d;
@@ -1009,47 +987,14 @@ get_page_from_l1e(
goto could_not_pin;
}

- if ( pte_flags_to_cacheattr(l1f) !=
- ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) )
+ if ( (l1f & PAGE_CACHE_ATTRS) != _PAGE_WB && is_special_page(page) )
{
- unsigned long x, nx, y = page->count_info;
- unsigned long cacheattr = pte_flags_to_cacheattr(l1f);
- int err;
-
- if ( is_special_page(page) )
- {
- if ( write )
- put_page_type(page);
- put_page(page);
- gdprintk(XENLOG_WARNING,
- "Attempt to change cache attributes of Xen heap page\n");
- return -EACCES;
- }
-
- do {
- x = y;
- nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
- } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
-
- err = update_xen_mappings(mfn, cacheattr);
- if ( unlikely(err) )
- {
- cacheattr = y & PGC_cacheattr_mask;
- do {
- x = y;
- nx = (x & ~PGC_cacheattr_mask) | cacheattr;
- } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
-
- if ( write )
- put_page_type(page);
- put_page(page);
-
- gdprintk(XENLOG_WARNING, "Error updating mappings for mfn %" PRI_mfn
- " (pfn %" PRI_pfn ", from L1 entry %" PRIpte ") for d%d\n",
- mfn, get_gpfn_from_mfn(mfn),
- l1e_get_intpte(l1e), l1e_owner->domain_id);
- return err;
- }
+ if ( write )
+ put_page_type(page);
+ put_page(page);
+ gdprintk(XENLOG_WARNING,
+ "Attempt to change cache attributes of Xen heap page\n");
+ return -EACCES;
}

return 0;
@@ -2467,25 +2412,10 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
*/
static int cleanup_page_mappings(struct page_info *page)
{
- unsigned int cacheattr =
- (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base;
int rc = 0;
unsigned long mfn = mfn_x(page_to_mfn(page));

/*
- * If we've modified xen mappings as a result of guest cache
- * attributes, restore them to the "normal" state.
- */
- if ( unlikely(cacheattr) )
- {
- page->count_info &= ~PGC_cacheattr_mask;
-
- BUG_ON(is_special_page(page));
-
- rc = update_xen_mappings(mfn, 0);
- }
-
- /*
* If this may be in a PV domain's IOMMU, remove it.
*
* NB that writable xenheap pages have their type set and cleared by
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index cb9052749963..8a9a43bb0a9d 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -69,25 +69,22 @@
/* Set when is using a page as a page table */
#define _PGC_page_table PG_shift(3)
#define PGC_page_table PG_mask(1, 3)
- /* 3-bit PAT/PCD/PWT cache-attribute hint. */
-#define PGC_cacheattr_base PG_shift(6)
-#define PGC_cacheattr_mask PG_mask(7, 6)
/* Page is broken? */
-#define _PGC_broken PG_shift(7)
-#define PGC_broken PG_mask(1, 7)
+#define _PGC_broken PG_shift(4)
+#define PGC_broken PG_mask(1, 4)
/* Mutually-exclusive page states: { inuse, offlining, offlined, free }. */
-#define PGC_state PG_mask(3, 9)
-#define PGC_state_inuse PG_mask(0, 9)
-#define PGC_state_offlining PG_mask(1, 9)
-#define PGC_state_offlined PG_mask(2, 9)
-#define PGC_state_free PG_mask(3, 9)
+#define PGC_state PG_mask(3, 6)
+#define PGC_state_inuse PG_mask(0, 6)
+#define PGC_state_offlining PG_mask(1, 6)
+#define PGC_state_offlined PG_mask(2, 6)
+#define PGC_state_free PG_mask(3, 6)
#define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st)
/* Page is not reference counted (see below for caveats) */
-#define _PGC_extra PG_shift(10)
-#define PGC_extra PG_mask(1, 10)
+#define _PGC_extra PG_shift(7)
+#define PGC_extra PG_mask(1, 7)

/* Count of references to this frame. */
-#define PGC_count_width PG_shift(10)
+#define PGC_count_width PG_shift(7)
#define PGC_count_mask ((1UL<<PGC_count_width)-1)

/*
@@ -1,284 +0,0 @@
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86: Split cache_flush() out of cache_writeback()

Subsequent changes will want a fully flushing version.

Use the new helper rather than opencoding it in flush_area_local(). This
resolves an outstanding issue where the conditional sfence is on the wrong
side of the clflushopt loop. clflushopt is ordered with respect to older
stores, not to younger stores.

Rename gnttab_cache_flush()'s helper to avoid colliding in name.
grant_table.c can see the prototype from cache.h so the build fails
otherwise.

This is part of XSA-402.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>

Xen 4.16 and earlier:
* Also backport half of c/s 3330013e67396 "VT-d / x86: re-arrange cache
syncing" to split cache_writeback() out of the IOMMU logic, but without the
associated hooks changes.

diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
index 25798df50f54..0c912b8669f8 100644
--- a/xen/arch/x86/flushtlb.c
+++ b/xen/arch/x86/flushtlb.c
@@ -234,7 +234,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
if ( flags & FLUSH_CACHE )
{
const struct cpuinfo_x86 *c = &current_cpu_data;
- unsigned long i, sz = 0;
+ unsigned long sz = 0;

if ( order < (BITS_PER_LONG - PAGE_SHIFT) )
sz = 1UL << (order + PAGE_SHIFT);
@@ -244,13 +244,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
c->x86_clflush_size && c->x86_cache_size && sz &&
((sz >> 10) < c->x86_cache_size) )
{
- alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
- for ( i = 0; i < sz; i += c->x86_clflush_size )
- alternative_input(".byte " __stringify(NOP_DS_PREFIX) ";"
- " clflush %0",
- "data16 clflush %0", /* clflushopt */
- X86_FEATURE_CLFLUSHOPT,
- "m" (((const char *)va)[i]));
+ cache_flush(va, sz);
flags &= ~FLUSH_CACHE;
}
else
@@ -265,6 +259,80 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
return flags;
}

+void cache_flush(const void *addr, unsigned int size)
+{
+ /*
+ * This function may be called before current_cpu_data is established.
+ * Hence a fallback is needed to prevent the loop below becoming infinite.
+ */
+ unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16;
+ const void *end = addr + size;
+
+ addr -= (unsigned long)addr & (clflush_size - 1);
+ for ( ; addr < end; addr += clflush_size )
+ {
+ /*
+ * Note regarding the "ds" prefix use: it's faster to do a clflush
+ * + prefix than a clflush + nop, and hence the prefix is added instead
+ * of letting the alternative framework fill the gap by appending nops.
+ */
+ alternative_io("ds; clflush %[p]",
+ "data16 clflush %[p]", /* clflushopt */
+ X86_FEATURE_CLFLUSHOPT,
+ /* no outputs */,
+ [p] "m" (*(const char *)(addr)));
+ }
+
+ alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
+}
+
+void cache_writeback(const void *addr, unsigned int size)
+{
+ unsigned int clflush_size;
+ const void *end = addr + size;
+
+ /* Fall back to CLFLUSH{,OPT} when CLWB isn't available. */
+ if ( !boot_cpu_has(X86_FEATURE_CLWB) )
+ return cache_flush(addr, size);
+
+ /*
+ * This function may be called before current_cpu_data is established.
+ * Hence a fallback is needed to prevent the loop below becoming infinite.
+ */
+ clflush_size = current_cpu_data.x86_clflush_size ?: 16;
+ addr -= (unsigned long)addr & (clflush_size - 1);
+ for ( ; addr < end; addr += clflush_size )
+ {
+/*
+ * The arguments to a macro must not include preprocessor directives. Doing so
+ * results in undefined behavior, so we have to create some defines here in
+ * order to avoid it.
+ */
+#if defined(HAVE_AS_CLWB)
+# define CLWB_ENCODING "clwb %[p]"
+#elif defined(HAVE_AS_XSAVEOPT)
+# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */
+#else
+# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */
+#endif
+
+#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr))
+#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT)
+# define INPUT BASE_INPUT
+#else
+# define INPUT(addr) "a" (addr), BASE_INPUT(addr)
+#endif
+
+ asm volatile (CLWB_ENCODING :: INPUT(addr));
+
+#undef INPUT
+#undef BASE_INPUT
+#undef CLWB_ENCODING
+ }
+
+ asm volatile ("sfence" ::: "memory");
+}
+
unsigned int guest_flush_tlb_flags(const struct domain *d)
{
bool shadow = paging_mode_shadow(d);
diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
index 66f8ce71741c..4c742cd8fe81 100644
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -3431,7 +3431,7 @@ gnttab_swap_grant_ref(XEN_GUEST_HANDLE_PARAM(gnttab_swap_grant_ref_t) uop,
return 0;
}

-static int cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref)
+static int _cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref)
{
struct domain *d, *owner;
struct page_info *page;
@@ -3525,7 +3525,7 @@ gnttab_cache_flush(XEN_GUEST_HANDLE_PARAM(gnttab_cache_flush_t) uop,
return -EFAULT;
for ( ; ; )
{
- int ret = cache_flush(&op, cur_ref);
+ int ret = _cache_flush(&op, cur_ref);

if ( ret < 0 )
return ret;
diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
index 01e010a10d61..401079299725 100644
--- a/xen/drivers/passthrough/vtd/extern.h
+++ b/xen/drivers/passthrough/vtd/extern.h
@@ -76,7 +76,6 @@ int __must_check qinval_device_iotlb_sync(struct vtd_iommu *iommu,
struct pci_dev *pdev,
u16 did, u16 size, u64 addr);

-unsigned int get_cache_line_size(void);
void flush_all_cache(void);

uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node);
diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
index 8975c1de61bc..bc377c9bcfa4 100644
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -31,6 +31,7 @@
#include <xen/pci.h>
#include <xen/pci_regs.h>
#include <xen/keyhandler.h>
+#include <asm/cache.h>
#include <asm/msi.h>
#include <asm/nops.h>
#include <asm/irq.h>
@@ -206,54 +207,6 @@ static void check_cleanup_domid_map(const struct domain *d,
}
}

-static void sync_cache(const void *addr, unsigned int size)
-{
- static unsigned long clflush_size = 0;
- const void *end = addr + size;
-
- if ( clflush_size == 0 )
- clflush_size = get_cache_line_size();
-
- addr -= (unsigned long)addr & (clflush_size - 1);
- for ( ; addr < end; addr += clflush_size )
-/*
- * The arguments to a macro must not include preprocessor directives. Doing so
- * results in undefined behavior, so we have to create some defines here in
- * order to avoid it.
- */
-#if defined(HAVE_AS_CLWB)
-# define CLWB_ENCODING "clwb %[p]"
-#elif defined(HAVE_AS_XSAVEOPT)
-# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */
-#else
-# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */
-#endif
-
-#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr))
-#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT)
-# define INPUT BASE_INPUT
-#else
-# define INPUT(addr) "a" (addr), BASE_INPUT(addr)
-#endif
- /*
- * Note regarding the use of NOP_DS_PREFIX: it's faster to do a clflush
- * + prefix than a clflush + nop, and hence the prefix is added instead
- * of letting the alternative framework fill the gap by appending nops.
- */
- alternative_io_2(".byte " __stringify(NOP_DS_PREFIX) "; clflush %[p]",
- "data16 clflush %[p]", /* clflushopt */
- X86_FEATURE_CLFLUSHOPT,
- CLWB_ENCODING,
- X86_FEATURE_CLWB, /* no outputs */,
- INPUT(addr));
-#undef INPUT
-#undef BASE_INPUT
-#undef CLWB_ENCODING
-
- alternative_2("", "sfence", X86_FEATURE_CLFLUSHOPT,
- "sfence", X86_FEATURE_CLWB);
-}
-
/* Allocate page table, return its machine address */
uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node)
{
@@ -273,7 +226,7 @@ uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node)
clear_page(vaddr);

if ( (iommu_ops.init ? &iommu_ops : &vtd_ops)->sync_cache )
- sync_cache(vaddr, PAGE_SIZE);
+ cache_writeback(vaddr, PAGE_SIZE);
unmap_domain_page(vaddr);
cur_pg++;
}
@@ -1305,7 +1258,7 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd)
iommu->nr_pt_levels = agaw_to_level(agaw);

if ( !ecap_coherent(iommu->ecap) )
- vtd_ops.sync_cache = sync_cache;
+ vtd_ops.sync_cache = cache_writeback;

/* allocate domain id bitmap */
iommu->domid_bitmap = xzalloc_array(unsigned long, BITS_TO_LONGS(nr_dom));
diff --git a/xen/drivers/passthrough/vtd/x86/vtd.c b/xen/drivers/passthrough/vtd/x86/vtd.c
index 6681dccd6970..55f0faa521cb 100644
--- a/xen/drivers/passthrough/vtd/x86/vtd.c
+++ b/xen/drivers/passthrough/vtd/x86/vtd.c
@@ -47,11 +47,6 @@ void unmap_vtd_domain_page(const void *va)
unmap_domain_page(va);
}

-unsigned int get_cache_line_size(void)
-{
- return ((cpuid_ebx(1) >> 8) & 0xff) * 8;
-}
-
void flush_all_cache()
{
wbinvd();
diff --git a/xen/include/asm-x86/cache.h b/xen/include/asm-x86/cache.h
index 1f7173d8c72c..e4770efb22b9 100644
--- a/xen/include/asm-x86/cache.h
+++ b/xen/include/asm-x86/cache.h
@@ -11,4 +11,11 @@

#define __read_mostly __section(".data.read_mostly")

+#ifndef __ASSEMBLY__
+
+void cache_flush(const void *addr, unsigned int size);
+void cache_writeback(const void *addr, unsigned int size);
+
+#endif
+
#endif
@@ -1,83 +0,0 @@
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/amd: Work around CLFLUSH ordering on older parts

On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakely ordered with everything,
including reads and writes to the address, and LFENCE/SFENCE instructions.

This creates a multitude of problematic corner cases, laid out in the manual.
Arrange to use MFENCE on both sides of the CLFLUSH to force proper ordering.

This is part of XSA-402.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>

diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
index a8e37dbb1f5c..b3b9a0df5fed 100644
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -812,6 +812,14 @@ static void init_amd(struct cpuinfo_x86 *c)
if (!cpu_has_lfence_dispatch)
__set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability);

+ /*
+ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with
+ * everything, including reads and writes to address, and
+ * LFENCE/SFENCE instructions.
+ */
+ if (!cpu_has_clflushopt)
+ setup_force_cpu_cap(X86_BUG_CLFLUSH_MFENCE);
+
switch(c->x86)
{
case 0xf ... 0x11:
diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
index 0c912b8669f8..dcbb4064012e 100644
--- a/xen/arch/x86/flushtlb.c
+++ b/xen/arch/x86/flushtlb.c
@@ -259,6 +259,13 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
return flags;
}

+/*
+ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with everything,
+ * including reads and writes to address, and LFENCE/SFENCE instructions.
+ *
+ * This function only works safely after alternatives have run. Luckily, at
+ * the time of writing, we don't flush the caches that early.
+ */
void cache_flush(const void *addr, unsigned int size)
{
/*
@@ -268,6 +275,8 @@ void cache_flush(const void *addr, unsigned int size)
unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16;
const void *end = addr + size;

+ alternative("", "mfence", X86_BUG_CLFLUSH_MFENCE);
+
addr -= (unsigned long)addr & (clflush_size - 1);
for ( ; addr < end; addr += clflush_size )
{
@@ -283,7 +292,9 @@ void cache_flush(const void *addr, unsigned int size)
[p] "m" (*(const char *)(addr)));
}

- alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
+ alternative_2("",
+ "sfence", X86_FEATURE_CLFLUSHOPT,
+ "mfence", X86_BUG_CLFLUSH_MFENCE);
}

void cache_writeback(const void *addr, unsigned int size)
diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h
index 7413febd7ad8..ff3157d52d13 100644
--- a/xen/include/asm-x86/cpufeatures.h
+++ b/xen/include/asm-x86/cpufeatures.h
@@ -47,6 +47,7 @@ XEN_CPUFEATURE(XEN_IBT, X86_SYNTH(27)) /* Xen uses CET Indirect Branch

#define X86_BUG_FPU_PTRS X86_BUG( 0) /* (F)X{SAVE,RSTOR} doesn't save/restore FOP/FIP/FDP. */
#define X86_BUG_NULL_SEG X86_BUG( 1) /* NULL-ing a selector preserves the base and limit. */
+#define X86_BUG_CLFLUSH_MFENCE X86_BUG( 2) /* MFENCE needed to serialise CLFLUSH */

/* Total number of capability words, inc synth and bug words. */
#define NCAPINTS (FSCAPINTS + X86_NR_SYNTH + X86_NR_BUG) /* N 32-bit words worth of info */
@@ -1,148 +0,0 @@
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/pv: Track and flush non-coherent mappings of RAM

There are legitimate uses of WC mappings of RAM, e.g. for DMA buffers with
devices that make non-coherent writes. The Linux sound subsystem makes
extensive use of this technique.

For such usecases, the guest's DMA buffer is mapped and consistently used as
WC, and Xen doesn't interact with the buffer.

However, a mischevious guest can use WC mappings to deliberately create
non-coherency between the cache and RAM, and use this to trick Xen into
validating a pagetable which isn't actually safe.

Allocate a new PGT_non_coherent to track the non-coherency of mappings. Set
it whenever a non-coherent writeable mapping is created. If the page is used
as anything other than PGT_writable_page, force a cache flush before
validation. Also force a cache flush before the page is returned to the heap.

This is CVE-2022-26364, part of XSA-402.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index ab32d13a1a0d..bab9624fabb7 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -997,6 +997,15 @@ get_page_from_l1e(
return -EACCES;
}

+ /*
+ * Track writeable non-coherent mappings to RAM pages, to trigger a cache
+ * flush later if the target is used as anything but a PGT_writeable page.
+ * We care about all writeable mappings, including foreign mappings.
+ */
+ if ( !boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) &&
+ (l1f & (PAGE_CACHE_ATTRS | _PAGE_RW)) == (_PAGE_WC | _PAGE_RW) )
+ set_bit(_PGT_non_coherent, &page->u.inuse.type_info);
+
return 0;

could_not_pin:
@@ -2454,6 +2463,19 @@ static int cleanup_page_mappings(struct page_info *page)
}
}

+ /*
+ * Flush the cache if there were previously non-coherent writeable
+ * mappings of this page. This forces the page to be coherent before it
+ * is freed back to the heap.
+ */
+ if ( __test_and_clear_bit(_PGT_non_coherent, &page->u.inuse.type_info) )
+ {
+ void *addr = __map_domain_page(page);
+
+ cache_flush(addr, PAGE_SIZE);
+ unmap_domain_page(addr);
+ }
+
return rc;
}

@@ -3028,6 +3050,22 @@ static int _get_page_type(struct page_info *page, unsigned long type,
if ( unlikely(!(nx & PGT_validated)) )
{
/*
+ * Flush the cache if there were previously non-coherent mappings of
+ * this page, and we're trying to use it as anything other than a
+ * writeable page. This forces the page to be coherent before we
+ * validate its contents for safety.
+ */
+ if ( (nx & PGT_non_coherent) && type != PGT_writable_page )
+ {
+ void *addr = __map_domain_page(page);
+
+ cache_flush(addr, PAGE_SIZE);
+ unmap_domain_page(addr);
+
+ page->u.inuse.type_info &= ~PGT_non_coherent;
+ }
+
+ /*
* No special validation needed for writable or shared pages. Page
* tables and GDT/LDT need to have their contents audited.
*
diff --git a/xen/arch/x86/pv/grant_table.c b/xen/arch/x86/pv/grant_table.c
index 0325618c9883..81c72e61ed55 100644
--- a/xen/arch/x86/pv/grant_table.c
+++ b/xen/arch/x86/pv/grant_table.c
@@ -109,7 +109,17 @@ int create_grant_pv_mapping(uint64_t addr, mfn_t frame,

ol1e = *pl1e;
if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) )
+ {
+ /*
+ * We always create mappings in this path. However, our caller,
+ * map_grant_ref(), only passes potentially non-zero cache_flags for
+ * MMIO frames, so this path doesn't create non-coherent mappings of
+ * RAM frames and there's no need to calculate PGT_non_coherent.
+ */
+ ASSERT(!cache_flags || is_iomem_page(frame));
+
rc = GNTST_okay;
+ }

out_unlock:
page_unlock(page);
@@ -294,7 +304,18 @@ int replace_grant_pv_mapping(uint64_t addr, mfn_t frame,
l1e_get_flags(ol1e), addr, grant_pte_flags);

if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) )
+ {
+ /*
+ * Generally, replace_grant_pv_mapping() is used to destroy mappings
+ * (n1le = l1e_empty()), but it can be a present mapping on the
+ * GNTABOP_unmap_and_replace path.
+ *
+ * In such cases, the PTE is fully transplanted from its old location
+ * via steal_linear_addr(), so we need not perform PGT_non_coherent
+ * checking here.
+ */
rc = GNTST_okay;
+ }

out_unlock:
page_unlock(page);
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 8a9a43bb0a9d..7464167ae192 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -53,8 +53,12 @@
#define _PGT_partial PG_shift(8)
#define PGT_partial PG_mask(1, 8)

+/* Has this page been mapped writeable with a non-coherent memory type? */
+#define _PGT_non_coherent PG_shift(9)
+#define PGT_non_coherent PG_mask(1, 9)
+
/* Count of uses of this frame as its current type. */
-#define PGT_count_width PG_shift(8)
+#define PGT_count_width PG_shift(9)
#define PGT_count_mask ((1UL<<PGT_count_width)-1)

/* Are the 'type mask' bits identical? */
@@ -0,0 +1,59 @@
From 4b4359122a414cc15156e13e3805988b71ff9da0 Mon Sep 17 00:00:00 2001
From: Julien Grall <jgrall@amazon.com>
Date: Mon, 6 Jun 2022 06:17:25 +0000
Subject: [PATCH 1/2] xen/arm: p2m: Prevent adding mapping when domain is dying

During the domain destroy process, the domain will still be accessible
until it is fully destroyed. So does the P2M because we don't bail
out early if is_dying is non-zero. If a domain has permission to
modify the other domain's P2M (i.e. dom0, or a stubdomain), then
foreign mapping can be added past relinquish_p2m_mapping().

Therefore, we need to prevent mapping to be added when the domain
is dying. This commit prevents such adding of mapping by adding the
d->is_dying check to p2m_set_entry(). Also this commit enhances the
check in relinquish_p2m_mapping() to make sure that no mappings can
be added in the P2M after the P2M lock is released.

This is part of CVE-2022-33746 / XSA-410.

Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Tested-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
xen/arch/arm/p2m.c | 11 +++++++++++
1 file changed, 11 insertions(+)

diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index fb71fa4c1c90..cbeff90f4371 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -1093,6 +1093,15 @@ int p2m_set_entry(struct p2m_domain *p2m,
{
int rc = 0;

+ /*
+ * Any reference taken by the P2M mappings (e.g. foreign mapping) will
+ * be dropped in relinquish_p2m_mapping(). As the P2M will still
+ * be accessible after, we need to prevent mapping to be added when the
+ * domain is dying.
+ */
+ if ( unlikely(p2m->domain->is_dying) )
+ return -ENOMEM;
+
while ( nr )
{
unsigned long mask;
@@ -1610,6 +1619,8 @@ int relinquish_p2m_mapping(struct domain *d)
unsigned int order;
gfn_t start, end;

+ BUG_ON(!d->is_dying);
+ /* No mappings can be added in the P2M after the P2M lock is released. */
p2m_write_lock(p2m);

start = p2m->lowest_mapped_gfn;
--
2.37.1

@@ -0,0 +1,165 @@
From 0d5846490348fa09a0d0915d7c795685a016ce10 Mon Sep 17 00:00:00 2001
From: Julien Grall <jgrall@amazon.com>
Date: Mon, 6 Jun 2022 06:17:26 +0000
Subject: [PATCH 2/2] xen/arm: p2m: Handle preemption when freeing intermediate
 page tables

At the moment the P2M page tables will be freed when the domain structure
is freed without any preemption. As the P2M is quite large, iterating
through this may take more time than it is reasonable without intermediate
preemption (to run softirqs and perhaps scheduler).

Split p2m_teardown() in two parts: one preemptible and called when
relinquishing the resources, the other one non-preemptible and called
when freeing the domain structure.

As we are now freeing the P2M pages early, we also need to prevent
further allocation if someone call p2m_set_entry() past p2m_teardown()
(I wasn't able to prove this will never happen). This is done by
the checking domain->is_dying from previous patch in p2m_set_entry().

Similarly, we want to make sure that no-one can accessed the free
pages. Therefore the root is cleared before freeing pages.

This is part of CVE-2022-33746 / XSA-410.

Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Tested-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
xen/arch/arm/domain.c | 10 +++++++--
xen/arch/arm/p2m.c | 47 ++++++++++++++++++++++++++++++++++++---
xen/include/asm-arm/p2m.h | 13 +++++++++--
3 files changed, 63 insertions(+), 7 deletions(-)

diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 96e1b235501d..2694c39127c5 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -789,10 +789,10 @@ fail:
void arch_domain_destroy(struct domain *d)
{
/* IOMMU page table is shared with P2M, always call
- * iommu_domain_destroy() before p2m_teardown().
+ * iommu_domain_destroy() before p2m_final_teardown().
*/
iommu_domain_destroy(d);
- p2m_teardown(d);
+ p2m_final_teardown(d);
domain_vgic_free(d);
domain_vuart_free(d);
free_xenheap_page(d->shared_info);
@@ -996,6 +996,7 @@ enum {
PROG_xen,
PROG_page,
PROG_mapping,
+ PROG_p2m,
PROG_done,
};

@@ -1056,6 +1057,11 @@ int domain_relinquish_resources(struct domain *d)
if ( ret )
return ret;

+ PROGRESS(p2m):
+ ret = p2m_teardown(d);
+ if ( ret )
+ return ret;
+
PROGRESS(done):
break;

diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index cbeff90f4371..3bcd1e897e88 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -1527,17 +1527,58 @@ static void p2m_free_vmid(struct domain *d)
spin_unlock(&vmid_alloc_lock);
}

-void p2m_teardown(struct domain *d)
+int p2m_teardown(struct domain *d)
{
struct p2m_domain *p2m = p2m_get_hostp2m(d);
+ unsigned long count = 0;
struct page_info *pg;
+ unsigned int i;
+ int rc = 0;
+
+ p2m_write_lock(p2m);
+
+ /*
+ * We are about to free the intermediate page-tables, so clear the
+ * root to prevent any walk to use them.
+ */
+ for ( i = 0; i < P2M_ROOT_PAGES; i++ )
+ clear_and_clean_page(p2m->root + i);
+
+ /*
+ * The domain will not be scheduled anymore, so in theory we should
+ * not need to flush the TLBs. Do it for safety purpose.
+ *
+ * Note that all the devices have already been de-assigned. So we don't
+ * need to flush the IOMMU TLB here.
+ */
+ p2m_force_tlb_flush_sync(p2m);
+
+ while ( (pg = page_list_remove_head(&p2m->pages)) )
+ {
+ free_domheap_page(pg);
+ count++;
+ /* Arbitrarily preempt every 512 iterations */
+ if ( !(count % 512) && hypercall_preempt_check() )
+ {
+ rc = -ERESTART;
+ break;
+ }
+ }
+
+ p2m_write_unlock(p2m);
+
+ return rc;
+}
+
+void p2m_final_teardown(struct domain *d)
+{
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);

/* p2m not actually initialized */
if ( !p2m->domain )
return;

- while ( (pg = page_list_remove_head(&p2m->pages)) )
- free_domheap_page(pg);
+ ASSERT(page_list_empty(&p2m->pages));

if ( p2m->root )
free_domheap_pages(p2m->root, P2M_ROOT_ORDER);
diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
index 8f11d9c97b5d..b3ba83283e11 100644
--- a/xen/include/asm-arm/p2m.h
+++ b/xen/include/asm-arm/p2m.h
@@ -192,8 +192,17 @@ void setup_virt_paging(void);
/* Init the datastructures for later use by the p2m code */
int p2m_init(struct domain *d);

-/* Return all the p2m resources to Xen. */
-void p2m_teardown(struct domain *d);
+/*
+ * The P2M resources are freed in two parts:
+ *  - p2m_teardown() will be called when relinquish the resources. It
+ *    will free large resources (e.g. intermediate page-tables) that
+ *    requires preemption.
+ *  - p2m_final_teardown() will be called when domain struct is been
+ *    freed. This *cannot* be preempted and therefore one small
+ *    resources should be freed here.
+ */
+int p2m_teardown(struct domain *d);
+void p2m_final_teardown(struct domain *d);

/*
* Remove mapping refcount on each mapping page in the p2m
--
2.37.1

@ -0,0 +1,113 @@
|
|||
From: Roger Pau Monné <roger.pau@citrix.com>
|
||||
Subject: x86/p2m: add option to skip root pagetable removal in p2m_teardown()
|
||||
|
||||
Add a new parameter to p2m_teardown() in order to select whether the
|
||||
root page table should also be freed. Note that all users are
|
||||
adjusted to pass the parameter to remove the root page tables, so
|
||||
behavior is not modified.
|
||||
|
||||
No functional change intended.
|
||||
|
||||
This is part of CVE-2022-33746 / XSA-410.
|
||||
|
||||
Suggested-by: Julien Grall <julien@xen.org>
|
||||
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
|
||||
Reviewed-by: Jan Beulich <jbeulich@suse.com>
|
||||
Acked-by: Tim Deegan <tim@xen.org>
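
The new boolean is what allows later patches in this series to split teardown
into two phases: an early, preemptible pass that keeps (but clears) the root
page table, and a final, non-preemptible pass that frees the root as well. A
usage sketch with hypothetical wrapper names:

static void early_teardown_sketch(struct domain *d)
{
    /* Free everything except the root, which stays in place (cleared)
     * so that late p2m updates cannot walk already-freed tables. */
    p2m_teardown(p2m_get_hostp2m(d), false);
}

static void final_teardown_sketch(struct domain *d)
{
    /* The domain is going away for good: drop the root page table too. */
    p2m_teardown(p2m_get_hostp2m(d), true);
}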
|
||||
|
||||
--- a/xen/include/asm-x86/p2m.h
|
||||
+++ b/xen/include/asm-x86/p2m.h
|
||||
@@ -574,7 +574,7 @@ int p2m_init(struct domain *d);
|
||||
int p2m_alloc_table(struct p2m_domain *p2m);
|
||||
|
||||
/* Return all the p2m resources to Xen. */
|
||||
-void p2m_teardown(struct p2m_domain *p2m);
|
||||
+void p2m_teardown(struct p2m_domain *p2m, bool remove_root);
|
||||
void p2m_final_teardown(struct domain *d);
|
||||
|
||||
/* Add a page to a domain's p2m table */
|
||||
--- a/xen/arch/x86/mm/hap/hap.c
|
||||
+++ b/xen/arch/x86/mm/hap/hap.c
|
||||
@@ -541,18 +541,18 @@ void hap_final_teardown(struct domain *d
|
||||
}
|
||||
|
||||
for ( i = 0; i < MAX_ALTP2M; i++ )
|
||||
- p2m_teardown(d->arch.altp2m_p2m[i]);
|
||||
+ p2m_teardown(d->arch.altp2m_p2m[i], true);
|
||||
}
|
||||
|
||||
/* Destroy nestedp2m's first */
|
||||
for (i = 0; i < MAX_NESTEDP2M; i++) {
|
||||
- p2m_teardown(d->arch.nested_p2m[i]);
|
||||
+ p2m_teardown(d->arch.nested_p2m[i], true);
|
||||
}
|
||||
|
||||
if ( d->arch.paging.hap.total_pages != 0 )
|
||||
hap_teardown(d, NULL);
|
||||
|
||||
- p2m_teardown(p2m_get_hostp2m(d));
|
||||
+ p2m_teardown(p2m_get_hostp2m(d), true);
|
||||
/* Free any memory that the p2m teardown released */
|
||||
paging_lock(d);
|
||||
hap_set_allocation(d, 0, NULL);
|
||||
--- a/xen/arch/x86/mm/p2m.c
|
||||
+++ b/xen/arch/x86/mm/p2m.c
|
||||
@@ -749,11 +749,11 @@ int p2m_alloc_table(struct p2m_domain *p
|
||||
* hvm fixme: when adding support for pvh non-hardware domains, this path must
|
||||
* cleanup any foreign p2m types (release refcnts on them).
|
||||
*/
|
||||
-void p2m_teardown(struct p2m_domain *p2m)
|
||||
+void p2m_teardown(struct p2m_domain *p2m, bool remove_root)
|
||||
/* Return all the p2m pages to Xen.
|
||||
* We know we don't have any extra mappings to these pages */
|
||||
{
|
||||
- struct page_info *pg;
|
||||
+ struct page_info *pg, *root_pg = NULL;
|
||||
struct domain *d;
|
||||
|
||||
if (p2m == NULL)
|
||||
@@ -763,10 +763,22 @@ void p2m_teardown(struct p2m_domain *p2m
|
||||
|
||||
p2m_lock(p2m);
|
||||
ASSERT(atomic_read(&d->shr_pages) == 0);
|
||||
- p2m->phys_table = pagetable_null();
|
||||
+
|
||||
+ if ( remove_root )
|
||||
+ p2m->phys_table = pagetable_null();
|
||||
+ else if ( !pagetable_is_null(p2m->phys_table) )
|
||||
+ {
|
||||
+ root_pg = pagetable_get_page(p2m->phys_table);
|
||||
+ clear_domain_page(pagetable_get_mfn(p2m->phys_table));
|
||||
+ }
|
||||
|
||||
while ( (pg = page_list_remove_head(&p2m->pages)) )
|
||||
- d->arch.paging.free_page(d, pg);
|
||||
+ if ( pg != root_pg )
|
||||
+ d->arch.paging.free_page(d, pg);
|
||||
+
|
||||
+ if ( root_pg )
|
||||
+ page_list_add(root_pg, &p2m->pages);
|
||||
+
|
||||
p2m_unlock(p2m);
|
||||
}
|
||||
|
||||
--- a/xen/arch/x86/mm/shadow/common.c
|
||||
+++ b/xen/arch/x86/mm/shadow/common.c
|
||||
@@ -2701,7 +2701,7 @@ int shadow_enable(struct domain *d, u32
|
||||
paging_unlock(d);
|
||||
out_unlocked:
|
||||
if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) )
|
||||
- p2m_teardown(p2m);
|
||||
+ p2m_teardown(p2m, true);
|
||||
if ( rv != 0 && pg != NULL )
|
||||
{
|
||||
pg->count_info &= ~PGC_count_mask;
|
||||
@@ -2866,7 +2866,7 @@ void shadow_final_teardown(struct domain
|
||||
shadow_teardown(d, NULL);
|
||||
|
||||
/* It is now safe to pull down the p2m map. */
|
||||
- p2m_teardown(p2m_get_hostp2m(d));
|
||||
+ p2m_teardown(p2m_get_hostp2m(d), true);
|
||||
/* Free any shadow memory that the p2m teardown released */
|
||||
paging_lock(d);
|
||||
shadow_set_allocation(d, 0, NULL);
|
|
@@ -0,0 +1,62 @@
|
|||
From: Jan Beulich <jbeulich@suse.com>
|
||||
Subject: x86/HAP: adjust monitor table related error handling
|
||||
|
||||
hap_make_monitor_table() will return INVALID_MFN if it encounters an
|
||||
error condition, but hap_update_paging_modes() wasn’t handling this
|
||||
value, resulting in an inappropriate value being stored in
|
||||
monitor_table. This would subsequently misguide at least
|
||||
hap_vcpu_teardown(). Avoid this by bailing early.
|
||||
|
||||
Further, when a domain has/was already crashed or (perhaps less
|
||||
important as there's no such path known to lead here) is already dying,
|
||||
avoid calling domain_crash() on it again - that's at best confusing.
|
||||
|
||||
This is part of CVE-2022-33746 / XSA-410.
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
|
||||
|
||||
--- a/xen/arch/x86/mm/hap/hap.c
|
||||
+++ b/xen/arch/x86/mm/hap/hap.c
|
||||
@@ -39,6 +39,7 @@
|
||||
#include <asm/domain.h>
|
||||
#include <xen/numa.h>
|
||||
#include <asm/hvm/nestedhvm.h>
|
||||
+#include <public/sched.h>
|
||||
|
||||
#include "private.h"
|
||||
|
||||
@@ -405,8 +406,13 @@ static mfn_t hap_make_monitor_table(stru
|
||||
return m4mfn;
|
||||
|
||||
oom:
|
||||
- printk(XENLOG_G_ERR "out of memory building monitor pagetable\n");
|
||||
- domain_crash(d);
|
||||
+ if ( !d->is_dying &&
|
||||
+ (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) )
|
||||
+ {
|
||||
+ printk(XENLOG_G_ERR "%pd: out of memory building monitor pagetable\n",
|
||||
+ d);
|
||||
+ domain_crash(d);
|
||||
+ }
|
||||
return INVALID_MFN;
|
||||
}
|
||||
|
||||
@@ -766,6 +772,9 @@ static void hap_update_paging_modes(stru
|
||||
if ( pagetable_is_null(v->arch.hvm.monitor_table) )
|
||||
{
|
||||
mfn_t mmfn = hap_make_monitor_table(v);
|
||||
+
|
||||
+ if ( mfn_eq(mmfn, INVALID_MFN) )
|
||||
+ goto unlock;
|
||||
v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn);
|
||||
make_cr3(v, mmfn);
|
||||
hvm_update_host_cr3(v);
|
||||
@@ -774,6 +783,7 @@ static void hap_update_paging_modes(stru
|
||||
/* CR3 is effectively updated by a mode change. Flush ASIDs, etc. */
|
||||
hap_update_cr3(v, 0, false);
|
||||
|
||||
+ unlock:
|
||||
paging_unlock(d);
|
||||
put_gfn(d, cr3_gfn);
|
||||
}
|
|
@@ -0,0 +1,60 @@
|
|||
From: Jan Beulich <jbeulich@suse.com>
|
||||
Subject: x86/shadow: tolerate failure of sh_set_toplevel_shadow()
|
||||
|
||||
Subsequently sh_set_toplevel_shadow() will be adjusted to install a
|
||||
blank entry in case prealloc fails. There are, in fact, pre-existing
|
||||
error paths which would put in place a blank entry. The 4- and 2-level
|
||||
code in sh_update_cr3(), however, assume the top level entry to be
|
||||
valid.
|
||||
|
||||
Hence bail from the function in the unlikely event that it's not. Note
|
||||
that 3-level logic works differently: In particular a guest is free to
|
||||
supply a PDPTR pointing at 4 non-present (or otherwise deemed invalid)
|
||||
entries. The guest will crash, but we already cope with that.
|
||||
|
||||
Really mfn_valid() is likely wrong to use in sh_set_toplevel_shadow(),
|
||||
and it should instead be !mfn_eq(gmfn, INVALID_MFN). Avoid such a change
|
||||
in security context, but add a respective assertion.
|
||||
|
||||
This is part of CVE-2022-33746 / XSA-410.
|
||||
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Acked-by: Tim Deegan <tim@xen.org>
|
||||
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
|
||||
|
||||
--- a/xen/arch/x86/mm/shadow/common.c
|
||||
+++ b/xen/arch/x86/mm/shadow/common.c
|
||||
@@ -2516,6 +2516,7 @@ void sh_set_toplevel_shadow(struct vcpu
|
||||
/* Now figure out the new contents: is this a valid guest MFN? */
|
||||
if ( !mfn_valid(gmfn) )
|
||||
{
|
||||
+ ASSERT(mfn_eq(gmfn, INVALID_MFN));
|
||||
new_entry = pagetable_null();
|
||||
goto install_new_entry;
|
||||
}
|
||||
--- a/xen/arch/x86/mm/shadow/multi.c
|
||||
+++ b/xen/arch/x86/mm/shadow/multi.c
|
||||
@@ -3312,6 +3312,11 @@ sh_update_cr3(struct vcpu *v, int do_loc
|
||||
if ( sh_remove_write_access(d, gmfn, 4, 0) != 0 )
|
||||
guest_flush_tlb_mask(d, d->dirty_cpumask);
|
||||
sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow, sh_make_shadow);
|
||||
+ if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) )
|
||||
+ {
|
||||
+ ASSERT(d->is_dying || d->is_shutting_down);
|
||||
+ return;
|
||||
+ }
|
||||
if ( !shadow_mode_external(d) && !is_pv_32bit_domain(d) )
|
||||
{
|
||||
mfn_t smfn = pagetable_get_mfn(v->arch.paging.shadow.shadow_table[0]);
|
||||
@@ -3370,6 +3375,11 @@ sh_update_cr3(struct vcpu *v, int do_loc
|
||||
if ( sh_remove_write_access(d, gmfn, 2, 0) != 0 )
|
||||
guest_flush_tlb_mask(d, d->dirty_cpumask);
|
||||
sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow, sh_make_shadow);
|
||||
+ if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) )
|
||||
+ {
|
||||
+ ASSERT(d->is_dying || d->is_shutting_down);
|
||||
+ return;
|
||||
+ }
|
||||
#else
|
||||
#error This should never happen
|
||||
#endif
|
|
@@ -0,0 +1,255 @@
|
|||
From: Roger Pau Monné <roger.pau@citrix.com>
|
||||
Subject: x86/shadow: tolerate failure in shadow_prealloc()
|
||||
|
||||
Prevent _shadow_prealloc() from calling BUG() when unable to fulfill
|
||||
the pre-allocation and instead return true/false. Modify
|
||||
shadow_prealloc() to crash the domain on allocation failure (if the
|
||||
domain is not already dying), as shadow cannot operate normally after
|
||||
that. Modify callers to also gracefully handle {_,}shadow_prealloc()
|
||||
failing to fulfill the request.
|
||||
|
||||
Note this in turn requires adjusting the callers of
|
||||
sh_make_monitor_table() also to handle it returning INVALID_MFN.
|
||||
sh_update_paging_modes() is also modified to add additional error
|
||||
paths in case of allocation failure, some of those will return with
|
||||
null monitor page tables (and the domain likely crashed). This is no
|
||||
different that current error paths, but the newly introduced ones are
|
||||
more likely to trigger.
|
||||
|
||||
The now added failure points in sh_update_paging_modes() also require
|
||||
that on some error return paths the previous structures are cleared,
|
||||
and thus monitor table is null.
|
||||
|
||||
While there adjust the 'type' parameter type of shadow_prealloc() to
|
||||
unsigned int rather than u32.
|
||||
|
||||
This is part of CVE-2022-33746 / XSA-410.
|
||||
|
||||
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Acked-by: Tim Deegan <tim@xen.org>
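
With shadow_prealloc() turned into a __must_check boolean, call sites unwind
cleanly instead of relying on the old BUG(). The typical shape after this
change, condensed from the hunks below (error path simplified):

    if ( !shadow_prealloc(d, SH_type_l1_shadow, 1) )
    {
        /*
         * The pool could not be refilled; shadow_prealloc() has already
         * crashed the domain unless it is dying.  Drop the lock and bail.
         */
        paging_unlock(d);
        return 0;
    }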
|
||||
|
||||
--- a/xen/arch/x86/mm/shadow/common.c
|
||||
+++ b/xen/arch/x86/mm/shadow/common.c
|
||||
@@ -36,6 +36,7 @@
|
||||
#include <asm/flushtlb.h>
|
||||
#include <asm/shadow.h>
|
||||
#include <xen/numa.h>
|
||||
+#include <public/sched.h>
|
||||
#include "private.h"
|
||||
|
||||
DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags);
|
||||
@@ -928,14 +929,15 @@ static inline void trace_shadow_prealloc
|
||||
|
||||
/* Make sure there are at least count order-sized pages
|
||||
* available in the shadow page pool. */
|
||||
-static void _shadow_prealloc(struct domain *d, unsigned int pages)
|
||||
+static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages)
|
||||
{
|
||||
struct vcpu *v;
|
||||
struct page_info *sp, *t;
|
||||
mfn_t smfn;
|
||||
int i;
|
||||
|
||||
- if ( d->arch.paging.shadow.free_pages >= pages ) return;
|
||||
+ if ( d->arch.paging.shadow.free_pages >= pages )
|
||||
+ return true;
|
||||
|
||||
/* Shouldn't have enabled shadows if we've no vcpus. */
|
||||
ASSERT(d->vcpu && d->vcpu[0]);
|
||||
@@ -951,7 +953,8 @@ static void _shadow_prealloc(struct doma
|
||||
sh_unpin(d, smfn);
|
||||
|
||||
/* See if that freed up enough space */
|
||||
- if ( d->arch.paging.shadow.free_pages >= pages ) return;
|
||||
+ if ( d->arch.paging.shadow.free_pages >= pages )
|
||||
+ return true;
|
||||
}
|
||||
|
||||
/* Stage two: all shadow pages are in use in hierarchies that are
|
||||
@@ -974,7 +977,7 @@ static void _shadow_prealloc(struct doma
|
||||
if ( d->arch.paging.shadow.free_pages >= pages )
|
||||
{
|
||||
guest_flush_tlb_mask(d, d->dirty_cpumask);
|
||||
- return;
|
||||
+ return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -987,7 +990,12 @@ static void _shadow_prealloc(struct doma
|
||||
d->arch.paging.shadow.total_pages,
|
||||
d->arch.paging.shadow.free_pages,
|
||||
d->arch.paging.shadow.p2m_pages);
|
||||
- BUG();
|
||||
+
|
||||
+ ASSERT(d->is_dying);
|
||||
+
|
||||
+ guest_flush_tlb_mask(d, d->dirty_cpumask);
|
||||
+
|
||||
+ return false;
|
||||
}
|
||||
|
||||
/* Make sure there are at least count pages of the order according to
|
||||
@@ -995,9 +1003,19 @@ static void _shadow_prealloc(struct doma
|
||||
* This must be called before any calls to shadow_alloc(). Since this
|
||||
* will free existing shadows to make room, it must be called early enough
|
||||
* to avoid freeing shadows that the caller is currently working on. */
|
||||
-void shadow_prealloc(struct domain *d, u32 type, unsigned int count)
|
||||
+bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count)
|
||||
{
|
||||
- return _shadow_prealloc(d, shadow_size(type) * count);
|
||||
+ bool ret = _shadow_prealloc(d, shadow_size(type) * count);
|
||||
+
|
||||
+ if ( !ret && !d->is_dying &&
|
||||
+ (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) )
|
||||
+ /*
|
||||
+ * Failing to allocate memory required for shadow usage can only result in
|
||||
+ * a domain crash, do it here rather that relying on every caller to do it.
|
||||
+ */
|
||||
+ domain_crash(d);
|
||||
+
|
||||
+ return ret;
|
||||
}
|
||||
|
||||
/* Deliberately free all the memory we can: this will tear down all of
|
||||
@@ -1218,7 +1236,7 @@ void shadow_free(struct domain *d, mfn_t
|
||||
static struct page_info *
|
||||
shadow_alloc_p2m_page(struct domain *d)
|
||||
{
|
||||
- struct page_info *pg;
|
||||
+ struct page_info *pg = NULL;
|
||||
|
||||
/* This is called both from the p2m code (which never holds the
|
||||
* paging lock) and the log-dirty code (which always does). */
|
||||
@@ -1236,16 +1254,18 @@ shadow_alloc_p2m_page(struct domain *d)
|
||||
d->arch.paging.shadow.p2m_pages,
|
||||
shadow_min_acceptable_pages(d));
|
||||
}
|
||||
- paging_unlock(d);
|
||||
- return NULL;
|
||||
+ goto out;
|
||||
}
|
||||
|
||||
- shadow_prealloc(d, SH_type_p2m_table, 1);
|
||||
+ if ( !shadow_prealloc(d, SH_type_p2m_table, 1) )
|
||||
+ goto out;
|
||||
+
|
||||
pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
|
||||
d->arch.paging.shadow.p2m_pages++;
|
||||
d->arch.paging.shadow.total_pages--;
|
||||
ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask));
|
||||
|
||||
+ out:
|
||||
paging_unlock(d);
|
||||
|
||||
return pg;
|
||||
@@ -1336,7 +1356,9 @@ int shadow_set_allocation(struct domain
|
||||
else if ( d->arch.paging.shadow.total_pages > pages )
|
||||
{
|
||||
/* Need to return memory to domheap */
|
||||
- _shadow_prealloc(d, 1);
|
||||
+ if ( !_shadow_prealloc(d, 1) )
|
||||
+ return -ENOMEM;
|
||||
+
|
||||
sp = page_list_remove_head(&d->arch.paging.shadow.freelist);
|
||||
ASSERT(sp);
|
||||
/*
|
||||
@@ -2334,12 +2356,13 @@ static void sh_update_paging_modes(struc
|
||||
if ( mfn_eq(v->arch.paging.shadow.oos_snapshot[0], INVALID_MFN) )
|
||||
{
|
||||
int i;
|
||||
+
|
||||
+ if ( !shadow_prealloc(d, SH_type_oos_snapshot, SHADOW_OOS_PAGES) )
|
||||
+ return;
|
||||
+
|
||||
for(i = 0; i < SHADOW_OOS_PAGES; i++)
|
||||
- {
|
||||
- shadow_prealloc(d, SH_type_oos_snapshot, 1);
|
||||
v->arch.paging.shadow.oos_snapshot[i] =
|
||||
shadow_alloc(d, SH_type_oos_snapshot, 0);
|
||||
- }
|
||||
}
|
||||
#endif /* OOS */
|
||||
|
||||
@@ -2403,6 +2426,9 @@ static void sh_update_paging_modes(struc
|
||||
mfn_t mmfn = sh_make_monitor_table(
|
||||
v, v->arch.paging.mode->shadow.shadow_levels);
|
||||
|
||||
+ if ( mfn_eq(mmfn, INVALID_MFN) )
|
||||
+ return;
|
||||
+
|
||||
v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn);
|
||||
make_cr3(v, mmfn);
|
||||
hvm_update_host_cr3(v);
|
||||
@@ -2441,6 +2467,12 @@ static void sh_update_paging_modes(struc
|
||||
v->arch.hvm.monitor_table = pagetable_null();
|
||||
new_mfn = sh_make_monitor_table(
|
||||
v, v->arch.paging.mode->shadow.shadow_levels);
|
||||
+ if ( mfn_eq(new_mfn, INVALID_MFN) )
|
||||
+ {
|
||||
+ sh_destroy_monitor_table(v, old_mfn,
|
||||
+ old_mode->shadow.shadow_levels);
|
||||
+ return;
|
||||
+ }
|
||||
v->arch.hvm.monitor_table = pagetable_from_mfn(new_mfn);
|
||||
SHADOW_PRINTK("new monitor table %"PRI_mfn "\n",
|
||||
mfn_x(new_mfn));
|
||||
@@ -2526,7 +2558,12 @@ void sh_set_toplevel_shadow(struct vcpu
|
||||
if ( !mfn_valid(smfn) )
|
||||
{
|
||||
/* Make sure there's enough free shadow memory. */
|
||||
- shadow_prealloc(d, root_type, 1);
|
||||
+ if ( !shadow_prealloc(d, root_type, 1) )
|
||||
+ {
|
||||
+ new_entry = pagetable_null();
|
||||
+ goto install_new_entry;
|
||||
+ }
|
||||
+
|
||||
/* Shadow the page. */
|
||||
smfn = make_shadow(v, gmfn, root_type);
|
||||
}
|
||||
--- a/xen/arch/x86/mm/shadow/hvm.c
|
||||
+++ b/xen/arch/x86/mm/shadow/hvm.c
|
||||
@@ -700,7 +700,9 @@ mfn_t sh_make_monitor_table(const struct
|
||||
ASSERT(!pagetable_get_pfn(v->arch.hvm.monitor_table));
|
||||
|
||||
/* Guarantee we can get the memory we need */
|
||||
- shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS);
|
||||
+ if ( !shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS) )
|
||||
+ return INVALID_MFN;
|
||||
+
|
||||
m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
|
||||
mfn_to_page(m4mfn)->shadow_flags = 4;
|
||||
|
||||
--- a/xen/arch/x86/mm/shadow/multi.c
|
||||
+++ b/xen/arch/x86/mm/shadow/multi.c
|
||||
@@ -2440,9 +2440,14 @@ static int sh_page_fault(struct vcpu *v,
|
||||
* Preallocate shadow pages *before* removing writable accesses
|
||||
* otherwhise an OOS L1 might be demoted and promoted again with
|
||||
* writable mappings. */
|
||||
- shadow_prealloc(d,
|
||||
- SH_type_l1_shadow,
|
||||
- GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1);
|
||||
+ if ( !shadow_prealloc(d, SH_type_l1_shadow,
|
||||
+ GUEST_PAGING_LEVELS < 4
|
||||
+ ? 1 : GUEST_PAGING_LEVELS - 1) )
|
||||
+ {
|
||||
+ paging_unlock(d);
|
||||
+ put_gfn(d, gfn_x(gfn));
|
||||
+ return 0;
|
||||
+ }
|
||||
|
||||
rc = gw_remove_write_accesses(v, va, &gw);
|
||||
|
||||
--- a/xen/arch/x86/mm/shadow/private.h
|
||||
+++ b/xen/arch/x86/mm/shadow/private.h
|
||||
@@ -383,7 +383,8 @@ void shadow_promote(struct domain *d, mf
|
||||
void shadow_demote(struct domain *d, mfn_t gmfn, u32 type);
|
||||
|
||||
/* Shadow page allocation functions */
|
||||
-void shadow_prealloc(struct domain *d, u32 shadow_type, unsigned int count);
|
||||
+bool __must_check shadow_prealloc(struct domain *d, unsigned int shadow_type,
|
||||
+ unsigned int count);
|
||||
mfn_t shadow_alloc(struct domain *d,
|
||||
u32 shadow_type,
|
||||
unsigned long backpointer);
|
|
@@ -0,0 +1,82 @@
|
|||
From: Roger Pau Monné <roger.pau@citrix.com>
|
||||
Subject: x86/p2m: refuse new allocations for dying domains
|
||||
|
||||
This will in particular prevent any attempts to add entries to the p2m,
|
||||
once - in a subsequent change - non-root entries have been removed.
|
||||
|
||||
This is part of CVE-2022-33746 / XSA-410.
|
||||
|
||||
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Acked-by: Tim Deegan <tim@xen.org>
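
The allocator-side guard is a plain early exit on d->is_dying, isolated here
as a sketch (condensed from the hap_alloc() hunk below):

    /*
     * A dying domain must not grow its paging pool any further: teardown
     * is already draining it, so refuse and let the caller unwind.
     */
    if ( unlikely(d->is_dying) )
        return NULL;

    pg = page_list_remove_head(&d->arch.paging.hap.freelist);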
|
||||
|
||||
--- a/xen/arch/x86/mm/hap/hap.c
|
||||
+++ b/xen/arch/x86/mm/hap/hap.c
|
||||
@@ -245,6 +245,9 @@ static struct page_info *hap_alloc(struc
|
||||
|
||||
ASSERT(paging_locked_by_me(d));
|
||||
|
||||
+ if ( unlikely(d->is_dying) )
|
||||
+ return NULL;
|
||||
+
|
||||
pg = page_list_remove_head(&d->arch.paging.hap.freelist);
|
||||
if ( unlikely(!pg) )
|
||||
return NULL;
|
||||
@@ -281,7 +284,7 @@ static struct page_info *hap_alloc_p2m_p
|
||||
d->arch.paging.hap.p2m_pages++;
|
||||
ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask));
|
||||
}
|
||||
- else if ( !d->arch.paging.p2m_alloc_failed )
|
||||
+ else if ( !d->arch.paging.p2m_alloc_failed && !d->is_dying )
|
||||
{
|
||||
d->arch.paging.p2m_alloc_failed = 1;
|
||||
dprintk(XENLOG_ERR, "d%i failed to allocate from HAP pool\n",
|
||||
--- a/xen/arch/x86/mm/shadow/common.c
|
||||
+++ b/xen/arch/x86/mm/shadow/common.c
|
||||
@@ -939,6 +939,10 @@ static bool __must_check _shadow_preallo
|
||||
if ( d->arch.paging.shadow.free_pages >= pages )
|
||||
return true;
|
||||
|
||||
+ if ( unlikely(d->is_dying) )
|
||||
+ /* No reclaim when the domain is dying, teardown will take care of it. */
|
||||
+ return false;
|
||||
+
|
||||
/* Shouldn't have enabled shadows if we've no vcpus. */
|
||||
ASSERT(d->vcpu && d->vcpu[0]);
|
||||
|
||||
@@ -991,7 +995,7 @@ static bool __must_check _shadow_preallo
|
||||
d->arch.paging.shadow.free_pages,
|
||||
d->arch.paging.shadow.p2m_pages);
|
||||
|
||||
- ASSERT(d->is_dying);
|
||||
+ ASSERT_UNREACHABLE();
|
||||
|
||||
guest_flush_tlb_mask(d, d->dirty_cpumask);
|
||||
|
||||
@@ -1005,10 +1009,13 @@ static bool __must_check _shadow_preallo
|
||||
* to avoid freeing shadows that the caller is currently working on. */
|
||||
bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count)
|
||||
{
|
||||
- bool ret = _shadow_prealloc(d, shadow_size(type) * count);
|
||||
+ bool ret;
|
||||
+
|
||||
+ if ( unlikely(d->is_dying) )
|
||||
+ return false;
|
||||
|
||||
- if ( !ret && !d->is_dying &&
|
||||
- (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) )
|
||||
+ ret = _shadow_prealloc(d, shadow_size(type) * count);
|
||||
+ if ( !ret && (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) )
|
||||
/*
|
||||
* Failing to allocate memory required for shadow usage can only result in
|
||||
* a domain crash, do it here rather that relying on every caller to do it.
|
||||
@@ -1238,6 +1245,9 @@ shadow_alloc_p2m_page(struct domain *d)
|
||||
{
|
||||
struct page_info *pg = NULL;
|
||||
|
||||
+ if ( unlikely(d->is_dying) )
|
||||
+ return NULL;
|
||||
+
|
||||
/* This is called both from the p2m code (which never holds the
|
||||
* paging lock) and the log-dirty code (which always does). */
|
||||
paging_lock_recursive(d);
|
|
@@ -0,0 +1,96 @@
|
|||
From: Roger Pau Monné <roger.pau@citrix.com>
|
||||
Subject: x86/p2m: truly free paging pool memory for dying domains
|
||||
|
||||
Modify {hap,shadow}_free to free the page immediately if the domain is
|
||||
dying, so that pages don't accumulate in the pool when
|
||||
{shadow,hap}_final_teardown() get called. This is to limit the amount of
|
||||
work which needs to be done there (in a non-preemptable manner).
|
||||
|
||||
Note the call to shadow_free() in shadow_free_p2m_page() is moved after
|
||||
increasing total_pages, so that the decrease done in shadow_free() in
|
||||
case the domain is dying doesn't underflow the counter, even if just for
|
||||
a short interval.
|
||||
|
||||
This is part of CVE-2022-33746 / XSA-410.
|
||||
|
||||
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Acked-by: Tim Deegan <tim@xen.org>
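
The core of the change is a branch in the pool free path: for a dying domain
the page goes straight back to the domheap and total_pages shrinks, otherwise
it is parked on the freelist for reuse. Isolated as a sketch (HAP side; the
shadow side additionally clears the aliased owner field first):

    if ( unlikely(d->is_dying) )
    {
        /*
         * Nothing will reuse this page: give it back to Xen right away so
         * hap_final_teardown() has less to free non-preemptibly.
         */
        free_domheap_page(pg);
        d->arch.paging.hap.total_pages--;
    }
    else
    {
        d->arch.paging.hap.free_pages++;
        page_list_add_tail(pg, &d->arch.paging.hap.freelist);
    }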
|
||||
|
||||
--- a/xen/arch/x86/mm/hap/hap.c
|
||||
+++ b/xen/arch/x86/mm/hap/hap.c
|
||||
@@ -265,6 +265,18 @@ static void hap_free(struct domain *d, m
|
||||
|
||||
ASSERT(paging_locked_by_me(d));
|
||||
|
||||
+ /*
|
||||
+ * For dying domains, actually free the memory here. This way less work is
|
||||
+ * left to hap_final_teardown(), which cannot easily have preemption checks
|
||||
+ * added.
|
||||
+ */
|
||||
+ if ( unlikely(d->is_dying) )
|
||||
+ {
|
||||
+ free_domheap_page(pg);
|
||||
+ d->arch.paging.hap.total_pages--;
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
d->arch.paging.hap.free_pages++;
|
||||
page_list_add_tail(pg, &d->arch.paging.hap.freelist);
|
||||
}
|
||||
--- a/xen/arch/x86/mm/shadow/common.c
|
||||
+++ b/xen/arch/x86/mm/shadow/common.c
|
||||
@@ -1187,6 +1187,7 @@ mfn_t shadow_alloc(struct domain *d,
|
||||
void shadow_free(struct domain *d, mfn_t smfn)
|
||||
{
|
||||
struct page_info *next = NULL, *sp = mfn_to_page(smfn);
|
||||
+ bool dying = ACCESS_ONCE(d->is_dying);
|
||||
struct page_list_head *pin_list;
|
||||
unsigned int pages;
|
||||
u32 shadow_type;
|
||||
@@ -1229,11 +1230,32 @@ void shadow_free(struct domain *d, mfn_t
|
||||
* just before the allocator hands the page out again. */
|
||||
page_set_tlbflush_timestamp(sp);
|
||||
perfc_decr(shadow_alloc_count);
|
||||
- page_list_add_tail(sp, &d->arch.paging.shadow.freelist);
|
||||
+
|
||||
+ /*
|
||||
+ * For dying domains, actually free the memory here. This way less
|
||||
+ * work is left to shadow_final_teardown(), which cannot easily have
|
||||
+ * preemption checks added.
|
||||
+ */
|
||||
+ if ( unlikely(dying) )
|
||||
+ {
|
||||
+ /*
|
||||
+ * The backpointer field (sh.back) used by shadow code aliases the
|
||||
+ * domain owner field, unconditionally clear it here to avoid
|
||||
+ * free_domheap_page() attempting to parse it.
|
||||
+ */
|
||||
+ page_set_owner(sp, NULL);
|
||||
+ free_domheap_page(sp);
|
||||
+ }
|
||||
+ else
|
||||
+ page_list_add_tail(sp, &d->arch.paging.shadow.freelist);
|
||||
+
|
||||
sp = next;
|
||||
}
|
||||
|
||||
- d->arch.paging.shadow.free_pages += pages;
|
||||
+ if ( unlikely(dying) )
|
||||
+ d->arch.paging.shadow.total_pages -= pages;
|
||||
+ else
|
||||
+ d->arch.paging.shadow.free_pages += pages;
|
||||
}
|
||||
|
||||
/* Divert a page from the pool to be used by the p2m mapping.
|
||||
@@ -1303,9 +1325,9 @@ shadow_free_p2m_page(struct domain *d, s
|
||||
* paging lock) and the log-dirty code (which always does). */
|
||||
paging_lock_recursive(d);
|
||||
|
||||
- shadow_free(d, page_to_mfn(pg));
|
||||
d->arch.paging.shadow.p2m_pages--;
|
||||
d->arch.paging.shadow.total_pages++;
|
||||
+ shadow_free(d, page_to_mfn(pg));
|
||||
|
||||
paging_unlock(d);
|
||||
}
|
|
@@ -0,0 +1,159 @@
|
|||
From: Roger Pau Monné <roger.pau@citrix.com>
|
||||
Subject: x86/p2m: free the paging memory pool preemptively
|
||||
|
||||
The paging memory pool is currently freed in two different places:
|
||||
from {shadow,hap}_teardown() via domain_relinquish_resources() and
|
||||
from {shadow,hap}_final_teardown() via complete_domain_destroy().
|
||||
While the former does handle preemption, the later doesn't.
|
||||
|
||||
Attempt to move as much p2m related freeing as possible to happen
|
||||
before the call to {shadow,hap}_teardown(), so that most memory can be
|
||||
freed in a preemptive way. In order to avoid causing issues to
|
||||
existing callers leave the root p2m page tables set and free them in
|
||||
{hap,shadow}_final_teardown(). Also modify {hap,shadow}_free to free
|
||||
the page immediately if the domain is dying, so that pages don't
|
||||
accumulate in the pool when {shadow,hap}_final_teardown() get called.
|
||||
|
||||
Move altp2m_vcpu_disable_ve() to be done in hap_teardown(), as that's
|
||||
the place where altp2m_active gets disabled now.
|
||||
|
||||
This is part of CVE-2022-33746 / XSA-410.
|
||||
|
||||
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
Acked-by: Tim Deegan <tim@xen.org>
|
||||
|
||||
--- a/xen/arch/x86/domain.c
|
||||
+++ b/xen/arch/x86/domain.c
|
||||
@@ -38,7 +38,6 @@
|
||||
#include <xen/livepatch.h>
|
||||
#include <public/sysctl.h>
|
||||
#include <public/hvm/hvm_vcpu.h>
|
||||
-#include <asm/altp2m.h>
|
||||
#include <asm/regs.h>
|
||||
#include <asm/mc146818rtc.h>
|
||||
#include <asm/system.h>
|
||||
@@ -2381,12 +2380,6 @@ int domain_relinquish_resources(struct d
|
||||
vpmu_destroy(v);
|
||||
}
|
||||
|
||||
- if ( altp2m_active(d) )
|
||||
- {
|
||||
- for_each_vcpu ( d, v )
|
||||
- altp2m_vcpu_disable_ve(v);
|
||||
- }
|
||||
-
|
||||
if ( is_pv_domain(d) )
|
||||
{
|
||||
for_each_vcpu ( d, v )
|
||||
--- a/xen/arch/x86/mm/hap/hap.c
|
||||
+++ b/xen/arch/x86/mm/hap/hap.c
|
||||
@@ -28,6 +28,7 @@
|
||||
#include <xen/domain_page.h>
|
||||
#include <xen/guest_access.h>
|
||||
#include <xen/keyhandler.h>
|
||||
+#include <asm/altp2m.h>
|
||||
#include <asm/event.h>
|
||||
#include <asm/page.h>
|
||||
#include <asm/current.h>
|
||||
@@ -546,24 +547,8 @@ void hap_final_teardown(struct domain *d
|
||||
unsigned int i;
|
||||
|
||||
if ( hvm_altp2m_supported() )
|
||||
- {
|
||||
- d->arch.altp2m_active = 0;
|
||||
-
|
||||
- if ( d->arch.altp2m_eptp )
|
||||
- {
|
||||
- free_xenheap_page(d->arch.altp2m_eptp);
|
||||
- d->arch.altp2m_eptp = NULL;
|
||||
- }
|
||||
-
|
||||
- if ( d->arch.altp2m_visible_eptp )
|
||||
- {
|
||||
- free_xenheap_page(d->arch.altp2m_visible_eptp);
|
||||
- d->arch.altp2m_visible_eptp = NULL;
|
||||
- }
|
||||
-
|
||||
for ( i = 0; i < MAX_ALTP2M; i++ )
|
||||
p2m_teardown(d->arch.altp2m_p2m[i], true);
|
||||
- }
|
||||
|
||||
/* Destroy nestedp2m's first */
|
||||
for (i = 0; i < MAX_NESTEDP2M; i++) {
|
||||
@@ -578,6 +563,8 @@ void hap_final_teardown(struct domain *d
|
||||
paging_lock(d);
|
||||
hap_set_allocation(d, 0, NULL);
|
||||
ASSERT(d->arch.paging.hap.p2m_pages == 0);
|
||||
+ ASSERT(d->arch.paging.hap.free_pages == 0);
|
||||
+ ASSERT(d->arch.paging.hap.total_pages == 0);
|
||||
paging_unlock(d);
|
||||
}
|
||||
|
||||
@@ -603,6 +590,7 @@ void hap_vcpu_teardown(struct vcpu *v)
|
||||
void hap_teardown(struct domain *d, bool *preempted)
|
||||
{
|
||||
struct vcpu *v;
|
||||
+ unsigned int i;
|
||||
|
||||
ASSERT(d->is_dying);
|
||||
ASSERT(d != current->domain);
|
||||
@@ -611,6 +599,28 @@ void hap_teardown(struct domain *d, bool
|
||||
for_each_vcpu ( d, v )
|
||||
hap_vcpu_teardown(v);
|
||||
|
||||
+ /* Leave the root pt in case we get further attempts to modify the p2m. */
|
||||
+ if ( hvm_altp2m_supported() )
|
||||
+ {
|
||||
+ if ( altp2m_active(d) )
|
||||
+ for_each_vcpu ( d, v )
|
||||
+ altp2m_vcpu_disable_ve(v);
|
||||
+
|
||||
+ d->arch.altp2m_active = 0;
|
||||
+
|
||||
+ FREE_XENHEAP_PAGE(d->arch.altp2m_eptp);
|
||||
+ FREE_XENHEAP_PAGE(d->arch.altp2m_visible_eptp);
|
||||
+
|
||||
+ for ( i = 0; i < MAX_ALTP2M; i++ )
|
||||
+ p2m_teardown(d->arch.altp2m_p2m[i], false);
|
||||
+ }
|
||||
+
|
||||
+ /* Destroy nestedp2m's after altp2m. */
|
||||
+ for ( i = 0; i < MAX_NESTEDP2M; i++ )
|
||||
+ p2m_teardown(d->arch.nested_p2m[i], false);
|
||||
+
|
||||
+ p2m_teardown(p2m_get_hostp2m(d), false);
|
||||
+
|
||||
paging_lock(d); /* Keep various asserts happy */
|
||||
|
||||
if ( d->arch.paging.hap.total_pages != 0 )
|
||||
--- a/xen/arch/x86/mm/shadow/common.c
|
||||
+++ b/xen/arch/x86/mm/shadow/common.c
|
||||
@@ -2824,8 +2824,17 @@ void shadow_teardown(struct domain *d, b
|
||||
for_each_vcpu ( d, v )
|
||||
shadow_vcpu_teardown(v);
|
||||
|
||||
+ p2m_teardown(p2m_get_hostp2m(d), false);
|
||||
+
|
||||
paging_lock(d);
|
||||
|
||||
+ /*
|
||||
+ * Reclaim all shadow memory so that shadow_set_allocation() doesn't find
|
||||
+ * in-use pages, as _shadow_prealloc() will no longer try to reclaim pages
|
||||
+ * because the domain is dying.
|
||||
+ */
|
||||
+ shadow_blow_tables(d);
|
||||
+
|
||||
#if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC))
|
||||
/* Free the virtual-TLB array attached to each vcpu */
|
||||
for_each_vcpu(d, v)
|
||||
@@ -2946,6 +2955,9 @@ void shadow_final_teardown(struct domain
|
||||
d->arch.paging.shadow.total_pages,
|
||||
d->arch.paging.shadow.free_pages,
|
||||
d->arch.paging.shadow.p2m_pages);
|
||||
+ ASSERT(!d->arch.paging.shadow.total_pages);
|
||||
+ ASSERT(!d->arch.paging.shadow.free_pages);
|
||||
+ ASSERT(!d->arch.paging.shadow.p2m_pages);
|
||||
paging_unlock(d);
|
||||
}
|
||||
|
|
@@ -0,0 +1,171 @@
|
|||
From: Julien Grall <jgrall@amazon.com>
|
||||
Subject: xen/x86: p2m: Add preemption in p2m_teardown()
|
||||
|
||||
The list p2m->pages contain all the pages used by the P2M. On large
|
||||
instance this can be quite large and the time spent to call
|
||||
d->arch.paging.free_page() will take more than 1ms for a 80GB guest
|
||||
on a Xen running in nested environment on a c5.metal.
|
||||
|
||||
By extrapolation, it would take > 100ms for a 8TB guest (what we
|
||||
current security support). So add some preemption in p2m_teardown()
|
||||
and propagate to the callers. Note there are 3 places where
|
||||
the preemption is not enabled:
|
||||
- hap_final_teardown()/shadow_final_teardown(): We are
|
||||
preventing update the P2M once the domain is dying (so
|
||||
no more pages could be allocated) and most of the P2M pages
|
||||
will be freed in preemptive manneer when relinquishing the
|
||||
resources. So this is fine to disable preemption.
|
||||
- shadow_enable(): This is fine because it will undo the allocation
|
||||
that may have been made by p2m_alloc_table() (so only the root
|
||||
page table).
|
||||
|
||||
The preemption is arbitrarily checked every 1024 iterations.
|
||||
|
||||
Note that with the current approach, Xen doesn't keep track on whether
|
||||
the alt/nested P2Ms have been cleared. So there are some redundant work.
|
||||
However, this is not expected to incurr too much overhead (the P2M lock
|
||||
shouldn't be contended during teardown). So this is optimization is
|
||||
left outside of the security event.
|
||||
|
||||
This is part of CVE-2022-33746 / XSA-410.
|
||||
|
||||
Signed-off-by: Julien Grall <jgrall@amazon.com>
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
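
Preemption is reported through a bool * out-parameter so the teardown helpers
can stay void. A caller-side sketch of the pattern, assuming the usual
-ERESTART continuation handling in the relinquish path:

    bool preempted = false;

    hap_teardown(d, &preempted);
    if ( preempted )
        /*
         * Ask for the hypercall to be restarted; the next pass resumes
         * freeing whatever is still queued on the p2m page lists.
         */
        return -ERESTART;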
|
||||
|
||||
--- a/xen/include/asm-x86/p2m.h
|
||||
+++ b/xen/include/asm-x86/p2m.h
|
||||
@@ -574,7 +574,7 @@ int p2m_init(struct domain *d);
|
||||
int p2m_alloc_table(struct p2m_domain *p2m);
|
||||
|
||||
/* Return all the p2m resources to Xen. */
|
||||
-void p2m_teardown(struct p2m_domain *p2m, bool remove_root);
|
||||
+void p2m_teardown(struct p2m_domain *p2m, bool remove_root, bool *preempted);
|
||||
void p2m_final_teardown(struct domain *d);
|
||||
|
||||
/* Add a page to a domain's p2m table */
|
||||
--- a/xen/arch/x86/mm/hap/hap.c
|
||||
+++ b/xen/arch/x86/mm/hap/hap.c
|
||||
@@ -548,17 +548,17 @@ void hap_final_teardown(struct domain *d
|
||||
|
||||
if ( hvm_altp2m_supported() )
|
||||
for ( i = 0; i < MAX_ALTP2M; i++ )
|
||||
- p2m_teardown(d->arch.altp2m_p2m[i], true);
|
||||
+ p2m_teardown(d->arch.altp2m_p2m[i], true, NULL);
|
||||
|
||||
/* Destroy nestedp2m's first */
|
||||
for (i = 0; i < MAX_NESTEDP2M; i++) {
|
||||
- p2m_teardown(d->arch.nested_p2m[i], true);
|
||||
+ p2m_teardown(d->arch.nested_p2m[i], true, NULL);
|
||||
}
|
||||
|
||||
if ( d->arch.paging.hap.total_pages != 0 )
|
||||
hap_teardown(d, NULL);
|
||||
|
||||
- p2m_teardown(p2m_get_hostp2m(d), true);
|
||||
+ p2m_teardown(p2m_get_hostp2m(d), true, NULL);
|
||||
/* Free any memory that the p2m teardown released */
|
||||
paging_lock(d);
|
||||
hap_set_allocation(d, 0, NULL);
|
||||
@@ -612,14 +612,24 @@ void hap_teardown(struct domain *d, bool
|
||||
FREE_XENHEAP_PAGE(d->arch.altp2m_visible_eptp);
|
||||
|
||||
for ( i = 0; i < MAX_ALTP2M; i++ )
|
||||
- p2m_teardown(d->arch.altp2m_p2m[i], false);
|
||||
+ {
|
||||
+ p2m_teardown(d->arch.altp2m_p2m[i], false, preempted);
|
||||
+ if ( preempted && *preempted )
|
||||
+ return;
|
||||
+ }
|
||||
}
|
||||
|
||||
/* Destroy nestedp2m's after altp2m. */
|
||||
for ( i = 0; i < MAX_NESTEDP2M; i++ )
|
||||
- p2m_teardown(d->arch.nested_p2m[i], false);
|
||||
+ {
|
||||
+ p2m_teardown(d->arch.nested_p2m[i], false, preempted);
|
||||
+ if ( preempted && *preempted )
|
||||
+ return;
|
||||
+ }
|
||||
|
||||
- p2m_teardown(p2m_get_hostp2m(d), false);
|
||||
+ p2m_teardown(p2m_get_hostp2m(d), false, preempted);
|
||||
+ if ( preempted && *preempted )
|
||||
+ return;
|
||||
|
||||
paging_lock(d); /* Keep various asserts happy */
|
||||
|
||||
--- a/xen/arch/x86/mm/p2m.c
|
||||
+++ b/xen/arch/x86/mm/p2m.c
|
||||
@@ -749,12 +749,13 @@ int p2m_alloc_table(struct p2m_domain *p
|
||||
* hvm fixme: when adding support for pvh non-hardware domains, this path must
|
||||
* cleanup any foreign p2m types (release refcnts on them).
|
||||
*/
|
||||
-void p2m_teardown(struct p2m_domain *p2m, bool remove_root)
|
||||
+void p2m_teardown(struct p2m_domain *p2m, bool remove_root, bool *preempted)
|
||||
/* Return all the p2m pages to Xen.
|
||||
* We know we don't have any extra mappings to these pages */
|
||||
{
|
||||
struct page_info *pg, *root_pg = NULL;
|
||||
struct domain *d;
|
||||
+ unsigned int i = 0;
|
||||
|
||||
if (p2m == NULL)
|
||||
return;
|
||||
@@ -773,8 +774,19 @@ void p2m_teardown(struct p2m_domain *p2m
|
||||
}
|
||||
|
||||
while ( (pg = page_list_remove_head(&p2m->pages)) )
|
||||
- if ( pg != root_pg )
|
||||
- d->arch.paging.free_page(d, pg);
|
||||
+ {
|
||||
+ if ( pg == root_pg )
|
||||
+ continue;
|
||||
+
|
||||
+ d->arch.paging.free_page(d, pg);
|
||||
+
|
||||
+ /* Arbitrarily check preemption every 1024 iterations */
|
||||
+ if ( preempted && !(++i % 1024) && general_preempt_check() )
|
||||
+ {
|
||||
+ *preempted = true;
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
|
||||
if ( root_pg )
|
||||
page_list_add(root_pg, &p2m->pages);
|
||||
--- a/xen/arch/x86/mm/shadow/common.c
|
||||
+++ b/xen/arch/x86/mm/shadow/common.c
|
||||
@@ -2770,8 +2770,12 @@ int shadow_enable(struct domain *d, u32
|
||||
out_locked:
|
||||
paging_unlock(d);
|
||||
out_unlocked:
|
||||
+ /*
|
||||
+ * This is fine to ignore the preemption here because only the root
|
||||
+ * will be allocated by p2m_alloc_table().
|
||||
+ */
|
||||
if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) )
|
||||
- p2m_teardown(p2m, true);
|
||||
+ p2m_teardown(p2m, true, NULL);
|
||||
if ( rv != 0 && pg != NULL )
|
||||
{
|
||||
pg->count_info &= ~PGC_count_mask;
|
||||
@@ -2824,7 +2828,9 @@ void shadow_teardown(struct domain *d, b
|
||||
for_each_vcpu ( d, v )
|
||||
shadow_vcpu_teardown(v);
|
||||
|
||||
- p2m_teardown(p2m_get_hostp2m(d), false);
|
||||
+ p2m_teardown(p2m_get_hostp2m(d), false, preempted);
|
||||
+ if ( preempted && *preempted )
|
||||
+ return;
|
||||
|
||||
paging_lock(d);
|
||||
|
||||
@@ -2945,7 +2951,7 @@ void shadow_final_teardown(struct domain
|
||||
shadow_teardown(d, NULL);
|
||||
|
||||
/* It is now safe to pull down the p2m map. */
|
||||
- p2m_teardown(p2m_get_hostp2m(d), true);
|
||||
+ p2m_teardown(p2m_get_hostp2m(d), true, NULL);
|
||||
/* Free any shadow memory that the p2m teardown released */
|
||||
paging_lock(d);
|
||||
shadow_set_allocation(d, 0, NULL);
|
|
@@ -0,0 +1,55 @@
|
|||
From: Jan Beulich <jbeulich@suse.com>
|
||||
Subject: gnttab: correct locking on transitive grant copy error path
|
||||
|
||||
While the comment next to the lock dropping in preparation of
|
||||
recursively calling acquire_grant_for_copy() mistakenly talks about the
|
||||
rd == td case (excluded a few lines further up), the same concerns apply
|
||||
to the calling of release_grant_for_copy() on a subsequent error path.
|
||||
|
||||
This is CVE-2022-33748 / XSA-411.
|
||||
|
||||
Fixes: ad48fb963dbf ("gnttab: fix transitive grant handling")
|
||||
Signed-off-by: Jan Beulich <jbeulich@suse.com>
|
||||
---
|
||||
v2: Extend code comment.
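
The rule the fix applies is the same one the transitive-acquire path above it
already follows: never call into something that may take the remote domain's
grant-table lock while still holding the local active entry and read lock.
Reduced to its skeleton:

    /* Drop local locks before any operation on the remote table ... */
    active_entry_release(act);
    grant_read_unlock(rgt);

    release_grant_for_copy(td, trans_gref, readonly);

    /* ... then re-acquire them before touching local pin/status state. */
    grant_read_lock(rgt);
    act = active_entry_acquire(rgt, gref);
    reduce_status_for_pin(rd, act, status, readonly);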
|
||||
|
||||
--- a/xen/common/grant_table.c
|
||||
+++ b/xen/common/grant_table.c
|
||||
@@ -2622,9 +2622,8 @@ acquire_grant_for_copy(
|
||||
trans_domid);
|
||||
|
||||
/*
|
||||
- * acquire_grant_for_copy() could take the lock on the
|
||||
- * remote table (if rd == td), so we have to drop the lock
|
||||
- * here and reacquire.
|
||||
+ * acquire_grant_for_copy() will take the lock on the remote table,
|
||||
+ * so we have to drop the lock here and reacquire.
|
||||
*/
|
||||
active_entry_release(act);
|
||||
grant_read_unlock(rgt);
|
||||
@@ -2661,11 +2660,25 @@ acquire_grant_for_copy(
|
||||
act->trans_gref != trans_gref ||
|
||||
!act->is_sub_page)) )
|
||||
{
|
||||
+ /*
|
||||
+ * Like above for acquire_grant_for_copy() we need to drop and then
|
||||
+ * re-acquire the locks here to prevent lock order inversion issues.
|
||||
+ * Unlike for acquire_grant_for_copy() we don't need to re-check
|
||||
+ * anything, as release_grant_for_copy() doesn't depend on the grant
|
||||
+ * table entry: It only updates internal state and the status flags.
|
||||
+ */
|
||||
+ active_entry_release(act);
|
||||
+ grant_read_unlock(rgt);
|
||||
+
|
||||
release_grant_for_copy(td, trans_gref, readonly);
|
||||
rcu_unlock_domain(td);
|
||||
+
|
||||
+ grant_read_lock(rgt);
|
||||
+ act = active_entry_acquire(rgt, gref);
|
||||
reduce_status_for_pin(rd, act, status, readonly);
|
||||
active_entry_release(act);
|
||||
grant_read_unlock(rgt);
|
||||
+
|
||||
put_page(*page);
|
||||
*page = NULL;
|
||||
return ERESTART;
|