system/xen: Updated for version 4.16.2.

Signed-off-by: Mario Preksavec <mario@slackware.hr>

Signed-off-by: Willy Sudiarto Raharjo <willysr@slackbuilds.org>
Mario Preksavec 2022-10-13 11:36:16 +02:00 committed by Willy Sudiarto Raharjo
parent aae988ef83
commit b75c769415
GPG Key ID: 3F617144D7238786
20 changed files with 1282 additions and 1137 deletions

View File

@ -25,8 +25,8 @@
cd $(dirname $0) ; CWD=$(pwd)
PRGNAM=xen
VERSION=${VERSION:-4.16.1}
BUILD=${BUILD:-2}
VERSION=${VERSION:-4.16.2}
BUILD=${BUILD:-1}
TAG=${TAG:-_SBo}
PKGTYPE=${PKGTYPE:-tgz}

View File

@ -1,9 +1,9 @@
PRGNAM="xen"
VERSION="4.16.1"
VERSION="4.16.2"
HOMEPAGE="http://www.xenproject.org/"
DOWNLOAD="UNSUPPORTED"
MD5SUM=""
DOWNLOAD_x86_64="http://mirror.slackware.hr/sources/xen/xen-4.16.1.tar.gz \
DOWNLOAD_x86_64="http://mirror.slackware.hr/sources/xen/xen-4.16.2.tar.gz \
http://mirror.slackware.hr/sources/xen-extfiles/ipxe-git-3c040ad387099483102708bb1839110bc788cefb.tar.gz \
http://mirror.slackware.hr/sources/xen-extfiles/lwip-1.3.0.tar.gz \
http://mirror.slackware.hr/sources/xen-extfiles/zlib-1.2.3.tar.gz \
@ -15,7 +15,7 @@ DOWNLOAD_x86_64="http://mirror.slackware.hr/sources/xen/xen-4.16.1.tar.gz \
http://mirror.slackware.hr/sources/xen-extfiles/tpm_emulator-0.7.4.tar.gz \
http://mirror.slackware.hr/sources/xen-seabios/seabios-1.14.0.tar.gz \
http://mirror.slackware.hr/sources/xen-ovmf/xen-ovmf-20210824_7b4a99be8a.tar.bz2"
MD5SUM_x86_64="1c2cd4f7f966c1d455aab630953e5fad \
MD5SUM_x86_64="6bd720f53e3c34a35cb8a8897a561e18 \
23ba00d5e2c5b4343d12665af73e1cb5 \
36cc57650cffda9a0269493be2a169bb \
debc62758716a169df9f62e6ab2bc634 \

View File

@ -1,170 +0,0 @@
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/pv: Clean up _get_page_type()
Various fixes for clarity, ahead of making complicated changes.
* Split the overflow check out of the if/else chain for type handling, as
it's somewhat unrelated.
* Comment the main if/else chain to explain what is going on. Adjust one
ASSERT() and state the bit layout for validate-locked and partial states.
* Correct the comment about TLB flushing, as it's backwards. The problem
case is when writeable mappings are retained to a page becoming read-only,
as it allows the guest to bypass Xen's safety checks for updates.
* Reduce the scope of 'y'. It is an artefact of the cmpxchg loop and not
valid for use by subsequent logic. Switch to using ACCESS_ONCE() to treat
all reads as explicitly volatile. The only thing preventing the validated
wait-loop being infinite is the compiler barrier hidden in cpu_relax().
* Replace one page_get_owner(page) with the already-calculated 'd' already in
scope.
No functional change.
This is part of XSA-401 / CVE-2022-26362.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
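A minimal, self-contained sketch (not Xen code; the word layout, constants and
function names below are illustrative stand-ins) of the pattern this change
tidies up: the typeref word is explicitly re-read on every retry, and each
iteration only trusts the snapshot the compare-exchange actually operated on,
which is the point of the ACCESS_ONCE() conversion and of narrowing 'y':

#include <stdatomic.h>
#include <stdio.h>

#define TYPE_MASK   0xf0000000u   /* stand-in for PGT_type_mask  */
#define COUNT_MASK  0x0fffffffu   /* stand-in for PGT_count_mask */

static _Atomic unsigned int type_info;   /* stand-in for page->u.inuse.type_info */

/* Try to take one type reference of 'type'; returns 0 on success. */
static int get_type_ref(unsigned int type)
{
    unsigned int x = atomic_load(&type_info);   /* explicit load, like ACCESS_ONCE() */

    for ( ; ; )
    {
        unsigned int nx = x + 1;

        if ( (nx & COUNT_MASK) == 0 )
            return -1;                           /* count would overflow        */

        if ( (x & COUNT_MASK) == 0 )
            nx = (nx & ~TYPE_MASK) | type;       /* 0 -> 1: the type may change */
        else if ( (x & TYPE_MASK) != type )
            return -1;                           /* in use with the wrong type  */

        /*
         * On failure the compare-exchange refreshes 'x' from memory, so the
         * next iteration never reuses a stale snapshot.
         */
        if ( atomic_compare_exchange_weak(&type_info, &x, nx) )
            return 0;
    }
}

int main(void)
{
    printf("first ref:  %d\n", get_type_ref(0x10000000u));  /* takes the type   */
    printf("wrong type: %d\n", get_type_ref(0x20000000u));  /* rejected: in use */
    return 0;
}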
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 796faca64103..ddd32f88c798 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -2935,16 +2935,17 @@ static int _put_page_type(struct page_info *page, unsigned int flags,
static int _get_page_type(struct page_info *page, unsigned long type,
bool preemptible)
{
- unsigned long nx, x, y = page->u.inuse.type_info;
+ unsigned long nx, x;
int rc = 0;
ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
ASSERT(!in_irq());
- for ( ; ; )
+ for ( unsigned long y = ACCESS_ONCE(page->u.inuse.type_info); ; )
{
x = y;
nx = x + 1;
+
if ( unlikely((nx & PGT_count_mask) == 0) )
{
gdprintk(XENLOG_WARNING,
@@ -2952,8 +2953,15 @@ static int _get_page_type(struct page_info *page, unsigned long type,
mfn_x(page_to_mfn(page)));
return -EINVAL;
}
- else if ( unlikely((x & PGT_count_mask) == 0) )
+
+ if ( unlikely((x & PGT_count_mask) == 0) )
{
+ /*
+ * Typeref 0 -> 1.
+ *
+ * Type changes are permitted when the typeref is 0. If the type
+ * actually changes, the page needs re-validating.
+ */
struct domain *d = page_get_owner(page);
if ( d && shadow_mode_enabled(d) )
@@ -2964,8 +2972,8 @@ static int _get_page_type(struct page_info *page, unsigned long type,
{
/*
* On type change we check to flush stale TLB entries. It is
- * vital that no other CPUs are left with mappings of a frame
- * which is about to become writeable to the guest.
+ * vital that no other CPUs are left with writeable mappings
+ * to a frame which is intending to become pgtable/segdesc.
*/
cpumask_t *mask = this_cpu(scratch_cpumask);
@@ -2977,7 +2985,7 @@ static int _get_page_type(struct page_info *page, unsigned long type,
if ( unlikely(!cpumask_empty(mask)) &&
/* Shadow mode: track only writable pages. */
- (!shadow_mode_enabled(page_get_owner(page)) ||
+ (!shadow_mode_enabled(d) ||
((nx & PGT_type_mask) == PGT_writable_page)) )
{
perfc_incr(need_flush_tlb_flush);
@@ -3008,7 +3016,14 @@ static int _get_page_type(struct page_info *page, unsigned long type,
}
else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
{
- /* Don't log failure if it could be a recursive-mapping attempt. */
+ /*
+ * else, we're trying to take a new reference, of the wrong type.
+ *
+ * This (being able to prohibit use of the wrong type) is what the
+ * typeref system exists for, but skip printing the failure if it
+ * looks like a recursive mapping, as subsequent logic might
+ * ultimately permit the attempt.
+ */
if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
(type == PGT_l1_page_table) )
return -EINVAL;
@@ -3027,18 +3042,46 @@ static int _get_page_type(struct page_info *page, unsigned long type,
}
else if ( unlikely(!(x & PGT_validated)) )
{
+ /*
+ * else, the count is non-zero, and we're grabbing the right type;
+ * but the page hasn't been validated yet.
+ *
+ * The page is in one of two states (depending on PGT_partial),
+ * and should have exactly one reference.
+ */
+ ASSERT((x & (PGT_type_mask | PGT_count_mask)) == (type | 1));
+
if ( !(x & PGT_partial) )
{
- /* Someone else is updating validation of this page. Wait... */
+ /*
+ * The page has been left in the "validate locked" state
+ * (i.e. PGT_[type] | 1) which means that a concurrent caller
+ * of _get_page_type() is in the middle of validation.
+ *
+ * Spin waiting for the concurrent user to complete (partial
+ * or fully validated), then restart our attempt to acquire a
+ * type reference.
+ */
do {
if ( preemptible && hypercall_preempt_check() )
return -EINTR;
cpu_relax();
- } while ( (y = page->u.inuse.type_info) == x );
+ } while ( (y = ACCESS_ONCE(page->u.inuse.type_info)) == x );
continue;
}
- /* Type ref count was left at 1 when PGT_partial got set. */
- ASSERT((x & PGT_count_mask) == 1);
+
+ /*
+ * The page has been left in the "partial" state
+ * (i.e., PGT_[type] | PGT_partial | 1).
+ *
+ * Rather than bumping the type count, we need to try to grab the
+ * validation lock; if we succeed, we need to validate the page,
+ * then drop the general ref associated with the PGT_partial bit.
+ *
+ * We grab the validation lock by setting nx to (PGT_[type] | 1)
+ * (i.e., non-zero type count, neither PGT_validated nor
+ * PGT_partial set).
+ */
nx = x & ~PGT_partial;
}
@@ -3087,6 +3130,13 @@ static int _get_page_type(struct page_info *page, unsigned long type,
}
out:
+ /*
+ * Did we drop the PGT_partial bit when acquiring the typeref? If so,
+ * drop the general reference that went along with it.
+ *
+ * N.B. validate_page() may have have re-set PGT_partial, not reflected in
+ * nx, but will have taken an extra ref when doing so.
+ */
if ( (x & PGT_partial) && !(nx & PGT_partial) )
put_page(page);

View File

@ -1,191 +0,0 @@
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/pv: Fix ABAC cmpxchg() race in _get_page_type()
_get_page_type() suffers from a race condition where it incorrectly assumes
that because 'x' was read and a subsequent cmpxchg() succeeds, the type
cannot have changed in-between. Consider:
CPU A:
1. Creates an L2e referencing pg
`-> _get_page_type(pg, PGT_l1_page_table), sees count 0, type PGT_writable_page
2. Issues flush_tlb_mask()
CPU B:
3. Creates a writeable mapping of pg
`-> _get_page_type(pg, PGT_writable_page), count increases to 1
4. Writes into new mapping, creating a TLB entry for pg
5. Removes the writeable mapping of pg
`-> _put_page_type(pg), count goes back down to 0
CPU A:
7. Issues cmpxchg(), setting count 1, type PGT_l1_page_table
CPU B now has a writeable mapping to pg, which Xen believes is a pagetable and
suitably protected (i.e. read-only). The TLB flush in step 2 must be deferred
until after the guest is prohibited from creating new writeable mappings,
which is after step 7.
Defer all safety actions until after the cmpxchg() has successfully taken the
intended typeref, because that is what prevents concurrent users from using
the old type.
Also remove the early validation for writeable and shared pages. This removes
race conditions where one half of a parallel mapping attempt can return
successfully before:
* The IOMMU pagetables are in sync with the new page type
* Writeable mappings to shared pages have been torn down
This is part of XSA-401 / CVE-2022-26362.
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
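The underlying hazard is the classic A-B-A problem: a successful cmpxchg()
only proves the word holds the expected value now, not that nothing happened
in between. A single-threaded sketch (plain C11 atomics, illustrative
constants only) replaying steps 1-7 above in program order:

#include <stdatomic.h>
#include <stdio.h>

#define T_WRITABLE 0x10000000u   /* stand-in for PGT_writable_page */
#define T_L1_TABLE 0x20000000u   /* stand-in for PGT_l1_page_table */

int main(void)
{
    _Atomic unsigned int type_info = T_WRITABLE;    /* pg: writable, count 0 */

    /* CPU A, step 1: reads the word, plans the 0 -> 1 type change. */
    unsigned int x = atomic_load(&type_info);
    unsigned int nx = T_L1_TABLE | 1;

    /* CPU A, step 2: flush_tlb_mask() issued here - too early. */

    /* CPU B, steps 3-5: takes and drops a writable typeref; the word ends up
     * back at its original value, but a writable TLB entry now exists.     */
    atomic_fetch_add(&type_info, 1);
    atomic_fetch_sub(&type_info, 1);

    /* CPU A, step 7: the cmpxchg still succeeds; the intermediate activity
     * is invisible to it.                                                  */
    if ( atomic_compare_exchange_strong(&type_info, &x, nx) )
        printf("cmpxchg succeeded: page typed as a pagetable while a stale\n"
               "writable TLB entry may still exist on the other CPU\n");

    return 0;
}

Hence the fix: only the successful cmpxchg() stops new writable mappings from
being created, so the TLB flush and the other safety actions have to come
after it.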
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index ddd32f88c798..1693b580b152 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -2962,56 +2962,12 @@ static int _get_page_type(struct page_info *page, unsigned long type,
* Type changes are permitted when the typeref is 0. If the type
* actually changes, the page needs re-validating.
*/
- struct domain *d = page_get_owner(page);
-
- if ( d && shadow_mode_enabled(d) )
- shadow_prepare_page_type_change(d, page, type);
ASSERT(!(x & PGT_pae_xen_l2));
if ( (x & PGT_type_mask) != type )
{
- /*
- * On type change we check to flush stale TLB entries. It is
- * vital that no other CPUs are left with writeable mappings
- * to a frame which is intending to become pgtable/segdesc.
- */
- cpumask_t *mask = this_cpu(scratch_cpumask);
-
- BUG_ON(in_irq());
- cpumask_copy(mask, d->dirty_cpumask);
-
- /* Don't flush if the timestamp is old enough */
- tlbflush_filter(mask, page->tlbflush_timestamp);
-
- if ( unlikely(!cpumask_empty(mask)) &&
- /* Shadow mode: track only writable pages. */
- (!shadow_mode_enabled(d) ||
- ((nx & PGT_type_mask) == PGT_writable_page)) )
- {
- perfc_incr(need_flush_tlb_flush);
- /*
- * If page was a page table make sure the flush is
- * performed using an IPI in order to avoid changing the
- * type of a page table page under the feet of
- * spurious_page_fault().
- */
- flush_mask(mask,
- (x & PGT_type_mask) &&
- (x & PGT_type_mask) <= PGT_root_page_table
- ? FLUSH_TLB | FLUSH_FORCE_IPI
- : FLUSH_TLB);
- }
-
- /* We lose existing type and validity. */
nx &= ~(PGT_type_mask | PGT_validated);
nx |= type;
-
- /*
- * No special validation needed for writable pages.
- * Page tables and GDT/LDT need to be scanned for validity.
- */
- if ( type == PGT_writable_page || type == PGT_shared_page )
- nx |= PGT_validated;
}
}
else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
@@ -3092,6 +3048,56 @@ static int _get_page_type(struct page_info *page, unsigned long type,
return -EINTR;
}
+ /*
+ * One typeref has been taken and is now globally visible.
+ *
+ * The page is either in the "validate locked" state (PGT_[type] | 1) or
+ * fully validated (PGT_[type] | PGT_validated | >0).
+ */
+
+ if ( unlikely((x & PGT_count_mask) == 0) )
+ {
+ struct domain *d = page_get_owner(page);
+
+ if ( d && shadow_mode_enabled(d) )
+ shadow_prepare_page_type_change(d, page, type);
+
+ if ( (x & PGT_type_mask) != type )
+ {
+ /*
+ * On type change we check to flush stale TLB entries. It is
+ * vital that no other CPUs are left with writeable mappings
+ * to a frame which is intending to become pgtable/segdesc.
+ */
+ cpumask_t *mask = this_cpu(scratch_cpumask);
+
+ BUG_ON(in_irq());
+ cpumask_copy(mask, d->dirty_cpumask);
+
+ /* Don't flush if the timestamp is old enough */
+ tlbflush_filter(mask, page->tlbflush_timestamp);
+
+ if ( unlikely(!cpumask_empty(mask)) &&
+ /* Shadow mode: track only writable pages. */
+ (!shadow_mode_enabled(d) ||
+ ((nx & PGT_type_mask) == PGT_writable_page)) )
+ {
+ perfc_incr(need_flush_tlb_flush);
+ /*
+ * If page was a page table make sure the flush is
+ * performed using an IPI in order to avoid changing the
+ * type of a page table page under the feet of
+ * spurious_page_fault().
+ */
+ flush_mask(mask,
+ (x & PGT_type_mask) &&
+ (x & PGT_type_mask) <= PGT_root_page_table
+ ? FLUSH_TLB | FLUSH_FORCE_IPI
+ : FLUSH_TLB);
+ }
+ }
+ }
+
if ( unlikely(((x & PGT_type_mask) == PGT_writable_page) !=
(type == PGT_writable_page)) )
{
@@ -3120,13 +3126,25 @@ static int _get_page_type(struct page_info *page, unsigned long type,
if ( unlikely(!(nx & PGT_validated)) )
{
- if ( !(x & PGT_partial) )
+ /*
+ * No special validation needed for writable or shared pages. Page
+ * tables and GDT/LDT need to have their contents audited.
+ *
+ * per validate_page(), non-atomic updates are fine here.
+ */
+ if ( type == PGT_writable_page || type == PGT_shared_page )
+ page->u.inuse.type_info |= PGT_validated;
+ else
{
- page->nr_validated_ptes = 0;
- page->partial_flags = 0;
- page->linear_pt_count = 0;
+ if ( !(x & PGT_partial) )
+ {
+ page->nr_validated_ptes = 0;
+ page->partial_flags = 0;
+ page->linear_pt_count = 0;
+ }
+
+ rc = validate_page(page, type, preemptible);
}
- rc = validate_page(page, type, preemptible);
}
out:

View File

@ -1,43 +0,0 @@
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/page: Introduce _PAGE_* constants for memory types
... rather than opencoding the PAT/PCD/PWT attributes in __PAGE_HYPERVISOR_*
constants. These are going to be needed by forthcoming logic.
No functional change.
This is part of XSA-402.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
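For reference, the PAT/PCD/PWT bits of a PTE form a 3-bit index into MSR_PAT,
and the new constants simply name the six entries of Xen's chosen layout. A
small stand-alone sketch (the bit positions are the architectural ones for 4k
PTEs; the index-to-type mapping is the one implied by the constants below):

#include <stdio.h>

#define _PAGE_PWT 0x008u   /* bit 3 */
#define _PAGE_PCD 0x010u   /* bit 4 */
#define _PAGE_PAT 0x080u   /* bit 7 in a 4k PTE */

int main(void)
{
    static const char *const pat_type[8] = {
        "WB", "WT", "UC-", "UC", "WC", "WP", "(unused)", "(unused)",
    };
    static const unsigned int flags[] = {
        0, _PAGE_PWT, _PAGE_PCD, _PAGE_PCD | _PAGE_PWT,
        _PAGE_PAT, _PAGE_PAT | _PAGE_PWT,
    };

    for ( unsigned int i = 0; i < sizeof(flags) / sizeof(flags[0]); i++ )
    {
        unsigned int f = flags[i];
        unsigned int idx = (!!(f & _PAGE_PAT) << 2) |
                           (!!(f & _PAGE_PCD) << 1) |
                            !!(f & _PAGE_PWT);

        printf("pte cache bits 0x%02x -> PAT index %u -> %s\n",
               f, idx, pat_type[idx]);
    }
    return 0;
}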
diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index 1d080cffbe84..2e542050f65a 100644
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -331,6 +331,14 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t);
#define PAGE_CACHE_ATTRS (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)
+/* Memory types, encoded under Xen's choice of MSR_PAT. */
+#define _PAGE_WB ( 0)
+#define _PAGE_WT ( _PAGE_PWT)
+#define _PAGE_UCM ( _PAGE_PCD )
+#define _PAGE_UC ( _PAGE_PCD | _PAGE_PWT)
+#define _PAGE_WC (_PAGE_PAT )
+#define _PAGE_WP (_PAGE_PAT | _PAGE_PWT)
+
/*
* Debug option: Ensure that granted mappings are not implicitly unmapped.
* WARNING: This will need to be disabled to run OSes that use the spare PTE
@@ -349,8 +357,8 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t);
#define __PAGE_HYPERVISOR_RX (_PAGE_PRESENT | _PAGE_ACCESSED)
#define __PAGE_HYPERVISOR (__PAGE_HYPERVISOR_RX | \
_PAGE_DIRTY | _PAGE_RW)
-#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_PCD)
-#define __PAGE_HYPERVISOR_UC (__PAGE_HYPERVISOR | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_UCM)
+#define __PAGE_HYPERVISOR_UC (__PAGE_HYPERVISOR | _PAGE_UC)
#define __PAGE_HYPERVISOR_SHSTK (__PAGE_HYPERVISOR_RO | _PAGE_DIRTY)
#define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpages mappings */

View File

@ -1,213 +0,0 @@
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86: Don't change the cacheability of the directmap
Changeset 55f97f49b7ce ("x86: Change cache attributes of Xen 1:1 page mappings
in response to guest mapping requests") attempted to keep the cacheability
consistent between different mappings of the same page.
The reason wasn't described in the changelog, but it is understood to be in
regards to a concern over machine check exceptions, owing to errata when using
mixed cacheabilities. It did this primarily by updating Xen's mapping of the
page in the direct map when the guest mapped a page with reduced cacheability.
Unfortunately, the logic didn't actually prevent mixed cacheability from
occurring:
* A guest could map a page normally, and then map the same page with
different cacheability; nothing prevented this.
* The cacheability of the directmap was always latest-takes-precedence in
terms of guest requests.
* Grant-mapped frames with lesser cacheability didn't adjust the page's
cacheattr settings.
* The map_domain_page() function still unconditionally created WB mappings,
irrespective of the page's cacheattr settings.
Additionally, update_xen_mappings() had a bug where the alias calculation was
wrong for mfn's which were .init content, which should have been treated as
fully guest pages, not Xen pages.
Worse yet, the logic introduced a vulnerability whereby necessary
pagetable/segdesc adjustments made by Xen in the validation logic could become
non-coherent between the cache and main memory. The CPU could subsequently
operate on the stale value in the cache, rather than the safe value in main
memory.
The directmap contains primarily mappings of RAM. PAT/MTRR conflict
resolution is asymmetric, and generally for MTRR=WB ranges, PAT of lesser
cacheability resolves to being coherent. The special case is WC mappings,
which are non-coherent against MTRR=WB regions (except for fully-coherent
CPUs).
Xen must not have any WC cacheability in the directmap, to prevent Xen's
actions from creating non-coherency. (Guest actions creating non-coherency are
dealt with in subsequent patches.) As all memory types for MTRR=WB ranges
inter-operate coherently, leave Xen's directmap mappings as WB.
Only PV guests with access to devices can use reduced-cacheability mappings to
begin with, and they're trusted not to mount DoSs against the system anyway.
Drop PGC_cacheattr_{base,mask} entirely, and the logic to manipulate them.
Shift the later PGC_* constants up, to gain 3 extra bits in the main reference
count. Retain the check in get_page_from_l1e() for special_pages() because a
guest has no business using reduced cacheability on these.
This reverts changeset 55f97f49b7ce6c3520c555d19caac6cf3f9a5df0
This is CVE-2022-26363, part of XSA-402.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index c6429b0f749a..ab32d13a1a0d 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -783,28 +783,6 @@ bool is_iomem_page(mfn_t mfn)
return (page_get_owner(page) == dom_io);
}
-static int update_xen_mappings(unsigned long mfn, unsigned int cacheattr)
-{
- int err = 0;
- bool alias = mfn >= PFN_DOWN(xen_phys_start) &&
- mfn < PFN_UP(xen_phys_start + xen_virt_end - XEN_VIRT_START);
- unsigned long xen_va =
- XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT);
-
- if ( boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) )
- return 0;
-
- if ( unlikely(alias) && cacheattr )
- err = map_pages_to_xen(xen_va, _mfn(mfn), 1, 0);
- if ( !err )
- err = map_pages_to_xen((unsigned long)mfn_to_virt(mfn), _mfn(mfn), 1,
- PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
- if ( unlikely(alias) && !cacheattr && !err )
- err = map_pages_to_xen(xen_va, _mfn(mfn), 1, PAGE_HYPERVISOR);
-
- return err;
-}
-
#ifndef NDEBUG
struct mmio_emul_range_ctxt {
const struct domain *d;
@@ -1009,47 +987,14 @@ get_page_from_l1e(
goto could_not_pin;
}
- if ( pte_flags_to_cacheattr(l1f) !=
- ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) )
+ if ( (l1f & PAGE_CACHE_ATTRS) != _PAGE_WB && is_special_page(page) )
{
- unsigned long x, nx, y = page->count_info;
- unsigned long cacheattr = pte_flags_to_cacheattr(l1f);
- int err;
-
- if ( is_special_page(page) )
- {
- if ( write )
- put_page_type(page);
- put_page(page);
- gdprintk(XENLOG_WARNING,
- "Attempt to change cache attributes of Xen heap page\n");
- return -EACCES;
- }
-
- do {
- x = y;
- nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
- } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
-
- err = update_xen_mappings(mfn, cacheattr);
- if ( unlikely(err) )
- {
- cacheattr = y & PGC_cacheattr_mask;
- do {
- x = y;
- nx = (x & ~PGC_cacheattr_mask) | cacheattr;
- } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
-
- if ( write )
- put_page_type(page);
- put_page(page);
-
- gdprintk(XENLOG_WARNING, "Error updating mappings for mfn %" PRI_mfn
- " (pfn %" PRI_pfn ", from L1 entry %" PRIpte ") for d%d\n",
- mfn, get_gpfn_from_mfn(mfn),
- l1e_get_intpte(l1e), l1e_owner->domain_id);
- return err;
- }
+ if ( write )
+ put_page_type(page);
+ put_page(page);
+ gdprintk(XENLOG_WARNING,
+ "Attempt to change cache attributes of Xen heap page\n");
+ return -EACCES;
}
return 0;
@@ -2467,25 +2412,10 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
*/
static int cleanup_page_mappings(struct page_info *page)
{
- unsigned int cacheattr =
- (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base;
int rc = 0;
unsigned long mfn = mfn_x(page_to_mfn(page));
/*
- * If we've modified xen mappings as a result of guest cache
- * attributes, restore them to the "normal" state.
- */
- if ( unlikely(cacheattr) )
- {
- page->count_info &= ~PGC_cacheattr_mask;
-
- BUG_ON(is_special_page(page));
-
- rc = update_xen_mappings(mfn, 0);
- }
-
- /*
* If this may be in a PV domain's IOMMU, remove it.
*
* NB that writable xenheap pages have their type set and cleared by
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index cb9052749963..8a9a43bb0a9d 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -69,25 +69,22 @@
/* Set when is using a page as a page table */
#define _PGC_page_table PG_shift(3)
#define PGC_page_table PG_mask(1, 3)
- /* 3-bit PAT/PCD/PWT cache-attribute hint. */
-#define PGC_cacheattr_base PG_shift(6)
-#define PGC_cacheattr_mask PG_mask(7, 6)
/* Page is broken? */
-#define _PGC_broken PG_shift(7)
-#define PGC_broken PG_mask(1, 7)
+#define _PGC_broken PG_shift(4)
+#define PGC_broken PG_mask(1, 4)
/* Mutually-exclusive page states: { inuse, offlining, offlined, free }. */
-#define PGC_state PG_mask(3, 9)
-#define PGC_state_inuse PG_mask(0, 9)
-#define PGC_state_offlining PG_mask(1, 9)
-#define PGC_state_offlined PG_mask(2, 9)
-#define PGC_state_free PG_mask(3, 9)
+#define PGC_state PG_mask(3, 6)
+#define PGC_state_inuse PG_mask(0, 6)
+#define PGC_state_offlining PG_mask(1, 6)
+#define PGC_state_offlined PG_mask(2, 6)
+#define PGC_state_free PG_mask(3, 6)
#define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st)
/* Page is not reference counted (see below for caveats) */
-#define _PGC_extra PG_shift(10)
-#define PGC_extra PG_mask(1, 10)
+#define _PGC_extra PG_shift(7)
+#define PGC_extra PG_mask(1, 7)
/* Count of references to this frame. */
-#define PGC_count_width PG_shift(10)
+#define PGC_count_width PG_shift(7)
#define PGC_count_mask ((1UL<<PGC_count_width)-1)
/*

View File

@ -1,284 +0,0 @@
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86: Split cache_flush() out of cache_writeback()
Subsequent changes will want a fully flushing version.
Use the new helper rather than opencoding it in flush_area_local(). This
resolves an outstanding issue where the conditional sfence is on the wrong
side of the clflushopt loop. clflushopt is ordered with respect to older
stores, not to younger stores.
Rename gnttab_cache_flush()'s helper to avoid colliding in name.
grant_table.c can see the prototype from cache.h so the build fails
otherwise.
This is part of XSA-402.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Xen 4.16 and earlier:
* Also backport half of c/s 3330013e67396 "VT-d / x86: re-arrange cache
syncing" to split cache_writeback() out of the IOMMU logic, but without the
associated hooks changes.
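The ordering issue is easy to restate outside Xen: CLFLUSHOPT is only ordered
against older stores to the line being flushed, so the SFENCE has to come
after the whole flush loop, not before it. A minimal user-space sketch using
compiler intrinsics (assumes a 64-byte cache line and CLFLUSHOPT support;
build with something like -mclflushopt):

#include <immintrin.h>
#include <stdint.h>
#include <string.h>

#define CACHE_LINE 64   /* assumption; the real code reads x86_clflush_size */

/* Write back and invalidate [addr, addr+size), then fence - mirroring the
 * corrected ordering in cache_flush(): flush loop first, SFENCE last.     */
static void flush_range(const void *addr, size_t size)
{
    const char *p = (const char *)((uintptr_t)addr & ~(uintptr_t)(CACHE_LINE - 1));
    const char *end = (const char *)addr + size;

    for ( ; p < end; p += CACHE_LINE )
        _mm_clflushopt((void *)p);

    _mm_sfence();   /* order the CLFLUSHOPTs ahead of anything younger */
}

int main(void)
{
    static char buf[4096];

    memset(buf, 0x5a, sizeof(buf));
    flush_range(buf, sizeof(buf));
    return 0;
}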
diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
index 25798df50f54..0c912b8669f8 100644
--- a/xen/arch/x86/flushtlb.c
+++ b/xen/arch/x86/flushtlb.c
@@ -234,7 +234,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
if ( flags & FLUSH_CACHE )
{
const struct cpuinfo_x86 *c = &current_cpu_data;
- unsigned long i, sz = 0;
+ unsigned long sz = 0;
if ( order < (BITS_PER_LONG - PAGE_SHIFT) )
sz = 1UL << (order + PAGE_SHIFT);
@@ -244,13 +244,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
c->x86_clflush_size && c->x86_cache_size && sz &&
((sz >> 10) < c->x86_cache_size) )
{
- alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
- for ( i = 0; i < sz; i += c->x86_clflush_size )
- alternative_input(".byte " __stringify(NOP_DS_PREFIX) ";"
- " clflush %0",
- "data16 clflush %0", /* clflushopt */
- X86_FEATURE_CLFLUSHOPT,
- "m" (((const char *)va)[i]));
+ cache_flush(va, sz);
flags &= ~FLUSH_CACHE;
}
else
@@ -265,6 +259,80 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
return flags;
}
+void cache_flush(const void *addr, unsigned int size)
+{
+ /*
+ * This function may be called before current_cpu_data is established.
+ * Hence a fallback is needed to prevent the loop below becoming infinite.
+ */
+ unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16;
+ const void *end = addr + size;
+
+ addr -= (unsigned long)addr & (clflush_size - 1);
+ for ( ; addr < end; addr += clflush_size )
+ {
+ /*
+ * Note regarding the "ds" prefix use: it's faster to do a clflush
+ * + prefix than a clflush + nop, and hence the prefix is added instead
+ * of letting the alternative framework fill the gap by appending nops.
+ */
+ alternative_io("ds; clflush %[p]",
+ "data16 clflush %[p]", /* clflushopt */
+ X86_FEATURE_CLFLUSHOPT,
+ /* no outputs */,
+ [p] "m" (*(const char *)(addr)));
+ }
+
+ alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
+}
+
+void cache_writeback(const void *addr, unsigned int size)
+{
+ unsigned int clflush_size;
+ const void *end = addr + size;
+
+ /* Fall back to CLFLUSH{,OPT} when CLWB isn't available. */
+ if ( !boot_cpu_has(X86_FEATURE_CLWB) )
+ return cache_flush(addr, size);
+
+ /*
+ * This function may be called before current_cpu_data is established.
+ * Hence a fallback is needed to prevent the loop below becoming infinite.
+ */
+ clflush_size = current_cpu_data.x86_clflush_size ?: 16;
+ addr -= (unsigned long)addr & (clflush_size - 1);
+ for ( ; addr < end; addr += clflush_size )
+ {
+/*
+ * The arguments to a macro must not include preprocessor directives. Doing so
+ * results in undefined behavior, so we have to create some defines here in
+ * order to avoid it.
+ */
+#if defined(HAVE_AS_CLWB)
+# define CLWB_ENCODING "clwb %[p]"
+#elif defined(HAVE_AS_XSAVEOPT)
+# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */
+#else
+# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */
+#endif
+
+#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr))
+#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT)
+# define INPUT BASE_INPUT
+#else
+# define INPUT(addr) "a" (addr), BASE_INPUT(addr)
+#endif
+
+ asm volatile (CLWB_ENCODING :: INPUT(addr));
+
+#undef INPUT
+#undef BASE_INPUT
+#undef CLWB_ENCODING
+ }
+
+ asm volatile ("sfence" ::: "memory");
+}
+
unsigned int guest_flush_tlb_flags(const struct domain *d)
{
bool shadow = paging_mode_shadow(d);
diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
index 66f8ce71741c..4c742cd8fe81 100644
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -3431,7 +3431,7 @@ gnttab_swap_grant_ref(XEN_GUEST_HANDLE_PARAM(gnttab_swap_grant_ref_t) uop,
return 0;
}
-static int cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref)
+static int _cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref)
{
struct domain *d, *owner;
struct page_info *page;
@@ -3525,7 +3525,7 @@ gnttab_cache_flush(XEN_GUEST_HANDLE_PARAM(gnttab_cache_flush_t) uop,
return -EFAULT;
for ( ; ; )
{
- int ret = cache_flush(&op, cur_ref);
+ int ret = _cache_flush(&op, cur_ref);
if ( ret < 0 )
return ret;
diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
index 01e010a10d61..401079299725 100644
--- a/xen/drivers/passthrough/vtd/extern.h
+++ b/xen/drivers/passthrough/vtd/extern.h
@@ -76,7 +76,6 @@ int __must_check qinval_device_iotlb_sync(struct vtd_iommu *iommu,
struct pci_dev *pdev,
u16 did, u16 size, u64 addr);
-unsigned int get_cache_line_size(void);
void flush_all_cache(void);
uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node);
diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
index 8975c1de61bc..bc377c9bcfa4 100644
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -31,6 +31,7 @@
#include <xen/pci.h>
#include <xen/pci_regs.h>
#include <xen/keyhandler.h>
+#include <asm/cache.h>
#include <asm/msi.h>
#include <asm/nops.h>
#include <asm/irq.h>
@@ -206,54 +207,6 @@ static void check_cleanup_domid_map(const struct domain *d,
}
}
-static void sync_cache(const void *addr, unsigned int size)
-{
- static unsigned long clflush_size = 0;
- const void *end = addr + size;
-
- if ( clflush_size == 0 )
- clflush_size = get_cache_line_size();
-
- addr -= (unsigned long)addr & (clflush_size - 1);
- for ( ; addr < end; addr += clflush_size )
-/*
- * The arguments to a macro must not include preprocessor directives. Doing so
- * results in undefined behavior, so we have to create some defines here in
- * order to avoid it.
- */
-#if defined(HAVE_AS_CLWB)
-# define CLWB_ENCODING "clwb %[p]"
-#elif defined(HAVE_AS_XSAVEOPT)
-# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */
-#else
-# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */
-#endif
-
-#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr))
-#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT)
-# define INPUT BASE_INPUT
-#else
-# define INPUT(addr) "a" (addr), BASE_INPUT(addr)
-#endif
- /*
- * Note regarding the use of NOP_DS_PREFIX: it's faster to do a clflush
- * + prefix than a clflush + nop, and hence the prefix is added instead
- * of letting the alternative framework fill the gap by appending nops.
- */
- alternative_io_2(".byte " __stringify(NOP_DS_PREFIX) "; clflush %[p]",
- "data16 clflush %[p]", /* clflushopt */
- X86_FEATURE_CLFLUSHOPT,
- CLWB_ENCODING,
- X86_FEATURE_CLWB, /* no outputs */,
- INPUT(addr));
-#undef INPUT
-#undef BASE_INPUT
-#undef CLWB_ENCODING
-
- alternative_2("", "sfence", X86_FEATURE_CLFLUSHOPT,
- "sfence", X86_FEATURE_CLWB);
-}
-
/* Allocate page table, return its machine address */
uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node)
{
@@ -273,7 +226,7 @@ uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node)
clear_page(vaddr);
if ( (iommu_ops.init ? &iommu_ops : &vtd_ops)->sync_cache )
- sync_cache(vaddr, PAGE_SIZE);
+ cache_writeback(vaddr, PAGE_SIZE);
unmap_domain_page(vaddr);
cur_pg++;
}
@@ -1305,7 +1258,7 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd)
iommu->nr_pt_levels = agaw_to_level(agaw);
if ( !ecap_coherent(iommu->ecap) )
- vtd_ops.sync_cache = sync_cache;
+ vtd_ops.sync_cache = cache_writeback;
/* allocate domain id bitmap */
iommu->domid_bitmap = xzalloc_array(unsigned long, BITS_TO_LONGS(nr_dom));
diff --git a/xen/drivers/passthrough/vtd/x86/vtd.c b/xen/drivers/passthrough/vtd/x86/vtd.c
index 6681dccd6970..55f0faa521cb 100644
--- a/xen/drivers/passthrough/vtd/x86/vtd.c
+++ b/xen/drivers/passthrough/vtd/x86/vtd.c
@@ -47,11 +47,6 @@ void unmap_vtd_domain_page(const void *va)
unmap_domain_page(va);
}
-unsigned int get_cache_line_size(void)
-{
- return ((cpuid_ebx(1) >> 8) & 0xff) * 8;
-}
-
void flush_all_cache()
{
wbinvd();
diff --git a/xen/include/asm-x86/cache.h b/xen/include/asm-x86/cache.h
index 1f7173d8c72c..e4770efb22b9 100644
--- a/xen/include/asm-x86/cache.h
+++ b/xen/include/asm-x86/cache.h
@@ -11,4 +11,11 @@
#define __read_mostly __section(".data.read_mostly")
+#ifndef __ASSEMBLY__
+
+void cache_flush(const void *addr, unsigned int size);
+void cache_writeback(const void *addr, unsigned int size);
+
+#endif
+
#endif

View File

@ -1,83 +0,0 @@
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/amd: Work around CLFLUSH ordering on older parts
On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with everything,
including reads and writes to the address, and LFENCE/SFENCE instructions.
This creates a multitude of problematic corner cases, laid out in the manual.
Arrange to use MFENCE on both sides of the CLFLUSH to force proper ordering.
This is part of XSA-402.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
index a8e37dbb1f5c..b3b9a0df5fed 100644
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -812,6 +812,14 @@ static void init_amd(struct cpuinfo_x86 *c)
if (!cpu_has_lfence_dispatch)
__set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability);
+ /*
+ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with
+ * everything, including reads and writes to address, and
+ * LFENCE/SFENCE instructions.
+ */
+ if (!cpu_has_clflushopt)
+ setup_force_cpu_cap(X86_BUG_CLFLUSH_MFENCE);
+
switch(c->x86)
{
case 0xf ... 0x11:
diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
index 0c912b8669f8..dcbb4064012e 100644
--- a/xen/arch/x86/flushtlb.c
+++ b/xen/arch/x86/flushtlb.c
@@ -259,6 +259,13 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
return flags;
}
+/*
+ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with everything,
+ * including reads and writes to address, and LFENCE/SFENCE instructions.
+ *
+ * This function only works safely after alternatives have run. Luckily, at
+ * the time of writing, we don't flush the caches that early.
+ */
void cache_flush(const void *addr, unsigned int size)
{
/*
@@ -268,6 +275,8 @@ void cache_flush(const void *addr, unsigned int size)
unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16;
const void *end = addr + size;
+ alternative("", "mfence", X86_BUG_CLFLUSH_MFENCE);
+
addr -= (unsigned long)addr & (clflush_size - 1);
for ( ; addr < end; addr += clflush_size )
{
@@ -283,7 +292,9 @@ void cache_flush(const void *addr, unsigned int size)
[p] "m" (*(const char *)(addr)));
}
- alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
+ alternative_2("",
+ "sfence", X86_FEATURE_CLFLUSHOPT,
+ "mfence", X86_BUG_CLFLUSH_MFENCE);
}
void cache_writeback(const void *addr, unsigned int size)
diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h
index 7413febd7ad8..ff3157d52d13 100644
--- a/xen/include/asm-x86/cpufeatures.h
+++ b/xen/include/asm-x86/cpufeatures.h
@@ -47,6 +47,7 @@ XEN_CPUFEATURE(XEN_IBT, X86_SYNTH(27)) /* Xen uses CET Indirect Branch
#define X86_BUG_FPU_PTRS X86_BUG( 0) /* (F)X{SAVE,RSTOR} doesn't save/restore FOP/FIP/FDP. */
#define X86_BUG_NULL_SEG X86_BUG( 1) /* NULL-ing a selector preserves the base and limit. */
+#define X86_BUG_CLFLUSH_MFENCE X86_BUG( 2) /* MFENCE needed to serialise CLFLUSH */
/* Total number of capability words, inc synth and bug words. */
#define NCAPINTS (FSCAPINTS + X86_NR_SYNTH + X86_NR_BUG) /* N 32-bit words worth of info */

View File

@ -1,148 +0,0 @@
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/pv: Track and flush non-coherent mappings of RAM
There are legitimate uses of WC mappings of RAM, e.g. for DMA buffers with
devices that make non-coherent writes. The Linux sound subsystem makes
extensive use of this technique.
For such usecases, the guest's DMA buffer is mapped and consistently used as
WC, and Xen doesn't interact with the buffer.
However, a mischievous guest can use WC mappings to deliberately create
non-coherency between the cache and RAM, and use this to trick Xen into
validating a pagetable which isn't actually safe.
Allocate a new PGT_non_coherent to track the non-coherency of mappings. Set
it whenever a non-coherent writeable mapping is created. If the page is used
as anything other than PGT_writable_page, force a cache flush before
validation. Also force a cache flush before the page is returned to the heap.
This is CVE-2022-26364, part of XSA-402.
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
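To make the new check concrete: with Xen's PAT layout (the _PAGE_* constants
introduced earlier in this series), only a writeable WC mapping trips the
condition added to get_page_from_l1e(). A small stand-alone sketch using the
architectural PTE bit values (illustrative only, not Xen code):

#include <stdio.h>

#define _PAGE_RW  0x002u
#define _PAGE_PWT 0x008u
#define _PAGE_PCD 0x010u
#define _PAGE_PAT 0x080u

#define PAGE_CACHE_ATTRS (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)
#define _PAGE_WC         _PAGE_PAT                /* per Xen's PAT layout */

int main(void)
{
    static const struct { const char *name; unsigned int attrs; } mt[] = {
        { "WB",  0 },
        { "WT",  _PAGE_PWT },
        { "UC-", _PAGE_PCD },
        { "UC",  _PAGE_PCD | _PAGE_PWT },
        { "WC",  _PAGE_PAT },
        { "WP",  _PAGE_PAT | _PAGE_PWT },
    };

    for ( unsigned int i = 0; i < sizeof(mt) / sizeof(mt[0]); i++ )
    {
        unsigned int l1f = mt[i].attrs | _PAGE_RW;   /* a writeable mapping */
        int flagged = (l1f & (PAGE_CACHE_ATTRS | _PAGE_RW)) ==
                      (_PAGE_WC | _PAGE_RW);

        printf("RW + %-3s -> PGT_non_coherent %s\n",
               mt[i].name, flagged ? "set" : "not set");
    }
    return 0;
}

Only the WC line is flagged, matching the reasoning in the earlier directmap
patch that WC is the one PAT type which is non-coherent against MTRR=WB RAM.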
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index ab32d13a1a0d..bab9624fabb7 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -997,6 +997,15 @@ get_page_from_l1e(
return -EACCES;
}
+ /*
+ * Track writeable non-coherent mappings to RAM pages, to trigger a cache
+ * flush later if the target is used as anything but a PGT_writeable page.
+ * We care about all writeable mappings, including foreign mappings.
+ */
+ if ( !boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) &&
+ (l1f & (PAGE_CACHE_ATTRS | _PAGE_RW)) == (_PAGE_WC | _PAGE_RW) )
+ set_bit(_PGT_non_coherent, &page->u.inuse.type_info);
+
return 0;
could_not_pin:
@@ -2454,6 +2463,19 @@ static int cleanup_page_mappings(struct page_info *page)
}
}
+ /*
+ * Flush the cache if there were previously non-coherent writeable
+ * mappings of this page. This forces the page to be coherent before it
+ * is freed back to the heap.
+ */
+ if ( __test_and_clear_bit(_PGT_non_coherent, &page->u.inuse.type_info) )
+ {
+ void *addr = __map_domain_page(page);
+
+ cache_flush(addr, PAGE_SIZE);
+ unmap_domain_page(addr);
+ }
+
return rc;
}
@@ -3028,6 +3050,22 @@ static int _get_page_type(struct page_info *page, unsigned long type,
if ( unlikely(!(nx & PGT_validated)) )
{
/*
+ * Flush the cache if there were previously non-coherent mappings of
+ * this page, and we're trying to use it as anything other than a
+ * writeable page. This forces the page to be coherent before we
+ * validate its contents for safety.
+ */
+ if ( (nx & PGT_non_coherent) && type != PGT_writable_page )
+ {
+ void *addr = __map_domain_page(page);
+
+ cache_flush(addr, PAGE_SIZE);
+ unmap_domain_page(addr);
+
+ page->u.inuse.type_info &= ~PGT_non_coherent;
+ }
+
+ /*
* No special validation needed for writable or shared pages. Page
* tables and GDT/LDT need to have their contents audited.
*
diff --git a/xen/arch/x86/pv/grant_table.c b/xen/arch/x86/pv/grant_table.c
index 0325618c9883..81c72e61ed55 100644
--- a/xen/arch/x86/pv/grant_table.c
+++ b/xen/arch/x86/pv/grant_table.c
@@ -109,7 +109,17 @@ int create_grant_pv_mapping(uint64_t addr, mfn_t frame,
ol1e = *pl1e;
if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) )
+ {
+ /*
+ * We always create mappings in this path. However, our caller,
+ * map_grant_ref(), only passes potentially non-zero cache_flags for
+ * MMIO frames, so this path doesn't create non-coherent mappings of
+ * RAM frames and there's no need to calculate PGT_non_coherent.
+ */
+ ASSERT(!cache_flags || is_iomem_page(frame));
+
rc = GNTST_okay;
+ }
out_unlock:
page_unlock(page);
@@ -294,7 +304,18 @@ int replace_grant_pv_mapping(uint64_t addr, mfn_t frame,
l1e_get_flags(ol1e), addr, grant_pte_flags);
if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) )
+ {
+ /*
+ * Generally, replace_grant_pv_mapping() is used to destroy mappings
+ * (n1le = l1e_empty()), but it can be a present mapping on the
+ * GNTABOP_unmap_and_replace path.
+ *
+ * In such cases, the PTE is fully transplanted from its old location
+ * via steal_linear_addr(), so we need not perform PGT_non_coherent
+ * checking here.
+ */
rc = GNTST_okay;
+ }
out_unlock:
page_unlock(page);
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 8a9a43bb0a9d..7464167ae192 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -53,8 +53,12 @@
#define _PGT_partial PG_shift(8)
#define PGT_partial PG_mask(1, 8)
+/* Has this page been mapped writeable with a non-coherent memory type? */
+#define _PGT_non_coherent PG_shift(9)
+#define PGT_non_coherent PG_mask(1, 9)
+
/* Count of uses of this frame as its current type. */
-#define PGT_count_width PG_shift(8)
+#define PGT_count_width PG_shift(9)
#define PGT_count_mask ((1UL<<PGT_count_width)-1)
/* Are the 'type mask' bits identical? */

View File

@ -0,0 +1,59 @@
From 4b4359122a414cc15156e13e3805988b71ff9da0 Mon Sep 17 00:00:00 2001
From: Julien Grall <jgrall@amazon.com>
Date: Mon, 6 Jun 2022 06:17:25 +0000
Subject: [PATCH 1/2] xen/arm: p2m: Prevent adding mapping when domain is dying
During the domain destroy process, the domain remains accessible until it
is fully destroyed, and so does its P2M, because we don't bail out early
when is_dying is non-zero. If a domain has permission to modify another
domain's P2M (i.e. dom0, or a stubdomain), foreign mappings can therefore
be added past relinquish_p2m_mapping().
We therefore need to prevent mappings from being added while the domain is
dying. This commit does so by adding a d->is_dying check to p2m_set_entry().
It also strengthens the check in relinquish_p2m_mapping() to make sure that
no mappings can be added to the P2M after the P2M lock is released.
This is part of CVE-2022-33746 / XSA-410.
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Tested-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
xen/arch/arm/p2m.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index fb71fa4c1c90..cbeff90f4371 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -1093,6 +1093,15 @@ int p2m_set_entry(struct p2m_domain *p2m,
{
int rc = 0;
+ /*
+ * Any reference taken by the P2M mappings (e.g. foreign mapping) will
+ * be dropped in relinquish_p2m_mapping(). As the P2M will still
+ * be accessible after, we need to prevent mapping to be added when the
+ * domain is dying.
+ */
+ if ( unlikely(p2m->domain->is_dying) )
+ return -ENOMEM;
+
while ( nr )
{
unsigned long mask;
@@ -1610,6 +1619,8 @@ int relinquish_p2m_mapping(struct domain *d)
unsigned int order;
gfn_t start, end;
+ BUG_ON(!d->is_dying);
+ /* No mappings can be added in the P2M after the P2M lock is released. */
p2m_write_lock(p2m);
start = p2m->lowest_mapped_gfn;
--
2.37.1

View File

@ -0,0 +1,165 @@
From 0d5846490348fa09a0d0915d7c795685a016ce10 Mon Sep 17 00:00:00 2001
From: Julien Grall <jgrall@amazon.com>
Date: Mon, 6 Jun 2022 06:17:26 +0000
Subject: [PATCH 2/2] xen/arm: p2m: Handle preemption when freeing intermediate
page tables
At the moment the P2M page tables are freed when the domain structure is
freed, without any preemption. As the P2M is quite large, iterating through
it may take more time than is reasonable without intermediate preemption
(to run softirqs and perhaps the scheduler).
Split p2m_teardown() into two parts: one preemptible, called when
relinquishing the resources, and one non-preemptible, called when freeing
the domain structure.
As we are now freeing the P2M pages early, we also need to prevent further
allocation if someone calls p2m_set_entry() past p2m_teardown() (I wasn't
able to prove this will never happen). This is done by the domain->is_dying
check added to p2m_set_entry() in the previous patch.
Similarly, we want to make sure that no one can access the freed pages, so
the root is cleared before the pages are freed.
This is part of CVE-2022-33746 / XSA-410.
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Tested-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
xen/arch/arm/domain.c | 10 +++++++--
xen/arch/arm/p2m.c | 47 ++++++++++++++++++++++++++++++++++++---
xen/include/asm-arm/p2m.h | 13 +++++++++--
3 files changed, 63 insertions(+), 7 deletions(-)
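The preemption pattern used below is generic: free a bounded chunk of work,
poll for pending preemption, and return -ERESTART so the continuation in
domain_relinquish_resources() re-enters the same phase later. A stand-alone
sketch of that shape (stubbed preemption check, plain singly-linked list;
names are illustrative, not the Xen API):

#include <stdio.h>
#include <stdlib.h>

#define ERESTART_RC (-1)                 /* stand-in for -ERESTART */

struct page { struct page *next; };

static struct page *page_list;           /* stand-in for p2m->pages */
static unsigned long polls;

static int preempt_check(void)           /* stand-in for hypercall_preempt_check() */
{
    return ++polls % 3 == 0;             /* pretend work is occasionally pending */
}

/* Free the list in chunks; the caller re-invokes until 0 is returned. */
static int teardown(void)
{
    unsigned long count = 0;
    struct page *pg;

    while ( (pg = page_list) != NULL )
    {
        page_list = pg->next;
        free(pg);

        /* Arbitrarily offer to preempt every 512 iterations. */
        if ( !(++count % 512) && preempt_check() )
            return ERESTART_RC;
    }
    return 0;
}

int main(void)
{
    int rc, passes = 0;

    for ( int i = 0; i < 2000; i++ )     /* build a dummy list to tear down */
    {
        struct page *pg = malloc(sizeof(*pg));

        if ( !pg )
            break;
        pg->next = page_list;
        page_list = pg;
    }

    do {
        rc = teardown();
        passes++;
    } while ( rc == ERESTART_RC );

    printf("torn down in %d pass(es)\n", passes);
    return 0;
}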
diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 96e1b235501d..2694c39127c5 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -789,10 +789,10 @@ fail:
void arch_domain_destroy(struct domain *d)
{
/* IOMMU page table is shared with P2M, always call
- * iommu_domain_destroy() before p2m_teardown().
+ * iommu_domain_destroy() before p2m_final_teardown().
*/
iommu_domain_destroy(d);
- p2m_teardown(d);
+ p2m_final_teardown(d);
domain_vgic_free(d);
domain_vuart_free(d);
free_xenheap_page(d->shared_info);
@@ -996,6 +996,7 @@ enum {
PROG_xen,
PROG_page,
PROG_mapping,
+ PROG_p2m,
PROG_done,
};
@@ -1056,6 +1057,11 @@ int domain_relinquish_resources(struct domain *d)
if ( ret )
return ret;
+ PROGRESS(p2m):
+ ret = p2m_teardown(d);
+ if ( ret )
+ return ret;
+
PROGRESS(done):
break;
diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index cbeff90f4371..3bcd1e897e88 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -1527,17 +1527,58 @@ static void p2m_free_vmid(struct domain *d)
spin_unlock(&vmid_alloc_lock);
}
-void p2m_teardown(struct domain *d)
+int p2m_teardown(struct domain *d)
{
struct p2m_domain *p2m = p2m_get_hostp2m(d);
+ unsigned long count = 0;
struct page_info *pg;
+ unsigned int i;
+ int rc = 0;
+
+ p2m_write_lock(p2m);
+
+ /*
+ * We are about to free the intermediate page-tables, so clear the
+ * root to prevent any walk to use them.
+ */
+ for ( i = 0; i < P2M_ROOT_PAGES; i++ )
+ clear_and_clean_page(p2m->root + i);
+
+ /*
+ * The domain will not be scheduled anymore, so in theory we should
+ * not need to flush the TLBs. Do it for safety purpose.
+ *
+ * Note that all the devices have already been de-assigned. So we don't
+ * need to flush the IOMMU TLB here.
+ */
+ p2m_force_tlb_flush_sync(p2m);
+
+ while ( (pg = page_list_remove_head(&p2m->pages)) )
+ {
+ free_domheap_page(pg);
+ count++;
+ /* Arbitrarily preempt every 512 iterations */
+ if ( !(count % 512) && hypercall_preempt_check() )
+ {
+ rc = -ERESTART;
+ break;
+ }
+ }
+
+ p2m_write_unlock(p2m);
+
+ return rc;
+}
+
+void p2m_final_teardown(struct domain *d)
+{
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
/* p2m not actually initialized */
if ( !p2m->domain )
return;
- while ( (pg = page_list_remove_head(&p2m->pages)) )
- free_domheap_page(pg);
+ ASSERT(page_list_empty(&p2m->pages));
if ( p2m->root )
free_domheap_pages(p2m->root, P2M_ROOT_ORDER);
diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
index 8f11d9c97b5d..b3ba83283e11 100644
--- a/xen/include/asm-arm/p2m.h
+++ b/xen/include/asm-arm/p2m.h
@@ -192,8 +192,17 @@ void setup_virt_paging(void);
/* Init the datastructures for later use by the p2m code */
int p2m_init(struct domain *d);
-/* Return all the p2m resources to Xen. */
-void p2m_teardown(struct domain *d);
+/*
+ * The P2M resources are freed in two parts:
+ * - p2m_teardown() will be called when relinquish the resources. It
+ * will free large resources (e.g. intermediate page-tables) that
+ * requires preemption.
+ * - p2m_final_teardown() will be called when domain struct is been
+ * freed. This *cannot* be preempted and therefore one small
+ * resources should be freed here.
+ */
+int p2m_teardown(struct domain *d);
+void p2m_final_teardown(struct domain *d);
/*
* Remove mapping refcount on each mapping page in the p2m
--
2.37.1

View File

@ -0,0 +1,113 @@
From: Roger Pau Monné <roger.pau@citrix.com>
Subject: x86/p2m: add option to skip root pagetable removal in p2m_teardown()
Add a new parameter to p2m_teardown() in order to select whether the
root page table should also be freed. Note that all users are
adjusted to pass the parameter to remove the root page tables, so
behavior is not modified.
No functional change intended.
This is part of CVE-2022-33746 / XSA-410.
Suggested-by: Julien Grall <julien@xen.org>
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -574,7 +574,7 @@ int p2m_init(struct domain *d);
int p2m_alloc_table(struct p2m_domain *p2m);
/* Return all the p2m resources to Xen. */
-void p2m_teardown(struct p2m_domain *p2m);
+void p2m_teardown(struct p2m_domain *p2m, bool remove_root);
void p2m_final_teardown(struct domain *d);
/* Add a page to a domain's p2m table */
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -541,18 +541,18 @@ void hap_final_teardown(struct domain *d
}
for ( i = 0; i < MAX_ALTP2M; i++ )
- p2m_teardown(d->arch.altp2m_p2m[i]);
+ p2m_teardown(d->arch.altp2m_p2m[i], true);
}
/* Destroy nestedp2m's first */
for (i = 0; i < MAX_NESTEDP2M; i++) {
- p2m_teardown(d->arch.nested_p2m[i]);
+ p2m_teardown(d->arch.nested_p2m[i], true);
}
if ( d->arch.paging.hap.total_pages != 0 )
hap_teardown(d, NULL);
- p2m_teardown(p2m_get_hostp2m(d));
+ p2m_teardown(p2m_get_hostp2m(d), true);
/* Free any memory that the p2m teardown released */
paging_lock(d);
hap_set_allocation(d, 0, NULL);
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -749,11 +749,11 @@ int p2m_alloc_table(struct p2m_domain *p
* hvm fixme: when adding support for pvh non-hardware domains, this path must
* cleanup any foreign p2m types (release refcnts on them).
*/
-void p2m_teardown(struct p2m_domain *p2m)
+void p2m_teardown(struct p2m_domain *p2m, bool remove_root)
/* Return all the p2m pages to Xen.
* We know we don't have any extra mappings to these pages */
{
- struct page_info *pg;
+ struct page_info *pg, *root_pg = NULL;
struct domain *d;
if (p2m == NULL)
@@ -763,10 +763,22 @@ void p2m_teardown(struct p2m_domain *p2m
p2m_lock(p2m);
ASSERT(atomic_read(&d->shr_pages) == 0);
- p2m->phys_table = pagetable_null();
+
+ if ( remove_root )
+ p2m->phys_table = pagetable_null();
+ else if ( !pagetable_is_null(p2m->phys_table) )
+ {
+ root_pg = pagetable_get_page(p2m->phys_table);
+ clear_domain_page(pagetable_get_mfn(p2m->phys_table));
+ }
while ( (pg = page_list_remove_head(&p2m->pages)) )
- d->arch.paging.free_page(d, pg);
+ if ( pg != root_pg )
+ d->arch.paging.free_page(d, pg);
+
+ if ( root_pg )
+ page_list_add(root_pg, &p2m->pages);
+
p2m_unlock(p2m);
}
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -2701,7 +2701,7 @@ int shadow_enable(struct domain *d, u32
paging_unlock(d);
out_unlocked:
if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) )
- p2m_teardown(p2m);
+ p2m_teardown(p2m, true);
if ( rv != 0 && pg != NULL )
{
pg->count_info &= ~PGC_count_mask;
@@ -2866,7 +2866,7 @@ void shadow_final_teardown(struct domain
shadow_teardown(d, NULL);
/* It is now safe to pull down the p2m map. */
- p2m_teardown(p2m_get_hostp2m(d));
+ p2m_teardown(p2m_get_hostp2m(d), true);
/* Free any shadow memory that the p2m teardown released */
paging_lock(d);
shadow_set_allocation(d, 0, NULL);

View File

@ -0,0 +1,62 @@
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/HAP: adjust monitor table related error handling
hap_make_monitor_table() will return INVALID_MFN if it encounters an
error condition, but hap_update_paging_modes() wasn't handling this
value, resulting in an inappropriate value being stored in
monitor_table. This would subsequently misguide at least
hap_vcpu_teardown(). Avoid this by bailing early.
Further, when a domain has/was already crashed or (perhaps less
important as there's no such path known to lead here) is already dying,
avoid calling domain_crash() on it again - that's at best confusing.
This is part of CVE-2022-33746 / XSA-410.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -39,6 +39,7 @@
#include <asm/domain.h>
#include <xen/numa.h>
#include <asm/hvm/nestedhvm.h>
+#include <public/sched.h>
#include "private.h"
@@ -405,8 +406,13 @@ static mfn_t hap_make_monitor_table(stru
return m4mfn;
oom:
- printk(XENLOG_G_ERR "out of memory building monitor pagetable\n");
- domain_crash(d);
+ if ( !d->is_dying &&
+ (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) )
+ {
+ printk(XENLOG_G_ERR "%pd: out of memory building monitor pagetable\n",
+ d);
+ domain_crash(d);
+ }
return INVALID_MFN;
}
@@ -766,6 +772,9 @@ static void hap_update_paging_modes(stru
if ( pagetable_is_null(v->arch.hvm.monitor_table) )
{
mfn_t mmfn = hap_make_monitor_table(v);
+
+ if ( mfn_eq(mmfn, INVALID_MFN) )
+ goto unlock;
v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn);
make_cr3(v, mmfn);
hvm_update_host_cr3(v);
@@ -774,6 +783,7 @@ static void hap_update_paging_modes(stru
/* CR3 is effectively updated by a mode change. Flush ASIDs, etc. */
hap_update_cr3(v, 0, false);
+ unlock:
paging_unlock(d);
put_gfn(d, cr3_gfn);
}

View File

@ -0,0 +1,60 @@
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/shadow: tolerate failure of sh_set_toplevel_shadow()
Subsequently sh_set_toplevel_shadow() will be adjusted to install a
blank entry in case prealloc fails. There are, in fact, pre-existing
error paths which would put in place a blank entry. The 4- and 2-level
code in sh_update_cr3(), however, assume the top level entry to be
valid.
Hence bail from the function in the unlikely event that it's not. Note
that 3-level logic works differently: In particular a guest is free to
supply a PDPTR pointing at 4 non-present (or otherwise deemed invalid)
entries. The guest will crash, but we already cope with that.
Really mfn_valid() is likely wrong to use in sh_set_toplevel_shadow(),
and it should instead be !mfn_eq(gmfn, INVALID_MFN). Avoid such a change
in security context, but add a respective assertion.
This is part of CVE-2022-33746 / XSA-410.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -2516,6 +2516,7 @@ void sh_set_toplevel_shadow(struct vcpu
/* Now figure out the new contents: is this a valid guest MFN? */
if ( !mfn_valid(gmfn) )
{
+ ASSERT(mfn_eq(gmfn, INVALID_MFN));
new_entry = pagetable_null();
goto install_new_entry;
}
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -3312,6 +3312,11 @@ sh_update_cr3(struct vcpu *v, int do_loc
if ( sh_remove_write_access(d, gmfn, 4, 0) != 0 )
guest_flush_tlb_mask(d, d->dirty_cpumask);
sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow, sh_make_shadow);
+ if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) )
+ {
+ ASSERT(d->is_dying || d->is_shutting_down);
+ return;
+ }
if ( !shadow_mode_external(d) && !is_pv_32bit_domain(d) )
{
mfn_t smfn = pagetable_get_mfn(v->arch.paging.shadow.shadow_table[0]);
@@ -3370,6 +3375,11 @@ sh_update_cr3(struct vcpu *v, int do_loc
if ( sh_remove_write_access(d, gmfn, 2, 0) != 0 )
guest_flush_tlb_mask(d, d->dirty_cpumask);
sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow, sh_make_shadow);
+ if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) )
+ {
+ ASSERT(d->is_dying || d->is_shutting_down);
+ return;
+ }
#else
#error This should never happen
#endif

View File

@ -0,0 +1,255 @@
From: Roger Pau Monné <roger.pau@citrix.com>
Subject: x86/shadow: tolerate failure in shadow_prealloc()
Prevent _shadow_prealloc() from calling BUG() when unable to fulfill
the pre-allocation and instead return true/false. Modify
shadow_prealloc() to crash the domain on allocation failure (if the
domain is not already dying), as shadow cannot operate normally after
that. Modify callers to also gracefully handle {_,}shadow_prealloc()
failing to fulfill the request.
Note this in turn requires adjusting the callers of
sh_make_monitor_table() also to handle it returning INVALID_MFN.
sh_update_paging_modes() is also modified to add additional error
paths in case of allocation failure, some of those will return with
null monitor page tables (and the domain likely crashed). This is no
different than current error paths, but the newly introduced ones are
more likely to trigger.
The now added failure points in sh_update_paging_modes() also require
that on some error return paths the previous structures are cleared,
and thus monitor table is null.
While there adjust the 'type' parameter type of shadow_prealloc() to
unsigned int rather than u32.
This is part of CVE-2022-33746 / XSA-410.
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -36,6 +36,7 @@
#include <asm/flushtlb.h>
#include <asm/shadow.h>
#include <xen/numa.h>
+#include <public/sched.h>
#include "private.h"
DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags);
@@ -928,14 +929,15 @@ static inline void trace_shadow_prealloc
/* Make sure there are at least count order-sized pages
* available in the shadow page pool. */
-static void _shadow_prealloc(struct domain *d, unsigned int pages)
+static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages)
{
struct vcpu *v;
struct page_info *sp, *t;
mfn_t smfn;
int i;
- if ( d->arch.paging.shadow.free_pages >= pages ) return;
+ if ( d->arch.paging.shadow.free_pages >= pages )
+ return true;
/* Shouldn't have enabled shadows if we've no vcpus. */
ASSERT(d->vcpu && d->vcpu[0]);
@@ -951,7 +953,8 @@ static void _shadow_prealloc(struct doma
sh_unpin(d, smfn);
/* See if that freed up enough space */
- if ( d->arch.paging.shadow.free_pages >= pages ) return;
+ if ( d->arch.paging.shadow.free_pages >= pages )
+ return true;
}
/* Stage two: all shadow pages are in use in hierarchies that are
@@ -974,7 +977,7 @@ static void _shadow_prealloc(struct doma
if ( d->arch.paging.shadow.free_pages >= pages )
{
guest_flush_tlb_mask(d, d->dirty_cpumask);
- return;
+ return true;
}
}
}
@@ -987,7 +990,12 @@ static void _shadow_prealloc(struct doma
d->arch.paging.shadow.total_pages,
d->arch.paging.shadow.free_pages,
d->arch.paging.shadow.p2m_pages);
- BUG();
+
+ ASSERT(d->is_dying);
+
+ guest_flush_tlb_mask(d, d->dirty_cpumask);
+
+ return false;
}
/* Make sure there are at least count pages of the order according to
@@ -995,9 +1003,19 @@ static void _shadow_prealloc(struct doma
* This must be called before any calls to shadow_alloc(). Since this
* will free existing shadows to make room, it must be called early enough
* to avoid freeing shadows that the caller is currently working on. */
-void shadow_prealloc(struct domain *d, u32 type, unsigned int count)
+bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count)
{
- return _shadow_prealloc(d, shadow_size(type) * count);
+ bool ret = _shadow_prealloc(d, shadow_size(type) * count);
+
+ if ( !ret && !d->is_dying &&
+ (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) )
+ /*
+ * Failing to allocate memory required for shadow usage can only result in
+ * a domain crash, do it here rather that relying on every caller to do it.
+ */
+ domain_crash(d);
+
+ return ret;
}
/* Deliberately free all the memory we can: this will tear down all of
@@ -1218,7 +1236,7 @@ void shadow_free(struct domain *d, mfn_t
static struct page_info *
shadow_alloc_p2m_page(struct domain *d)
{
- struct page_info *pg;
+ struct page_info *pg = NULL;
/* This is called both from the p2m code (which never holds the
* paging lock) and the log-dirty code (which always does). */
@@ -1236,16 +1254,18 @@ shadow_alloc_p2m_page(struct domain *d)
d->arch.paging.shadow.p2m_pages,
shadow_min_acceptable_pages(d));
}
- paging_unlock(d);
- return NULL;
+ goto out;
}
- shadow_prealloc(d, SH_type_p2m_table, 1);
+ if ( !shadow_prealloc(d, SH_type_p2m_table, 1) )
+ goto out;
+
pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
d->arch.paging.shadow.p2m_pages++;
d->arch.paging.shadow.total_pages--;
ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask));
+ out:
paging_unlock(d);
return pg;
@@ -1336,7 +1356,9 @@ int shadow_set_allocation(struct domain
else if ( d->arch.paging.shadow.total_pages > pages )
{
/* Need to return memory to domheap */
- _shadow_prealloc(d, 1);
+ if ( !_shadow_prealloc(d, 1) )
+ return -ENOMEM;
+
sp = page_list_remove_head(&d->arch.paging.shadow.freelist);
ASSERT(sp);
/*
@@ -2334,12 +2356,13 @@ static void sh_update_paging_modes(struc
if ( mfn_eq(v->arch.paging.shadow.oos_snapshot[0], INVALID_MFN) )
{
int i;
+
+ if ( !shadow_prealloc(d, SH_type_oos_snapshot, SHADOW_OOS_PAGES) )
+ return;
+
for(i = 0; i < SHADOW_OOS_PAGES; i++)
- {
- shadow_prealloc(d, SH_type_oos_snapshot, 1);
v->arch.paging.shadow.oos_snapshot[i] =
shadow_alloc(d, SH_type_oos_snapshot, 0);
- }
}
#endif /* OOS */
@@ -2403,6 +2426,9 @@ static void sh_update_paging_modes(struc
mfn_t mmfn = sh_make_monitor_table(
v, v->arch.paging.mode->shadow.shadow_levels);
+ if ( mfn_eq(mmfn, INVALID_MFN) )
+ return;
+
v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn);
make_cr3(v, mmfn);
hvm_update_host_cr3(v);
@@ -2441,6 +2467,12 @@ static void sh_update_paging_modes(struc
v->arch.hvm.monitor_table = pagetable_null();
new_mfn = sh_make_monitor_table(
v, v->arch.paging.mode->shadow.shadow_levels);
+ if ( mfn_eq(new_mfn, INVALID_MFN) )
+ {
+ sh_destroy_monitor_table(v, old_mfn,
+ old_mode->shadow.shadow_levels);
+ return;
+ }
v->arch.hvm.monitor_table = pagetable_from_mfn(new_mfn);
SHADOW_PRINTK("new monitor table %"PRI_mfn "\n",
mfn_x(new_mfn));
@@ -2526,7 +2558,12 @@ void sh_set_toplevel_shadow(struct vcpu
if ( !mfn_valid(smfn) )
{
/* Make sure there's enough free shadow memory. */
- shadow_prealloc(d, root_type, 1);
+ if ( !shadow_prealloc(d, root_type, 1) )
+ {
+ new_entry = pagetable_null();
+ goto install_new_entry;
+ }
+
/* Shadow the page. */
smfn = make_shadow(v, gmfn, root_type);
}
--- a/xen/arch/x86/mm/shadow/hvm.c
+++ b/xen/arch/x86/mm/shadow/hvm.c
@@ -700,7 +700,9 @@ mfn_t sh_make_monitor_table(const struct
ASSERT(!pagetable_get_pfn(v->arch.hvm.monitor_table));
/* Guarantee we can get the memory we need */
- shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS);
+ if ( !shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS) )
+ return INVALID_MFN;
+
m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
mfn_to_page(m4mfn)->shadow_flags = 4;
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -2440,9 +2440,14 @@ static int sh_page_fault(struct vcpu *v,
* Preallocate shadow pages *before* removing writable accesses
* otherwhise an OOS L1 might be demoted and promoted again with
* writable mappings. */
- shadow_prealloc(d,
- SH_type_l1_shadow,
- GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1);
+ if ( !shadow_prealloc(d, SH_type_l1_shadow,
+ GUEST_PAGING_LEVELS < 4
+ ? 1 : GUEST_PAGING_LEVELS - 1) )
+ {
+ paging_unlock(d);
+ put_gfn(d, gfn_x(gfn));
+ return 0;
+ }
rc = gw_remove_write_accesses(v, va, &gw);
--- a/xen/arch/x86/mm/shadow/private.h
+++ b/xen/arch/x86/mm/shadow/private.h
@@ -383,7 +383,8 @@ void shadow_promote(struct domain *d, mf
void shadow_demote(struct domain *d, mfn_t gmfn, u32 type);
/* Shadow page allocation functions */
-void shadow_prealloc(struct domain *d, u32 shadow_type, unsigned int count);
+bool __must_check shadow_prealloc(struct domain *d, unsigned int shadow_type,
+ unsigned int count);
mfn_t shadow_alloc(struct domain *d,
u32 shadow_type,
unsigned long backpointer);


@ -0,0 +1,82 @@
From: Roger Pau Monné <roger.pau@citrix.com>
Subject: x86/p2m: refuse new allocations for dying domains
This will in particular prevent any attempts to add entries to the p2m,
once - in a subsequent change - non-root entries have been removed.
This is part of CVE-2022-33746 / XSA-410.
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
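The whole change amounts to placing an is_dying guard at each allocation entry point, plus suppressing the one-off warning for dying domains; a minimal standalone sketch of that guard follows, with made-up types rather than the real hap_alloc()/shadow_alloc_p2m_page() signatures.
/* Standalone sketch: refuse pool allocations once the owner is dying, so
 * teardown never races with new pages being handed out.  The struct and
 * helpers are illustrative stand-ins only. */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct dom {
    bool is_dying;
    bool alloc_failed_logged;
    unsigned int free_pages;
};

static void *pool_alloc(struct dom *d)
{
    /* Guard first: a dying domain gets nothing back from the pool. */
    if (d->is_dying)
        return NULL;

    if (d->free_pages == 0)
    {
        /* Warn only once; dying domains never reach this point. */
        if (!d->alloc_failed_logged)
        {
            d->alloc_failed_logged = true;
            fprintf(stderr, "failed to allocate from pool\n");
        }
        return NULL;
    }

    d->free_pages--;
    return malloc(4096);          /* stands in for a page off the freelist */
}

int main(void)
{
    struct dom d = { .is_dying = true, .free_pages = 8 };

    return pool_alloc(&d) == NULL ? 0 : 1;   /* expect refusal */
}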
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -245,6 +245,9 @@ static struct page_info *hap_alloc(struc
ASSERT(paging_locked_by_me(d));
+ if ( unlikely(d->is_dying) )
+ return NULL;
+
pg = page_list_remove_head(&d->arch.paging.hap.freelist);
if ( unlikely(!pg) )
return NULL;
@@ -281,7 +284,7 @@ static struct page_info *hap_alloc_p2m_p
d->arch.paging.hap.p2m_pages++;
ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask));
}
- else if ( !d->arch.paging.p2m_alloc_failed )
+ else if ( !d->arch.paging.p2m_alloc_failed && !d->is_dying )
{
d->arch.paging.p2m_alloc_failed = 1;
dprintk(XENLOG_ERR, "d%i failed to allocate from HAP pool\n",
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -939,6 +939,10 @@ static bool __must_check _shadow_preallo
if ( d->arch.paging.shadow.free_pages >= pages )
return true;
+ if ( unlikely(d->is_dying) )
+ /* No reclaim when the domain is dying, teardown will take care of it. */
+ return false;
+
/* Shouldn't have enabled shadows if we've no vcpus. */
ASSERT(d->vcpu && d->vcpu[0]);
@@ -991,7 +995,7 @@ static bool __must_check _shadow_preallo
d->arch.paging.shadow.free_pages,
d->arch.paging.shadow.p2m_pages);
- ASSERT(d->is_dying);
+ ASSERT_UNREACHABLE();
guest_flush_tlb_mask(d, d->dirty_cpumask);
@@ -1005,10 +1009,13 @@ static bool __must_check _shadow_preallo
* to avoid freeing shadows that the caller is currently working on. */
bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count)
{
- bool ret = _shadow_prealloc(d, shadow_size(type) * count);
+ bool ret;
+
+ if ( unlikely(d->is_dying) )
+ return false;
- if ( !ret && !d->is_dying &&
- (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) )
+ ret = _shadow_prealloc(d, shadow_size(type) * count);
+ if ( !ret && (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) )
/*
* Failing to allocate memory required for shadow usage can only result in
* a domain crash, do it here rather that relying on every caller to do it.
@@ -1238,6 +1245,9 @@ shadow_alloc_p2m_page(struct domain *d)
{
struct page_info *pg = NULL;
+ if ( unlikely(d->is_dying) )
+ return NULL;
+
/* This is called both from the p2m code (which never holds the
* paging lock) and the log-dirty code (which always does). */
paging_lock_recursive(d);


@ -0,0 +1,96 @@
From: Roger Pau Monné <roger.pau@citrix.com>
Subject: x86/p2m: truly free paging pool memory for dying domains
Modify {hap,shadow}_free to free the page immediately if the domain is
dying, so that pages don't accumulate in the pool when
{shadow,hap}_final_teardown() get called. This is to limit the amount of
work which needs to be done there (in a non-preemptable manner).
Note the call to shadow_free() in shadow_free_p2m_page() is moved after
increasing total_pages, so that the decrease done in shadow_free() in
case the domain is dying doesn't underflow the counter, even if just for
a short interval.
This is part of CVE-2022-33746 / XSA-410.
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
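The ordering note at the end can be reduced to a tiny standalone model of the counters involved; the struct below is an illustrative stand-in for the real shadow pool accounting, not Xen code.
/* Standalone sketch: why the counter update is moved before the free when
 * the domain is dying.  Since a "dying" free decrements total_pages,
 * incrementing total_pages first keeps the unsigned counter from
 * transiently underflowing. */
#include <assert.h>
#include <stdbool.h>

struct pool { unsigned int total_pages, free_pages, p2m_pages; };

static void pool_free(struct pool *p, bool dying)
{
    if (dying)
        p->total_pages--;      /* page handed straight back to the allocator */
    else
        p->free_pages++;       /* page kept on the pool freelist */
}

static void free_p2m_page(struct pool *p, bool dying)
{
    p->p2m_pages--;
    p->total_pages++;          /* do this *before* pool_free() ... */
    pool_free(p, dying);       /* ... so a dying-domain decrement can't underflow */
}

int main(void)
{
    struct pool p = { .total_pages = 0, .free_pages = 0, .p2m_pages = 1 };

    free_p2m_page(&p, true);
    assert(p.total_pages == 0 && p.p2m_pages == 0);
    return 0;
}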
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -265,6 +265,18 @@ static void hap_free(struct domain *d, m
ASSERT(paging_locked_by_me(d));
+ /*
+ * For dying domains, actually free the memory here. This way less work is
+ * left to hap_final_teardown(), which cannot easily have preemption checks
+ * added.
+ */
+ if ( unlikely(d->is_dying) )
+ {
+ free_domheap_page(pg);
+ d->arch.paging.hap.total_pages--;
+ return;
+ }
+
d->arch.paging.hap.free_pages++;
page_list_add_tail(pg, &d->arch.paging.hap.freelist);
}
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -1187,6 +1187,7 @@ mfn_t shadow_alloc(struct domain *d,
void shadow_free(struct domain *d, mfn_t smfn)
{
struct page_info *next = NULL, *sp = mfn_to_page(smfn);
+ bool dying = ACCESS_ONCE(d->is_dying);
struct page_list_head *pin_list;
unsigned int pages;
u32 shadow_type;
@@ -1229,11 +1230,32 @@ void shadow_free(struct domain *d, mfn_t
* just before the allocator hands the page out again. */
page_set_tlbflush_timestamp(sp);
perfc_decr(shadow_alloc_count);
- page_list_add_tail(sp, &d->arch.paging.shadow.freelist);
+
+ /*
+ * For dying domains, actually free the memory here. This way less
+ * work is left to shadow_final_teardown(), which cannot easily have
+ * preemption checks added.
+ */
+ if ( unlikely(dying) )
+ {
+ /*
+ * The backpointer field (sh.back) used by shadow code aliases the
+ * domain owner field, unconditionally clear it here to avoid
+ * free_domheap_page() attempting to parse it.
+ */
+ page_set_owner(sp, NULL);
+ free_domheap_page(sp);
+ }
+ else
+ page_list_add_tail(sp, &d->arch.paging.shadow.freelist);
+
sp = next;
}
- d->arch.paging.shadow.free_pages += pages;
+ if ( unlikely(dying) )
+ d->arch.paging.shadow.total_pages -= pages;
+ else
+ d->arch.paging.shadow.free_pages += pages;
}
/* Divert a page from the pool to be used by the p2m mapping.
@@ -1303,9 +1325,9 @@ shadow_free_p2m_page(struct domain *d, s
* paging lock) and the log-dirty code (which always does). */
paging_lock_recursive(d);
- shadow_free(d, page_to_mfn(pg));
d->arch.paging.shadow.p2m_pages--;
d->arch.paging.shadow.total_pages++;
+ shadow_free(d, page_to_mfn(pg));
paging_unlock(d);
}


@ -0,0 +1,159 @@
From: Roger Pau Monné <roger.pau@citrix.com>
Subject: x86/p2m: free the paging memory pool preemptively
The paging memory pool is currently freed in two different places:
from {shadow,hap}_teardown() via domain_relinquish_resources() and
from {shadow,hap}_final_teardown() via complete_domain_destroy().
While the former does handle preemption, the latter doesn't.
Attempt to move as much p2m related freeing as possible to happen
before the call to {shadow,hap}_teardown(), so that most memory can be
freed in a preemptive way. In order to avoid causing issues for
existing callers, leave the root p2m page tables set and free them in
{hap,shadow}_final_teardown(). Also modify {hap,shadow}_free to free
the page immediately if the domain is dying, so that pages don't
accumulate in the pool when {shadow,hap}_final_teardown() get called.
Move altp2m_vcpu_disable_ve() to be done in hap_teardown(), as that's
the place where altp2m_active gets disabled now.
This is part of CVE-2022-33746 / XSA-410.
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -38,7 +38,6 @@
#include <xen/livepatch.h>
#include <public/sysctl.h>
#include <public/hvm/hvm_vcpu.h>
-#include <asm/altp2m.h>
#include <asm/regs.h>
#include <asm/mc146818rtc.h>
#include <asm/system.h>
@@ -2381,12 +2380,6 @@ int domain_relinquish_resources(struct d
vpmu_destroy(v);
}
- if ( altp2m_active(d) )
- {
- for_each_vcpu ( d, v )
- altp2m_vcpu_disable_ve(v);
- }
-
if ( is_pv_domain(d) )
{
for_each_vcpu ( d, v )
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -28,6 +28,7 @@
#include <xen/domain_page.h>
#include <xen/guest_access.h>
#include <xen/keyhandler.h>
+#include <asm/altp2m.h>
#include <asm/event.h>
#include <asm/page.h>
#include <asm/current.h>
@@ -546,24 +547,8 @@ void hap_final_teardown(struct domain *d
unsigned int i;
if ( hvm_altp2m_supported() )
- {
- d->arch.altp2m_active = 0;
-
- if ( d->arch.altp2m_eptp )
- {
- free_xenheap_page(d->arch.altp2m_eptp);
- d->arch.altp2m_eptp = NULL;
- }
-
- if ( d->arch.altp2m_visible_eptp )
- {
- free_xenheap_page(d->arch.altp2m_visible_eptp);
- d->arch.altp2m_visible_eptp = NULL;
- }
-
for ( i = 0; i < MAX_ALTP2M; i++ )
p2m_teardown(d->arch.altp2m_p2m[i], true);
- }
/* Destroy nestedp2m's first */
for (i = 0; i < MAX_NESTEDP2M; i++) {
@@ -578,6 +563,8 @@ void hap_final_teardown(struct domain *d
paging_lock(d);
hap_set_allocation(d, 0, NULL);
ASSERT(d->arch.paging.hap.p2m_pages == 0);
+ ASSERT(d->arch.paging.hap.free_pages == 0);
+ ASSERT(d->arch.paging.hap.total_pages == 0);
paging_unlock(d);
}
@@ -603,6 +590,7 @@ void hap_vcpu_teardown(struct vcpu *v)
void hap_teardown(struct domain *d, bool *preempted)
{
struct vcpu *v;
+ unsigned int i;
ASSERT(d->is_dying);
ASSERT(d != current->domain);
@@ -611,6 +599,28 @@ void hap_teardown(struct domain *d, bool
for_each_vcpu ( d, v )
hap_vcpu_teardown(v);
+ /* Leave the root pt in case we get further attempts to modify the p2m. */
+ if ( hvm_altp2m_supported() )
+ {
+ if ( altp2m_active(d) )
+ for_each_vcpu ( d, v )
+ altp2m_vcpu_disable_ve(v);
+
+ d->arch.altp2m_active = 0;
+
+ FREE_XENHEAP_PAGE(d->arch.altp2m_eptp);
+ FREE_XENHEAP_PAGE(d->arch.altp2m_visible_eptp);
+
+ for ( i = 0; i < MAX_ALTP2M; i++ )
+ p2m_teardown(d->arch.altp2m_p2m[i], false);
+ }
+
+ /* Destroy nestedp2m's after altp2m. */
+ for ( i = 0; i < MAX_NESTEDP2M; i++ )
+ p2m_teardown(d->arch.nested_p2m[i], false);
+
+ p2m_teardown(p2m_get_hostp2m(d), false);
+
paging_lock(d); /* Keep various asserts happy */
if ( d->arch.paging.hap.total_pages != 0 )
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -2824,8 +2824,17 @@ void shadow_teardown(struct domain *d, b
for_each_vcpu ( d, v )
shadow_vcpu_teardown(v);
+ p2m_teardown(p2m_get_hostp2m(d), false);
+
paging_lock(d);
+ /*
+ * Reclaim all shadow memory so that shadow_set_allocation() doesn't find
+ * in-use pages, as _shadow_prealloc() will no longer try to reclaim pages
+ * because the domain is dying.
+ */
+ shadow_blow_tables(d);
+
#if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC))
/* Free the virtual-TLB array attached to each vcpu */
for_each_vcpu(d, v)
@@ -2946,6 +2955,9 @@ void shadow_final_teardown(struct domain
d->arch.paging.shadow.total_pages,
d->arch.paging.shadow.free_pages,
d->arch.paging.shadow.p2m_pages);
+ ASSERT(!d->arch.paging.shadow.total_pages);
+ ASSERT(!d->arch.paging.shadow.free_pages);
+ ASSERT(!d->arch.paging.shadow.p2m_pages);
paging_unlock(d);
}


@ -0,0 +1,171 @@
From: Julien Grall <jgrall@amazon.com>
Subject: xen/x86: p2m: Add preemption in p2m_teardown()
The list p2m->pages contains all the pages used by the P2M. On large
instances this can be quite large and the time spent to call
d->arch.paging.free_page() will take more than 1ms for an 80GB guest
on Xen running in a nested environment on a c5.metal.
By extrapolation, it would take > 100ms for an 8TB guest (the size we
currently security-support). So add some preemption in p2m_teardown()
and propagate it to the callers. Note there are 3 places where
the preemption is not enabled:
- hap_final_teardown()/shadow_final_teardown(): We are
preventing updates to the P2M once the domain is dying (so
no more pages can be allocated) and most of the P2M pages
will be freed in a preemptive manner when relinquishing the
resources. So it is fine to disable preemption here.
- shadow_enable(): This is fine because it will undo the allocation
that may have been made by p2m_alloc_table() (so only the root
page table).
The preemption is arbitrarily checked every 1024 iterations.
Note that with the current approach, Xen doesn't keep track of whether
the alt/nested P2Ms have been cleared. So there is some redundant work.
However, this is not expected to incur too much overhead (the P2M lock
shouldn't be contended during teardown). So this optimization is
left outside of the security event.
This is part of CVE-2022-33746 / XSA-410.
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
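The core of the change, polling for preemption every so many iterations of the freeing loop and reporting back through an optional out-parameter, can be sketched standalone; work_pending(), do_unit_of_work() and preempt_requested() below are made-up stand-ins for walking p2m->pages, free_page() and general_preempt_check().
/* Standalone sketch: periodically poll for preemption inside a long
 * freeing loop and report back through an optional out-parameter so
 * callers can stop and let the operation be continued later. */
#include <stdbool.h>

static unsigned int pending = 100000;

static bool work_pending(void)       { return pending > 0; }
static void do_unit_of_work(void)    { pending--; }
static bool preempt_requested(void)  { return false; /* stub */ }

static void teardown(bool *preempted)
{
    unsigned int i = 0;

    while (work_pending())
    {
        do_unit_of_work();

        /* Arbitrarily check for preemption every 1024 iterations. */
        if (preempted && !(++i % 1024) && preempt_requested())
        {
            *preempted = true;   /* caller returns and retries later */
            return;
        }
    }
}

int main(void)
{
    bool preempted;

    do {
        preempted = false;
        teardown(&preempted);    /* in Xen this would be resumed via a
                                  * hypercall continuation */
    } while (preempted);

    return 0;
}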
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -574,7 +574,7 @@ int p2m_init(struct domain *d);
int p2m_alloc_table(struct p2m_domain *p2m);
/* Return all the p2m resources to Xen. */
-void p2m_teardown(struct p2m_domain *p2m, bool remove_root);
+void p2m_teardown(struct p2m_domain *p2m, bool remove_root, bool *preempted);
void p2m_final_teardown(struct domain *d);
/* Add a page to a domain's p2m table */
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -548,17 +548,17 @@ void hap_final_teardown(struct domain *d
if ( hvm_altp2m_supported() )
for ( i = 0; i < MAX_ALTP2M; i++ )
- p2m_teardown(d->arch.altp2m_p2m[i], true);
+ p2m_teardown(d->arch.altp2m_p2m[i], true, NULL);
/* Destroy nestedp2m's first */
for (i = 0; i < MAX_NESTEDP2M; i++) {
- p2m_teardown(d->arch.nested_p2m[i], true);
+ p2m_teardown(d->arch.nested_p2m[i], true, NULL);
}
if ( d->arch.paging.hap.total_pages != 0 )
hap_teardown(d, NULL);
- p2m_teardown(p2m_get_hostp2m(d), true);
+ p2m_teardown(p2m_get_hostp2m(d), true, NULL);
/* Free any memory that the p2m teardown released */
paging_lock(d);
hap_set_allocation(d, 0, NULL);
@@ -612,14 +612,24 @@ void hap_teardown(struct domain *d, bool
FREE_XENHEAP_PAGE(d->arch.altp2m_visible_eptp);
for ( i = 0; i < MAX_ALTP2M; i++ )
- p2m_teardown(d->arch.altp2m_p2m[i], false);
+ {
+ p2m_teardown(d->arch.altp2m_p2m[i], false, preempted);
+ if ( preempted && *preempted )
+ return;
+ }
}
/* Destroy nestedp2m's after altp2m. */
for ( i = 0; i < MAX_NESTEDP2M; i++ )
- p2m_teardown(d->arch.nested_p2m[i], false);
+ {
+ p2m_teardown(d->arch.nested_p2m[i], false, preempted);
+ if ( preempted && *preempted )
+ return;
+ }
- p2m_teardown(p2m_get_hostp2m(d), false);
+ p2m_teardown(p2m_get_hostp2m(d), false, preempted);
+ if ( preempted && *preempted )
+ return;
paging_lock(d); /* Keep various asserts happy */
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -749,12 +749,13 @@ int p2m_alloc_table(struct p2m_domain *p
* hvm fixme: when adding support for pvh non-hardware domains, this path must
* cleanup any foreign p2m types (release refcnts on them).
*/
-void p2m_teardown(struct p2m_domain *p2m, bool remove_root)
+void p2m_teardown(struct p2m_domain *p2m, bool remove_root, bool *preempted)
/* Return all the p2m pages to Xen.
* We know we don't have any extra mappings to these pages */
{
struct page_info *pg, *root_pg = NULL;
struct domain *d;
+ unsigned int i = 0;
if (p2m == NULL)
return;
@@ -773,8 +774,19 @@ void p2m_teardown(struct p2m_domain *p2m
}
while ( (pg = page_list_remove_head(&p2m->pages)) )
- if ( pg != root_pg )
- d->arch.paging.free_page(d, pg);
+ {
+ if ( pg == root_pg )
+ continue;
+
+ d->arch.paging.free_page(d, pg);
+
+ /* Arbitrarily check preemption every 1024 iterations */
+ if ( preempted && !(++i % 1024) && general_preempt_check() )
+ {
+ *preempted = true;
+ break;
+ }
+ }
if ( root_pg )
page_list_add(root_pg, &p2m->pages);
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -2770,8 +2770,12 @@ int shadow_enable(struct domain *d, u32
out_locked:
paging_unlock(d);
out_unlocked:
+ /*
+ * This is fine to ignore the preemption here because only the root
+ * will be allocated by p2m_alloc_table().
+ */
if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) )
- p2m_teardown(p2m, true);
+ p2m_teardown(p2m, true, NULL);
if ( rv != 0 && pg != NULL )
{
pg->count_info &= ~PGC_count_mask;
@@ -2824,7 +2828,9 @@ void shadow_teardown(struct domain *d, b
for_each_vcpu ( d, v )
shadow_vcpu_teardown(v);
- p2m_teardown(p2m_get_hostp2m(d), false);
+ p2m_teardown(p2m_get_hostp2m(d), false, preempted);
+ if ( preempted && *preempted )
+ return;
paging_lock(d);
@@ -2945,7 +2951,7 @@ void shadow_final_teardown(struct domain
shadow_teardown(d, NULL);
/* It is now safe to pull down the p2m map. */
- p2m_teardown(p2m_get_hostp2m(d), true);
+ p2m_teardown(p2m_get_hostp2m(d), true, NULL);
/* Free any shadow memory that the p2m teardown released */
paging_lock(d);
shadow_set_allocation(d, 0, NULL);


@ -0,0 +1,55 @@
From: Jan Beulich <jbeulich@suse.com>
Subject: gnttab: correct locking on transitive grant copy error path
While the comment next to the lock dropping in preparation for
recursively calling acquire_grant_for_copy() mistakenly talks about the
rd == td case (excluded a few lines further up), the same concerns apply
to calling release_grant_for_copy() on a subsequent error path.
This is CVE-2022-33748 / XSA-411.
Fixes: ad48fb963dbf ("gnttab: fix transitive grant handling")
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Extend code comment.
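The rule the fix applies, releasing the locally held locks before calling into anything that may take another table's lock and then reacquiring to finish only state that is safe to redo, can be shown in a standalone sketch; the two pthread mutexes and helper names are illustrative stand-ins for the grant-table read locks and release_grant_for_copy().
/* Standalone sketch of the drop-and-reacquire pattern: never call into a
 * routine that may take another table's lock while still holding our own,
 * to avoid lock order inversion. */
#include <pthread.h>

static pthread_mutex_t local_table  = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t remote_table = PTHREAD_MUTEX_INITIALIZER;

/* May internally take the remote table's lock. */
static void release_remote_grant(void)
{
    pthread_mutex_lock(&remote_table);
    /* ... update remote state ... */
    pthread_mutex_unlock(&remote_table);
}

static void error_path(void)
{
    /* Drop our lock first, as the patch does before
     * release_grant_for_copy(). */
    pthread_mutex_unlock(&local_table);

    release_remote_grant();

    /* Reacquire and finish the local cleanup; nothing read earlier is
     * relied upon after the reacquire, so no re-checking is needed. */
    pthread_mutex_lock(&local_table);
    /* ... reduce pin counts, etc. ... */
    pthread_mutex_unlock(&local_table);
}

int main(void)
{
    pthread_mutex_lock(&local_table);   /* normal path holds the local lock */
    error_path();                       /* error path must not nest the locks */
    return 0;
}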
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -2622,9 +2622,8 @@ acquire_grant_for_copy(
trans_domid);
/*
- * acquire_grant_for_copy() could take the lock on the
- * remote table (if rd == td), so we have to drop the lock
- * here and reacquire.
+ * acquire_grant_for_copy() will take the lock on the remote table,
+ * so we have to drop the lock here and reacquire.
*/
active_entry_release(act);
grant_read_unlock(rgt);
@@ -2661,11 +2660,25 @@ acquire_grant_for_copy(
act->trans_gref != trans_gref ||
!act->is_sub_page)) )
{
+ /*
+ * Like above for acquire_grant_for_copy() we need to drop and then
+ * re-acquire the locks here to prevent lock order inversion issues.
+ * Unlike for acquire_grant_for_copy() we don't need to re-check
+ * anything, as release_grant_for_copy() doesn't depend on the grant
+ * table entry: It only updates internal state and the status flags.
+ */
+ active_entry_release(act);
+ grant_read_unlock(rgt);
+
release_grant_for_copy(td, trans_gref, readonly);
rcu_unlock_domain(td);
+
+ grant_read_lock(rgt);
+ act = active_entry_acquire(rgt, gref);
reduce_status_for_pin(rd, act, status, readonly);
active_entry_release(act);
grant_read_unlock(rgt);
+
put_page(*page);
*page = NULL;
return ERESTART;