2019-05-29 22:12:40 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2013-01-21 07:28:06 +08:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2012 - Virtual Open Systems and Columbia University
|
|
|
|
* Author: Christoffer Dall <c.dall@virtualopensystems.com>
|
|
|
|
*/
|
2013-01-21 07:28:06 +08:00
|
|
|
|
|
|
|
#include <linux/mman.h>
|
|
|
|
#include <linux/kvm_host.h>
|
|
|
|
#include <linux/io.h>
|
2012-11-02 00:14:45 +08:00
|
|
|
#include <linux/hugetlb.h>
|
2017-06-21 00:11:48 +08:00
|
|
|
#include <linux/sched/signal.h>
|
2013-01-21 07:43:58 +08:00
|
|
|
#include <trace/events/kvm.h>
|
2013-01-21 07:28:06 +08:00
|
|
|
#include <asm/pgalloc.h>
|
2013-01-21 07:28:12 +08:00
|
|
|
#include <asm/cacheflush.h>
|
2013-01-21 07:28:06 +08:00
|
|
|
#include <asm/kvm_arm.h>
|
|
|
|
#include <asm/kvm_mmu.h>
|
2020-09-11 21:25:12 +08:00
|
|
|
#include <asm/kvm_pgtable.h>
|
2019-01-30 02:48:49 +08:00
|
|
|
#include <asm/kvm_ras.h>
|
2013-01-21 07:28:07 +08:00
|
|
|
#include <asm/kvm_asm.h>
|
2013-01-21 07:28:12 +08:00
|
|
|
#include <asm/kvm_emulate.h>
|
2015-01-29 19:59:54 +08:00
|
|
|
#include <asm/virt.h>
|
2013-01-21 07:28:07 +08:00
|
|
|
|
|
|
|
#include "trace.h"
|
2013-01-21 07:28:06 +08:00
|
|
|
|
2020-09-11 21:25:12 +08:00
|
|
|
/*
 * Serialises updates of the hyp page-table and of the private-mapping
 * allocator (io_map_base) in this file.
 */
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

/*
 * Hyp page-table that __create_hyp_mappings() maps into; reset to NULL by
 * free_hyp_pgds() once it has been destroyed and freed.
 */
static struct kvm_pgtable *hyp_pgtable;
|
|
|
|
|
ARM: KVM: switch to a dual-step HYP init code
Our HYP init code suffers from two major design issues:
- it cannot support CPU hotplug, as we tear down the idmap very early
- it cannot perform a TLB invalidation when switching from init to
runtime mappings, as pages are manipulated from PL1 exclusively
The hotplug problem mandates that we keep two sets of page tables
(boot and runtime). The TLB problem mandates that we're able to
transition from one PGD to another while in HYP, invalidating the TLBs
in the process.
To be able to do this, we need to share a page between the two page
tables. A page that will have the same VA in both configurations. All we
need is a VA that has the following properties:
- This VA can't be used to represent a kernel mapping.
- This VA will not conflict with the physical address of the kernel text
The vectors page seems to satisfy this requirement:
- The kernel never maps anything else there
- The kernel text being copied at the beginning of the physical memory,
it is unlikely to use the last 64kB (I doubt we'll ever support KVM
on a system with something like 4MB of RAM, but patches are very
welcome).
Let's call this VA the trampoline VA.
Now, we map our init page at 3 locations:
- idmap in the boot pgd
- trampoline VA in the boot pgd
- trampoline VA in the runtime pgd
The init scenario is now the following:
- We jump in HYP with four parameters: boot HYP pgd, runtime HYP pgd,
runtime stack, runtime vectors
- Enable the MMU with the boot pgd
- Jump to a target into the trampoline page (remember, this is the same
physical page!)
- Now switch to the runtime pgd (same VA, and still the same physical
page!)
- Invalidate TLBs
- Set stack and vectors
- Profit! (or eret, if you only care about the code).
Note that we keep the boot mapping permanently (it is not strictly an
idmap anymore) to allow for CPU hotplug in later patches.
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <cdall@cs.columbia.edu>
2013-04-13 02:12:06 +08:00
|
|
|
static unsigned long hyp_idmap_start;
|
|
|
|
static unsigned long hyp_idmap_end;
|
|
|
|
static phys_addr_t hyp_idmap_vector;
|
|
|
|
|
2017-12-05 01:04:38 +08:00
|
|
|
static unsigned long io_map_base;
|
|
|
|
|
2019-12-12 00:56:48 +08:00
|
|
|
|
2020-09-11 21:25:17 +08:00
|
|
|
/*
|
|
|
|
* Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
|
|
|
|
* we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
|
|
|
|
* CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
|
|
|
|
* long will also starve other vCPUs. We have to also make sure that the page
|
|
|
|
* tables are not freed while we released the lock.
|
|
|
|
*/
|
|
|
|
static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
|
|
|
|
phys_addr_t end,
|
|
|
|
int (*fn)(struct kvm_pgtable *, u64, u64),
|
|
|
|
bool resched)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
u64 next;
|
|
|
|
|
|
|
|
do {
|
|
|
|
struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
|
|
|
|
if (!pgt)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
next = stage2_pgd_addr_end(kvm, addr, end);
|
|
|
|
ret = fn(pgt, addr, next - addr);
|
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (resched && next != end)
|
|
|
|
cond_resched_lock(&kvm->mmu_lock);
|
|
|
|
} while (addr = next, addr != end);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2020-09-11 21:25:21 +08:00
|
|
|
/* stage2_apply_range() variant that may drop mmu_lock between chunks. */
#define stage2_apply_range_resched(kvm, addr, end, fn)			\
	stage2_apply_range((kvm), (addr), (end), (fn), true)
|
|
|
|
|
2015-01-16 07:58:58 +08:00
|
|
|
static bool memslot_is_logging(struct kvm_memory_slot *memslot)
|
|
|
|
{
|
|
|
|
return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
|
2015-01-16 07:59:01 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
|
|
|
|
* @kvm: pointer to kvm structure.
|
|
|
|
*
|
|
|
|
* Interface to HYP function to flush all VM TLB entries
|
|
|
|
*/
|
|
|
|
void kvm_flush_remote_tlbs(struct kvm *kvm)
|
|
|
|
{
|
2021-08-17 08:26:39 +08:00
|
|
|
++kvm->stat.generic.remote_tlb_flush_requests;
|
2019-01-05 04:09:05 +08:00
|
|
|
kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
|
2015-01-16 07:58:58 +08:00
|
|
|
}
|
2012-11-02 00:14:45 +08:00
|
|
|
|
ARM/arm64: KVM: test properly for a PTE's uncachedness
The open coded tests for checking whether a PTE maps a page as
uncached use a flawed '(pte_val(xxx) & CONST) != CONST' pattern,
which is not guaranteed to work since the type of a mapping is
not a set of mutually exclusive bits
For HYP mappings, the type is an index into the MAIR table (i.e, the
index itself does not contain any information whatsoever about the
type of the mapping), and for stage-2 mappings it is a bit field where
normal memory and device types are defined as follows:
#define MT_S2_NORMAL 0xf
#define MT_S2_DEVICE_nGnRE 0x1
I.e., masking *and* comparing with the latter matches on the former,
and we have been getting lucky merely because the S2 device mappings
also have the PTE_UXN bit set, or we would misidentify memory mappings
as device mappings.
Since the unmap_range() code path (which contains one instance of the
flawed test) is used both for HYP mappings and stage-2 mappings, and
considering the difference between the two, it is non-trivial to fix
this by rewriting the tests in place, as it would involve passing
down the type of mapping through all the functions.
However, since HYP mappings and stage-2 mappings both deal with host
physical addresses, we can simply check whether the mapping is backed
by memory that is managed by the host kernel, and only perform the
D-cache maintenance if this is the case.
Cc: stable@vger.kernel.org
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Pavel Fedin <p.fedin@samsung.com>
Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
2015-11-10 22:11:20 +08:00
|
|
|
/*
 * A pfn is treated as a device pfn when pfn_is_map_memory() does not
 * recognise it.
 */
static bool kvm_is_device_pfn(unsigned long pfn)
{
	bool is_map_memory = pfn_is_map_memory(pfn);

	return !is_map_memory;
}
|
|
|
|
|
2021-03-19 18:01:14 +08:00
|
|
|
/*
 * zalloc_page callback for the stage-2 page-table code: @arg is the
 * struct kvm_mmu_memory_cache to take a page from.
 */
static void *stage2_memcache_zalloc_page(void *arg)
{
	struct kvm_mmu_memory_cache *cache = arg;
	void *page;

	/* The cache was filled with __GFP_ZERO, so the page is already zeroed. */
	page = kvm_mmu_memory_cache_alloc(cache);

	return page;
}
|
|
|
|
|
|
|
|
/* zalloc_pages_exact callback: zeroed, memcg-accounted exact allocation. */
static void *kvm_host_zalloc_pages_exact(size_t size)
{
	const gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;

	return alloc_pages_exact(size, gfp);
}
|
|
|
|
|
|
|
|
/* get_page callback: take a reference on the page backing kernel VA @addr. */
static void kvm_host_get_page(void *addr)
{
	struct page *page = virt_to_page(addr);

	get_page(page);
}
|
|
|
|
|
|
|
|
/* put_page callback: drop a reference on the page backing kernel VA @addr. */
static void kvm_host_put_page(void *addr)
{
	struct page *page = virt_to_page(addr);

	put_page(page);
}
|
|
|
|
|
|
|
|
/* page_count callback: reference count of the page backing kernel VA @addr. */
static int kvm_host_page_count(void *addr)
{
	struct page *page = virt_to_page(addr);

	return page_count(page);
}
|
|
|
|
|
|
|
|
/* virt_to_phys callback: translate kernel VA @addr with __pa(). */
static phys_addr_t kvm_host_pa(void *addr)
{
	phys_addr_t pa = __pa(addr);

	return pa;
}
|
|
|
|
|
|
|
|
/* phys_to_virt callback: translate physical address @phys with __va(). */
static void *kvm_host_va(phys_addr_t phys)
{
	void *va = __va(phys);

	return va;
}
|
|
|
|
|
2021-06-17 18:58:23 +08:00
|
|
|
/*
 * D-cache maintenance callback handed to the stage-2 page-table code:
 * forwards @size bytes starting at @va to __clean_dcache_guest_page().
 */
static void clean_dcache_guest_page(void *va, size_t size)
{
	__clean_dcache_guest_page(va, size);
}
|
|
|
|
|
|
|
|
/*
 * I-cache maintenance callback handed to the stage-2 page-table code:
 * forwards @size bytes starting at @va to __invalidate_icache_guest_page().
 */
static void invalidate_icache_guest_page(void *va, size_t size)
{
	__invalidate_icache_guest_page(va, size);
}
|
|
|
|
|
2014-12-20 00:48:06 +08:00
|
|
|
/*
|
|
|
|
* Unmapping vs dcache management:
|
|
|
|
*
|
|
|
|
* If a guest maps certain memory pages as uncached, all writes will
|
|
|
|
* bypass the data cache and go directly to RAM. However, the CPUs
|
|
|
|
* can still speculate reads (not writes) and fill cache lines with
|
|
|
|
* data.
|
|
|
|
*
|
|
|
|
* Those cache lines will be *clean* cache lines though, so a
|
|
|
|
* clean+invalidate operation is equivalent to an invalidate
|
|
|
|
* operation, because no cache lines are marked dirty.
|
|
|
|
*
|
|
|
|
* Those clean cache lines could be filled prior to an uncached write
|
|
|
|
* by the guest, and the cache coherent IO subsystem would therefore
|
|
|
|
* end up writing old data to disk.
|
|
|
|
*
|
|
|
|
* This is why right after unmapping a page/section and invalidating
|
2020-09-11 21:25:17 +08:00
|
|
|
* the corresponding TLBs, we flush to make sure the IO subsystem will
|
|
|
|
* never hit in the cache.
|
2018-04-06 19:27:28 +08:00
|
|
|
*
|
|
|
|
* This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
|
|
|
|
* we then fully enforce cacheability of RAM, no matter what the guest
|
|
|
|
* does.
|
2014-12-20 00:48:06 +08:00
|
|
|
*/
|
2016-03-23 20:08:02 +08:00
|
|
|
/**
|
|
|
|
* unmap_stage2_range -- Clear stage2 page table entries to unmap a range
|
2020-09-17 09:47:49 +08:00
|
|
|
* @mmu: The KVM stage-2 MMU pointer
|
2016-03-23 20:08:02 +08:00
|
|
|
* @start: The intermediate physical base address of the range to unmap
|
|
|
|
* @size: The size of the area to unmap
|
2020-09-17 09:47:49 +08:00
|
|
|
* @may_block: Whether or not we are permitted to block
|
2016-03-23 20:08:02 +08:00
|
|
|
*
|
|
|
|
* Clear a range of stage-2 mappings, lowering the various ref-counts. Must
|
|
|
|
* be called while holding mmu_lock (unless for freeing the stage2 pgd before
|
|
|
|
* destroying the VM), otherwise another faulting VCPU may come in and mess
|
|
|
|
* with things behind our backs.
|
|
|
|
*/
|
2020-08-11 18:27:25 +08:00
|
|
|
static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
|
|
|
|
bool may_block)
|
2014-05-10 05:31:31 +08:00
|
|
|
{
|
2021-03-19 18:01:28 +08:00
|
|
|
struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
|
2020-09-11 21:25:17 +08:00
|
|
|
phys_addr_t end = start + size;
|
2014-05-10 05:31:31 +08:00
|
|
|
|
2017-04-03 22:12:43 +08:00
|
|
|
assert_spin_locked(&kvm->mmu_lock);
|
2018-05-21 11:05:30 +08:00
|
|
|
WARN_ON(size & ~PAGE_MASK);
|
2020-09-11 21:25:17 +08:00
|
|
|
WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
|
|
|
|
may_block));
|
2013-03-05 10:43:17 +08:00
|
|
|
}
|
|
|
|
|
2020-08-11 18:27:25 +08:00
|
|
|
/* Unmap [@start, @start + @size) from stage-2; the walk is allowed to block. */
static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
{
	const bool may_block = true;

	__unmap_stage2_range(mmu, start, size, may_block);
}
|
|
|
|
|
2014-01-15 20:50:23 +08:00
|
|
|
static void stage2_flush_memslot(struct kvm *kvm,
|
|
|
|
struct kvm_memory_slot *memslot)
|
|
|
|
{
|
|
|
|
phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
|
|
|
|
phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
|
|
|
|
|
2020-09-11 21:25:23 +08:00
|
|
|
stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
|
2014-01-15 20:50:23 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* stage2_flush_vm - Invalidate cache for pages mapped in stage 2
|
|
|
|
* @kvm: The struct kvm pointer
|
|
|
|
*
|
|
|
|
* Go through the stage 2 page tables and invalidate any cache lines
|
|
|
|
* backing memory already mapped to the VM.
|
|
|
|
*/
|
2014-12-20 00:05:31 +08:00
|
|
|
static void stage2_flush_vm(struct kvm *kvm)
|
2014-01-15 20:50:23 +08:00
|
|
|
{
|
|
|
|
struct kvm_memslots *slots;
|
|
|
|
struct kvm_memory_slot *memslot;
|
|
|
|
int idx;
|
|
|
|
|
|
|
|
idx = srcu_read_lock(&kvm->srcu);
|
|
|
|
spin_lock(&kvm->mmu_lock);
|
|
|
|
|
|
|
|
slots = kvm_memslots(kvm);
|
|
|
|
kvm_for_each_memslot(memslot, slots)
|
|
|
|
stage2_flush_memslot(kvm, memslot);
|
|
|
|
|
|
|
|
spin_unlock(&kvm->mmu_lock);
|
|
|
|
srcu_read_unlock(&kvm->srcu, idx);
|
|
|
|
}
|
|
|
|
|
2013-01-21 07:28:06 +08:00
|
|
|
/**
|
2013-04-13 02:12:05 +08:00
|
|
|
* free_hyp_pgds - free Hyp-mode page tables
|
2013-01-21 07:28:06 +08:00
|
|
|
*/
|
2013-04-13 02:12:05 +08:00
|
|
|
void free_hyp_pgds(void)
|
2013-01-21 07:28:06 +08:00
|
|
|
{
|
2013-04-13 02:12:07 +08:00
|
|
|
mutex_lock(&kvm_hyp_pgd_mutex);
|
2020-09-11 21:25:12 +08:00
|
|
|
if (hyp_pgtable) {
|
|
|
|
kvm_pgtable_hyp_destroy(hyp_pgtable);
|
|
|
|
kfree(hyp_pgtable);
|
2021-03-19 18:01:26 +08:00
|
|
|
hyp_pgtable = NULL;
|
2016-07-01 01:40:46 +08:00
|
|
|
}
|
2013-01-21 07:28:06 +08:00
|
|
|
mutex_unlock(&kvm_hyp_pgd_mutex);
|
|
|
|
}
|
|
|
|
|
2021-03-19 18:01:26 +08:00
|
|
|
static bool kvm_host_owns_hyp_mappings(void)
|
|
|
|
{
|
|
|
|
if (static_branch_likely(&kvm_protected_mode_initialized))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This can happen at boot time when __create_hyp_mappings() is called
|
|
|
|
* after the hyp protection has been enabled, but the static key has
|
|
|
|
* not been flipped yet.
|
|
|
|
*/
|
|
|
|
if (!hyp_pgtable && is_protected_kvm_enabled())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
WARN_ON(!hyp_pgtable);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2020-09-11 21:25:12 +08:00
|
|
|
static int __create_hyp_mappings(unsigned long start, unsigned long size,
|
|
|
|
unsigned long phys, enum kvm_pgtable_prot prot)
|
2013-01-21 07:28:06 +08:00
|
|
|
{
|
2020-09-11 21:25:12 +08:00
|
|
|
int err;
|
2013-01-21 07:28:06 +08:00
|
|
|
|
2021-08-09 23:24:47 +08:00
|
|
|
if (WARN_ON(!kvm_host_owns_hyp_mappings()))
|
|
|
|
return -EINVAL;
|
2021-03-19 18:01:26 +08:00
|
|
|
|
2013-01-21 07:28:06 +08:00
|
|
|
mutex_lock(&kvm_hyp_pgd_mutex);
|
2020-09-11 21:25:12 +08:00
|
|
|
err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
|
2013-01-21 07:28:06 +08:00
|
|
|
mutex_unlock(&kvm_hyp_pgd_mutex);
|
2020-09-11 21:25:12 +08:00
|
|
|
|
2013-01-21 07:28:06 +08:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2013-11-16 05:14:12 +08:00
|
|
|
/*
 * Translate kernel virtual address @kaddr to a physical address.
 *
 * vmalloc addresses are resolved through their backing page plus the offset
 * within that page; any other address must satisfy virt_addr_valid() (BUG
 * otherwise) and is translated with __pa().
 */
static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
{
	if (is_vmalloc_addr(kaddr)) {
		struct page *page = vmalloc_to_page(kaddr);

		return page_to_phys(page) + offset_in_page(kaddr);
	}

	BUG_ON(!virt_addr_valid(kaddr));

	return __pa(kaddr);
}
|
|
|
|
|
2021-08-09 23:24:47 +08:00
|
|
|
/*
 * Issue the __pkvm_host_share_hyp hypercall for every page of the physical
 * range [@start, @end), with @start rounded down to a page boundary.
 *
 * Returns 0 on success, or the first non-zero hypercall result, at which
 * point the remaining pages are left untouched.
 */
static int pkvm_share_hyp(phys_addr_t start, phys_addr_t end)
{
	phys_addr_t cur = ALIGN_DOWN(start, PAGE_SIZE);

	while (cur < end) {
		int err = kvm_call_hyp_nvhe(__pkvm_host_share_hyp,
					    __phys_to_pfn(cur));

		if (err)
			return err;

		cur += PAGE_SIZE;
	}

	return 0;
}
|
|
|
|
|
2013-01-21 07:28:06 +08:00
|
|
|
/**
|
2012-10-28 08:09:14 +08:00
|
|
|
* create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
|
2013-01-21 07:28:06 +08:00
|
|
|
* @from: The virtual kernel start address of the range
|
|
|
|
* @to: The virtual kernel end address of the range (exclusive)
|
2016-06-13 22:00:45 +08:00
|
|
|
* @prot: The protection to be applied to this range
|
2013-01-21 07:28:06 +08:00
|
|
|
*
|
2012-10-28 08:09:14 +08:00
|
|
|
* The same virtual address as the kernel virtual address is also used
|
|
|
|
* in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
|
|
|
|
* physical pages.
|
2013-01-21 07:28:06 +08:00
|
|
|
*/
|
2020-09-11 21:25:12 +08:00
|
|
|
int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
|
2013-01-21 07:28:06 +08:00
|
|
|
{
|
2013-11-16 05:14:12 +08:00
|
|
|
phys_addr_t phys_addr;
|
|
|
|
unsigned long virt_addr;
|
2016-07-01 01:40:51 +08:00
|
|
|
unsigned long start = kern_hyp_va((unsigned long)from);
|
|
|
|
unsigned long end = kern_hyp_va((unsigned long)to);
|
2013-04-13 02:12:01 +08:00
|
|
|
|
2015-01-29 19:59:54 +08:00
|
|
|
if (is_kernel_in_hyp_mode())
|
|
|
|
return 0;
|
|
|
|
|
2021-08-09 23:24:47 +08:00
|
|
|
if (!kvm_host_owns_hyp_mappings()) {
|
|
|
|
if (WARN_ON(prot != PAGE_HYP))
|
|
|
|
return -EPERM;
|
|
|
|
return pkvm_share_hyp(kvm_kaddr_to_phys(from),
|
|
|
|
kvm_kaddr_to_phys(to));
|
|
|
|
}
|
|
|
|
|
2013-11-16 05:14:12 +08:00
|
|
|
start = start & PAGE_MASK;
|
|
|
|
end = PAGE_ALIGN(end);
|
2013-04-13 02:12:01 +08:00
|
|
|
|
2013-11-16 05:14:12 +08:00
|
|
|
for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
|
|
|
|
int err;
|
2013-04-13 02:12:01 +08:00
|
|
|
|
2013-11-16 05:14:12 +08:00
|
|
|
phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
|
2020-09-11 21:25:12 +08:00
|
|
|
err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
|
2016-06-13 22:00:45 +08:00
|
|
|
prot);
|
2013-11-16 05:14:12 +08:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
2013-01-21 07:28:06 +08:00
|
|
|
}
|
|
|
|
|
2018-02-13 19:00:29 +08:00
|
|
|
/*
 * Map @size bytes starting at physical address @phys_addr into a private
 * hyp VA range and store the resulting hyp VA in @haddr.
 *
 * When the host does not own the hyp mappings, the mapping is requested
 * through the __pkvm_create_private_mapping hypercall. Otherwise a VA range
 * is carved out below io_map_base (under kvm_hyp_pgd_mutex) and mapped with
 * __create_hyp_mappings().
 *
 * Returns 0 with *@haddr set on success, or a negative error code, in which
 * case *@haddr is left untouched.
 */
static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
					unsigned long *haddr,
					enum kvm_pgtable_prot prot)
{
	unsigned long base;
	int ret = 0;

	if (!kvm_host_owns_hyp_mappings()) {
		base = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
					 phys_addr, size, prot);
		/*
		 * PTR_ERR(NULL) is 0, so a NULL result must be handled
		 * separately: otherwise we would report success without
		 * ever writing *haddr.
		 */
		if (!base)
			return -ENOMEM;
		if (IS_ERR((void *)base))
			return PTR_ERR((void *)base);

		*haddr = base;
		return 0;
	}

	mutex_lock(&kvm_hyp_pgd_mutex);

	/*
	 * This assumes that we have enough space below the idmap
	 * page to allocate our VAs. If not, the check below will
	 * kick. A potential alternative would be to detect that
	 * overflow and switch to an allocation above the idmap.
	 *
	 * The allocated size is always a multiple of PAGE_SIZE.
	 */
	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
	base = io_map_base - size;

	/*
	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
	 * allocating the new area, as it would indicate we've
	 * overflowed the idmap/IO address range.
	 */
	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
		ret = -ENOMEM;
	else
		io_map_base = base;

	mutex_unlock(&kvm_hyp_pgd_mutex);

	if (ret)
		return ret;

	ret = __create_hyp_mappings(base, size, phys_addr, prot);
	if (ret)
		return ret;

	*haddr = base + offset_in_page(phys_addr);

	return 0;
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* create_hyp_io_mappings - Map IO into both kernel and HYP
|
|
|
|
* @phys_addr: The physical start address which gets mapped
|
|
|
|
* @size: Size of the region being mapped
|
|
|
|
* @kaddr: Kernel VA for this mapping
|
|
|
|
* @haddr: HYP VA for this mapping
|
|
|
|
*/
|
|
|
|
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
|
|
|
|
void __iomem **kaddr,
|
|
|
|
void __iomem **haddr)
|
|
|
|
{
|
|
|
|
unsigned long addr;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
*kaddr = ioremap(phys_addr, size);
|
|
|
|
if (!*kaddr)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
if (is_kernel_in_hyp_mode()) {
|
|
|
|
*haddr = *kaddr;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = __create_hyp_private_mapping(phys_addr, size,
|
|
|
|
&addr, PAGE_HYP_DEVICE);
|
2017-12-05 00:43:23 +08:00
|
|
|
if (ret) {
|
|
|
|
iounmap(*kaddr);
|
|
|
|
*kaddr = NULL;
|
2018-02-13 19:00:29 +08:00
|
|
|
*haddr = NULL;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
*haddr = (void __iomem *)addr;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* create_hyp_exec_mappings - Map an executable range into HYP
|
|
|
|
* @phys_addr: The physical start address which gets mapped
|
|
|
|
* @size: Size of the region being mapped
|
|
|
|
* @haddr: HYP VA for this mapping
|
|
|
|
*/
|
|
|
|
int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
|
|
|
|
void **haddr)
|
|
|
|
{
|
|
|
|
unsigned long addr;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
BUG_ON(is_kernel_in_hyp_mode());
|
|
|
|
|
|
|
|
ret = __create_hyp_private_mapping(phys_addr, size,
|
|
|
|
&addr, PAGE_HYP_EXEC);
|
|
|
|
if (ret) {
|
|
|
|
*haddr = NULL;
|
2017-12-05 00:43:23 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-02-13 19:00:29 +08:00
|
|
|
*haddr = (void *)addr;
|
2017-12-05 00:43:23 +08:00
|
|
|
return 0;
|
2013-01-21 07:28:06 +08:00
|
|
|
}
|
|
|
|
|
2021-07-26 23:35:48 +08:00
|
|
|
static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
|
|
|
|
/* We shouldn't need any other callback to walk the PT */
|
|
|
|
.phys_to_virt = kvm_host_va,
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Look up the leaf entry covering @addr in the page-table rooted at
 * kvm->mm->pgd and return the size of the mapping at the level where that
 * leaf was found.
 *
 * With CONFIG_DEBUG_VM the lookup result is sanity-checked: the walk must
 * succeed, report a level below KVM_PGTABLE_MAX_LEVELS and yield a valid pte.
 */
static int get_user_mapping_size(struct kvm *kvm, u64 addr)
{
	struct kvm_pgtable user_pgt = {
		.mm_ops		= &kvm_user_mm_ops,
		.pgd		= (kvm_pte_t *)kvm->mm->pgd,
		.ia_bits	= VA_BITS,
		.start_level	= (KVM_PGTABLE_MAX_LEVELS -
				   CONFIG_PGTABLE_LEVELS),
	};
	kvm_pte_t leaf = 0;	/* Keep GCC quiet... */
	u32 leaf_level = ~0;
	int err;

	err = kvm_pgtable_get_leaf(&user_pgt, addr, &leaf, &leaf_level);
	VM_BUG_ON(err);
	VM_BUG_ON(leaf_level >= KVM_PGTABLE_MAX_LEVELS);
	VM_BUG_ON(!(leaf & PTE_VALID));

	return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(leaf_level));
}
|
|
|
|
|
2021-03-19 18:01:14 +08:00
|
|
|
static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
|
|
|
|
.zalloc_page = stage2_memcache_zalloc_page,
|
|
|
|
.zalloc_pages_exact = kvm_host_zalloc_pages_exact,
|
|
|
|
.free_pages_exact = free_pages_exact,
|
|
|
|
.get_page = kvm_host_get_page,
|
|
|
|
.put_page = kvm_host_put_page,
|
|
|
|
.page_count = kvm_host_page_count,
|
|
|
|
.phys_to_virt = kvm_host_va,
|
|
|
|
.virt_to_phys = kvm_host_pa,
|
2021-06-17 18:58:24 +08:00
|
|
|
.dcache_clean_inval_poc = clean_dcache_guest_page,
|
|
|
|
.icache_inval_pou = invalidate_icache_guest_page,
|
2021-03-19 18:01:14 +08:00
|
|
|
};
|
|
|
|
|
2013-01-21 07:28:07 +08:00
|
|
|
/**
|
2019-01-05 04:09:05 +08:00
|
|
|
* kvm_init_stage2_mmu - Initialise a S2 MMU strucrure
|
|
|
|
* @kvm: The pointer to the KVM structure
|
|
|
|
* @mmu: The pointer to the s2 MMU structure
|
2013-01-21 07:28:07 +08:00
|
|
|
*
|
2020-09-11 21:25:13 +08:00
|
|
|
* Allocates only the stage-2 HW PGD level table(s).
|
2013-01-21 07:28:07 +08:00
|
|
|
* Note we don't need locking here as this is only called when the VM is
|
|
|
|
* created, which can only be done once.
|
|
|
|
*/
|
2019-01-05 04:09:05 +08:00
|
|
|
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
|
2013-01-21 07:28:07 +08:00
|
|
|
{
|
2020-09-11 21:25:13 +08:00
|
|
|
int cpu, err;
|
|
|
|
struct kvm_pgtable *pgt;
|
2013-01-21 07:28:07 +08:00
|
|
|
|
2020-09-11 21:25:13 +08:00
|
|
|
if (mmu->pgt != NULL) {
|
2013-01-21 07:28:07 +08:00
|
|
|
kvm_err("kvm_arch already initialized?\n");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2021-09-07 20:31:12 +08:00
|
|
|
pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);
|
2020-09-11 21:25:13 +08:00
|
|
|
if (!pgt)
|
2015-03-11 03:06:59 +08:00
|
|
|
return -ENOMEM;
|
|
|
|
|
2021-03-19 18:01:27 +08:00
|
|
|
err = kvm_pgtable_stage2_init(pgt, &kvm->arch, &kvm_s2_mm_ops);
|
2020-09-11 21:25:13 +08:00
|
|
|
if (err)
|
|
|
|
goto out_free_pgtable;
|
2018-12-11 22:26:31 +08:00
|
|
|
|
2019-01-05 04:09:05 +08:00
|
|
|
mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
|
|
|
|
if (!mmu->last_vcpu_ran) {
|
2020-09-11 21:25:13 +08:00
|
|
|
err = -ENOMEM;
|
|
|
|
goto out_destroy_pgtable;
|
2019-01-05 04:09:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
for_each_possible_cpu(cpu)
|
|
|
|
*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
|
|
|
|
|
2021-03-19 18:01:28 +08:00
|
|
|
mmu->arch = &kvm->arch;
|
2020-09-11 21:25:13 +08:00
|
|
|
mmu->pgt = pgt;
|
|
|
|
mmu->pgd_phys = __pa(pgt->pgd);
|
2021-08-06 19:31:08 +08:00
|
|
|
WRITE_ONCE(mmu->vmid.vmid_gen, 0);
|
2013-01-21 07:28:07 +08:00
|
|
|
return 0;
|
2020-09-11 21:25:13 +08:00
|
|
|
|
|
|
|
out_destroy_pgtable:
|
|
|
|
kvm_pgtable_stage2_destroy(pgt);
|
|
|
|
out_free_pgtable:
|
|
|
|
kfree(pgt);
|
|
|
|
return err;
|
2013-01-21 07:28:07 +08:00
|
|
|
}
|
|
|
|
|
2014-11-27 17:35:03 +08:00
|
|
|
static void stage2_unmap_memslot(struct kvm *kvm,
|
|
|
|
struct kvm_memory_slot *memslot)
|
|
|
|
{
|
|
|
|
hva_t hva = memslot->userspace_addr;
|
|
|
|
phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
|
|
|
|
phys_addr_t size = PAGE_SIZE * memslot->npages;
|
|
|
|
hva_t reg_end = hva + size;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A memory region could potentially cover multiple VMAs, and any holes
|
|
|
|
* between them, so iterate over all of them to find out if we should
|
|
|
|
* unmap any of them.
|
|
|
|
*
|
|
|
|
* +--------------------------------------------+
|
|
|
|
* +---------------+----------------+ +----------------+
|
|
|
|
* | : VMA 1 | VMA 2 | | VMA 3 : |
|
|
|
|
* +---------------+----------------+ +----------------+
|
|
|
|
* | memory region |
|
|
|
|
* +--------------------------------------------+
|
|
|
|
*/
|
|
|
|
do {
|
2021-03-16 12:11:25 +08:00
|
|
|
struct vm_area_struct *vma;
|
2014-11-27 17:35:03 +08:00
|
|
|
hva_t vm_start, vm_end;
|
|
|
|
|
2021-03-16 12:11:25 +08:00
|
|
|
vma = find_vma_intersection(current->mm, hva, reg_end);
|
|
|
|
if (!vma)
|
2014-11-27 17:35:03 +08:00
|
|
|
break;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Take the intersection of this VMA with the memory region
|
|
|
|
*/
|
|
|
|
vm_start = max(hva, vma->vm_start);
|
|
|
|
vm_end = min(reg_end, vma->vm_end);
|
|
|
|
|
|
|
|
if (!(vma->vm_flags & VM_PFNMAP)) {
|
|
|
|
gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
|
2019-01-05 04:09:05 +08:00
|
|
|
unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
|
2014-11-27 17:35:03 +08:00
|
|
|
}
|
|
|
|
hva = vm_end;
|
|
|
|
} while (hva < reg_end);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* stage2_unmap_vm - Unmap Stage-2 RAM mappings
|
|
|
|
* @kvm: The struct kvm pointer
|
|
|
|
*
|
2020-04-01 22:03:10 +08:00
|
|
|
* Go through the memregions and unmap any regular RAM
|
2014-11-27 17:35:03 +08:00
|
|
|
* backing memory already mapped to the VM.
|
|
|
|
*/
|
|
|
|
void stage2_unmap_vm(struct kvm *kvm)
|
|
|
|
{
|
|
|
|
struct kvm_memslots *slots;
|
|
|
|
struct kvm_memory_slot *memslot;
|
|
|
|
int idx;
|
|
|
|
|
|
|
|
idx = srcu_read_lock(&kvm->srcu);
|
2020-06-09 12:33:29 +08:00
|
|
|
mmap_read_lock(current->mm);
|
2014-11-27 17:35:03 +08:00
|
|
|
spin_lock(&kvm->mmu_lock);
|
|
|
|
|
|
|
|
slots = kvm_memslots(kvm);
|
|
|
|
kvm_for_each_memslot(memslot, slots)
|
|
|
|
stage2_unmap_memslot(kvm, memslot);
|
|
|
|
|
|
|
|
spin_unlock(&kvm->mmu_lock);
|
2020-06-09 12:33:29 +08:00
|
|
|
mmap_read_unlock(current->mm);
|
2014-11-27 17:35:03 +08:00
|
|
|
srcu_read_unlock(&kvm->srcu, idx);
|
|
|
|
}
|
|
|
|
|
2019-01-05 04:09:05 +08:00
|
|
|
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
|
2013-01-21 07:28:07 +08:00
|
|
|
{
|
2021-03-19 18:01:28 +08:00
|
|
|
struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
|
2020-09-11 21:25:13 +08:00
|
|
|
struct kvm_pgtable *pgt = NULL;
|
2013-01-21 07:28:07 +08:00
|
|
|
|
2017-04-03 22:12:43 +08:00
|
|
|
spin_lock(&kvm->mmu_lock);
|
2020-09-11 21:25:13 +08:00
|
|
|
pgt = mmu->pgt;
|
|
|
|
if (pgt) {
|
|
|
|
mmu->pgd_phys = 0;
|
|
|
|
mmu->pgt = NULL;
|
|
|
|
free_percpu(mmu->last_vcpu_ran);
|
2017-05-03 22:17:51 +08:00
|
|
|
}
|
2017-04-03 22:12:43 +08:00
|
|
|
spin_unlock(&kvm->mmu_lock);
|
|
|
|
|
2020-09-11 21:25:13 +08:00
|
|
|
if (pgt) {
|
|
|
|
kvm_pgtable_stage2_destroy(pgt);
|
|
|
|
kfree(pgt);
|
2019-01-05 04:09:05 +08:00
|
|
|
}
|
2013-01-21 07:28:07 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* kvm_phys_addr_ioremap - map a device range to guest IPA
|
|
|
|
*
|
|
|
|
* @kvm: The KVM pointer
|
|
|
|
* @guest_ipa: The IPA at which to insert the mapping
|
|
|
|
* @pa: The physical address of the device
|
|
|
|
* @size: The size of the mapping
|
2020-09-17 09:47:49 +08:00
|
|
|
* @writable: Whether or not to create a writable mapping
|
2013-01-21 07:28:07 +08:00
|
|
|
*/
|
|
|
|
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
|
2014-09-18 05:56:18 +08:00
|
|
|
phys_addr_t pa, unsigned long size, bool writable)
|
2013-01-21 07:28:07 +08:00
|
|
|
{
|
2020-09-11 21:25:15 +08:00
|
|
|
phys_addr_t addr;
|
2013-01-21 07:28:07 +08:00
|
|
|
int ret = 0;
|
2020-07-03 10:35:42 +08:00
|
|
|
struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
|
2020-09-11 21:25:15 +08:00
|
|
|
struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
|
|
|
|
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
|
|
|
|
KVM_PGTABLE_PROT_R |
|
|
|
|
(writable ? KVM_PGTABLE_PROT_W : 0);
|
2013-01-21 07:28:07 +08:00
|
|
|
|
2020-09-11 21:25:15 +08:00
|
|
|
size += offset_in_page(guest_ipa);
|
|
|
|
guest_ipa &= PAGE_MASK;
|
2014-09-18 05:56:18 +08:00
|
|
|
|
2020-09-11 21:25:15 +08:00
|
|
|
for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
|
2020-07-03 10:35:42 +08:00
|
|
|
ret = kvm_mmu_topup_memory_cache(&cache,
|
|
|
|
kvm_mmu_cache_min_pages(kvm));
|
2013-01-21 07:28:07 +08:00
|
|
|
if (ret)
|
2020-09-11 21:25:15 +08:00
|
|
|
break;
|
|
|
|
|
2013-01-21 07:28:07 +08:00
|
|
|
spin_lock(&kvm->mmu_lock);
|
2020-09-11 21:25:15 +08:00
|
|
|
ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
|
|
|
|
&cache);
|
2013-01-21 07:28:07 +08:00
|
|
|
spin_unlock(&kvm->mmu_lock);
|
|
|
|
if (ret)
|
2020-09-11 21:25:15 +08:00
|
|
|
break;
|
2013-01-21 07:28:07 +08:00
|
|
|
|
2020-09-11 21:25:15 +08:00
|
|
|
pa += PAGE_SIZE;
|
2013-01-21 07:28:07 +08:00
|
|
|
}
|
|
|
|
|
2020-07-03 10:35:42 +08:00
|
|
|
kvm_mmu_free_memory_cache(&cache);
|
2013-01-21 07:28:07 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2015-01-16 07:58:56 +08:00
|
|
|
/**
|
|
|
|
* stage2_wp_range() - write protect stage2 memory region range
|
2020-09-17 09:47:49 +08:00
|
|
|
* @mmu: The KVM stage-2 MMU pointer
|
2015-01-16 07:58:56 +08:00
|
|
|
* @addr: Start address of range
|
|
|
|
* @end: End address of range
|
|
|
|
*/
|
2019-01-05 04:09:05 +08:00
|
|
|
static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
|
2015-01-16 07:58:56 +08:00
|
|
|
{
|
2021-03-19 18:01:28 +08:00
|
|
|
struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
|
2020-09-11 21:25:21 +08:00
|
|
|
stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
|
2015-01-16 07:58:56 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
|
|
|
|
* @kvm: The KVM pointer
|
|
|
|
* @slot: The memory slot to write protect
|
|
|
|
*
|
|
|
|
* Called to start logging dirty pages after memory region
|
|
|
|
* KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
|
2018-12-12 01:10:37 +08:00
|
|
|
* all present PUD, PMD and PTEs are write protected in the memory region.
|
2015-01-16 07:58:56 +08:00
|
|
|
* Afterwards read of dirty page log can be called.
|
|
|
|
*
|
|
|
|
* Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
|
|
|
|
* serializing operations for VM memory regions.
|
|
|
|
*/
|
2021-03-16 12:11:24 +08:00
|
|
|
static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
|
2015-01-16 07:58:56 +08:00
|
|
|
{
|
2015-05-17 22:20:07 +08:00
|
|
|
struct kvm_memslots *slots = kvm_memslots(kvm);
|
|
|
|
struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
|
2020-02-19 05:07:31 +08:00
|
|
|
phys_addr_t start, end;
|
|
|
|
|
|
|
|
if (WARN_ON_ONCE(!memslot))
|
|
|
|
return;
|
|
|
|
|
|
|
|
start = memslot->base_gfn << PAGE_SHIFT;
|
|
|
|
end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
|
2015-01-16 07:58:56 +08:00
|
|
|
|
|
|
|
spin_lock(&kvm->mmu_lock);
|
2019-01-05 04:09:05 +08:00
|
|
|
stage2_wp_range(&kvm->arch.mmu, start, end);
|
2015-01-16 07:58:56 +08:00
|
|
|
spin_unlock(&kvm->mmu_lock);
|
|
|
|
kvm_flush_remote_tlbs(kvm);
|
|
|
|
}
|
2015-01-16 07:58:57 +08:00
|
|
|
|
|
|
|
/**
|
2015-01-28 10:54:23 +08:00
|
|
|
* kvm_mmu_write_protect_pt_masked() - write protect dirty pages
|
2015-01-16 07:58:57 +08:00
|
|
|
* @kvm: The KVM pointer
|
|
|
|
* @slot: The memory slot associated with mask
|
|
|
|
* @gfn_offset: The gfn offset in memory slot
|
|
|
|
* @mask: The mask of dirty pages at offset 'gfn_offset' in this memory
|
|
|
|
* slot to be write protected
|
|
|
|
*
|
|
|
|
* Walks bits set in mask write protects the associated pte's. Caller must
|
|
|
|
* acquire kvm_mmu_lock.
|
|
|
|
*/
|
2015-01-28 10:54:23 +08:00
|
|
|
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
|
2015-01-16 07:58:57 +08:00
|
|
|
struct kvm_memory_slot *slot,
|
|
|
|
gfn_t gfn_offset, unsigned long mask)
|
|
|
|
{
|
|
|
|
phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
|
|
|
|
phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
|
|
|
|
phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
|
|
|
|
|
2019-01-05 04:09:05 +08:00
|
|
|
stage2_wp_range(&kvm->arch.mmu, start, end);
|
2015-01-16 07:58:57 +08:00
|
|
|
}
|
2015-01-16 07:58:56 +08:00
|
|
|
|
2015-01-28 10:54:23 +08:00
|
|
|
/*
|
|
|
|
* kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
|
|
|
|
* dirty pages.
|
|
|
|
*
|
|
|
|
* It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
|
|
|
|
* enable dirty logging for them.
|
|
|
|
*/
|
|
|
|
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
|
|
|
|
struct kvm_memory_slot *slot,
|
|
|
|
gfn_t gfn_offset, unsigned long mask)
|
|
|
|
{
|
|
|
|
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
|
|
|
|
}
|
|
|
|
|
2019-12-17 20:38:09 +08:00
|
|
|
static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
|
2017-06-21 00:11:48 +08:00
|
|
|
{
|
2018-04-17 02:39:10 +08:00
|
|
|
send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
|
2017-06-21 00:11:48 +08:00
|
|
|
}
|
|
|
|
|
2019-03-12 17:52:51 +08:00
|
|
|
static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
|
|
|
|
unsigned long hva,
|
|
|
|
unsigned long map_size)
|
KVM: arm/arm64: Fix unintended stage 2 PMD mappings
There are two things we need to take care of when we create block
mappings in the stage 2 page tables:
(1) The alignment within a PMD between the host address range and the
guest IPA range must be the same, since otherwise we end up mapping
pages with the wrong offset.
(2) The head and tail of a memory slot may not cover a full block
size, and we have to take care to not map those with block
descriptors, since we could expose memory to the guest that the host
did not intend to expose.
So far, we have been taking care of (1), but not (2), and our commentary
describing (1) was somewhat confusing.
This commit attempts to factor out the checks of both into a common
function, and if we don't pass the check, we won't attempt any PMD
mappings for neither hugetlbfs nor THP.
Note that we used to only check the alignment for THP, not for
hugetlbfs, but as far as I can tell the check needs to be applied to
both scenarios.
Cc: Ralph Palutke <ralph.palutke@fau.de>
Cc: Lukas Braun <koomi@moshbit.net>
Reported-by: Lukas Braun <koomi@moshbit.net>
Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
2018-11-02 15:53:22 +08:00
|
|
|
{
|
2019-02-19 17:22:21 +08:00
|
|
|
gpa_t gpa_start;
|
KVM: arm/arm64: Fix unintended stage 2 PMD mappings
There are two things we need to take care of when we create block
mappings in the stage 2 page tables:
(1) The alignment within a PMD between the host address range and the
guest IPA range must be the same, since otherwise we end up mapping
pages with the wrong offset.
(2) The head and tail of a memory slot may not cover a full block
size, and we have to take care to not map those with block
descriptors, since we could expose memory to the guest that the host
did not intend to expose.
So far, we have been taking care of (1), but not (2), and our commentary
describing (1) was somewhat confusing.
This commit attempts to factor out the checks of both into a common
function, and if we don't pass the check, we won't attempt any PMD
mappings for neither hugetlbfs nor THP.
Note that we used to only check the alignment for THP, not for
hugetlbfs, but as far as I can tell the check needs to be applied to
both scenarios.
Cc: Ralph Palutke <ralph.palutke@fau.de>
Cc: Lukas Braun <koomi@moshbit.net>
Reported-by: Lukas Braun <koomi@moshbit.net>
Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
2018-11-02 15:53:22 +08:00
|
|
|
hva_t uaddr_start, uaddr_end;
|
|
|
|
size_t size;
|
|
|
|
|
2020-05-07 20:35:45 +08:00
|
|
|
/* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
|
|
|
|
if (map_size == PAGE_SIZE)
|
|
|
|
return true;
|
|
|
|
|
KVM: arm/arm64: Fix unintended stage 2 PMD mappings
There are two things we need to take care of when we create block
mappings in the stage 2 page tables:
(1) The alignment within a PMD between the host address range and the
guest IPA range must be the same, since otherwise we end up mapping
pages with the wrong offset.
(2) The head and tail of a memory slot may not cover a full block
size, and we have to take care to not map those with block
descriptors, since we could expose memory to the guest that the host
did not intend to expose.
So far, we have been taking care of (1), but not (2), and our commentary
describing (1) was somewhat confusing.
This commit attempts to factor out the checks of both into a common
function, and if we don't pass the check, we won't attempt any PMD
mappings for neither hugetlbfs nor THP.
Note that we used to only check the alignment for THP, not for
hugetlbfs, but as far as I can tell the check needs to be applied to
both scenarios.
Cc: Ralph Palutke <ralph.palutke@fau.de>
Cc: Lukas Braun <koomi@moshbit.net>
Reported-by: Lukas Braun <koomi@moshbit.net>
Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
2018-11-02 15:53:22 +08:00
|
|
|
size = memslot->npages * PAGE_SIZE;
|
|
|
|
|
|
|
|
gpa_start = memslot->base_gfn << PAGE_SHIFT;
|
|
|
|
|
|
|
|
uaddr_start = memslot->userspace_addr;
|
|
|
|
uaddr_end = uaddr_start + size;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Pages belonging to memslots that don't have the same alignment
|
2019-03-12 17:52:51 +08:00
|
|
|
* within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
|
|
|
|
* PMD/PUD entries, because we'll end up mapping the wrong pages.
|
KVM: arm/arm64: Fix unintended stage 2 PMD mappings
There are two things we need to take care of when we create block
mappings in the stage 2 page tables:
(1) The alignment within a PMD between the host address range and the
guest IPA range must be the same, since otherwise we end up mapping
pages with the wrong offset.
(2) The head and tail of a memory slot may not cover a full block
size, and we have to take care to not map those with block
descriptors, since we could expose memory to the guest that the host
did not intend to expose.
So far, we have been taking care of (1), but not (2), and our commentary
describing (1) was somewhat confusing.
This commit attempts to factor out the checks of both into a common
function, and if we don't pass the check, we won't attempt any PMD
mappings for neither hugetlbfs nor THP.
Note that we used to only check the alignment for THP, not for
hugetlbfs, but as far as I can tell the check needs to be applied to
both scenarios.
Cc: Ralph Palutke <ralph.palutke@fau.de>
Cc: Lukas Braun <koomi@moshbit.net>
Reported-by: Lukas Braun <koomi@moshbit.net>
Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
2018-11-02 15:53:22 +08:00
|
|
|
*
|
|
|
|
* Consider a layout like the following:
|
|
|
|
*
|
|
|
|
* memslot->userspace_addr:
|
|
|
|
* +-----+--------------------+--------------------+---+
|
2019-03-12 17:52:51 +08:00
|
|
|
* |abcde|fgh Stage-1 block | Stage-1 block tv|xyz|
|
KVM: arm/arm64: Fix unintended stage 2 PMD mappings
There are two things we need to take care of when we create block
mappings in the stage 2 page tables:
(1) The alignment within a PMD between the host address range and the
guest IPA range must be the same, since otherwise we end up mapping
pages with the wrong offset.
(2) The head and tail of a memory slot may not cover a full block
size, and we have to take care to not map those with block
descriptors, since we could expose memory to the guest that the host
did not intend to expose.
So far, we have been taking care of (1), but not (2), and our commentary
describing (1) was somewhat confusing.
This commit attempts to factor out the checks of both into a common
function, and if we don't pass the check, we won't attempt any PMD
mappings for neither hugetlbfs nor THP.
Note that we used to only check the alignment for THP, not for
hugetlbfs, but as far as I can tell the check needs to be applied to
both scenarios.
Cc: Ralph Palutke <ralph.palutke@fau.de>
Cc: Lukas Braun <koomi@moshbit.net>
Reported-by: Lukas Braun <koomi@moshbit.net>
Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
2018-11-02 15:53:22 +08:00
|
|
|
* +-----+--------------------+--------------------+---+
|
|
|
|
*
|
2020-05-07 20:35:45 +08:00
|
|
|
* memslot->base_gfn << PAGE_SHIFT:
|
KVM: arm/arm64: Fix unintended stage 2 PMD mappings
There are two things we need to take care of when we create block
mappings in the stage 2 page tables:
(1) The alignment within a PMD between the host address range and the
guest IPA range must be the same, since otherwise we end up mapping
pages with the wrong offset.
(2) The head and tail of a memory slot may not cover a full block
size, and we have to take care to not map those with block
descriptors, since we could expose memory to the guest that the host
did not intend to expose.
So far, we have been taking care of (1), but not (2), and our commentary
describing (1) was somewhat confusing.
This commit attempts to factor out the checks of both into a common
function, and if we don't pass the check, we won't attempt any PMD
mappings for neither hugetlbfs nor THP.
Note that we used to only check the alignment for THP, not for
hugetlbfs, but as far as I can tell the check needs to be applied to
both scenarios.
Cc: Ralph Palutke <ralph.palutke@fau.de>
Cc: Lukas Braun <koomi@moshbit.net>
Reported-by: Lukas Braun <koomi@moshbit.net>
Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
2018-11-02 15:53:22 +08:00
|
|
|
* +---+--------------------+--------------------+-----+
|
2019-03-12 17:52:51 +08:00
|
|
|
* |abc|def Stage-2 block | Stage-2 block |tvxyz|
|
KVM: arm/arm64: Fix unintended stage 2 PMD mappings
There are two things we need to take care of when we create block
mappings in the stage 2 page tables:
(1) The alignment within a PMD between the host address range and the
guest IPA range must be the same, since otherwise we end up mapping
pages with the wrong offset.
(2) The head and tail of a memory slot may not cover a full block
size, and we have to take care to not map those with block
descriptors, since we could expose memory to the guest that the host
did not intend to expose.
So far, we have been taking care of (1), but not (2), and our commentary
describing (1) was somewhat confusing.
This commit attempts to factor out the checks of both into a common
function, and if we don't pass the check, we won't attempt any PMD
mappings for neither hugetlbfs nor THP.
Note that we used to only check the alignment for THP, not for
hugetlbfs, but as far as I can tell the check needs to be applied to
both scenarios.
Cc: Ralph Palutke <ralph.palutke@fau.de>
Cc: Lukas Braun <koomi@moshbit.net>
Reported-by: Lukas Braun <koomi@moshbit.net>
Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
2018-11-02 15:53:22 +08:00
|
|
|
* +---+--------------------+--------------------+-----+
|
|
|
|
*
|
2019-03-12 17:52:51 +08:00
|
|
|
* If we create those stage-2 blocks, we'll end up with this incorrect
|
KVM: arm/arm64: Fix unintended stage 2 PMD mappings
There are two things we need to take care of when we create block
mappings in the stage 2 page tables:
(1) The alignment within a PMD between the host address range and the
guest IPA range must be the same, since otherwise we end up mapping
pages with the wrong offset.
(2) The head and tail of a memory slot may not cover a full block
size, and we have to take care to not map those with block
descriptors, since we could expose memory to the guest that the host
did not intend to expose.
So far, we have been taking care of (1), but not (2), and our commentary
describing (1) was somewhat confusing.
This commit attempts to factor out the checks of both into a common
function, and if we don't pass the check, we won't attempt any PMD
mappings for neither hugetlbfs nor THP.
Note that we used to only check the alignment for THP, not for
hugetlbfs, but as far as I can tell the check needs to be applied to
both scenarios.
Cc: Ralph Palutke <ralph.palutke@fau.de>
Cc: Lukas Braun <koomi@moshbit.net>
Reported-by: Lukas Braun <koomi@moshbit.net>
Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
2018-11-02 15:53:22 +08:00
|
|
|
* mapping:
|
|
|
|
* d -> f
|
|
|
|
* e -> g
|
|
|
|
* f -> h
|
|
|
|
*/
|
2019-03-12 17:52:51 +08:00
|
|
|
if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
|
KVM: arm/arm64: Fix unintended stage 2 PMD mappings
There are two things we need to take care of when we create block
mappings in the stage 2 page tables:
(1) The alignment within a PMD between the host address range and the
guest IPA range must be the same, since otherwise we end up mapping
pages with the wrong offset.
(2) The head and tail of a memory slot may not cover a full block
size, and we have to take care to not map those with block
descriptors, since we could expose memory to the guest that the host
did not intend to expose.
So far, we have been taking care of (1), but not (2), and our commentary
describing (1) was somewhat confusing.
This commit attempts to factor out the checks of both into a common
function, and if we don't pass the check, we won't attempt any PMD
mappings for neither hugetlbfs nor THP.
Note that we used to only check the alignment for THP, not for
hugetlbfs, but as far as I can tell the check needs to be applied to
both scenarios.
Cc: Ralph Palutke <ralph.palutke@fau.de>
Cc: Lukas Braun <koomi@moshbit.net>
Reported-by: Lukas Braun <koomi@moshbit.net>
Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
2018-11-02 15:53:22 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Next, let's make sure we're not trying to map anything not covered
|
2019-03-12 17:52:51 +08:00
|
|
|
* by the memslot. This means we have to prohibit block size mappings
|
|
|
|
* for the beginning and end of a non-block aligned and non-block sized
|
KVM: arm/arm64: Fix unintended stage 2 PMD mappings
There are two things we need to take care of when we create block
mappings in the stage 2 page tables:
(1) The alignment within a PMD between the host address range and the
guest IPA range must be the same, since otherwise we end up mapping
pages with the wrong offset.
(2) The head and tail of a memory slot may not cover a full block
size, and we have to take care to not map those with block
descriptors, since we could expose memory to the guest that the host
did not intend to expose.
So far, we have been taking care of (1), but not (2), and our commentary
describing (1) was somewhat confusing.
This commit attempts to factor out the checks of both into a common
function, and if we don't pass the check, we won't attempt any PMD
mappings for neither hugetlbfs nor THP.
Note that we used to only check the alignment for THP, not for
hugetlbfs, but as far as I can tell the check needs to be applied to
both scenarios.
Cc: Ralph Palutke <ralph.palutke@fau.de>
Cc: Lukas Braun <koomi@moshbit.net>
Reported-by: Lukas Braun <koomi@moshbit.net>
Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
2018-11-02 15:53:22 +08:00
|
|
|
* memory slot (illustrated by the head and tail parts of the
|
|
|
|
* userspace view above containing pages 'abcde' and 'xyz',
|
|
|
|
* respectively).
|
|
|
|
*
|
|
|
|
* Note that it doesn't matter if we do the check using the
|
|
|
|
* userspace_addr or the base_gfn, as both are equally aligned (per
|
|
|
|
* the check above) and equally sized.
|
|
|
|
*/
|
2019-03-12 17:52:51 +08:00
|
|
|
return (hva & ~(map_size - 1)) >= uaddr_start &&
|
|
|
|
(hva & ~(map_size - 1)) + map_size <= uaddr_end;
|
KVM: arm/arm64: Fix unintended stage 2 PMD mappings
There are two things we need to take care of when we create block
mappings in the stage 2 page tables:
(1) The alignment within a PMD between the host address range and the
guest IPA range must be the same, since otherwise we end up mapping
pages with the wrong offset.
(2) The head and tail of a memory slot may not cover a full block
size, and we have to take care to not map those with block
descriptors, since we could expose memory to the guest that the host
did not intend to expose.
So far, we have been taking care of (1), but not (2), and our commentary
describing (1) was somewhat confusing.
This commit attempts to factor out the checks of both into a common
function, and if we don't pass the check, we won't attempt any PMD
mappings for neither hugetlbfs nor THP.
Note that we used to only check the alignment for THP, not for
hugetlbfs, but as far as I can tell the check needs to be applied to
both scenarios.
Cc: Ralph Palutke <ralph.palutke@fau.de>
Cc: Lukas Braun <koomi@moshbit.net>
Reported-by: Lukas Braun <koomi@moshbit.net>
Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
2018-11-02 15:53:22 +08:00
|
|
|
}
|
|
|
|
|
2020-05-07 20:35:46 +08:00
|
|
|
/*
|
|
|
|
* Check if the given hva is backed by a transparent huge page (THP) and
|
|
|
|
* whether it can be mapped using block mapping in stage2. If so, adjust
|
|
|
|
* the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
|
|
|
|
* supported. This will need to be updated to support other THP sizes.
|
|
|
|
*
|
|
|
|
* Returns the size of the mapping.
|
|
|
|
*/
|
|
|
|
static unsigned long
|
2021-07-26 23:35:48 +08:00
|
|
|
transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
|
2020-05-07 20:35:46 +08:00
|
|
|
unsigned long hva, kvm_pfn_t *pfnp,
|
|
|
|
phys_addr_t *ipap)
|
|
|
|
{
|
|
|
|
kvm_pfn_t pfn = *pfnp;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Make sure the adjustment is done only for THP pages. Also make
|
|
|
|
* sure that the HVA and IPA are sufficiently aligned and that the
|
|
|
|
* block map is contained within the memslot.
|
|
|
|
*/
|
2021-07-26 23:35:48 +08:00
|
|
|
if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
|
|
|
|
get_user_mapping_size(kvm, hva) >= PMD_SIZE) {
|
2020-05-07 20:35:46 +08:00
|
|
|
/*
|
|
|
|
* The address we faulted on is backed by a transparent huge
|
|
|
|
* page. However, because we map the compound huge page and
|
|
|
|
* not the individual tail page, we need to transfer the
|
|
|
|
* refcount to the head page. We have to be careful that the
|
|
|
|
* THP doesn't start to split while we are adjusting the
|
|
|
|
* refcounts.
|
|
|
|
*
|
|
|
|
* We are sure this doesn't happen, because mmu_notifier_retry
|
|
|
|
* was successful and we are holding the mmu_lock, so if this
|
|
|
|
* THP is trying to split, it will be blocked in the mmu
|
|
|
|
* notifier before touching any of the pages, specifically
|
|
|
|
* before being able to call __split_huge_page_refcount().
|
|
|
|
*
|
|
|
|
* We can therefore safely transfer the refcount from PG_tail
|
|
|
|
* to PG_head and switch the pfn from a tail page to the head
|
|
|
|
* page accordingly.
|
|
|
|
*/
|
|
|
|
*ipap &= PMD_MASK;
|
|
|
|
kvm_release_pfn_clean(pfn);
|
|
|
|
pfn &= ~(PTRS_PER_PMD - 1);
|
2021-07-26 23:35:51 +08:00
|
|
|
get_page(pfn_to_page(pfn));
|
2020-05-07 20:35:46 +08:00
|
|
|
*pfnp = pfn;
|
|
|
|
|
|
|
|
return PMD_SIZE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Use page mapping if we cannot use block mapping. */
|
|
|
|
return PAGE_SIZE;
|
|
|
|
}
|
|
|
|
|
2021-05-07 19:03:22 +08:00
|
|
|
static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
|
|
|
|
{
|
|
|
|
unsigned long pa;
|
|
|
|
|
|
|
|
if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
|
|
|
|
return huge_page_shift(hstate_vma(vma));
|
|
|
|
|
|
|
|
if (!(vma->vm_flags & VM_PFNMAP))
|
|
|
|
return PAGE_SHIFT;
|
|
|
|
|
|
|
|
VM_BUG_ON(is_vm_hugetlb_page(vma));
|
|
|
|
|
|
|
|
pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);
|
|
|
|
|
|
|
|
#ifndef __PAGETABLE_PMD_FOLDED
|
|
|
|
if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
|
|
|
|
ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
|
|
|
|
ALIGN(hva, PUD_SIZE) <= vma->vm_end)
|
|
|
|
return PUD_SHIFT;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
|
|
|
|
ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
|
|
|
|
ALIGN(hva, PMD_SIZE) <= vma->vm_end)
|
|
|
|
return PMD_SHIFT;
|
|
|
|
|
|
|
|
return PAGE_SHIFT;
|
|
|
|
}
|
|
|
|
|
2021-06-21 19:17:12 +08:00
|
|
|
/*
|
|
|
|
* The page will be mapped in stage 2 as Normal Cacheable, so the VM will be
|
|
|
|
* able to see the page's tags and therefore they must be initialised first. If
|
|
|
|
* PG_mte_tagged is set, tags have already been initialised.
|
|
|
|
*
|
|
|
|
* The race in the test/set of the PG_mte_tagged flag is handled by:
|
|
|
|
* - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs
|
|
|
|
* racing to santise the same page
|
|
|
|
* - mmap_lock protects between a VM faulting a page in and the VMM performing
|
|
|
|
* an mprotect() to add VM_MTE
|
|
|
|
*/
|
|
|
|
static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
|
|
|
|
unsigned long size)
|
|
|
|
{
|
|
|
|
unsigned long i, nr_pages = size >> PAGE_SHIFT;
|
|
|
|
struct page *page;
|
|
|
|
|
|
|
|
if (!kvm_has_mte(kvm))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* pfn_to_online_page() is used to reject ZONE_DEVICE pages
|
|
|
|
* that may not support tags.
|
|
|
|
*/
|
|
|
|
page = pfn_to_online_page(pfn);
|
|
|
|
|
|
|
|
if (!page)
|
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
for (i = 0; i < nr_pages; i++, page++) {
|
|
|
|
if (!test_bit(PG_mte_tagged, &page->flags)) {
|
|
|
|
mte_clear_page_tags(page_address(page));
|
|
|
|
set_bit(PG_mte_tagged, &page->flags);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-01-21 07:28:12 +08:00
|
|
|
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
|
2014-08-19 18:18:04 +08:00
|
|
|
struct kvm_memory_slot *memslot, unsigned long hva,
|
2013-01-21 07:28:12 +08:00
|
|
|
unsigned long fault_status)
|
|
|
|
{
|
2020-09-30 18:24:42 +08:00
|
|
|
int ret = 0;
|
2018-12-12 01:10:35 +08:00
|
|
|
bool write_fault, writable, force_pte = false;
|
2020-09-11 21:25:25 +08:00
|
|
|
bool exec_fault;
|
|
|
|
bool device = false;
|
2021-06-21 19:17:12 +08:00
|
|
|
bool shared;
|
2013-01-21 07:28:12 +08:00
|
|
|
unsigned long mmu_seq;
|
2012-11-02 00:14:45 +08:00
|
|
|
struct kvm *kvm = vcpu->kvm;
|
2013-01-21 07:28:12 +08:00
|
|
|
struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
|
2012-11-02 00:14:45 +08:00
|
|
|
struct vm_area_struct *vma;
|
2019-12-17 20:38:09 +08:00
|
|
|
short vma_shift;
|
2020-09-11 21:25:25 +08:00
|
|
|
gfn_t gfn;
|
kvm: rename pfn_t to kvm_pfn_t
To date, we have implemented two I/O usage models for persistent memory,
PMEM (a persistent "ram disk") and DAX (mmap persistent memory into
userspace). This series adds a third, DAX-GUP, that allows DAX mappings
to be the target of direct-i/o. It allows userspace to coordinate
DMA/RDMA from/to persistent memory.
The implementation leverages the ZONE_DEVICE mm-zone that went into
4.3-rc1 (also discussed at kernel summit) to flag pages that are owned
and dynamically mapped by a device driver. The pmem driver, after
mapping a persistent memory range into the system memmap via
devm_memremap_pages(), arranges for DAX to distinguish pfn-only versus
page-backed pmem-pfns via flags in the new pfn_t type.
The DAX code, upon seeing a PFN_DEV+PFN_MAP flagged pfn, flags the
resulting pte(s) inserted into the process page tables with a new
_PAGE_DEVMAP flag. Later, when get_user_pages() is walking ptes it keys
off _PAGE_DEVMAP to pin the device hosting the page range active.
Finally, get_page() and put_page() are modified to take references
against the device driver established page mapping.
Finally, this need for "struct page" for persistent memory requires
memory capacity to store the memmap array. Given the memmap array for a
large pool of persistent may exhaust available DRAM introduce a
mechanism to allocate the memmap from persistent memory. The new
"struct vmem_altmap *" parameter to devm_memremap_pages() enables
arch_add_memory() to use reserved pmem capacity rather than the page
allocator.
This patch (of 18):
The core has developed a need for a "pfn_t" type [1]. Move the existing
pfn_t in KVM to kvm_pfn_t [2].
[1]: https://lists.01.org/pipermail/linux-nvdimm/2015-September/002199.html
[2]: https://lists.01.org/pipermail/linux-nvdimm/2015-September/002218.html
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Christoffer Dall <christoffer.dall@linaro.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-16 08:56:11 +08:00
|
|
|
kvm_pfn_t pfn;
|
2015-01-16 07:58:58 +08:00
|
|
|
bool logging_active = memslot_is_logging(memslot);
|
2020-12-02 04:10:34 +08:00
|
|
|
unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
|
|
|
|
unsigned long vma_pagesize, fault_granule;
|
2020-09-11 21:25:25 +08:00
|
|
|
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
|
|
|
|
struct kvm_pgtable *pgt;
|
2013-01-21 07:28:12 +08:00
|
|
|
|
2020-12-02 04:10:34 +08:00
|
|
|
fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level);
|
2014-09-09 18:27:09 +08:00
|
|
|
write_fault = kvm_is_write_fault(vcpu);
|
2020-09-15 18:42:17 +08:00
|
|
|
exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
|
2017-10-24 00:11:19 +08:00
|
|
|
VM_BUG_ON(write_fault && exec_fault);
|
|
|
|
|
|
|
|
if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
|
2013-01-21 07:28:12 +08:00
|
|
|
kvm_err("Unexpected L2 read permission error\n");
|
|
|
|
return -EFAULT;
|
|
|
|
}
|
|
|
|
|
2021-05-07 19:03:22 +08:00
|
|
|
/*
|
|
|
|
* Let's check if we will get back a huge page backed by hugetlbfs, or
|
|
|
|
* get block mapping for device MMIO region.
|
|
|
|
*/
|
2020-06-09 12:33:29 +08:00
|
|
|
mmap_read_lock(current->mm);
|
2021-06-29 10:38:59 +08:00
|
|
|
vma = vma_lookup(current->mm, hva);
|
2014-09-18 05:56:17 +08:00
|
|
|
if (unlikely(!vma)) {
|
|
|
|
kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
|
2020-06-09 12:33:29 +08:00
|
|
|
mmap_read_unlock(current->mm);
|
2014-09-18 05:56:17 +08:00
|
|
|
return -EFAULT;
|
|
|
|
}
|
|
|
|
|
2021-05-07 19:03:22 +08:00
|
|
|
/*
|
|
|
|
* logging_active is guaranteed to never be true for VM_PFNMAP
|
|
|
|
* memslots.
|
|
|
|
*/
|
|
|
|
if (logging_active) {
|
2019-03-12 17:52:51 +08:00
|
|
|
force_pte = true;
|
2020-09-10 21:33:51 +08:00
|
|
|
vma_shift = PAGE_SHIFT;
|
2021-05-07 19:03:22 +08:00
|
|
|
} else {
|
|
|
|
vma_shift = get_vma_page_shift(vma, hva);
|
2020-09-10 21:33:51 +08:00
|
|
|
}
|
|
|
|
|
2021-07-13 19:36:41 +08:00
|
|
|
shared = (vma->vm_flags & VM_SHARED);
|
2021-06-21 19:17:12 +08:00
|
|
|
|
2020-10-26 07:06:26 +08:00
|
|
|
switch (vma_shift) {
|
2020-11-03 08:30:09 +08:00
|
|
|
#ifndef __PAGETABLE_PMD_FOLDED
|
2020-10-26 07:06:26 +08:00
|
|
|
case PUD_SHIFT:
|
|
|
|
if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
|
|
|
|
break;
|
|
|
|
fallthrough;
|
2020-11-03 08:30:09 +08:00
|
|
|
#endif
|
2020-10-26 07:06:26 +08:00
|
|
|
case CONT_PMD_SHIFT:
|
|
|
|
vma_shift = PMD_SHIFT;
|
|
|
|
fallthrough;
|
|
|
|
case PMD_SHIFT:
|
|
|
|
if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
|
|
|
|
break;
|
|
|
|
fallthrough;
|
|
|
|
case CONT_PTE_SHIFT:
|
2020-09-10 21:33:51 +08:00
|
|
|
vma_shift = PAGE_SHIFT;
|
2020-10-26 07:06:26 +08:00
|
|
|
force_pte = true;
|
|
|
|
fallthrough;
|
|
|
|
case PAGE_SHIFT:
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
|
2019-03-12 17:52:51 +08:00
|
|
|
}
|
|
|
|
|
2020-09-10 21:33:51 +08:00
|
|
|
vma_pagesize = 1UL << vma_shift;
|
2020-09-11 21:25:25 +08:00
|
|
|
if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
|
2020-09-10 21:33:51 +08:00
|
|
|
fault_ipa &= ~(vma_pagesize - 1);
|
2020-09-11 21:25:25 +08:00
|
|
|
|
|
|
|
gfn = fault_ipa >> PAGE_SHIFT;
|
2020-06-09 12:33:29 +08:00
|
|
|
mmap_read_unlock(current->mm);
|
2012-11-02 00:14:45 +08:00
|
|
|
|
2020-09-11 21:25:25 +08:00
|
|
|
/*
|
|
|
|
* Permission faults just need to update the existing leaf entry,
|
|
|
|
* and so normally don't require allocations from the memcache. The
|
|
|
|
* only exception to this is when dirty logging is enabled at runtime
|
|
|
|
* and a write fault needs to collapse a block entry into a table.
|
|
|
|
*/
|
|
|
|
if (fault_status != FSC_PERM || (logging_active && write_fault)) {
|
|
|
|
ret = kvm_mmu_topup_memory_cache(memcache,
|
|
|
|
kvm_mmu_cache_min_pages(kvm));
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
2013-01-21 07:28:12 +08:00
|
|
|
|
|
|
|
mmu_seq = vcpu->kvm->mmu_notifier_seq;
|
|
|
|
/*
|
|
|
|
* Ensure the read of mmu_notifier_seq happens before we call
|
|
|
|
* gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
|
|
|
|
* the page we just got a reference to gets unmapped before we have a
|
|
|
|
* chance to grab the mmu_lock, which ensures that if the page gets
|
2021-04-02 08:56:51 +08:00
|
|
|
* unmapped afterwards, the call to kvm_unmap_gfn will take it away
|
2013-01-21 07:28:12 +08:00
|
|
|
* from us again properly. This smp_rmb() interacts with the smp_wmb()
|
|
|
|
* in kvm_mmu_notifier_invalidate_<page|range_end>.
|
2021-03-16 12:11:26 +08:00
|
|
|
*
|
|
|
|
* Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is
|
|
|
|
* used to avoid unnecessary overhead introduced to locate the memory
|
|
|
|
* slot because it's always fixed even if @gfn is adjusted for huge pages.
|
2013-01-21 07:28:12 +08:00
|
|
|
*/
|
|
|
|
smp_rmb();
|
|
|
|
|
2021-03-16 12:11:26 +08:00
|
|
|
pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
|
|
|
|
write_fault, &writable, NULL);
|
2017-06-21 00:11:48 +08:00
|
|
|
if (pfn == KVM_PFN_ERR_HWPOISON) {
|
2019-12-17 20:38:09 +08:00
|
|
|
kvm_send_hwpoison_signal(hva, vma_shift);
|
2017-06-21 00:11:48 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2016-08-17 16:46:10 +08:00
|
|
|
if (is_error_noslot_pfn(pfn))
|
2013-01-21 07:28:12 +08:00
|
|
|
return -EFAULT;
|
|
|
|
|
2015-01-16 07:58:58 +08:00
|
|
|
if (kvm_is_device_pfn(pfn)) {
|
2021-05-07 19:03:22 +08:00
|
|
|
/*
|
|
|
|
* If the page was identified as device early by looking at
|
|
|
|
* the VMA flags, vma_pagesize is already representing the
|
|
|
|
* largest quantity we can map. If instead it was mapped
|
|
|
|
* via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE
|
|
|
|
* and must not be upgraded.
|
|
|
|
*
|
|
|
|
* In both cases, we don't let transparent_hugepage_adjust()
|
|
|
|
* change things at the last minute.
|
|
|
|
*/
|
2020-09-11 21:25:25 +08:00
|
|
|
device = true;
|
|
|
|
} else if (logging_active && !write_fault) {
|
2015-01-16 07:58:58 +08:00
|
|
|
/*
|
|
|
|
* Only actually map the page as writable if this was a write
|
|
|
|
* fault.
|
|
|
|
*/
|
2020-09-11 21:25:25 +08:00
|
|
|
writable = false;
|
2015-01-16 07:58:58 +08:00
|
|
|
}
|
2014-06-26 08:45:51 +08:00
|
|
|
|
2020-09-11 21:25:25 +08:00
|
|
|
if (exec_fault && device)
|
2019-12-12 00:56:48 +08:00
|
|
|
return -ENOEXEC;
|
|
|
|
|
2012-11-02 00:14:45 +08:00
|
|
|
spin_lock(&kvm->mmu_lock);
|
2020-09-11 21:25:25 +08:00
|
|
|
pgt = vcpu->arch.hw_mmu->pgt;
|
2012-11-02 00:14:45 +08:00
|
|
|
if (mmu_notifier_retry(kvm, mmu_seq))
|
2013-01-21 07:28:12 +08:00
|
|
|
goto out_unlock;
|
2015-01-16 07:58:58 +08:00
|
|
|
|
2020-05-07 20:35:46 +08:00
|
|
|
/*
|
|
|
|
* If we are not forced to use page mapping, check if we are
|
|
|
|
* backed by a THP and thus use block mapping if possible.
|
|
|
|
*/
|
2021-07-26 23:35:49 +08:00
|
|
|
if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
|
|
|
|
if (fault_status == FSC_PERM && fault_granule > PAGE_SIZE)
|
|
|
|
vma_pagesize = fault_granule;
|
|
|
|
else
|
|
|
|
vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
|
|
|
|
hva, &pfn,
|
|
|
|
&fault_ipa);
|
|
|
|
}
|
2012-11-02 00:14:45 +08:00
|
|
|
|
2021-06-22 22:09:34 +08:00
|
|
|
if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {
|
2021-06-21 19:17:12 +08:00
|
|
|
/* Check the VMM hasn't introduced a new VM_SHARED VMA */
|
2021-06-22 22:09:34 +08:00
|
|
|
if (!shared)
|
|
|
|
ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
|
|
|
|
else
|
2021-06-21 19:17:12 +08:00
|
|
|
ret = -EFAULT;
|
|
|
|
if (ret)
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
2018-12-12 01:10:34 +08:00
|
|
|
|
2021-01-14 20:13:50 +08:00
|
|
|
if (writable)
|
2020-09-11 21:25:25 +08:00
|
|
|
prot |= KVM_PGTABLE_PROT_W;
|
2012-11-02 00:14:45 +08:00
|
|
|
|
2021-06-17 18:58:24 +08:00
|
|
|
if (exec_fault)
|
2020-09-11 21:25:25 +08:00
|
|
|
prot |= KVM_PGTABLE_PROT_X;
|
2018-12-12 01:10:34 +08:00
|
|
|
|
2020-09-11 21:25:25 +08:00
|
|
|
if (device)
|
|
|
|
prot |= KVM_PGTABLE_PROT_DEVICE;
|
|
|
|
else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
|
|
|
|
prot |= KVM_PGTABLE_PROT_X;
|
2017-10-24 00:11:15 +08:00
|
|
|
|
2020-12-02 04:10:34 +08:00
|
|
|
/*
|
|
|
|
* Under the premise of getting a FSC_PERM fault, we just need to relax
|
|
|
|
* permissions only if vma_pagesize equals fault_granule. Otherwise,
|
|
|
|
* kvm_pgtable_stage2_map() should be called to change block size.
|
|
|
|
*/
|
|
|
|
if (fault_status == FSC_PERM && vma_pagesize == fault_granule) {
|
2020-09-11 21:25:25 +08:00
|
|
|
ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
|
2012-11-02 00:14:45 +08:00
|
|
|
} else {
|
2020-09-11 21:25:25 +08:00
|
|
|
ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
|
|
|
|
__pfn_to_phys(pfn), prot,
|
|
|
|
memcache);
|
2013-01-21 07:28:12 +08:00
|
|
|
}
|
2012-11-02 00:14:45 +08:00
|
|
|
|
2021-01-14 20:13:50 +08:00
|
|
|
/* Mark the page dirty only if the fault is handled successfully */
|
|
|
|
if (writable && !ret) {
|
|
|
|
kvm_set_pfn_dirty(pfn);
|
2021-03-16 12:11:26 +08:00
|
|
|
mark_page_dirty_in_slot(kvm, memslot, gfn);
|
2021-01-14 20:13:50 +08:00
|
|
|
}
|
|
|
|
|
2013-01-21 07:28:12 +08:00
|
|
|
out_unlock:
|
2012-11-02 00:14:45 +08:00
|
|
|
spin_unlock(&kvm->mmu_lock);
|
2015-03-13 02:16:51 +08:00
|
|
|
kvm_set_pfn_accessed(pfn);
|
2013-01-21 07:28:12 +08:00
|
|
|
kvm_release_pfn_clean(pfn);
|
2021-01-14 20:13:50 +08:00
|
|
|
return ret != -EAGAIN ? ret : 0;
|
2013-01-21 07:28:12 +08:00
|
|
|
}
|
|
|
|
|
2020-09-11 21:25:19 +08:00
|
|
|
/* Resolve the access fault by making the page young again. */
|
2015-03-13 02:16:52 +08:00
|
|
|
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
|
|
|
|
{
|
2020-09-11 21:25:19 +08:00
|
|
|
pte_t pte;
|
|
|
|
kvm_pte_t kpte;
|
|
|
|
struct kvm_s2_mmu *mmu;
|
2015-03-13 02:16:52 +08:00
|
|
|
|
|
|
|
trace_kvm_access_fault(fault_ipa);
|
|
|
|
|
|
|
|
spin_lock(&vcpu->kvm->mmu_lock);
|
2020-09-11 21:25:19 +08:00
|
|
|
mmu = vcpu->arch.hw_mmu;
|
|
|
|
kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
|
2015-03-13 02:16:52 +08:00
|
|
|
spin_unlock(&vcpu->kvm->mmu_lock);
|
2020-09-11 21:25:19 +08:00
|
|
|
|
|
|
|
pte = __pte(kpte);
|
|
|
|
if (pte_valid(pte))
|
|
|
|
kvm_set_pfn_accessed(pte_pfn(pte));
|
2015-03-13 02:16:52 +08:00
|
|
|
}
|
|
|
|
|
2013-01-21 07:28:12 +08:00
|
|
|
/**
|
|
|
|
* kvm_handle_guest_abort - handles all 2nd stage aborts
|
|
|
|
* @vcpu: the VCPU pointer
|
|
|
|
*
|
|
|
|
* Any abort that gets to the host is almost guaranteed to be caused by a
|
|
|
|
* missing second stage translation table entry, which can mean that either the
|
|
|
|
* guest simply needs more memory and we must allocate an appropriate page or it
|
|
|
|
* can mean that the guest tried to access I/O memory, which is emulated by user
|
|
|
|
* space. The distinction is based on the IPA causing the fault and whether this
|
|
|
|
* memory region has been registered as standard RAM by user space.
|
|
|
|
*/
|
2020-06-23 21:14:15 +08:00
|
|
|
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
|
2013-01-21 07:28:06 +08:00
|
|
|
{
|
2013-01-21 07:28:12 +08:00
|
|
|
unsigned long fault_status;
|
|
|
|
phys_addr_t fault_ipa;
|
|
|
|
struct kvm_memory_slot *memslot;
|
2014-08-19 18:18:04 +08:00
|
|
|
unsigned long hva;
|
|
|
|
bool is_iabt, write_fault, writable;
|
2013-01-21 07:28:12 +08:00
|
|
|
gfn_t gfn;
|
|
|
|
int ret, idx;
|
|
|
|
|
2017-06-22 02:17:14 +08:00
|
|
|
fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
|
|
|
|
|
|
|
|
fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
|
2017-07-18 20:37:41 +08:00
|
|
|
is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
|
2017-06-22 02:17:14 +08:00
|
|
|
|
2017-07-18 20:37:41 +08:00
|
|
|
/* Synchronous External Abort? */
|
2020-07-29 18:28:18 +08:00
|
|
|
if (kvm_vcpu_abt_issea(vcpu)) {
|
2017-07-18 20:37:41 +08:00
|
|
|
/*
|
|
|
|
* For RAS the host kernel may handle this abort.
|
|
|
|
* There is no need to pass the error into the guest.
|
|
|
|
*/
|
2020-07-29 18:28:19 +08:00
|
|
|
if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
|
2017-07-18 20:37:41 +08:00
|
|
|
kvm_inject_vabt(vcpu);
|
2020-07-29 18:28:19 +08:00
|
|
|
|
|
|
|
return 1;
|
2016-09-06 21:02:15 +08:00
|
|
|
}
|
|
|
|
|
2020-06-30 09:57:05 +08:00
|
|
|
trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
|
2012-09-18 02:27:09 +08:00
|
|
|
kvm_vcpu_get_hfar(vcpu), fault_ipa);
|
2013-01-21 07:28:12 +08:00
|
|
|
|
|
|
|
/* Check the stage-2 fault is trans. fault or write fault */
|
2015-03-13 02:16:51 +08:00
|
|
|
if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
|
|
|
|
fault_status != FSC_ACCESS) {
|
2014-09-26 18:29:34 +08:00
|
|
|
kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
|
|
|
|
kvm_vcpu_trap_get_class(vcpu),
|
|
|
|
(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
|
2020-06-30 09:57:05 +08:00
|
|
|
(unsigned long)kvm_vcpu_get_esr(vcpu));
|
2013-01-21 07:28:12 +08:00
|
|
|
return -EFAULT;
|
|
|
|
}
|
|
|
|
|
|
|
|
idx = srcu_read_lock(&vcpu->kvm->srcu);
|
|
|
|
|
|
|
|
gfn = fault_ipa >> PAGE_SHIFT;
|
2014-08-19 18:18:04 +08:00
|
|
|
memslot = gfn_to_memslot(vcpu->kvm, gfn);
|
|
|
|
hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
|
2014-09-09 18:27:09 +08:00
|
|
|
write_fault = kvm_is_write_fault(vcpu);
|
2014-08-19 18:18:04 +08:00
|
|
|
if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
|
2020-07-29 18:28:21 +08:00
|
|
|
/*
|
|
|
|
* The guest has put either its instructions or its page-tables
|
|
|
|
* somewhere it shouldn't have. Userspace won't be able to do
|
|
|
|
* anything about this (there's no syndrome for a start), so
|
|
|
|
* re-inject the abort back into the guest.
|
|
|
|
*/
|
2013-01-21 07:28:12 +08:00
|
|
|
if (is_iabt) {
|
2019-12-12 00:56:48 +08:00
|
|
|
ret = -ENOEXEC;
|
|
|
|
goto out;
|
2013-01-21 07:28:12 +08:00
|
|
|
}
|
|
|
|
|
2020-09-15 18:42:17 +08:00
|
|
|
if (kvm_vcpu_abt_iss1tw(vcpu)) {
|
2020-07-29 18:28:21 +08:00
|
|
|
kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
|
|
|
|
ret = 1;
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
2016-01-29 23:01:28 +08:00
|
|
|
/*
|
|
|
|
* Check for a cache maintenance operation. Since we
|
|
|
|
* ended-up here, we know it is outside of any memory
|
|
|
|
* slot. But we can't find out if that is for a device,
|
|
|
|
* or if the guest is just being stupid. The only thing
|
|
|
|
* we know for sure is that this range cannot be cached.
|
|
|
|
*
|
|
|
|
* So let's assume that the guest is just being
|
|
|
|
* cautious, and skip the instruction.
|
|
|
|
*/
|
2020-07-29 18:28:20 +08:00
|
|
|
if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
|
2020-10-14 16:29:27 +08:00
|
|
|
kvm_incr_pc(vcpu);
|
2016-01-29 23:01:28 +08:00
|
|
|
ret = 1;
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
2012-12-12 22:42:09 +08:00
|
|
|
/*
|
|
|
|
* The IPA is reported as [MAX:12], so we need to
|
|
|
|
* complement it with the bottom 12 bits from the
|
|
|
|
* faulting VA. This is always 12 bits, irrespective
|
|
|
|
* of the page size.
|
|
|
|
*/
|
|
|
|
fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
|
2020-06-23 21:14:15 +08:00
|
|
|
ret = io_mem_abort(vcpu, fault_ipa);
|
2013-01-21 07:28:12 +08:00
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
2014-10-10 18:14:29 +08:00
|
|
|
/* Userspace should not be able to register out-of-bounds IPAs */
|
2018-09-27 00:32:44 +08:00
|
|
|
VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
|
2014-10-10 18:14:29 +08:00
|
|
|
|
2015-03-13 02:16:52 +08:00
|
|
|
if (fault_status == FSC_ACCESS) {
|
|
|
|
handle_access_fault(vcpu, fault_ipa);
|
|
|
|
ret = 1;
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
2014-08-19 18:18:04 +08:00
|
|
|
ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
|
2013-01-21 07:28:12 +08:00
|
|
|
if (ret == 0)
|
|
|
|
ret = 1;
|
2019-12-12 00:56:48 +08:00
|
|
|
out:
|
|
|
|
if (ret == -ENOEXEC) {
|
|
|
|
kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
|
|
|
|
ret = 1;
|
|
|
|
}
|
2013-01-21 07:28:12 +08:00
|
|
|
out_unlock:
|
|
|
|
srcu_read_unlock(&vcpu->kvm->srcu, idx);
|
|
|
|
return ret;
|
2013-01-21 07:28:06 +08:00
|
|
|
}
|
|
|
|
|
2021-04-02 08:56:51 +08:00
|
|
|
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
|
2013-01-21 07:28:07 +08:00
|
|
|
{
|
2021-04-02 08:56:51 +08:00
|
|
|
if (!kvm->arch.mmu.pgt)
|
2021-04-27 06:33:57 +08:00
|
|
|
return false;
|
2013-01-21 07:28:07 +08:00
|
|
|
|
2021-04-02 08:56:51 +08:00
|
|
|
__unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
|
|
|
|
(range->end - range->start) << PAGE_SHIFT,
|
|
|
|
range->may_block);
|
2020-08-11 18:27:25 +08:00
|
|
|
|
2021-04-27 06:33:57 +08:00
|
|
|
return false;
|
2013-01-21 07:28:07 +08:00
|
|
|
}
|
|
|
|
|
2021-04-02 08:56:51 +08:00
|
|
|
bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
|
2013-01-21 07:28:07 +08:00
|
|
|
{
|
2021-04-02 08:56:51 +08:00
|
|
|
kvm_pfn_t pfn = pte_pfn(range->pte);
|
2021-06-21 19:17:12 +08:00
|
|
|
int ret;
|
2021-04-02 08:56:51 +08:00
|
|
|
|
2020-09-11 21:25:26 +08:00
|
|
|
if (!kvm->arch.mmu.pgt)
|
2021-04-27 06:33:57 +08:00
|
|
|
return false;
|
2013-01-21 07:28:07 +08:00
|
|
|
|
2021-04-02 08:56:51 +08:00
|
|
|
WARN_ON(range->end - range->start != 1);
|
2013-01-21 07:28:07 +08:00
|
|
|
|
2021-06-21 19:17:12 +08:00
|
|
|
ret = sanitise_mte_tags(kvm, pfn, PAGE_SIZE);
|
|
|
|
if (ret)
|
|
|
|
return false;
|
|
|
|
|
2021-04-02 08:56:51 +08:00
|
|
|
/*
|
2021-06-17 18:58:24 +08:00
|
|
|
* We've moved a page around, probably through CoW, so let's treat
|
|
|
|
* it just like a translation fault and the map handler will clean
|
|
|
|
* the cache to the PoC.
|
|
|
|
*
|
2020-09-11 21:25:16 +08:00
|
|
|
* The MMU notifiers will have unmapped a huge PMD before calling
|
2021-04-02 08:56:51 +08:00
|
|
|
* ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
|
2020-09-11 21:25:16 +08:00
|
|
|
* therefore we never need to clear out a huge PMD through this
|
|
|
|
* calling path and a memcache is not required.
|
2015-01-16 07:58:58 +08:00
|
|
|
*/
|
2021-04-02 08:56:51 +08:00
|
|
|
kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
|
|
|
|
PAGE_SIZE, __pfn_to_phys(pfn),
|
|
|
|
KVM_PGTABLE_PROT_R, NULL);
|
|
|
|
|
2021-04-27 06:33:57 +08:00
|
|
|
return false;
|
2013-01-21 07:28:07 +08:00
|
|
|
}
|
|
|
|
|
2021-04-02 08:56:51 +08:00
|
|
|
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
|
2013-01-21 07:28:07 +08:00
|
|
|
{
|
2021-04-02 08:56:51 +08:00
|
|
|
u64 size = (range->end - range->start) << PAGE_SHIFT;
|
|
|
|
kvm_pte_t kpte;
|
|
|
|
pte_t pte;
|
2013-01-21 07:28:07 +08:00
|
|
|
|
2020-09-11 21:25:16 +08:00
|
|
|
if (!kvm->arch.mmu.pgt)
|
2021-04-27 06:33:57 +08:00
|
|
|
return false;
|
2013-01-21 07:28:07 +08:00
|
|
|
|
2018-12-12 01:10:40 +08:00
|
|
|
WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
|
2021-04-02 08:56:51 +08:00
|
|
|
|
|
|
|
kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt,
|
|
|
|
range->start << PAGE_SHIFT);
|
2020-09-11 21:25:19 +08:00
|
|
|
pte = __pte(kpte);
|
|
|
|
return pte_valid(pte) && pte_young(pte);
|
2015-03-13 02:16:51 +08:00
|
|
|
}
|
|
|
|
|
2021-04-02 08:56:51 +08:00
|
|
|
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
|
2015-03-13 02:16:51 +08:00
|
|
|
{
|
2020-09-11 21:25:26 +08:00
|
|
|
if (!kvm->arch.mmu.pgt)
|
2021-04-27 06:33:57 +08:00
|
|
|
return false;
|
2021-03-26 10:19:48 +08:00
|
|
|
|
2021-04-02 08:56:51 +08:00
|
|
|
return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt,
|
|
|
|
range->start << PAGE_SHIFT);
|
2015-03-13 02:16:51 +08:00
|
|
|
}
|
|
|
|
|
2013-01-21 07:28:06 +08:00
|
|
|
phys_addr_t kvm_mmu_get_httbr(void)
|
|
|
|
{
|
2020-09-11 21:25:12 +08:00
|
|
|
return __pa(hyp_pgtable->pgd);
|
2013-01-21 07:28:06 +08:00
|
|
|
}
|
|
|
|
|
ARM: KVM: switch to a dual-step HYP init code
Our HYP init code suffers from two major design issues:
- it cannot support CPU hotplug, as we tear down the idmap very early
- it cannot perform a TLB invalidation when switching from init to
runtime mappings, as pages are manipulated from PL1 exclusively
The hotplug problem mandates that we keep two sets of page tables
(boot and runtime). The TLB problem mandates that we're able to
transition from one PGD to another while in HYP, invalidating the TLBs
in the process.
To be able to do this, we need to share a page between the two page
tables. A page that will have the same VA in both configurations. All we
need is a VA that has the following properties:
- This VA can't be used to represent a kernel mapping.
- This VA will not conflict with the physical address of the kernel text
The vectors page seems to satisfy this requirement:
- The kernel never maps anything else there
- The kernel text being copied at the beginning of the physical memory,
it is unlikely to use the last 64kB (I doubt we'll ever support KVM
on a system with something like 4MB of RAM, but patches are very
welcome).
Let's call this VA the trampoline VA.
Now, we map our init page at 3 locations:
- idmap in the boot pgd
- trampoline VA in the boot pgd
- trampoline VA in the runtime pgd
The init scenario is now the following:
- We jump in HYP with four parameters: boot HYP pgd, runtime HYP pgd,
runtime stack, runtime vectors
- Enable the MMU with the boot pgd
- Jump to a target into the trampoline page (remember, this is the same
physical page!)
- Now switch to the runtime pgd (same VA, and still the same physical
page!)
- Invalidate TLBs
- Set stack and vectors
- Profit! (or eret, if you only care about the code).
Note that we keep the boot mapping permanently (it is not strictly an
idmap anymore) to allow for CPU hotplug in later patches.
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <cdall@cs.columbia.edu>
2013-04-13 02:12:06 +08:00
|
|
|
phys_addr_t kvm_get_idmap_vector(void)
|
|
|
|
{
|
|
|
|
return hyp_idmap_vector;
|
|
|
|
}
|
|
|
|
|
2020-09-11 21:25:12 +08:00
|
|
|
static int kvm_map_idmap_text(void)
|
2016-07-01 01:40:43 +08:00
|
|
|
{
|
2020-09-11 21:25:12 +08:00
|
|
|
unsigned long size = hyp_idmap_end - hyp_idmap_start;
|
|
|
|
int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
|
|
|
|
PAGE_HYP_EXEC);
|
2016-07-01 01:40:43 +08:00
|
|
|
if (err)
|
|
|
|
kvm_err("Failed to idmap %lx-%lx\n",
|
|
|
|
hyp_idmap_start, hyp_idmap_end);
|
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2021-03-19 18:01:14 +08:00
|
|
|
static void *kvm_hyp_zalloc_page(void *arg)
|
|
|
|
{
|
|
|
|
return (void *)get_zeroed_page(GFP_KERNEL);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
|
|
|
|
.zalloc_page = kvm_hyp_zalloc_page,
|
|
|
|
.get_page = kvm_host_get_page,
|
|
|
|
.put_page = kvm_host_put_page,
|
|
|
|
.phys_to_virt = kvm_host_va,
|
|
|
|
.virt_to_phys = kvm_host_pa,
|
|
|
|
};
|
|
|
|
|
2021-03-19 18:01:26 +08:00
|
|
|
/**
 * kvm_mmu_init - record the hyp idmap range and build the hyp page table
 * @hyp_va_bits: output, set to 64 minus the T0SZ field of idmap_t0sz
 *
 * Computes the page-aligned physical range of the hyp idmap text and the
 * physical address of __kvm_hyp_init, allocates and initialises
 * hyp_pgtable, maps the idmap text into it and sets io_map_base to
 * hyp_idmap_start.
 *
 * Return: 0 on success, -EINVAL if the idmap page intersects the hyp VA
 * range, -ENOMEM if hyp_pgtable cannot be allocated, or the error returned
 * by kvm_pgtable_hyp_init() or kvm_map_idmap_text(). On those last two
 * failures hyp_pgtable is freed and reset to NULL.
 */
int kvm_mmu_init(u32 *hyp_va_bits)
{
	unsigned long text_start = __pa_symbol(__hyp_idmap_text_start);
	unsigned long text_end = __pa_symbol(__hyp_idmap_text_end);
	int ret;

	hyp_idmap_start = ALIGN_DOWN(text_start, PAGE_SIZE);
	hyp_idmap_end = ALIGN(text_end, PAGE_SIZE);
	hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);

	/*
	 * The linker script is relied upon to guarantee, at build time,
	 * that the HYP init code does not cross a page boundary.
	 */
	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);

	*hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
	kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
	kvm_debug("HYP VA range: %lx:%lx\n",
		  kern_hyp_va(PAGE_OFFSET),
		  kern_hyp_va((unsigned long)high_memory - 1));

	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
	    hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
		/*
		 * The idmap page overlaps the HYP VA space, so carrying
		 * on would not be safe.
		 */
		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
		return -EINVAL;
	}

	hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
	if (!hyp_pgtable) {
		kvm_err("Hyp mode page-table not allocated\n");
		return -ENOMEM;
	}

	ret = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops);
	if (ret)
		goto out_free_pgtable;

	ret = kvm_map_idmap_text();
	if (ret)
		goto out_destroy_pgtable;

	io_map_base = hyp_idmap_start;
	return 0;

out_destroy_pgtable:
	kvm_pgtable_hyp_destroy(hyp_pgtable);
out_free_pgtable:
	kfree(hyp_pgtable);
	hyp_pgtable = NULL;
	return ret;
}
|
2014-06-06 17:10:23 +08:00
|
|
|
|
|
|
|
void kvm_arch_commit_memory_region(struct kvm *kvm,
|
2015-05-18 19:59:39 +08:00
|
|
|
const struct kvm_userspace_memory_region *mem,
|
2020-02-19 05:07:24 +08:00
|
|
|
struct kvm_memory_slot *old,
|
2015-05-18 19:20:23 +08:00
|
|
|
const struct kvm_memory_slot *new,
|
2014-06-06 17:10:23 +08:00
|
|
|
enum kvm_mr_change change)
|
|
|
|
{
|
2015-01-16 07:58:56 +08:00
|
|
|
/*
|
|
|
|
* At this point memslot has been committed and there is an
|
2020-04-01 22:03:10 +08:00
|
|
|
* allocated dirty_bitmap[], dirty pages will be tracked while the
|
2015-01-16 07:58:56 +08:00
|
|
|
* memory slot is write protected.
|
|
|
|
*/
|
2020-04-13 20:20:23 +08:00
|
|
|
if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
|
|
|
|
/*
|
|
|
|
* If we're with initial-all-set, we don't need to write
|
|
|
|
* protect any pages because they're all reported as dirty.
|
|
|
|
* Huge pages and normal pages will be write protect gradually.
|
|
|
|
*/
|
|
|
|
if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
|
|
|
|
kvm_mmu_wp_memory_region(kvm, mem->slot);
|
|
|
|
}
|
|
|
|
}
|
2014-06-06 17:10:23 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
int kvm_arch_prepare_memory_region(struct kvm *kvm,
|
2015-05-18 19:59:39 +08:00
|
|
|
const struct kvm_userspace_memory_region *mem,
|
2021-12-07 03:54:11 +08:00
|
|
|
const struct kvm_memory_slot *old,
|
|
|
|
struct kvm_memory_slot *new,
|
2014-06-06 17:10:23 +08:00
|
|
|
enum kvm_mr_change change)
|
|
|
|
{
|
2014-10-10 23:00:32 +08:00
|
|
|
hva_t hva = mem->userspace_addr;
|
|
|
|
hva_t reg_end = hva + mem->memory_size;
|
|
|
|
int ret = 0;
|
|
|
|
|
2015-01-16 07:58:58 +08:00
|
|
|
if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
|
|
|
|
change != KVM_MR_FLAGS_ONLY)
|
2014-10-10 23:00:32 +08:00
|
|
|
return 0;
|
|
|
|
|
2014-10-10 18:14:29 +08:00
|
|
|
/*
|
|
|
|
* Prevent userspace from creating a memory region outside of the IPA
|
|
|
|
* space addressable by the KVM guest IPA space.
|
|
|
|
*/
|
2021-12-07 03:54:11 +08:00
|
|
|
if ((new->base_gfn + new->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT))
|
2014-10-10 18:14:29 +08:00
|
|
|
return -EFAULT;
|
|
|
|
|
2020-06-09 12:33:29 +08:00
|
|
|
mmap_read_lock(current->mm);
|
2014-10-10 23:00:32 +08:00
|
|
|
/*
|
|
|
|
* A memory region could potentially cover multiple VMAs, and any holes
|
2021-05-07 19:03:21 +08:00
|
|
|
* between them, so iterate over all of them.
|
2014-10-10 23:00:32 +08:00
|
|
|
*
|
|
|
|
* +--------------------------------------------+
|
|
|
|
* +---------------+----------------+ +----------------+
|
|
|
|
* | : VMA 1 | VMA 2 | | VMA 3 : |
|
|
|
|
* +---------------+----------------+ +----------------+
|
|
|
|
* | memory region |
|
|
|
|
* +--------------------------------------------+
|
|
|
|
*/
|
|
|
|
do {
|
2021-03-16 12:11:25 +08:00
|
|
|
struct vm_area_struct *vma;
|
2014-10-10 23:00:32 +08:00
|
|
|
|
2021-03-16 12:11:25 +08:00
|
|
|
vma = find_vma_intersection(current->mm, hva, reg_end);
|
|
|
|
if (!vma)
|
2014-10-10 23:00:32 +08:00
|
|
|
break;
|
|
|
|
|
2021-06-21 19:17:12 +08:00
|
|
|
/*
|
|
|
|
* VM_SHARED mappings are not allowed with MTE to avoid races
|
|
|
|
* when updating the PG_mte_tagged page flag, see
|
|
|
|
* sanitise_mte_tags for more details.
|
|
|
|
*/
|
2021-10-05 20:20:31 +08:00
|
|
|
if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
break;
|
|
|
|
}
|
2021-06-21 19:17:12 +08:00
|
|
|
|
2014-10-10 23:00:32 +08:00
|
|
|
if (vma->vm_flags & VM_PFNMAP) {
|
2015-01-16 07:58:58 +08:00
|
|
|
/* IO region dirty page logging not allowed */
|
2021-12-07 03:54:11 +08:00
|
|
|
if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
|
2017-03-17 02:20:50 +08:00
|
|
|
ret = -EINVAL;
|
2014-10-10 23:00:32 +08:00
|
|
|
break;
|
2021-05-07 19:03:21 +08:00
|
|
|
}
|
2014-10-10 23:00:32 +08:00
|
|
|
}
|
2021-05-07 19:03:21 +08:00
|
|
|
hva = min(reg_end, vma->vm_end);
|
2014-10-10 23:00:32 +08:00
|
|
|
} while (hva < reg_end);
|
|
|
|
|
2020-06-09 12:33:29 +08:00
|
|
|
mmap_read_unlock(current->mm);
|
2014-10-10 23:00:32 +08:00
|
|
|
return ret;
|
2014-06-06 17:10:23 +08:00
|
|
|
}
|
|
|
|
|
2020-02-19 05:07:27 +08:00
|
|
|
/* Arch hook for freeing a memslot; this implementation does nothing. */
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	/* Intentionally empty. */
}
|
|
|
|
|
2019-02-06 04:54:17 +08:00
|
|
|
/* Arch hook for a memslots update; this implementation does nothing. */
void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
	/* Intentionally empty. */
}
|
|
|
|
|
|
|
|
void kvm_arch_flush_shadow_all(struct kvm *kvm)
|
|
|
|
{
|
2019-01-05 04:09:05 +08:00
|
|
|
kvm_free_stage2_pgd(&kvm->arch.mmu);
|
2014-06-06 17:10:23 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
|
|
|
|
struct kvm_memory_slot *slot)
|
|
|
|
{
|
2014-10-10 23:00:32 +08:00
|
|
|
gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
|
|
|
|
phys_addr_t size = slot->npages << PAGE_SHIFT;
|
|
|
|
|
|
|
|
spin_lock(&kvm->mmu_lock);
|
2019-01-05 04:09:05 +08:00
|
|
|
unmap_stage2_range(&kvm->arch.mmu, gpa, size);
|
2014-10-10 23:00:32 +08:00
|
|
|
spin_unlock(&kvm->mmu_lock);
|
2014-06-06 17:10:23 +08:00
|
|
|
}
|
2014-12-20 00:05:31 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
|
|
|
|
*
|
|
|
|
* Main problems:
|
|
|
|
* - S/W ops are local to a CPU (not broadcast)
|
|
|
|
* - We have line migration behind our back (speculation)
|
|
|
|
* - System caches don't support S/W at all (damn!)
|
|
|
|
*
|
|
|
|
* In the face of the above, the best we can do is to try and convert
|
|
|
|
* S/W ops to VA ops. Because the guest is not allowed to infer the
|
|
|
|
* S/W to PA mapping, it can only use S/W to nuke the whole cache,
|
|
|
|
* which is a rather good thing for us.
|
|
|
|
*
|
|
|
|
* Also, it is only used when turning caches on/off ("The expected
|
|
|
|
* usage of the cache maintenance instructions that operate by set/way
|
|
|
|
* is associated with the cache maintenance instructions associated
|
|
|
|
* with the powerdown and powerup of caches, if this is required by
|
|
|
|
* the implementation.").
|
|
|
|
*
|
|
|
|
* We use the following policy:
|
|
|
|
*
|
|
|
|
* - If we trap a S/W operation, we enable VM trapping to detect
|
|
|
|
* caches being turned on/off, and do a full clean.
|
|
|
|
*
|
|
|
|
* - We flush the caches on both caches being turned on and off.
|
|
|
|
*
|
|
|
|
* - Once the caches are enabled, we stop trapping VM ops.
|
|
|
|
*/
|
|
|
|
void kvm_set_way_flush(struct kvm_vcpu *vcpu)
|
|
|
|
{
|
2017-08-03 18:09:05 +08:00
|
|
|
unsigned long hcr = *vcpu_hcr(vcpu);
|
2014-12-20 00:05:31 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If this is the first time we do a S/W operation
|
|
|
|
* (i.e. HCR_TVM not set) flush the whole memory, and set the
|
|
|
|
* VM trapping.
|
|
|
|
*
|
|
|
|
* Otherwise, rely on the VM trapping to wait for the MMU +
|
|
|
|
* Caches to be turned off. At that point, we'll be able to
|
|
|
|
* clean the caches again.
|
|
|
|
*/
|
|
|
|
if (!(hcr & HCR_TVM)) {
|
|
|
|
trace_kvm_set_way_flush(*vcpu_pc(vcpu),
|
|
|
|
vcpu_has_cache_enabled(vcpu));
|
|
|
|
stage2_flush_vm(vcpu->kvm);
|
2017-08-03 18:09:05 +08:00
|
|
|
*vcpu_hcr(vcpu) = hcr | HCR_TVM;
|
2014-12-20 00:05:31 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * React to a possible change of the vcpu's cache enable state.
 *
 * @was_enabled is the state supplied by the caller for comparison; the
 * current state is sampled here with vcpu_has_cache_enabled().
 */
void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
{
	bool is_enabled = vcpu_has_cache_enabled(vcpu);

	/*
	 * Turning the MMU+caches on requires invalidating the caches,
	 * turning them off requires cleaning them. Clean + invalidate
	 * covers both directions.
	 */
	if (was_enabled != is_enabled)
		stage2_flush_vm(vcpu->kvm);

	/* Caches are now on, stop trapping VM ops (until a S/W op) */
	if (is_enabled)
		*vcpu_hcr(vcpu) &= ~HCR_TVM;

	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, is_enabled);
}
|