Merge branch 'kvm-fixes-for-5.18-rc5' into HEAD
Fixes for (relatively) old bugs, to be merged in both the -rc and next development trees: * Fix potential races when walking host page table * Fix bad user ABI for KVM_EXIT_SYSTEM_EVENT * Fix shadow page table leak when KVM runs nested
This commit is contained in:
commit
73331c5d84
|
@ -5986,16 +5986,16 @@ should put the acknowledged interrupt vector into the 'epr' field.
|
|||
#define KVM_SYSTEM_EVENT_RESET 2
|
||||
#define KVM_SYSTEM_EVENT_CRASH 3
|
||||
__u32 type;
|
||||
__u64 flags;
|
||||
__u32 ndata;
|
||||
__u64 data[16];
|
||||
} system_event;
|
||||
|
||||
If exit_reason is KVM_EXIT_SYSTEM_EVENT then the vcpu has triggered
|
||||
a system-level event using some architecture specific mechanism (hypercall
|
||||
or some special instruction). In case of ARM64, this is triggered using
|
||||
HVC instruction based PSCI call from the vcpu. The 'type' field describes
|
||||
the system-level event type. The 'flags' field describes architecture
|
||||
specific flags for the system-level event.
|
||||
HVC instruction based PSCI call from the vcpu.
|
||||
|
||||
The 'type' field describes the system-level event type.
|
||||
Valid values for 'type' are:
|
||||
|
||||
- KVM_SYSTEM_EVENT_SHUTDOWN -- the guest has requested a shutdown of the
|
||||
|
@ -6010,10 +6010,20 @@ Valid values for 'type' are:
|
|||
to ignore the request, or to gather VM memory core dump and/or
|
||||
reset/shutdown of the VM.
|
||||
|
||||
Valid flags are:
|
||||
If KVM_CAP_SYSTEM_EVENT_DATA is present, the 'data' field can contain
|
||||
architecture specific information for the system-level event. Only
|
||||
the first `ndata` items (possibly zero) of the data array are valid.
|
||||
|
||||
- KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 (arm64 only) -- the guest issued
|
||||
a SYSTEM_RESET2 call according to v1.1 of the PSCI specification.
|
||||
- for arm64, data[0] is set to KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 if
|
||||
the guest issued a SYSTEM_RESET2 call according to v1.1 of the PSCI
|
||||
specification.
|
||||
|
||||
- for RISC-V, data[0] is set to the value of the second argument of the
|
||||
``sbi_system_reset`` call.
|
||||
|
||||
Previous versions of Linux defined a `flags` member in this struct. The
|
||||
field is now aliased to `data[0]`. Userspace can assume that it is only
|
||||
written if ndata is greater than 0.
|
||||
|
||||
::
|
||||
|
||||
|
|
|
@ -181,7 +181,8 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type, u64 flags)
|
|||
|
||||
memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
|
||||
vcpu->run->system_event.type = type;
|
||||
vcpu->run->system_event.flags = flags;
|
||||
vcpu->run->system_event.ndata = 1;
|
||||
vcpu->run->system_event.data[0] = flags;
|
||||
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
|
||||
}
|
||||
|
||||
|
|
|
@ -83,7 +83,7 @@ void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run)
|
|||
|
||||
void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,
|
||||
struct kvm_run *run,
|
||||
u32 type, u64 flags)
|
||||
u32 type, u64 reason)
|
||||
{
|
||||
unsigned long i;
|
||||
struct kvm_vcpu *tmp;
|
||||
|
@ -94,7 +94,8 @@ void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,
|
|||
|
||||
memset(&run->system_event, 0, sizeof(run->system_event));
|
||||
run->system_event.type = type;
|
||||
run->system_event.flags = flags;
|
||||
run->system_event.ndata = 1;
|
||||
run->system_event.data[0] = reason;
|
||||
run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
|
||||
}
|
||||
|
||||
|
|
|
@ -65,6 +65,30 @@ static __always_inline u64 rsvd_bits(int s, int e)
|
|||
return ((2ULL << (e - s)) - 1) << s;
|
||||
}
|
||||
|
||||
/*
|
||||
* The number of non-reserved physical address bits irrespective of features
|
||||
* that repurpose legal bits, e.g. MKTME.
|
||||
*/
|
||||
extern u8 __read_mostly shadow_phys_bits;
|
||||
|
||||
static inline gfn_t kvm_mmu_max_gfn(void)
|
||||
{
|
||||
/*
|
||||
* Note that this uses the host MAXPHYADDR, not the guest's.
|
||||
* EPT/NPT cannot support GPAs that would exceed host.MAXPHYADDR;
|
||||
* assuming KVM is running on bare metal, guest accesses beyond
|
||||
* host.MAXPHYADDR will hit a #PF(RSVD) and never cause a vmexit
|
||||
* (either EPT Violation/Misconfig or #NPF), and so KVM will never
|
||||
* install a SPTE for such addresses. If KVM is running as a VM
|
||||
* itself, on the other hand, it might see a MAXPHYADDR that is less
|
||||
* than hardware's real MAXPHYADDR. Using the host MAXPHYADDR
|
||||
* disallows such SPTEs entirely and simplifies the TDP MMU.
|
||||
*/
|
||||
int max_gpa_bits = likely(tdp_enabled) ? shadow_phys_bits : 52;
|
||||
|
||||
return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1;
|
||||
}
|
||||
|
||||
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
|
||||
void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
|
||||
|
||||
|
|
|
@ -2804,8 +2804,12 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
|
|||
const struct kvm_memory_slot *slot)
|
||||
{
|
||||
unsigned long hva;
|
||||
pte_t *pte;
|
||||
int level;
|
||||
unsigned long flags;
|
||||
int level = PG_LEVEL_4K;
|
||||
pgd_t pgd;
|
||||
p4d_t p4d;
|
||||
pud_t pud;
|
||||
pmd_t pmd;
|
||||
|
||||
if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
|
||||
return PG_LEVEL_4K;
|
||||
|
@ -2820,10 +2824,43 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
|
|||
*/
|
||||
hva = __gfn_to_hva_memslot(slot, gfn);
|
||||
|
||||
pte = lookup_address_in_mm(kvm->mm, hva, &level);
|
||||
if (unlikely(!pte))
|
||||
return PG_LEVEL_4K;
|
||||
/*
|
||||
* Lookup the mapping level in the current mm. The information
|
||||
* may become stale soon, but it is safe to use as long as
|
||||
* 1) mmu_notifier_retry was checked after taking mmu_lock, and
|
||||
* 2) mmu_lock is taken now.
|
||||
*
|
||||
* We still need to disable IRQs to prevent concurrent tear down
|
||||
* of page tables.
|
||||
*/
|
||||
local_irq_save(flags);
|
||||
|
||||
pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
|
||||
if (pgd_none(pgd))
|
||||
goto out;
|
||||
|
||||
p4d = READ_ONCE(*p4d_offset(&pgd, hva));
|
||||
if (p4d_none(p4d) || !p4d_present(p4d))
|
||||
goto out;
|
||||
|
||||
pud = READ_ONCE(*pud_offset(&p4d, hva));
|
||||
if (pud_none(pud) || !pud_present(pud))
|
||||
goto out;
|
||||
|
||||
if (pud_large(pud)) {
|
||||
level = PG_LEVEL_1G;
|
||||
goto out;
|
||||
}
|
||||
|
||||
pmd = READ_ONCE(*pmd_offset(&pud, hva));
|
||||
if (pmd_none(pmd) || !pmd_present(pmd))
|
||||
goto out;
|
||||
|
||||
if (pmd_large(pmd))
|
||||
level = PG_LEVEL_2M;
|
||||
|
||||
out:
|
||||
local_irq_restore(flags);
|
||||
return level;
|
||||
}
|
||||
|
||||
|
@ -2992,9 +3029,15 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fa
|
|||
/*
|
||||
* If MMIO caching is disabled, emulate immediately without
|
||||
* touching the shadow page tables as attempting to install an
|
||||
* MMIO SPTE will just be an expensive nop.
|
||||
* MMIO SPTE will just be an expensive nop. Do not cache MMIO
|
||||
* whose gfn is greater than host.MAXPHYADDR, any guest that
|
||||
* generates such gfns is running nested and is being tricked
|
||||
* by L0 userspace (you can observe gfn > L1.MAXPHYADDR if
|
||||
* and only if L1's MAXPHYADDR is inaccurate with respect to
|
||||
* the hardware's).
|
||||
*/
|
||||
if (unlikely(!shadow_mmio_value)) {
|
||||
if (unlikely(!shadow_mmio_value) ||
|
||||
unlikely(fault->gfn > kvm_mmu_max_gfn())) {
|
||||
*ret_val = RET_PF_EMULATE;
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -201,12 +201,6 @@ static inline bool is_removed_spte(u64 spte)
|
|||
*/
|
||||
extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
|
||||
|
||||
/*
|
||||
* The number of non-reserved physical address bits irrespective of features
|
||||
* that repurpose legal bits, e.g. MKTME.
|
||||
*/
|
||||
extern u8 __read_mostly shadow_phys_bits;
|
||||
|
||||
static inline bool is_mmio_spte(u64 spte)
|
||||
{
|
||||
return (spte & shadow_mmio_mask) == shadow_mmio_value &&
|
||||
|
|
|
@ -815,14 +815,15 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
|
|||
return iter->yielded;
|
||||
}
|
||||
|
||||
static inline gfn_t tdp_mmu_max_gfn_host(void)
|
||||
static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
|
||||
{
|
||||
/*
|
||||
* Bound TDP MMU walks at host.MAXPHYADDR, guest accesses beyond that
|
||||
* will hit a #PF(RSVD) and never hit an EPT Violation/Misconfig / #NPF,
|
||||
* and so KVM will never install a SPTE for such addresses.
|
||||
* Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
|
||||
* a gpa range that would exceed the max gfn, and KVM does not create
|
||||
* MMIO SPTEs for "impossible" gfns, instead sending such accesses down
|
||||
* the slow emulation path every time.
|
||||
*/
|
||||
return 1ULL << (shadow_phys_bits - PAGE_SHIFT);
|
||||
return kvm_mmu_max_gfn() + 1;
|
||||
}
|
||||
|
||||
static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
|
||||
|
@ -830,7 +831,7 @@ static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
|
|||
{
|
||||
struct tdp_iter iter;
|
||||
|
||||
gfn_t end = tdp_mmu_max_gfn_host();
|
||||
gfn_t end = tdp_mmu_max_gfn_exclusive();
|
||||
gfn_t start = 0;
|
||||
|
||||
for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
|
||||
|
@ -923,7 +924,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
|
|||
{
|
||||
struct tdp_iter iter;
|
||||
|
||||
end = min(end, tdp_mmu_max_gfn_host());
|
||||
end = min(end, tdp_mmu_max_gfn_exclusive());
|
||||
|
||||
lockdep_assert_held_write(&kvm->mmu_lock);
|
||||
|
||||
|
|
|
@ -10020,12 +10020,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
|
|||
if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
|
||||
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
|
||||
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
|
||||
vcpu->run->system_event.ndata = 0;
|
||||
r = 0;
|
||||
goto out;
|
||||
}
|
||||
if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
|
||||
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
|
||||
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
|
||||
vcpu->run->system_event.ndata = 0;
|
||||
r = 0;
|
||||
goto out;
|
||||
}
|
||||
|
@ -12009,8 +12011,12 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
|
|||
struct kvm_memory_slot *new,
|
||||
enum kvm_mr_change change)
|
||||
{
|
||||
if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
|
||||
if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) {
|
||||
if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
|
||||
return -EINVAL;
|
||||
|
||||
return kvm_alloc_memslot_metadata(kvm, new);
|
||||
}
|
||||
|
||||
if (change == KVM_MR_FLAGS_ONLY)
|
||||
memcpy(&new->arch, &old->arch, sizeof(old->arch));
|
||||
|
|
|
@ -445,7 +445,13 @@ struct kvm_run {
|
|||
#define KVM_SYSTEM_EVENT_RESET 2
|
||||
#define KVM_SYSTEM_EVENT_CRASH 3
|
||||
__u32 type;
|
||||
__u64 flags;
|
||||
__u32 ndata;
|
||||
union {
|
||||
#ifndef __KERNEL__
|
||||
__u64 flags;
|
||||
#endif
|
||||
__u64 data[16];
|
||||
};
|
||||
} system_event;
|
||||
/* KVM_EXIT_S390_STSI */
|
||||
struct {
|
||||
|
@ -1144,6 +1150,8 @@ struct kvm_ppc_resize_hpt {
|
|||
#define KVM_CAP_S390_MEM_OP_EXTENSION 211
|
||||
#define KVM_CAP_PMU_CAPABILITY 212
|
||||
#define KVM_CAP_DISABLE_QUIRKS2 213
|
||||
/* #define KVM_CAP_VM_TSC_CONTROL 214 */
|
||||
#define KVM_CAP_SYSTEM_EVENT_DATA 215
|
||||
|
||||
#ifdef KVM_CAP_IRQ_ROUTING
|
||||
|
||||
|
|
|
@ -4354,6 +4354,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
|
|||
return 0;
|
||||
#endif
|
||||
case KVM_CAP_BINARY_STATS_FD:
|
||||
case KVM_CAP_SYSTEM_EVENT_DATA:
|
||||
return 1;
|
||||
default:
|
||||
break;
|
||||
|
|
Loading…
Reference in New Issue