* Build fixes for PPC KVM
* Miscellaneous bugfixes for ARM KVM * Cleanup of memory barrier and removal of redundant barriers * x86 fixes: page tracking oops, support for old buggy KVM nested on 4.5 * Support for protection keys in guests * Lockdep fix * Another conversion to simple wait queues and raw spinlocks, backported from PREEMPT_RT -----BEGIN PGP SIGNATURE----- Version: GnuPG v2.0.22 (GNU/Linux) iQEcBAABAgAGBQJW8aaXAAoJEL/70l94x66D7voH/i2ytj6PbuWQQobSKDY38x8F MHDFJ5UgTFZPPt8cB8YiCl6Tu0C5I2mNOk0rfb+bcpM5C1U9IAnBbbupyUblp6K9 1u+u+al8IlnOsoLzJSUXKDK5H4mEVrUnwVxTpZol5Ph5qQ8FvpbkxboMu3AGevO5 PIUXucK7fP5WXVV3Nh4YnUnBkeYzuuXcqYcV/TjscNQ4NcMofElcgpBxmE498TTk rvhyuf2chEtY2DsDh3nzeYgxcGLpvE4/l5a+puEoOx4M5CH24wwne9LHAWJz6ofm H3XNhsCz3jIGmrNqqkGUUya5qSkCsq2ha7n+VDw+fiP1TKy3FtkrBYQrDj+ISuc= =UtvF -----END PGP SIGNATURE----- Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm Pull more KVM updates from Paolo Bonzini: "Second round of KVM changes for 4.6: - build fixes for PPC KVM - miscellaneous bugfixes for ARM KVM - cleanup of memory barrier and removal of redundant barriers - x86 fixes: page tracking oops, support for old buggy KVM nested on 4.5 - support for protection keys in guests - lockdep fix - another conversion to simple wait queues and raw spinlocks, backported from PREEMPT_RT" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (27 commits) KVM: page_track: fix access to NULL slot KVM: PPC: do not compile in vfio.o unconditionally kvm, rt: change async pagefault code locking for PREEMPT_RT KVM/PPC: update the comment of memory barrier in the kvmppc_prepare_to_enter() KVM/x86: update the comment of memory barrier in the vcpu_enter_guest() KVM: Replace smp_mb() with smp_load_acquire() in the kvm_flush_remote_tlbs() KVM/x86: Call smp_wmb() before increasing tlbs_dirty KVM: Replace smp_mb() with smp_mb_after_atomic() in the kvm_make_all_cpus_request() KVM/x86: Replace smp_mb() with smp_store_mb/release() in the walk_shadow_page_lockless_begin/end() KVM: Remove 
redundant smp_mb() in the kvm_mmu_commit_zap_page() KVM, pkeys: expose CPUID/CR4 to guest KVM, pkeys: add pkeys support for permission_fault KVM, pkeys: introduce pkru_mask to cache conditions KVM, pkeys: save/restore PKRU when guest/host switches x86: pkey: introduce write_pkru() for KVM KVM, pkeys: add pkeys support for xsave state KVM, pkeys: disable pkeys for guests in non-paging mode KVM: x86: remove magic number with enum cpuid_leafs KVM: MMU: return page fault error code from permission_fault KVM: fix spin_lock_init order on x86 ...
This commit is contained in:
commit
b91d9c6716
|
@ -373,7 +373,9 @@ static void exit_vm_noop(void *info)
|
|||
|
||||
void force_vm_exit(const cpumask_t *mask)
|
||||
{
|
||||
preempt_disable();
|
||||
smp_call_function_many(mask, exit_vm_noop, NULL, true);
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -26,7 +26,13 @@
|
|||
#define KVM_ARM64_DEBUG_DIRTY_SHIFT 0
|
||||
#define KVM_ARM64_DEBUG_DIRTY (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT)
|
||||
|
||||
#define kvm_ksym_ref(sym) phys_to_virt((u64)&sym - kimage_voffset)
|
||||
#define kvm_ksym_ref(sym) \
|
||||
({ \
|
||||
void *val = &sym; \
|
||||
if (!is_kernel_in_hyp_mode()) \
|
||||
val = phys_to_virt((u64)&sym - kimage_voffset); \
|
||||
val; \
|
||||
})
|
||||
|
||||
#ifndef __ASSEMBLY__
|
||||
struct kvm;
|
||||
|
|
|
@ -16,3 +16,7 @@ obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o
|
|||
obj-$(CONFIG_KVM_ARM_HOST) += tlb.o
|
||||
obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o
|
||||
obj-$(CONFIG_KVM_ARM_HOST) += s2-setup.o
|
||||
|
||||
GCOV_PROFILE := n
|
||||
KASAN_SANITIZE := n
|
||||
UBSAN_SANITIZE := n
|
||||
|
|
|
@ -8,7 +8,8 @@ ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
|
|||
KVM := ../../../virt/kvm
|
||||
|
||||
common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
|
||||
$(KVM)/eventfd.o $(KVM)/vfio.o
|
||||
$(KVM)/eventfd.o
|
||||
common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o
|
||||
|
||||
CFLAGS_e500_mmu.o := -I.
|
||||
CFLAGS_e500_mmu_host.o := -I.
|
||||
|
|
|
@ -209,6 +209,32 @@ fail:
|
|||
return ret;
|
||||
}
|
||||
|
||||
long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
|
||||
unsigned long ioba, unsigned long tce)
|
||||
{
|
||||
struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn);
|
||||
long ret;
|
||||
|
||||
/* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
|
||||
/* liobn, ioba, tce); */
|
||||
|
||||
if (!stt)
|
||||
return H_TOO_HARD;
|
||||
|
||||
ret = kvmppc_ioba_validate(stt, ioba, 1);
|
||||
if (ret != H_SUCCESS)
|
||||
return ret;
|
||||
|
||||
ret = kvmppc_tce_validate(stt, tce);
|
||||
if (ret != H_SUCCESS)
|
||||
return ret;
|
||||
|
||||
kvmppc_tce_put(stt, ioba >> stt->page_shift, tce);
|
||||
|
||||
return H_SUCCESS;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvmppc_h_put_tce);
|
||||
|
||||
long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
|
||||
unsigned long liobn, unsigned long ioba,
|
||||
unsigned long tce_list, unsigned long npages)
|
||||
|
@ -264,3 +290,29 @@ unlock_exit:
|
|||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvmppc_h_put_tce_indirect);
|
||||
|
||||
long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
|
||||
unsigned long liobn, unsigned long ioba,
|
||||
unsigned long tce_value, unsigned long npages)
|
||||
{
|
||||
struct kvmppc_spapr_tce_table *stt;
|
||||
long i, ret;
|
||||
|
||||
stt = kvmppc_find_table(vcpu, liobn);
|
||||
if (!stt)
|
||||
return H_TOO_HARD;
|
||||
|
||||
ret = kvmppc_ioba_validate(stt, ioba, npages);
|
||||
if (ret != H_SUCCESS)
|
||||
return ret;
|
||||
|
||||
/* Check permission bits only to allow userspace poison TCE for debug */
|
||||
if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ))
|
||||
return H_PARAMETER;
|
||||
|
||||
for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
|
||||
kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);
|
||||
|
||||
return H_SUCCESS;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce);
|
||||
|
|
|
@ -180,8 +180,8 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
|
|||
EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
|
||||
|
||||
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
|
||||
long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
|
||||
unsigned long ioba, unsigned long tce)
|
||||
long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
|
||||
unsigned long ioba, unsigned long tce)
|
||||
{
|
||||
struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn);
|
||||
long ret;
|
||||
|
@ -204,7 +204,6 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
|
|||
|
||||
return H_SUCCESS;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvmppc_h_put_tce);
|
||||
|
||||
static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu,
|
||||
unsigned long ua, unsigned long *phpa)
|
||||
|
@ -296,7 +295,7 @@ unlock_exit:
|
|||
return ret;
|
||||
}
|
||||
|
||||
long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
|
||||
long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
|
||||
unsigned long liobn, unsigned long ioba,
|
||||
unsigned long tce_value, unsigned long npages)
|
||||
{
|
||||
|
@ -320,7 +319,6 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
|
|||
|
||||
return H_SUCCESS;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce);
|
||||
|
||||
long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
|
||||
unsigned long ioba)
|
||||
|
|
|
@ -1942,7 +1942,7 @@ hcall_real_table:
|
|||
.long DOTSYM(kvmppc_h_clear_ref) - hcall_real_table
|
||||
.long DOTSYM(kvmppc_h_protect) - hcall_real_table
|
||||
.long DOTSYM(kvmppc_h_get_tce) - hcall_real_table
|
||||
.long DOTSYM(kvmppc_h_put_tce) - hcall_real_table
|
||||
.long DOTSYM(kvmppc_rm_h_put_tce) - hcall_real_table
|
||||
.long 0 /* 0x24 - H_SET_SPRG0 */
|
||||
.long DOTSYM(kvmppc_h_set_dabr) - hcall_real_table
|
||||
.long 0 /* 0x2c */
|
||||
|
@ -2020,7 +2020,7 @@ hcall_real_table:
|
|||
.long 0 /* 0x12c */
|
||||
.long 0 /* 0x130 */
|
||||
.long DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table
|
||||
.long DOTSYM(kvmppc_h_stuff_tce) - hcall_real_table
|
||||
.long DOTSYM(kvmppc_rm_h_stuff_tce) - hcall_real_table
|
||||
.long DOTSYM(kvmppc_rm_h_put_tce_indirect) - hcall_real_table
|
||||
.long 0 /* 0x140 */
|
||||
.long 0 /* 0x144 */
|
||||
|
|
|
@ -96,6 +96,9 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
|
|||
* so we don't miss a request because the requester sees
|
||||
* OUTSIDE_GUEST_MODE and assumes we'll be checking requests
|
||||
* before next entering the guest (and thus doesn't IPI).
|
||||
* This also orders the write to mode from any reads
|
||||
* to the page tables done while the VCPU is running.
|
||||
* Please see the comment in kvm_flush_remote_tlbs.
|
||||
*/
|
||||
smp_mb();
|
||||
|
||||
|
|
|
@ -84,7 +84,8 @@
|
|||
| X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
|
||||
| X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
|
||||
| X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
|
||||
| X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP))
|
||||
| X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP \
|
||||
| X86_CR4_PKE))
|
||||
|
||||
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
|
||||
|
||||
|
@ -187,12 +188,14 @@ enum {
|
|||
#define PFERR_USER_BIT 2
|
||||
#define PFERR_RSVD_BIT 3
|
||||
#define PFERR_FETCH_BIT 4
|
||||
#define PFERR_PK_BIT 5
|
||||
|
||||
#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
|
||||
#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
|
||||
#define PFERR_USER_MASK (1U << PFERR_USER_BIT)
|
||||
#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
|
||||
#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
|
||||
#define PFERR_PK_MASK (1U << PFERR_PK_BIT)
|
||||
|
||||
/* apic attention bits */
|
||||
#define KVM_APIC_CHECK_VAPIC 0
|
||||
|
@ -335,6 +338,14 @@ struct kvm_mmu {
|
|||
*/
|
||||
u8 permissions[16];
|
||||
|
||||
/*
|
||||
* The pkru_mask indicates if protection key checks are needed. It
|
||||
* consists of 16 domains indexed by page fault error code bits [4:1],
|
||||
* with PFEC.RSVD replaced by ACC_USER_MASK from the page tables.
|
||||
* Each domain has 2 bits which are ANDed with AD and WD from PKRU.
|
||||
*/
|
||||
u32 pkru_mask;
|
||||
|
||||
u64 *pae_root;
|
||||
u64 *lm_root;
|
||||
|
||||
|
@ -874,6 +885,7 @@ struct kvm_x86_ops {
|
|||
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
|
||||
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
|
||||
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
|
||||
u32 (*get_pkru)(struct kvm_vcpu *vcpu);
|
||||
void (*fpu_activate)(struct kvm_vcpu *vcpu);
|
||||
void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
|
||||
|
||||
|
|
|
@ -107,6 +107,12 @@ static inline u32 read_pkru(void)
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Load @pkru into the PKRU register via __write_pkru(), but only when the
 * boot CPU reports X86_FEATURE_OSPKE; otherwise this is a no-op.
 */
static inline void write_pkru(u32 pkru)
{
	if (!boot_cpu_has(X86_FEATURE_OSPKE))
		return;

	__write_pkru(pkru);
}
|
||||
|
||||
static inline int pte_young(pte_t pte)
|
||||
{
|
||||
return pte_flags(pte) & _PAGE_ACCESSED;
|
||||
|
|
|
@ -113,11 +113,27 @@ static inline u32 __read_pkru(void)
|
|||
: "c" (ecx));
|
||||
return pkru;
|
||||
}
|
||||
|
||||
/*
 * Emit the "wrpkru" instruction as raw opcode bytes.
 *
 * The instruction loads the contents of EAX into PKRU and requires
 * ECX = EDX = 0, so @pkru is bound to EAX and a zero constant is bound to
 * both ECX and EDX.
 */
static inline void __write_pkru(u32 pkru)
{
	const u32 zero = 0;

	asm volatile(".byte 0x0f,0x01,0xef\n\t"
		     : /* no outputs */
		     : "a" (pkru), "c" (zero), "d" (zero));
}
|
||||
#else
|
||||
static inline u32 __read_pkru(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void __write_pkru(u32 pkru)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline void native_wbinvd(void)
|
||||
|
|
|
@ -36,6 +36,7 @@
|
|||
#include <linux/kprobes.h>
|
||||
#include <linux/debugfs.h>
|
||||
#include <linux/nmi.h>
|
||||
#include <linux/swait.h>
|
||||
#include <asm/timer.h>
|
||||
#include <asm/cpu.h>
|
||||
#include <asm/traps.h>
|
||||
|
@ -91,14 +92,14 @@ static void kvm_io_delay(void)
|
|||
|
||||
struct kvm_task_sleep_node {
|
||||
struct hlist_node link;
|
||||
wait_queue_head_t wq;
|
||||
struct swait_queue_head wq;
|
||||
u32 token;
|
||||
int cpu;
|
||||
bool halted;
|
||||
};
|
||||
|
||||
static struct kvm_task_sleep_head {
|
||||
spinlock_t lock;
|
||||
raw_spinlock_t lock;
|
||||
struct hlist_head list;
|
||||
} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
|
||||
|
||||
|
@ -122,17 +123,17 @@ void kvm_async_pf_task_wait(u32 token)
|
|||
u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
|
||||
struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
|
||||
struct kvm_task_sleep_node n, *e;
|
||||
DEFINE_WAIT(wait);
|
||||
DECLARE_SWAITQUEUE(wait);
|
||||
|
||||
rcu_irq_enter();
|
||||
|
||||
spin_lock(&b->lock);
|
||||
raw_spin_lock(&b->lock);
|
||||
e = _find_apf_task(b, token);
|
||||
if (e) {
|
||||
/* dummy entry exists -> wake up was delivered ahead of PF */
|
||||
hlist_del(&e->link);
|
||||
kfree(e);
|
||||
spin_unlock(&b->lock);
|
||||
raw_spin_unlock(&b->lock);
|
||||
|
||||
rcu_irq_exit();
|
||||
return;
|
||||
|
@ -141,13 +142,13 @@ void kvm_async_pf_task_wait(u32 token)
|
|||
n.token = token;
|
||||
n.cpu = smp_processor_id();
|
||||
n.halted = is_idle_task(current) || preempt_count() > 1;
|
||||
init_waitqueue_head(&n.wq);
|
||||
init_swait_queue_head(&n.wq);
|
||||
hlist_add_head(&n.link, &b->list);
|
||||
spin_unlock(&b->lock);
|
||||
raw_spin_unlock(&b->lock);
|
||||
|
||||
for (;;) {
|
||||
if (!n.halted)
|
||||
prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
|
||||
prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
|
||||
if (hlist_unhashed(&n.link))
|
||||
break;
|
||||
|
||||
|
@ -166,7 +167,7 @@ void kvm_async_pf_task_wait(u32 token)
|
|||
}
|
||||
}
|
||||
if (!n.halted)
|
||||
finish_wait(&n.wq, &wait);
|
||||
finish_swait(&n.wq, &wait);
|
||||
|
||||
rcu_irq_exit();
|
||||
return;
|
||||
|
@ -178,8 +179,8 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n)
|
|||
hlist_del_init(&n->link);
|
||||
if (n->halted)
|
||||
smp_send_reschedule(n->cpu);
|
||||
else if (waitqueue_active(&n->wq))
|
||||
wake_up(&n->wq);
|
||||
else if (swait_active(&n->wq))
|
||||
swake_up(&n->wq);
|
||||
}
|
||||
|
||||
static void apf_task_wake_all(void)
|
||||
|
@ -189,14 +190,14 @@ static void apf_task_wake_all(void)
|
|||
for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
|
||||
struct hlist_node *p, *next;
|
||||
struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
|
||||
spin_lock(&b->lock);
|
||||
raw_spin_lock(&b->lock);
|
||||
hlist_for_each_safe(p, next, &b->list) {
|
||||
struct kvm_task_sleep_node *n =
|
||||
hlist_entry(p, typeof(*n), link);
|
||||
if (n->cpu == smp_processor_id())
|
||||
apf_task_wake_one(n);
|
||||
}
|
||||
spin_unlock(&b->lock);
|
||||
raw_spin_unlock(&b->lock);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -212,7 +213,7 @@ void kvm_async_pf_task_wake(u32 token)
|
|||
}
|
||||
|
||||
again:
|
||||
spin_lock(&b->lock);
|
||||
raw_spin_lock(&b->lock);
|
||||
n = _find_apf_task(b, token);
|
||||
if (!n) {
|
||||
/*
|
||||
|
@ -225,17 +226,17 @@ again:
|
|||
* Allocation failed! Busy wait while other cpu
|
||||
* handles async PF.
|
||||
*/
|
||||
spin_unlock(&b->lock);
|
||||
raw_spin_unlock(&b->lock);
|
||||
cpu_relax();
|
||||
goto again;
|
||||
}
|
||||
n->token = token;
|
||||
n->cpu = smp_processor_id();
|
||||
init_waitqueue_head(&n->wq);
|
||||
init_swait_queue_head(&n->wq);
|
||||
hlist_add_head(&n->link, &b->list);
|
||||
} else
|
||||
apf_task_wake_one(n);
|
||||
spin_unlock(&b->lock);
|
||||
raw_spin_unlock(&b->lock);
|
||||
return;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
|
||||
|
@ -486,7 +487,7 @@ void __init kvm_guest_init(void)
|
|||
paravirt_ops_setup();
|
||||
register_reboot_notifier(&kvm_pv_reboot_nb);
|
||||
for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
|
||||
spin_lock_init(&async_pf_sleepers[i].lock);
|
||||
raw_spin_lock_init(&async_pf_sleepers[i].lock);
|
||||
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
|
||||
x86_init.irqs.trap_init = kvm_apf_trap_init;
|
||||
|
||||
|
|
|
@ -88,6 +88,16 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
|
|||
apic->lapic_timer.timer_mode_mask = 1 << 17;
|
||||
}
|
||||
|
||||
best = kvm_find_cpuid_entry(vcpu, 7, 0);
|
||||
if (best) {
|
||||
/* Update OSPKE bit */
|
||||
if (boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) {
|
||||
best->ecx &= ~F(OSPKE);
|
||||
if (kvm_read_cr4_bits(vcpu, X86_CR4_PKE))
|
||||
best->ecx |= F(OSPKE);
|
||||
}
|
||||
}
|
||||
|
||||
best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
|
||||
if (!best) {
|
||||
vcpu->arch.guest_supported_xcr0 = 0;
|
||||
|
@ -305,7 +315,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
|
|||
unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
|
||||
|
||||
/* cpuid 1.edx */
|
||||
const u32 kvm_supported_word0_x86_features =
|
||||
const u32 kvm_cpuid_1_edx_x86_features =
|
||||
F(FPU) | F(VME) | F(DE) | F(PSE) |
|
||||
F(TSC) | F(MSR) | F(PAE) | F(MCE) |
|
||||
F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
|
||||
|
@ -315,7 +325,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
|
|||
F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
|
||||
0 /* HTT, TM, Reserved, PBE */;
|
||||
/* cpuid 0x80000001.edx */
|
||||
const u32 kvm_supported_word1_x86_features =
|
||||
const u32 kvm_cpuid_8000_0001_edx_x86_features =
|
||||
F(FPU) | F(VME) | F(DE) | F(PSE) |
|
||||
F(TSC) | F(MSR) | F(PAE) | F(MCE) |
|
||||
F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
|
||||
|
@ -325,7 +335,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
|
|||
F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
|
||||
0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
|
||||
/* cpuid 1.ecx */
|
||||
const u32 kvm_supported_word4_x86_features =
|
||||
const u32 kvm_cpuid_1_ecx_x86_features =
|
||||
/* NOTE: MONITOR (and MWAIT) are emulated as NOP,
|
||||
* but *not* advertised to guests via CPUID ! */
|
||||
F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
|
||||
|
@ -337,29 +347,32 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
|
|||
0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
|
||||
F(F16C) | F(RDRAND);
|
||||
/* cpuid 0x80000001.ecx */
|
||||
const u32 kvm_supported_word6_x86_features =
|
||||
const u32 kvm_cpuid_8000_0001_ecx_x86_features =
|
||||
F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
|
||||
F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
|
||||
F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
|
||||
0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
|
||||
|
||||
/* cpuid 0xC0000001.edx */
|
||||
const u32 kvm_supported_word5_x86_features =
|
||||
const u32 kvm_cpuid_C000_0001_edx_x86_features =
|
||||
F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
|
||||
F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
|
||||
F(PMM) | F(PMM_EN);
|
||||
|
||||
/* cpuid 7.0.ebx */
|
||||
const u32 kvm_supported_word9_x86_features =
|
||||
const u32 kvm_cpuid_7_0_ebx_x86_features =
|
||||
F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
|
||||
F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
|
||||
F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
|
||||
F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT);
|
||||
|
||||
/* cpuid 0xD.1.eax */
|
||||
const u32 kvm_supported_word10_x86_features =
|
||||
const u32 kvm_cpuid_D_1_eax_x86_features =
|
||||
F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves;
|
||||
|
||||
/* cpuid 7.0.ecx*/
|
||||
const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/;
|
||||
|
||||
/* all calls to cpuid_count() should be made on the same cpu */
|
||||
get_cpu();
|
||||
|
||||
|
@ -376,10 +389,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
|
|||
entry->eax = min(entry->eax, (u32)0xd);
|
||||
break;
|
||||
case 1:
|
||||
entry->edx &= kvm_supported_word0_x86_features;
|
||||
cpuid_mask(&entry->edx, 0);
|
||||
entry->ecx &= kvm_supported_word4_x86_features;
|
||||
cpuid_mask(&entry->ecx, 4);
|
||||
entry->edx &= kvm_cpuid_1_edx_x86_features;
|
||||
cpuid_mask(&entry->edx, CPUID_1_EDX);
|
||||
entry->ecx &= kvm_cpuid_1_ecx_x86_features;
|
||||
cpuid_mask(&entry->ecx, CPUID_1_ECX);
|
||||
/* we support x2apic emulation even if host does not support
|
||||
* it since we emulate x2apic in software */
|
||||
entry->ecx |= F(X2APIC);
|
||||
|
@ -433,14 +446,20 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
|
|||
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
|
||||
/* Mask ebx against host capability word 9 */
|
||||
if (index == 0) {
|
||||
entry->ebx &= kvm_supported_word9_x86_features;
|
||||
cpuid_mask(&entry->ebx, 9);
|
||||
entry->ebx &= kvm_cpuid_7_0_ebx_x86_features;
|
||||
cpuid_mask(&entry->ebx, CPUID_7_0_EBX);
|
||||
// TSC_ADJUST is emulated
|
||||
entry->ebx |= F(TSC_ADJUST);
|
||||
} else
|
||||
entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
|
||||
cpuid_mask(&entry->ecx, CPUID_7_ECX);
|
||||
/* PKU is not yet implemented for shadow paging. */
|
||||
if (!tdp_enabled)
|
||||
entry->ecx &= ~F(PKU);
|
||||
} else {
|
||||
entry->ebx = 0;
|
||||
entry->ecx = 0;
|
||||
}
|
||||
entry->eax = 0;
|
||||
entry->ecx = 0;
|
||||
entry->edx = 0;
|
||||
break;
|
||||
}
|
||||
|
@ -514,7 +533,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
|
|||
|
||||
do_cpuid_1_ent(&entry[i], function, idx);
|
||||
if (idx == 1) {
|
||||
entry[i].eax &= kvm_supported_word10_x86_features;
|
||||
entry[i].eax &= kvm_cpuid_D_1_eax_x86_features;
|
||||
entry[i].ebx = 0;
|
||||
if (entry[i].eax & (F(XSAVES)|F(XSAVEC)))
|
||||
entry[i].ebx =
|
||||
|
@ -564,10 +583,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
|
|||
entry->eax = min(entry->eax, 0x8000001a);
|
||||
break;
|
||||
case 0x80000001:
|
||||
entry->edx &= kvm_supported_word1_x86_features;
|
||||
cpuid_mask(&entry->edx, 1);
|
||||
entry->ecx &= kvm_supported_word6_x86_features;
|
||||
cpuid_mask(&entry->ecx, 6);
|
||||
entry->edx &= kvm_cpuid_8000_0001_edx_x86_features;
|
||||
cpuid_mask(&entry->edx, CPUID_8000_0001_EDX);
|
||||
entry->ecx &= kvm_cpuid_8000_0001_ecx_x86_features;
|
||||
cpuid_mask(&entry->ecx, CPUID_8000_0001_ECX);
|
||||
break;
|
||||
case 0x80000007: /* Advanced power management */
|
||||
/* invariant TSC is CPUID.80000007H:EDX[8] */
|
||||
|
@ -600,8 +619,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
|
|||
entry->eax = min(entry->eax, 0xC0000004);
|
||||
break;
|
||||
case 0xC0000001:
|
||||
entry->edx &= kvm_supported_word5_x86_features;
|
||||
cpuid_mask(&entry->edx, 5);
|
||||
entry->edx &= kvm_cpuid_C000_0001_edx_x86_features;
|
||||
cpuid_mask(&entry->edx, CPUID_C000_0001_EDX);
|
||||
break;
|
||||
case 3: /* Processor serial number */
|
||||
case 5: /* MONITOR/MWAIT */
|
||||
|
|
|
@ -80,6 +80,14 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
|
|||
return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
|
||||
}
|
||||
|
||||
static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct kvm_cpuid_entry2 *best;
|
||||
|
||||
best = kvm_find_cpuid_entry(vcpu, 7, 0);
|
||||
return best && (best->ecx & bit(X86_FEATURE_PKU));
|
||||
}
|
||||
|
||||
static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct kvm_cpuid_entry2 *best;
|
||||
|
|
|
@ -84,6 +84,11 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
|
|||
| ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
|
||||
}
|
||||
|
||||
static inline u32 kvm_read_pkru(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
return kvm_x86_ops->get_pkru(vcpu);
|
||||
}
|
||||
|
||||
static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
vcpu->arch.hflags |= HF_GUEST_MASK;
|
||||
|
|
|
@ -632,12 +632,12 @@ static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
|
|||
* kvm_flush_remote_tlbs() IPI to all active vcpus.
|
||||
*/
|
||||
local_irq_disable();
|
||||
vcpu->mode = READING_SHADOW_PAGE_TABLES;
|
||||
|
||||
/*
|
||||
* Make sure a following spte read is not reordered ahead of the write
|
||||
* to vcpu->mode.
|
||||
*/
|
||||
smp_mb();
|
||||
smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
|
||||
}
|
||||
|
||||
static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
|
||||
|
@ -647,8 +647,7 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
|
|||
* reads to sptes. If it does, kvm_commit_zap_page() can see us
|
||||
* OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
|
||||
*/
|
||||
smp_mb();
|
||||
vcpu->mode = OUTSIDE_GUEST_MODE;
|
||||
smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
|
||||
local_irq_enable();
|
||||
}
|
||||
|
||||
|
@ -2390,14 +2389,13 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
|
|||
return;
|
||||
|
||||
/*
|
||||
* wmb: make sure everyone sees our modifications to the page tables
|
||||
* rmb: make sure we see changes to vcpu->mode
|
||||
*/
|
||||
smp_mb();
|
||||
|
||||
/*
|
||||
* Wait for all vcpus to exit guest mode and/or lockless shadow
|
||||
* page table walks.
|
||||
* We need to make sure everyone sees our modifications to
|
||||
* the page tables and that we see changes to vcpu->mode here. The barrier
|
||||
* in the kvm_flush_remote_tlbs() achieves this. This pairs
|
||||
* with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
|
||||
*
|
||||
* In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
|
||||
* guest mode and/or lockless shadow page table walks.
|
||||
*/
|
||||
kvm_flush_remote_tlbs(kvm);
|
||||
|
||||
|
@ -3923,6 +3921,81 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* PKU is an additional mechanism by which the paging controls access to
|
||||
* user-mode addresses based on the value in the PKRU register. Protection
|
||||
* key violations are reported through a bit in the page fault error code.
|
||||
* Unlike other bits of the error code, the PK bit is not known at the
|
||||
* call site of e.g. gva_to_gpa; it must be computed directly in
|
||||
* permission_fault based on two bits of PKRU, on some machine state (CR4,
|
||||
* CR0, EFER, CPL), and on other bits of the error code and the page tables.
|
||||
*
|
||||
* In particular the following conditions come from the error code, the
|
||||
* page tables and the machine state:
|
||||
* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
|
||||
* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
|
||||
* - PK is always zero if U=0 in the page tables
|
||||
* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
|
||||
*
|
||||
* The PKRU bitmask caches the result of these four conditions. The error
|
||||
* code (minus the P bit) and the page table's U bit form an index into the
|
||||
* PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed
|
||||
* with the two bits of the PKRU register corresponding to the protection key.
|
||||
* For the first three conditions above the bits will be 00, thus masking
|
||||
* away both AD and WD. For all reads or if the last condition holds, WD
|
||||
* only will be masked away.
|
||||
*/
|
||||
static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
|
||||
bool ept)
|
||||
{
|
||||
unsigned bit;
|
||||
bool wp;
|
||||
|
||||
if (ept) {
|
||||
mmu->pkru_mask = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
/* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
|
||||
if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
|
||||
mmu->pkru_mask = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
wp = is_write_protection(vcpu);
|
||||
|
||||
for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
|
||||
unsigned pfec, pkey_bits;
|
||||
bool check_pkey, check_write, ff, uf, wf, pte_user;
|
||||
|
||||
pfec = bit << 1;
|
||||
ff = pfec & PFERR_FETCH_MASK;
|
||||
uf = pfec & PFERR_USER_MASK;
|
||||
wf = pfec & PFERR_WRITE_MASK;
|
||||
|
||||
/* PFEC.RSVD is replaced by ACC_USER_MASK. */
|
||||
pte_user = pfec & PFERR_RSVD_MASK;
|
||||
|
||||
/*
|
||||
* Only need to check the access which is not an
|
||||
* instruction fetch and is to a user page.
|
||||
*/
|
||||
check_pkey = (!ff && pte_user);
|
||||
/*
|
||||
* write access is controlled by PKRU if it is a
|
||||
* user access or CR0.WP = 1.
|
||||
*/
|
||||
check_write = check_pkey && wf && (uf || wp);
|
||||
|
||||
/* PKRU.AD stops both read and write access. */
|
||||
pkey_bits = !!check_pkey;
|
||||
/* PKRU.WD stops write access. */
|
||||
pkey_bits |= (!!check_write) << 1;
|
||||
|
||||
mmu->pkru_mask |= (pkey_bits & 3) << pfec;
|
||||
}
|
||||
}
|
||||
|
||||
static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
|
||||
{
|
||||
unsigned root_level = mmu->root_level;
|
||||
|
@ -3941,6 +4014,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
|
|||
|
||||
reset_rsvds_bits_mask(vcpu, context);
|
||||
update_permission_bitmask(vcpu, context, false);
|
||||
update_pkru_bitmask(vcpu, context, false);
|
||||
update_last_nonleaf_level(vcpu, context);
|
||||
|
||||
MMU_WARN_ON(!is_pae(vcpu));
|
||||
|
@ -3968,6 +4042,7 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
|
|||
|
||||
reset_rsvds_bits_mask(vcpu, context);
|
||||
update_permission_bitmask(vcpu, context, false);
|
||||
update_pkru_bitmask(vcpu, context, false);
|
||||
update_last_nonleaf_level(vcpu, context);
|
||||
|
||||
context->page_fault = paging32_page_fault;
|
||||
|
@ -4026,6 +4101,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
|
|||
}
|
||||
|
||||
update_permission_bitmask(vcpu, context, false);
|
||||
update_pkru_bitmask(vcpu, context, false);
|
||||
update_last_nonleaf_level(vcpu, context);
|
||||
reset_tdp_shadow_zero_bits_mask(vcpu, context);
|
||||
}
|
||||
|
@ -4078,6 +4154,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
|
|||
context->direct_map = false;
|
||||
|
||||
update_permission_bitmask(vcpu, context, true);
|
||||
update_pkru_bitmask(vcpu, context, true);
|
||||
reset_rsvds_bits_mask_ept(vcpu, context, execonly);
|
||||
reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
|
||||
}
|
||||
|
@ -4132,6 +4209,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
|
|||
}
|
||||
|
||||
update_permission_bitmask(vcpu, g_context, false);
|
||||
update_pkru_bitmask(vcpu, g_context, false);
|
||||
update_last_nonleaf_level(vcpu, g_context);
|
||||
}
|
||||
|
||||
|
|
|
@ -10,10 +10,11 @@
|
|||
#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
|
||||
|
||||
#define PT_WRITABLE_SHIFT 1
|
||||
#define PT_USER_SHIFT 2
|
||||
|
||||
#define PT_PRESENT_MASK (1ULL << 0)
|
||||
#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
|
||||
#define PT_USER_MASK (1ULL << 2)
|
||||
#define PT_USER_MASK (1ULL << PT_USER_SHIFT)
|
||||
#define PT_PWT_MASK (1ULL << 3)
|
||||
#define PT_PCD_MASK (1ULL << 4)
|
||||
#define PT_ACCESSED_SHIFT 5
|
||||
|
@ -141,11 +142,16 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu)
|
|||
}
|
||||
|
||||
/*
|
||||
* Will a fault with a given page-fault error code (pfec) cause a permission
|
||||
* fault with the given access (in ACC_* format)?
|
||||
* Check if a given access (described through the I/D, W/R and U/S bits of a
|
||||
* page fault error code pfec) causes a permission fault with the given PTE
|
||||
* access rights (in ACC_* format).
|
||||
*
|
||||
* Return zero if the access does not fault; return the page fault error code
|
||||
* if the access faults.
|
||||
*/
|
||||
static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
|
||||
unsigned pte_access, unsigned pfec)
|
||||
static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
|
||||
unsigned pte_access, unsigned pte_pkey,
|
||||
unsigned pfec)
|
||||
{
|
||||
int cpl = kvm_x86_ops->get_cpl(vcpu);
|
||||
unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
|
||||
|
@ -166,10 +172,32 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
|
|||
unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC);
|
||||
int index = (pfec >> 1) +
|
||||
(smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1));
|
||||
bool fault = (mmu->permissions[index] >> pte_access) & 1;
|
||||
|
||||
WARN_ON(pfec & PFERR_RSVD_MASK);
|
||||
WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK));
|
||||
pfec |= PFERR_PRESENT_MASK;
|
||||
|
||||
return (mmu->permissions[index] >> pte_access) & 1;
|
||||
if (unlikely(mmu->pkru_mask)) {
|
||||
u32 pkru_bits, offset;
|
||||
|
||||
/*
|
||||
* PKRU defines 32 bits, there are 16 domains and 2
|
||||
* attribute bits per domain in pkru. pte_pkey is the
|
||||
* index of the protection domain, so pte_pkey * 2 is
|
||||
* the index of the first bit for the domain.
|
||||
*/
|
||||
pkru_bits = (kvm_read_pkru(vcpu) >> (pte_pkey * 2)) & 3;
|
||||
|
||||
/* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
|
||||
offset = pfec - 1 +
|
||||
((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));
|
||||
|
||||
pkru_bits &= mmu->pkru_mask >> offset;
|
||||
pfec |= -pkru_bits & PFERR_PK_MASK;
|
||||
fault |= (pkru_bits != 0);
|
||||
}
|
||||
|
||||
return -(uint32_t)fault & pfec;
|
||||
}
|
||||
|
||||
void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
|
||||
|
|
|
@ -142,12 +142,17 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm,
|
|||
/*
* Check whether page tracking of the given mode is active for a guest frame.
*
* Returns false (after a WARN_ON) when page_track_mode_is_valid() rejects
* mode, and false when kvm_vcpu_gfn_to_memslot() finds no memslot for gfn.
* Otherwise returns true iff slot->arch.gfn_track[mode][index] is non-zero,
* where index is gfn_to_index() of gfn relative to slot->base_gfn at
* PT_PAGE_TABLE_LEVEL.
*
* NOTE(review): this listing appears to interleave the pre-change and
* post-change lines of the hunk: slot and index are each declared twice,
* and the initialized forms dereference slot before the !slot check.
* Presumably the initialized declarations are the removed side; confirm
* against the real file.
*/
bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
|
||||
enum kvm_page_track_mode mode)
|
||||
{
|
||||
struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
|
||||
int index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
|
||||
struct kvm_memory_slot *slot;
|
||||
|
||||
int index;
|
||||
|
||||
|
||||
if (WARN_ON(!page_track_mode_is_valid(mode)))
|
||||
return false;
|
||||
|
||||
|
||||
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
|
||||
if (!slot)
|
||||
return false;
|
||||
|
||||
|
||||
index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
|
||||
return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]);
|
||||
}
|
||||
|
||||
|
|
|
@ -257,6 +257,17 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
* Return the protection key encoded in a guest PTE.
*
* The key is only extracted when PTTYPE == 64, by wrapping gpte in a pte_t
* and passing its pte_flags() to pte_flags_pkey(). For every other PTTYPE
* the function returns 0. The vcpu argument is not used by the body.
*/
static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
|
||||
{
|
||||
unsigned pkeys = 0;
|
||||
#if PTTYPE == 64
|
||||
pte_t pte = {.pte = gpte};
|
||||
|
||||
pkeys = pte_flags_pkey(pte_flags(pte));
|
||||
#endif
|
||||
return pkeys;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fetch a guest pte for a guest virtual address
|
||||
*/
|
||||
|
@ -268,7 +279,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
|
|||
pt_element_t pte;
|
||||
pt_element_t __user *uninitialized_var(ptep_user);
|
||||
gfn_t table_gfn;
|
||||
unsigned index, pt_access, pte_access, accessed_dirty;
|
||||
unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey;
|
||||
gpa_t pte_gpa;
|
||||
int offset;
|
||||
const int write_fault = access & PFERR_WRITE_MASK;
|
||||
|
@ -359,10 +370,10 @@ retry_walk:
|
|||
walker->ptes[walker->level - 1] = pte;
|
||||
} while (!is_last_gpte(mmu, walker->level, pte));
|
||||
|
||||
if (unlikely(permission_fault(vcpu, mmu, pte_access, access))) {
|
||||
errcode |= PFERR_PRESENT_MASK;
|
||||
pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
|
||||
errcode = permission_fault(vcpu, mmu, pte_access, pte_pkey, access);
|
||||
if (unlikely(errcode))
|
||||
goto error;
|
||||
}
|
||||
|
||||
gfn = gpte_to_gfn_lvl(pte, walker->level);
|
||||
gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
|
||||
|
@ -949,6 +960,12 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
|
|||
return 0;
|
||||
|
||||
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
|
||||
/*
|
||||
* Update spte before increasing tlbs_dirty to make
|
||||
* sure no tlb flush is lost after spte is zapped; see
|
||||
* the comments in kvm_flush_remote_tlbs().
|
||||
*/
|
||||
smp_wmb();
|
||||
vcpu->kvm->tlbs_dirty++;
|
||||
continue;
|
||||
}
|
||||
|
@ -964,6 +981,11 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
|
|||
|
||||
if (gfn != sp->gfns[i]) {
|
||||
drop_spte(vcpu->kvm, &sp->spt[i]);
|
||||
/*
|
||||
* The same as above where we are doing
|
||||
* prefetch_invalid_gpte().
|
||||
*/
|
||||
smp_wmb();
|
||||
vcpu->kvm->tlbs_dirty++;
|
||||
continue;
|
||||
}
|
||||
|
|
|
@ -1280,6 +1280,11 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
|
|||
to_svm(vcpu)->vmcb->save.rflags = rflags;
|
||||
}
|
||||
|
||||
/*
* SVM implementation of the get_pkru hook (installed as .get_pkru in
* svm_x86_ops): unconditionally reports a PKRU value of 0. The vcpu
* argument is not used.
*/
static u32 svm_get_pkru(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
|
||||
{
|
||||
switch (reg) {
|
||||
|
@ -4347,6 +4352,9 @@ static struct kvm_x86_ops svm_x86_ops = {
|
|||
.cache_reg = svm_cache_reg,
|
||||
.get_rflags = svm_get_rflags,
|
||||
.set_rflags = svm_set_rflags,
|
||||
|
||||
.get_pkru = svm_get_pkru,
|
||||
|
||||
.fpu_activate = svm_fpu_activate,
|
||||
.fpu_deactivate = svm_fpu_deactivate,
|
||||
|
||||
|
|
|
@ -598,6 +598,10 @@ struct vcpu_vmx {
|
|||
struct page *pml_pg;
|
||||
|
||||
u64 current_tsc_ratio;
|
||||
|
||||
bool guest_pkru_valid;
|
||||
u32 guest_pkru;
|
||||
u32 host_pkru;
|
||||
};
|
||||
|
||||
enum segment_cache_field {
|
||||
|
@ -2107,6 +2111,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
|
|||
} while (cmpxchg(&pi_desc->control, old.control,
|
||||
new.control) != old.control);
|
||||
}
|
||||
|
||||
/*
|
||||
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
|
||||
* vcpu mutex is already taken.
|
||||
|
@ -2167,6 +2172,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
|
|||
}
|
||||
|
||||
vmx_vcpu_pi_load(vcpu, cpu);
|
||||
vmx->host_pkru = read_pkru();
|
||||
}
|
||||
|
||||
static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
|
||||
|
@ -2286,6 +2292,11 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
|
|||
vmcs_writel(GUEST_RFLAGS, rflags);
|
||||
}
|
||||
|
||||
/*
* VMX implementation of the get_pkru hook (installed as .get_pkru in
* vmx_x86_ops): returns the guest_pkru value cached in struct vcpu_vmx.
* vmx_vcpu_run() refreshes that cache from __read_pkru() when
* boot_cpu_has(X86_FEATURE_OSPKE).
*/
static u32 vmx_get_pkru(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
return to_vmx(vcpu)->guest_pkru;
|
||||
}
|
||||
|
||||
static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
|
||||
|
@ -2712,8 +2723,15 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
|
|||
} else
|
||||
vmx->nested.nested_vmx_ept_caps = 0;
|
||||
|
||||
/*
|
||||
* Old versions of KVM use the single-context version without
|
||||
* checking for support, so declare that it is supported even
|
||||
* though it is treated as global context. The alternative is
|
||||
* not failing the single-context invvpid, and it is worse.
|
||||
*/
|
||||
if (enable_vpid)
|
||||
vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
|
||||
VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |
|
||||
VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
|
||||
else
|
||||
vmx->nested.nested_vmx_vpid_caps = 0;
|
||||
|
@ -3886,13 +3904,17 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
|
|||
|
||||
if (!enable_unrestricted_guest && !is_paging(vcpu))
|
||||
/*
|
||||
* SMEP/SMAP is disabled if CPU is in non-paging mode in
|
||||
* hardware. However KVM always uses paging mode without
|
||||
* unrestricted guest.
|
||||
* To emulate this behavior, SMEP/SMAP needs to be manually
|
||||
* disabled when guest switches to non-paging mode.
|
||||
* SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
|
||||
* hardware. To emulate this behavior, SMEP/SMAP/PKU needs
|
||||
* to be manually disabled when guest switches to non-paging
|
||||
* mode.
|
||||
*
|
||||
* If !enable_unrestricted_guest, the CPU is always running
|
||||
* with CR0.PG=1 and CR4 needs to be modified.
|
||||
* If enable_unrestricted_guest, the CPU automatically
|
||||
* disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
|
||||
*/
|
||||
hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
|
||||
hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
|
||||
|
||||
vmcs_writel(CR4_READ_SHADOW, cr4);
|
||||
vmcs_writel(GUEST_CR4, hw_cr4);
|
||||
|
@ -7399,6 +7421,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
|
|||
if (!(types & (1UL << type))) {
|
||||
nested_vmx_failValid(vcpu,
|
||||
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
|
||||
skip_emulated_instruction(vcpu);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -7457,6 +7480,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
|
|||
if (!(types & (1UL << type))) {
|
||||
nested_vmx_failValid(vcpu,
|
||||
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
|
||||
skip_emulated_instruction(vcpu);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -7473,12 +7497,17 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
|
|||
}
|
||||
|
||||
switch (type) {
|
||||
case VMX_VPID_EXTENT_SINGLE_CONTEXT:
|
||||
/*
|
||||
* Old versions of KVM use the single-context version so we
|
||||
* have to support it; just treat it the same as all-context.
|
||||
*/
|
||||
case VMX_VPID_EXTENT_ALL_CONTEXT:
|
||||
__vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
|
||||
nested_vmx_succeed(vcpu);
|
||||
break;
|
||||
default:
|
||||
/* Trap single context invalidation invvpid calls */
|
||||
/* Trap individual address invalidation invvpid calls */
|
||||
BUG_ON(1);
|
||||
break;
|
||||
}
|
||||
|
@ -8621,6 +8650,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
|
|||
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
|
||||
vmx_set_interrupt_shadow(vcpu, 0);
|
||||
|
||||
if (vmx->guest_pkru_valid)
|
||||
__write_pkru(vmx->guest_pkru);
|
||||
|
||||
atomic_switch_perf_msrs(vmx);
|
||||
debugctlmsr = get_debugctlmsr();
|
||||
|
||||
|
@ -8760,6 +8792,20 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
|
|||
|
||||
vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
|
||||
|
||||
/*
|
||||
* eager fpu is enabled if PKEY is supported and CR4 is switched
|
||||
* back on host, so it is safe to read guest PKRU from current
|
||||
* XSAVE.
|
||||
*/
|
||||
if (boot_cpu_has(X86_FEATURE_OSPKE)) {
|
||||
vmx->guest_pkru = __read_pkru();
|
||||
if (vmx->guest_pkru != vmx->host_pkru) {
|
||||
vmx->guest_pkru_valid = true;
|
||||
__write_pkru(vmx->host_pkru);
|
||||
} else
|
||||
vmx->guest_pkru_valid = false;
|
||||
}
|
||||
|
||||
/*
|
||||
* the KVM_REQ_EVENT optimization bit is only on for one entry, and if
|
||||
* we did not inject a still-pending event to L1 now because of
|
||||
|
@ -10884,6 +10930,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
|
|||
.cache_reg = vmx_cache_reg,
|
||||
.get_rflags = vmx_get_rflags,
|
||||
.set_rflags = vmx_set_rflags,
|
||||
|
||||
.get_pkru = vmx_get_pkru,
|
||||
|
||||
.fpu_activate = vmx_fpu_activate,
|
||||
.fpu_deactivate = vmx_fpu_deactivate,
|
||||
|
||||
|
|
|
@ -723,7 +723,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
|
|||
{
|
||||
unsigned long old_cr4 = kvm_read_cr4(vcpu);
|
||||
unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
|
||||
X86_CR4_SMEP | X86_CR4_SMAP;
|
||||
X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
|
||||
|
||||
if (cr4 & CR4_RESERVED_BITS)
|
||||
return 1;
|
||||
|
@ -740,6 +740,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
|
|||
if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE))
|
||||
return 1;
|
||||
|
||||
if (!guest_cpuid_has_pku(vcpu) && (cr4 & X86_CR4_PKE))
|
||||
return 1;
|
||||
|
||||
if (is_long_mode(vcpu)) {
|
||||
if (!(cr4 & X86_CR4_PAE))
|
||||
return 1;
|
||||
|
@ -765,7 +768,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
|
|||
(!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
|
||||
kvm_mmu_reset_context(vcpu);
|
||||
|
||||
if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
|
||||
if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
|
||||
kvm_update_cpuid(vcpu);
|
||||
|
||||
return 0;
|
||||
|
@ -4326,9 +4329,14 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
|
|||
u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
|
||||
| (write ? PFERR_WRITE_MASK : 0);
|
||||
|
||||
/*
|
||||
* currently PKRU is only applied to ept enabled guest so
|
||||
* there is no pkey in EPT page table for L1 guest or EPT
|
||||
* shadow page table for L2 guest.
|
||||
*/
|
||||
if (vcpu_match_mmio_gva(vcpu, gva)
|
||||
&& !permission_fault(vcpu, vcpu->arch.walk_mmu,
|
||||
vcpu->arch.access, access)) {
|
||||
vcpu->arch.access, 0, access)) {
|
||||
*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
|
||||
(gva & (PAGE_SIZE - 1));
|
||||
trace_vcpu_match_mmio(gva, *gpa, write, false);
|
||||
|
@ -6588,8 +6596,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
|
|||
|
||||
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
|
||||
|
||||
/* We should set ->mode before check ->requests,
|
||||
* see the comment in make_all_cpus_request.
|
||||
/*
|
||||
* We should set ->mode before checking ->requests.
|
||||
* Please see the comment in kvm_make_all_cpus_request.
|
||||
* This also orders the write to mode from any reads
|
||||
* to the page tables done while the VCPU is running.
|
||||
* Please see the comment in kvm_flush_remote_tlbs.
|
||||
*/
|
||||
smp_mb__after_srcu_read_unlock();
|
||||
|
||||
|
@ -7123,7 +7135,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
|
|||
|
||||
mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
|
||||
kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
|
||||
if (sregs->cr4 & X86_CR4_OSXSAVE)
|
||||
if (sregs->cr4 & (X86_CR4_OSXSAVE | X86_CR4_PKE))
|
||||
kvm_update_cpuid(vcpu);
|
||||
|
||||
idx = srcu_read_lock(&vcpu->kvm->srcu);
|
||||
|
|
|
@ -183,7 +183,8 @@ bool kvm_vector_hashing_enabled(void);
|
|||
|
||||
#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
|
||||
| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
|
||||
| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512)
|
||||
| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
|
||||
| XFEATURE_MASK_PKRU)
|
||||
extern u64 host_xcr0;
|
||||
|
||||
extern u64 kvm_supported_xcr0(void);
|
||||
|
|
|
@ -170,8 +170,8 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
|
|||
kvm_make_request(req, vcpu);
|
||||
cpu = vcpu->cpu;
|
||||
|
||||
/* Set ->requests bit before we read ->mode */
|
||||
smp_mb();
|
||||
/* Set ->requests bit before we read ->mode. */
|
||||
smp_mb__after_atomic();
|
||||
|
||||
if (cpus != NULL && cpu != -1 && cpu != me &&
|
||||
kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
|
||||
|
@ -191,9 +191,23 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
|
|||
#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
|
||||
void kvm_flush_remote_tlbs(struct kvm *kvm)
|
||||
{
|
||||
long dirty_count = kvm->tlbs_dirty;
|
||||
/*
|
||||
* Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in
|
||||
* kvm_make_all_cpus_request.
|
||||
*/
|
||||
long dirty_count = smp_load_acquire(&kvm->tlbs_dirty);
|
||||
|
||||
smp_mb();
|
||||
/*
|
||||
* We want to publish modifications to the page tables before reading
|
||||
* mode. Pairs with a memory barrier in arch-specific code.
|
||||
* - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
|
||||
* and smp_mb in walk_shadow_page_lockless_begin/end.
|
||||
* - powerpc: smp_mb in kvmppc_prepare_to_enter.
|
||||
*
|
||||
* There is already an smp_mb__after_atomic() before
|
||||
* kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
|
||||
* barrier here.
|
||||
*/
|
||||
if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
|
||||
++kvm->stat.remote_tlb_flush;
|
||||
cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
|
||||
|
@ -536,6 +550,16 @@ static struct kvm *kvm_create_vm(unsigned long type)
|
|||
if (!kvm)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
spin_lock_init(&kvm->mmu_lock);
|
||||
atomic_inc(¤t->mm->mm_count);
|
||||
kvm->mm = current->mm;
|
||||
kvm_eventfd_init(kvm);
|
||||
mutex_init(&kvm->lock);
|
||||
mutex_init(&kvm->irq_lock);
|
||||
mutex_init(&kvm->slots_lock);
|
||||
atomic_set(&kvm->users_count, 1);
|
||||
INIT_LIST_HEAD(&kvm->devices);
|
||||
|
||||
r = kvm_arch_init_vm(kvm, type);
|
||||
if (r)
|
||||
goto out_err_no_disable;
|
||||
|
@ -568,16 +592,6 @@ static struct kvm *kvm_create_vm(unsigned long type)
|
|||
goto out_err;
|
||||
}
|
||||
|
||||
spin_lock_init(&kvm->mmu_lock);
|
||||
kvm->mm = current->mm;
|
||||
atomic_inc(&kvm->mm->mm_count);
|
||||
kvm_eventfd_init(kvm);
|
||||
mutex_init(&kvm->lock);
|
||||
mutex_init(&kvm->irq_lock);
|
||||
mutex_init(&kvm->slots_lock);
|
||||
atomic_set(&kvm->users_count, 1);
|
||||
INIT_LIST_HEAD(&kvm->devices);
|
||||
|
||||
r = kvm_init_mmu_notifier(kvm);
|
||||
if (r)
|
||||
goto out_err;
|
||||
|
@ -602,6 +616,7 @@ out_err_no_disable:
|
|||
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
|
||||
kvm_free_memslots(kvm, kvm->memslots[i]);
|
||||
kvm_arch_free_vm(kvm);
|
||||
mmdrop(current->mm);
|
||||
return ERR_PTR(r);
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue